This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add embed.fnc entries for Cv inline fcns
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8ed185f9
KW
8 * This file contains tables and code adapted from
9 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
10 * copyright notice:
11
12Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14Permission is hereby granted, free of charge, to any person obtaining a copy of
15this software and associated documentation files (the "Software"), to deal in
16the Software without restriction, including without limitation the rights to
17use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18of the Software, and to permit persons to whom the Software is furnished to do
19so, subject to the following conditions:
20
21The above copyright notice and this permission notice shall be included in all
22copies or substantial portions of the Software.
23
24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30SOFTWARE.
31
32 *
25468daa 33 * This file is a home for static inline functions that cannot go in other
e15e54ff 34 * header files, because they depend on proto.h (included after most other
25468daa
FC
35 * headers) or struct definitions.
36 *
37 * Each section names the header file that the functions "belong" to.
38 */
27669aa4 39
be3a7a5d
KW
40/* ------------------------------- av.h ------------------------------- */
41
c70927a6 42PERL_STATIC_INLINE SSize_t
c9182d9c 43Perl_av_top_index(pTHX_ AV *av)
be3a7a5d
KW
44{
45 PERL_ARGS_ASSERT_AV_TOP_INDEX;
46 assert(SvTYPE(av) == SVt_PVAV);
47
48 return AvFILL(av);
49}
50
1afe1db1
FC
51/* ------------------------------- cv.h ------------------------------- */
52
ae77754a 53PERL_STATIC_INLINE GV *
c9182d9c 54Perl_CvGV(pTHX_ CV *sv)
ae77754a 55{
74804ad1
KW
56 PERL_ARGS_ASSERT_CVGV;
57
ae77754a
FC
58 return CvNAMED(sv)
59 ? Perl_cvgv_from_hek(aTHX_ sv)
60 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
61}
62
1afe1db1 63PERL_STATIC_INLINE I32 *
74804ad1 64Perl_CvDEPTH(const CV * const sv)
1afe1db1 65{
74804ad1 66 PERL_ARGS_ASSERT_CVDEPTH;
1afe1db1 67 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
74804ad1 68
8de47657 69 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
70}
71
d16269d8
PM
72/*
73 CvPROTO returns the prototype as stored, which is not necessarily what
74 the interpreter should be using. Specifically, the interpreter assumes
75 that spaces have been stripped, which has been the case if the prototype
76 was added by toke.c, but is generally not the case if it was added elsewhere.
77 Since we can't enforce the spacelessness at assignment time, this routine
78 provides a temporary copy at parse time with spaces removed.
79 I<orig> is the start of the original buffer, I<len> is the length of the
80 prototype and will be updated when this returns.
81 */
82
5b67adb8 83#ifdef PERL_CORE
d16269d8
PM
84PERL_STATIC_INLINE char *
85S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
86{
87 SV * tmpsv;
88 char * tmps;
89 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
90 tmps = SvPVX(tmpsv);
91 while ((*len)--) {
92 if (!isSPACE(*orig))
93 *tmps++ = *orig;
94 orig++;
95 }
96 *tmps = '\0';
97 *len = tmps - SvPVX(tmpsv);
98 return SvPVX(tmpsv);
99}
5b67adb8 100#endif
d16269d8 101
25fdce4a
FC
102/* ------------------------------- mg.h ------------------------------- */
103
104#if defined(PERL_CORE) || defined(PERL_EXT)
105/* assumes get-magic and stringification have already occurred */
106PERL_STATIC_INLINE STRLEN
107S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
108{
109 assert(mg->mg_type == PERL_MAGIC_regex_global);
110 assert(mg->mg_len != -1);
111 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
112 return (STRLEN)mg->mg_len;
113 else {
114 const STRLEN pos = (STRLEN)mg->mg_len;
115 /* Without this check, we may read past the end of the buffer: */
116 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
117 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
118 }
119}
120#endif
121
03414f05
FC
122/* ------------------------------- pad.h ------------------------------ */
123
124#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
125PERL_STATIC_INLINE bool
b9d5702c 126S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
03414f05 127{
b9d5702c
KW
128 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
129
03414f05
FC
130 /* is seq within the range _LOW to _HIGH ?
131 * This is complicated by the fact that PL_cop_seqmax
132 * may have wrapped around at some point */
133 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
134 return FALSE; /* not yet introduced */
135
136 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
137 /* in compiling scope */
138 if (
139 (seq > COP_SEQ_RANGE_LOW(pn))
140 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
141 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
142 )
143 return TRUE;
144 }
145 else if (
146 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
147 ?
148 ( seq > COP_SEQ_RANGE_LOW(pn)
149 || seq <= COP_SEQ_RANGE_HIGH(pn))
150
151 : ( seq > COP_SEQ_RANGE_LOW(pn)
152 && seq <= COP_SEQ_RANGE_HIGH(pn))
153 )
154 return TRUE;
155 return FALSE;
156}
157#endif
158
33a4312b
FC
159/* ------------------------------- pp.h ------------------------------- */
160
161PERL_STATIC_INLINE I32
c9182d9c 162Perl_TOPMARK(pTHX)
33a4312b
FC
163{
164 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 165 "MARK top %p %" IVdf "\n",
33a4312b
FC
166 PL_markstack_ptr,
167 (IV)*PL_markstack_ptr)));
168 return *PL_markstack_ptr;
169}
170
171PERL_STATIC_INLINE I32
c9182d9c 172Perl_POPMARK(pTHX)
33a4312b
FC
173{
174 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 175 "MARK pop %p %" IVdf "\n",
33a4312b
FC
176 (PL_markstack_ptr-1),
177 (IV)*(PL_markstack_ptr-1))));
178 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
179 return *PL_markstack_ptr--;
180}
181
8d919b0a
FC
182/* ----------------------------- regexp.h ----------------------------- */
183
184PERL_STATIC_INLINE struct regexp *
c9182d9c 185Perl_ReANY(const REGEXP * const re)
8d919b0a 186{
df6b4bd5 187 XPV* const p = (XPV*)SvANY(re);
8d919b0a 188 assert(isREGEXP(re));
df6b4bd5
DM
189 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
190 : (struct regexp *)p;
8d919b0a
FC
191}
192
27669aa4
FC
193/* ------------------------------- sv.h ------------------------------- */
194
195PERL_STATIC_INLINE SV *
c9182d9c 196Perl_SvREFCNT_inc(SV *sv)
27669aa4 197{
2439e033 198 if (LIKELY(sv != NULL))
27669aa4
FC
199 SvREFCNT(sv)++;
200 return sv;
201}
202PERL_STATIC_INLINE SV *
c9182d9c 203Perl_SvREFCNT_inc_NN(SV *sv)
27669aa4 204{
3f2f854a
KW
205 PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
206
27669aa4
FC
207 SvREFCNT(sv)++;
208 return sv;
209}
210PERL_STATIC_INLINE void
c9182d9c 211Perl_SvREFCNT_inc_void(SV *sv)
27669aa4 212{
2439e033 213 if (LIKELY(sv != NULL))
27669aa4
FC
214 SvREFCNT(sv)++;
215}
75e16a44 216PERL_STATIC_INLINE void
c9182d9c 217Perl_SvREFCNT_dec(pTHX_ SV *sv)
75e16a44 218{
2439e033 219 if (LIKELY(sv != NULL)) {
75a9bf96 220 U32 rc = SvREFCNT(sv);
79e2a32a 221 if (LIKELY(rc > 1))
75a9bf96
DM
222 SvREFCNT(sv) = rc - 1;
223 else
224 Perl_sv_free2(aTHX_ sv, rc);
75e16a44
FC
225 }
226}
541377b1
FC
227
228PERL_STATIC_INLINE void
c9182d9c 229Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
4a9a56a7
DM
230{
231 U32 rc = SvREFCNT(sv);
3f2f854a
KW
232
233 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
234
79e2a32a 235 if (LIKELY(rc > 1))
4a9a56a7
DM
236 SvREFCNT(sv) = rc - 1;
237 else
238 Perl_sv_free2(aTHX_ sv, rc);
239}
240
241PERL_STATIC_INLINE void
541377b1
FC
242SvAMAGIC_on(SV *sv)
243{
244 assert(SvROK(sv));
245 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
246}
247PERL_STATIC_INLINE void
248SvAMAGIC_off(SV *sv)
249{
250 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
251 HvAMAGIC_off(SvSTASH(SvRV(sv)));
252}
253
254PERL_STATIC_INLINE U32
c9182d9c 255Perl_SvPADSTALE_on(SV *sv)
541377b1 256{
c0683843 257 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
258 return SvFLAGS(sv) |= SVs_PADSTALE;
259}
260PERL_STATIC_INLINE U32
c9182d9c 261Perl_SvPADSTALE_off(SV *sv)
541377b1 262{
c0683843 263 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
264 return SvFLAGS(sv) &= ~SVs_PADSTALE;
265}
25fdce4a 266#if defined(PERL_CORE) || defined (PERL_EXT)
4ddea69a 267PERL_STATIC_INLINE STRLEN
6964422a 268S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
4ddea69a 269{
25fdce4a 270 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
4ddea69a
FC
271 if (SvGAMAGIC(sv)) {
272 U8 *hopped = utf8_hop((U8 *)pv, pos);
273 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
274 return (STRLEN)(hopped - (U8 *)pv);
275 }
276 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
277}
278#endif
f019c49e 279
d1decf2b
TC
280/* ------------------------------- handy.h ------------------------------- */
281
282/* saves machine code for a common noreturn idiom typically used in Newx*() */
7347ee54 283GCC_DIAG_IGNORE_DECL(-Wunused-function);
d1decf2b 284static void
c9182d9c 285Perl_croak_memory_wrap(void)
d1decf2b
TC
286{
287 Perl_croak_nocontext("%s",PL_memory_wrap);
288}
7347ee54 289GCC_DIAG_RESTORE_DECL;
d1decf2b 290
a8a2ceaa
KW
291/* ------------------------------- utf8.h ------------------------------- */
292
2fe720e2
KW
293/*
294=head1 Unicode Support
295*/
296
55d09dc8 297PERL_STATIC_INLINE void
c9182d9c 298Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
55d09dc8
KW
299{
300 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
301 * encoded string at '*dest', updating '*dest' to include it */
302
55d09dc8
KW
303 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
304
6f2d5cbc 305 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 306 *((*dest)++) = byte;
55d09dc8 307 else {
a09ec51a
KW
308 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
309 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
310 }
311}
312
e123187a 313/*
2fe720e2 314=for apidoc valid_utf8_to_uvchr
09232555
KW
315Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
316known that the next character in the input UTF-8 string C<s> is well-formed
317(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
318points, and non-Unicode code points are allowed.
2fe720e2
KW
319
320=cut
321
322 */
323
324PERL_STATIC_INLINE UV
325Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
326{
c41b2540 327 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
328 const U8* send = s + expectlen;
329 UV uv = *s;
330
331 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
332
333 if (retlen) {
334 *retlen = expectlen;
335 }
336
337 /* An invariant is trivially returned */
338 if (expectlen == 1) {
339 return uv;
340 }
341
342 /* Remove the leading bits that indicate the number of bytes, leaving just
343 * the bits that are part of the value */
344 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
345
346 /* Now, loop through the remaining bytes, accumulating each into the
347 * working total as we go. (I khw tried unrolling the loop for up to 4
348 * bytes, but there was no performance improvement) */
349 for (++s; s < send; s++) {
350 uv = UTF8_ACCUMULATE(uv, *s);
351 }
352
353 return UNI_TO_NATIVE(uv);
354
355}
356
1e599354
KW
357/*
358=for apidoc is_utf8_invariant_string
359
82c5d941 360Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 361regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
362EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
363are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
364the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
365characters are invariant, but so also are the C1 controls.
1e599354
KW
366
367If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
368use this option, that C<s> can't have embedded C<NUL> characters and has to
369have a terminating C<NUL> byte).
370
9f2abfde
KW
371See also
372C<L</is_utf8_string>>,
373C<L</is_utf8_string_flags>>,
374C<L</is_utf8_string_loc>>,
375C<L</is_utf8_string_loc_flags>>,
376C<L</is_utf8_string_loclen>>,
377C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
378C<L</is_utf8_fixed_width_buf_flags>>,
379C<L</is_utf8_fixed_width_buf_loc_flags>>,
380C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
381C<L</is_strict_utf8_string>>,
382C<L</is_strict_utf8_string_loc>>,
383C<L</is_strict_utf8_string_loclen>>,
384C<L</is_c9strict_utf8_string>>,
385C<L</is_c9strict_utf8_string_loc>>,
386and
387C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
388
389=cut
0cbf5865
KW
390
391*/
392
393#define is_utf8_invariant_string(s, len) \
394 is_utf8_invariant_string_loc(s, len, NULL)
395
396/*
397=for apidoc is_utf8_invariant_string_loc
398
399Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
400the first UTF-8 variant character in the C<ep> pointer; if all characters are
401UTF-8 invariant, this function does not change the contents of C<*ep>.
402
403=cut
404
1e599354
KW
405*/
406
407PERL_STATIC_INLINE bool
c9182d9c 408Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 409{
e17544a6 410 const U8* send;
1e599354
KW
411 const U8* x = s;
412
0cbf5865
KW
413 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
414
e17544a6
KW
415 if (len == 0) {
416 len = strlen((const char *)s);
417 }
418
419 send = s + len;
420
4ab2fd9b 421/* This looks like 0x010101... */
2c5c8af5 422# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
4ab2fd9b
KW
423
424/* This looks like 0x808080... */
2c5c8af5 425# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
e099ea69 426# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
2c5c8af5 427# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
e17544a6 428
099e59a4
KW
429/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
430 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
431 * optimized out completely on a 32-bit system, and its mask gets optimized out
432 * on a 64-bit system */
2c5c8af5 433# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
5eabe374
KW
434 | ( PTR2nat(x) >> 1) \
435 | ( ( (PTR2nat(x) \
436 & PERL_WORD_BOUNDARY_MASK) >> 2))))
099e59a4 437
3f515a2e
KW
438#ifndef EBCDIC
439
099e59a4
KW
440 /* Do the word-at-a-time iff there is at least one usable full word. That
441 * means that after advancing to a word boundary, there still is at least a
442 * full word left. The number of bytes needed to advance is 'wordsize -
443 * offset' unless offset is 0. */
444 if ((STRLEN) (send - x) >= PERL_WORDSIZE
445
446 /* This term is wordsize if subword; 0 if not */
447 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
448
449 /* 'offset' */
450 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
451 {
b40579ff 452
46bb68f6
KW
453 /* Process per-byte until reach word boundary. XXX This loop could be
454 * eliminated if we knew that this platform had fast unaligned reads */
b40579ff 455 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
46bb68f6
KW
456 if (! UTF8_IS_INVARIANT(*x)) {
457 if (ep) {
458 *ep = x;
459 }
e17544a6 460
46bb68f6
KW
461 return FALSE;
462 }
463 x++;
e17544a6 464 }
e17544a6 465
099e59a4
KW
466 /* Here, we know we have at least one full word to process. Process
467 * per-word as long as we have at least a full word left */
468 do {
4ab2fd9b 469 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
e17544a6 470
46bb68f6
KW
471 /* Found a variant. Just return if caller doesn't want its
472 * exact position */
473 if (! ep) {
474 return FALSE;
475 }
e17544a6 476
2c5c8af5
KW
477# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
478 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1d2af574 479
73f0a2eb 480 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
1d2af574
KW
481 assert(*ep >= s && *ep < send);
482
483 return FALSE;
484
2c5c8af5 485# else /* If weird byte order, drop into next loop to do byte-at-a-time
1d2af574
KW
486 checks. */
487
46bb68f6 488 break;
2c5c8af5 489# endif
46bb68f6 490 }
1d2af574 491
46bb68f6 492 x += PERL_WORDSIZE;
1d2af574 493
099e59a4 494 } while (x + PERL_WORDSIZE <= send);
b40579ff 495 }
e17544a6 496
0b08cab0 497#endif /* End of ! EBCDIC */
e17544a6
KW
498
499 /* Process per-byte */
500 while (x < send) {
501 if (! UTF8_IS_INVARIANT(*x)) {
502 if (ep) {
503 *ep = x;
504 }
0cbf5865 505
e17544a6 506 return FALSE;
0cbf5865 507 }
1e599354 508
e17544a6 509 x++;
1e599354
KW
510 }
511
512 return TRUE;
513}
514
23a7ee81
KW
515#ifndef EBCDIC
516
1d2af574 517PERL_STATIC_INLINE unsigned int
73f0a2eb 518Perl_variant_byte_number(PERL_UINTMAX_T word)
1d2af574
KW
519{
520
521 /* This returns the position in a word (0..7) of the first variant byte in
522 * it. This is a helper function. Note that there are no branches */
523
524 assert(word);
525
526 /* Get just the msb bits of each byte */
527 word &= PERL_VARIANTS_WORD_MASK;
528
597ee3f4
KW
529# ifdef USING_MSVC6 /* VC6 has some issues with the normal code, and the
530 easiest thing is to hide that from the callers */
531 {
532 unsigned int i;
533 const U8 * s = (U8 *) &word;
534 dTHX;
535
536 for (i = 0; i < sizeof(word); i++ ) {
537 if (s[i]) {
538 return i;
539 }
540 }
541
542 Perl_croak(aTHX_ "panic: %s: %d: unexpected zero word\n",
543 __FILE__, __LINE__);
544 }
545
546# elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1d2af574
KW
547
548 /* Bytes are stored like
549 * Byte8 ... Byte2 Byte1
550 * 63..56...15...8 7...0
551 *
552 * Isolate the lsb;
553 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
554 *
555 * The word will look this this, with a rightmost set bit in position 's':
556 * ('x's are don't cares)
557 * s
558 * x..x100..0
559 * x..xx10..0 Right shift (rightmost 0 is shifted off)
560 * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
561 * the 1 just to their left into a 0; the remainder is
562 * untouched
4fa92663
KW
563 * 0..0011..1 The xor with the original, x..xx10..0, clears that
564 * remainder, sets the bottom to all 1
1d2af574
KW
565 * 0..0100..0 Add 1 to clear the word except for the bit in 's'
566 *
567 * Another method is to do 'word &= -word'; but it generates a compiler
568 * message on some platforms about taking the negative of an unsigned */
569
570 word >>= 1;
571 word = 1 + (word ^ (word - 1));
572
573# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
574
575 /* Bytes are stored like
576 * Byte1 Byte2 ... Byte8
577 * 63..56 55..47 ... 7...0
578 *
579 * Isolate the msb; http://codeforces.com/blog/entry/10330
580 *
581 * Only the most significant set bit matters. Or'ing word with its right
582 * shift of 1 makes that bit and the next one to its right both 1. Then
583 * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
584 * msb and all to the right being 1. */
585 word |= word >> 1;
586 word |= word >> 2;
587 word |= word >> 4;
588 word |= word >> 8;
589 word |= word >> 16;
590 word |= word >> 32; /* This should get optimized out on 32-bit systems. */
591
592 /* Then subtracting the right shift by 1 clears all but the left-most of
593 * the 1 bits, which is our desired result */
594 word -= (word >> 1);
595
596# else
597# error Unexpected byte order
598# endif
599
7cf2d6c7
KW
600 /* Here 'word' has a single bit set: the msb of the first byte in which it
601 * is set. Calculate that position in the word. We can use this
1d2af574 602 * specialized solution: https://stackoverflow.com/a/32339674/1626653,
67e12c5c
KW
603 * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
604 * just get shifted off at compile time) */
605 word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
606 | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
607 | (39 << 24) | (47 << 16)
608 | (55 << 8) | (63 << 0));
1d2af574
KW
609 word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
610
611 /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
612 word = ((word + 1) >> 3) - 1;
613
614# if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
615
616 /* And invert the result */
617 word = CHARBITS - word - 1;
618
619# endif
620
621 return (unsigned int) word;
622}
623
23a7ee81 624#endif
03c1e4ab
KW
625#if defined(PERL_CORE) || defined(PERL_EXT)
626
627/*
628=for apidoc variant_under_utf8_count
629
630This function looks at the sequence of bytes between C<s> and C<e>, which are
631assumed to be encoded in ASCII/Latin1, and returns how many of them would
632change should the string be translated into UTF-8. Due to the nature of UTF-8,
633each of these would occupy two bytes instead of the single one in the input
634string. Thus, this function returns the precise number of bytes the string
635would expand by when translated to UTF-8.
636
637Unlike most of the other functions that have C<utf8> in their name, the input
638to this function is NOT a UTF-8-encoded string. The function name is slightly
639I<odd> to emphasize this.
640
641This function is internal to Perl because khw thinks that any XS code that
642would want this is probably operating too close to the internals. Presenting a
643valid use case could change that.
644
645See also
646C<L<perlapi/is_utf8_invariant_string>>
647and
648C<L<perlapi/is_utf8_invariant_string_loc>>,
649
650=cut
651
652*/
653
654PERL_STATIC_INLINE Size_t
655S_variant_under_utf8_count(const U8* const s, const U8* const e)
656{
657 const U8* x = s;
658 Size_t count = 0;
659
660 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
661
662# ifndef EBCDIC
663
5d0379de
KW
664 /* Test if the string is long enough to use word-at-a-time. (Logic is the
665 * same as for is_utf8_invariant_string()) */
03c1e4ab
KW
666 if ((STRLEN) (e - x) >= PERL_WORDSIZE
667 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
668 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
669 {
670
671 /* Process per-byte until reach word boundary. XXX This loop could be
672 * eliminated if we knew that this platform had fast unaligned reads */
673 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
674 count += ! UTF8_IS_INVARIANT(*x++);
675 }
676
677 /* Process per-word as long as we have at least a full word left */
74472cc2
KW
678 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
679 explanation of how this works */
e5863284
KW
680 PERL_UINTMAX_T increment
681 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
03c1e4ab
KW
682 * PERL_COUNT_MULTIPLIER)
683 >> ((PERL_WORDSIZE - 1) * CHARBITS);
e5863284 684 count += (Size_t) increment;
03c1e4ab
KW
685 x += PERL_WORDSIZE;
686 } while (x + PERL_WORDSIZE <= e);
687 }
688
689# endif
690
691 /* Process per-byte */
692 while (x < e) {
693 if (! UTF8_IS_INVARIANT(*x)) {
694 count++;
695 }
696
697 x++;
698 }
699
700 return count;
701}
702
703#endif
704
aff4cafe
KW
705#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
706# undef PERL_WORDSIZE
707# undef PERL_COUNT_MULTIPLIER
708# undef PERL_WORD_BOUNDARY_MASK
709# undef PERL_VARIANTS_WORD_MASK
710#endif
03c1e4ab 711
7c93d8f0 712/*
5ff889fb
KW
713=for apidoc is_utf8_string
714
82c5d941
KW
715Returns TRUE if the first C<len> bytes of string C<s> form a valid
716Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
717be calculated using C<strlen(s)> (which means if you use this option, that C<s>
718can't have embedded C<NUL> characters and has to have a terminating C<NUL>
719byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
720
2717076a
KW
721This function considers Perl's extended UTF-8 to be valid. That means that
722code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
723considered valid by this function. Use C<L</is_strict_utf8_string>>,
724C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
725code points are considered valid.
5ff889fb 726
9f2abfde
KW
727See also
728C<L</is_utf8_invariant_string>>,
0cbf5865 729C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
730C<L</is_utf8_string_loc>>,
731C<L</is_utf8_string_loclen>>,
8bc127bf
KW
732C<L</is_utf8_fixed_width_buf_flags>>,
733C<L</is_utf8_fixed_width_buf_loc_flags>>,
734C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
735
736=cut
737*/
738
dd237e82 739#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
5ff889fb 740
c9cd936b
KW
741#if defined(PERL_CORE) || defined (PERL_EXT)
742
743/*
744=for apidoc is_utf8_non_invariant_string
745
746Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
747C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
748UTF-8; otherwise returns FALSE.
749
750A TRUE return means that at least one code point represented by the sequence
751either is a wide character not representable as a single byte, or the
752representation differs depending on whether the sequence is encoded in UTF-8 or
753not.
754
755See also
756C<L<perlapi/is_utf8_invariant_string>>,
757C<L<perlapi/is_utf8_string>>
758
759=cut
760
761This is commonly used to determine if a SV's UTF-8 flag should be turned on.
b3b93dfe
KW
762It generally needn't be if its string is entirely UTF-8 invariant, and it
763shouldn't be if it otherwise contains invalid UTF-8.
c9cd936b
KW
764
765It is an internal function because khw thinks that XS code shouldn't be working
766at this low a level. A valid use case could change that.
767
768*/
769
770PERL_STATIC_INLINE bool
771S_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
772{
773 const U8 * first_variant;
774
775 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
776
777 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
778 return FALSE;
779 }
780
781 return is_utf8_string(first_variant, len - (first_variant - s));
782}
783
784#endif
785
5ff889fb 786/*
9f2abfde
KW
787=for apidoc is_strict_utf8_string
788
789Returns TRUE if the first C<len> bytes of string C<s> form a valid
790UTF-8-encoded string that is fully interchangeable by any application using
791Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
792calculated using C<strlen(s)> (which means if you use this option, that C<s>
793can't have embedded C<NUL> characters and has to have a terminating C<NUL>
794byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
795
796This function returns FALSE for strings containing any
797code points above the Unicode max of 0x10FFFF, surrogate code points, or
798non-character code points.
799
800See also
801C<L</is_utf8_invariant_string>>,
0cbf5865 802C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
803C<L</is_utf8_string>>,
804C<L</is_utf8_string_flags>>,
805C<L</is_utf8_string_loc>>,
806C<L</is_utf8_string_loc_flags>>,
807C<L</is_utf8_string_loclen>>,
808C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
809C<L</is_utf8_fixed_width_buf_flags>>,
810C<L</is_utf8_fixed_width_buf_loc_flags>>,
811C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
812C<L</is_strict_utf8_string_loc>>,
813C<L</is_strict_utf8_string_loclen>>,
814C<L</is_c9strict_utf8_string>>,
815C<L</is_c9strict_utf8_string_loc>>,
816and
817C<L</is_c9strict_utf8_string_loclen>>.
818
819=cut
820*/
821
dd237e82 822#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
823
824/*
825=for apidoc is_c9strict_utf8_string
826
827Returns TRUE if the first C<len> bytes of string C<s> form a valid
828UTF-8-encoded string that conforms to
829L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
830otherwise it returns FALSE. If C<len> is 0, it will be calculated using
831C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
832C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
833characters being ASCII constitute 'a valid UTF-8 string'.
834
835This function returns FALSE for strings containing any code points above the
836Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
837code points per
838L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
839
840See also
841C<L</is_utf8_invariant_string>>,
0cbf5865 842C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
843C<L</is_utf8_string>>,
844C<L</is_utf8_string_flags>>,
845C<L</is_utf8_string_loc>>,
846C<L</is_utf8_string_loc_flags>>,
847C<L</is_utf8_string_loclen>>,
848C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
849C<L</is_utf8_fixed_width_buf_flags>>,
850C<L</is_utf8_fixed_width_buf_loc_flags>>,
851C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
852C<L</is_strict_utf8_string>>,
853C<L</is_strict_utf8_string_loc>>,
854C<L</is_strict_utf8_string_loclen>>,
855C<L</is_c9strict_utf8_string_loc>>,
856and
857C<L</is_c9strict_utf8_string_loclen>>.
858
859=cut
860*/
861
dd237e82 862#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
9f2abfde
KW
863
864/*
865=for apidoc is_utf8_string_flags
866
867Returns TRUE if the first C<len> bytes of string C<s> form a valid
868UTF-8 string, subject to the restrictions imposed by C<flags>;
869returns FALSE otherwise. If C<len> is 0, it will be calculated
870using C<strlen(s)> (which means if you use this option, that C<s> can't have
871embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
872that all characters being ASCII constitute 'a valid UTF-8 string'.
873
874If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
875C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
876as C<L</is_strict_utf8_string>>; and if C<flags> is
877C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
878C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
879combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
880C<L</utf8n_to_uvchr>>, with the same meanings.
881
882See also
883C<L</is_utf8_invariant_string>>,
0cbf5865 884C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
885C<L</is_utf8_string>>,
886C<L</is_utf8_string_loc>>,
887C<L</is_utf8_string_loc_flags>>,
888C<L</is_utf8_string_loclen>>,
889C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
890C<L</is_utf8_fixed_width_buf_flags>>,
891C<L</is_utf8_fixed_width_buf_loc_flags>>,
892C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
893C<L</is_strict_utf8_string>>,
894C<L</is_strict_utf8_string_loc>>,
895C<L</is_strict_utf8_string_loclen>>,
896C<L</is_c9strict_utf8_string>>,
897C<L</is_c9strict_utf8_string_loc>>,
898and
899C<L</is_c9strict_utf8_string_loclen>>.
900
901=cut
902*/
903
904PERL_STATIC_INLINE bool
c9182d9c 905Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 906{
33756530 907 const U8 * first_variant;
9f2abfde
KW
908
909 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
910 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 911 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 912
f60f61fd
KW
913 if (len == 0) {
914 len = strlen((const char *)s);
915 }
916
9f2abfde
KW
917 if (flags == 0) {
918 return is_utf8_string(s, len);
919 }
920
d044b7a7 921 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
922 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
923 {
924 return is_strict_utf8_string(s, len);
925 }
926
d044b7a7 927 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
928 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
929 {
930 return is_c9strict_utf8_string(s, len);
931 }
932
33756530
KW
933 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
934 const U8* const send = s + len;
935 const U8* x = first_variant;
936
a0d7f935
KW
937 while (x < send) {
938 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
939 if (UNLIKELY(! cur_len)) {
940 return FALSE;
941 }
942 x += cur_len;
9f2abfde 943 }
33756530 944 }
9f2abfde
KW
945
946 return TRUE;
947}
948
949/*
5ff889fb
KW
950
951=for apidoc is_utf8_string_loc
952
2717076a 953Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 954case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 955"utf8ness success") in the C<ep> pointer.
5ff889fb 956
2717076a 957See also C<L</is_utf8_string_loclen>>.
5ff889fb 958
3964c812
KW
959=cut
960*/
961
962#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
963
964/*
965
5ff889fb
KW
966=for apidoc is_utf8_string_loclen
967
2717076a 968Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 969case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 970"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 971encoded characters in the C<el> pointer.
5ff889fb 972
2717076a 973See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
974
975=cut
976*/
977
56e4cf64 978PERL_STATIC_INLINE bool
33756530 979Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb 980{
33756530 981 const U8 * first_variant;
5ff889fb
KW
982
983 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
984
33756530
KW
985 if (len == 0) {
986 len = strlen((const char *) s);
987 }
988
989 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
990 if (el)
991 *el = len;
992
993 if (ep) {
994 *ep = s + len;
995 }
996
997 return TRUE;
998 }
999
1000 {
1001 const U8* const send = s + len;
1002 const U8* x = first_variant;
1003 STRLEN outlen = first_variant - s;
1004
a0d7f935
KW
1005 while (x < send) {
1006 const STRLEN cur_len = isUTF8_CHAR(x, send);
1007 if (UNLIKELY(! cur_len)) {
1008 break;
1009 }
1010 x += cur_len;
1011 outlen++;
5ff889fb 1012 }
5ff889fb 1013
a0d7f935
KW
1014 if (el)
1015 *el = outlen;
5ff889fb 1016
a0d7f935
KW
1017 if (ep) {
1018 *ep = x;
1019 }
5ff889fb 1020
a0d7f935 1021 return (x == send);
33756530 1022 }
5ff889fb
KW
1023}
1024
1025/*
9f2abfde 1026
44170c9a 1027=for apidoc isUTF8_CHAR
8ed185f9
KW
1028
1029Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1030looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1031that represents some code point; otherwise it evaluates to 0. If non-zero, the
1032value gives how many bytes starting at C<s> comprise the code point's
1033representation. Any bytes remaining before C<e>, but beyond the ones needed to
1034form the first code point in C<s>, are not examined.
1035
13aab5dd 1036The code point can be any that will fit in an IV on this machine, using Perl's
8ed185f9
KW
1037extension to official UTF-8 to represent those higher than the Unicode maximum
1038of 0x10FFFF. That means that this macro is used to efficiently decide if the
1039next few bytes in C<s> is legal UTF-8 for a single character.
1040
1041Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1042defined by Unicode to be fully interchangeable across applications;
1043C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1044#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1045code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1046
1047Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1048C<L</is_utf8_string_loclen>> to check entire strings.
1049
13aab5dd
KW
1050Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1051machines) is a valid UTF-8 character.
8ed185f9
KW
1052
1053=cut
1054
1055This uses an adaptation of the table and algorithm given in
1056http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1057documentation of the original version. A copyright notice for the original
1058version is given at the beginning of this file. The Perl adapation is
71525f77 1059documented at the definition of PL_extended_utf8_dfa_tab[].
8ed185f9
KW
1060
1061*/
1062
1063PERL_STATIC_INLINE Size_t
c9182d9c 1064Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
8ed185f9
KW
1065{
1066 const U8 * s = s0;
1067 UV state = 0;
1068
1069 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1070
1071 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1072 * code point, which can be returned immediately. Otherwise, it is either
1073 * malformed, or for the start byte FF which the dfa doesn't handle (except
1074 * on 32-bit ASCII platforms where it trivially is an error). Call a
1075 * helper function for the other platforms. */
1076
1077 while (s < e && LIKELY(state != 1)) {
71525f77 1078 state = PL_extended_utf8_dfa_tab[256
8ed185f9 1079 + state
71525f77 1080 + PL_extended_utf8_dfa_tab[*s]];
8ed185f9
KW
1081 if (state != 0) {
1082 s++;
1083 continue;
1084 }
1085
1086 return s - s0 + 1;
1087 }
1088
1089#if defined(UV_IS_QUAD) || defined(EBCDIC)
1090
1091 if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
1376b35c 1092 return is_utf8_char_helper(s0, e, 0);
8ed185f9
KW
1093 }
1094
1095#endif
1096
1097 return 0;
1098}
1099
1100/*
1101
67049a5f
KW
1102=for apidoc isSTRICT_UTF8_CHAR
1103
1104Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1105looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1106Unicode code point completely acceptable for open interchange between all
1107applications; otherwise it evaluates to 0. If non-zero, the value gives how
1108many bytes starting at C<s> comprise the code point's representation. Any
1109bytes remaining before C<e>, but beyond the ones needed to form the first code
1110point in C<s>, are not examined.
1111
1112The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1113be a surrogate nor a non-character code point. Thus this excludes any code
1114point from Perl's extended UTF-8.
1115
1116This is used to efficiently decide if the next few bytes in C<s> is
1117legal Unicode-acceptable UTF-8 for a single character.
1118
1119Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1120#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1121code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1122and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1123
1124Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1125C<L</is_strict_utf8_string_loclen>> to check entire strings.
1126
1127=cut
1128
1129This uses an adaptation of the tables and algorithm given in
1130http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1131documentation of the original version. A copyright notice for the original
1132version is given at the beginning of this file. The Perl adapation is
1133documented at the definition of strict_extended_utf8_dfa_tab[].
1134
1135*/
1136
1137PERL_STATIC_INLINE Size_t
c9182d9c 1138Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
67049a5f
KW
1139{
1140 const U8 * s = s0;
1141 UV state = 0;
1142
1143 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1144
1145 while (s < e && LIKELY(state != 1)) {
71525f77 1146 state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
67049a5f
KW
1147
1148 if (state != 0) {
1149 s++;
1150 continue;
1151 }
1152
1153 return s - s0 + 1;
1154 }
1155
1156#ifndef EBCDIC
1157
1158 /* The dfa above drops out for certain Hanguls; handle them specially */
1159 if (is_HANGUL_ED_utf8_safe(s0, e)) {
1160 return 3;
1161 }
1162
1163#endif
1164
1165 return 0;
1166}
1167
1168/*
1169
44170c9a 1170=for apidoc isC9_STRICT_UTF8_CHAR
c5bfbb64
KW
1171
1172Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1173looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1174Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1175the value gives how many bytes starting at C<s> comprise the code point's
1176representation. Any bytes remaining before C<e>, but beyond the ones needed to
1177form the first code point in C<s>, are not examined.
1178
1179The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1180differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1181code points. This corresponds to
1182L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1183which said that non-character code points are merely discouraged rather than
1184completely forbidden in open interchange. See
1185L<perlunicode/Noncharacter code points>.
1186
1187Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1188C<L</isUTF8_CHAR_flags>> for a more customized definition.
1189
1190Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1191C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1192
1193=cut
1194
1195This uses an adaptation of the tables and algorithm given in
1196http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1197documentation of the original version. A copyright notice for the original
1198version is given at the beginning of this file. The Perl adapation is
71525f77 1199documented at the definition of PL_c9_utf8_dfa_tab[].
c5bfbb64
KW
1200
1201*/
1202
1203PERL_STATIC_INLINE Size_t
c9182d9c 1204Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
c5bfbb64
KW
1205{
1206 const U8 * s = s0;
1207 UV state = 0;
1208
1209 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1210
1211 while (s < e && LIKELY(state != 1)) {
71525f77 1212 state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
c5bfbb64
KW
1213
1214 if (state != 0) {
1215 s++;
1216 continue;
1217 }
1218
1219 return s - s0 + 1;
1220 }
1221
1222 return 0;
1223}
1224
1225/*
1226
9f2abfde
KW
1227=for apidoc is_strict_utf8_string_loc
1228
1229Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1230case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1231"utf8ness success") in the C<ep> pointer.
1232
1233See also C<L</is_strict_utf8_string_loclen>>.
1234
1235=cut
1236*/
1237
1238#define is_strict_utf8_string_loc(s, len, ep) \
1239 is_strict_utf8_string_loclen(s, len, ep, 0)
1240
1241/*
1242
1243=for apidoc is_strict_utf8_string_loclen
1244
1245Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1246case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1247"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1248encoded characters in the C<el> pointer.
1249
1250See also C<L</is_strict_utf8_string_loc>>.
1251
1252=cut
1253*/
1254
1255PERL_STATIC_INLINE bool
c9182d9c 1256Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1257{
33756530 1258 const U8 * first_variant;
9f2abfde
KW
1259
1260 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1261
33756530
KW
1262 if (len == 0) {
1263 len = strlen((const char *) s);
1264 }
1265
1266 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1267 if (el)
1268 *el = len;
1269
1270 if (ep) {
1271 *ep = s + len;
1272 }
1273
1274 return TRUE;
1275 }
1276
1277 {
1278 const U8* const send = s + len;
1279 const U8* x = first_variant;
1280 STRLEN outlen = first_variant - s;
1281
a0d7f935
KW
1282 while (x < send) {
1283 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1284 if (UNLIKELY(! cur_len)) {
1285 break;
1286 }
1287 x += cur_len;
1288 outlen++;
9f2abfde 1289 }
9f2abfde 1290
a0d7f935
KW
1291 if (el)
1292 *el = outlen;
9f2abfde 1293
a0d7f935
KW
1294 if (ep) {
1295 *ep = x;
1296 }
9f2abfde 1297
a0d7f935 1298 return (x == send);
33756530 1299 }
9f2abfde
KW
1300}
1301
1302/*
1303
1304=for apidoc is_c9strict_utf8_string_loc
1305
1306Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1307the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1308"utf8ness success") in the C<ep> pointer.
1309
1310See also C<L</is_c9strict_utf8_string_loclen>>.
1311
1312=cut
1313*/
1314
1315#define is_c9strict_utf8_string_loc(s, len, ep) \
1316 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1317
1318/*
1319
1320=for apidoc is_c9strict_utf8_string_loclen
1321
1322Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1323the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1324"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1325characters in the C<el> pointer.
1326
1327See also C<L</is_c9strict_utf8_string_loc>>.
1328
1329=cut
1330*/
1331
1332PERL_STATIC_INLINE bool
c9182d9c 1333Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1334{
33756530 1335 const U8 * first_variant;
9f2abfde
KW
1336
1337 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1338
33756530
KW
1339 if (len == 0) {
1340 len = strlen((const char *) s);
1341 }
1342
1343 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1344 if (el)
1345 *el = len;
1346
1347 if (ep) {
1348 *ep = s + len;
1349 }
1350
1351 return TRUE;
1352 }
1353
1354 {
1355 const U8* const send = s + len;
1356 const U8* x = first_variant;
1357 STRLEN outlen = first_variant - s;
1358
a0d7f935
KW
1359 while (x < send) {
1360 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1361 if (UNLIKELY(! cur_len)) {
1362 break;
1363 }
1364 x += cur_len;
1365 outlen++;
9f2abfde 1366 }
9f2abfde 1367
a0d7f935
KW
1368 if (el)
1369 *el = outlen;
9f2abfde 1370
a0d7f935
KW
1371 if (ep) {
1372 *ep = x;
1373 }
9f2abfde 1374
a0d7f935 1375 return (x == send);
33756530 1376 }
9f2abfde
KW
1377}
1378
1379/*
1380
1381=for apidoc is_utf8_string_loc_flags
1382
1383Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1384case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1385"utf8ness success") in the C<ep> pointer.
1386
1387See also C<L</is_utf8_string_loclen_flags>>.
1388
1389=cut
1390*/
1391
1392#define is_utf8_string_loc_flags(s, len, ep, flags) \
1393 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1394
1395
1396/* The above 3 actual functions could have been moved into the more general one
1397 * just below, and made #defines that call it with the right 'flags'. They are
1398 * currently kept separate to increase their chances of getting inlined */
1399
1400/*
1401
1402=for apidoc is_utf8_string_loclen_flags
1403
1404Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1405case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1406"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1407encoded characters in the C<el> pointer.
1408
1409See also C<L</is_utf8_string_loc_flags>>.
1410
1411=cut
1412*/
1413
1414PERL_STATIC_INLINE bool
c9182d9c 1415Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 1416{
33756530 1417 const U8 * first_variant;
9f2abfde
KW
1418
1419 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1420 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1421 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1422
f60f61fd 1423 if (len == 0) {
a0d7f935 1424 len = strlen((const char *) s);
f60f61fd
KW
1425 }
1426
9f2abfde
KW
1427 if (flags == 0) {
1428 return is_utf8_string_loclen(s, len, ep, el);
1429 }
1430
d044b7a7 1431 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1432 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1433 {
1434 return is_strict_utf8_string_loclen(s, len, ep, el);
1435 }
1436
d044b7a7 1437 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1438 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1439 {
1440 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1441 }
1442
33756530
KW
1443 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1444 if (el)
1445 *el = len;
1446
1447 if (ep) {
1448 *ep = s + len;
1449 }
1450
1451 return TRUE;
1452 }
1453
1454 {
1455 const U8* send = s + len;
1456 const U8* x = first_variant;
1457 STRLEN outlen = first_variant - s;
1458
a0d7f935
KW
1459 while (x < send) {
1460 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1461 if (UNLIKELY(! cur_len)) {
1462 break;
1463 }
1464 x += cur_len;
1465 outlen++;
9f2abfde 1466 }
9f2abfde 1467
a0d7f935
KW
1468 if (el)
1469 *el = outlen;
9f2abfde 1470
a0d7f935
KW
1471 if (ep) {
1472 *ep = x;
1473 }
9f2abfde 1474
a0d7f935 1475 return (x == send);
33756530 1476 }
9f2abfde
KW
1477}
1478
1479/*
7c93d8f0
KW
1480=for apidoc utf8_distance
1481
1482Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1483and C<b>.
1484
1485WARNING: use only if you *know* that the pointers point inside the
1486same UTF-8 buffer.
1487
1488=cut
1489*/
1490
1491PERL_STATIC_INLINE IV
1492Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1493{
1494 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1495
1496 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1497}
1498
1499/*
1500=for apidoc utf8_hop
1501
1502Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1503forward or backward.
1504
1505WARNING: do not use the following unless you *know* C<off> is within
1506the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1507on the first byte of character or just after the last byte of a character.
1508
1509=cut
1510*/
1511
1512PERL_STATIC_INLINE U8 *
1513Perl_utf8_hop(const U8 *s, SSize_t off)
1514{
1515 PERL_ARGS_ASSERT_UTF8_HOP;
1516
1517 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1518 * the bitops (especially ~) can create illegal UTF-8.
1519 * In other words: in Perl UTF-8 is not just for Unicode. */
1520
1521 if (off >= 0) {
1522 while (off--)
1523 s += UTF8SKIP(s);
1524 }
1525 else {
1526 while (off++) {
1527 s--;
1528 while (UTF8_IS_CONTINUATION(*s))
1529 s--;
1530 }
1531 }
e099ea69 1532 GCC_DIAG_IGNORE(-Wcast-qual)
7c93d8f0 1533 return (U8 *)s;
e099ea69 1534 GCC_DIAG_RESTORE
7c93d8f0
KW
1535}
1536
4dab108f 1537/*
65df57a8
TC
1538=for apidoc utf8_hop_forward
1539
1540Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1541forward.
1542
1543C<off> must be non-negative.
1544
1545C<s> must be before or equal to C<end>.
1546
1547When moving forward it will not move beyond C<end>.
1548
1549Will not exceed this limit even if the string is not valid "UTF-8".
1550
1551=cut
1552*/
1553
1554PERL_STATIC_INLINE U8 *
1555Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1556{
1557 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1558
1559 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1560 * the bitops (especially ~) can create illegal UTF-8.
1561 * In other words: in Perl UTF-8 is not just for Unicode. */
1562
1563 assert(s <= end);
1564 assert(off >= 0);
1565
1566 while (off--) {
1567 STRLEN skip = UTF8SKIP(s);
de979548 1568 if ((STRLEN)(end - s) <= skip) {
e099ea69 1569 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1570 return (U8 *)end;
e099ea69 1571 GCC_DIAG_RESTORE
de979548 1572 }
65df57a8
TC
1573 s += skip;
1574 }
1575
e099ea69 1576 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1577 return (U8 *)s;
e099ea69 1578 GCC_DIAG_RESTORE
65df57a8
TC
1579}
1580
1581/*
1582=for apidoc utf8_hop_back
1583
1584Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1585backward.
1586
1587C<off> must be non-positive.
1588
1589C<s> must be after or equal to C<start>.
1590
1591When moving backward it will not move before C<start>.
1592
1593Will not exceed this limit even if the string is not valid "UTF-8".
1594
1595=cut
1596*/
1597
1598PERL_STATIC_INLINE U8 *
1599Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1600{
1601 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1602
1603 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1604 * the bitops (especially ~) can create illegal UTF-8.
1605 * In other words: in Perl UTF-8 is not just for Unicode. */
1606
1607 assert(start <= s);
1608 assert(off <= 0);
1609
1610 while (off++ && s > start) {
e7185695 1611 do {
65df57a8 1612 s--;
e7185695 1613 } while (UTF8_IS_CONTINUATION(*s) && s > start);
65df57a8
TC
1614 }
1615
e099ea69 1616 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1617 return (U8 *)s;
e099ea69 1618 GCC_DIAG_RESTORE
65df57a8
TC
1619}
1620
1621/*
1622=for apidoc utf8_hop_safe
1623
1624Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1625either forward or backward.
1626
1627When moving backward it will not move before C<start>.
1628
1629When moving forward it will not move beyond C<end>.
1630
1631Will not exceed those limits even if the string is not valid "UTF-8".
1632
1633=cut
1634*/
1635
1636PERL_STATIC_INLINE U8 *
1637Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1638{
1639 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1640
1641 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1642 * the bitops (especially ~) can create illegal UTF-8.
1643 * In other words: in Perl UTF-8 is not just for Unicode. */
1644
1645 assert(start <= s && s <= end);
1646
1647 if (off >= 0) {
1648 return utf8_hop_forward(s, off, end);
1649 }
1650 else {
1651 return utf8_hop_back(s, off, start);
1652 }
1653}
1654
1655/*
4dab108f
KW
1656
1657=for apidoc is_utf8_valid_partial_char
1658
6cbb9248
KW
1659Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1660S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1661points. Otherwise, it returns 1 if there exists at least one non-empty
1662sequence of bytes that when appended to sequence C<s>, starting at position
1663C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1664otherwise returns 0.
1665
1666In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1667point.
1668
1669This is useful when a fixed-length buffer is being tested for being well-formed
1670UTF-8, but the final few bytes in it don't comprise a full character; that is,
1671it is split somewhere in the middle of the final code point's UTF-8
1672representation. (Presumably when the buffer is refreshed with the next chunk
1673of data, the new first bytes will complete the partial code point.) This
1674function is used to verify that the final bytes in the current buffer are in
1675fact the legal beginning of some code point, so that if they aren't, the
1676failure can be signalled without having to wait for the next read.
4dab108f
KW
1677
1678=cut
1679*/
2717076a
KW
1680#define is_utf8_valid_partial_char(s, e) \
1681 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
1682
1683/*
1684
1685=for apidoc is_utf8_valid_partial_char_flags
1686
1687Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1688or not the input is a valid UTF-8 encoded partial character, but it takes an
1689extra parameter, C<flags>, which can further restrict which code points are
1690considered valid.
1691
1692If C<flags> is 0, this behaves identically to
1693C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1694of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1695there is any sequence of bytes that can complete the input partial character in
1696such a way that a non-prohibited character is formed, the function returns
2717076a
KW
1697TRUE; otherwise FALSE. Non character code points cannot be determined based on
1698partial character input. But many of the other possible excluded types can be
f1c999a7
KW
1699determined from just the first one or two bytes.
1700
1701=cut
1702 */
1703
56e4cf64 1704PERL_STATIC_INLINE bool
c9182d9c 1705Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
4dab108f 1706{
f1c999a7 1707 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
4dab108f 1708
f1c999a7 1709 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1710 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 1711
8875bd48 1712 if (s >= e || s + UTF8SKIP(s) <= e) {
4dab108f
KW
1713 return FALSE;
1714 }
1715
1376b35c 1716 return cBOOL(is_utf8_char_helper(s, e, flags));
4dab108f
KW
1717}
1718
8bc127bf
KW
1719/*
1720
1721=for apidoc is_utf8_fixed_width_buf_flags
1722
1723Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1724is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1725otherwise it returns FALSE.
1726
1727If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1728without restriction. If the final few bytes of the buffer do not form a
1729complete code point, this will return TRUE anyway, provided that
1730C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1731
1732If C<flags> in non-zero, it can be any combination of the
1733C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1734same meanings.
1735
1736This function differs from C<L</is_utf8_string_flags>> only in that the latter
1737returns FALSE if the final few bytes of the string don't form a complete code
1738point.
1739
1740=cut
1741 */
1742#define is_utf8_fixed_width_buf_flags(s, len, flags) \
1743 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1744
1745/*
1746
1747=for apidoc is_utf8_fixed_width_buf_loc_flags
1748
1749Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1750failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1751to the beginning of any partial character at the end of the buffer; if there is
1752no partial character C<*ep> will contain C<s>+C<len>.
1753
1754See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1755
1756=cut
1757*/
1758
1759#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1760 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1761
1762/*
1763
1764=for apidoc is_utf8_fixed_width_buf_loclen_flags
1765
1766Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1767complete, valid characters found in the C<el> pointer.
1768
1769=cut
1770*/
1771
1772PERL_STATIC_INLINE bool
c9182d9c 1773Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
33756530 1774 STRLEN len,
8bc127bf
KW
1775 const U8 **ep,
1776 STRLEN *el,
1777 const U32 flags)
1778{
1779 const U8 * maybe_partial;
1780
1781 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1782
1783 if (! ep) {
1784 ep = &maybe_partial;
1785 }
1786
1787 /* If it's entirely valid, return that; otherwise see if the only error is
1788 * that the final few bytes are for a partial character */
1789 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1790 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1791}
1792
e6a4ffc3 1793PERL_STATIC_INLINE UV
c9182d9c 1794Perl_utf8n_to_uvchr_msgs(const U8 *s,
e6a4ffc3
KW
1795 STRLEN curlen,
1796 STRLEN *retlen,
1797 const U32 flags,
1798 U32 * errors,
1799 AV ** msgs)
1800{
1801 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
1802 * simple cases, and, if necessary calls a helper function to deal with the
1803 * more complex ones. Almost all well-formed non-problematic code points
1804 * are considered simple, so that it's unlikely that the helper function
1805 * will need to be called.
1806 *
1807 * This is an adaptation of the tables and algorithm given in
1808 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
1809 * comprehensive documentation of the original version. A copyright notice
1810 * for the original version is given at the beginning of this file. The
71525f77 1811 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
e6a4ffc3
KW
1812 */
1813
1814 const U8 * const s0 = s;
1815 const U8 * send = s0 + curlen;
f73cfe7d 1816 UV uv = 0; /* The 0 silences some stupid compilers */
e6a4ffc3
KW
1817 UV state = 0;
1818
1819 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1820
1821 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1822 * non-problematic code point, which can be returned immediately.
1823 * Otherwise we call a helper function to figure out the more complicated
1824 * cases. */
1825
1826 while (s < send && LIKELY(state != 1)) {
71525f77 1827 UV type = PL_strict_utf8_dfa_tab[*s];
e6a4ffc3
KW
1828
1829 uv = (state == 0)
1830 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1831 : UTF8_ACCUMULATE(uv, *s);
71525f77 1832 state = PL_strict_utf8_dfa_tab[256 + state + type];
e6a4ffc3
KW
1833
1834 if (state != 0) {
1835 s++;
1836 continue;
1837 }
1838
1839 if (retlen) {
1840 *retlen = s - s0 + 1;
1841 }
1842 if (errors) {
1843 *errors = 0;
1844 }
1845 if (msgs) {
1846 *msgs = NULL;
1847 }
1848
cfebc7aa 1849 return UNI_TO_NATIVE(uv);
e6a4ffc3
KW
1850 }
1851
1852 /* Here is potentially problematic. Use the full mechanism */
1853 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
1854}
1855
82651abe 1856PERL_STATIC_INLINE UV
9a9a6c98 1857Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
82651abe 1858{
9a9a6c98 1859 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
82651abe
KW
1860
1861 assert(s < send);
1862
1863 if (! ckWARN_d(WARN_UTF8)) {
3eaa7592
KW
1864
1865 /* EMPTY is not really allowed, and asserts on debugging builds. But
1866 * on non-debugging we have to deal with it, and this causes it to
1867 * return the REPLACEMENT CHARACTER, as the documentation indicates */
82651abe 1868 return utf8n_to_uvchr(s, send - s, retlen,
3eaa7592 1869 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
82651abe
KW
1870 }
1871 else {
1872 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
1873 if (retlen && ret == 0 && *s != '\0') {
1874 *retlen = (STRLEN) -1;
1875 }
1876
1877 return ret;
1878 }
1879}
1880
c8028aa6
TC
1881/* ------------------------------- perl.h ----------------------------- */
1882
1883/*
dcccc8ff
KW
1884=head1 Miscellaneous Functions
1885
44170c9a 1886=for apidoc is_safe_syscall
c8028aa6 1887
6602b933 1888Test that the given C<pv> doesn't contain any internal C<NUL> characters.
796b6530 1889If it does, set C<errno> to C<ENOENT>, optionally warn, and return FALSE.
c8028aa6
TC
1890
1891Return TRUE if the name is safe.
1892
796b6530 1893Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
1894
1895=cut
1896*/
1897
1898PERL_STATIC_INLINE bool
ffd62fc2
KW
1899Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
1900{
c8028aa6
TC
1901 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1902 * perl itself uses xce*() functions which accept 8-bit strings.
1903 */
1904
1905 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1906
6c4650b3 1907 if (len > 1) {
c8028aa6 1908 char *null_at;
41188aa0 1909 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 1910 SETERRNO(ENOENT, LIB_INVARG);
1d505182 1911 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 1912 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 1913 what, op_name, pv, null_at+1);
c8028aa6
TC
1914 return FALSE;
1915 }
1916 }
1917
1918 return TRUE;
1919}
1920
1921/*
7cb3f959
TC
1922
1923Return true if the supplied filename has a newline character
fa6c7d00 1924immediately before the first (hopefully only) NUL.
7cb3f959
TC
1925
1926My original look at this incorrectly used the len from SvPV(), but
1927that's incorrect, since we allow for a NUL in pv[len-1].
1928
1929So instead, strlen() and work from there.
1930
1931This allow for the user reading a filename, forgetting to chomp it,
1932then calling:
1933
1934 open my $foo, "$file\0";
1935
1936*/
1937
1938#ifdef PERL_CORE
1939
1940PERL_STATIC_INLINE bool
ffd62fc2
KW
1941S_should_warn_nl(const char *pv)
1942{
7cb3f959
TC
1943 STRLEN len;
1944
1945 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1946
1947 len = strlen(pv);
1948
1949 return len > 0 && pv[len-1] == '\n';
1950}
1951
1952#endif
1953
3a019afd
KW
1954#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
1955
1956PERL_STATIC_INLINE bool
1957S_lossless_NV_to_IV(const NV nv, IV *ivp)
1958{
1959 /* This function determines if the input NV 'nv' may be converted without
1960 * loss of data to an IV. If not, it returns FALSE taking no other action.
1961 * But if it is possible, it does the conversion, returning TRUE, and
1962 * storing the converted result in '*ivp' */
1963
1964 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
1965
1966# if defined(Perl_isnan)
1967
1968 if (UNLIKELY(Perl_isnan(nv))) {
1969 return FALSE;
1970 }
1971
1972# endif
1973
1974 if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) {
1975 return FALSE;
1976 }
1977
1978 if ((IV) nv != nv) {
1979 return FALSE;
1980 }
1981
1982 *ivp = (IV) nv;
1983 return TRUE;
1984}
1985
1986#endif
1987
81d52ecd
JH
1988/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
1989
94b0cb42
KW
1990#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
1991
81d52ecd
JH
1992#define MAX_CHARSET_NAME_LENGTH 2
1993
1994PERL_STATIC_INLINE const char *
94b0cb42 1995S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
81d52ecd 1996{
94b0cb42
KW
1997 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
1998
81d52ecd
JH
1999 /* Returns a string that corresponds to the name of the regex character set
2000 * given by 'flags', and *lenp is set the length of that string, which
2001 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2002
2003 *lenp = 1;
2004 switch (get_regex_charset(flags)) {
2005 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2006 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2007 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2008 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2009 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2010 *lenp = 2;
2011 return ASCII_MORE_RESTRICT_PAT_MODS;
2012 }
2013 /* The NOT_REACHED; hides an assert() which has a rather complex
2014 * definition in perl.h. */
2015 NOT_REACHED; /* NOTREACHED */
2016 return "?"; /* Unknown */
2017}
2018
94b0cb42
KW
2019#endif
2020
7cb3f959 2021/*
ed382232
TC
2022
2023Return false if any get magic is on the SV other than taint magic.
2024
2025*/
2026
2027PERL_STATIC_INLINE bool
ffd62fc2
KW
2028Perl_sv_only_taint_gmagic(SV *sv)
2029{
ed382232
TC
2030 MAGIC *mg = SvMAGIC(sv);
2031
2032 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2033
2034 while (mg) {
2035 if (mg->mg_type != PERL_MAGIC_taint
2036 && !(mg->mg_flags & MGf_GSKIP)
2037 && mg->mg_virtual->svt_get) {
2038 return FALSE;
2039 }
2040 mg = mg->mg_moremagic;
2041 }
2042
2043 return TRUE;
2044}
2045
ed8ff0f3
DM
2046/* ------------------ cop.h ------------------------------------------- */
2047
2048
2049/* Enter a block. Push a new base context and return its address. */
2050
2051PERL_STATIC_INLINE PERL_CONTEXT *
c9182d9c 2052Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
ed8ff0f3
DM
2053{
2054 PERL_CONTEXT * cx;
2055
2056 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2057
2058 CXINC;
2059 cx = CX_CUR();
2060 cx->cx_type = type;
2061 cx->blk_gimme = gimme;
2062 cx->blk_oldsaveix = saveix;
4caf7d8c 2063 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 2064 cx->blk_oldcop = PL_curcop;
4caf7d8c 2065 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
2066 cx->blk_oldscopesp = PL_scopestack_ix;
2067 cx->blk_oldpm = PL_curpm;
ce8bb8d8 2068 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
2069
2070 PL_tmps_floor = PL_tmps_ix;
2071 CX_DEBUG(cx, "PUSH");
2072 return cx;
2073}
2074
2075
2076/* Exit a block (RETURN and LAST). */
2077
2078PERL_STATIC_INLINE void
c9182d9c 2079Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2080{
2081 PERL_ARGS_ASSERT_CX_POPBLOCK;
2082
2083 CX_DEBUG(cx, "POP");
2084 /* these 3 are common to cx_popblock and cx_topblock */
2085 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2086 PL_scopestack_ix = cx->blk_oldscopesp;
2087 PL_curpm = cx->blk_oldpm;
2088
2089 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2090 * and leaves a CX entry lying around for repeated use, so
2091 * skip for multicall */ \
2092 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2093 || PL_savestack_ix == cx->blk_oldsaveix);
2094 PL_curcop = cx->blk_oldcop;
ce8bb8d8 2095 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
2096}
2097
2098/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2099 * Whereas cx_popblock() restores the state to the point just before
2100 * cx_pushblock() was called, cx_topblock() restores it to the point just
2101 * *after* cx_pushblock() was called. */
2102
2103PERL_STATIC_INLINE void
c9182d9c 2104Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2105{
2106 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2107
2108 CX_DEBUG(cx, "TOP");
2109 /* these 3 are common to cx_popblock and cx_topblock */
2110 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2111 PL_scopestack_ix = cx->blk_oldscopesp;
2112 PL_curpm = cx->blk_oldpm;
2113
2114 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2115}
2116
2117
a73d8813 2118PERL_STATIC_INLINE void
c9182d9c 2119Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
a73d8813
DM
2120{
2121 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2122
2123 PERL_ARGS_ASSERT_CX_PUSHSUB;
2124
3f6bd23a 2125 PERL_DTRACE_PROBE_ENTRY(cv);
a73d8813
DM
2126 cx->blk_sub.cv = cv;
2127 cx->blk_sub.olddepth = CvDEPTH(cv);
2128 cx->blk_sub.prevcomppad = PL_comppad;
2129 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2130 cx->blk_sub.retop = retop;
2131 SvREFCNT_inc_simple_void_NN(cv);
2132 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2133}
2134
2135
2136/* subsets of cx_popsub() */
2137
2138PERL_STATIC_INLINE void
c9182d9c 2139Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2140{
2141 CV *cv;
2142
2143 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2144 assert(CxTYPE(cx) == CXt_SUB);
2145
2146 PL_comppad = cx->blk_sub.prevcomppad;
2147 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2148 cv = cx->blk_sub.cv;
2149 CvDEPTH(cv) = cx->blk_sub.olddepth;
2150 cx->blk_sub.cv = NULL;
2151 SvREFCNT_dec(cv);
2152}
2153
2154
2155/* handle the @_ part of leaving a sub */
2156
2157PERL_STATIC_INLINE void
c9182d9c 2158Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2159{
2160 AV *av;
2161
2162 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2163 assert(CxTYPE(cx) == CXt_SUB);
2164 assert(AvARRAY(MUTABLE_AV(
2165 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2166 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2167
2168 CX_POP_SAVEARRAY(cx);
2169 av = MUTABLE_AV(PAD_SVl(0));
2170 if (UNLIKELY(AvREAL(av)))
2171 /* abandon @_ if it got reified */
2172 clear_defarray(av, 0);
2173 else {
2174 CLEAR_ARGARRAY(av);
2175 }
2176}
2177
2178
2179PERL_STATIC_INLINE void
c9182d9c 2180Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2181{
2182 PERL_ARGS_ASSERT_CX_POPSUB;
2183 assert(CxTYPE(cx) == CXt_SUB);
2184
3f6bd23a 2185 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
2186
2187 if (CxHASARGS(cx))
2188 cx_popsub_args(cx);
2189 cx_popsub_common(cx);
2190}
2191
2192
6a7d52cc 2193PERL_STATIC_INLINE void
c9182d9c 2194Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
6a7d52cc
DM
2195{
2196 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2197
2198 cx->blk_format.cv = cv;
2199 cx->blk_format.retop = retop;
2200 cx->blk_format.gv = gv;
2201 cx->blk_format.dfoutgv = PL_defoutgv;
2202 cx->blk_format.prevcomppad = PL_comppad;
2203 cx->blk_u16 = 0;
2204
2205 SvREFCNT_inc_simple_void_NN(cv);
2206 CvDEPTH(cv)++;
2207 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2208}
2209
2210
2211PERL_STATIC_INLINE void
c9182d9c 2212Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
6a7d52cc
DM
2213{
2214 CV *cv;
2215 GV *dfout;
2216
2217 PERL_ARGS_ASSERT_CX_POPFORMAT;
2218 assert(CxTYPE(cx) == CXt_FORMAT);
2219
2220 dfout = cx->blk_format.dfoutgv;
2221 setdefout(dfout);
2222 cx->blk_format.dfoutgv = NULL;
2223 SvREFCNT_dec_NN(dfout);
2224
2225 PL_comppad = cx->blk_format.prevcomppad;
2226 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2227 cv = cx->blk_format.cv;
2228 cx->blk_format.cv = NULL;
2229 --CvDEPTH(cv);
2230 SvREFCNT_dec_NN(cv);
2231}
2232
2233
13febba5 2234PERL_STATIC_INLINE void
c9182d9c 2235Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
13febba5
DM
2236{
2237 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2238
2239 cx->blk_eval.retop = retop;
2240 cx->blk_eval.old_namesv = namesv;
2241 cx->blk_eval.old_eval_root = PL_eval_root;
2242 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2243 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2244 cx->blk_eval.cur_top_env = PL_top_env;
2245
4c57ced5 2246 assert(!(PL_in_eval & ~ 0x3F));
13febba5 2247 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 2248 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
2249}
2250
2251
2252PERL_STATIC_INLINE void
c9182d9c 2253Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
13febba5
DM
2254{
2255 SV *sv;
2256
2257 PERL_ARGS_ASSERT_CX_POPEVAL;
2258 assert(CxTYPE(cx) == CXt_EVAL);
2259
2260 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 2261 assert(!(PL_in_eval & 0xc0));
13febba5
DM
2262 PL_eval_root = cx->blk_eval.old_eval_root;
2263 sv = cx->blk_eval.cur_text;
4c57ced5 2264 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
2265 cx->blk_eval.cur_text = NULL;
2266 SvREFCNT_dec_NN(sv);
2267 }
2268
2269 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
2270 if (sv) {
2271 cx->blk_eval.old_namesv = NULL;
2272 SvREFCNT_dec_NN(sv);
2273 }
13febba5 2274}
6a7d52cc 2275
a73d8813 2276
d1b6bf72
DM
2277/* push a plain loop, i.e.
2278 * { block }
2279 * while (cond) { block }
2280 * for (init;cond;continue) { block }
2281 * This loop can be last/redo'ed etc.
2282 */
2283
2284PERL_STATIC_INLINE void
c9182d9c 2285Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2286{
2287 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2288 cx->blk_loop.my_op = cLOOP;
2289}
2290
2291
2292/* push a true for loop, i.e.
2293 * for var (list) { block }
2294 */
2295
2296PERL_STATIC_INLINE void
c9182d9c 2297Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
d1b6bf72
DM
2298{
2299 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2300
2301 /* this one line is common with cx_pushloop_plain */
2302 cx->blk_loop.my_op = cLOOP;
2303
2304 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2305 cx->blk_loop.itersave = itersave;
2306#ifdef USE_ITHREADS
2307 cx->blk_loop.oldcomppad = PL_comppad;
2308#endif
2309}
2310
2311
2312/* pop all loop types, including plain */
2313
2314PERL_STATIC_INLINE void
c9182d9c 2315Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2316{
2317 PERL_ARGS_ASSERT_CX_POPLOOP;
2318
2319 assert(CxTYPE_is_LOOP(cx));
2320 if ( CxTYPE(cx) == CXt_LOOP_ARY
2321 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2322 {
2323 /* Free ary or cur. This assumes that state_u.ary.ary
2324 * aligns with state_u.lazysv.cur. See cx_dup() */
2325 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2326 cx->blk_loop.state_u.lazysv.cur = NULL;
2327 SvREFCNT_dec_NN(sv);
2328 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2329 sv = cx->blk_loop.state_u.lazysv.end;
2330 cx->blk_loop.state_u.lazysv.end = NULL;
2331 SvREFCNT_dec_NN(sv);
2332 }
2333 }
2334 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2335 SV *cursv;
2336 SV **svp = (cx)->blk_loop.itervar_u.svp;
2337 if ((cx->cx_type & CXp_FOR_GV))
2338 svp = &GvSV((GV*)svp);
2339 cursv = *svp;
2340 *svp = cx->blk_loop.itersave;
2341 cx->blk_loop.itersave = NULL;
2342 SvREFCNT_dec(cursv);
2343 }
2344}
2345
2a7b7c61
DM
2346
2347PERL_STATIC_INLINE void
c9182d9c 2348Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2349{
7896dde7 2350 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2a7b7c61 2351
7896dde7 2352 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2a7b7c61
DM
2353}
2354
2355
2356PERL_STATIC_INLINE void
c9182d9c 2357Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2358{
7896dde7
Z
2359 PERL_ARGS_ASSERT_CX_POPWHEN;
2360 assert(CxTYPE(cx) == CXt_WHEN);
2a7b7c61
DM
2361
2362 PERL_UNUSED_ARG(cx);
59a14f30 2363 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
2364 /* currently NOOP */
2365}
2366
2367
7896dde7 2368PERL_STATIC_INLINE void
c9182d9c 2369Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
7896dde7
Z
2370{
2371 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2372
2373 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2374 cx->blk_givwhen.defsv_save = orig_defsv;
2375}
2376
2377
2378PERL_STATIC_INLINE void
c9182d9c 2379Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
7896dde7
Z
2380{
2381 SV *sv;
2382
2383 PERL_ARGS_ASSERT_CX_POPGIVEN;
2384 assert(CxTYPE(cx) == CXt_GIVEN);
2385
2386 sv = GvSV(PL_defgv);
2387 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2388 cx->blk_givwhen.defsv_save = NULL;
2389 SvREFCNT_dec(sv);
2390}
2391
ec2c235b
KW
2392/* ------------------ util.h ------------------------------------------- */
2393
2394/*
2395=head1 Miscellaneous Functions
2396
2397=for apidoc foldEQ
2398
2399Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2400same
2401case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
2402match themselves and their opposite case counterparts. Non-cased and non-ASCII
2403range bytes match only themselves.
2404
2405=cut
2406*/
2407
2408PERL_STATIC_INLINE I32
2409Perl_foldEQ(const char *s1, const char *s2, I32 len)
2410{
2411 const U8 *a = (const U8 *)s1;
2412 const U8 *b = (const U8 *)s2;
2413
2414 PERL_ARGS_ASSERT_FOLDEQ;
2415
2416 assert(len >= 0);
2417
2418 while (len--) {
2419 if (*a != *b && *a != PL_fold[*b])
2420 return 0;
2421 a++,b++;
2422 }
2423 return 1;
2424}
2425
0f9cb40c 2426PERL_STATIC_INLINE I32
ec2c235b
KW
2427Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2428{
79a1fabd
KW
2429 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
2430 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2431 * does not check for this. Nor does it check that the strings each have
2432 * at least 'len' characters. */
ec2c235b
KW
2433
2434 const U8 *a = (const U8 *)s1;
2435 const U8 *b = (const U8 *)s2;
2436
2437 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2438
2439 assert(len >= 0);
2440
2441 while (len--) {
2442 if (*a != *b && *a != PL_fold_latin1[*b]) {
2443 return 0;
2444 }
2445 a++, b++;
2446 }
2447 return 1;
2448}
2449
2450/*
2451=for apidoc foldEQ_locale
2452
2453Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2454same case-insensitively in the current locale; false otherwise.
2455
2456=cut
2457*/
2458
0f9cb40c 2459PERL_STATIC_INLINE I32
ec2c235b
KW
2460Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2461{
2462 dVAR;
2463 const U8 *a = (const U8 *)s1;
2464 const U8 *b = (const U8 *)s2;
2465
2466 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2467
2468 assert(len >= 0);
2469
2470 while (len--) {
2471 if (*a != *b && *a != PL_fold_locale[*b])
2472 return 0;
2473 a++,b++;
2474 }
2475 return 1;
2476}
2477
6dba01e2
KW
2478#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2479
2480PERL_STATIC_INLINE void *
2481S_my_memrchr(const char * s, const char c, const STRLEN len)
2482{
2483 /* memrchr(), since many platforms lack it */
2484
2485 const char * t = s + len - 1;
2486
2487 PERL_ARGS_ASSERT_MY_MEMRCHR;
2488
2489 while (t >= s) {
2490 if (*t == c) {
2491 return (void *) t;
2492 }
2493 t--;
2494 }
2495
2496 return NULL;
2497}
2498
2499#endif
2500
ed382232 2501/*
c8028aa6
TC
2502 * ex: set ts=8 sts=4 sw=4 et:
2503 */