This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add Perl_ to SvAMAGIC _off,_on names
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8ed185f9
KW
8 * This file contains tables and code adapted from
9 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
10 * copyright notice:
11
12Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14Permission is hereby granted, free of charge, to any person obtaining a copy of
15this software and associated documentation files (the "Software"), to deal in
16the Software without restriction, including without limitation the rights to
17use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18of the Software, and to permit persons to whom the Software is furnished to do
19so, subject to the following conditions:
20
21The above copyright notice and this permission notice shall be included in all
22copies or substantial portions of the Software.
23
24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30SOFTWARE.
31
32 *
25468daa 33 * This file is a home for static inline functions that cannot go in other
e15e54ff 34 * header files, because they depend on proto.h (included after most other
25468daa
FC
35 * headers) or struct definitions.
36 *
37 * Each section names the header file that the functions "belong" to.
38 */
27669aa4 39
be3a7a5d
KW
40/* ------------------------------- av.h ------------------------------- */
41
c70927a6 42PERL_STATIC_INLINE SSize_t
c9182d9c 43Perl_av_top_index(pTHX_ AV *av)
be3a7a5d
KW
44{
45 PERL_ARGS_ASSERT_AV_TOP_INDEX;
46 assert(SvTYPE(av) == SVt_PVAV);
47
48 return AvFILL(av);
49}
50
1afe1db1
FC
51/* ------------------------------- cv.h ------------------------------- */
52
ae77754a 53PERL_STATIC_INLINE GV *
c9182d9c 54Perl_CvGV(pTHX_ CV *sv)
ae77754a 55{
74804ad1
KW
56 PERL_ARGS_ASSERT_CVGV;
57
ae77754a
FC
58 return CvNAMED(sv)
59 ? Perl_cvgv_from_hek(aTHX_ sv)
60 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
61}
62
1afe1db1 63PERL_STATIC_INLINE I32 *
74804ad1 64Perl_CvDEPTH(const CV * const sv)
1afe1db1 65{
74804ad1 66 PERL_ARGS_ASSERT_CVDEPTH;
1afe1db1 67 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
74804ad1 68
8de47657 69 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
70}
71
d16269d8
PM
72/*
73 CvPROTO returns the prototype as stored, which is not necessarily what
74 the interpreter should be using. Specifically, the interpreter assumes
75 that spaces have been stripped, which has been the case if the prototype
76 was added by toke.c, but is generally not the case if it was added elsewhere.
77 Since we can't enforce the spacelessness at assignment time, this routine
78 provides a temporary copy at parse time with spaces removed.
79 I<orig> is the start of the original buffer, I<len> is the length of the
80 prototype and will be updated when this returns.
81 */
82
5b67adb8 83#ifdef PERL_CORE
d16269d8
PM
84PERL_STATIC_INLINE char *
85S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
86{
87 SV * tmpsv;
88 char * tmps;
89 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
90 tmps = SvPVX(tmpsv);
91 while ((*len)--) {
92 if (!isSPACE(*orig))
93 *tmps++ = *orig;
94 orig++;
95 }
96 *tmps = '\0';
97 *len = tmps - SvPVX(tmpsv);
98 return SvPVX(tmpsv);
99}
5b67adb8 100#endif
d16269d8 101
25fdce4a
FC
102/* ------------------------------- mg.h ------------------------------- */
103
104#if defined(PERL_CORE) || defined(PERL_EXT)
105/* assumes get-magic and stringification have already occurred */
106PERL_STATIC_INLINE STRLEN
107S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
108{
109 assert(mg->mg_type == PERL_MAGIC_regex_global);
110 assert(mg->mg_len != -1);
111 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
112 return (STRLEN)mg->mg_len;
113 else {
114 const STRLEN pos = (STRLEN)mg->mg_len;
115 /* Without this check, we may read past the end of the buffer: */
116 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
117 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
118 }
119}
120#endif
121
03414f05
FC
122/* ------------------------------- pad.h ------------------------------ */
123
124#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
125PERL_STATIC_INLINE bool
b9d5702c 126S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
03414f05 127{
b9d5702c
KW
128 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
129
03414f05
FC
130 /* is seq within the range _LOW to _HIGH ?
131 * This is complicated by the fact that PL_cop_seqmax
132 * may have wrapped around at some point */
133 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
134 return FALSE; /* not yet introduced */
135
136 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
137 /* in compiling scope */
138 if (
139 (seq > COP_SEQ_RANGE_LOW(pn))
140 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
141 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
142 )
143 return TRUE;
144 }
145 else if (
146 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
147 ?
148 ( seq > COP_SEQ_RANGE_LOW(pn)
149 || seq <= COP_SEQ_RANGE_HIGH(pn))
150
151 : ( seq > COP_SEQ_RANGE_LOW(pn)
152 && seq <= COP_SEQ_RANGE_HIGH(pn))
153 )
154 return TRUE;
155 return FALSE;
156}
157#endif
158
33a4312b
FC
159/* ------------------------------- pp.h ------------------------------- */
160
161PERL_STATIC_INLINE I32
c9182d9c 162Perl_TOPMARK(pTHX)
33a4312b
FC
163{
164 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 165 "MARK top %p %" IVdf "\n",
33a4312b
FC
166 PL_markstack_ptr,
167 (IV)*PL_markstack_ptr)));
168 return *PL_markstack_ptr;
169}
170
171PERL_STATIC_INLINE I32
c9182d9c 172Perl_POPMARK(pTHX)
33a4312b
FC
173{
174 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 175 "MARK pop %p %" IVdf "\n",
33a4312b
FC
176 (PL_markstack_ptr-1),
177 (IV)*(PL_markstack_ptr-1))));
178 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
179 return *PL_markstack_ptr--;
180}
181
8d919b0a
FC
182/* ----------------------------- regexp.h ----------------------------- */
183
184PERL_STATIC_INLINE struct regexp *
c9182d9c 185Perl_ReANY(const REGEXP * const re)
8d919b0a 186{
df6b4bd5 187 XPV* const p = (XPV*)SvANY(re);
8d919b0a 188 assert(isREGEXP(re));
df6b4bd5
DM
189 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
190 : (struct regexp *)p;
8d919b0a
FC
191}
192
27669aa4
FC
193/* ------------------------------- sv.h ------------------------------- */
194
195PERL_STATIC_INLINE SV *
c9182d9c 196Perl_SvREFCNT_inc(SV *sv)
27669aa4 197{
2439e033 198 if (LIKELY(sv != NULL))
27669aa4
FC
199 SvREFCNT(sv)++;
200 return sv;
201}
202PERL_STATIC_INLINE SV *
c9182d9c 203Perl_SvREFCNT_inc_NN(SV *sv)
27669aa4 204{
3f2f854a
KW
205 PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
206
27669aa4
FC
207 SvREFCNT(sv)++;
208 return sv;
209}
210PERL_STATIC_INLINE void
c9182d9c 211Perl_SvREFCNT_inc_void(SV *sv)
27669aa4 212{
2439e033 213 if (LIKELY(sv != NULL))
27669aa4
FC
214 SvREFCNT(sv)++;
215}
75e16a44 216PERL_STATIC_INLINE void
c9182d9c 217Perl_SvREFCNT_dec(pTHX_ SV *sv)
75e16a44 218{
2439e033 219 if (LIKELY(sv != NULL)) {
75a9bf96 220 U32 rc = SvREFCNT(sv);
79e2a32a 221 if (LIKELY(rc > 1))
75a9bf96
DM
222 SvREFCNT(sv) = rc - 1;
223 else
224 Perl_sv_free2(aTHX_ sv, rc);
75e16a44
FC
225 }
226}
541377b1
FC
227
228PERL_STATIC_INLINE void
c9182d9c 229Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
4a9a56a7
DM
230{
231 U32 rc = SvREFCNT(sv);
3f2f854a
KW
232
233 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
234
79e2a32a 235 if (LIKELY(rc > 1))
4a9a56a7
DM
236 SvREFCNT(sv) = rc - 1;
237 else
238 Perl_sv_free2(aTHX_ sv, rc);
239}
240
241PERL_STATIC_INLINE void
1bd041dc 242Perl_SvAMAGIC_on(SV *sv)
541377b1 243{
1bd041dc 244 PERL_ARGS_ASSERT_SVAMAGIC_ON;
541377b1 245 assert(SvROK(sv));
1bd041dc 246
541377b1
FC
247 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
248}
249PERL_STATIC_INLINE void
1bd041dc 250Perl_SvAMAGIC_off(SV *sv)
541377b1 251{
1bd041dc
KW
252 PERL_ARGS_ASSERT_SVAMAGIC_OFF;
253
541377b1
FC
254 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
255 HvAMAGIC_off(SvSTASH(SvRV(sv)));
256}
257
258PERL_STATIC_INLINE U32
c9182d9c 259Perl_SvPADSTALE_on(SV *sv)
541377b1 260{
c0683843 261 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
262 return SvFLAGS(sv) |= SVs_PADSTALE;
263}
264PERL_STATIC_INLINE U32
c9182d9c 265Perl_SvPADSTALE_off(SV *sv)
541377b1 266{
c0683843 267 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
268 return SvFLAGS(sv) &= ~SVs_PADSTALE;
269}
25fdce4a 270#if defined(PERL_CORE) || defined (PERL_EXT)
4ddea69a 271PERL_STATIC_INLINE STRLEN
6964422a 272S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
4ddea69a 273{
25fdce4a 274 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
4ddea69a
FC
275 if (SvGAMAGIC(sv)) {
276 U8 *hopped = utf8_hop((U8 *)pv, pos);
277 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
278 return (STRLEN)(hopped - (U8 *)pv);
279 }
280 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
281}
282#endif
f019c49e 283
d1decf2b
TC
284/* ------------------------------- handy.h ------------------------------- */
285
286/* saves machine code for a common noreturn idiom typically used in Newx*() */
7347ee54 287GCC_DIAG_IGNORE_DECL(-Wunused-function);
d1decf2b 288static void
c9182d9c 289Perl_croak_memory_wrap(void)
d1decf2b
TC
290{
291 Perl_croak_nocontext("%s",PL_memory_wrap);
292}
7347ee54 293GCC_DIAG_RESTORE_DECL;
d1decf2b 294
a8a2ceaa
KW
295/* ------------------------------- utf8.h ------------------------------- */
296
2fe720e2
KW
297/*
298=head1 Unicode Support
299*/
300
55d09dc8 301PERL_STATIC_INLINE void
c9182d9c 302Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
55d09dc8
KW
303{
304 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
305 * encoded string at '*dest', updating '*dest' to include it */
306
55d09dc8
KW
307 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
308
6f2d5cbc 309 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 310 *((*dest)++) = byte;
55d09dc8 311 else {
a09ec51a
KW
312 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
313 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
314 }
315}
316
e123187a 317/*
2fe720e2 318=for apidoc valid_utf8_to_uvchr
09232555
KW
319Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
320known that the next character in the input UTF-8 string C<s> is well-formed
321(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
322points, and non-Unicode code points are allowed.
2fe720e2
KW
323
324=cut
325
326 */
327
328PERL_STATIC_INLINE UV
329Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
330{
c41b2540 331 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
332 const U8* send = s + expectlen;
333 UV uv = *s;
334
335 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
336
337 if (retlen) {
338 *retlen = expectlen;
339 }
340
341 /* An invariant is trivially returned */
342 if (expectlen == 1) {
343 return uv;
344 }
345
346 /* Remove the leading bits that indicate the number of bytes, leaving just
347 * the bits that are part of the value */
348 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
349
350 /* Now, loop through the remaining bytes, accumulating each into the
351 * working total as we go. (I khw tried unrolling the loop for up to 4
352 * bytes, but there was no performance improvement) */
353 for (++s; s < send; s++) {
354 uv = UTF8_ACCUMULATE(uv, *s);
355 }
356
357 return UNI_TO_NATIVE(uv);
358
359}
360
1e599354
KW
361/*
362=for apidoc is_utf8_invariant_string
363
82c5d941 364Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 365regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
366EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
367are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
368the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
369characters are invariant, but so also are the C1 controls.
1e599354
KW
370
371If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
372use this option, that C<s> can't have embedded C<NUL> characters and has to
373have a terminating C<NUL> byte).
374
9f2abfde
KW
375See also
376C<L</is_utf8_string>>,
377C<L</is_utf8_string_flags>>,
378C<L</is_utf8_string_loc>>,
379C<L</is_utf8_string_loc_flags>>,
380C<L</is_utf8_string_loclen>>,
381C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
382C<L</is_utf8_fixed_width_buf_flags>>,
383C<L</is_utf8_fixed_width_buf_loc_flags>>,
384C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
385C<L</is_strict_utf8_string>>,
386C<L</is_strict_utf8_string_loc>>,
387C<L</is_strict_utf8_string_loclen>>,
388C<L</is_c9strict_utf8_string>>,
389C<L</is_c9strict_utf8_string_loc>>,
390and
391C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
392
393=cut
0cbf5865
KW
394
395*/
396
397#define is_utf8_invariant_string(s, len) \
398 is_utf8_invariant_string_loc(s, len, NULL)
399
400/*
401=for apidoc is_utf8_invariant_string_loc
402
403Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
404the first UTF-8 variant character in the C<ep> pointer; if all characters are
405UTF-8 invariant, this function does not change the contents of C<*ep>.
406
407=cut
408
1e599354
KW
409*/
410
411PERL_STATIC_INLINE bool
c9182d9c 412Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 413{
e17544a6 414 const U8* send;
1e599354
KW
415 const U8* x = s;
416
0cbf5865
KW
417 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
418
e17544a6
KW
419 if (len == 0) {
420 len = strlen((const char *)s);
421 }
422
423 send = s + len;
424
4ab2fd9b 425/* This looks like 0x010101... */
2c5c8af5 426# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
4ab2fd9b
KW
427
428/* This looks like 0x808080... */
2c5c8af5 429# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
e099ea69 430# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
2c5c8af5 431# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
e17544a6 432
099e59a4
KW
433/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
434 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
435 * optimized out completely on a 32-bit system, and its mask gets optimized out
436 * on a 64-bit system */
2c5c8af5 437# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
5eabe374
KW
438 | ( PTR2nat(x) >> 1) \
439 | ( ( (PTR2nat(x) \
440 & PERL_WORD_BOUNDARY_MASK) >> 2))))
099e59a4 441
3f515a2e
KW
442#ifndef EBCDIC
443
099e59a4
KW
444 /* Do the word-at-a-time iff there is at least one usable full word. That
445 * means that after advancing to a word boundary, there still is at least a
446 * full word left. The number of bytes needed to advance is 'wordsize -
447 * offset' unless offset is 0. */
448 if ((STRLEN) (send - x) >= PERL_WORDSIZE
449
450 /* This term is wordsize if subword; 0 if not */
451 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
452
453 /* 'offset' */
454 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
455 {
b40579ff 456
46bb68f6
KW
457 /* Process per-byte until reach word boundary. XXX This loop could be
458 * eliminated if we knew that this platform had fast unaligned reads */
b40579ff 459 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
46bb68f6
KW
460 if (! UTF8_IS_INVARIANT(*x)) {
461 if (ep) {
462 *ep = x;
463 }
e17544a6 464
46bb68f6
KW
465 return FALSE;
466 }
467 x++;
e17544a6 468 }
e17544a6 469
099e59a4
KW
470 /* Here, we know we have at least one full word to process. Process
471 * per-word as long as we have at least a full word left */
472 do {
4ab2fd9b 473 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
e17544a6 474
46bb68f6
KW
475 /* Found a variant. Just return if caller doesn't want its
476 * exact position */
477 if (! ep) {
478 return FALSE;
479 }
e17544a6 480
2c5c8af5
KW
481# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
482 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1d2af574 483
73f0a2eb 484 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
1d2af574
KW
485 assert(*ep >= s && *ep < send);
486
487 return FALSE;
488
2c5c8af5 489# else /* If weird byte order, drop into next loop to do byte-at-a-time
1d2af574
KW
490 checks. */
491
46bb68f6 492 break;
2c5c8af5 493# endif
46bb68f6 494 }
1d2af574 495
46bb68f6 496 x += PERL_WORDSIZE;
1d2af574 497
099e59a4 498 } while (x + PERL_WORDSIZE <= send);
b40579ff 499 }
e17544a6 500
0b08cab0 501#endif /* End of ! EBCDIC */
e17544a6
KW
502
503 /* Process per-byte */
504 while (x < send) {
505 if (! UTF8_IS_INVARIANT(*x)) {
506 if (ep) {
507 *ep = x;
508 }
0cbf5865 509
e17544a6 510 return FALSE;
0cbf5865 511 }
1e599354 512
e17544a6 513 x++;
1e599354
KW
514 }
515
516 return TRUE;
517}
518
23a7ee81
KW
519#ifndef EBCDIC
520
1d2af574 521PERL_STATIC_INLINE unsigned int
73f0a2eb 522Perl_variant_byte_number(PERL_UINTMAX_T word)
1d2af574
KW
523{
524
525 /* This returns the position in a word (0..7) of the first variant byte in
526 * it. This is a helper function. Note that there are no branches */
527
528 assert(word);
529
530 /* Get just the msb bits of each byte */
531 word &= PERL_VARIANTS_WORD_MASK;
532
597ee3f4
KW
533# ifdef USING_MSVC6 /* VC6 has some issues with the normal code, and the
534 easiest thing is to hide that from the callers */
535 {
536 unsigned int i;
537 const U8 * s = (U8 *) &word;
538 dTHX;
539
540 for (i = 0; i < sizeof(word); i++ ) {
541 if (s[i]) {
542 return i;
543 }
544 }
545
546 Perl_croak(aTHX_ "panic: %s: %d: unexpected zero word\n",
547 __FILE__, __LINE__);
548 }
549
550# elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1d2af574
KW
551
552 /* Bytes are stored like
553 * Byte8 ... Byte2 Byte1
554 * 63..56...15...8 7...0
555 *
556 * Isolate the lsb;
557 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
558 *
559 * The word will look this this, with a rightmost set bit in position 's':
560 * ('x's are don't cares)
561 * s
562 * x..x100..0
563 * x..xx10..0 Right shift (rightmost 0 is shifted off)
564 * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
565 * the 1 just to their left into a 0; the remainder is
566 * untouched
4fa92663
KW
567 * 0..0011..1 The xor with the original, x..xx10..0, clears that
568 * remainder, sets the bottom to all 1
1d2af574
KW
569 * 0..0100..0 Add 1 to clear the word except for the bit in 's'
570 *
571 * Another method is to do 'word &= -word'; but it generates a compiler
572 * message on some platforms about taking the negative of an unsigned */
573
574 word >>= 1;
575 word = 1 + (word ^ (word - 1));
576
577# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
578
579 /* Bytes are stored like
580 * Byte1 Byte2 ... Byte8
581 * 63..56 55..47 ... 7...0
582 *
583 * Isolate the msb; http://codeforces.com/blog/entry/10330
584 *
585 * Only the most significant set bit matters. Or'ing word with its right
586 * shift of 1 makes that bit and the next one to its right both 1. Then
587 * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
588 * msb and all to the right being 1. */
589 word |= word >> 1;
590 word |= word >> 2;
591 word |= word >> 4;
592 word |= word >> 8;
593 word |= word >> 16;
594 word |= word >> 32; /* This should get optimized out on 32-bit systems. */
595
596 /* Then subtracting the right shift by 1 clears all but the left-most of
597 * the 1 bits, which is our desired result */
598 word -= (word >> 1);
599
600# else
601# error Unexpected byte order
602# endif
603
7cf2d6c7
KW
604 /* Here 'word' has a single bit set: the msb of the first byte in which it
605 * is set. Calculate that position in the word. We can use this
1d2af574 606 * specialized solution: https://stackoverflow.com/a/32339674/1626653,
67e12c5c
KW
607 * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
608 * just get shifted off at compile time) */
609 word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
610 | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
611 | (39 << 24) | (47 << 16)
612 | (55 << 8) | (63 << 0));
1d2af574
KW
613 word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
614
615 /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
616 word = ((word + 1) >> 3) - 1;
617
618# if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
619
620 /* And invert the result */
621 word = CHARBITS - word - 1;
622
623# endif
624
625 return (unsigned int) word;
626}
627
23a7ee81 628#endif
03c1e4ab
KW
629#if defined(PERL_CORE) || defined(PERL_EXT)
630
631/*
632=for apidoc variant_under_utf8_count
633
634This function looks at the sequence of bytes between C<s> and C<e>, which are
635assumed to be encoded in ASCII/Latin1, and returns how many of them would
636change should the string be translated into UTF-8. Due to the nature of UTF-8,
637each of these would occupy two bytes instead of the single one in the input
638string. Thus, this function returns the precise number of bytes the string
639would expand by when translated to UTF-8.
640
641Unlike most of the other functions that have C<utf8> in their name, the input
642to this function is NOT a UTF-8-encoded string. The function name is slightly
643I<odd> to emphasize this.
644
645This function is internal to Perl because khw thinks that any XS code that
646would want this is probably operating too close to the internals. Presenting a
647valid use case could change that.
648
649See also
650C<L<perlapi/is_utf8_invariant_string>>
651and
652C<L<perlapi/is_utf8_invariant_string_loc>>,
653
654=cut
655
656*/
657
658PERL_STATIC_INLINE Size_t
659S_variant_under_utf8_count(const U8* const s, const U8* const e)
660{
661 const U8* x = s;
662 Size_t count = 0;
663
664 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
665
666# ifndef EBCDIC
667
5d0379de
KW
668 /* Test if the string is long enough to use word-at-a-time. (Logic is the
669 * same as for is_utf8_invariant_string()) */
03c1e4ab
KW
670 if ((STRLEN) (e - x) >= PERL_WORDSIZE
671 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
672 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
673 {
674
675 /* Process per-byte until reach word boundary. XXX This loop could be
676 * eliminated if we knew that this platform had fast unaligned reads */
677 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
678 count += ! UTF8_IS_INVARIANT(*x++);
679 }
680
681 /* Process per-word as long as we have at least a full word left */
74472cc2
KW
682 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
683 explanation of how this works */
e5863284
KW
684 PERL_UINTMAX_T increment
685 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
03c1e4ab
KW
686 * PERL_COUNT_MULTIPLIER)
687 >> ((PERL_WORDSIZE - 1) * CHARBITS);
e5863284 688 count += (Size_t) increment;
03c1e4ab
KW
689 x += PERL_WORDSIZE;
690 } while (x + PERL_WORDSIZE <= e);
691 }
692
693# endif
694
695 /* Process per-byte */
696 while (x < e) {
697 if (! UTF8_IS_INVARIANT(*x)) {
698 count++;
699 }
700
701 x++;
702 }
703
704 return count;
705}
706
707#endif
708
aff4cafe
KW
709#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
710# undef PERL_WORDSIZE
711# undef PERL_COUNT_MULTIPLIER
712# undef PERL_WORD_BOUNDARY_MASK
713# undef PERL_VARIANTS_WORD_MASK
714#endif
03c1e4ab 715
7c93d8f0 716/*
5ff889fb
KW
717=for apidoc is_utf8_string
718
82c5d941
KW
719Returns TRUE if the first C<len> bytes of string C<s> form a valid
720Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
721be calculated using C<strlen(s)> (which means if you use this option, that C<s>
722can't have embedded C<NUL> characters and has to have a terminating C<NUL>
723byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
724
2717076a
KW
725This function considers Perl's extended UTF-8 to be valid. That means that
726code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
727considered valid by this function. Use C<L</is_strict_utf8_string>>,
728C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
729code points are considered valid.
5ff889fb 730
9f2abfde
KW
731See also
732C<L</is_utf8_invariant_string>>,
0cbf5865 733C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
734C<L</is_utf8_string_loc>>,
735C<L</is_utf8_string_loclen>>,
8bc127bf
KW
736C<L</is_utf8_fixed_width_buf_flags>>,
737C<L</is_utf8_fixed_width_buf_loc_flags>>,
738C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
739
740=cut
741*/
742
dd237e82 743#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
5ff889fb 744
c9cd936b
KW
745#if defined(PERL_CORE) || defined (PERL_EXT)
746
747/*
748=for apidoc is_utf8_non_invariant_string
749
750Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
751C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
752UTF-8; otherwise returns FALSE.
753
754A TRUE return means that at least one code point represented by the sequence
755either is a wide character not representable as a single byte, or the
756representation differs depending on whether the sequence is encoded in UTF-8 or
757not.
758
759See also
760C<L<perlapi/is_utf8_invariant_string>>,
761C<L<perlapi/is_utf8_string>>
762
763=cut
764
765This is commonly used to determine if a SV's UTF-8 flag should be turned on.
b3b93dfe
KW
766It generally needn't be if its string is entirely UTF-8 invariant, and it
767shouldn't be if it otherwise contains invalid UTF-8.
c9cd936b
KW
768
769It is an internal function because khw thinks that XS code shouldn't be working
770at this low a level. A valid use case could change that.
771
772*/
773
774PERL_STATIC_INLINE bool
775S_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
776{
777 const U8 * first_variant;
778
779 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
780
781 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
782 return FALSE;
783 }
784
785 return is_utf8_string(first_variant, len - (first_variant - s));
786}
787
788#endif
789
5ff889fb 790/*
9f2abfde
KW
791=for apidoc is_strict_utf8_string
792
793Returns TRUE if the first C<len> bytes of string C<s> form a valid
794UTF-8-encoded string that is fully interchangeable by any application using
795Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
796calculated using C<strlen(s)> (which means if you use this option, that C<s>
797can't have embedded C<NUL> characters and has to have a terminating C<NUL>
798byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
799
800This function returns FALSE for strings containing any
801code points above the Unicode max of 0x10FFFF, surrogate code points, or
802non-character code points.
803
804See also
805C<L</is_utf8_invariant_string>>,
0cbf5865 806C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
807C<L</is_utf8_string>>,
808C<L</is_utf8_string_flags>>,
809C<L</is_utf8_string_loc>>,
810C<L</is_utf8_string_loc_flags>>,
811C<L</is_utf8_string_loclen>>,
812C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
813C<L</is_utf8_fixed_width_buf_flags>>,
814C<L</is_utf8_fixed_width_buf_loc_flags>>,
815C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
816C<L</is_strict_utf8_string_loc>>,
817C<L</is_strict_utf8_string_loclen>>,
818C<L</is_c9strict_utf8_string>>,
819C<L</is_c9strict_utf8_string_loc>>,
820and
821C<L</is_c9strict_utf8_string_loclen>>.
822
823=cut
824*/
825
dd237e82 826#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
827
828/*
829=for apidoc is_c9strict_utf8_string
830
831Returns TRUE if the first C<len> bytes of string C<s> form a valid
832UTF-8-encoded string that conforms to
833L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
834otherwise it returns FALSE. If C<len> is 0, it will be calculated using
835C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
836C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
837characters being ASCII constitute 'a valid UTF-8 string'.
838
839This function returns FALSE for strings containing any code points above the
840Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
841code points per
842L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
843
844See also
845C<L</is_utf8_invariant_string>>,
0cbf5865 846C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
847C<L</is_utf8_string>>,
848C<L</is_utf8_string_flags>>,
849C<L</is_utf8_string_loc>>,
850C<L</is_utf8_string_loc_flags>>,
851C<L</is_utf8_string_loclen>>,
852C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
853C<L</is_utf8_fixed_width_buf_flags>>,
854C<L</is_utf8_fixed_width_buf_loc_flags>>,
855C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
856C<L</is_strict_utf8_string>>,
857C<L</is_strict_utf8_string_loc>>,
858C<L</is_strict_utf8_string_loclen>>,
859C<L</is_c9strict_utf8_string_loc>>,
860and
861C<L</is_c9strict_utf8_string_loclen>>.
862
863=cut
864*/
865
dd237e82 866#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
9f2abfde
KW
867
868/*
869=for apidoc is_utf8_string_flags
870
871Returns TRUE if the first C<len> bytes of string C<s> form a valid
872UTF-8 string, subject to the restrictions imposed by C<flags>;
873returns FALSE otherwise. If C<len> is 0, it will be calculated
874using C<strlen(s)> (which means if you use this option, that C<s> can't have
875embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
876that all characters being ASCII constitute 'a valid UTF-8 string'.
877
878If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
879C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
880as C<L</is_strict_utf8_string>>; and if C<flags> is
881C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
882C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
883combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
884C<L</utf8n_to_uvchr>>, with the same meanings.
885
886See also
887C<L</is_utf8_invariant_string>>,
0cbf5865 888C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
889C<L</is_utf8_string>>,
890C<L</is_utf8_string_loc>>,
891C<L</is_utf8_string_loc_flags>>,
892C<L</is_utf8_string_loclen>>,
893C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
894C<L</is_utf8_fixed_width_buf_flags>>,
895C<L</is_utf8_fixed_width_buf_loc_flags>>,
896C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
897C<L</is_strict_utf8_string>>,
898C<L</is_strict_utf8_string_loc>>,
899C<L</is_strict_utf8_string_loclen>>,
900C<L</is_c9strict_utf8_string>>,
901C<L</is_c9strict_utf8_string_loc>>,
902and
903C<L</is_c9strict_utf8_string_loclen>>.
904
905=cut
906*/
907
908PERL_STATIC_INLINE bool
c9182d9c 909Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 910{
33756530 911 const U8 * first_variant;
9f2abfde
KW
912
913 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
914 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 915 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 916
f60f61fd
KW
917 if (len == 0) {
918 len = strlen((const char *)s);
919 }
920
9f2abfde
KW
921 if (flags == 0) {
922 return is_utf8_string(s, len);
923 }
924
d044b7a7 925 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
926 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
927 {
928 return is_strict_utf8_string(s, len);
929 }
930
d044b7a7 931 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
932 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
933 {
934 return is_c9strict_utf8_string(s, len);
935 }
936
33756530
KW
937 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
938 const U8* const send = s + len;
939 const U8* x = first_variant;
940
a0d7f935
KW
941 while (x < send) {
942 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
943 if (UNLIKELY(! cur_len)) {
944 return FALSE;
945 }
946 x += cur_len;
9f2abfde 947 }
33756530 948 }
9f2abfde
KW
949
950 return TRUE;
951}
952
953/*
5ff889fb
KW
954
955=for apidoc is_utf8_string_loc
956
2717076a 957Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 958case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 959"utf8ness success") in the C<ep> pointer.
5ff889fb 960
2717076a 961See also C<L</is_utf8_string_loclen>>.
5ff889fb 962
3964c812
KW
963=cut
964*/
965
966#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
967
968/*
969
5ff889fb
KW
970=for apidoc is_utf8_string_loclen
971
2717076a 972Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 973case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 974"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 975encoded characters in the C<el> pointer.
5ff889fb 976
2717076a 977See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
978
979=cut
980*/
981
56e4cf64 982PERL_STATIC_INLINE bool
33756530 983Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb 984{
33756530 985 const U8 * first_variant;
5ff889fb
KW
986
987 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
988
33756530
KW
989 if (len == 0) {
990 len = strlen((const char *) s);
991 }
992
993 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
994 if (el)
995 *el = len;
996
997 if (ep) {
998 *ep = s + len;
999 }
1000
1001 return TRUE;
1002 }
1003
1004 {
1005 const U8* const send = s + len;
1006 const U8* x = first_variant;
1007 STRLEN outlen = first_variant - s;
1008
a0d7f935
KW
1009 while (x < send) {
1010 const STRLEN cur_len = isUTF8_CHAR(x, send);
1011 if (UNLIKELY(! cur_len)) {
1012 break;
1013 }
1014 x += cur_len;
1015 outlen++;
5ff889fb 1016 }
5ff889fb 1017
a0d7f935
KW
1018 if (el)
1019 *el = outlen;
5ff889fb 1020
a0d7f935
KW
1021 if (ep) {
1022 *ep = x;
1023 }
5ff889fb 1024
a0d7f935 1025 return (x == send);
33756530 1026 }
5ff889fb
KW
1027}
1028
1029/*
9f2abfde 1030
44170c9a 1031=for apidoc isUTF8_CHAR
8ed185f9
KW
1032
1033Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1034looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1035that represents some code point; otherwise it evaluates to 0. If non-zero, the
1036value gives how many bytes starting at C<s> comprise the code point's
1037representation. Any bytes remaining before C<e>, but beyond the ones needed to
1038form the first code point in C<s>, are not examined.
1039
13aab5dd 1040The code point can be any that will fit in an IV on this machine, using Perl's
8ed185f9
KW
1041extension to official UTF-8 to represent those higher than the Unicode maximum
1042of 0x10FFFF. That means that this macro is used to efficiently decide if the
1043next few bytes in C<s> is legal UTF-8 for a single character.
1044
1045Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1046defined by Unicode to be fully interchangeable across applications;
1047C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1048#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1049code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1050
1051Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1052C<L</is_utf8_string_loclen>> to check entire strings.
1053
13aab5dd
KW
1054Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1055machines) is a valid UTF-8 character.
8ed185f9
KW
1056
1057=cut
1058
1059This uses an adaptation of the table and algorithm given in
1060http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1061documentation of the original version. A copyright notice for the original
1062version is given at the beginning of this file. The Perl adapation is
71525f77 1063documented at the definition of PL_extended_utf8_dfa_tab[].
8ed185f9
KW
1064
1065*/
1066
1067PERL_STATIC_INLINE Size_t
c9182d9c 1068Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
8ed185f9
KW
1069{
1070 const U8 * s = s0;
1071 UV state = 0;
1072
1073 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1074
1075 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1076 * code point, which can be returned immediately. Otherwise, it is either
1077 * malformed, or for the start byte FF which the dfa doesn't handle (except
1078 * on 32-bit ASCII platforms where it trivially is an error). Call a
1079 * helper function for the other platforms. */
1080
1081 while (s < e && LIKELY(state != 1)) {
71525f77 1082 state = PL_extended_utf8_dfa_tab[256
8ed185f9 1083 + state
71525f77 1084 + PL_extended_utf8_dfa_tab[*s]];
8ed185f9
KW
1085 if (state != 0) {
1086 s++;
1087 continue;
1088 }
1089
1090 return s - s0 + 1;
1091 }
1092
1093#if defined(UV_IS_QUAD) || defined(EBCDIC)
1094
1095 if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
1376b35c 1096 return is_utf8_char_helper(s0, e, 0);
8ed185f9
KW
1097 }
1098
1099#endif
1100
1101 return 0;
1102}
1103
1104/*
1105
67049a5f
KW
1106=for apidoc isSTRICT_UTF8_CHAR
1107
1108Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1109looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1110Unicode code point completely acceptable for open interchange between all
1111applications; otherwise it evaluates to 0. If non-zero, the value gives how
1112many bytes starting at C<s> comprise the code point's representation. Any
1113bytes remaining before C<e>, but beyond the ones needed to form the first code
1114point in C<s>, are not examined.
1115
1116The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1117be a surrogate nor a non-character code point. Thus this excludes any code
1118point from Perl's extended UTF-8.
1119
1120This is used to efficiently decide if the next few bytes in C<s> is
1121legal Unicode-acceptable UTF-8 for a single character.
1122
1123Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1124#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1125code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1126and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1127
1128Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1129C<L</is_strict_utf8_string_loclen>> to check entire strings.
1130
1131=cut
1132
1133This uses an adaptation of the tables and algorithm given in
1134http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1135documentation of the original version. A copyright notice for the original
1136version is given at the beginning of this file. The Perl adapation is
1137documented at the definition of strict_extended_utf8_dfa_tab[].
1138
1139*/
1140
1141PERL_STATIC_INLINE Size_t
c9182d9c 1142Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
67049a5f
KW
1143{
1144 const U8 * s = s0;
1145 UV state = 0;
1146
1147 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1148
1149 while (s < e && LIKELY(state != 1)) {
71525f77 1150 state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
67049a5f
KW
1151
1152 if (state != 0) {
1153 s++;
1154 continue;
1155 }
1156
1157 return s - s0 + 1;
1158 }
1159
1160#ifndef EBCDIC
1161
1162 /* The dfa above drops out for certain Hanguls; handle them specially */
1163 if (is_HANGUL_ED_utf8_safe(s0, e)) {
1164 return 3;
1165 }
1166
1167#endif
1168
1169 return 0;
1170}
1171
1172/*
1173
44170c9a 1174=for apidoc isC9_STRICT_UTF8_CHAR
c5bfbb64
KW
1175
1176Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1177looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1178Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1179the value gives how many bytes starting at C<s> comprise the code point's
1180representation. Any bytes remaining before C<e>, but beyond the ones needed to
1181form the first code point in C<s>, are not examined.
1182
1183The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1184differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1185code points. This corresponds to
1186L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1187which said that non-character code points are merely discouraged rather than
1188completely forbidden in open interchange. See
1189L<perlunicode/Noncharacter code points>.
1190
1191Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1192C<L</isUTF8_CHAR_flags>> for a more customized definition.
1193
1194Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1195C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1196
1197=cut
1198
1199This uses an adaptation of the tables and algorithm given in
1200http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1201documentation of the original version. A copyright notice for the original
1202version is given at the beginning of this file. The Perl adapation is
71525f77 1203documented at the definition of PL_c9_utf8_dfa_tab[].
c5bfbb64
KW
1204
1205*/
1206
1207PERL_STATIC_INLINE Size_t
c9182d9c 1208Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
c5bfbb64
KW
1209{
1210 const U8 * s = s0;
1211 UV state = 0;
1212
1213 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1214
1215 while (s < e && LIKELY(state != 1)) {
71525f77 1216 state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
c5bfbb64
KW
1217
1218 if (state != 0) {
1219 s++;
1220 continue;
1221 }
1222
1223 return s - s0 + 1;
1224 }
1225
1226 return 0;
1227}
1228
1229/*
1230
9f2abfde
KW
1231=for apidoc is_strict_utf8_string_loc
1232
1233Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1234case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1235"utf8ness success") in the C<ep> pointer.
1236
1237See also C<L</is_strict_utf8_string_loclen>>.
1238
1239=cut
1240*/
1241
1242#define is_strict_utf8_string_loc(s, len, ep) \
1243 is_strict_utf8_string_loclen(s, len, ep, 0)
1244
1245/*
1246
1247=for apidoc is_strict_utf8_string_loclen
1248
1249Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1250case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1251"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1252encoded characters in the C<el> pointer.
1253
1254See also C<L</is_strict_utf8_string_loc>>.
1255
1256=cut
1257*/
1258
1259PERL_STATIC_INLINE bool
c9182d9c 1260Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1261{
33756530 1262 const U8 * first_variant;
9f2abfde
KW
1263
1264 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1265
33756530
KW
1266 if (len == 0) {
1267 len = strlen((const char *) s);
1268 }
1269
1270 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1271 if (el)
1272 *el = len;
1273
1274 if (ep) {
1275 *ep = s + len;
1276 }
1277
1278 return TRUE;
1279 }
1280
1281 {
1282 const U8* const send = s + len;
1283 const U8* x = first_variant;
1284 STRLEN outlen = first_variant - s;
1285
a0d7f935
KW
1286 while (x < send) {
1287 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1288 if (UNLIKELY(! cur_len)) {
1289 break;
1290 }
1291 x += cur_len;
1292 outlen++;
9f2abfde 1293 }
9f2abfde 1294
a0d7f935
KW
1295 if (el)
1296 *el = outlen;
9f2abfde 1297
a0d7f935
KW
1298 if (ep) {
1299 *ep = x;
1300 }
9f2abfde 1301
a0d7f935 1302 return (x == send);
33756530 1303 }
9f2abfde
KW
1304}
1305
1306/*
1307
1308=for apidoc is_c9strict_utf8_string_loc
1309
1310Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1311the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1312"utf8ness success") in the C<ep> pointer.
1313
1314See also C<L</is_c9strict_utf8_string_loclen>>.
1315
1316=cut
1317*/
1318
1319#define is_c9strict_utf8_string_loc(s, len, ep) \
1320 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1321
1322/*
1323
1324=for apidoc is_c9strict_utf8_string_loclen
1325
1326Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1327the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1328"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1329characters in the C<el> pointer.
1330
1331See also C<L</is_c9strict_utf8_string_loc>>.
1332
1333=cut
1334*/
1335
1336PERL_STATIC_INLINE bool
c9182d9c 1337Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1338{
33756530 1339 const U8 * first_variant;
9f2abfde
KW
1340
1341 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1342
33756530
KW
1343 if (len == 0) {
1344 len = strlen((const char *) s);
1345 }
1346
1347 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1348 if (el)
1349 *el = len;
1350
1351 if (ep) {
1352 *ep = s + len;
1353 }
1354
1355 return TRUE;
1356 }
1357
1358 {
1359 const U8* const send = s + len;
1360 const U8* x = first_variant;
1361 STRLEN outlen = first_variant - s;
1362
a0d7f935
KW
1363 while (x < send) {
1364 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1365 if (UNLIKELY(! cur_len)) {
1366 break;
1367 }
1368 x += cur_len;
1369 outlen++;
9f2abfde 1370 }
9f2abfde 1371
a0d7f935
KW
1372 if (el)
1373 *el = outlen;
9f2abfde 1374
a0d7f935
KW
1375 if (ep) {
1376 *ep = x;
1377 }
9f2abfde 1378
a0d7f935 1379 return (x == send);
33756530 1380 }
9f2abfde
KW
1381}
1382
1383/*
1384
1385=for apidoc is_utf8_string_loc_flags
1386
1387Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1388case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1389"utf8ness success") in the C<ep> pointer.
1390
1391See also C<L</is_utf8_string_loclen_flags>>.
1392
1393=cut
1394*/
1395
1396#define is_utf8_string_loc_flags(s, len, ep, flags) \
1397 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1398
1399
1400/* The above 3 actual functions could have been moved into the more general one
1401 * just below, and made #defines that call it with the right 'flags'. They are
1402 * currently kept separate to increase their chances of getting inlined */
1403
1404/*
1405
1406=for apidoc is_utf8_string_loclen_flags
1407
1408Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1409case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1410"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1411encoded characters in the C<el> pointer.
1412
1413See also C<L</is_utf8_string_loc_flags>>.
1414
1415=cut
1416*/
1417
1418PERL_STATIC_INLINE bool
c9182d9c 1419Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 1420{
33756530 1421 const U8 * first_variant;
9f2abfde
KW
1422
1423 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1424 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1425 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1426
f60f61fd 1427 if (len == 0) {
a0d7f935 1428 len = strlen((const char *) s);
f60f61fd
KW
1429 }
1430
9f2abfde
KW
1431 if (flags == 0) {
1432 return is_utf8_string_loclen(s, len, ep, el);
1433 }
1434
d044b7a7 1435 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1436 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1437 {
1438 return is_strict_utf8_string_loclen(s, len, ep, el);
1439 }
1440
d044b7a7 1441 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1442 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1443 {
1444 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1445 }
1446
33756530
KW
1447 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1448 if (el)
1449 *el = len;
1450
1451 if (ep) {
1452 *ep = s + len;
1453 }
1454
1455 return TRUE;
1456 }
1457
1458 {
1459 const U8* send = s + len;
1460 const U8* x = first_variant;
1461 STRLEN outlen = first_variant - s;
1462
a0d7f935
KW
1463 while (x < send) {
1464 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1465 if (UNLIKELY(! cur_len)) {
1466 break;
1467 }
1468 x += cur_len;
1469 outlen++;
9f2abfde 1470 }
9f2abfde 1471
a0d7f935
KW
1472 if (el)
1473 *el = outlen;
9f2abfde 1474
a0d7f935
KW
1475 if (ep) {
1476 *ep = x;
1477 }
9f2abfde 1478
a0d7f935 1479 return (x == send);
33756530 1480 }
9f2abfde
KW
1481}
1482
1483/*
7c93d8f0
KW
1484=for apidoc utf8_distance
1485
1486Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1487and C<b>.
1488
1489WARNING: use only if you *know* that the pointers point inside the
1490same UTF-8 buffer.
1491
1492=cut
1493*/
1494
1495PERL_STATIC_INLINE IV
1496Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1497{
1498 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1499
1500 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1501}
1502
1503/*
1504=for apidoc utf8_hop
1505
1506Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1507forward or backward.
1508
1509WARNING: do not use the following unless you *know* C<off> is within
1510the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1511on the first byte of character or just after the last byte of a character.
1512
1513=cut
1514*/
1515
1516PERL_STATIC_INLINE U8 *
1517Perl_utf8_hop(const U8 *s, SSize_t off)
1518{
1519 PERL_ARGS_ASSERT_UTF8_HOP;
1520
1521 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1522 * the bitops (especially ~) can create illegal UTF-8.
1523 * In other words: in Perl UTF-8 is not just for Unicode. */
1524
1525 if (off >= 0) {
1526 while (off--)
1527 s += UTF8SKIP(s);
1528 }
1529 else {
1530 while (off++) {
1531 s--;
1532 while (UTF8_IS_CONTINUATION(*s))
1533 s--;
1534 }
1535 }
e099ea69 1536 GCC_DIAG_IGNORE(-Wcast-qual)
7c93d8f0 1537 return (U8 *)s;
e099ea69 1538 GCC_DIAG_RESTORE
7c93d8f0
KW
1539}
1540
4dab108f 1541/*
65df57a8
TC
1542=for apidoc utf8_hop_forward
1543
1544Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1545forward.
1546
1547C<off> must be non-negative.
1548
1549C<s> must be before or equal to C<end>.
1550
1551When moving forward it will not move beyond C<end>.
1552
1553Will not exceed this limit even if the string is not valid "UTF-8".
1554
1555=cut
1556*/
1557
1558PERL_STATIC_INLINE U8 *
1559Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1560{
1561 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1562
1563 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1564 * the bitops (especially ~) can create illegal UTF-8.
1565 * In other words: in Perl UTF-8 is not just for Unicode. */
1566
1567 assert(s <= end);
1568 assert(off >= 0);
1569
1570 while (off--) {
1571 STRLEN skip = UTF8SKIP(s);
de979548 1572 if ((STRLEN)(end - s) <= skip) {
e099ea69 1573 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1574 return (U8 *)end;
e099ea69 1575 GCC_DIAG_RESTORE
de979548 1576 }
65df57a8
TC
1577 s += skip;
1578 }
1579
e099ea69 1580 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1581 return (U8 *)s;
e099ea69 1582 GCC_DIAG_RESTORE
65df57a8
TC
1583}
1584
1585/*
1586=for apidoc utf8_hop_back
1587
1588Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1589backward.
1590
1591C<off> must be non-positive.
1592
1593C<s> must be after or equal to C<start>.
1594
1595When moving backward it will not move before C<start>.
1596
1597Will not exceed this limit even if the string is not valid "UTF-8".
1598
1599=cut
1600*/
1601
1602PERL_STATIC_INLINE U8 *
1603Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1604{
1605 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1606
1607 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1608 * the bitops (especially ~) can create illegal UTF-8.
1609 * In other words: in Perl UTF-8 is not just for Unicode. */
1610
1611 assert(start <= s);
1612 assert(off <= 0);
1613
1614 while (off++ && s > start) {
e7185695 1615 do {
65df57a8 1616 s--;
e7185695 1617 } while (UTF8_IS_CONTINUATION(*s) && s > start);
65df57a8
TC
1618 }
1619
e099ea69 1620 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1621 return (U8 *)s;
e099ea69 1622 GCC_DIAG_RESTORE
65df57a8
TC
1623}
1624
1625/*
1626=for apidoc utf8_hop_safe
1627
1628Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1629either forward or backward.
1630
1631When moving backward it will not move before C<start>.
1632
1633When moving forward it will not move beyond C<end>.
1634
1635Will not exceed those limits even if the string is not valid "UTF-8".
1636
1637=cut
1638*/
1639
1640PERL_STATIC_INLINE U8 *
1641Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1642{
1643 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1644
1645 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1646 * the bitops (especially ~) can create illegal UTF-8.
1647 * In other words: in Perl UTF-8 is not just for Unicode. */
1648
1649 assert(start <= s && s <= end);
1650
1651 if (off >= 0) {
1652 return utf8_hop_forward(s, off, end);
1653 }
1654 else {
1655 return utf8_hop_back(s, off, start);
1656 }
1657}
1658
1659/*
4dab108f
KW
1660
1661=for apidoc is_utf8_valid_partial_char
1662
6cbb9248
KW
1663Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1664S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1665points. Otherwise, it returns 1 if there exists at least one non-empty
1666sequence of bytes that when appended to sequence C<s>, starting at position
1667C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1668otherwise returns 0.
1669
1670In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1671point.
1672
1673This is useful when a fixed-length buffer is being tested for being well-formed
1674UTF-8, but the final few bytes in it don't comprise a full character; that is,
1675it is split somewhere in the middle of the final code point's UTF-8
1676representation. (Presumably when the buffer is refreshed with the next chunk
1677of data, the new first bytes will complete the partial code point.) This
1678function is used to verify that the final bytes in the current buffer are in
1679fact the legal beginning of some code point, so that if they aren't, the
1680failure can be signalled without having to wait for the next read.
4dab108f
KW
1681
1682=cut
1683*/
2717076a
KW
1684#define is_utf8_valid_partial_char(s, e) \
1685 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
1686
1687/*
1688
1689=for apidoc is_utf8_valid_partial_char_flags
1690
1691Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1692or not the input is a valid UTF-8 encoded partial character, but it takes an
1693extra parameter, C<flags>, which can further restrict which code points are
1694considered valid.
1695
1696If C<flags> is 0, this behaves identically to
1697C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1698of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1699there is any sequence of bytes that can complete the input partial character in
1700such a way that a non-prohibited character is formed, the function returns
2717076a
KW
1701TRUE; otherwise FALSE. Non character code points cannot be determined based on
1702partial character input. But many of the other possible excluded types can be
f1c999a7
KW
1703determined from just the first one or two bytes.
1704
1705=cut
1706 */
1707
56e4cf64 1708PERL_STATIC_INLINE bool
c9182d9c 1709Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
4dab108f 1710{
f1c999a7 1711 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
4dab108f 1712
f1c999a7 1713 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1714 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 1715
8875bd48 1716 if (s >= e || s + UTF8SKIP(s) <= e) {
4dab108f
KW
1717 return FALSE;
1718 }
1719
1376b35c 1720 return cBOOL(is_utf8_char_helper(s, e, flags));
4dab108f
KW
1721}
1722
8bc127bf
KW
1723/*
1724
1725=for apidoc is_utf8_fixed_width_buf_flags
1726
1727Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1728is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1729otherwise it returns FALSE.
1730
1731If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1732without restriction. If the final few bytes of the buffer do not form a
1733complete code point, this will return TRUE anyway, provided that
1734C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1735
1736If C<flags> in non-zero, it can be any combination of the
1737C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1738same meanings.
1739
1740This function differs from C<L</is_utf8_string_flags>> only in that the latter
1741returns FALSE if the final few bytes of the string don't form a complete code
1742point.
1743
1744=cut
1745 */
1746#define is_utf8_fixed_width_buf_flags(s, len, flags) \
1747 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1748
1749/*
1750
1751=for apidoc is_utf8_fixed_width_buf_loc_flags
1752
1753Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1754failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1755to the beginning of any partial character at the end of the buffer; if there is
1756no partial character C<*ep> will contain C<s>+C<len>.
1757
1758See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1759
1760=cut
1761*/
1762
1763#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1764 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1765
1766/*
1767
1768=for apidoc is_utf8_fixed_width_buf_loclen_flags
1769
1770Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1771complete, valid characters found in the C<el> pointer.
1772
1773=cut
1774*/
1775
1776PERL_STATIC_INLINE bool
c9182d9c 1777Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
33756530 1778 STRLEN len,
8bc127bf
KW
1779 const U8 **ep,
1780 STRLEN *el,
1781 const U32 flags)
1782{
1783 const U8 * maybe_partial;
1784
1785 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1786
1787 if (! ep) {
1788 ep = &maybe_partial;
1789 }
1790
1791 /* If it's entirely valid, return that; otherwise see if the only error is
1792 * that the final few bytes are for a partial character */
1793 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1794 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1795}
1796
e6a4ffc3 1797PERL_STATIC_INLINE UV
c9182d9c 1798Perl_utf8n_to_uvchr_msgs(const U8 *s,
e6a4ffc3
KW
1799 STRLEN curlen,
1800 STRLEN *retlen,
1801 const U32 flags,
1802 U32 * errors,
1803 AV ** msgs)
1804{
1805 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
1806 * simple cases, and, if necessary calls a helper function to deal with the
1807 * more complex ones. Almost all well-formed non-problematic code points
1808 * are considered simple, so that it's unlikely that the helper function
1809 * will need to be called.
1810 *
1811 * This is an adaptation of the tables and algorithm given in
1812 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
1813 * comprehensive documentation of the original version. A copyright notice
1814 * for the original version is given at the beginning of this file. The
71525f77 1815 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
e6a4ffc3
KW
1816 */
1817
1818 const U8 * const s0 = s;
1819 const U8 * send = s0 + curlen;
f73cfe7d 1820 UV uv = 0; /* The 0 silences some stupid compilers */
e6a4ffc3
KW
1821 UV state = 0;
1822
1823 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1824
1825 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1826 * non-problematic code point, which can be returned immediately.
1827 * Otherwise we call a helper function to figure out the more complicated
1828 * cases. */
1829
1830 while (s < send && LIKELY(state != 1)) {
71525f77 1831 UV type = PL_strict_utf8_dfa_tab[*s];
e6a4ffc3
KW
1832
1833 uv = (state == 0)
1834 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1835 : UTF8_ACCUMULATE(uv, *s);
71525f77 1836 state = PL_strict_utf8_dfa_tab[256 + state + type];
e6a4ffc3
KW
1837
1838 if (state != 0) {
1839 s++;
1840 continue;
1841 }
1842
1843 if (retlen) {
1844 *retlen = s - s0 + 1;
1845 }
1846 if (errors) {
1847 *errors = 0;
1848 }
1849 if (msgs) {
1850 *msgs = NULL;
1851 }
1852
cfebc7aa 1853 return UNI_TO_NATIVE(uv);
e6a4ffc3
KW
1854 }
1855
1856 /* Here is potentially problematic. Use the full mechanism */
1857 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
1858}
1859
82651abe 1860PERL_STATIC_INLINE UV
9a9a6c98 1861Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
82651abe 1862{
9a9a6c98 1863 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
82651abe
KW
1864
1865 assert(s < send);
1866
1867 if (! ckWARN_d(WARN_UTF8)) {
3eaa7592
KW
1868
1869 /* EMPTY is not really allowed, and asserts on debugging builds. But
1870 * on non-debugging we have to deal with it, and this causes it to
1871 * return the REPLACEMENT CHARACTER, as the documentation indicates */
82651abe 1872 return utf8n_to_uvchr(s, send - s, retlen,
3eaa7592 1873 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
82651abe
KW
1874 }
1875 else {
1876 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
1877 if (retlen && ret == 0 && *s != '\0') {
1878 *retlen = (STRLEN) -1;
1879 }
1880
1881 return ret;
1882 }
1883}
1884
c8028aa6
TC
1885/* ------------------------------- perl.h ----------------------------- */
1886
1887/*
dcccc8ff
KW
1888=head1 Miscellaneous Functions
1889
44170c9a 1890=for apidoc is_safe_syscall
c8028aa6 1891
6602b933 1892Test that the given C<pv> doesn't contain any internal C<NUL> characters.
796b6530 1893If it does, set C<errno> to C<ENOENT>, optionally warn, and return FALSE.
c8028aa6
TC
1894
1895Return TRUE if the name is safe.
1896
796b6530 1897Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
1898
1899=cut
1900*/
1901
1902PERL_STATIC_INLINE bool
ffd62fc2
KW
1903Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
1904{
c8028aa6
TC
1905 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1906 * perl itself uses xce*() functions which accept 8-bit strings.
1907 */
1908
1909 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1910
6c4650b3 1911 if (len > 1) {
c8028aa6 1912 char *null_at;
41188aa0 1913 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 1914 SETERRNO(ENOENT, LIB_INVARG);
1d505182 1915 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 1916 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 1917 what, op_name, pv, null_at+1);
c8028aa6
TC
1918 return FALSE;
1919 }
1920 }
1921
1922 return TRUE;
1923}
1924
1925/*
7cb3f959
TC
1926
1927Return true if the supplied filename has a newline character
fa6c7d00 1928immediately before the first (hopefully only) NUL.
7cb3f959
TC
1929
1930My original look at this incorrectly used the len from SvPV(), but
1931that's incorrect, since we allow for a NUL in pv[len-1].
1932
1933So instead, strlen() and work from there.
1934
1935This allow for the user reading a filename, forgetting to chomp it,
1936then calling:
1937
1938 open my $foo, "$file\0";
1939
1940*/
1941
1942#ifdef PERL_CORE
1943
1944PERL_STATIC_INLINE bool
ffd62fc2
KW
1945S_should_warn_nl(const char *pv)
1946{
7cb3f959
TC
1947 STRLEN len;
1948
1949 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1950
1951 len = strlen(pv);
1952
1953 return len > 0 && pv[len-1] == '\n';
1954}
1955
1956#endif
1957
3a019afd
KW
1958#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
1959
1960PERL_STATIC_INLINE bool
1961S_lossless_NV_to_IV(const NV nv, IV *ivp)
1962{
1963 /* This function determines if the input NV 'nv' may be converted without
1964 * loss of data to an IV. If not, it returns FALSE taking no other action.
1965 * But if it is possible, it does the conversion, returning TRUE, and
1966 * storing the converted result in '*ivp' */
1967
1968 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
1969
1970# if defined(Perl_isnan)
1971
1972 if (UNLIKELY(Perl_isnan(nv))) {
1973 return FALSE;
1974 }
1975
1976# endif
1977
1978 if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) {
1979 return FALSE;
1980 }
1981
1982 if ((IV) nv != nv) {
1983 return FALSE;
1984 }
1985
1986 *ivp = (IV) nv;
1987 return TRUE;
1988}
1989
1990#endif
1991
81d52ecd
JH
1992/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
1993
94b0cb42
KW
1994#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
1995
81d52ecd
JH
1996#define MAX_CHARSET_NAME_LENGTH 2
1997
1998PERL_STATIC_INLINE const char *
94b0cb42 1999S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
81d52ecd 2000{
94b0cb42
KW
2001 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2002
81d52ecd
JH
2003 /* Returns a string that corresponds to the name of the regex character set
2004 * given by 'flags', and *lenp is set the length of that string, which
2005 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2006
2007 *lenp = 1;
2008 switch (get_regex_charset(flags)) {
2009 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2010 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2011 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2012 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2013 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2014 *lenp = 2;
2015 return ASCII_MORE_RESTRICT_PAT_MODS;
2016 }
2017 /* The NOT_REACHED; hides an assert() which has a rather complex
2018 * definition in perl.h. */
2019 NOT_REACHED; /* NOTREACHED */
2020 return "?"; /* Unknown */
2021}
2022
94b0cb42
KW
2023#endif
2024
7cb3f959 2025/*
ed382232
TC
2026
2027Return false if any get magic is on the SV other than taint magic.
2028
2029*/
2030
2031PERL_STATIC_INLINE bool
ffd62fc2
KW
2032Perl_sv_only_taint_gmagic(SV *sv)
2033{
ed382232
TC
2034 MAGIC *mg = SvMAGIC(sv);
2035
2036 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2037
2038 while (mg) {
2039 if (mg->mg_type != PERL_MAGIC_taint
2040 && !(mg->mg_flags & MGf_GSKIP)
2041 && mg->mg_virtual->svt_get) {
2042 return FALSE;
2043 }
2044 mg = mg->mg_moremagic;
2045 }
2046
2047 return TRUE;
2048}
2049
ed8ff0f3
DM
2050/* ------------------ cop.h ------------------------------------------- */
2051
2052
2053/* Enter a block. Push a new base context and return its address. */
2054
2055PERL_STATIC_INLINE PERL_CONTEXT *
c9182d9c 2056Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
ed8ff0f3
DM
2057{
2058 PERL_CONTEXT * cx;
2059
2060 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2061
2062 CXINC;
2063 cx = CX_CUR();
2064 cx->cx_type = type;
2065 cx->blk_gimme = gimme;
2066 cx->blk_oldsaveix = saveix;
4caf7d8c 2067 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 2068 cx->blk_oldcop = PL_curcop;
4caf7d8c 2069 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
2070 cx->blk_oldscopesp = PL_scopestack_ix;
2071 cx->blk_oldpm = PL_curpm;
ce8bb8d8 2072 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
2073
2074 PL_tmps_floor = PL_tmps_ix;
2075 CX_DEBUG(cx, "PUSH");
2076 return cx;
2077}
2078
2079
2080/* Exit a block (RETURN and LAST). */
2081
2082PERL_STATIC_INLINE void
c9182d9c 2083Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2084{
2085 PERL_ARGS_ASSERT_CX_POPBLOCK;
2086
2087 CX_DEBUG(cx, "POP");
2088 /* these 3 are common to cx_popblock and cx_topblock */
2089 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2090 PL_scopestack_ix = cx->blk_oldscopesp;
2091 PL_curpm = cx->blk_oldpm;
2092
2093 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2094 * and leaves a CX entry lying around for repeated use, so
2095 * skip for multicall */ \
2096 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2097 || PL_savestack_ix == cx->blk_oldsaveix);
2098 PL_curcop = cx->blk_oldcop;
ce8bb8d8 2099 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
2100}
2101
2102/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2103 * Whereas cx_popblock() restores the state to the point just before
2104 * cx_pushblock() was called, cx_topblock() restores it to the point just
2105 * *after* cx_pushblock() was called. */
2106
2107PERL_STATIC_INLINE void
c9182d9c 2108Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2109{
2110 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2111
2112 CX_DEBUG(cx, "TOP");
2113 /* these 3 are common to cx_popblock and cx_topblock */
2114 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2115 PL_scopestack_ix = cx->blk_oldscopesp;
2116 PL_curpm = cx->blk_oldpm;
2117
2118 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2119}
2120
2121
a73d8813 2122PERL_STATIC_INLINE void
c9182d9c 2123Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
a73d8813
DM
2124{
2125 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2126
2127 PERL_ARGS_ASSERT_CX_PUSHSUB;
2128
3f6bd23a 2129 PERL_DTRACE_PROBE_ENTRY(cv);
a73d8813
DM
2130 cx->blk_sub.cv = cv;
2131 cx->blk_sub.olddepth = CvDEPTH(cv);
2132 cx->blk_sub.prevcomppad = PL_comppad;
2133 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2134 cx->blk_sub.retop = retop;
2135 SvREFCNT_inc_simple_void_NN(cv);
2136 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2137}
2138
2139
2140/* subsets of cx_popsub() */
2141
2142PERL_STATIC_INLINE void
c9182d9c 2143Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2144{
2145 CV *cv;
2146
2147 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2148 assert(CxTYPE(cx) == CXt_SUB);
2149
2150 PL_comppad = cx->blk_sub.prevcomppad;
2151 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2152 cv = cx->blk_sub.cv;
2153 CvDEPTH(cv) = cx->blk_sub.olddepth;
2154 cx->blk_sub.cv = NULL;
2155 SvREFCNT_dec(cv);
2156}
2157
2158
2159/* handle the @_ part of leaving a sub */
2160
2161PERL_STATIC_INLINE void
c9182d9c 2162Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2163{
2164 AV *av;
2165
2166 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2167 assert(CxTYPE(cx) == CXt_SUB);
2168 assert(AvARRAY(MUTABLE_AV(
2169 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2170 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2171
2172 CX_POP_SAVEARRAY(cx);
2173 av = MUTABLE_AV(PAD_SVl(0));
2174 if (UNLIKELY(AvREAL(av)))
2175 /* abandon @_ if it got reified */
2176 clear_defarray(av, 0);
2177 else {
2178 CLEAR_ARGARRAY(av);
2179 }
2180}
2181
2182
2183PERL_STATIC_INLINE void
c9182d9c 2184Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2185{
2186 PERL_ARGS_ASSERT_CX_POPSUB;
2187 assert(CxTYPE(cx) == CXt_SUB);
2188
3f6bd23a 2189 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
2190
2191 if (CxHASARGS(cx))
2192 cx_popsub_args(cx);
2193 cx_popsub_common(cx);
2194}
2195
2196
6a7d52cc 2197PERL_STATIC_INLINE void
c9182d9c 2198Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
6a7d52cc
DM
2199{
2200 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2201
2202 cx->blk_format.cv = cv;
2203 cx->blk_format.retop = retop;
2204 cx->blk_format.gv = gv;
2205 cx->blk_format.dfoutgv = PL_defoutgv;
2206 cx->blk_format.prevcomppad = PL_comppad;
2207 cx->blk_u16 = 0;
2208
2209 SvREFCNT_inc_simple_void_NN(cv);
2210 CvDEPTH(cv)++;
2211 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2212}
2213
2214
2215PERL_STATIC_INLINE void
c9182d9c 2216Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
6a7d52cc
DM
2217{
2218 CV *cv;
2219 GV *dfout;
2220
2221 PERL_ARGS_ASSERT_CX_POPFORMAT;
2222 assert(CxTYPE(cx) == CXt_FORMAT);
2223
2224 dfout = cx->blk_format.dfoutgv;
2225 setdefout(dfout);
2226 cx->blk_format.dfoutgv = NULL;
2227 SvREFCNT_dec_NN(dfout);
2228
2229 PL_comppad = cx->blk_format.prevcomppad;
2230 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2231 cv = cx->blk_format.cv;
2232 cx->blk_format.cv = NULL;
2233 --CvDEPTH(cv);
2234 SvREFCNT_dec_NN(cv);
2235}
2236
2237
13febba5 2238PERL_STATIC_INLINE void
c9182d9c 2239Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
13febba5
DM
2240{
2241 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2242
2243 cx->blk_eval.retop = retop;
2244 cx->blk_eval.old_namesv = namesv;
2245 cx->blk_eval.old_eval_root = PL_eval_root;
2246 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2247 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2248 cx->blk_eval.cur_top_env = PL_top_env;
2249
4c57ced5 2250 assert(!(PL_in_eval & ~ 0x3F));
13febba5 2251 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 2252 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
2253}
2254
2255
2256PERL_STATIC_INLINE void
c9182d9c 2257Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
13febba5
DM
2258{
2259 SV *sv;
2260
2261 PERL_ARGS_ASSERT_CX_POPEVAL;
2262 assert(CxTYPE(cx) == CXt_EVAL);
2263
2264 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 2265 assert(!(PL_in_eval & 0xc0));
13febba5
DM
2266 PL_eval_root = cx->blk_eval.old_eval_root;
2267 sv = cx->blk_eval.cur_text;
4c57ced5 2268 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
2269 cx->blk_eval.cur_text = NULL;
2270 SvREFCNT_dec_NN(sv);
2271 }
2272
2273 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
2274 if (sv) {
2275 cx->blk_eval.old_namesv = NULL;
2276 SvREFCNT_dec_NN(sv);
2277 }
13febba5 2278}
6a7d52cc 2279
a73d8813 2280
d1b6bf72
DM
2281/* push a plain loop, i.e.
2282 * { block }
2283 * while (cond) { block }
2284 * for (init;cond;continue) { block }
2285 * This loop can be last/redo'ed etc.
2286 */
2287
2288PERL_STATIC_INLINE void
c9182d9c 2289Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2290{
2291 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2292 cx->blk_loop.my_op = cLOOP;
2293}
2294
2295
2296/* push a true for loop, i.e.
2297 * for var (list) { block }
2298 */
2299
2300PERL_STATIC_INLINE void
c9182d9c 2301Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
d1b6bf72
DM
2302{
2303 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2304
2305 /* this one line is common with cx_pushloop_plain */
2306 cx->blk_loop.my_op = cLOOP;
2307
2308 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2309 cx->blk_loop.itersave = itersave;
2310#ifdef USE_ITHREADS
2311 cx->blk_loop.oldcomppad = PL_comppad;
2312#endif
2313}
2314
2315
2316/* pop all loop types, including plain */
2317
2318PERL_STATIC_INLINE void
c9182d9c 2319Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2320{
2321 PERL_ARGS_ASSERT_CX_POPLOOP;
2322
2323 assert(CxTYPE_is_LOOP(cx));
2324 if ( CxTYPE(cx) == CXt_LOOP_ARY
2325 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2326 {
2327 /* Free ary or cur. This assumes that state_u.ary.ary
2328 * aligns with state_u.lazysv.cur. See cx_dup() */
2329 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2330 cx->blk_loop.state_u.lazysv.cur = NULL;
2331 SvREFCNT_dec_NN(sv);
2332 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2333 sv = cx->blk_loop.state_u.lazysv.end;
2334 cx->blk_loop.state_u.lazysv.end = NULL;
2335 SvREFCNT_dec_NN(sv);
2336 }
2337 }
2338 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2339 SV *cursv;
2340 SV **svp = (cx)->blk_loop.itervar_u.svp;
2341 if ((cx->cx_type & CXp_FOR_GV))
2342 svp = &GvSV((GV*)svp);
2343 cursv = *svp;
2344 *svp = cx->blk_loop.itersave;
2345 cx->blk_loop.itersave = NULL;
2346 SvREFCNT_dec(cursv);
2347 }
2348}
2349
2a7b7c61
DM
2350
2351PERL_STATIC_INLINE void
c9182d9c 2352Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2353{
7896dde7 2354 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2a7b7c61 2355
7896dde7 2356 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2a7b7c61
DM
2357}
2358
2359
2360PERL_STATIC_INLINE void
c9182d9c 2361Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2362{
7896dde7
Z
2363 PERL_ARGS_ASSERT_CX_POPWHEN;
2364 assert(CxTYPE(cx) == CXt_WHEN);
2a7b7c61
DM
2365
2366 PERL_UNUSED_ARG(cx);
59a14f30 2367 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
2368 /* currently NOOP */
2369}
2370
2371
7896dde7 2372PERL_STATIC_INLINE void
c9182d9c 2373Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
7896dde7
Z
2374{
2375 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2376
2377 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2378 cx->blk_givwhen.defsv_save = orig_defsv;
2379}
2380
2381
2382PERL_STATIC_INLINE void
c9182d9c 2383Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
7896dde7
Z
2384{
2385 SV *sv;
2386
2387 PERL_ARGS_ASSERT_CX_POPGIVEN;
2388 assert(CxTYPE(cx) == CXt_GIVEN);
2389
2390 sv = GvSV(PL_defgv);
2391 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2392 cx->blk_givwhen.defsv_save = NULL;
2393 SvREFCNT_dec(sv);
2394}
2395
ec2c235b
KW
2396/* ------------------ util.h ------------------------------------------- */
2397
2398/*
2399=head1 Miscellaneous Functions
2400
2401=for apidoc foldEQ
2402
2403Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2404same
2405case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
2406match themselves and their opposite case counterparts. Non-cased and non-ASCII
2407range bytes match only themselves.
2408
2409=cut
2410*/
2411
2412PERL_STATIC_INLINE I32
2413Perl_foldEQ(const char *s1, const char *s2, I32 len)
2414{
2415 const U8 *a = (const U8 *)s1;
2416 const U8 *b = (const U8 *)s2;
2417
2418 PERL_ARGS_ASSERT_FOLDEQ;
2419
2420 assert(len >= 0);
2421
2422 while (len--) {
2423 if (*a != *b && *a != PL_fold[*b])
2424 return 0;
2425 a++,b++;
2426 }
2427 return 1;
2428}
2429
0f9cb40c 2430PERL_STATIC_INLINE I32
ec2c235b
KW
2431Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2432{
79a1fabd
KW
2433 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
2434 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2435 * does not check for this. Nor does it check that the strings each have
2436 * at least 'len' characters. */
ec2c235b
KW
2437
2438 const U8 *a = (const U8 *)s1;
2439 const U8 *b = (const U8 *)s2;
2440
2441 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2442
2443 assert(len >= 0);
2444
2445 while (len--) {
2446 if (*a != *b && *a != PL_fold_latin1[*b]) {
2447 return 0;
2448 }
2449 a++, b++;
2450 }
2451 return 1;
2452}
2453
2454/*
2455=for apidoc foldEQ_locale
2456
2457Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2458same case-insensitively in the current locale; false otherwise.
2459
2460=cut
2461*/
2462
0f9cb40c 2463PERL_STATIC_INLINE I32
ec2c235b
KW
2464Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2465{
2466 dVAR;
2467 const U8 *a = (const U8 *)s1;
2468 const U8 *b = (const U8 *)s2;
2469
2470 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2471
2472 assert(len >= 0);
2473
2474 while (len--) {
2475 if (*a != *b && *a != PL_fold_locale[*b])
2476 return 0;
2477 a++,b++;
2478 }
2479 return 1;
2480}
2481
6dba01e2
KW
2482#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2483
2484PERL_STATIC_INLINE void *
2485S_my_memrchr(const char * s, const char c, const STRLEN len)
2486{
2487 /* memrchr(), since many platforms lack it */
2488
2489 const char * t = s + len - 1;
2490
2491 PERL_ARGS_ASSERT_MY_MEMRCHR;
2492
2493 while (t >= s) {
2494 if (*t == c) {
2495 return (void *) t;
2496 }
2497 t--;
2498 }
2499
2500 return NULL;
2501}
2502
2503#endif
2504
ed382232 2505/*
c8028aa6
TC
2506 * ex: set ts=8 sts=4 sw=4 et:
2507 */