This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
parts/apicheck.pl: Silence warnings on old perls
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8ed185f9
KW
8 * This file contains tables and code adapted from
9 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
10 * copyright notice:
11
12Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14Permission is hereby granted, free of charge, to any person obtaining a copy of
15this software and associated documentation files (the "Software"), to deal in
16the Software without restriction, including without limitation the rights to
17use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18of the Software, and to permit persons to whom the Software is furnished to do
19so, subject to the following conditions:
20
21The above copyright notice and this permission notice shall be included in all
22copies or substantial portions of the Software.
23
24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30SOFTWARE.
31
32 *
25468daa 33 * This file is a home for static inline functions that cannot go in other
e15e54ff 34 * header files, because they depend on proto.h (included after most other
25468daa
FC
35 * headers) or struct definitions.
36 *
37 * Each section names the header file that the functions "belong" to.
38 */
27669aa4 39
be3a7a5d
KW
40/* ------------------------------- av.h ------------------------------- */
41
c70927a6 42PERL_STATIC_INLINE SSize_t
c9182d9c 43Perl_av_top_index(pTHX_ AV *av)
be3a7a5d
KW
44{
45 PERL_ARGS_ASSERT_AV_TOP_INDEX;
46 assert(SvTYPE(av) == SVt_PVAV);
47
48 return AvFILL(av);
49}
50
1afe1db1
FC
51/* ------------------------------- cv.h ------------------------------- */
52
ae77754a 53PERL_STATIC_INLINE GV *
c9182d9c 54Perl_CvGV(pTHX_ CV *sv)
ae77754a 55{
74804ad1
KW
56 PERL_ARGS_ASSERT_CVGV;
57
ae77754a
FC
58 return CvNAMED(sv)
59 ? Perl_cvgv_from_hek(aTHX_ sv)
60 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
61}
62
1afe1db1 63PERL_STATIC_INLINE I32 *
74804ad1 64Perl_CvDEPTH(const CV * const sv)
1afe1db1 65{
74804ad1 66 PERL_ARGS_ASSERT_CVDEPTH;
1afe1db1 67 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
74804ad1 68
8de47657 69 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
70}
71
d16269d8
PM
72/*
73 CvPROTO returns the prototype as stored, which is not necessarily what
74 the interpreter should be using. Specifically, the interpreter assumes
75 that spaces have been stripped, which has been the case if the prototype
76 was added by toke.c, but is generally not the case if it was added elsewhere.
77 Since we can't enforce the spacelessness at assignment time, this routine
78 provides a temporary copy at parse time with spaces removed.
79 I<orig> is the start of the original buffer, I<len> is the length of the
80 prototype and will be updated when this returns.
81 */
82
5b67adb8 83#ifdef PERL_CORE
d16269d8
PM
84PERL_STATIC_INLINE char *
85S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
86{
87 SV * tmpsv;
88 char * tmps;
89 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
90 tmps = SvPVX(tmpsv);
91 while ((*len)--) {
92 if (!isSPACE(*orig))
93 *tmps++ = *orig;
94 orig++;
95 }
96 *tmps = '\0';
97 *len = tmps - SvPVX(tmpsv);
98 return SvPVX(tmpsv);
99}
5b67adb8 100#endif
d16269d8 101
25fdce4a
FC
102/* ------------------------------- mg.h ------------------------------- */
103
104#if defined(PERL_CORE) || defined(PERL_EXT)
105/* assumes get-magic and stringification have already occurred */
106PERL_STATIC_INLINE STRLEN
107S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
108{
109 assert(mg->mg_type == PERL_MAGIC_regex_global);
110 assert(mg->mg_len != -1);
111 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
112 return (STRLEN)mg->mg_len;
113 else {
114 const STRLEN pos = (STRLEN)mg->mg_len;
115 /* Without this check, we may read past the end of the buffer: */
116 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
117 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
118 }
119}
120#endif
121
03414f05
FC
122/* ------------------------------- pad.h ------------------------------ */
123
124#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
125PERL_STATIC_INLINE bool
b9d5702c 126S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
03414f05 127{
b9d5702c
KW
128 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
129
03414f05
FC
130 /* is seq within the range _LOW to _HIGH ?
131 * This is complicated by the fact that PL_cop_seqmax
132 * may have wrapped around at some point */
133 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
134 return FALSE; /* not yet introduced */
135
136 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
137 /* in compiling scope */
138 if (
139 (seq > COP_SEQ_RANGE_LOW(pn))
140 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
141 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
142 )
143 return TRUE;
144 }
145 else if (
146 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
147 ?
148 ( seq > COP_SEQ_RANGE_LOW(pn)
149 || seq <= COP_SEQ_RANGE_HIGH(pn))
150
151 : ( seq > COP_SEQ_RANGE_LOW(pn)
152 && seq <= COP_SEQ_RANGE_HIGH(pn))
153 )
154 return TRUE;
155 return FALSE;
156}
157#endif
158
33a4312b
FC
159/* ------------------------------- pp.h ------------------------------- */
160
161PERL_STATIC_INLINE I32
c9182d9c 162Perl_TOPMARK(pTHX)
33a4312b
FC
163{
164 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 165 "MARK top %p %" IVdf "\n",
33a4312b
FC
166 PL_markstack_ptr,
167 (IV)*PL_markstack_ptr)));
168 return *PL_markstack_ptr;
169}
170
171PERL_STATIC_INLINE I32
c9182d9c 172Perl_POPMARK(pTHX)
33a4312b
FC
173{
174 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 175 "MARK pop %p %" IVdf "\n",
33a4312b
FC
176 (PL_markstack_ptr-1),
177 (IV)*(PL_markstack_ptr-1))));
178 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
179 return *PL_markstack_ptr--;
180}
181
8d919b0a
FC
182/* ----------------------------- regexp.h ----------------------------- */
183
184PERL_STATIC_INLINE struct regexp *
c9182d9c 185Perl_ReANY(const REGEXP * const re)
8d919b0a 186{
df6b4bd5 187 XPV* const p = (XPV*)SvANY(re);
bdef45de
KW
188
189 PERL_ARGS_ASSERT_REANY;
8d919b0a 190 assert(isREGEXP(re));
bdef45de 191
df6b4bd5
DM
192 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
193 : (struct regexp *)p;
8d919b0a
FC
194}
195
27669aa4
FC
196/* ------------------------------- sv.h ------------------------------- */
197
198PERL_STATIC_INLINE SV *
c9182d9c 199Perl_SvREFCNT_inc(SV *sv)
27669aa4 200{
2439e033 201 if (LIKELY(sv != NULL))
27669aa4
FC
202 SvREFCNT(sv)++;
203 return sv;
204}
205PERL_STATIC_INLINE SV *
c9182d9c 206Perl_SvREFCNT_inc_NN(SV *sv)
27669aa4 207{
3f2f854a
KW
208 PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
209
27669aa4
FC
210 SvREFCNT(sv)++;
211 return sv;
212}
213PERL_STATIC_INLINE void
c9182d9c 214Perl_SvREFCNT_inc_void(SV *sv)
27669aa4 215{
2439e033 216 if (LIKELY(sv != NULL))
27669aa4
FC
217 SvREFCNT(sv)++;
218}
75e16a44 219PERL_STATIC_INLINE void
c9182d9c 220Perl_SvREFCNT_dec(pTHX_ SV *sv)
75e16a44 221{
2439e033 222 if (LIKELY(sv != NULL)) {
75a9bf96 223 U32 rc = SvREFCNT(sv);
79e2a32a 224 if (LIKELY(rc > 1))
75a9bf96
DM
225 SvREFCNT(sv) = rc - 1;
226 else
227 Perl_sv_free2(aTHX_ sv, rc);
75e16a44
FC
228 }
229}
541377b1
FC
230
231PERL_STATIC_INLINE void
c9182d9c 232Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
4a9a56a7
DM
233{
234 U32 rc = SvREFCNT(sv);
3f2f854a
KW
235
236 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
237
79e2a32a 238 if (LIKELY(rc > 1))
4a9a56a7
DM
239 SvREFCNT(sv) = rc - 1;
240 else
241 Perl_sv_free2(aTHX_ sv, rc);
242}
243
244PERL_STATIC_INLINE void
1bd041dc 245Perl_SvAMAGIC_on(SV *sv)
541377b1 246{
1bd041dc 247 PERL_ARGS_ASSERT_SVAMAGIC_ON;
541377b1 248 assert(SvROK(sv));
1bd041dc 249
541377b1
FC
250 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
251}
252PERL_STATIC_INLINE void
1bd041dc 253Perl_SvAMAGIC_off(SV *sv)
541377b1 254{
1bd041dc
KW
255 PERL_ARGS_ASSERT_SVAMAGIC_OFF;
256
541377b1
FC
257 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
258 HvAMAGIC_off(SvSTASH(SvRV(sv)));
259}
260
261PERL_STATIC_INLINE U32
c9182d9c 262Perl_SvPADSTALE_on(SV *sv)
541377b1 263{
c0683843 264 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
265 return SvFLAGS(sv) |= SVs_PADSTALE;
266}
267PERL_STATIC_INLINE U32
c9182d9c 268Perl_SvPADSTALE_off(SV *sv)
541377b1 269{
c0683843 270 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
271 return SvFLAGS(sv) &= ~SVs_PADSTALE;
272}
25fdce4a 273#if defined(PERL_CORE) || defined (PERL_EXT)
4ddea69a 274PERL_STATIC_INLINE STRLEN
6964422a 275S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
4ddea69a 276{
25fdce4a 277 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
4ddea69a
FC
278 if (SvGAMAGIC(sv)) {
279 U8 *hopped = utf8_hop((U8 *)pv, pos);
280 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
281 return (STRLEN)(hopped - (U8 *)pv);
282 }
283 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
284}
285#endif
f019c49e 286
d1decf2b
TC
287/* ------------------------------- handy.h ------------------------------- */
288
289/* saves machine code for a common noreturn idiom typically used in Newx*() */
7347ee54 290GCC_DIAG_IGNORE_DECL(-Wunused-function);
d1decf2b 291static void
c9182d9c 292Perl_croak_memory_wrap(void)
d1decf2b
TC
293{
294 Perl_croak_nocontext("%s",PL_memory_wrap);
295}
7347ee54 296GCC_DIAG_RESTORE_DECL;
d1decf2b 297
a8a2ceaa
KW
298/* ------------------------------- utf8.h ------------------------------- */
299
2fe720e2
KW
300/*
301=head1 Unicode Support
302*/
303
55d09dc8 304PERL_STATIC_INLINE void
c9182d9c 305Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
55d09dc8
KW
306{
307 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
308 * encoded string at '*dest', updating '*dest' to include it */
309
55d09dc8
KW
310 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
311
6f2d5cbc 312 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 313 *((*dest)++) = byte;
55d09dc8 314 else {
a09ec51a
KW
315 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
316 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
317 }
318}
319
e123187a 320/*
2fe720e2 321=for apidoc valid_utf8_to_uvchr
09232555
KW
322Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
323known that the next character in the input UTF-8 string C<s> is well-formed
324(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
325points, and non-Unicode code points are allowed.
2fe720e2
KW
326
327=cut
328
329 */
330
331PERL_STATIC_INLINE UV
332Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
333{
c41b2540 334 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
335 const U8* send = s + expectlen;
336 UV uv = *s;
337
338 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
339
340 if (retlen) {
341 *retlen = expectlen;
342 }
343
344 /* An invariant is trivially returned */
345 if (expectlen == 1) {
346 return uv;
347 }
348
349 /* Remove the leading bits that indicate the number of bytes, leaving just
350 * the bits that are part of the value */
351 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
352
353 /* Now, loop through the remaining bytes, accumulating each into the
354 * working total as we go. (I khw tried unrolling the loop for up to 4
355 * bytes, but there was no performance improvement) */
356 for (++s; s < send; s++) {
357 uv = UTF8_ACCUMULATE(uv, *s);
358 }
359
360 return UNI_TO_NATIVE(uv);
361
362}
363
1e599354
KW
364/*
365=for apidoc is_utf8_invariant_string
366
82c5d941 367Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 368regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
369EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
370are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
371the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
372characters are invariant, but so also are the C1 controls.
1e599354
KW
373
374If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
375use this option, that C<s> can't have embedded C<NUL> characters and has to
376have a terminating C<NUL> byte).
377
9f2abfde
KW
378See also
379C<L</is_utf8_string>>,
380C<L</is_utf8_string_flags>>,
381C<L</is_utf8_string_loc>>,
382C<L</is_utf8_string_loc_flags>>,
383C<L</is_utf8_string_loclen>>,
384C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
385C<L</is_utf8_fixed_width_buf_flags>>,
386C<L</is_utf8_fixed_width_buf_loc_flags>>,
387C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
388C<L</is_strict_utf8_string>>,
389C<L</is_strict_utf8_string_loc>>,
390C<L</is_strict_utf8_string_loclen>>,
391C<L</is_c9strict_utf8_string>>,
392C<L</is_c9strict_utf8_string_loc>>,
393and
394C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
395
396=cut
0cbf5865
KW
397
398*/
399
400#define is_utf8_invariant_string(s, len) \
401 is_utf8_invariant_string_loc(s, len, NULL)
402
403/*
404=for apidoc is_utf8_invariant_string_loc
405
406Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
407the first UTF-8 variant character in the C<ep> pointer; if all characters are
408UTF-8 invariant, this function does not change the contents of C<*ep>.
409
410=cut
411
1e599354
KW
412*/
413
414PERL_STATIC_INLINE bool
c9182d9c 415Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 416{
e17544a6 417 const U8* send;
1e599354
KW
418 const U8* x = s;
419
0cbf5865
KW
420 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
421
e17544a6
KW
422 if (len == 0) {
423 len = strlen((const char *)s);
424 }
425
426 send = s + len;
427
4ab2fd9b 428/* This looks like 0x010101... */
2c5c8af5 429# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
4ab2fd9b
KW
430
431/* This looks like 0x808080... */
2c5c8af5 432# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
e099ea69 433# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
2c5c8af5 434# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
e17544a6 435
099e59a4
KW
436/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
437 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
438 * optimized out completely on a 32-bit system, and its mask gets optimized out
439 * on a 64-bit system */
2c5c8af5 440# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
5eabe374
KW
441 | ( PTR2nat(x) >> 1) \
442 | ( ( (PTR2nat(x) \
443 & PERL_WORD_BOUNDARY_MASK) >> 2))))
099e59a4 444
3f515a2e
KW
445#ifndef EBCDIC
446
099e59a4
KW
447 /* Do the word-at-a-time iff there is at least one usable full word. That
448 * means that after advancing to a word boundary, there still is at least a
449 * full word left. The number of bytes needed to advance is 'wordsize -
450 * offset' unless offset is 0. */
451 if ((STRLEN) (send - x) >= PERL_WORDSIZE
452
453 /* This term is wordsize if subword; 0 if not */
454 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
455
456 /* 'offset' */
457 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
458 {
b40579ff 459
46bb68f6
KW
460 /* Process per-byte until reach word boundary. XXX This loop could be
461 * eliminated if we knew that this platform had fast unaligned reads */
b40579ff 462 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
46bb68f6
KW
463 if (! UTF8_IS_INVARIANT(*x)) {
464 if (ep) {
465 *ep = x;
466 }
e17544a6 467
46bb68f6
KW
468 return FALSE;
469 }
470 x++;
e17544a6 471 }
e17544a6 472
099e59a4
KW
473 /* Here, we know we have at least one full word to process. Process
474 * per-word as long as we have at least a full word left */
475 do {
4ab2fd9b 476 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
e17544a6 477
46bb68f6
KW
478 /* Found a variant. Just return if caller doesn't want its
479 * exact position */
480 if (! ep) {
481 return FALSE;
482 }
e17544a6 483
2c5c8af5
KW
484# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
485 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1d2af574 486
73f0a2eb 487 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
1d2af574
KW
488 assert(*ep >= s && *ep < send);
489
490 return FALSE;
491
2c5c8af5 492# else /* If weird byte order, drop into next loop to do byte-at-a-time
1d2af574
KW
493 checks. */
494
46bb68f6 495 break;
2c5c8af5 496# endif
46bb68f6 497 }
1d2af574 498
46bb68f6 499 x += PERL_WORDSIZE;
1d2af574 500
099e59a4 501 } while (x + PERL_WORDSIZE <= send);
b40579ff 502 }
e17544a6 503
0b08cab0 504#endif /* End of ! EBCDIC */
e17544a6
KW
505
506 /* Process per-byte */
507 while (x < send) {
508 if (! UTF8_IS_INVARIANT(*x)) {
509 if (ep) {
510 *ep = x;
511 }
0cbf5865 512
e17544a6 513 return FALSE;
0cbf5865 514 }
1e599354 515
e17544a6 516 x++;
1e599354
KW
517 }
518
519 return TRUE;
520}
521
23a7ee81
KW
522#ifndef EBCDIC
523
1d2af574 524PERL_STATIC_INLINE unsigned int
73f0a2eb 525Perl_variant_byte_number(PERL_UINTMAX_T word)
1d2af574
KW
526{
527
528 /* This returns the position in a word (0..7) of the first variant byte in
529 * it. This is a helper function. Note that there are no branches */
530
531 assert(word);
532
533 /* Get just the msb bits of each byte */
534 word &= PERL_VARIANTS_WORD_MASK;
535
597ee3f4
KW
536# ifdef USING_MSVC6 /* VC6 has some issues with the normal code, and the
537 easiest thing is to hide that from the callers */
538 {
539 unsigned int i;
540 const U8 * s = (U8 *) &word;
541 dTHX;
542
543 for (i = 0; i < sizeof(word); i++ ) {
544 if (s[i]) {
545 return i;
546 }
547 }
548
549 Perl_croak(aTHX_ "panic: %s: %d: unexpected zero word\n",
550 __FILE__, __LINE__);
551 }
552
553# elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1d2af574
KW
554
555 /* Bytes are stored like
556 * Byte8 ... Byte2 Byte1
557 * 63..56...15...8 7...0
558 *
559 * Isolate the lsb;
560 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
561 *
562 * The word will look this this, with a rightmost set bit in position 's':
563 * ('x's are don't cares)
564 * s
565 * x..x100..0
566 * x..xx10..0 Right shift (rightmost 0 is shifted off)
567 * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
568 * the 1 just to their left into a 0; the remainder is
569 * untouched
4fa92663
KW
570 * 0..0011..1 The xor with the original, x..xx10..0, clears that
571 * remainder, sets the bottom to all 1
1d2af574
KW
572 * 0..0100..0 Add 1 to clear the word except for the bit in 's'
573 *
574 * Another method is to do 'word &= -word'; but it generates a compiler
575 * message on some platforms about taking the negative of an unsigned */
576
577 word >>= 1;
578 word = 1 + (word ^ (word - 1));
579
580# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
581
582 /* Bytes are stored like
583 * Byte1 Byte2 ... Byte8
584 * 63..56 55..47 ... 7...0
585 *
586 * Isolate the msb; http://codeforces.com/blog/entry/10330
587 *
588 * Only the most significant set bit matters. Or'ing word with its right
589 * shift of 1 makes that bit and the next one to its right both 1. Then
590 * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
591 * msb and all to the right being 1. */
592 word |= word >> 1;
593 word |= word >> 2;
594 word |= word >> 4;
595 word |= word >> 8;
596 word |= word >> 16;
597 word |= word >> 32; /* This should get optimized out on 32-bit systems. */
598
599 /* Then subtracting the right shift by 1 clears all but the left-most of
600 * the 1 bits, which is our desired result */
601 word -= (word >> 1);
602
603# else
604# error Unexpected byte order
605# endif
606
7cf2d6c7
KW
607 /* Here 'word' has a single bit set: the msb of the first byte in which it
608 * is set. Calculate that position in the word. We can use this
1d2af574 609 * specialized solution: https://stackoverflow.com/a/32339674/1626653,
67e12c5c
KW
610 * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
611 * just get shifted off at compile time) */
612 word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
613 | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
614 | (39 << 24) | (47 << 16)
615 | (55 << 8) | (63 << 0));
1d2af574
KW
616 word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
617
618 /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
619 word = ((word + 1) >> 3) - 1;
620
621# if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
622
623 /* And invert the result */
624 word = CHARBITS - word - 1;
625
626# endif
627
628 return (unsigned int) word;
629}
630
23a7ee81 631#endif
03c1e4ab
KW
632#if defined(PERL_CORE) || defined(PERL_EXT)
633
634/*
635=for apidoc variant_under_utf8_count
636
637This function looks at the sequence of bytes between C<s> and C<e>, which are
638assumed to be encoded in ASCII/Latin1, and returns how many of them would
639change should the string be translated into UTF-8. Due to the nature of UTF-8,
640each of these would occupy two bytes instead of the single one in the input
641string. Thus, this function returns the precise number of bytes the string
642would expand by when translated to UTF-8.
643
644Unlike most of the other functions that have C<utf8> in their name, the input
645to this function is NOT a UTF-8-encoded string. The function name is slightly
646I<odd> to emphasize this.
647
648This function is internal to Perl because khw thinks that any XS code that
649would want this is probably operating too close to the internals. Presenting a
650valid use case could change that.
651
652See also
653C<L<perlapi/is_utf8_invariant_string>>
654and
655C<L<perlapi/is_utf8_invariant_string_loc>>,
656
657=cut
658
659*/
660
661PERL_STATIC_INLINE Size_t
662S_variant_under_utf8_count(const U8* const s, const U8* const e)
663{
664 const U8* x = s;
665 Size_t count = 0;
666
667 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
668
669# ifndef EBCDIC
670
5d0379de
KW
671 /* Test if the string is long enough to use word-at-a-time. (Logic is the
672 * same as for is_utf8_invariant_string()) */
03c1e4ab
KW
673 if ((STRLEN) (e - x) >= PERL_WORDSIZE
674 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
675 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
676 {
677
678 /* Process per-byte until reach word boundary. XXX This loop could be
679 * eliminated if we knew that this platform had fast unaligned reads */
680 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
681 count += ! UTF8_IS_INVARIANT(*x++);
682 }
683
684 /* Process per-word as long as we have at least a full word left */
74472cc2
KW
685 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
686 explanation of how this works */
e5863284
KW
687 PERL_UINTMAX_T increment
688 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
03c1e4ab
KW
689 * PERL_COUNT_MULTIPLIER)
690 >> ((PERL_WORDSIZE - 1) * CHARBITS);
e5863284 691 count += (Size_t) increment;
03c1e4ab
KW
692 x += PERL_WORDSIZE;
693 } while (x + PERL_WORDSIZE <= e);
694 }
695
696# endif
697
698 /* Process per-byte */
699 while (x < e) {
700 if (! UTF8_IS_INVARIANT(*x)) {
701 count++;
702 }
703
704 x++;
705 }
706
707 return count;
708}
709
710#endif
711
aff4cafe
KW
712#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
713# undef PERL_WORDSIZE
714# undef PERL_COUNT_MULTIPLIER
715# undef PERL_WORD_BOUNDARY_MASK
716# undef PERL_VARIANTS_WORD_MASK
717#endif
03c1e4ab 718
7c93d8f0 719/*
5ff889fb
KW
720=for apidoc is_utf8_string
721
82c5d941
KW
722Returns TRUE if the first C<len> bytes of string C<s> form a valid
723Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
724be calculated using C<strlen(s)> (which means if you use this option, that C<s>
725can't have embedded C<NUL> characters and has to have a terminating C<NUL>
726byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
727
2717076a
KW
728This function considers Perl's extended UTF-8 to be valid. That means that
729code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
730considered valid by this function. Use C<L</is_strict_utf8_string>>,
731C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
732code points are considered valid.
5ff889fb 733
9f2abfde
KW
734See also
735C<L</is_utf8_invariant_string>>,
0cbf5865 736C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
737C<L</is_utf8_string_loc>>,
738C<L</is_utf8_string_loclen>>,
8bc127bf
KW
739C<L</is_utf8_fixed_width_buf_flags>>,
740C<L</is_utf8_fixed_width_buf_loc_flags>>,
741C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
742
743=cut
744*/
745
dd237e82 746#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
5ff889fb 747
c9cd936b
KW
748#if defined(PERL_CORE) || defined (PERL_EXT)
749
750/*
751=for apidoc is_utf8_non_invariant_string
752
753Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
754C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
755UTF-8; otherwise returns FALSE.
756
757A TRUE return means that at least one code point represented by the sequence
758either is a wide character not representable as a single byte, or the
759representation differs depending on whether the sequence is encoded in UTF-8 or
760not.
761
762See also
763C<L<perlapi/is_utf8_invariant_string>>,
764C<L<perlapi/is_utf8_string>>
765
766=cut
767
768This is commonly used to determine if a SV's UTF-8 flag should be turned on.
b3b93dfe
KW
769It generally needn't be if its string is entirely UTF-8 invariant, and it
770shouldn't be if it otherwise contains invalid UTF-8.
c9cd936b
KW
771
772It is an internal function because khw thinks that XS code shouldn't be working
773at this low a level. A valid use case could change that.
774
775*/
776
777PERL_STATIC_INLINE bool
86a87e17 778Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
c9cd936b
KW
779{
780 const U8 * first_variant;
781
782 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
783
784 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
785 return FALSE;
786 }
787
788 return is_utf8_string(first_variant, len - (first_variant - s));
789}
790
791#endif
792
5ff889fb 793/*
9f2abfde
KW
794=for apidoc is_strict_utf8_string
795
796Returns TRUE if the first C<len> bytes of string C<s> form a valid
797UTF-8-encoded string that is fully interchangeable by any application using
798Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
799calculated using C<strlen(s)> (which means if you use this option, that C<s>
800can't have embedded C<NUL> characters and has to have a terminating C<NUL>
801byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
802
803This function returns FALSE for strings containing any
804code points above the Unicode max of 0x10FFFF, surrogate code points, or
805non-character code points.
806
807See also
808C<L</is_utf8_invariant_string>>,
0cbf5865 809C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
810C<L</is_utf8_string>>,
811C<L</is_utf8_string_flags>>,
812C<L</is_utf8_string_loc>>,
813C<L</is_utf8_string_loc_flags>>,
814C<L</is_utf8_string_loclen>>,
815C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
816C<L</is_utf8_fixed_width_buf_flags>>,
817C<L</is_utf8_fixed_width_buf_loc_flags>>,
818C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
819C<L</is_strict_utf8_string_loc>>,
820C<L</is_strict_utf8_string_loclen>>,
821C<L</is_c9strict_utf8_string>>,
822C<L</is_c9strict_utf8_string_loc>>,
823and
824C<L</is_c9strict_utf8_string_loclen>>.
825
826=cut
827*/
828
dd237e82 829#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
830
831/*
832=for apidoc is_c9strict_utf8_string
833
834Returns TRUE if the first C<len> bytes of string C<s> form a valid
835UTF-8-encoded string that conforms to
836L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
837otherwise it returns FALSE. If C<len> is 0, it will be calculated using
838C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
839C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
840characters being ASCII constitute 'a valid UTF-8 string'.
841
842This function returns FALSE for strings containing any code points above the
843Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
844code points per
845L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
846
847See also
848C<L</is_utf8_invariant_string>>,
0cbf5865 849C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
850C<L</is_utf8_string>>,
851C<L</is_utf8_string_flags>>,
852C<L</is_utf8_string_loc>>,
853C<L</is_utf8_string_loc_flags>>,
854C<L</is_utf8_string_loclen>>,
855C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
856C<L</is_utf8_fixed_width_buf_flags>>,
857C<L</is_utf8_fixed_width_buf_loc_flags>>,
858C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
859C<L</is_strict_utf8_string>>,
860C<L</is_strict_utf8_string_loc>>,
861C<L</is_strict_utf8_string_loclen>>,
862C<L</is_c9strict_utf8_string_loc>>,
863and
864C<L</is_c9strict_utf8_string_loclen>>.
865
866=cut
867*/
868
dd237e82 869#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
9f2abfde
KW
870
871/*
872=for apidoc is_utf8_string_flags
873
874Returns TRUE if the first C<len> bytes of string C<s> form a valid
875UTF-8 string, subject to the restrictions imposed by C<flags>;
876returns FALSE otherwise. If C<len> is 0, it will be calculated
877using C<strlen(s)> (which means if you use this option, that C<s> can't have
878embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
879that all characters being ASCII constitute 'a valid UTF-8 string'.
880
881If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
882C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
883as C<L</is_strict_utf8_string>>; and if C<flags> is
884C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
885C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
886combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
887C<L</utf8n_to_uvchr>>, with the same meanings.
888
889See also
890C<L</is_utf8_invariant_string>>,
0cbf5865 891C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
892C<L</is_utf8_string>>,
893C<L</is_utf8_string_loc>>,
894C<L</is_utf8_string_loc_flags>>,
895C<L</is_utf8_string_loclen>>,
896C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
897C<L</is_utf8_fixed_width_buf_flags>>,
898C<L</is_utf8_fixed_width_buf_loc_flags>>,
899C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
900C<L</is_strict_utf8_string>>,
901C<L</is_strict_utf8_string_loc>>,
902C<L</is_strict_utf8_string_loclen>>,
903C<L</is_c9strict_utf8_string>>,
904C<L</is_c9strict_utf8_string_loc>>,
905and
906C<L</is_c9strict_utf8_string_loclen>>.
907
908=cut
909*/
910
911PERL_STATIC_INLINE bool
c9182d9c 912Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 913{
33756530 914 const U8 * first_variant;
9f2abfde
KW
915
916 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
917 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 918 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 919
f60f61fd
KW
920 if (len == 0) {
921 len = strlen((const char *)s);
922 }
923
9f2abfde
KW
924 if (flags == 0) {
925 return is_utf8_string(s, len);
926 }
927
d044b7a7 928 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
929 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
930 {
931 return is_strict_utf8_string(s, len);
932 }
933
d044b7a7 934 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
935 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
936 {
937 return is_c9strict_utf8_string(s, len);
938 }
939
33756530
KW
940 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
941 const U8* const send = s + len;
942 const U8* x = first_variant;
943
a0d7f935
KW
944 while (x < send) {
945 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
946 if (UNLIKELY(! cur_len)) {
947 return FALSE;
948 }
949 x += cur_len;
9f2abfde 950 }
33756530 951 }
9f2abfde
KW
952
953 return TRUE;
954}
955
956/*
5ff889fb
KW
957
958=for apidoc is_utf8_string_loc
959
2717076a 960Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 961case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 962"utf8ness success") in the C<ep> pointer.
5ff889fb 963
2717076a 964See also C<L</is_utf8_string_loclen>>.
5ff889fb 965
3964c812
KW
966=cut
967*/
968
969#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
970
971/*
972
5ff889fb
KW
973=for apidoc is_utf8_string_loclen
974
2717076a 975Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 976case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 977"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 978encoded characters in the C<el> pointer.
5ff889fb 979
2717076a 980See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
981
982=cut
983*/
984
56e4cf64 985PERL_STATIC_INLINE bool
33756530 986Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb 987{
33756530 988 const U8 * first_variant;
5ff889fb
KW
989
990 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
991
33756530
KW
992 if (len == 0) {
993 len = strlen((const char *) s);
994 }
995
996 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
997 if (el)
998 *el = len;
999
1000 if (ep) {
1001 *ep = s + len;
1002 }
1003
1004 return TRUE;
1005 }
1006
1007 {
1008 const U8* const send = s + len;
1009 const U8* x = first_variant;
1010 STRLEN outlen = first_variant - s;
1011
a0d7f935
KW
1012 while (x < send) {
1013 const STRLEN cur_len = isUTF8_CHAR(x, send);
1014 if (UNLIKELY(! cur_len)) {
1015 break;
1016 }
1017 x += cur_len;
1018 outlen++;
5ff889fb 1019 }
5ff889fb 1020
a0d7f935
KW
1021 if (el)
1022 *el = outlen;
5ff889fb 1023
a0d7f935
KW
1024 if (ep) {
1025 *ep = x;
1026 }
5ff889fb 1027
a0d7f935 1028 return (x == send);
33756530 1029 }
5ff889fb
KW
1030}
1031
1032/*
9f2abfde 1033
44170c9a 1034=for apidoc isUTF8_CHAR
8ed185f9
KW
1035
1036Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1037looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1038that represents some code point; otherwise it evaluates to 0. If non-zero, the
1039value gives how many bytes starting at C<s> comprise the code point's
1040representation. Any bytes remaining before C<e>, but beyond the ones needed to
1041form the first code point in C<s>, are not examined.
1042
13aab5dd 1043The code point can be any that will fit in an IV on this machine, using Perl's
8ed185f9
KW
1044extension to official UTF-8 to represent those higher than the Unicode maximum
1045of 0x10FFFF. That means that this macro is used to efficiently decide if the
1046next few bytes in C<s> is legal UTF-8 for a single character.
1047
1048Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1049defined by Unicode to be fully interchangeable across applications;
1050C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1051#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1052code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1053
1054Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1055C<L</is_utf8_string_loclen>> to check entire strings.
1056
13aab5dd
KW
1057Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1058machines) is a valid UTF-8 character.
8ed185f9
KW
1059
1060=cut
1061
1062This uses an adaptation of the table and algorithm given in
1063http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1064documentation of the original version. A copyright notice for the original
1065version is given at the beginning of this file. The Perl adapation is
71525f77 1066documented at the definition of PL_extended_utf8_dfa_tab[].
8ed185f9
KW
1067
1068*/
1069
1070PERL_STATIC_INLINE Size_t
c9182d9c 1071Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
8ed185f9
KW
1072{
1073 const U8 * s = s0;
1074 UV state = 0;
1075
1076 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1077
1078 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1079 * code point, which can be returned immediately. Otherwise, it is either
1080 * malformed, or for the start byte FF which the dfa doesn't handle (except
1081 * on 32-bit ASCII platforms where it trivially is an error). Call a
1082 * helper function for the other platforms. */
1083
1084 while (s < e && LIKELY(state != 1)) {
71525f77 1085 state = PL_extended_utf8_dfa_tab[256
8ed185f9 1086 + state
71525f77 1087 + PL_extended_utf8_dfa_tab[*s]];
8ed185f9
KW
1088 if (state != 0) {
1089 s++;
1090 continue;
1091 }
1092
1093 return s - s0 + 1;
1094 }
1095
1096#if defined(UV_IS_QUAD) || defined(EBCDIC)
1097
1098 if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
1376b35c 1099 return is_utf8_char_helper(s0, e, 0);
8ed185f9
KW
1100 }
1101
1102#endif
1103
1104 return 0;
1105}
1106
1107/*
1108
67049a5f
KW
1109=for apidoc isSTRICT_UTF8_CHAR
1110
1111Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1112looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1113Unicode code point completely acceptable for open interchange between all
1114applications; otherwise it evaluates to 0. If non-zero, the value gives how
1115many bytes starting at C<s> comprise the code point's representation. Any
1116bytes remaining before C<e>, but beyond the ones needed to form the first code
1117point in C<s>, are not examined.
1118
1119The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1120be a surrogate nor a non-character code point. Thus this excludes any code
1121point from Perl's extended UTF-8.
1122
1123This is used to efficiently decide if the next few bytes in C<s> is
1124legal Unicode-acceptable UTF-8 for a single character.
1125
1126Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1127#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1128code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1129and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1130
1131Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1132C<L</is_strict_utf8_string_loclen>> to check entire strings.
1133
1134=cut
1135
1136This uses an adaptation of the tables and algorithm given in
1137http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1138documentation of the original version. A copyright notice for the original
1139version is given at the beginning of this file. The Perl adapation is
1140documented at the definition of strict_extended_utf8_dfa_tab[].
1141
1142*/
1143
1144PERL_STATIC_INLINE Size_t
c9182d9c 1145Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
67049a5f
KW
1146{
1147 const U8 * s = s0;
1148 UV state = 0;
1149
1150 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1151
1152 while (s < e && LIKELY(state != 1)) {
71525f77 1153 state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
67049a5f
KW
1154
1155 if (state != 0) {
1156 s++;
1157 continue;
1158 }
1159
1160 return s - s0 + 1;
1161 }
1162
1163#ifndef EBCDIC
1164
1165 /* The dfa above drops out for certain Hanguls; handle them specially */
1166 if (is_HANGUL_ED_utf8_safe(s0, e)) {
1167 return 3;
1168 }
1169
1170#endif
1171
1172 return 0;
1173}
1174
1175/*
1176
44170c9a 1177=for apidoc isC9_STRICT_UTF8_CHAR
c5bfbb64
KW
1178
1179Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1180looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1181Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1182the value gives how many bytes starting at C<s> comprise the code point's
1183representation. Any bytes remaining before C<e>, but beyond the ones needed to
1184form the first code point in C<s>, are not examined.
1185
1186The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1187differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1188code points. This corresponds to
1189L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1190which said that non-character code points are merely discouraged rather than
1191completely forbidden in open interchange. See
1192L<perlunicode/Noncharacter code points>.
1193
1194Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1195C<L</isUTF8_CHAR_flags>> for a more customized definition.
1196
1197Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1198C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1199
1200=cut
1201
1202This uses an adaptation of the tables and algorithm given in
1203http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1204documentation of the original version. A copyright notice for the original
1205version is given at the beginning of this file. The Perl adapation is
71525f77 1206documented at the definition of PL_c9_utf8_dfa_tab[].
c5bfbb64
KW
1207
1208*/
1209
1210PERL_STATIC_INLINE Size_t
c9182d9c 1211Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
c5bfbb64
KW
1212{
1213 const U8 * s = s0;
1214 UV state = 0;
1215
1216 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1217
1218 while (s < e && LIKELY(state != 1)) {
71525f77 1219 state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
c5bfbb64
KW
1220
1221 if (state != 0) {
1222 s++;
1223 continue;
1224 }
1225
1226 return s - s0 + 1;
1227 }
1228
1229 return 0;
1230}
1231
1232/*
1233
9f2abfde
KW
1234=for apidoc is_strict_utf8_string_loc
1235
1236Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1237case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1238"utf8ness success") in the C<ep> pointer.
1239
1240See also C<L</is_strict_utf8_string_loclen>>.
1241
1242=cut
1243*/
1244
1245#define is_strict_utf8_string_loc(s, len, ep) \
1246 is_strict_utf8_string_loclen(s, len, ep, 0)
1247
1248/*
1249
1250=for apidoc is_strict_utf8_string_loclen
1251
1252Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1253case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1254"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1255encoded characters in the C<el> pointer.
1256
1257See also C<L</is_strict_utf8_string_loc>>.
1258
1259=cut
1260*/
1261
1262PERL_STATIC_INLINE bool
c9182d9c 1263Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1264{
33756530 1265 const U8 * first_variant;
9f2abfde
KW
1266
1267 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1268
33756530
KW
1269 if (len == 0) {
1270 len = strlen((const char *) s);
1271 }
1272
1273 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1274 if (el)
1275 *el = len;
1276
1277 if (ep) {
1278 *ep = s + len;
1279 }
1280
1281 return TRUE;
1282 }
1283
1284 {
1285 const U8* const send = s + len;
1286 const U8* x = first_variant;
1287 STRLEN outlen = first_variant - s;
1288
a0d7f935
KW
1289 while (x < send) {
1290 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1291 if (UNLIKELY(! cur_len)) {
1292 break;
1293 }
1294 x += cur_len;
1295 outlen++;
9f2abfde 1296 }
9f2abfde 1297
a0d7f935
KW
1298 if (el)
1299 *el = outlen;
9f2abfde 1300
a0d7f935
KW
1301 if (ep) {
1302 *ep = x;
1303 }
9f2abfde 1304
a0d7f935 1305 return (x == send);
33756530 1306 }
9f2abfde
KW
1307}
1308
1309/*
1310
1311=for apidoc is_c9strict_utf8_string_loc
1312
1313Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1314the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1315"utf8ness success") in the C<ep> pointer.
1316
1317See also C<L</is_c9strict_utf8_string_loclen>>.
1318
1319=cut
1320*/
1321
1322#define is_c9strict_utf8_string_loc(s, len, ep) \
1323 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1324
1325/*
1326
1327=for apidoc is_c9strict_utf8_string_loclen
1328
1329Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1330the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1331"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1332characters in the C<el> pointer.
1333
1334See also C<L</is_c9strict_utf8_string_loc>>.
1335
1336=cut
1337*/
1338
1339PERL_STATIC_INLINE bool
c9182d9c 1340Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1341{
33756530 1342 const U8 * first_variant;
9f2abfde
KW
1343
1344 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1345
33756530
KW
1346 if (len == 0) {
1347 len = strlen((const char *) s);
1348 }
1349
1350 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1351 if (el)
1352 *el = len;
1353
1354 if (ep) {
1355 *ep = s + len;
1356 }
1357
1358 return TRUE;
1359 }
1360
1361 {
1362 const U8* const send = s + len;
1363 const U8* x = first_variant;
1364 STRLEN outlen = first_variant - s;
1365
a0d7f935
KW
1366 while (x < send) {
1367 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1368 if (UNLIKELY(! cur_len)) {
1369 break;
1370 }
1371 x += cur_len;
1372 outlen++;
9f2abfde 1373 }
9f2abfde 1374
a0d7f935
KW
1375 if (el)
1376 *el = outlen;
9f2abfde 1377
a0d7f935
KW
1378 if (ep) {
1379 *ep = x;
1380 }
9f2abfde 1381
a0d7f935 1382 return (x == send);
33756530 1383 }
9f2abfde
KW
1384}
1385
1386/*
1387
1388=for apidoc is_utf8_string_loc_flags
1389
1390Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1391case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1392"utf8ness success") in the C<ep> pointer.
1393
1394See also C<L</is_utf8_string_loclen_flags>>.
1395
1396=cut
1397*/
1398
1399#define is_utf8_string_loc_flags(s, len, ep, flags) \
1400 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1401
1402
1403/* The above 3 actual functions could have been moved into the more general one
1404 * just below, and made #defines that call it with the right 'flags'. They are
1405 * currently kept separate to increase their chances of getting inlined */
1406
1407/*
1408
1409=for apidoc is_utf8_string_loclen_flags
1410
1411Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1412case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1413"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1414encoded characters in the C<el> pointer.
1415
1416See also C<L</is_utf8_string_loc_flags>>.
1417
1418=cut
1419*/
1420
1421PERL_STATIC_INLINE bool
c9182d9c 1422Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 1423{
33756530 1424 const U8 * first_variant;
9f2abfde
KW
1425
1426 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1427 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1428 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1429
f60f61fd 1430 if (len == 0) {
a0d7f935 1431 len = strlen((const char *) s);
f60f61fd
KW
1432 }
1433
9f2abfde
KW
1434 if (flags == 0) {
1435 return is_utf8_string_loclen(s, len, ep, el);
1436 }
1437
d044b7a7 1438 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1439 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1440 {
1441 return is_strict_utf8_string_loclen(s, len, ep, el);
1442 }
1443
d044b7a7 1444 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1445 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1446 {
1447 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1448 }
1449
33756530
KW
1450 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1451 if (el)
1452 *el = len;
1453
1454 if (ep) {
1455 *ep = s + len;
1456 }
1457
1458 return TRUE;
1459 }
1460
1461 {
1462 const U8* send = s + len;
1463 const U8* x = first_variant;
1464 STRLEN outlen = first_variant - s;
1465
a0d7f935
KW
1466 while (x < send) {
1467 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1468 if (UNLIKELY(! cur_len)) {
1469 break;
1470 }
1471 x += cur_len;
1472 outlen++;
9f2abfde 1473 }
9f2abfde 1474
a0d7f935
KW
1475 if (el)
1476 *el = outlen;
9f2abfde 1477
a0d7f935
KW
1478 if (ep) {
1479 *ep = x;
1480 }
9f2abfde 1481
a0d7f935 1482 return (x == send);
33756530 1483 }
9f2abfde
KW
1484}
1485
1486/*
7c93d8f0
KW
1487=for apidoc utf8_distance
1488
1489Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1490and C<b>.
1491
1492WARNING: use only if you *know* that the pointers point inside the
1493same UTF-8 buffer.
1494
1495=cut
1496*/
1497
1498PERL_STATIC_INLINE IV
1499Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1500{
1501 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1502
1503 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1504}
1505
1506/*
1507=for apidoc utf8_hop
1508
1509Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1510forward or backward.
1511
1512WARNING: do not use the following unless you *know* C<off> is within
1513the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1514on the first byte of character or just after the last byte of a character.
1515
1516=cut
1517*/
1518
1519PERL_STATIC_INLINE U8 *
1520Perl_utf8_hop(const U8 *s, SSize_t off)
1521{
1522 PERL_ARGS_ASSERT_UTF8_HOP;
1523
1524 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1525 * the bitops (especially ~) can create illegal UTF-8.
1526 * In other words: in Perl UTF-8 is not just for Unicode. */
1527
1528 if (off >= 0) {
1529 while (off--)
1530 s += UTF8SKIP(s);
1531 }
1532 else {
1533 while (off++) {
1534 s--;
1535 while (UTF8_IS_CONTINUATION(*s))
1536 s--;
1537 }
1538 }
e099ea69 1539 GCC_DIAG_IGNORE(-Wcast-qual)
7c93d8f0 1540 return (U8 *)s;
e099ea69 1541 GCC_DIAG_RESTORE
7c93d8f0
KW
1542}
1543
4dab108f 1544/*
65df57a8
TC
1545=for apidoc utf8_hop_forward
1546
1547Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1548forward.
1549
1550C<off> must be non-negative.
1551
1552C<s> must be before or equal to C<end>.
1553
1554When moving forward it will not move beyond C<end>.
1555
1556Will not exceed this limit even if the string is not valid "UTF-8".
1557
1558=cut
1559*/
1560
1561PERL_STATIC_INLINE U8 *
1562Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1563{
1564 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1565
1566 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1567 * the bitops (especially ~) can create illegal UTF-8.
1568 * In other words: in Perl UTF-8 is not just for Unicode. */
1569
1570 assert(s <= end);
1571 assert(off >= 0);
1572
1573 while (off--) {
1574 STRLEN skip = UTF8SKIP(s);
de979548 1575 if ((STRLEN)(end - s) <= skip) {
e099ea69 1576 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1577 return (U8 *)end;
e099ea69 1578 GCC_DIAG_RESTORE
de979548 1579 }
65df57a8
TC
1580 s += skip;
1581 }
1582
e099ea69 1583 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1584 return (U8 *)s;
e099ea69 1585 GCC_DIAG_RESTORE
65df57a8
TC
1586}
1587
1588/*
1589=for apidoc utf8_hop_back
1590
1591Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1592backward.
1593
1594C<off> must be non-positive.
1595
1596C<s> must be after or equal to C<start>.
1597
1598When moving backward it will not move before C<start>.
1599
1600Will not exceed this limit even if the string is not valid "UTF-8".
1601
1602=cut
1603*/
1604
1605PERL_STATIC_INLINE U8 *
1606Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1607{
1608 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1609
1610 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1611 * the bitops (especially ~) can create illegal UTF-8.
1612 * In other words: in Perl UTF-8 is not just for Unicode. */
1613
1614 assert(start <= s);
1615 assert(off <= 0);
1616
1617 while (off++ && s > start) {
e7185695 1618 do {
65df57a8 1619 s--;
e7185695 1620 } while (UTF8_IS_CONTINUATION(*s) && s > start);
65df57a8
TC
1621 }
1622
e099ea69 1623 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1624 return (U8 *)s;
e099ea69 1625 GCC_DIAG_RESTORE
65df57a8
TC
1626}
1627
1628/*
1629=for apidoc utf8_hop_safe
1630
1631Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1632either forward or backward.
1633
1634When moving backward it will not move before C<start>.
1635
1636When moving forward it will not move beyond C<end>.
1637
1638Will not exceed those limits even if the string is not valid "UTF-8".
1639
1640=cut
1641*/
1642
1643PERL_STATIC_INLINE U8 *
1644Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1645{
1646 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1647
1648 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1649 * the bitops (especially ~) can create illegal UTF-8.
1650 * In other words: in Perl UTF-8 is not just for Unicode. */
1651
1652 assert(start <= s && s <= end);
1653
1654 if (off >= 0) {
1655 return utf8_hop_forward(s, off, end);
1656 }
1657 else {
1658 return utf8_hop_back(s, off, start);
1659 }
1660}
1661
1662/*
4dab108f
KW
1663
1664=for apidoc is_utf8_valid_partial_char
1665
6cbb9248
KW
1666Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1667S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1668points. Otherwise, it returns 1 if there exists at least one non-empty
1669sequence of bytes that when appended to sequence C<s>, starting at position
1670C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1671otherwise returns 0.
1672
1673In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1674point.
1675
1676This is useful when a fixed-length buffer is being tested for being well-formed
1677UTF-8, but the final few bytes in it don't comprise a full character; that is,
1678it is split somewhere in the middle of the final code point's UTF-8
1679representation. (Presumably when the buffer is refreshed with the next chunk
1680of data, the new first bytes will complete the partial code point.) This
1681function is used to verify that the final bytes in the current buffer are in
1682fact the legal beginning of some code point, so that if they aren't, the
1683failure can be signalled without having to wait for the next read.
4dab108f
KW
1684
1685=cut
1686*/
2717076a
KW
1687#define is_utf8_valid_partial_char(s, e) \
1688 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
1689
1690/*
1691
1692=for apidoc is_utf8_valid_partial_char_flags
1693
1694Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1695or not the input is a valid UTF-8 encoded partial character, but it takes an
1696extra parameter, C<flags>, which can further restrict which code points are
1697considered valid.
1698
1699If C<flags> is 0, this behaves identically to
1700C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1701of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1702there is any sequence of bytes that can complete the input partial character in
1703such a way that a non-prohibited character is formed, the function returns
2717076a
KW
1704TRUE; otherwise FALSE. Non character code points cannot be determined based on
1705partial character input. But many of the other possible excluded types can be
f1c999a7
KW
1706determined from just the first one or two bytes.
1707
1708=cut
1709 */
1710
56e4cf64 1711PERL_STATIC_INLINE bool
c9182d9c 1712Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
4dab108f 1713{
f1c999a7 1714 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
4dab108f 1715
f1c999a7 1716 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1717 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 1718
8875bd48 1719 if (s >= e || s + UTF8SKIP(s) <= e) {
4dab108f
KW
1720 return FALSE;
1721 }
1722
1376b35c 1723 return cBOOL(is_utf8_char_helper(s, e, flags));
4dab108f
KW
1724}
1725
8bc127bf
KW
1726/*
1727
1728=for apidoc is_utf8_fixed_width_buf_flags
1729
1730Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1731is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1732otherwise it returns FALSE.
1733
1734If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1735without restriction. If the final few bytes of the buffer do not form a
1736complete code point, this will return TRUE anyway, provided that
1737C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1738
1739If C<flags> in non-zero, it can be any combination of the
1740C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1741same meanings.
1742
1743This function differs from C<L</is_utf8_string_flags>> only in that the latter
1744returns FALSE if the final few bytes of the string don't form a complete code
1745point.
1746
1747=cut
1748 */
1749#define is_utf8_fixed_width_buf_flags(s, len, flags) \
1750 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1751
1752/*
1753
1754=for apidoc is_utf8_fixed_width_buf_loc_flags
1755
1756Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1757failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1758to the beginning of any partial character at the end of the buffer; if there is
1759no partial character C<*ep> will contain C<s>+C<len>.
1760
1761See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1762
1763=cut
1764*/
1765
1766#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1767 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1768
1769/*
1770
1771=for apidoc is_utf8_fixed_width_buf_loclen_flags
1772
1773Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1774complete, valid characters found in the C<el> pointer.
1775
1776=cut
1777*/
1778
1779PERL_STATIC_INLINE bool
c9182d9c 1780Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
33756530 1781 STRLEN len,
8bc127bf
KW
1782 const U8 **ep,
1783 STRLEN *el,
1784 const U32 flags)
1785{
1786 const U8 * maybe_partial;
1787
1788 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1789
1790 if (! ep) {
1791 ep = &maybe_partial;
1792 }
1793
1794 /* If it's entirely valid, return that; otherwise see if the only error is
1795 * that the final few bytes are for a partial character */
1796 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1797 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1798}
1799
e6a4ffc3 1800PERL_STATIC_INLINE UV
c9182d9c 1801Perl_utf8n_to_uvchr_msgs(const U8 *s,
e6a4ffc3
KW
1802 STRLEN curlen,
1803 STRLEN *retlen,
1804 const U32 flags,
1805 U32 * errors,
1806 AV ** msgs)
1807{
1808 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
1809 * simple cases, and, if necessary calls a helper function to deal with the
1810 * more complex ones. Almost all well-formed non-problematic code points
1811 * are considered simple, so that it's unlikely that the helper function
1812 * will need to be called.
1813 *
1814 * This is an adaptation of the tables and algorithm given in
1815 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
1816 * comprehensive documentation of the original version. A copyright notice
1817 * for the original version is given at the beginning of this file. The
71525f77 1818 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
e6a4ffc3
KW
1819 */
1820
1821 const U8 * const s0 = s;
1822 const U8 * send = s0 + curlen;
f73cfe7d 1823 UV uv = 0; /* The 0 silences some stupid compilers */
e6a4ffc3
KW
1824 UV state = 0;
1825
1826 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1827
1828 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1829 * non-problematic code point, which can be returned immediately.
1830 * Otherwise we call a helper function to figure out the more complicated
1831 * cases. */
1832
1833 while (s < send && LIKELY(state != 1)) {
71525f77 1834 UV type = PL_strict_utf8_dfa_tab[*s];
e6a4ffc3
KW
1835
1836 uv = (state == 0)
1837 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1838 : UTF8_ACCUMULATE(uv, *s);
71525f77 1839 state = PL_strict_utf8_dfa_tab[256 + state + type];
e6a4ffc3
KW
1840
1841 if (state != 0) {
1842 s++;
1843 continue;
1844 }
1845
1846 if (retlen) {
1847 *retlen = s - s0 + 1;
1848 }
1849 if (errors) {
1850 *errors = 0;
1851 }
1852 if (msgs) {
1853 *msgs = NULL;
1854 }
1855
cfebc7aa 1856 return UNI_TO_NATIVE(uv);
e6a4ffc3
KW
1857 }
1858
1859 /* Here is potentially problematic. Use the full mechanism */
1860 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
1861}
1862
82651abe 1863PERL_STATIC_INLINE UV
9a9a6c98 1864Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
82651abe 1865{
9a9a6c98 1866 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
82651abe
KW
1867
1868 assert(s < send);
1869
1870 if (! ckWARN_d(WARN_UTF8)) {
3eaa7592
KW
1871
1872 /* EMPTY is not really allowed, and asserts on debugging builds. But
1873 * on non-debugging we have to deal with it, and this causes it to
1874 * return the REPLACEMENT CHARACTER, as the documentation indicates */
82651abe 1875 return utf8n_to_uvchr(s, send - s, retlen,
3eaa7592 1876 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
82651abe
KW
1877 }
1878 else {
1879 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
1880 if (retlen && ret == 0 && *s != '\0') {
1881 *retlen = (STRLEN) -1;
1882 }
1883
1884 return ret;
1885 }
1886}
1887
c8028aa6
TC
1888/* ------------------------------- perl.h ----------------------------- */
1889
1890/*
dcccc8ff
KW
1891=head1 Miscellaneous Functions
1892
44170c9a 1893=for apidoc is_safe_syscall
c8028aa6 1894
1a0efc9a
KW
1895Test that the given C<pv> (with length C<len>) doesn't contain any internal
1896C<NUL> characters.
1897If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
1898category, and return FALSE.
c8028aa6
TC
1899
1900Return TRUE if the name is safe.
1901
1a0efc9a
KW
1902C<what> and C<op_name> are used in any warning.
1903
796b6530 1904Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
1905
1906=cut
1907*/
1908
1909PERL_STATIC_INLINE bool
ffd62fc2
KW
1910Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
1911{
c8028aa6
TC
1912 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1913 * perl itself uses xce*() functions which accept 8-bit strings.
1914 */
1915
1916 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1917
6c4650b3 1918 if (len > 1) {
c8028aa6 1919 char *null_at;
41188aa0 1920 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 1921 SETERRNO(ENOENT, LIB_INVARG);
1d505182 1922 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 1923 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 1924 what, op_name, pv, null_at+1);
c8028aa6
TC
1925 return FALSE;
1926 }
1927 }
1928
1929 return TRUE;
1930}
1931
1932/*
7cb3f959
TC
1933
1934Return true if the supplied filename has a newline character
fa6c7d00 1935immediately before the first (hopefully only) NUL.
7cb3f959
TC
1936
1937My original look at this incorrectly used the len from SvPV(), but
1938that's incorrect, since we allow for a NUL in pv[len-1].
1939
1940So instead, strlen() and work from there.
1941
1942This allow for the user reading a filename, forgetting to chomp it,
1943then calling:
1944
1945 open my $foo, "$file\0";
1946
1947*/
1948
1949#ifdef PERL_CORE
1950
1951PERL_STATIC_INLINE bool
ffd62fc2
KW
1952S_should_warn_nl(const char *pv)
1953{
7cb3f959
TC
1954 STRLEN len;
1955
1956 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1957
1958 len = strlen(pv);
1959
1960 return len > 0 && pv[len-1] == '\n';
1961}
1962
1963#endif
1964
3a019afd
KW
1965#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
1966
1967PERL_STATIC_INLINE bool
1968S_lossless_NV_to_IV(const NV nv, IV *ivp)
1969{
1970 /* This function determines if the input NV 'nv' may be converted without
1971 * loss of data to an IV. If not, it returns FALSE taking no other action.
1972 * But if it is possible, it does the conversion, returning TRUE, and
1973 * storing the converted result in '*ivp' */
1974
1975 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
1976
1977# if defined(Perl_isnan)
1978
1979 if (UNLIKELY(Perl_isnan(nv))) {
1980 return FALSE;
1981 }
1982
1983# endif
1984
1985 if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) {
1986 return FALSE;
1987 }
1988
1989 if ((IV) nv != nv) {
1990 return FALSE;
1991 }
1992
1993 *ivp = (IV) nv;
1994 return TRUE;
1995}
1996
1997#endif
1998
81d52ecd
JH
1999/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2000
94b0cb42
KW
2001#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2002
81d52ecd
JH
2003#define MAX_CHARSET_NAME_LENGTH 2
2004
2005PERL_STATIC_INLINE const char *
94b0cb42 2006S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
81d52ecd 2007{
94b0cb42
KW
2008 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2009
81d52ecd
JH
2010 /* Returns a string that corresponds to the name of the regex character set
2011 * given by 'flags', and *lenp is set the length of that string, which
2012 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2013
2014 *lenp = 1;
2015 switch (get_regex_charset(flags)) {
2016 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2017 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2018 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2019 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2020 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2021 *lenp = 2;
2022 return ASCII_MORE_RESTRICT_PAT_MODS;
2023 }
2024 /* The NOT_REACHED; hides an assert() which has a rather complex
2025 * definition in perl.h. */
2026 NOT_REACHED; /* NOTREACHED */
2027 return "?"; /* Unknown */
2028}
2029
94b0cb42
KW
2030#endif
2031
7cb3f959 2032/*
ed382232
TC
2033
2034Return false if any get magic is on the SV other than taint magic.
2035
2036*/
2037
2038PERL_STATIC_INLINE bool
ffd62fc2
KW
2039Perl_sv_only_taint_gmagic(SV *sv)
2040{
ed382232
TC
2041 MAGIC *mg = SvMAGIC(sv);
2042
2043 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2044
2045 while (mg) {
2046 if (mg->mg_type != PERL_MAGIC_taint
2047 && !(mg->mg_flags & MGf_GSKIP)
2048 && mg->mg_virtual->svt_get) {
2049 return FALSE;
2050 }
2051 mg = mg->mg_moremagic;
2052 }
2053
2054 return TRUE;
2055}
2056
ed8ff0f3
DM
2057/* ------------------ cop.h ------------------------------------------- */
2058
5b6f7443
DM
2059/* implement GIMME_V() macro */
2060
2061PERL_STATIC_INLINE U8
2062Perl_gimme_V(pTHX)
2063{
2064 I32 cxix;
2065 U8 gimme = (PL_op->op_flags & OPf_WANT);
2066
2067 if (gimme)
2068 return gimme;
2069 cxix = PL_curstackinfo->si_cxsubix;
2070 if (cxix < 0)
2071 return G_VOID;
2072 assert(cxstack[cxix].blk_gimme & G_WANT);
2073 return (cxstack[cxix].blk_gimme & G_WANT);
2074}
2075
ed8ff0f3
DM
2076
2077/* Enter a block. Push a new base context and return its address. */
2078
2079PERL_STATIC_INLINE PERL_CONTEXT *
c9182d9c 2080Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
ed8ff0f3
DM
2081{
2082 PERL_CONTEXT * cx;
2083
2084 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2085
2086 CXINC;
2087 cx = CX_CUR();
2088 cx->cx_type = type;
2089 cx->blk_gimme = gimme;
2090 cx->blk_oldsaveix = saveix;
4caf7d8c 2091 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 2092 cx->blk_oldcop = PL_curcop;
4caf7d8c 2093 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
2094 cx->blk_oldscopesp = PL_scopestack_ix;
2095 cx->blk_oldpm = PL_curpm;
ce8bb8d8 2096 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
2097
2098 PL_tmps_floor = PL_tmps_ix;
2099 CX_DEBUG(cx, "PUSH");
2100 return cx;
2101}
2102
2103
2104/* Exit a block (RETURN and LAST). */
2105
2106PERL_STATIC_INLINE void
c9182d9c 2107Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2108{
2109 PERL_ARGS_ASSERT_CX_POPBLOCK;
2110
2111 CX_DEBUG(cx, "POP");
2112 /* these 3 are common to cx_popblock and cx_topblock */
2113 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2114 PL_scopestack_ix = cx->blk_oldscopesp;
2115 PL_curpm = cx->blk_oldpm;
2116
2117 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2118 * and leaves a CX entry lying around for repeated use, so
2119 * skip for multicall */ \
2120 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2121 || PL_savestack_ix == cx->blk_oldsaveix);
2122 PL_curcop = cx->blk_oldcop;
ce8bb8d8 2123 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
2124}
2125
2126/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2127 * Whereas cx_popblock() restores the state to the point just before
2128 * cx_pushblock() was called, cx_topblock() restores it to the point just
2129 * *after* cx_pushblock() was called. */
2130
2131PERL_STATIC_INLINE void
c9182d9c 2132Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2133{
2134 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2135
2136 CX_DEBUG(cx, "TOP");
2137 /* these 3 are common to cx_popblock and cx_topblock */
2138 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2139 PL_scopestack_ix = cx->blk_oldscopesp;
2140 PL_curpm = cx->blk_oldpm;
2141
2142 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2143}
2144
2145
a73d8813 2146PERL_STATIC_INLINE void
c9182d9c 2147Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
a73d8813
DM
2148{
2149 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2150
2151 PERL_ARGS_ASSERT_CX_PUSHSUB;
2152
3f6bd23a 2153 PERL_DTRACE_PROBE_ENTRY(cv);
5b6f7443
DM
2154 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2155 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
a73d8813
DM
2156 cx->blk_sub.cv = cv;
2157 cx->blk_sub.olddepth = CvDEPTH(cv);
2158 cx->blk_sub.prevcomppad = PL_comppad;
2159 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2160 cx->blk_sub.retop = retop;
2161 SvREFCNT_inc_simple_void_NN(cv);
2162 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2163}
2164
2165
2166/* subsets of cx_popsub() */
2167
2168PERL_STATIC_INLINE void
c9182d9c 2169Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2170{
2171 CV *cv;
2172
2173 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2174 assert(CxTYPE(cx) == CXt_SUB);
2175
2176 PL_comppad = cx->blk_sub.prevcomppad;
2177 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2178 cv = cx->blk_sub.cv;
2179 CvDEPTH(cv) = cx->blk_sub.olddepth;
2180 cx->blk_sub.cv = NULL;
2181 SvREFCNT_dec(cv);
5b6f7443 2182 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
a73d8813
DM
2183}
2184
2185
2186/* handle the @_ part of leaving a sub */
2187
2188PERL_STATIC_INLINE void
c9182d9c 2189Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2190{
2191 AV *av;
2192
2193 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2194 assert(CxTYPE(cx) == CXt_SUB);
2195 assert(AvARRAY(MUTABLE_AV(
2196 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2197 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2198
2199 CX_POP_SAVEARRAY(cx);
2200 av = MUTABLE_AV(PAD_SVl(0));
2201 if (UNLIKELY(AvREAL(av)))
2202 /* abandon @_ if it got reified */
2203 clear_defarray(av, 0);
2204 else {
2205 CLEAR_ARGARRAY(av);
2206 }
2207}
2208
2209
2210PERL_STATIC_INLINE void
c9182d9c 2211Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2212{
2213 PERL_ARGS_ASSERT_CX_POPSUB;
2214 assert(CxTYPE(cx) == CXt_SUB);
2215
3f6bd23a 2216 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
2217
2218 if (CxHASARGS(cx))
2219 cx_popsub_args(cx);
2220 cx_popsub_common(cx);
2221}
2222
2223
6a7d52cc 2224PERL_STATIC_INLINE void
c9182d9c 2225Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
6a7d52cc
DM
2226{
2227 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2228
5b6f7443
DM
2229 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2230 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
6a7d52cc
DM
2231 cx->blk_format.cv = cv;
2232 cx->blk_format.retop = retop;
2233 cx->blk_format.gv = gv;
2234 cx->blk_format.dfoutgv = PL_defoutgv;
2235 cx->blk_format.prevcomppad = PL_comppad;
2236 cx->blk_u16 = 0;
2237
2238 SvREFCNT_inc_simple_void_NN(cv);
2239 CvDEPTH(cv)++;
2240 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2241}
2242
2243
2244PERL_STATIC_INLINE void
c9182d9c 2245Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
6a7d52cc
DM
2246{
2247 CV *cv;
2248 GV *dfout;
2249
2250 PERL_ARGS_ASSERT_CX_POPFORMAT;
2251 assert(CxTYPE(cx) == CXt_FORMAT);
2252
2253 dfout = cx->blk_format.dfoutgv;
2254 setdefout(dfout);
2255 cx->blk_format.dfoutgv = NULL;
2256 SvREFCNT_dec_NN(dfout);
2257
2258 PL_comppad = cx->blk_format.prevcomppad;
2259 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2260 cv = cx->blk_format.cv;
2261 cx->blk_format.cv = NULL;
2262 --CvDEPTH(cv);
2263 SvREFCNT_dec_NN(cv);
5b6f7443 2264 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
6a7d52cc
DM
2265}
2266
2267
13febba5 2268PERL_STATIC_INLINE void
c9182d9c 2269Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
13febba5
DM
2270{
2271 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2272
5b6f7443
DM
2273 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2274 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
13febba5
DM
2275 cx->blk_eval.retop = retop;
2276 cx->blk_eval.old_namesv = namesv;
2277 cx->blk_eval.old_eval_root = PL_eval_root;
2278 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2279 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2280 cx->blk_eval.cur_top_env = PL_top_env;
2281
4c57ced5 2282 assert(!(PL_in_eval & ~ 0x3F));
13febba5 2283 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 2284 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
2285}
2286
2287
2288PERL_STATIC_INLINE void
c9182d9c 2289Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
13febba5
DM
2290{
2291 SV *sv;
2292
2293 PERL_ARGS_ASSERT_CX_POPEVAL;
2294 assert(CxTYPE(cx) == CXt_EVAL);
2295
2296 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 2297 assert(!(PL_in_eval & 0xc0));
13febba5
DM
2298 PL_eval_root = cx->blk_eval.old_eval_root;
2299 sv = cx->blk_eval.cur_text;
4c57ced5 2300 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
2301 cx->blk_eval.cur_text = NULL;
2302 SvREFCNT_dec_NN(sv);
2303 }
2304
2305 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
2306 if (sv) {
2307 cx->blk_eval.old_namesv = NULL;
2308 SvREFCNT_dec_NN(sv);
2309 }
5b6f7443 2310 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
13febba5 2311}
6a7d52cc 2312
a73d8813 2313
d1b6bf72
DM
2314/* push a plain loop, i.e.
2315 * { block }
2316 * while (cond) { block }
2317 * for (init;cond;continue) { block }
2318 * This loop can be last/redo'ed etc.
2319 */
2320
2321PERL_STATIC_INLINE void
c9182d9c 2322Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2323{
2324 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2325 cx->blk_loop.my_op = cLOOP;
2326}
2327
2328
2329/* push a true for loop, i.e.
2330 * for var (list) { block }
2331 */
2332
2333PERL_STATIC_INLINE void
c9182d9c 2334Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
d1b6bf72
DM
2335{
2336 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2337
2338 /* this one line is common with cx_pushloop_plain */
2339 cx->blk_loop.my_op = cLOOP;
2340
2341 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2342 cx->blk_loop.itersave = itersave;
2343#ifdef USE_ITHREADS
2344 cx->blk_loop.oldcomppad = PL_comppad;
2345#endif
2346}
2347
2348
2349/* pop all loop types, including plain */
2350
2351PERL_STATIC_INLINE void
c9182d9c 2352Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2353{
2354 PERL_ARGS_ASSERT_CX_POPLOOP;
2355
2356 assert(CxTYPE_is_LOOP(cx));
2357 if ( CxTYPE(cx) == CXt_LOOP_ARY
2358 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2359 {
2360 /* Free ary or cur. This assumes that state_u.ary.ary
2361 * aligns with state_u.lazysv.cur. See cx_dup() */
2362 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2363 cx->blk_loop.state_u.lazysv.cur = NULL;
2364 SvREFCNT_dec_NN(sv);
2365 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2366 sv = cx->blk_loop.state_u.lazysv.end;
2367 cx->blk_loop.state_u.lazysv.end = NULL;
2368 SvREFCNT_dec_NN(sv);
2369 }
2370 }
2371 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2372 SV *cursv;
2373 SV **svp = (cx)->blk_loop.itervar_u.svp;
2374 if ((cx->cx_type & CXp_FOR_GV))
2375 svp = &GvSV((GV*)svp);
2376 cursv = *svp;
2377 *svp = cx->blk_loop.itersave;
2378 cx->blk_loop.itersave = NULL;
2379 SvREFCNT_dec(cursv);
2380 }
2381}
2382
2a7b7c61
DM
2383
2384PERL_STATIC_INLINE void
c9182d9c 2385Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2386{
7896dde7 2387 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2a7b7c61 2388
7896dde7 2389 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2a7b7c61
DM
2390}
2391
2392
2393PERL_STATIC_INLINE void
c9182d9c 2394Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2395{
7896dde7
Z
2396 PERL_ARGS_ASSERT_CX_POPWHEN;
2397 assert(CxTYPE(cx) == CXt_WHEN);
2a7b7c61
DM
2398
2399 PERL_UNUSED_ARG(cx);
59a14f30 2400 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
2401 /* currently NOOP */
2402}
2403
2404
7896dde7 2405PERL_STATIC_INLINE void
c9182d9c 2406Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
7896dde7
Z
2407{
2408 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2409
2410 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2411 cx->blk_givwhen.defsv_save = orig_defsv;
2412}
2413
2414
2415PERL_STATIC_INLINE void
c9182d9c 2416Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
7896dde7
Z
2417{
2418 SV *sv;
2419
2420 PERL_ARGS_ASSERT_CX_POPGIVEN;
2421 assert(CxTYPE(cx) == CXt_GIVEN);
2422
2423 sv = GvSV(PL_defgv);
2424 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2425 cx->blk_givwhen.defsv_save = NULL;
2426 SvREFCNT_dec(sv);
2427}
2428
ec2c235b
KW
2429/* ------------------ util.h ------------------------------------------- */
2430
2431/*
2432=head1 Miscellaneous Functions
2433
2434=for apidoc foldEQ
2435
2436Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2437same
2438case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
2439match themselves and their opposite case counterparts. Non-cased and non-ASCII
2440range bytes match only themselves.
2441
2442=cut
2443*/
2444
2445PERL_STATIC_INLINE I32
2446Perl_foldEQ(const char *s1, const char *s2, I32 len)
2447{
2448 const U8 *a = (const U8 *)s1;
2449 const U8 *b = (const U8 *)s2;
2450
2451 PERL_ARGS_ASSERT_FOLDEQ;
2452
2453 assert(len >= 0);
2454
2455 while (len--) {
2456 if (*a != *b && *a != PL_fold[*b])
2457 return 0;
2458 a++,b++;
2459 }
2460 return 1;
2461}
2462
0f9cb40c 2463PERL_STATIC_INLINE I32
ec2c235b
KW
2464Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2465{
79a1fabd
KW
2466 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
2467 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2468 * does not check for this. Nor does it check that the strings each have
2469 * at least 'len' characters. */
ec2c235b
KW
2470
2471 const U8 *a = (const U8 *)s1;
2472 const U8 *b = (const U8 *)s2;
2473
2474 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2475
2476 assert(len >= 0);
2477
2478 while (len--) {
2479 if (*a != *b && *a != PL_fold_latin1[*b]) {
2480 return 0;
2481 }
2482 a++, b++;
2483 }
2484 return 1;
2485}
2486
2487/*
2488=for apidoc foldEQ_locale
2489
2490Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2491same case-insensitively in the current locale; false otherwise.
2492
2493=cut
2494*/
2495
0f9cb40c 2496PERL_STATIC_INLINE I32
ec2c235b
KW
2497Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2498{
2499 dVAR;
2500 const U8 *a = (const U8 *)s1;
2501 const U8 *b = (const U8 *)s2;
2502
2503 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2504
2505 assert(len >= 0);
2506
2507 while (len--) {
2508 if (*a != *b && *a != PL_fold_locale[*b])
2509 return 0;
2510 a++,b++;
2511 }
2512 return 1;
2513}
2514
6dba01e2
KW
2515#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2516
2517PERL_STATIC_INLINE void *
2518S_my_memrchr(const char * s, const char c, const STRLEN len)
2519{
2520 /* memrchr(), since many platforms lack it */
2521
2522 const char * t = s + len - 1;
2523
2524 PERL_ARGS_ASSERT_MY_MEMRCHR;
2525
2526 while (t >= s) {
2527 if (*t == c) {
2528 return (void *) t;
2529 }
2530 t--;
2531 }
2532
2533 return NULL;
2534}
2535
2536#endif
2537
ed382232 2538/*
c8028aa6
TC
2539 * ex: set ts=8 sts=4 sw=4 et:
2540 */