This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
sv_utf8_decode: Reverse order of tests for speed
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 * This file is a home for static inline functions that cannot go in other
9 * headers files, because they depend on proto.h (included after most other
10 * headers) or struct definitions.
11 *
12 * Each section names the header file that the functions "belong" to.
13 */
27669aa4 14
be3a7a5d
KW
15/* ------------------------------- av.h ------------------------------- */
16
c70927a6 17PERL_STATIC_INLINE SSize_t
be3a7a5d
KW
18S_av_top_index(pTHX_ AV *av)
19{
20 PERL_ARGS_ASSERT_AV_TOP_INDEX;
21 assert(SvTYPE(av) == SVt_PVAV);
22
23 return AvFILL(av);
24}
25
1afe1db1
FC
26/* ------------------------------- cv.h ------------------------------- */
27
ae77754a
FC
28PERL_STATIC_INLINE GV *
29S_CvGV(pTHX_ CV *sv)
30{
31 return CvNAMED(sv)
32 ? Perl_cvgv_from_hek(aTHX_ sv)
33 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
34}
35
1afe1db1
FC
36PERL_STATIC_INLINE I32 *
37S_CvDEPTHp(const CV * const sv)
38{
39 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
8de47657 40 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
41}
42
d16269d8
PM
43/*
44 CvPROTO returns the prototype as stored, which is not necessarily what
45 the interpreter should be using. Specifically, the interpreter assumes
46 that spaces have been stripped, which has been the case if the prototype
47 was added by toke.c, but is generally not the case if it was added elsewhere.
48 Since we can't enforce the spacelessness at assignment time, this routine
49 provides a temporary copy at parse time with spaces removed.
50 I<orig> is the start of the original buffer, I<len> is the length of the
51 prototype and will be updated when this returns.
52 */
53
5b67adb8 54#ifdef PERL_CORE
d16269d8
PM
55PERL_STATIC_INLINE char *
56S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
57{
58 SV * tmpsv;
59 char * tmps;
60 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
61 tmps = SvPVX(tmpsv);
62 while ((*len)--) {
63 if (!isSPACE(*orig))
64 *tmps++ = *orig;
65 orig++;
66 }
67 *tmps = '\0';
68 *len = tmps - SvPVX(tmpsv);
69 return SvPVX(tmpsv);
70}
5b67adb8 71#endif
d16269d8 72
25fdce4a
FC
73/* ------------------------------- mg.h ------------------------------- */
74
75#if defined(PERL_CORE) || defined(PERL_EXT)
76/* assumes get-magic and stringification have already occurred */
77PERL_STATIC_INLINE STRLEN
78S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
79{
80 assert(mg->mg_type == PERL_MAGIC_regex_global);
81 assert(mg->mg_len != -1);
82 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
83 return (STRLEN)mg->mg_len;
84 else {
85 const STRLEN pos = (STRLEN)mg->mg_len;
86 /* Without this check, we may read past the end of the buffer: */
87 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
88 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
89 }
90}
91#endif
92
03414f05
FC
93/* ------------------------------- pad.h ------------------------------ */
94
95#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
96PERL_STATIC_INLINE bool
97PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
98{
99 /* is seq within the range _LOW to _HIGH ?
100 * This is complicated by the fact that PL_cop_seqmax
101 * may have wrapped around at some point */
102 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
103 return FALSE; /* not yet introduced */
104
105 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
106 /* in compiling scope */
107 if (
108 (seq > COP_SEQ_RANGE_LOW(pn))
109 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
110 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
111 )
112 return TRUE;
113 }
114 else if (
115 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
116 ?
117 ( seq > COP_SEQ_RANGE_LOW(pn)
118 || seq <= COP_SEQ_RANGE_HIGH(pn))
119
120 : ( seq > COP_SEQ_RANGE_LOW(pn)
121 && seq <= COP_SEQ_RANGE_HIGH(pn))
122 )
123 return TRUE;
124 return FALSE;
125}
126#endif
127
33a4312b
FC
128/* ------------------------------- pp.h ------------------------------- */
129
130PERL_STATIC_INLINE I32
131S_TOPMARK(pTHX)
132{
133 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 134 "MARK top %p %" IVdf "\n",
33a4312b
FC
135 PL_markstack_ptr,
136 (IV)*PL_markstack_ptr)));
137 return *PL_markstack_ptr;
138}
139
140PERL_STATIC_INLINE I32
141S_POPMARK(pTHX)
142{
143 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 144 "MARK pop %p %" IVdf "\n",
33a4312b
FC
145 (PL_markstack_ptr-1),
146 (IV)*(PL_markstack_ptr-1))));
147 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
148 return *PL_markstack_ptr--;
149}
150
8d919b0a
FC
151/* ----------------------------- regexp.h ----------------------------- */
152
153PERL_STATIC_INLINE struct regexp *
154S_ReANY(const REGEXP * const re)
155{
df6b4bd5 156 XPV* const p = (XPV*)SvANY(re);
8d919b0a 157 assert(isREGEXP(re));
df6b4bd5
DM
158 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
159 : (struct regexp *)p;
8d919b0a
FC
160}
161
27669aa4
FC
162/* ------------------------------- sv.h ------------------------------- */
163
164PERL_STATIC_INLINE SV *
165S_SvREFCNT_inc(SV *sv)
166{
2439e033 167 if (LIKELY(sv != NULL))
27669aa4
FC
168 SvREFCNT(sv)++;
169 return sv;
170}
171PERL_STATIC_INLINE SV *
172S_SvREFCNT_inc_NN(SV *sv)
173{
174 SvREFCNT(sv)++;
175 return sv;
176}
177PERL_STATIC_INLINE void
178S_SvREFCNT_inc_void(SV *sv)
179{
2439e033 180 if (LIKELY(sv != NULL))
27669aa4
FC
181 SvREFCNT(sv)++;
182}
75e16a44
FC
183PERL_STATIC_INLINE void
184S_SvREFCNT_dec(pTHX_ SV *sv)
185{
2439e033 186 if (LIKELY(sv != NULL)) {
75a9bf96 187 U32 rc = SvREFCNT(sv);
79e2a32a 188 if (LIKELY(rc > 1))
75a9bf96
DM
189 SvREFCNT(sv) = rc - 1;
190 else
191 Perl_sv_free2(aTHX_ sv, rc);
75e16a44
FC
192 }
193}
541377b1
FC
194
195PERL_STATIC_INLINE void
4a9a56a7
DM
196S_SvREFCNT_dec_NN(pTHX_ SV *sv)
197{
198 U32 rc = SvREFCNT(sv);
79e2a32a 199 if (LIKELY(rc > 1))
4a9a56a7
DM
200 SvREFCNT(sv) = rc - 1;
201 else
202 Perl_sv_free2(aTHX_ sv, rc);
203}
204
205PERL_STATIC_INLINE void
541377b1
FC
206SvAMAGIC_on(SV *sv)
207{
208 assert(SvROK(sv));
209 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
210}
211PERL_STATIC_INLINE void
212SvAMAGIC_off(SV *sv)
213{
214 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
215 HvAMAGIC_off(SvSTASH(SvRV(sv)));
216}
217
218PERL_STATIC_INLINE U32
541377b1
FC
219S_SvPADSTALE_on(SV *sv)
220{
c0683843 221 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
222 return SvFLAGS(sv) |= SVs_PADSTALE;
223}
224PERL_STATIC_INLINE U32
225S_SvPADSTALE_off(SV *sv)
226{
c0683843 227 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
228 return SvFLAGS(sv) &= ~SVs_PADSTALE;
229}
25fdce4a 230#if defined(PERL_CORE) || defined (PERL_EXT)
4ddea69a 231PERL_STATIC_INLINE STRLEN
6964422a 232S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
4ddea69a 233{
25fdce4a 234 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
4ddea69a
FC
235 if (SvGAMAGIC(sv)) {
236 U8 *hopped = utf8_hop((U8 *)pv, pos);
237 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
238 return (STRLEN)(hopped - (U8 *)pv);
239 }
240 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
241}
242#endif
f019c49e 243
d1decf2b
TC
244/* ------------------------------- handy.h ------------------------------- */
245
246/* saves machine code for a common noreturn idiom typically used in Newx*() */
c1d6452f 247#ifdef GCC_DIAG_PRAGMA
6ab56f1e 248GCC_DIAG_IGNORE(-Wunused-function) /* Intentionally left semicolonless. */
c1d6452f 249#endif
d1decf2b
TC
250static void
251S_croak_memory_wrap(void)
252{
253 Perl_croak_nocontext("%s",PL_memory_wrap);
254}
c1d6452f 255#ifdef GCC_DIAG_PRAGMA
6ab56f1e 256GCC_DIAG_RESTORE /* Intentionally left semicolonless. */
c1d6452f 257#endif
d1decf2b 258
a8a2ceaa
KW
259/* ------------------------------- utf8.h ------------------------------- */
260
2fe720e2
KW
261/*
262=head1 Unicode Support
263*/
264
55d09dc8
KW
265PERL_STATIC_INLINE void
266S_append_utf8_from_native_byte(const U8 byte, U8** dest)
267{
268 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
269 * encoded string at '*dest', updating '*dest' to include it */
270
55d09dc8
KW
271 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
272
6f2d5cbc 273 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 274 *((*dest)++) = byte;
55d09dc8 275 else {
a09ec51a
KW
276 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
277 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
278 }
279}
280
e123187a 281/*
2fe720e2 282=for apidoc valid_utf8_to_uvchr
2717076a 283Like C<L</utf8_to_uvchr_buf>>, but should only be called when it is known that
2fe720e2
KW
284the next character in the input UTF-8 string C<s> is well-formed (I<e.g.>,
285it passes C<L</isUTF8_CHAR>>. Surrogates, non-character code points, and
286non-Unicode code points are allowed.
287
288=cut
289
290 */
291
292PERL_STATIC_INLINE UV
293Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
294{
c41b2540 295 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
296 const U8* send = s + expectlen;
297 UV uv = *s;
298
299 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
300
301 if (retlen) {
302 *retlen = expectlen;
303 }
304
305 /* An invariant is trivially returned */
306 if (expectlen == 1) {
307 return uv;
308 }
309
310 /* Remove the leading bits that indicate the number of bytes, leaving just
311 * the bits that are part of the value */
312 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
313
314 /* Now, loop through the remaining bytes, accumulating each into the
315 * working total as we go. (I khw tried unrolling the loop for up to 4
316 * bytes, but there was no performance improvement) */
317 for (++s; s < send; s++) {
318 uv = UTF8_ACCUMULATE(uv, *s);
319 }
320
321 return UNI_TO_NATIVE(uv);
322
323}
324
1e599354
KW
325/*
326=for apidoc is_utf8_invariant_string
327
82c5d941 328Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 329regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
330EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
331are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
332the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
333characters are invariant, but so also are the C1 controls.
1e599354
KW
334
335If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
336use this option, that C<s> can't have embedded C<NUL> characters and has to
337have a terminating C<NUL> byte).
338
9f2abfde
KW
339See also
340C<L</is_utf8_string>>,
341C<L</is_utf8_string_flags>>,
342C<L</is_utf8_string_loc>>,
343C<L</is_utf8_string_loc_flags>>,
344C<L</is_utf8_string_loclen>>,
345C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
346C<L</is_utf8_fixed_width_buf_flags>>,
347C<L</is_utf8_fixed_width_buf_loc_flags>>,
348C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
349C<L</is_strict_utf8_string>>,
350C<L</is_strict_utf8_string_loc>>,
351C<L</is_strict_utf8_string_loclen>>,
352C<L</is_c9strict_utf8_string>>,
353C<L</is_c9strict_utf8_string_loc>>,
354and
355C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
356
357=cut
0cbf5865
KW
358
359*/
360
361#define is_utf8_invariant_string(s, len) \
362 is_utf8_invariant_string_loc(s, len, NULL)
363
364/*
365=for apidoc is_utf8_invariant_string_loc
366
367Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
368the first UTF-8 variant character in the C<ep> pointer; if all characters are
369UTF-8 invariant, this function does not change the contents of C<*ep>.
370
371=cut
372
1e599354
KW
373*/
374
375PERL_STATIC_INLINE bool
e17544a6 376S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 377{
e17544a6 378 const U8* send;
1e599354
KW
379 const U8* x = s;
380
0cbf5865
KW
381 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
382
e17544a6
KW
383 if (len == 0) {
384 len = strlen((const char *)s);
385 }
386
387 send = s + len;
388
389#ifndef EBCDIC
390 /* Try to get the widest word on this platform */
391# ifdef HAS_LONG_LONG
392# define PERL_WORDCAST unsigned long long
393# define PERL_WORDSIZE LONGLONGSIZE
394# else
395# define PERL_WORDCAST UV
396# define PERL_WORDSIZE UVSIZE
397# endif
398
399# if PERL_WORDSIZE == 4
400# define PERL_VARIANTS_WORD_MASK 0x80808080
401# define PERL_WORD_BOUNDARY_MASK 0x3
402# elif PERL_WORDSIZE == 8
403# define PERL_VARIANTS_WORD_MASK 0x8080808080808080
404# define PERL_WORD_BOUNDARY_MASK 0x7
405# else
406# error Unexpected word size
407# endif
408
409 /* Process per-byte until reach word boundary. XXX This loop could be
410 * eliminated if we knew that this platform had fast unaligned reads */
411 while (x < send && (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) {
412 if (! UTF8_IS_INVARIANT(*x)) {
413 if (ep) {
414 *ep = x;
415 }
416
417 return FALSE;
418 }
419 x++;
420 }
421
422 /* Process per-word as long as we have at least a full word left */
423 while (x + PERL_WORDSIZE <= send) {
424 if ((* (PERL_WORDCAST *) x) & PERL_VARIANTS_WORD_MASK) {
425
426 /* Found a variant. Just return if caller doesn't want its exact
427 * position */
428 if (! ep) {
429 return FALSE;
430 }
431
432 /* Otherwise fall into final loop to find which byte it is */
433 break;
0cbf5865 434 }
e17544a6
KW
435 x += PERL_WORDSIZE;
436 }
437
438# undef PERL_WORDCAST
439# undef PERL_WORDSIZE
440# undef PERL_WORD_BOUNDARY_MASK
441# undef PERL_VARIANTS_WORD_MASK
442#endif
443
444 /* Process per-byte */
445 while (x < send) {
446 if (! UTF8_IS_INVARIANT(*x)) {
447 if (ep) {
448 *ep = x;
449 }
0cbf5865 450
e17544a6 451 return FALSE;
0cbf5865 452 }
1e599354 453
e17544a6 454 x++;
1e599354
KW
455 }
456
457 return TRUE;
458}
459
7c93d8f0 460/*
5ff889fb
KW
461=for apidoc is_utf8_string
462
82c5d941
KW
463Returns TRUE if the first C<len> bytes of string C<s> form a valid
464Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
465be calculated using C<strlen(s)> (which means if you use this option, that C<s>
466can't have embedded C<NUL> characters and has to have a terminating C<NUL>
467byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
468
2717076a
KW
469This function considers Perl's extended UTF-8 to be valid. That means that
470code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
471considered valid by this function. Use C<L</is_strict_utf8_string>>,
472C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
473code points are considered valid.
5ff889fb 474
9f2abfde
KW
475See also
476C<L</is_utf8_invariant_string>>,
0cbf5865 477C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
478C<L</is_utf8_string_loc>>,
479C<L</is_utf8_string_loclen>>,
8bc127bf
KW
480C<L</is_utf8_fixed_width_buf_flags>>,
481C<L</is_utf8_fixed_width_buf_loc_flags>>,
482C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
483
484=cut
485*/
486
56e4cf64 487PERL_STATIC_INLINE bool
c41b2540 488Perl_is_utf8_string(const U8 *s, const STRLEN len)
5ff889fb 489{
35936d22
KW
490 /* This is now marked pure in embed.fnc, because isUTF8_CHAR now is pure.
491 * Be aware of possible changes to that */
492
5ff889fb
KW
493 const U8* const send = s + (len ? len : strlen((const char *)s));
494 const U8* x = s;
495
496 PERL_ARGS_ASSERT_IS_UTF8_STRING;
497
498 while (x < send) {
c41b2540
KW
499 const STRLEN cur_len = isUTF8_CHAR(x, send);
500 if (UNLIKELY(! cur_len)) {
5ff889fb
KW
501 return FALSE;
502 }
c41b2540 503 x += cur_len;
5ff889fb
KW
504 }
505
506 return TRUE;
507}
508
509/*
9f2abfde
KW
510=for apidoc is_strict_utf8_string
511
512Returns TRUE if the first C<len> bytes of string C<s> form a valid
513UTF-8-encoded string that is fully interchangeable by any application using
514Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
515calculated using C<strlen(s)> (which means if you use this option, that C<s>
516can't have embedded C<NUL> characters and has to have a terminating C<NUL>
517byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
518
519This function returns FALSE for strings containing any
520code points above the Unicode max of 0x10FFFF, surrogate code points, or
521non-character code points.
522
523See also
524C<L</is_utf8_invariant_string>>,
0cbf5865 525C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
526C<L</is_utf8_string>>,
527C<L</is_utf8_string_flags>>,
528C<L</is_utf8_string_loc>>,
529C<L</is_utf8_string_loc_flags>>,
530C<L</is_utf8_string_loclen>>,
531C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
532C<L</is_utf8_fixed_width_buf_flags>>,
533C<L</is_utf8_fixed_width_buf_loc_flags>>,
534C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
535C<L</is_strict_utf8_string_loc>>,
536C<L</is_strict_utf8_string_loclen>>,
537C<L</is_c9strict_utf8_string>>,
538C<L</is_c9strict_utf8_string_loc>>,
539and
540C<L</is_c9strict_utf8_string_loclen>>.
541
542=cut
543*/
544
545PERL_STATIC_INLINE bool
546S_is_strict_utf8_string(const U8 *s, const STRLEN len)
547{
548 const U8* const send = s + (len ? len : strlen((const char *)s));
549 const U8* x = s;
550
551 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING;
552
553 while (x < send) {
554 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
555 if (UNLIKELY(! cur_len)) {
556 return FALSE;
557 }
558 x += cur_len;
559 }
560
561 return TRUE;
562}
563
564/*
565=for apidoc is_c9strict_utf8_string
566
567Returns TRUE if the first C<len> bytes of string C<s> form a valid
568UTF-8-encoded string that conforms to
569L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
570otherwise it returns FALSE. If C<len> is 0, it will be calculated using
571C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
572C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
573characters being ASCII constitute 'a valid UTF-8 string'.
574
575This function returns FALSE for strings containing any code points above the
576Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
577code points per
578L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
579
580See also
581C<L</is_utf8_invariant_string>>,
0cbf5865 582C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
583C<L</is_utf8_string>>,
584C<L</is_utf8_string_flags>>,
585C<L</is_utf8_string_loc>>,
586C<L</is_utf8_string_loc_flags>>,
587C<L</is_utf8_string_loclen>>,
588C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
589C<L</is_utf8_fixed_width_buf_flags>>,
590C<L</is_utf8_fixed_width_buf_loc_flags>>,
591C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
592C<L</is_strict_utf8_string>>,
593C<L</is_strict_utf8_string_loc>>,
594C<L</is_strict_utf8_string_loclen>>,
595C<L</is_c9strict_utf8_string_loc>>,
596and
597C<L</is_c9strict_utf8_string_loclen>>.
598
599=cut
600*/
601
602PERL_STATIC_INLINE bool
603S_is_c9strict_utf8_string(const U8 *s, const STRLEN len)
604{
605 const U8* const send = s + (len ? len : strlen((const char *)s));
606 const U8* x = s;
607
608 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING;
609
610 while (x < send) {
611 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
612 if (UNLIKELY(! cur_len)) {
613 return FALSE;
614 }
615 x += cur_len;
616 }
617
618 return TRUE;
619}
620
621/* The above 3 functions could have been moved into the more general one just
622 * below, and made #defines that call it with the right 'flags'. They are
623 * currently kept separate to increase their chances of getting inlined */
624
625/*
626=for apidoc is_utf8_string_flags
627
628Returns TRUE if the first C<len> bytes of string C<s> form a valid
629UTF-8 string, subject to the restrictions imposed by C<flags>;
630returns FALSE otherwise. If C<len> is 0, it will be calculated
631using C<strlen(s)> (which means if you use this option, that C<s> can't have
632embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
633that all characters being ASCII constitute 'a valid UTF-8 string'.
634
635If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
636C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
637as C<L</is_strict_utf8_string>>; and if C<flags> is
638C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
639C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
640combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
641C<L</utf8n_to_uvchr>>, with the same meanings.
642
643See also
644C<L</is_utf8_invariant_string>>,
0cbf5865 645C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
646C<L</is_utf8_string>>,
647C<L</is_utf8_string_loc>>,
648C<L</is_utf8_string_loc_flags>>,
649C<L</is_utf8_string_loclen>>,
650C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
651C<L</is_utf8_fixed_width_buf_flags>>,
652C<L</is_utf8_fixed_width_buf_loc_flags>>,
653C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
654C<L</is_strict_utf8_string>>,
655C<L</is_strict_utf8_string_loc>>,
656C<L</is_strict_utf8_string_loclen>>,
657C<L</is_c9strict_utf8_string>>,
658C<L</is_c9strict_utf8_string_loc>>,
659and
660C<L</is_c9strict_utf8_string_loclen>>.
661
662=cut
663*/
664
665PERL_STATIC_INLINE bool
666S_is_utf8_string_flags(const U8 *s, const STRLEN len, const U32 flags)
667{
668 const U8* const send = s + (len ? len : strlen((const char *)s));
669 const U8* x = s;
670
671 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
672 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 673 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde
KW
674
675 if (flags == 0) {
676 return is_utf8_string(s, len);
677 }
678
d044b7a7 679 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
680 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
681 {
682 return is_strict_utf8_string(s, len);
683 }
684
d044b7a7 685 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
686 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
687 {
688 return is_c9strict_utf8_string(s, len);
689 }
690
691 while (x < send) {
692 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
693 if (UNLIKELY(! cur_len)) {
694 return FALSE;
695 }
696 x += cur_len;
697 }
698
699 return TRUE;
700}
701
702/*
5ff889fb
KW
703
704=for apidoc is_utf8_string_loc
705
2717076a 706Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 707case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 708"utf8ness success") in the C<ep> pointer.
5ff889fb 709
2717076a 710See also C<L</is_utf8_string_loclen>>.
5ff889fb 711
3964c812
KW
712=cut
713*/
714
715#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
716
717/*
718
5ff889fb
KW
719=for apidoc is_utf8_string_loclen
720
2717076a 721Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 722case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 723"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 724encoded characters in the C<el> pointer.
5ff889fb 725
2717076a 726See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
727
728=cut
729*/
730
56e4cf64 731PERL_STATIC_INLINE bool
c41b2540 732Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb
KW
733{
734 const U8* const send = s + (len ? len : strlen((const char *)s));
735 const U8* x = s;
736 STRLEN outlen = 0;
737
738 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
739
740 while (x < send) {
c41b2540
KW
741 const STRLEN cur_len = isUTF8_CHAR(x, send);
742 if (UNLIKELY(! cur_len)) {
5ff889fb
KW
743 break;
744 }
c41b2540 745 x += cur_len;
5ff889fb
KW
746 outlen++;
747 }
748
749 if (el)
750 *el = outlen;
751
752 if (ep) {
753 *ep = x;
754 }
755
756 return (x == send);
757}
758
759/*
9f2abfde
KW
760
761=for apidoc is_strict_utf8_string_loc
762
763Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
764case of "utf8ness failure") or the location C<s>+C<len> (in the case of
765"utf8ness success") in the C<ep> pointer.
766
767See also C<L</is_strict_utf8_string_loclen>>.
768
769=cut
770*/
771
772#define is_strict_utf8_string_loc(s, len, ep) \
773 is_strict_utf8_string_loclen(s, len, ep, 0)
774
775/*
776
777=for apidoc is_strict_utf8_string_loclen
778
779Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
780case of "utf8ness failure") or the location C<s>+C<len> (in the case of
781"utf8ness success") in the C<ep> pointer, and the number of UTF-8
782encoded characters in the C<el> pointer.
783
784See also C<L</is_strict_utf8_string_loc>>.
785
786=cut
787*/
788
789PERL_STATIC_INLINE bool
790S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
791{
792 const U8* const send = s + (len ? len : strlen((const char *)s));
793 const U8* x = s;
794 STRLEN outlen = 0;
795
796 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
797
798 while (x < send) {
799 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
800 if (UNLIKELY(! cur_len)) {
801 break;
802 }
803 x += cur_len;
804 outlen++;
805 }
806
807 if (el)
808 *el = outlen;
809
810 if (ep) {
811 *ep = x;
812 }
813
814 return (x == send);
815}
816
817/*
818
819=for apidoc is_c9strict_utf8_string_loc
820
821Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
822the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
823"utf8ness success") in the C<ep> pointer.
824
825See also C<L</is_c9strict_utf8_string_loclen>>.
826
827=cut
828*/
829
830#define is_c9strict_utf8_string_loc(s, len, ep) \
831 is_c9strict_utf8_string_loclen(s, len, ep, 0)
832
833/*
834
835=for apidoc is_c9strict_utf8_string_loclen
836
837Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
838the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
839"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
840characters in the C<el> pointer.
841
842See also C<L</is_c9strict_utf8_string_loc>>.
843
844=cut
845*/
846
847PERL_STATIC_INLINE bool
848S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
849{
850 const U8* const send = s + (len ? len : strlen((const char *)s));
851 const U8* x = s;
852 STRLEN outlen = 0;
853
854 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
855
856 while (x < send) {
857 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
858 if (UNLIKELY(! cur_len)) {
859 break;
860 }
861 x += cur_len;
862 outlen++;
863 }
864
865 if (el)
866 *el = outlen;
867
868 if (ep) {
869 *ep = x;
870 }
871
872 return (x == send);
873}
874
875/*
876
877=for apidoc is_utf8_string_loc_flags
878
879Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
880case of "utf8ness failure") or the location C<s>+C<len> (in the case of
881"utf8ness success") in the C<ep> pointer.
882
883See also C<L</is_utf8_string_loclen_flags>>.
884
885=cut
886*/
887
888#define is_utf8_string_loc_flags(s, len, ep, flags) \
889 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
890
891
892/* The above 3 actual functions could have been moved into the more general one
893 * just below, and made #defines that call it with the right 'flags'. They are
894 * currently kept separate to increase their chances of getting inlined */
895
896/*
897
898=for apidoc is_utf8_string_loclen_flags
899
900Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
901case of "utf8ness failure") or the location C<s>+C<len> (in the case of
902"utf8ness success") in the C<ep> pointer, and the number of UTF-8
903encoded characters in the C<el> pointer.
904
905See also C<L</is_utf8_string_loc_flags>>.
906
907=cut
908*/
909
910PERL_STATIC_INLINE bool
911S_is_utf8_string_loclen_flags(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
912{
913 const U8* const send = s + (len ? len : strlen((const char *)s));
914 const U8* x = s;
915 STRLEN outlen = 0;
916
917 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
918 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 919 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde
KW
920
921 if (flags == 0) {
922 return is_utf8_string_loclen(s, len, ep, el);
923 }
924
d044b7a7 925 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
926 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
927 {
928 return is_strict_utf8_string_loclen(s, len, ep, el);
929 }
930
d044b7a7 931 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
932 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
933 {
934 return is_c9strict_utf8_string_loclen(s, len, ep, el);
935 }
936
937 while (x < send) {
938 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
939 if (UNLIKELY(! cur_len)) {
940 break;
941 }
942 x += cur_len;
943 outlen++;
944 }
945
946 if (el)
947 *el = outlen;
948
949 if (ep) {
950 *ep = x;
951 }
952
953 return (x == send);
954}
955
956/*
7c93d8f0
KW
957=for apidoc utf8_distance
958
959Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
960and C<b>.
961
962WARNING: use only if you *know* that the pointers point inside the
963same UTF-8 buffer.
964
965=cut
966*/
967
968PERL_STATIC_INLINE IV
969Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
970{
971 PERL_ARGS_ASSERT_UTF8_DISTANCE;
972
973 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
974}
975
976/*
977=for apidoc utf8_hop
978
979Return the UTF-8 pointer C<s> displaced by C<off> characters, either
980forward or backward.
981
982WARNING: do not use the following unless you *know* C<off> is within
983the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
984on the first byte of character or just after the last byte of a character.
985
986=cut
987*/
988
989PERL_STATIC_INLINE U8 *
990Perl_utf8_hop(const U8 *s, SSize_t off)
991{
992 PERL_ARGS_ASSERT_UTF8_HOP;
993
994 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
995 * the bitops (especially ~) can create illegal UTF-8.
996 * In other words: in Perl UTF-8 is not just for Unicode. */
997
998 if (off >= 0) {
999 while (off--)
1000 s += UTF8SKIP(s);
1001 }
1002 else {
1003 while (off++) {
1004 s--;
1005 while (UTF8_IS_CONTINUATION(*s))
1006 s--;
1007 }
1008 }
de979548 1009 GCC_DIAG_IGNORE(-Wcast-qual);
7c93d8f0 1010 return (U8 *)s;
de979548 1011 GCC_DIAG_RESTORE;
7c93d8f0
KW
1012}
1013
4dab108f 1014/*
65df57a8
TC
1015=for apidoc utf8_hop_forward
1016
1017Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1018forward.
1019
1020C<off> must be non-negative.
1021
1022C<s> must be before or equal to C<end>.
1023
1024When moving forward it will not move beyond C<end>.
1025
1026Will not exceed this limit even if the string is not valid "UTF-8".
1027
1028=cut
1029*/
1030
1031PERL_STATIC_INLINE U8 *
1032Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1033{
1034 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1035
1036 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1037 * the bitops (especially ~) can create illegal UTF-8.
1038 * In other words: in Perl UTF-8 is not just for Unicode. */
1039
1040 assert(s <= end);
1041 assert(off >= 0);
1042
1043 while (off--) {
1044 STRLEN skip = UTF8SKIP(s);
de979548
P
1045 if ((STRLEN)(end - s) <= skip) {
1046 GCC_DIAG_IGNORE(-Wcast-qual);
65df57a8 1047 return (U8 *)end;
de979548
P
1048 GCC_DIAG_RESTORE;
1049 }
65df57a8
TC
1050 s += skip;
1051 }
1052
de979548 1053 GCC_DIAG_IGNORE(-Wcast-qual);
65df57a8 1054 return (U8 *)s;
de979548 1055 GCC_DIAG_RESTORE;
65df57a8
TC
1056}
1057
1058/*
1059=for apidoc utf8_hop_back
1060
1061Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1062backward.
1063
1064C<off> must be non-positive.
1065
1066C<s> must be after or equal to C<start>.
1067
1068When moving backward it will not move before C<start>.
1069
1070Will not exceed this limit even if the string is not valid "UTF-8".
1071
1072=cut
1073*/
1074
1075PERL_STATIC_INLINE U8 *
1076Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1077{
1078 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1079
1080 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1081 * the bitops (especially ~) can create illegal UTF-8.
1082 * In other words: in Perl UTF-8 is not just for Unicode. */
1083
1084 assert(start <= s);
1085 assert(off <= 0);
1086
1087 while (off++ && s > start) {
1088 s--;
1089 while (UTF8_IS_CONTINUATION(*s) && s > start)
1090 s--;
1091 }
1092
de979548 1093 GCC_DIAG_IGNORE(-Wcast-qual);
65df57a8 1094 return (U8 *)s;
de979548 1095 GCC_DIAG_RESTORE;
65df57a8
TC
1096}
1097
1098/*
1099=for apidoc utf8_hop_safe
1100
1101Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1102either forward or backward.
1103
1104When moving backward it will not move before C<start>.
1105
1106When moving forward it will not move beyond C<end>.
1107
1108Will not exceed those limits even if the string is not valid "UTF-8".
1109
1110=cut
1111*/
1112
1113PERL_STATIC_INLINE U8 *
1114Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1115{
1116 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1117
1118 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1119 * the bitops (especially ~) can create illegal UTF-8.
1120 * In other words: in Perl UTF-8 is not just for Unicode. */
1121
1122 assert(start <= s && s <= end);
1123
1124 if (off >= 0) {
1125 return utf8_hop_forward(s, off, end);
1126 }
1127 else {
1128 return utf8_hop_back(s, off, start);
1129 }
1130}
1131
1132/*
4dab108f
KW
1133
1134=for apidoc is_utf8_valid_partial_char
1135
6cbb9248
KW
1136Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1137S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1138points. Otherwise, it returns 1 if there exists at least one non-empty
1139sequence of bytes that when appended to sequence C<s>, starting at position
1140C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1141otherwise returns 0.
1142
1143In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1144point.
1145
1146This is useful when a fixed-length buffer is being tested for being well-formed
1147UTF-8, but the final few bytes in it don't comprise a full character; that is,
1148it is split somewhere in the middle of the final code point's UTF-8
1149representation. (Presumably when the buffer is refreshed with the next chunk
1150of data, the new first bytes will complete the partial code point.) This
1151function is used to verify that the final bytes in the current buffer are in
1152fact the legal beginning of some code point, so that if they aren't, the
1153failure can be signalled without having to wait for the next read.
4dab108f
KW
1154
1155=cut
1156*/
2717076a
KW
1157#define is_utf8_valid_partial_char(s, e) \
1158 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
1159
1160/*
1161
1162=for apidoc is_utf8_valid_partial_char_flags
1163
1164Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1165or not the input is a valid UTF-8 encoded partial character, but it takes an
1166extra parameter, C<flags>, which can further restrict which code points are
1167considered valid.
1168
1169If C<flags> is 0, this behaves identically to
1170C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1171of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1172there is any sequence of bytes that can complete the input partial character in
1173such a way that a non-prohibited character is formed, the function returns
2717076a
KW
1174TRUE; otherwise FALSE. Non character code points cannot be determined based on
1175partial character input. But many of the other possible excluded types can be
f1c999a7
KW
1176determined from just the first one or two bytes.
1177
1178=cut
1179 */
1180
56e4cf64 1181PERL_STATIC_INLINE bool
f1c999a7 1182S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
4dab108f 1183{
f1c999a7 1184 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
4dab108f 1185
f1c999a7 1186 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1187 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 1188
8875bd48 1189 if (s >= e || s + UTF8SKIP(s) <= e) {
4dab108f
KW
1190 return FALSE;
1191 }
1192
f1c999a7 1193 return cBOOL(_is_utf8_char_helper(s, e, flags));
4dab108f
KW
1194}
1195
8bc127bf
KW
1196/*
1197
1198=for apidoc is_utf8_fixed_width_buf_flags
1199
1200Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1201is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1202otherwise it returns FALSE.
1203
1204If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1205without restriction. If the final few bytes of the buffer do not form a
1206complete code point, this will return TRUE anyway, provided that
1207C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1208
1209If C<flags> in non-zero, it can be any combination of the
1210C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1211same meanings.
1212
1213This function differs from C<L</is_utf8_string_flags>> only in that the latter
1214returns FALSE if the final few bytes of the string don't form a complete code
1215point.
1216
1217=cut
1218 */
1219#define is_utf8_fixed_width_buf_flags(s, len, flags) \
1220 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1221
1222/*
1223
1224=for apidoc is_utf8_fixed_width_buf_loc_flags
1225
1226Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1227failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1228to the beginning of any partial character at the end of the buffer; if there is
1229no partial character C<*ep> will contain C<s>+C<len>.
1230
1231See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1232
1233=cut
1234*/
1235
1236#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1237 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1238
1239/*
1240
1241=for apidoc is_utf8_fixed_width_buf_loclen_flags
1242
1243Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1244complete, valid characters found in the C<el> pointer.
1245
1246=cut
1247*/
1248
1249PERL_STATIC_INLINE bool
1250S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
1251 const STRLEN len,
1252 const U8 **ep,
1253 STRLEN *el,
1254 const U32 flags)
1255{
1256 const U8 * maybe_partial;
1257
1258 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1259
1260 if (! ep) {
1261 ep = &maybe_partial;
1262 }
1263
1264 /* If it's entirely valid, return that; otherwise see if the only error is
1265 * that the final few bytes are for a partial character */
1266 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1267 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1268}
1269
c8028aa6
TC
1270/* ------------------------------- perl.h ----------------------------- */
1271
1272/*
dcccc8ff
KW
1273=head1 Miscellaneous Functions
1274
41188aa0 1275=for apidoc AiR|bool|is_safe_syscall|const char *pv|STRLEN len|const char *what|const char *op_name
c8028aa6 1276
6602b933 1277Test that the given C<pv> doesn't contain any internal C<NUL> characters.
796b6530 1278If it does, set C<errno> to C<ENOENT>, optionally warn, and return FALSE.
c8028aa6
TC
1279
1280Return TRUE if the name is safe.
1281
796b6530 1282Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
1283
1284=cut
1285*/
1286
1287PERL_STATIC_INLINE bool
41188aa0 1288S_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name) {
c8028aa6
TC
1289 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1290 * perl itself uses xce*() functions which accept 8-bit strings.
1291 */
1292
1293 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1294
6c4650b3 1295 if (len > 1) {
c8028aa6 1296 char *null_at;
41188aa0 1297 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 1298 SETERRNO(ENOENT, LIB_INVARG);
1d505182 1299 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 1300 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 1301 what, op_name, pv, null_at+1);
c8028aa6
TC
1302 return FALSE;
1303 }
1304 }
1305
1306 return TRUE;
1307}
1308
1309/*
7cb3f959
TC
1310
1311Return true if the supplied filename has a newline character
fa6c7d00 1312immediately before the first (hopefully only) NUL.
7cb3f959
TC
1313
1314My original look at this incorrectly used the len from SvPV(), but
1315that's incorrect, since we allow for a NUL in pv[len-1].
1316
1317So instead, strlen() and work from there.
1318
1319This allow for the user reading a filename, forgetting to chomp it,
1320then calling:
1321
1322 open my $foo, "$file\0";
1323
1324*/
1325
1326#ifdef PERL_CORE
1327
1328PERL_STATIC_INLINE bool
1329S_should_warn_nl(const char *pv) {
1330 STRLEN len;
1331
1332 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1333
1334 len = strlen(pv);
1335
1336 return len > 0 && pv[len-1] == '\n';
1337}
1338
1339#endif
1340
81d52ecd
JH
1341/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
1342
1343#define MAX_CHARSET_NAME_LENGTH 2
1344
1345PERL_STATIC_INLINE const char *
1346get_regex_charset_name(const U32 flags, STRLEN* const lenp)
1347{
1348 /* Returns a string that corresponds to the name of the regex character set
1349 * given by 'flags', and *lenp is set the length of that string, which
1350 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
1351
1352 *lenp = 1;
1353 switch (get_regex_charset(flags)) {
1354 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
1355 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
1356 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
1357 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
1358 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
1359 *lenp = 2;
1360 return ASCII_MORE_RESTRICT_PAT_MODS;
1361 }
1362 /* The NOT_REACHED; hides an assert() which has a rather complex
1363 * definition in perl.h. */
1364 NOT_REACHED; /* NOTREACHED */
1365 return "?"; /* Unknown */
1366}
1367
7cb3f959 1368/*
ed382232
TC
1369
1370Return false if any get magic is on the SV other than taint magic.
1371
1372*/
1373
1374PERL_STATIC_INLINE bool
1375S_sv_only_taint_gmagic(SV *sv) {
1376 MAGIC *mg = SvMAGIC(sv);
1377
1378 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
1379
1380 while (mg) {
1381 if (mg->mg_type != PERL_MAGIC_taint
1382 && !(mg->mg_flags & MGf_GSKIP)
1383 && mg->mg_virtual->svt_get) {
1384 return FALSE;
1385 }
1386 mg = mg->mg_moremagic;
1387 }
1388
1389 return TRUE;
1390}
1391
ed8ff0f3
DM
1392/* ------------------ cop.h ------------------------------------------- */
1393
1394
1395/* Enter a block. Push a new base context and return its address. */
1396
1397PERL_STATIC_INLINE PERL_CONTEXT *
1398S_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
1399{
1400 PERL_CONTEXT * cx;
1401
1402 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
1403
1404 CXINC;
1405 cx = CX_CUR();
1406 cx->cx_type = type;
1407 cx->blk_gimme = gimme;
1408 cx->blk_oldsaveix = saveix;
4caf7d8c 1409 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 1410 cx->blk_oldcop = PL_curcop;
4caf7d8c 1411 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
1412 cx->blk_oldscopesp = PL_scopestack_ix;
1413 cx->blk_oldpm = PL_curpm;
ce8bb8d8 1414 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
1415
1416 PL_tmps_floor = PL_tmps_ix;
1417 CX_DEBUG(cx, "PUSH");
1418 return cx;
1419}
1420
1421
1422/* Exit a block (RETURN and LAST). */
1423
1424PERL_STATIC_INLINE void
1425S_cx_popblock(pTHX_ PERL_CONTEXT *cx)
1426{
1427 PERL_ARGS_ASSERT_CX_POPBLOCK;
1428
1429 CX_DEBUG(cx, "POP");
1430 /* these 3 are common to cx_popblock and cx_topblock */
1431 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
1432 PL_scopestack_ix = cx->blk_oldscopesp;
1433 PL_curpm = cx->blk_oldpm;
1434
1435 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
1436 * and leaves a CX entry lying around for repeated use, so
1437 * skip for multicall */ \
1438 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
1439 || PL_savestack_ix == cx->blk_oldsaveix);
1440 PL_curcop = cx->blk_oldcop;
ce8bb8d8 1441 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
1442}
1443
1444/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
1445 * Whereas cx_popblock() restores the state to the point just before
1446 * cx_pushblock() was called, cx_topblock() restores it to the point just
1447 * *after* cx_pushblock() was called. */
1448
1449PERL_STATIC_INLINE void
1450S_cx_topblock(pTHX_ PERL_CONTEXT *cx)
1451{
1452 PERL_ARGS_ASSERT_CX_TOPBLOCK;
1453
1454 CX_DEBUG(cx, "TOP");
1455 /* these 3 are common to cx_popblock and cx_topblock */
1456 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
1457 PL_scopestack_ix = cx->blk_oldscopesp;
1458 PL_curpm = cx->blk_oldpm;
1459
1460 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
1461}
1462
1463
a73d8813
DM
1464PERL_STATIC_INLINE void
1465S_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
1466{
1467 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
1468
1469 PERL_ARGS_ASSERT_CX_PUSHSUB;
1470
3f6bd23a 1471 PERL_DTRACE_PROBE_ENTRY(cv);
a73d8813
DM
1472 cx->blk_sub.cv = cv;
1473 cx->blk_sub.olddepth = CvDEPTH(cv);
1474 cx->blk_sub.prevcomppad = PL_comppad;
1475 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
1476 cx->blk_sub.retop = retop;
1477 SvREFCNT_inc_simple_void_NN(cv);
1478 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
1479}
1480
1481
1482/* subsets of cx_popsub() */
1483
1484PERL_STATIC_INLINE void
1485S_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
1486{
1487 CV *cv;
1488
1489 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
1490 assert(CxTYPE(cx) == CXt_SUB);
1491
1492 PL_comppad = cx->blk_sub.prevcomppad;
1493 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
1494 cv = cx->blk_sub.cv;
1495 CvDEPTH(cv) = cx->blk_sub.olddepth;
1496 cx->blk_sub.cv = NULL;
1497 SvREFCNT_dec(cv);
1498}
1499
1500
1501/* handle the @_ part of leaving a sub */
1502
1503PERL_STATIC_INLINE void
1504S_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
1505{
1506 AV *av;
1507
1508 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
1509 assert(CxTYPE(cx) == CXt_SUB);
1510 assert(AvARRAY(MUTABLE_AV(
1511 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
1512 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
1513
1514 CX_POP_SAVEARRAY(cx);
1515 av = MUTABLE_AV(PAD_SVl(0));
1516 if (UNLIKELY(AvREAL(av)))
1517 /* abandon @_ if it got reified */
1518 clear_defarray(av, 0);
1519 else {
1520 CLEAR_ARGARRAY(av);
1521 }
1522}
1523
1524
1525PERL_STATIC_INLINE void
1526S_cx_popsub(pTHX_ PERL_CONTEXT *cx)
1527{
1528 PERL_ARGS_ASSERT_CX_POPSUB;
1529 assert(CxTYPE(cx) == CXt_SUB);
1530
3f6bd23a 1531 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
1532
1533 if (CxHASARGS(cx))
1534 cx_popsub_args(cx);
1535 cx_popsub_common(cx);
1536}
1537
1538
6a7d52cc
DM
1539PERL_STATIC_INLINE void
1540S_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
1541{
1542 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
1543
1544 cx->blk_format.cv = cv;
1545 cx->blk_format.retop = retop;
1546 cx->blk_format.gv = gv;
1547 cx->blk_format.dfoutgv = PL_defoutgv;
1548 cx->blk_format.prevcomppad = PL_comppad;
1549 cx->blk_u16 = 0;
1550
1551 SvREFCNT_inc_simple_void_NN(cv);
1552 CvDEPTH(cv)++;
1553 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
1554}
1555
1556
1557PERL_STATIC_INLINE void
1558S_cx_popformat(pTHX_ PERL_CONTEXT *cx)
1559{
1560 CV *cv;
1561 GV *dfout;
1562
1563 PERL_ARGS_ASSERT_CX_POPFORMAT;
1564 assert(CxTYPE(cx) == CXt_FORMAT);
1565
1566 dfout = cx->blk_format.dfoutgv;
1567 setdefout(dfout);
1568 cx->blk_format.dfoutgv = NULL;
1569 SvREFCNT_dec_NN(dfout);
1570
1571 PL_comppad = cx->blk_format.prevcomppad;
1572 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
1573 cv = cx->blk_format.cv;
1574 cx->blk_format.cv = NULL;
1575 --CvDEPTH(cv);
1576 SvREFCNT_dec_NN(cv);
1577}
1578
1579
13febba5
DM
1580PERL_STATIC_INLINE void
1581S_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
1582{
1583 PERL_ARGS_ASSERT_CX_PUSHEVAL;
1584
1585 cx->blk_eval.retop = retop;
1586 cx->blk_eval.old_namesv = namesv;
1587 cx->blk_eval.old_eval_root = PL_eval_root;
1588 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
1589 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
1590 cx->blk_eval.cur_top_env = PL_top_env;
1591
4c57ced5 1592 assert(!(PL_in_eval & ~ 0x3F));
13febba5 1593 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 1594 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
1595}
1596
1597
1598PERL_STATIC_INLINE void
1599S_cx_popeval(pTHX_ PERL_CONTEXT *cx)
1600{
1601 SV *sv;
1602
1603 PERL_ARGS_ASSERT_CX_POPEVAL;
1604 assert(CxTYPE(cx) == CXt_EVAL);
1605
1606 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 1607 assert(!(PL_in_eval & 0xc0));
13febba5
DM
1608 PL_eval_root = cx->blk_eval.old_eval_root;
1609 sv = cx->blk_eval.cur_text;
4c57ced5 1610 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
1611 cx->blk_eval.cur_text = NULL;
1612 SvREFCNT_dec_NN(sv);
1613 }
1614
1615 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
1616 if (sv) {
1617 cx->blk_eval.old_namesv = NULL;
1618 SvREFCNT_dec_NN(sv);
1619 }
13febba5 1620}
6a7d52cc 1621
a73d8813 1622
d1b6bf72
DM
1623/* push a plain loop, i.e.
1624 * { block }
1625 * while (cond) { block }
1626 * for (init;cond;continue) { block }
1627 * This loop can be last/redo'ed etc.
1628 */
1629
1630PERL_STATIC_INLINE void
1631S_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
1632{
1633 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
1634 cx->blk_loop.my_op = cLOOP;
1635}
1636
1637
1638/* push a true for loop, i.e.
1639 * for var (list) { block }
1640 */
1641
1642PERL_STATIC_INLINE void
1643S_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
1644{
1645 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
1646
1647 /* this one line is common with cx_pushloop_plain */
1648 cx->blk_loop.my_op = cLOOP;
1649
1650 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
1651 cx->blk_loop.itersave = itersave;
1652#ifdef USE_ITHREADS
1653 cx->blk_loop.oldcomppad = PL_comppad;
1654#endif
1655}
1656
1657
1658/* pop all loop types, including plain */
1659
1660PERL_STATIC_INLINE void
1661S_cx_poploop(pTHX_ PERL_CONTEXT *cx)
1662{
1663 PERL_ARGS_ASSERT_CX_POPLOOP;
1664
1665 assert(CxTYPE_is_LOOP(cx));
1666 if ( CxTYPE(cx) == CXt_LOOP_ARY
1667 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
1668 {
1669 /* Free ary or cur. This assumes that state_u.ary.ary
1670 * aligns with state_u.lazysv.cur. See cx_dup() */
1671 SV *sv = cx->blk_loop.state_u.lazysv.cur;
1672 cx->blk_loop.state_u.lazysv.cur = NULL;
1673 SvREFCNT_dec_NN(sv);
1674 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
1675 sv = cx->blk_loop.state_u.lazysv.end;
1676 cx->blk_loop.state_u.lazysv.end = NULL;
1677 SvREFCNT_dec_NN(sv);
1678 }
1679 }
1680 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
1681 SV *cursv;
1682 SV **svp = (cx)->blk_loop.itervar_u.svp;
1683 if ((cx->cx_type & CXp_FOR_GV))
1684 svp = &GvSV((GV*)svp);
1685 cursv = *svp;
1686 *svp = cx->blk_loop.itersave;
1687 cx->blk_loop.itersave = NULL;
1688 SvREFCNT_dec(cursv);
1689 }
1690}
1691
2a7b7c61
DM
1692
1693PERL_STATIC_INLINE void
1694S_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
1695{
1696 PERL_ARGS_ASSERT_CX_PUSHWHEN;
1697
1698 cx->blk_givwhen.leave_op = cLOGOP->op_other;
1699}
1700
1701
1702PERL_STATIC_INLINE void
1703S_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
1704{
1705 PERL_ARGS_ASSERT_CX_POPWHEN;
1706 assert(CxTYPE(cx) == CXt_WHEN);
1707
1708 PERL_UNUSED_ARG(cx);
59a14f30 1709 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
1710 /* currently NOOP */
1711}
1712
1713
1714PERL_STATIC_INLINE void
1715S_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
1716{
1717 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
1718
1719 cx->blk_givwhen.leave_op = cLOGOP->op_other;
1720 cx->blk_givwhen.defsv_save = orig_defsv;
1721}
1722
1723
1724PERL_STATIC_INLINE void
1725S_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
1726{
1727 SV *sv;
1728
1729 PERL_ARGS_ASSERT_CX_POPGIVEN;
1730 assert(CxTYPE(cx) == CXt_GIVEN);
1731
1732 sv = GvSV(PL_defgv);
1733 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
1734 cx->blk_givwhen.defsv_save = NULL;
1735 SvREFCNT_dec(sv);
1736}
1737
ec2c235b
KW
1738/* ------------------ util.h ------------------------------------------- */
1739
1740/*
1741=head1 Miscellaneous Functions
1742
1743=for apidoc foldEQ
1744
1745Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
1746same
1747case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
1748match themselves and their opposite case counterparts. Non-cased and non-ASCII
1749range bytes match only themselves.
1750
1751=cut
1752*/
1753
1754PERL_STATIC_INLINE I32
1755Perl_foldEQ(const char *s1, const char *s2, I32 len)
1756{
1757 const U8 *a = (const U8 *)s1;
1758 const U8 *b = (const U8 *)s2;
1759
1760 PERL_ARGS_ASSERT_FOLDEQ;
1761
1762 assert(len >= 0);
1763
1764 while (len--) {
1765 if (*a != *b && *a != PL_fold[*b])
1766 return 0;
1767 a++,b++;
1768 }
1769 return 1;
1770}
1771
0f9cb40c 1772PERL_STATIC_INLINE I32
ec2c235b
KW
1773Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
1774{
1775 /* Compare non-utf8 using Unicode (Latin1) semantics. Does not work on
1776 * MICRO_SIGN, LATIN_SMALL_LETTER_SHARP_S, nor
1777 * LATIN_SMALL_LETTER_Y_WITH_DIAERESIS, and does not check for these. Nor
1778 * does it check that the strings each have at least 'len' characters */
1779
1780 const U8 *a = (const U8 *)s1;
1781 const U8 *b = (const U8 *)s2;
1782
1783 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
1784
1785 assert(len >= 0);
1786
1787 while (len--) {
1788 if (*a != *b && *a != PL_fold_latin1[*b]) {
1789 return 0;
1790 }
1791 a++, b++;
1792 }
1793 return 1;
1794}
1795
1796/*
1797=for apidoc foldEQ_locale
1798
1799Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
1800same case-insensitively in the current locale; false otherwise.
1801
1802=cut
1803*/
1804
0f9cb40c 1805PERL_STATIC_INLINE I32
ec2c235b
KW
1806Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
1807{
1808 dVAR;
1809 const U8 *a = (const U8 *)s1;
1810 const U8 *b = (const U8 *)s2;
1811
1812 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
1813
1814 assert(len >= 0);
1815
1816 while (len--) {
1817 if (*a != *b && *a != PL_fold_locale[*b])
1818 return 0;
1819 a++,b++;
1820 }
1821 return 1;
1822}
1823
6dba01e2
KW
1824#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
1825
1826PERL_STATIC_INLINE void *
1827S_my_memrchr(const char * s, const char c, const STRLEN len)
1828{
1829 /* memrchr(), since many platforms lack it */
1830
1831 const char * t = s + len - 1;
1832
1833 PERL_ARGS_ASSERT_MY_MEMRCHR;
1834
1835 while (t >= s) {
1836 if (*t == c) {
1837 return (void *) t;
1838 }
1839 t--;
1840 }
1841
1842 return NULL;
1843}
1844
1845#endif
1846
ed382232 1847/*
c8028aa6
TC
1848 * ex: set ts=8 sts=4 sw=4 et:
1849 */