This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix and clarify the pod for utf8_length()
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 * This file is a home for static inline functions that cannot go in other
9 * headers files, because they depend on proto.h (included after most other
10 * headers) or struct definitions.
11 *
12 * Each section names the header file that the functions "belong" to.
13 */
27669aa4 14
be3a7a5d
KW
15/* ------------------------------- av.h ------------------------------- */
16
c70927a6 17PERL_STATIC_INLINE SSize_t
be3a7a5d
KW
18S_av_top_index(pTHX_ AV *av)
19{
20 PERL_ARGS_ASSERT_AV_TOP_INDEX;
21 assert(SvTYPE(av) == SVt_PVAV);
22
23 return AvFILL(av);
24}
25
1afe1db1
FC
26/* ------------------------------- cv.h ------------------------------- */
27
ae77754a
FC
28PERL_STATIC_INLINE GV *
29S_CvGV(pTHX_ CV *sv)
30{
31 return CvNAMED(sv)
32 ? Perl_cvgv_from_hek(aTHX_ sv)
33 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
34}
35
1afe1db1
FC
36PERL_STATIC_INLINE I32 *
37S_CvDEPTHp(const CV * const sv)
38{
39 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
8de47657 40 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
41}
42
d16269d8
PM
43/*
44 CvPROTO returns the prototype as stored, which is not necessarily what
45 the interpreter should be using. Specifically, the interpreter assumes
46 that spaces have been stripped, which has been the case if the prototype
47 was added by toke.c, but is generally not the case if it was added elsewhere.
48 Since we can't enforce the spacelessness at assignment time, this routine
49 provides a temporary copy at parse time with spaces removed.
50 I<orig> is the start of the original buffer, I<len> is the length of the
51 prototype and will be updated when this returns.
52 */
53
5b67adb8 54#ifdef PERL_CORE
d16269d8
PM
55PERL_STATIC_INLINE char *
56S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
57{
58 SV * tmpsv;
59 char * tmps;
60 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
61 tmps = SvPVX(tmpsv);
62 while ((*len)--) {
63 if (!isSPACE(*orig))
64 *tmps++ = *orig;
65 orig++;
66 }
67 *tmps = '\0';
68 *len = tmps - SvPVX(tmpsv);
69 return SvPVX(tmpsv);
70}
5b67adb8 71#endif
d16269d8 72
25fdce4a
FC
73/* ------------------------------- mg.h ------------------------------- */
74
75#if defined(PERL_CORE) || defined(PERL_EXT)
76/* assumes get-magic and stringification have already occurred */
77PERL_STATIC_INLINE STRLEN
78S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
79{
80 assert(mg->mg_type == PERL_MAGIC_regex_global);
81 assert(mg->mg_len != -1);
82 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
83 return (STRLEN)mg->mg_len;
84 else {
85 const STRLEN pos = (STRLEN)mg->mg_len;
86 /* Without this check, we may read past the end of the buffer: */
87 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
88 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
89 }
90}
91#endif
92
03414f05
FC
93/* ------------------------------- pad.h ------------------------------ */
94
95#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
96PERL_STATIC_INLINE bool
97PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
98{
99 /* is seq within the range _LOW to _HIGH ?
100 * This is complicated by the fact that PL_cop_seqmax
101 * may have wrapped around at some point */
102 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
103 return FALSE; /* not yet introduced */
104
105 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
106 /* in compiling scope */
107 if (
108 (seq > COP_SEQ_RANGE_LOW(pn))
109 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
110 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
111 )
112 return TRUE;
113 }
114 else if (
115 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
116 ?
117 ( seq > COP_SEQ_RANGE_LOW(pn)
118 || seq <= COP_SEQ_RANGE_HIGH(pn))
119
120 : ( seq > COP_SEQ_RANGE_LOW(pn)
121 && seq <= COP_SEQ_RANGE_HIGH(pn))
122 )
123 return TRUE;
124 return FALSE;
125}
126#endif
127
33a4312b
FC
128/* ------------------------------- pp.h ------------------------------- */
129
130PERL_STATIC_INLINE I32
131S_TOPMARK(pTHX)
132{
133 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 134 "MARK top %p %" IVdf "\n",
33a4312b
FC
135 PL_markstack_ptr,
136 (IV)*PL_markstack_ptr)));
137 return *PL_markstack_ptr;
138}
139
140PERL_STATIC_INLINE I32
141S_POPMARK(pTHX)
142{
143 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 144 "MARK pop %p %" IVdf "\n",
33a4312b
FC
145 (PL_markstack_ptr-1),
146 (IV)*(PL_markstack_ptr-1))));
147 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
148 return *PL_markstack_ptr--;
149}
150
8d919b0a
FC
151/* ----------------------------- regexp.h ----------------------------- */
152
153PERL_STATIC_INLINE struct regexp *
154S_ReANY(const REGEXP * const re)
155{
df6b4bd5 156 XPV* const p = (XPV*)SvANY(re);
8d919b0a 157 assert(isREGEXP(re));
df6b4bd5
DM
158 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
159 : (struct regexp *)p;
8d919b0a
FC
160}
161
27669aa4
FC
162/* ------------------------------- sv.h ------------------------------- */
163
164PERL_STATIC_INLINE SV *
165S_SvREFCNT_inc(SV *sv)
166{
2439e033 167 if (LIKELY(sv != NULL))
27669aa4
FC
168 SvREFCNT(sv)++;
169 return sv;
170}
171PERL_STATIC_INLINE SV *
172S_SvREFCNT_inc_NN(SV *sv)
173{
174 SvREFCNT(sv)++;
175 return sv;
176}
177PERL_STATIC_INLINE void
178S_SvREFCNT_inc_void(SV *sv)
179{
2439e033 180 if (LIKELY(sv != NULL))
27669aa4
FC
181 SvREFCNT(sv)++;
182}
75e16a44
FC
183PERL_STATIC_INLINE void
184S_SvREFCNT_dec(pTHX_ SV *sv)
185{
2439e033 186 if (LIKELY(sv != NULL)) {
75a9bf96 187 U32 rc = SvREFCNT(sv);
79e2a32a 188 if (LIKELY(rc > 1))
75a9bf96
DM
189 SvREFCNT(sv) = rc - 1;
190 else
191 Perl_sv_free2(aTHX_ sv, rc);
75e16a44
FC
192 }
193}
541377b1
FC
194
195PERL_STATIC_INLINE void
4a9a56a7
DM
196S_SvREFCNT_dec_NN(pTHX_ SV *sv)
197{
198 U32 rc = SvREFCNT(sv);
79e2a32a 199 if (LIKELY(rc > 1))
4a9a56a7
DM
200 SvREFCNT(sv) = rc - 1;
201 else
202 Perl_sv_free2(aTHX_ sv, rc);
203}
204
205PERL_STATIC_INLINE void
541377b1
FC
206SvAMAGIC_on(SV *sv)
207{
208 assert(SvROK(sv));
209 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
210}
211PERL_STATIC_INLINE void
212SvAMAGIC_off(SV *sv)
213{
214 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
215 HvAMAGIC_off(SvSTASH(SvRV(sv)));
216}
217
218PERL_STATIC_INLINE U32
541377b1
FC
219S_SvPADSTALE_on(SV *sv)
220{
c0683843 221 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
222 return SvFLAGS(sv) |= SVs_PADSTALE;
223}
224PERL_STATIC_INLINE U32
225S_SvPADSTALE_off(SV *sv)
226{
c0683843 227 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
228 return SvFLAGS(sv) &= ~SVs_PADSTALE;
229}
25fdce4a 230#if defined(PERL_CORE) || defined (PERL_EXT)
4ddea69a 231PERL_STATIC_INLINE STRLEN
6964422a 232S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
4ddea69a 233{
25fdce4a 234 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
4ddea69a
FC
235 if (SvGAMAGIC(sv)) {
236 U8 *hopped = utf8_hop((U8 *)pv, pos);
237 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
238 return (STRLEN)(hopped - (U8 *)pv);
239 }
240 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
241}
242#endif
f019c49e 243
d1decf2b
TC
244/* ------------------------------- handy.h ------------------------------- */
245
246/* saves machine code for a common noreturn idiom typically used in Newx*() */
c1d6452f 247#ifdef GCC_DIAG_PRAGMA
6ab56f1e 248GCC_DIAG_IGNORE(-Wunused-function) /* Intentionally left semicolonless. */
c1d6452f 249#endif
d1decf2b
TC
250static void
251S_croak_memory_wrap(void)
252{
253 Perl_croak_nocontext("%s",PL_memory_wrap);
254}
c1d6452f 255#ifdef GCC_DIAG_PRAGMA
6ab56f1e 256GCC_DIAG_RESTORE /* Intentionally left semicolonless. */
c1d6452f 257#endif
d1decf2b 258
a8a2ceaa
KW
259/* ------------------------------- utf8.h ------------------------------- */
260
2fe720e2
KW
261/*
262=head1 Unicode Support
263*/
264
55d09dc8
KW
265PERL_STATIC_INLINE void
266S_append_utf8_from_native_byte(const U8 byte, U8** dest)
267{
268 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
269 * encoded string at '*dest', updating '*dest' to include it */
270
55d09dc8
KW
271 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
272
6f2d5cbc 273 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 274 *((*dest)++) = byte;
55d09dc8 275 else {
a09ec51a
KW
276 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
277 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
278 }
279}
280
e123187a 281/*
2fe720e2 282=for apidoc valid_utf8_to_uvchr
2717076a 283Like C<L</utf8_to_uvchr_buf>>, but should only be called when it is known that
2fe720e2
KW
284the next character in the input UTF-8 string C<s> is well-formed (I<e.g.>,
285it passes C<L</isUTF8_CHAR>>. Surrogates, non-character code points, and
286non-Unicode code points are allowed.
287
288=cut
289
290 */
291
292PERL_STATIC_INLINE UV
293Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
294{
c41b2540 295 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
296 const U8* send = s + expectlen;
297 UV uv = *s;
298
299 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
300
301 if (retlen) {
302 *retlen = expectlen;
303 }
304
305 /* An invariant is trivially returned */
306 if (expectlen == 1) {
307 return uv;
308 }
309
310 /* Remove the leading bits that indicate the number of bytes, leaving just
311 * the bits that are part of the value */
312 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
313
314 /* Now, loop through the remaining bytes, accumulating each into the
315 * working total as we go. (I khw tried unrolling the loop for up to 4
316 * bytes, but there was no performance improvement) */
317 for (++s; s < send; s++) {
318 uv = UTF8_ACCUMULATE(uv, *s);
319 }
320
321 return UNI_TO_NATIVE(uv);
322
323}
324
1e599354
KW
325/*
326=for apidoc is_utf8_invariant_string
327
82c5d941 328Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 329regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
330EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
331are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
332the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
333characters are invariant, but so also are the C1 controls.
1e599354
KW
334
335If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
336use this option, that C<s> can't have embedded C<NUL> characters and has to
337have a terminating C<NUL> byte).
338
9f2abfde
KW
339See also
340C<L</is_utf8_string>>,
341C<L</is_utf8_string_flags>>,
342C<L</is_utf8_string_loc>>,
343C<L</is_utf8_string_loc_flags>>,
344C<L</is_utf8_string_loclen>>,
345C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
346C<L</is_utf8_fixed_width_buf_flags>>,
347C<L</is_utf8_fixed_width_buf_loc_flags>>,
348C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
349C<L</is_strict_utf8_string>>,
350C<L</is_strict_utf8_string_loc>>,
351C<L</is_strict_utf8_string_loclen>>,
352C<L</is_c9strict_utf8_string>>,
353C<L</is_c9strict_utf8_string_loc>>,
354and
355C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
356
357=cut
0cbf5865
KW
358
359*/
360
361#define is_utf8_invariant_string(s, len) \
362 is_utf8_invariant_string_loc(s, len, NULL)
363
364/*
365=for apidoc is_utf8_invariant_string_loc
366
367Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
368the first UTF-8 variant character in the C<ep> pointer; if all characters are
369UTF-8 invariant, this function does not change the contents of C<*ep>.
370
371=cut
372
1e599354
KW
373*/
374
375PERL_STATIC_INLINE bool
e17544a6 376S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 377{
e17544a6 378 const U8* send;
1e599354
KW
379 const U8* x = s;
380
0cbf5865
KW
381 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
382
e17544a6
KW
383 if (len == 0) {
384 len = strlen((const char *)s);
385 }
386
387 send = s + len;
388
389#ifndef EBCDIC
390 /* Try to get the widest word on this platform */
391# ifdef HAS_LONG_LONG
392# define PERL_WORDCAST unsigned long long
393# define PERL_WORDSIZE LONGLONGSIZE
394# else
395# define PERL_WORDCAST UV
396# define PERL_WORDSIZE UVSIZE
397# endif
398
399# if PERL_WORDSIZE == 4
400# define PERL_VARIANTS_WORD_MASK 0x80808080
401# define PERL_WORD_BOUNDARY_MASK 0x3
402# elif PERL_WORDSIZE == 8
403# define PERL_VARIANTS_WORD_MASK 0x8080808080808080
404# define PERL_WORD_BOUNDARY_MASK 0x7
405# else
406# error Unexpected word size
407# endif
408
409 /* Process per-byte until reach word boundary. XXX This loop could be
410 * eliminated if we knew that this platform had fast unaligned reads */
411 while (x < send && (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) {
412 if (! UTF8_IS_INVARIANT(*x)) {
413 if (ep) {
414 *ep = x;
415 }
416
417 return FALSE;
418 }
419 x++;
420 }
421
422 /* Process per-word as long as we have at least a full word left */
423 while (x + PERL_WORDSIZE <= send) {
424 if ((* (PERL_WORDCAST *) x) & PERL_VARIANTS_WORD_MASK) {
425
426 /* Found a variant. Just return if caller doesn't want its exact
427 * position */
428 if (! ep) {
429 return FALSE;
430 }
431
432 /* Otherwise fall into final loop to find which byte it is */
433 break;
0cbf5865 434 }
e17544a6
KW
435 x += PERL_WORDSIZE;
436 }
437
438# undef PERL_WORDCAST
439# undef PERL_WORDSIZE
440# undef PERL_WORD_BOUNDARY_MASK
441# undef PERL_VARIANTS_WORD_MASK
442#endif
443
444 /* Process per-byte */
445 while (x < send) {
446 if (! UTF8_IS_INVARIANT(*x)) {
447 if (ep) {
448 *ep = x;
449 }
0cbf5865 450
e17544a6 451 return FALSE;
0cbf5865 452 }
1e599354 453
e17544a6 454 x++;
1e599354
KW
455 }
456
457 return TRUE;
458}
459
7c93d8f0 460/*
5ff889fb
KW
461=for apidoc is_utf8_string
462
82c5d941
KW
463Returns TRUE if the first C<len> bytes of string C<s> form a valid
464Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
465be calculated using C<strlen(s)> (which means if you use this option, that C<s>
466can't have embedded C<NUL> characters and has to have a terminating C<NUL>
467byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
468
2717076a
KW
469This function considers Perl's extended UTF-8 to be valid. That means that
470code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
471considered valid by this function. Use C<L</is_strict_utf8_string>>,
472C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
473code points are considered valid.
5ff889fb 474
9f2abfde
KW
475See also
476C<L</is_utf8_invariant_string>>,
0cbf5865 477C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
478C<L</is_utf8_string_loc>>,
479C<L</is_utf8_string_loclen>>,
8bc127bf
KW
480C<L</is_utf8_fixed_width_buf_flags>>,
481C<L</is_utf8_fixed_width_buf_loc_flags>>,
482C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
483
484=cut
485*/
486
dd237e82 487#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
5ff889fb 488
c9cd936b
KW
489#if defined(PERL_CORE) || defined (PERL_EXT)
490
491/*
492=for apidoc is_utf8_non_invariant_string
493
494Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
495C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
496UTF-8; otherwise returns FALSE.
497
498A TRUE return means that at least one code point represented by the sequence
499either is a wide character not representable as a single byte, or the
500representation differs depending on whether the sequence is encoded in UTF-8 or
501not.
502
503See also
504C<L<perlapi/is_utf8_invariant_string>>,
505C<L<perlapi/is_utf8_string>>
506
507=cut
508
509This is commonly used to determine if a SV's UTF-8 flag should be turned on.
510It needn't be if its string is entirely UTF-8 invariant, and it shouldn't be if
511it otherwise contains invalid UTF-8.
512
513It is an internal function because khw thinks that XS code shouldn't be working
514at this low a level. A valid use case could change that.
515
516*/
517
518PERL_STATIC_INLINE bool
519S_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
520{
521 const U8 * first_variant;
522
523 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
524
525 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
526 return FALSE;
527 }
528
529 return is_utf8_string(first_variant, len - (first_variant - s));
530}
531
532#endif
533
5ff889fb 534/*
9f2abfde
KW
535=for apidoc is_strict_utf8_string
536
537Returns TRUE if the first C<len> bytes of string C<s> form a valid
538UTF-8-encoded string that is fully interchangeable by any application using
539Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
540calculated using C<strlen(s)> (which means if you use this option, that C<s>
541can't have embedded C<NUL> characters and has to have a terminating C<NUL>
542byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
543
544This function returns FALSE for strings containing any
545code points above the Unicode max of 0x10FFFF, surrogate code points, or
546non-character code points.
547
548See also
549C<L</is_utf8_invariant_string>>,
0cbf5865 550C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
551C<L</is_utf8_string>>,
552C<L</is_utf8_string_flags>>,
553C<L</is_utf8_string_loc>>,
554C<L</is_utf8_string_loc_flags>>,
555C<L</is_utf8_string_loclen>>,
556C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
557C<L</is_utf8_fixed_width_buf_flags>>,
558C<L</is_utf8_fixed_width_buf_loc_flags>>,
559C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
560C<L</is_strict_utf8_string_loc>>,
561C<L</is_strict_utf8_string_loclen>>,
562C<L</is_c9strict_utf8_string>>,
563C<L</is_c9strict_utf8_string_loc>>,
564and
565C<L</is_c9strict_utf8_string_loclen>>.
566
567=cut
568*/
569
dd237e82 570#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
571
572/*
573=for apidoc is_c9strict_utf8_string
574
575Returns TRUE if the first C<len> bytes of string C<s> form a valid
576UTF-8-encoded string that conforms to
577L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
578otherwise it returns FALSE. If C<len> is 0, it will be calculated using
579C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
580C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
581characters being ASCII constitute 'a valid UTF-8 string'.
582
583This function returns FALSE for strings containing any code points above the
584Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
585code points per
586L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
587
588See also
589C<L</is_utf8_invariant_string>>,
0cbf5865 590C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
591C<L</is_utf8_string>>,
592C<L</is_utf8_string_flags>>,
593C<L</is_utf8_string_loc>>,
594C<L</is_utf8_string_loc_flags>>,
595C<L</is_utf8_string_loclen>>,
596C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
597C<L</is_utf8_fixed_width_buf_flags>>,
598C<L</is_utf8_fixed_width_buf_loc_flags>>,
599C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
600C<L</is_strict_utf8_string>>,
601C<L</is_strict_utf8_string_loc>>,
602C<L</is_strict_utf8_string_loclen>>,
603C<L</is_c9strict_utf8_string_loc>>,
604and
605C<L</is_c9strict_utf8_string_loclen>>.
606
607=cut
608*/
609
dd237e82 610#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
9f2abfde
KW
611
612/*
613=for apidoc is_utf8_string_flags
614
615Returns TRUE if the first C<len> bytes of string C<s> form a valid
616UTF-8 string, subject to the restrictions imposed by C<flags>;
617returns FALSE otherwise. If C<len> is 0, it will be calculated
618using C<strlen(s)> (which means if you use this option, that C<s> can't have
619embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
620that all characters being ASCII constitute 'a valid UTF-8 string'.
621
622If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
623C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
624as C<L</is_strict_utf8_string>>; and if C<flags> is
625C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
626C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
627combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
628C<L</utf8n_to_uvchr>>, with the same meanings.
629
630See also
631C<L</is_utf8_invariant_string>>,
0cbf5865 632C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
633C<L</is_utf8_string>>,
634C<L</is_utf8_string_loc>>,
635C<L</is_utf8_string_loc_flags>>,
636C<L</is_utf8_string_loclen>>,
637C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
638C<L</is_utf8_fixed_width_buf_flags>>,
639C<L</is_utf8_fixed_width_buf_loc_flags>>,
640C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
641C<L</is_strict_utf8_string>>,
642C<L</is_strict_utf8_string_loc>>,
643C<L</is_strict_utf8_string_loclen>>,
644C<L</is_c9strict_utf8_string>>,
645C<L</is_c9strict_utf8_string_loc>>,
646and
647C<L</is_c9strict_utf8_string_loclen>>.
648
649=cut
650*/
651
652PERL_STATIC_INLINE bool
f60f61fd 653S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 654{
f60f61fd 655 const U8* send;
9f2abfde
KW
656 const U8* x = s;
657
658 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
659 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 660 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 661
f60f61fd
KW
662 if (len == 0) {
663 len = strlen((const char *)s);
664 }
665
9f2abfde
KW
666 if (flags == 0) {
667 return is_utf8_string(s, len);
668 }
669
d044b7a7 670 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
671 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
672 {
673 return is_strict_utf8_string(s, len);
674 }
675
d044b7a7 676 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
677 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
678 {
679 return is_c9strict_utf8_string(s, len);
680 }
681
f60f61fd 682 send = s + len;
9f2abfde
KW
683 while (x < send) {
684 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
685 if (UNLIKELY(! cur_len)) {
686 return FALSE;
687 }
688 x += cur_len;
689 }
690
691 return TRUE;
692}
693
694/*
5ff889fb
KW
695
696=for apidoc is_utf8_string_loc
697
2717076a 698Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 699case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 700"utf8ness success") in the C<ep> pointer.
5ff889fb 701
2717076a 702See also C<L</is_utf8_string_loclen>>.
5ff889fb 703
3964c812
KW
704=cut
705*/
706
707#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
708
709/*
710
5ff889fb
KW
711=for apidoc is_utf8_string_loclen
712
2717076a 713Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 714case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 715"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 716encoded characters in the C<el> pointer.
5ff889fb 717
2717076a 718See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
719
720=cut
721*/
722
56e4cf64 723PERL_STATIC_INLINE bool
c41b2540 724Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb
KW
725{
726 const U8* const send = s + (len ? len : strlen((const char *)s));
727 const U8* x = s;
728 STRLEN outlen = 0;
729
730 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
731
732 while (x < send) {
c41b2540
KW
733 const STRLEN cur_len = isUTF8_CHAR(x, send);
734 if (UNLIKELY(! cur_len)) {
5ff889fb
KW
735 break;
736 }
c41b2540 737 x += cur_len;
5ff889fb
KW
738 outlen++;
739 }
740
741 if (el)
742 *el = outlen;
743
744 if (ep) {
745 *ep = x;
746 }
747
748 return (x == send);
749}
750
751/*
9f2abfde
KW
752
753=for apidoc is_strict_utf8_string_loc
754
755Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
756case of "utf8ness failure") or the location C<s>+C<len> (in the case of
757"utf8ness success") in the C<ep> pointer.
758
759See also C<L</is_strict_utf8_string_loclen>>.
760
761=cut
762*/
763
764#define is_strict_utf8_string_loc(s, len, ep) \
765 is_strict_utf8_string_loclen(s, len, ep, 0)
766
767/*
768
769=for apidoc is_strict_utf8_string_loclen
770
771Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
772case of "utf8ness failure") or the location C<s>+C<len> (in the case of
773"utf8ness success") in the C<ep> pointer, and the number of UTF-8
774encoded characters in the C<el> pointer.
775
776See also C<L</is_strict_utf8_string_loc>>.
777
778=cut
779*/
780
781PERL_STATIC_INLINE bool
782S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
783{
784 const U8* const send = s + (len ? len : strlen((const char *)s));
785 const U8* x = s;
786 STRLEN outlen = 0;
787
788 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
789
790 while (x < send) {
791 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
792 if (UNLIKELY(! cur_len)) {
793 break;
794 }
795 x += cur_len;
796 outlen++;
797 }
798
799 if (el)
800 *el = outlen;
801
802 if (ep) {
803 *ep = x;
804 }
805
806 return (x == send);
807}
808
809/*
810
811=for apidoc is_c9strict_utf8_string_loc
812
813Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
814the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
815"utf8ness success") in the C<ep> pointer.
816
817See also C<L</is_c9strict_utf8_string_loclen>>.
818
819=cut
820*/
821
822#define is_c9strict_utf8_string_loc(s, len, ep) \
823 is_c9strict_utf8_string_loclen(s, len, ep, 0)
824
825/*
826
827=for apidoc is_c9strict_utf8_string_loclen
828
829Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
830the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
831"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
832characters in the C<el> pointer.
833
834See also C<L</is_c9strict_utf8_string_loc>>.
835
836=cut
837*/
838
839PERL_STATIC_INLINE bool
840S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
841{
842 const U8* const send = s + (len ? len : strlen((const char *)s));
843 const U8* x = s;
844 STRLEN outlen = 0;
845
846 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
847
848 while (x < send) {
849 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
850 if (UNLIKELY(! cur_len)) {
851 break;
852 }
853 x += cur_len;
854 outlen++;
855 }
856
857 if (el)
858 *el = outlen;
859
860 if (ep) {
861 *ep = x;
862 }
863
864 return (x == send);
865}
866
867/*
868
869=for apidoc is_utf8_string_loc_flags
870
871Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
872case of "utf8ness failure") or the location C<s>+C<len> (in the case of
873"utf8ness success") in the C<ep> pointer.
874
875See also C<L</is_utf8_string_loclen_flags>>.
876
877=cut
878*/
879
880#define is_utf8_string_loc_flags(s, len, ep, flags) \
881 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
882
883
884/* The above 3 actual functions could have been moved into the more general one
885 * just below, and made #defines that call it with the right 'flags'. They are
886 * currently kept separate to increase their chances of getting inlined */
887
888/*
889
890=for apidoc is_utf8_string_loclen_flags
891
892Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
893case of "utf8ness failure") or the location C<s>+C<len> (in the case of
894"utf8ness success") in the C<ep> pointer, and the number of UTF-8
895encoded characters in the C<el> pointer.
896
897See also C<L</is_utf8_string_loc_flags>>.
898
899=cut
900*/
901
902PERL_STATIC_INLINE bool
f60f61fd 903S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 904{
f60f61fd 905 const U8* send;
9f2abfde
KW
906 const U8* x = s;
907 STRLEN outlen = 0;
908
909 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
910 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 911 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 912
f60f61fd
KW
913 if (len == 0) {
914 len = strlen((const char *)s);
915 }
916
9f2abfde
KW
917 if (flags == 0) {
918 return is_utf8_string_loclen(s, len, ep, el);
919 }
920
d044b7a7 921 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
922 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
923 {
924 return is_strict_utf8_string_loclen(s, len, ep, el);
925 }
926
d044b7a7 927 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
928 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
929 {
930 return is_c9strict_utf8_string_loclen(s, len, ep, el);
931 }
932
f60f61fd 933 send = s + len;
9f2abfde
KW
934 while (x < send) {
935 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
936 if (UNLIKELY(! cur_len)) {
937 break;
938 }
939 x += cur_len;
940 outlen++;
941 }
942
943 if (el)
944 *el = outlen;
945
946 if (ep) {
947 *ep = x;
948 }
949
950 return (x == send);
951}
952
953/*
7c93d8f0
KW
954=for apidoc utf8_distance
955
956Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
957and C<b>.
958
959WARNING: use only if you *know* that the pointers point inside the
960same UTF-8 buffer.
961
962=cut
963*/
964
965PERL_STATIC_INLINE IV
966Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
967{
968 PERL_ARGS_ASSERT_UTF8_DISTANCE;
969
970 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
971}
972
973/*
974=for apidoc utf8_hop
975
976Return the UTF-8 pointer C<s> displaced by C<off> characters, either
977forward or backward.
978
979WARNING: do not use the following unless you *know* C<off> is within
980the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
981on the first byte of character or just after the last byte of a character.
982
983=cut
984*/
985
986PERL_STATIC_INLINE U8 *
987Perl_utf8_hop(const U8 *s, SSize_t off)
988{
989 PERL_ARGS_ASSERT_UTF8_HOP;
990
991 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
992 * the bitops (especially ~) can create illegal UTF-8.
993 * In other words: in Perl UTF-8 is not just for Unicode. */
994
995 if (off >= 0) {
996 while (off--)
997 s += UTF8SKIP(s);
998 }
999 else {
1000 while (off++) {
1001 s--;
1002 while (UTF8_IS_CONTINUATION(*s))
1003 s--;
1004 }
1005 }
de979548 1006 GCC_DIAG_IGNORE(-Wcast-qual);
7c93d8f0 1007 return (U8 *)s;
de979548 1008 GCC_DIAG_RESTORE;
7c93d8f0
KW
1009}
1010
4dab108f 1011/*
65df57a8
TC
1012=for apidoc utf8_hop_forward
1013
1014Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1015forward.
1016
1017C<off> must be non-negative.
1018
1019C<s> must be before or equal to C<end>.
1020
1021When moving forward it will not move beyond C<end>.
1022
1023Will not exceed this limit even if the string is not valid "UTF-8".
1024
1025=cut
1026*/
1027
1028PERL_STATIC_INLINE U8 *
1029Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1030{
1031 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1032
1033 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1034 * the bitops (especially ~) can create illegal UTF-8.
1035 * In other words: in Perl UTF-8 is not just for Unicode. */
1036
1037 assert(s <= end);
1038 assert(off >= 0);
1039
1040 while (off--) {
1041 STRLEN skip = UTF8SKIP(s);
de979548
P
1042 if ((STRLEN)(end - s) <= skip) {
1043 GCC_DIAG_IGNORE(-Wcast-qual);
65df57a8 1044 return (U8 *)end;
de979548
P
1045 GCC_DIAG_RESTORE;
1046 }
65df57a8
TC
1047 s += skip;
1048 }
1049
de979548 1050 GCC_DIAG_IGNORE(-Wcast-qual);
65df57a8 1051 return (U8 *)s;
de979548 1052 GCC_DIAG_RESTORE;
65df57a8
TC
1053}
1054
1055/*
1056=for apidoc utf8_hop_back
1057
1058Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1059backward.
1060
1061C<off> must be non-positive.
1062
1063C<s> must be after or equal to C<start>.
1064
1065When moving backward it will not move before C<start>.
1066
1067Will not exceed this limit even if the string is not valid "UTF-8".
1068
1069=cut
1070*/
1071
1072PERL_STATIC_INLINE U8 *
1073Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1074{
1075 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1076
1077 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1078 * the bitops (especially ~) can create illegal UTF-8.
1079 * In other words: in Perl UTF-8 is not just for Unicode. */
1080
1081 assert(start <= s);
1082 assert(off <= 0);
1083
1084 while (off++ && s > start) {
1085 s--;
1086 while (UTF8_IS_CONTINUATION(*s) && s > start)
1087 s--;
1088 }
1089
de979548 1090 GCC_DIAG_IGNORE(-Wcast-qual);
65df57a8 1091 return (U8 *)s;
de979548 1092 GCC_DIAG_RESTORE;
65df57a8
TC
1093}
1094
1095/*
1096=for apidoc utf8_hop_safe
1097
1098Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1099either forward or backward.
1100
1101When moving backward it will not move before C<start>.
1102
1103When moving forward it will not move beyond C<end>.
1104
1105Will not exceed those limits even if the string is not valid "UTF-8".
1106
1107=cut
1108*/
1109
1110PERL_STATIC_INLINE U8 *
1111Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1112{
1113 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1114
1115 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1116 * the bitops (especially ~) can create illegal UTF-8.
1117 * In other words: in Perl UTF-8 is not just for Unicode. */
1118
1119 assert(start <= s && s <= end);
1120
1121 if (off >= 0) {
1122 return utf8_hop_forward(s, off, end);
1123 }
1124 else {
1125 return utf8_hop_back(s, off, start);
1126 }
1127}
1128
1129/*
4dab108f
KW
1130
1131=for apidoc is_utf8_valid_partial_char
1132
6cbb9248
KW
1133Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1134S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1135points. Otherwise, it returns 1 if there exists at least one non-empty
1136sequence of bytes that when appended to sequence C<s>, starting at position
1137C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1138otherwise returns 0.
1139
1140In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1141point.
1142
1143This is useful when a fixed-length buffer is being tested for being well-formed
1144UTF-8, but the final few bytes in it don't comprise a full character; that is,
1145it is split somewhere in the middle of the final code point's UTF-8
1146representation. (Presumably when the buffer is refreshed with the next chunk
1147of data, the new first bytes will complete the partial code point.) This
1148function is used to verify that the final bytes in the current buffer are in
1149fact the legal beginning of some code point, so that if they aren't, the
1150failure can be signalled without having to wait for the next read.
4dab108f
KW
1151
1152=cut
1153*/
2717076a
KW
1154#define is_utf8_valid_partial_char(s, e) \
1155 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
1156
1157/*
1158
1159=for apidoc is_utf8_valid_partial_char_flags
1160
1161Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1162or not the input is a valid UTF-8 encoded partial character, but it takes an
1163extra parameter, C<flags>, which can further restrict which code points are
1164considered valid.
1165
1166If C<flags> is 0, this behaves identically to
1167C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1168of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1169there is any sequence of bytes that can complete the input partial character in
1170such a way that a non-prohibited character is formed, the function returns
2717076a
KW
1171TRUE; otherwise FALSE. Non character code points cannot be determined based on
1172partial character input. But many of the other possible excluded types can be
f1c999a7
KW
1173determined from just the first one or two bytes.
1174
1175=cut
1176 */
1177
56e4cf64 1178PERL_STATIC_INLINE bool
f1c999a7 1179S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
4dab108f 1180{
f1c999a7 1181 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
4dab108f 1182
f1c999a7 1183 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1184 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 1185
8875bd48 1186 if (s >= e || s + UTF8SKIP(s) <= e) {
4dab108f
KW
1187 return FALSE;
1188 }
1189
f1c999a7 1190 return cBOOL(_is_utf8_char_helper(s, e, flags));
4dab108f
KW
1191}
1192
8bc127bf
KW
1193/*
1194
1195=for apidoc is_utf8_fixed_width_buf_flags
1196
1197Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1198is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1199otherwise it returns FALSE.
1200
1201If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1202without restriction. If the final few bytes of the buffer do not form a
1203complete code point, this will return TRUE anyway, provided that
1204C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1205
1206If C<flags> in non-zero, it can be any combination of the
1207C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1208same meanings.
1209
1210This function differs from C<L</is_utf8_string_flags>> only in that the latter
1211returns FALSE if the final few bytes of the string don't form a complete code
1212point.
1213
1214=cut
1215 */
1216#define is_utf8_fixed_width_buf_flags(s, len, flags) \
1217 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1218
1219/*
1220
1221=for apidoc is_utf8_fixed_width_buf_loc_flags
1222
1223Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1224failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1225to the beginning of any partial character at the end of the buffer; if there is
1226no partial character C<*ep> will contain C<s>+C<len>.
1227
1228See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1229
1230=cut
1231*/
1232
1233#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1234 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1235
1236/*
1237
1238=for apidoc is_utf8_fixed_width_buf_loclen_flags
1239
1240Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1241complete, valid characters found in the C<el> pointer.
1242
1243=cut
1244*/
1245
1246PERL_STATIC_INLINE bool
1247S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
1248 const STRLEN len,
1249 const U8 **ep,
1250 STRLEN *el,
1251 const U32 flags)
1252{
1253 const U8 * maybe_partial;
1254
1255 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1256
1257 if (! ep) {
1258 ep = &maybe_partial;
1259 }
1260
1261 /* If it's entirely valid, return that; otherwise see if the only error is
1262 * that the final few bytes are for a partial character */
1263 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1264 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1265}
1266
c8028aa6
TC
1267/* ------------------------------- perl.h ----------------------------- */
1268
1269/*
dcccc8ff
KW
1270=head1 Miscellaneous Functions
1271
41188aa0 1272=for apidoc AiR|bool|is_safe_syscall|const char *pv|STRLEN len|const char *what|const char *op_name
c8028aa6 1273
6602b933 1274Test that the given C<pv> doesn't contain any internal C<NUL> characters.
796b6530 1275If it does, set C<errno> to C<ENOENT>, optionally warn, and return FALSE.
c8028aa6
TC
1276
1277Return TRUE if the name is safe.
1278
796b6530 1279Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
1280
1281=cut
1282*/
1283
1284PERL_STATIC_INLINE bool
41188aa0 1285S_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name) {
c8028aa6
TC
1286 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1287 * perl itself uses xce*() functions which accept 8-bit strings.
1288 */
1289
1290 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1291
6c4650b3 1292 if (len > 1) {
c8028aa6 1293 char *null_at;
41188aa0 1294 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 1295 SETERRNO(ENOENT, LIB_INVARG);
1d505182 1296 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 1297 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 1298 what, op_name, pv, null_at+1);
c8028aa6
TC
1299 return FALSE;
1300 }
1301 }
1302
1303 return TRUE;
1304}
1305
1306/*
7cb3f959
TC
1307
1308Return true if the supplied filename has a newline character
fa6c7d00 1309immediately before the first (hopefully only) NUL.
7cb3f959
TC
1310
1311My original look at this incorrectly used the len from SvPV(), but
1312that's incorrect, since we allow for a NUL in pv[len-1].
1313
1314So instead, strlen() and work from there.
1315
1316This allow for the user reading a filename, forgetting to chomp it,
1317then calling:
1318
1319 open my $foo, "$file\0";
1320
1321*/
1322
1323#ifdef PERL_CORE
1324
1325PERL_STATIC_INLINE bool
1326S_should_warn_nl(const char *pv) {
1327 STRLEN len;
1328
1329 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1330
1331 len = strlen(pv);
1332
1333 return len > 0 && pv[len-1] == '\n';
1334}
1335
1336#endif
1337
81d52ecd
JH
1338/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
1339
1340#define MAX_CHARSET_NAME_LENGTH 2
1341
1342PERL_STATIC_INLINE const char *
1343get_regex_charset_name(const U32 flags, STRLEN* const lenp)
1344{
1345 /* Returns a string that corresponds to the name of the regex character set
1346 * given by 'flags', and *lenp is set the length of that string, which
1347 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
1348
1349 *lenp = 1;
1350 switch (get_regex_charset(flags)) {
1351 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
1352 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
1353 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
1354 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
1355 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
1356 *lenp = 2;
1357 return ASCII_MORE_RESTRICT_PAT_MODS;
1358 }
1359 /* The NOT_REACHED; hides an assert() which has a rather complex
1360 * definition in perl.h. */
1361 NOT_REACHED; /* NOTREACHED */
1362 return "?"; /* Unknown */
1363}
1364
7cb3f959 1365/*
ed382232
TC
1366
1367Return false if any get magic is on the SV other than taint magic.
1368
1369*/
1370
1371PERL_STATIC_INLINE bool
1372S_sv_only_taint_gmagic(SV *sv) {
1373 MAGIC *mg = SvMAGIC(sv);
1374
1375 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
1376
1377 while (mg) {
1378 if (mg->mg_type != PERL_MAGIC_taint
1379 && !(mg->mg_flags & MGf_GSKIP)
1380 && mg->mg_virtual->svt_get) {
1381 return FALSE;
1382 }
1383 mg = mg->mg_moremagic;
1384 }
1385
1386 return TRUE;
1387}
1388
ed8ff0f3
DM
1389/* ------------------ cop.h ------------------------------------------- */
1390
1391
1392/* Enter a block. Push a new base context and return its address. */
1393
1394PERL_STATIC_INLINE PERL_CONTEXT *
1395S_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
1396{
1397 PERL_CONTEXT * cx;
1398
1399 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
1400
1401 CXINC;
1402 cx = CX_CUR();
1403 cx->cx_type = type;
1404 cx->blk_gimme = gimme;
1405 cx->blk_oldsaveix = saveix;
4caf7d8c 1406 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 1407 cx->blk_oldcop = PL_curcop;
4caf7d8c 1408 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
1409 cx->blk_oldscopesp = PL_scopestack_ix;
1410 cx->blk_oldpm = PL_curpm;
ce8bb8d8 1411 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
1412
1413 PL_tmps_floor = PL_tmps_ix;
1414 CX_DEBUG(cx, "PUSH");
1415 return cx;
1416}
1417
1418
1419/* Exit a block (RETURN and LAST). */
1420
1421PERL_STATIC_INLINE void
1422S_cx_popblock(pTHX_ PERL_CONTEXT *cx)
1423{
1424 PERL_ARGS_ASSERT_CX_POPBLOCK;
1425
1426 CX_DEBUG(cx, "POP");
1427 /* these 3 are common to cx_popblock and cx_topblock */
1428 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
1429 PL_scopestack_ix = cx->blk_oldscopesp;
1430 PL_curpm = cx->blk_oldpm;
1431
1432 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
1433 * and leaves a CX entry lying around for repeated use, so
1434 * skip for multicall */ \
1435 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
1436 || PL_savestack_ix == cx->blk_oldsaveix);
1437 PL_curcop = cx->blk_oldcop;
ce8bb8d8 1438 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
1439}
1440
1441/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
1442 * Whereas cx_popblock() restores the state to the point just before
1443 * cx_pushblock() was called, cx_topblock() restores it to the point just
1444 * *after* cx_pushblock() was called. */
1445
1446PERL_STATIC_INLINE void
1447S_cx_topblock(pTHX_ PERL_CONTEXT *cx)
1448{
1449 PERL_ARGS_ASSERT_CX_TOPBLOCK;
1450
1451 CX_DEBUG(cx, "TOP");
1452 /* these 3 are common to cx_popblock and cx_topblock */
1453 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
1454 PL_scopestack_ix = cx->blk_oldscopesp;
1455 PL_curpm = cx->blk_oldpm;
1456
1457 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
1458}
1459
1460
a73d8813
DM
1461PERL_STATIC_INLINE void
1462S_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
1463{
1464 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
1465
1466 PERL_ARGS_ASSERT_CX_PUSHSUB;
1467
3f6bd23a 1468 PERL_DTRACE_PROBE_ENTRY(cv);
a73d8813
DM
1469 cx->blk_sub.cv = cv;
1470 cx->blk_sub.olddepth = CvDEPTH(cv);
1471 cx->blk_sub.prevcomppad = PL_comppad;
1472 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
1473 cx->blk_sub.retop = retop;
1474 SvREFCNT_inc_simple_void_NN(cv);
1475 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
1476}
1477
1478
1479/* subsets of cx_popsub() */
1480
1481PERL_STATIC_INLINE void
1482S_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
1483{
1484 CV *cv;
1485
1486 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
1487 assert(CxTYPE(cx) == CXt_SUB);
1488
1489 PL_comppad = cx->blk_sub.prevcomppad;
1490 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
1491 cv = cx->blk_sub.cv;
1492 CvDEPTH(cv) = cx->blk_sub.olddepth;
1493 cx->blk_sub.cv = NULL;
1494 SvREFCNT_dec(cv);
1495}
1496
1497
1498/* handle the @_ part of leaving a sub */
1499
1500PERL_STATIC_INLINE void
1501S_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
1502{
1503 AV *av;
1504
1505 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
1506 assert(CxTYPE(cx) == CXt_SUB);
1507 assert(AvARRAY(MUTABLE_AV(
1508 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
1509 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
1510
1511 CX_POP_SAVEARRAY(cx);
1512 av = MUTABLE_AV(PAD_SVl(0));
1513 if (UNLIKELY(AvREAL(av)))
1514 /* abandon @_ if it got reified */
1515 clear_defarray(av, 0);
1516 else {
1517 CLEAR_ARGARRAY(av);
1518 }
1519}
1520
1521
1522PERL_STATIC_INLINE void
1523S_cx_popsub(pTHX_ PERL_CONTEXT *cx)
1524{
1525 PERL_ARGS_ASSERT_CX_POPSUB;
1526 assert(CxTYPE(cx) == CXt_SUB);
1527
3f6bd23a 1528 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
1529
1530 if (CxHASARGS(cx))
1531 cx_popsub_args(cx);
1532 cx_popsub_common(cx);
1533}
1534
1535
6a7d52cc
DM
1536PERL_STATIC_INLINE void
1537S_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
1538{
1539 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
1540
1541 cx->blk_format.cv = cv;
1542 cx->blk_format.retop = retop;
1543 cx->blk_format.gv = gv;
1544 cx->blk_format.dfoutgv = PL_defoutgv;
1545 cx->blk_format.prevcomppad = PL_comppad;
1546 cx->blk_u16 = 0;
1547
1548 SvREFCNT_inc_simple_void_NN(cv);
1549 CvDEPTH(cv)++;
1550 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
1551}
1552
1553
1554PERL_STATIC_INLINE void
1555S_cx_popformat(pTHX_ PERL_CONTEXT *cx)
1556{
1557 CV *cv;
1558 GV *dfout;
1559
1560 PERL_ARGS_ASSERT_CX_POPFORMAT;
1561 assert(CxTYPE(cx) == CXt_FORMAT);
1562
1563 dfout = cx->blk_format.dfoutgv;
1564 setdefout(dfout);
1565 cx->blk_format.dfoutgv = NULL;
1566 SvREFCNT_dec_NN(dfout);
1567
1568 PL_comppad = cx->blk_format.prevcomppad;
1569 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
1570 cv = cx->blk_format.cv;
1571 cx->blk_format.cv = NULL;
1572 --CvDEPTH(cv);
1573 SvREFCNT_dec_NN(cv);
1574}
1575
1576
13febba5
DM
1577PERL_STATIC_INLINE void
1578S_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
1579{
1580 PERL_ARGS_ASSERT_CX_PUSHEVAL;
1581
1582 cx->blk_eval.retop = retop;
1583 cx->blk_eval.old_namesv = namesv;
1584 cx->blk_eval.old_eval_root = PL_eval_root;
1585 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
1586 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
1587 cx->blk_eval.cur_top_env = PL_top_env;
1588
4c57ced5 1589 assert(!(PL_in_eval & ~ 0x3F));
13febba5 1590 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 1591 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
1592}
1593
1594
1595PERL_STATIC_INLINE void
1596S_cx_popeval(pTHX_ PERL_CONTEXT *cx)
1597{
1598 SV *sv;
1599
1600 PERL_ARGS_ASSERT_CX_POPEVAL;
1601 assert(CxTYPE(cx) == CXt_EVAL);
1602
1603 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 1604 assert(!(PL_in_eval & 0xc0));
13febba5
DM
1605 PL_eval_root = cx->blk_eval.old_eval_root;
1606 sv = cx->blk_eval.cur_text;
4c57ced5 1607 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
1608 cx->blk_eval.cur_text = NULL;
1609 SvREFCNT_dec_NN(sv);
1610 }
1611
1612 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
1613 if (sv) {
1614 cx->blk_eval.old_namesv = NULL;
1615 SvREFCNT_dec_NN(sv);
1616 }
13febba5 1617}
6a7d52cc 1618
a73d8813 1619
d1b6bf72
DM
1620/* push a plain loop, i.e.
1621 * { block }
1622 * while (cond) { block }
1623 * for (init;cond;continue) { block }
1624 * This loop can be last/redo'ed etc.
1625 */
1626
1627PERL_STATIC_INLINE void
1628S_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
1629{
1630 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
1631 cx->blk_loop.my_op = cLOOP;
1632}
1633
1634
1635/* push a true for loop, i.e.
1636 * for var (list) { block }
1637 */
1638
1639PERL_STATIC_INLINE void
1640S_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
1641{
1642 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
1643
1644 /* this one line is common with cx_pushloop_plain */
1645 cx->blk_loop.my_op = cLOOP;
1646
1647 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
1648 cx->blk_loop.itersave = itersave;
1649#ifdef USE_ITHREADS
1650 cx->blk_loop.oldcomppad = PL_comppad;
1651#endif
1652}
1653
1654
1655/* pop all loop types, including plain */
1656
1657PERL_STATIC_INLINE void
1658S_cx_poploop(pTHX_ PERL_CONTEXT *cx)
1659{
1660 PERL_ARGS_ASSERT_CX_POPLOOP;
1661
1662 assert(CxTYPE_is_LOOP(cx));
1663 if ( CxTYPE(cx) == CXt_LOOP_ARY
1664 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
1665 {
1666 /* Free ary or cur. This assumes that state_u.ary.ary
1667 * aligns with state_u.lazysv.cur. See cx_dup() */
1668 SV *sv = cx->blk_loop.state_u.lazysv.cur;
1669 cx->blk_loop.state_u.lazysv.cur = NULL;
1670 SvREFCNT_dec_NN(sv);
1671 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
1672 sv = cx->blk_loop.state_u.lazysv.end;
1673 cx->blk_loop.state_u.lazysv.end = NULL;
1674 SvREFCNT_dec_NN(sv);
1675 }
1676 }
1677 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
1678 SV *cursv;
1679 SV **svp = (cx)->blk_loop.itervar_u.svp;
1680 if ((cx->cx_type & CXp_FOR_GV))
1681 svp = &GvSV((GV*)svp);
1682 cursv = *svp;
1683 *svp = cx->blk_loop.itersave;
1684 cx->blk_loop.itersave = NULL;
1685 SvREFCNT_dec(cursv);
1686 }
1687}
1688
2a7b7c61
DM
1689
1690PERL_STATIC_INLINE void
1691S_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
1692{
1693 PERL_ARGS_ASSERT_CX_PUSHWHEN;
1694
1695 cx->blk_givwhen.leave_op = cLOGOP->op_other;
1696}
1697
1698
1699PERL_STATIC_INLINE void
1700S_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
1701{
1702 PERL_ARGS_ASSERT_CX_POPWHEN;
1703 assert(CxTYPE(cx) == CXt_WHEN);
1704
1705 PERL_UNUSED_ARG(cx);
59a14f30 1706 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
1707 /* currently NOOP */
1708}
1709
1710
1711PERL_STATIC_INLINE void
1712S_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
1713{
1714 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
1715
1716 cx->blk_givwhen.leave_op = cLOGOP->op_other;
1717 cx->blk_givwhen.defsv_save = orig_defsv;
1718}
1719
1720
1721PERL_STATIC_INLINE void
1722S_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
1723{
1724 SV *sv;
1725
1726 PERL_ARGS_ASSERT_CX_POPGIVEN;
1727 assert(CxTYPE(cx) == CXt_GIVEN);
1728
1729 sv = GvSV(PL_defgv);
1730 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
1731 cx->blk_givwhen.defsv_save = NULL;
1732 SvREFCNT_dec(sv);
1733}
1734
ec2c235b
KW
1735/* ------------------ util.h ------------------------------------------- */
1736
1737/*
1738=head1 Miscellaneous Functions
1739
1740=for apidoc foldEQ
1741
1742Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
1743same
1744case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
1745match themselves and their opposite case counterparts. Non-cased and non-ASCII
1746range bytes match only themselves.
1747
1748=cut
1749*/
1750
1751PERL_STATIC_INLINE I32
1752Perl_foldEQ(const char *s1, const char *s2, I32 len)
1753{
1754 const U8 *a = (const U8 *)s1;
1755 const U8 *b = (const U8 *)s2;
1756
1757 PERL_ARGS_ASSERT_FOLDEQ;
1758
1759 assert(len >= 0);
1760
1761 while (len--) {
1762 if (*a != *b && *a != PL_fold[*b])
1763 return 0;
1764 a++,b++;
1765 }
1766 return 1;
1767}
1768
0f9cb40c 1769PERL_STATIC_INLINE I32
ec2c235b
KW
1770Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
1771{
1772 /* Compare non-utf8 using Unicode (Latin1) semantics. Does not work on
1773 * MICRO_SIGN, LATIN_SMALL_LETTER_SHARP_S, nor
1774 * LATIN_SMALL_LETTER_Y_WITH_DIAERESIS, and does not check for these. Nor
1775 * does it check that the strings each have at least 'len' characters */
1776
1777 const U8 *a = (const U8 *)s1;
1778 const U8 *b = (const U8 *)s2;
1779
1780 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
1781
1782 assert(len >= 0);
1783
1784 while (len--) {
1785 if (*a != *b && *a != PL_fold_latin1[*b]) {
1786 return 0;
1787 }
1788 a++, b++;
1789 }
1790 return 1;
1791}
1792
1793/*
1794=for apidoc foldEQ_locale
1795
1796Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
1797same case-insensitively in the current locale; false otherwise.
1798
1799=cut
1800*/
1801
0f9cb40c 1802PERL_STATIC_INLINE I32
ec2c235b
KW
1803Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
1804{
1805 dVAR;
1806 const U8 *a = (const U8 *)s1;
1807 const U8 *b = (const U8 *)s2;
1808
1809 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
1810
1811 assert(len >= 0);
1812
1813 while (len--) {
1814 if (*a != *b && *a != PL_fold_locale[*b])
1815 return 0;
1816 a++,b++;
1817 }
1818 return 1;
1819}
1820
6dba01e2
KW
1821#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
1822
1823PERL_STATIC_INLINE void *
1824S_my_memrchr(const char * s, const char c, const STRLEN len)
1825{
1826 /* memrchr(), since many platforms lack it */
1827
1828 const char * t = s + len - 1;
1829
1830 PERL_ARGS_ASSERT_MY_MEMRCHR;
1831
1832 while (t >= s) {
1833 if (*t == c) {
1834 return (void *) t;
1835 }
1836 t--;
1837 }
1838
1839 return NULL;
1840}
1841
1842#endif
1843
ed382232 1844/*
c8028aa6
TC
1845 * ex: set ts=8 sts=4 sw=4 et:
1846 */