This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
is_utf8_invariant_string(): small speed optimization
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 * This file is a home for static inline functions that cannot go in other
9 * headers files, because they depend on proto.h (included after most other
10 * headers) or struct definitions.
11 *
12 * Each section names the header file that the functions "belong" to.
13 */
27669aa4 14
be3a7a5d
KW
15/* ------------------------------- av.h ------------------------------- */
16
c70927a6 17PERL_STATIC_INLINE SSize_t
be3a7a5d
KW
18S_av_top_index(pTHX_ AV *av)
19{
20 PERL_ARGS_ASSERT_AV_TOP_INDEX;
21 assert(SvTYPE(av) == SVt_PVAV);
22
23 return AvFILL(av);
24}
25
1afe1db1
FC
26/* ------------------------------- cv.h ------------------------------- */
27
ae77754a
FC
28PERL_STATIC_INLINE GV *
29S_CvGV(pTHX_ CV *sv)
30{
31 return CvNAMED(sv)
32 ? Perl_cvgv_from_hek(aTHX_ sv)
33 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
34}
35
1afe1db1
FC
36PERL_STATIC_INLINE I32 *
37S_CvDEPTHp(const CV * const sv)
38{
39 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
8de47657 40 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
41}
42
d16269d8
PM
43/*
44 CvPROTO returns the prototype as stored, which is not necessarily what
45 the interpreter should be using. Specifically, the interpreter assumes
46 that spaces have been stripped, which has been the case if the prototype
47 was added by toke.c, but is generally not the case if it was added elsewhere.
48 Since we can't enforce the spacelessness at assignment time, this routine
49 provides a temporary copy at parse time with spaces removed.
50 I<orig> is the start of the original buffer, I<len> is the length of the
51 prototype and will be updated when this returns.
52 */
53
5b67adb8 54#ifdef PERL_CORE
d16269d8
PM
55PERL_STATIC_INLINE char *
56S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
57{
58 SV * tmpsv;
59 char * tmps;
60 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
61 tmps = SvPVX(tmpsv);
62 while ((*len)--) {
63 if (!isSPACE(*orig))
64 *tmps++ = *orig;
65 orig++;
66 }
67 *tmps = '\0';
68 *len = tmps - SvPVX(tmpsv);
69 return SvPVX(tmpsv);
70}
5b67adb8 71#endif
d16269d8 72
25fdce4a
FC
73/* ------------------------------- mg.h ------------------------------- */
74
75#if defined(PERL_CORE) || defined(PERL_EXT)
76/* assumes get-magic and stringification have already occurred */
77PERL_STATIC_INLINE STRLEN
78S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
79{
80 assert(mg->mg_type == PERL_MAGIC_regex_global);
81 assert(mg->mg_len != -1);
82 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
83 return (STRLEN)mg->mg_len;
84 else {
85 const STRLEN pos = (STRLEN)mg->mg_len;
86 /* Without this check, we may read past the end of the buffer: */
87 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
88 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
89 }
90}
91#endif
92
03414f05
FC
93/* ------------------------------- pad.h ------------------------------ */
94
95#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
96PERL_STATIC_INLINE bool
97PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
98{
99 /* is seq within the range _LOW to _HIGH ?
100 * This is complicated by the fact that PL_cop_seqmax
101 * may have wrapped around at some point */
102 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
103 return FALSE; /* not yet introduced */
104
105 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
106 /* in compiling scope */
107 if (
108 (seq > COP_SEQ_RANGE_LOW(pn))
109 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
110 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
111 )
112 return TRUE;
113 }
114 else if (
115 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
116 ?
117 ( seq > COP_SEQ_RANGE_LOW(pn)
118 || seq <= COP_SEQ_RANGE_HIGH(pn))
119
120 : ( seq > COP_SEQ_RANGE_LOW(pn)
121 && seq <= COP_SEQ_RANGE_HIGH(pn))
122 )
123 return TRUE;
124 return FALSE;
125}
126#endif
127
33a4312b
FC
128/* ------------------------------- pp.h ------------------------------- */
129
130PERL_STATIC_INLINE I32
131S_TOPMARK(pTHX)
132{
133 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 134 "MARK top %p %" IVdf "\n",
33a4312b
FC
135 PL_markstack_ptr,
136 (IV)*PL_markstack_ptr)));
137 return *PL_markstack_ptr;
138}
139
140PERL_STATIC_INLINE I32
141S_POPMARK(pTHX)
142{
143 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 144 "MARK pop %p %" IVdf "\n",
33a4312b
FC
145 (PL_markstack_ptr-1),
146 (IV)*(PL_markstack_ptr-1))));
147 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
148 return *PL_markstack_ptr--;
149}
150
8d919b0a
FC
151/* ----------------------------- regexp.h ----------------------------- */
152
153PERL_STATIC_INLINE struct regexp *
154S_ReANY(const REGEXP * const re)
155{
df6b4bd5 156 XPV* const p = (XPV*)SvANY(re);
8d919b0a 157 assert(isREGEXP(re));
df6b4bd5
DM
158 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
159 : (struct regexp *)p;
8d919b0a
FC
160}
161
27669aa4
FC
162/* ------------------------------- sv.h ------------------------------- */
163
164PERL_STATIC_INLINE SV *
165S_SvREFCNT_inc(SV *sv)
166{
2439e033 167 if (LIKELY(sv != NULL))
27669aa4
FC
168 SvREFCNT(sv)++;
169 return sv;
170}
171PERL_STATIC_INLINE SV *
172S_SvREFCNT_inc_NN(SV *sv)
173{
174 SvREFCNT(sv)++;
175 return sv;
176}
177PERL_STATIC_INLINE void
178S_SvREFCNT_inc_void(SV *sv)
179{
2439e033 180 if (LIKELY(sv != NULL))
27669aa4
FC
181 SvREFCNT(sv)++;
182}
75e16a44
FC
183PERL_STATIC_INLINE void
184S_SvREFCNT_dec(pTHX_ SV *sv)
185{
2439e033 186 if (LIKELY(sv != NULL)) {
75a9bf96 187 U32 rc = SvREFCNT(sv);
79e2a32a 188 if (LIKELY(rc > 1))
75a9bf96
DM
189 SvREFCNT(sv) = rc - 1;
190 else
191 Perl_sv_free2(aTHX_ sv, rc);
75e16a44
FC
192 }
193}
541377b1
FC
194
195PERL_STATIC_INLINE void
4a9a56a7
DM
196S_SvREFCNT_dec_NN(pTHX_ SV *sv)
197{
198 U32 rc = SvREFCNT(sv);
79e2a32a 199 if (LIKELY(rc > 1))
4a9a56a7
DM
200 SvREFCNT(sv) = rc - 1;
201 else
202 Perl_sv_free2(aTHX_ sv, rc);
203}
204
205PERL_STATIC_INLINE void
541377b1
FC
206SvAMAGIC_on(SV *sv)
207{
208 assert(SvROK(sv));
209 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
210}
211PERL_STATIC_INLINE void
212SvAMAGIC_off(SV *sv)
213{
214 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
215 HvAMAGIC_off(SvSTASH(SvRV(sv)));
216}
217
218PERL_STATIC_INLINE U32
541377b1
FC
219S_SvPADSTALE_on(SV *sv)
220{
c0683843 221 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
222 return SvFLAGS(sv) |= SVs_PADSTALE;
223}
224PERL_STATIC_INLINE U32
225S_SvPADSTALE_off(SV *sv)
226{
c0683843 227 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
228 return SvFLAGS(sv) &= ~SVs_PADSTALE;
229}
25fdce4a 230#if defined(PERL_CORE) || defined (PERL_EXT)
4ddea69a 231PERL_STATIC_INLINE STRLEN
6964422a 232S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
4ddea69a 233{
25fdce4a 234 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
4ddea69a
FC
235 if (SvGAMAGIC(sv)) {
236 U8 *hopped = utf8_hop((U8 *)pv, pos);
237 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
238 return (STRLEN)(hopped - (U8 *)pv);
239 }
240 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
241}
242#endif
f019c49e 243
d1decf2b
TC
244/* ------------------------------- handy.h ------------------------------- */
245
246/* saves machine code for a common noreturn idiom typically used in Newx*() */
c1d6452f 247#ifdef GCC_DIAG_PRAGMA
6ab56f1e 248GCC_DIAG_IGNORE(-Wunused-function) /* Intentionally left semicolonless. */
c1d6452f 249#endif
d1decf2b
TC
250static void
251S_croak_memory_wrap(void)
252{
253 Perl_croak_nocontext("%s",PL_memory_wrap);
254}
c1d6452f 255#ifdef GCC_DIAG_PRAGMA
6ab56f1e 256GCC_DIAG_RESTORE /* Intentionally left semicolonless. */
c1d6452f 257#endif
d1decf2b 258
a8a2ceaa
KW
259/* ------------------------------- utf8.h ------------------------------- */
260
2fe720e2
KW
261/*
262=head1 Unicode Support
263*/
264
55d09dc8
KW
265PERL_STATIC_INLINE void
266S_append_utf8_from_native_byte(const U8 byte, U8** dest)
267{
268 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
269 * encoded string at '*dest', updating '*dest' to include it */
270
55d09dc8
KW
271 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
272
6f2d5cbc 273 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 274 *((*dest)++) = byte;
55d09dc8 275 else {
a09ec51a
KW
276 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
277 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
278 }
279}
280
e123187a 281/*
2fe720e2 282=for apidoc valid_utf8_to_uvchr
2717076a 283Like C<L</utf8_to_uvchr_buf>>, but should only be called when it is known that
2fe720e2
KW
284the next character in the input UTF-8 string C<s> is well-formed (I<e.g.>,
285it passes C<L</isUTF8_CHAR>>. Surrogates, non-character code points, and
286non-Unicode code points are allowed.
287
288=cut
289
290 */
291
292PERL_STATIC_INLINE UV
293Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
294{
c41b2540 295 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
296 const U8* send = s + expectlen;
297 UV uv = *s;
298
299 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
300
301 if (retlen) {
302 *retlen = expectlen;
303 }
304
305 /* An invariant is trivially returned */
306 if (expectlen == 1) {
307 return uv;
308 }
309
310 /* Remove the leading bits that indicate the number of bytes, leaving just
311 * the bits that are part of the value */
312 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
313
314 /* Now, loop through the remaining bytes, accumulating each into the
315 * working total as we go. (I khw tried unrolling the loop for up to 4
316 * bytes, but there was no performance improvement) */
317 for (++s; s < send; s++) {
318 uv = UTF8_ACCUMULATE(uv, *s);
319 }
320
321 return UNI_TO_NATIVE(uv);
322
323}
324
1e599354
KW
325/*
326=for apidoc is_utf8_invariant_string
327
82c5d941 328Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 329regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
330EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
331are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
332the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
333characters are invariant, but so also are the C1 controls.
1e599354
KW
334
335If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
336use this option, that C<s> can't have embedded C<NUL> characters and has to
337have a terminating C<NUL> byte).
338
9f2abfde
KW
339See also
340C<L</is_utf8_string>>,
341C<L</is_utf8_string_flags>>,
342C<L</is_utf8_string_loc>>,
343C<L</is_utf8_string_loc_flags>>,
344C<L</is_utf8_string_loclen>>,
345C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
346C<L</is_utf8_fixed_width_buf_flags>>,
347C<L</is_utf8_fixed_width_buf_loc_flags>>,
348C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
349C<L</is_strict_utf8_string>>,
350C<L</is_strict_utf8_string_loc>>,
351C<L</is_strict_utf8_string_loclen>>,
352C<L</is_c9strict_utf8_string>>,
353C<L</is_c9strict_utf8_string_loc>>,
354and
355C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
356
357=cut
0cbf5865
KW
358
359*/
360
361#define is_utf8_invariant_string(s, len) \
362 is_utf8_invariant_string_loc(s, len, NULL)
363
364/*
365=for apidoc is_utf8_invariant_string_loc
366
367Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
368the first UTF-8 variant character in the C<ep> pointer; if all characters are
369UTF-8 invariant, this function does not change the contents of C<*ep>.
370
371=cut
372
1e599354
KW
373*/
374
375PERL_STATIC_INLINE bool
e17544a6 376S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 377{
e17544a6 378 const U8* send;
1e599354
KW
379 const U8* x = s;
380
0cbf5865
KW
381 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
382
e17544a6
KW
383 if (len == 0) {
384 len = strlen((const char *)s);
385 }
386
387 send = s + len;
388
389#ifndef EBCDIC
4ab2fd9b
KW
390
391/* This looks like 0x010101... */
392#define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
393
394/* This looks like 0x808080... */
395#define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
396#define PERL_WORDSIZE sizeof(PERL_COUNT_MULTIPLIER)
397#define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
e17544a6 398
099e59a4
KW
399/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
400 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
401 * optimized out completely on a 32-bit system, and its mask gets optimized out
402 * on a 64-bit system */
403#define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
404 | (PTR2nat(x) >> 1) \
405 | ( (PTR2nat(x) >> 2) \
406 & PERL_WORD_BOUNDARY_MASK)))
407
408 /* Do the word-at-a-time iff there is at least one usable full word. That
409 * means that after advancing to a word boundary, there still is at least a
410 * full word left. The number of bytes needed to advance is 'wordsize -
411 * offset' unless offset is 0. */
412 if ((STRLEN) (send - x) >= PERL_WORDSIZE
413
414 /* This term is wordsize if subword; 0 if not */
415 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
416
417 /* 'offset' */
418 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
419 {
b40579ff 420
46bb68f6
KW
421 /* Process per-byte until reach word boundary. XXX This loop could be
422 * eliminated if we knew that this platform had fast unaligned reads */
b40579ff 423 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
46bb68f6
KW
424 if (! UTF8_IS_INVARIANT(*x)) {
425 if (ep) {
426 *ep = x;
427 }
e17544a6 428
46bb68f6
KW
429 return FALSE;
430 }
431 x++;
e17544a6 432 }
e17544a6 433
099e59a4
KW
434 /* Here, we know we have at least one full word to process. Process
435 * per-word as long as we have at least a full word left */
436 do {
4ab2fd9b 437 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
e17544a6 438
46bb68f6
KW
439 /* Found a variant. Just return if caller doesn't want its
440 * exact position */
441 if (! ep) {
442 return FALSE;
443 }
e17544a6 444
46bb68f6
KW
445 /* Otherwise fall into final loop to find which byte it is */
446 break;
447 }
448 x += PERL_WORDSIZE;
099e59a4 449 } while (x + PERL_WORDSIZE <= send);
b40579ff 450 }
e17544a6 451
e17544a6
KW
452# undef PERL_WORDSIZE
453# undef PERL_WORD_BOUNDARY_MASK
454# undef PERL_VARIANTS_WORD_MASK
455#endif
456
457 /* Process per-byte */
458 while (x < send) {
459 if (! UTF8_IS_INVARIANT(*x)) {
460 if (ep) {
461 *ep = x;
462 }
0cbf5865 463
e17544a6 464 return FALSE;
0cbf5865 465 }
1e599354 466
e17544a6 467 x++;
1e599354
KW
468 }
469
470 return TRUE;
471}
472
7c93d8f0 473/*
5ff889fb
KW
474=for apidoc is_utf8_string
475
82c5d941
KW
476Returns TRUE if the first C<len> bytes of string C<s> form a valid
477Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
478be calculated using C<strlen(s)> (which means if you use this option, that C<s>
479can't have embedded C<NUL> characters and has to have a terminating C<NUL>
480byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
481
2717076a
KW
482This function considers Perl's extended UTF-8 to be valid. That means that
483code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
484considered valid by this function. Use C<L</is_strict_utf8_string>>,
485C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
486code points are considered valid.
5ff889fb 487
9f2abfde
KW
488See also
489C<L</is_utf8_invariant_string>>,
0cbf5865 490C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
491C<L</is_utf8_string_loc>>,
492C<L</is_utf8_string_loclen>>,
8bc127bf
KW
493C<L</is_utf8_fixed_width_buf_flags>>,
494C<L</is_utf8_fixed_width_buf_loc_flags>>,
495C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
496
497=cut
498*/
499
dd237e82 500#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
5ff889fb 501
c9cd936b
KW
502#if defined(PERL_CORE) || defined (PERL_EXT)
503
504/*
505=for apidoc is_utf8_non_invariant_string
506
507Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
508C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
509UTF-8; otherwise returns FALSE.
510
511A TRUE return means that at least one code point represented by the sequence
512either is a wide character not representable as a single byte, or the
513representation differs depending on whether the sequence is encoded in UTF-8 or
514not.
515
516See also
517C<L<perlapi/is_utf8_invariant_string>>,
518C<L<perlapi/is_utf8_string>>
519
520=cut
521
522This is commonly used to determine if a SV's UTF-8 flag should be turned on.
523It needn't be if its string is entirely UTF-8 invariant, and it shouldn't be if
524it otherwise contains invalid UTF-8.
525
526It is an internal function because khw thinks that XS code shouldn't be working
527at this low a level. A valid use case could change that.
528
529*/
530
531PERL_STATIC_INLINE bool
532S_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
533{
534 const U8 * first_variant;
535
536 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
537
538 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
539 return FALSE;
540 }
541
542 return is_utf8_string(first_variant, len - (first_variant - s));
543}
544
545#endif
546
5ff889fb 547/*
9f2abfde
KW
548=for apidoc is_strict_utf8_string
549
550Returns TRUE if the first C<len> bytes of string C<s> form a valid
551UTF-8-encoded string that is fully interchangeable by any application using
552Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
553calculated using C<strlen(s)> (which means if you use this option, that C<s>
554can't have embedded C<NUL> characters and has to have a terminating C<NUL>
555byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
556
557This function returns FALSE for strings containing any
558code points above the Unicode max of 0x10FFFF, surrogate code points, or
559non-character code points.
560
561See also
562C<L</is_utf8_invariant_string>>,
0cbf5865 563C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
564C<L</is_utf8_string>>,
565C<L</is_utf8_string_flags>>,
566C<L</is_utf8_string_loc>>,
567C<L</is_utf8_string_loc_flags>>,
568C<L</is_utf8_string_loclen>>,
569C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
570C<L</is_utf8_fixed_width_buf_flags>>,
571C<L</is_utf8_fixed_width_buf_loc_flags>>,
572C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
573C<L</is_strict_utf8_string_loc>>,
574C<L</is_strict_utf8_string_loclen>>,
575C<L</is_c9strict_utf8_string>>,
576C<L</is_c9strict_utf8_string_loc>>,
577and
578C<L</is_c9strict_utf8_string_loclen>>.
579
580=cut
581*/
582
dd237e82 583#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
584
585/*
586=for apidoc is_c9strict_utf8_string
587
588Returns TRUE if the first C<len> bytes of string C<s> form a valid
589UTF-8-encoded string that conforms to
590L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
591otherwise it returns FALSE. If C<len> is 0, it will be calculated using
592C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
593C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
594characters being ASCII constitute 'a valid UTF-8 string'.
595
596This function returns FALSE for strings containing any code points above the
597Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
598code points per
599L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
600
601See also
602C<L</is_utf8_invariant_string>>,
0cbf5865 603C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
604C<L</is_utf8_string>>,
605C<L</is_utf8_string_flags>>,
606C<L</is_utf8_string_loc>>,
607C<L</is_utf8_string_loc_flags>>,
608C<L</is_utf8_string_loclen>>,
609C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
610C<L</is_utf8_fixed_width_buf_flags>>,
611C<L</is_utf8_fixed_width_buf_loc_flags>>,
612C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
613C<L</is_strict_utf8_string>>,
614C<L</is_strict_utf8_string_loc>>,
615C<L</is_strict_utf8_string_loclen>>,
616C<L</is_c9strict_utf8_string_loc>>,
617and
618C<L</is_c9strict_utf8_string_loclen>>.
619
620=cut
621*/
622
dd237e82 623#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
9f2abfde
KW
624
625/*
626=for apidoc is_utf8_string_flags
627
628Returns TRUE if the first C<len> bytes of string C<s> form a valid
629UTF-8 string, subject to the restrictions imposed by C<flags>;
630returns FALSE otherwise. If C<len> is 0, it will be calculated
631using C<strlen(s)> (which means if you use this option, that C<s> can't have
632embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
633that all characters being ASCII constitute 'a valid UTF-8 string'.
634
635If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
636C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
637as C<L</is_strict_utf8_string>>; and if C<flags> is
638C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
639C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
640combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
641C<L</utf8n_to_uvchr>>, with the same meanings.
642
643See also
644C<L</is_utf8_invariant_string>>,
0cbf5865 645C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
646C<L</is_utf8_string>>,
647C<L</is_utf8_string_loc>>,
648C<L</is_utf8_string_loc_flags>>,
649C<L</is_utf8_string_loclen>>,
650C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
651C<L</is_utf8_fixed_width_buf_flags>>,
652C<L</is_utf8_fixed_width_buf_loc_flags>>,
653C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
654C<L</is_strict_utf8_string>>,
655C<L</is_strict_utf8_string_loc>>,
656C<L</is_strict_utf8_string_loclen>>,
657C<L</is_c9strict_utf8_string>>,
658C<L</is_c9strict_utf8_string_loc>>,
659and
660C<L</is_c9strict_utf8_string_loclen>>.
661
662=cut
663*/
664
665PERL_STATIC_INLINE bool
f60f61fd 666S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 667{
33756530 668 const U8 * first_variant;
9f2abfde
KW
669
670 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
671 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 672 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 673
f60f61fd
KW
674 if (len == 0) {
675 len = strlen((const char *)s);
676 }
677
9f2abfde
KW
678 if (flags == 0) {
679 return is_utf8_string(s, len);
680 }
681
d044b7a7 682 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
683 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
684 {
685 return is_strict_utf8_string(s, len);
686 }
687
d044b7a7 688 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
689 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
690 {
691 return is_c9strict_utf8_string(s, len);
692 }
693
33756530
KW
694 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
695 const U8* const send = s + len;
696 const U8* x = first_variant;
697
a0d7f935
KW
698 while (x < send) {
699 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
700 if (UNLIKELY(! cur_len)) {
701 return FALSE;
702 }
703 x += cur_len;
9f2abfde 704 }
33756530 705 }
9f2abfde
KW
706
707 return TRUE;
708}
709
710/*
5ff889fb
KW
711
712=for apidoc is_utf8_string_loc
713
2717076a 714Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 715case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 716"utf8ness success") in the C<ep> pointer.
5ff889fb 717
2717076a 718See also C<L</is_utf8_string_loclen>>.
5ff889fb 719
3964c812
KW
720=cut
721*/
722
723#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
724
725/*
726
5ff889fb
KW
727=for apidoc is_utf8_string_loclen
728
2717076a 729Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 730case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 731"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 732encoded characters in the C<el> pointer.
5ff889fb 733
2717076a 734See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
735
736=cut
737*/
738
56e4cf64 739PERL_STATIC_INLINE bool
33756530 740Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb 741{
33756530 742 const U8 * first_variant;
5ff889fb
KW
743
744 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
745
33756530
KW
746 if (len == 0) {
747 len = strlen((const char *) s);
748 }
749
750 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
751 if (el)
752 *el = len;
753
754 if (ep) {
755 *ep = s + len;
756 }
757
758 return TRUE;
759 }
760
761 {
762 const U8* const send = s + len;
763 const U8* x = first_variant;
764 STRLEN outlen = first_variant - s;
765
a0d7f935
KW
766 while (x < send) {
767 const STRLEN cur_len = isUTF8_CHAR(x, send);
768 if (UNLIKELY(! cur_len)) {
769 break;
770 }
771 x += cur_len;
772 outlen++;
5ff889fb 773 }
5ff889fb 774
a0d7f935
KW
775 if (el)
776 *el = outlen;
5ff889fb 777
a0d7f935
KW
778 if (ep) {
779 *ep = x;
780 }
5ff889fb 781
a0d7f935 782 return (x == send);
33756530 783 }
5ff889fb
KW
784}
785
786/*
9f2abfde
KW
787
788=for apidoc is_strict_utf8_string_loc
789
790Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
791case of "utf8ness failure") or the location C<s>+C<len> (in the case of
792"utf8ness success") in the C<ep> pointer.
793
794See also C<L</is_strict_utf8_string_loclen>>.
795
796=cut
797*/
798
799#define is_strict_utf8_string_loc(s, len, ep) \
800 is_strict_utf8_string_loclen(s, len, ep, 0)
801
802/*
803
804=for apidoc is_strict_utf8_string_loclen
805
806Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
807case of "utf8ness failure") or the location C<s>+C<len> (in the case of
808"utf8ness success") in the C<ep> pointer, and the number of UTF-8
809encoded characters in the C<el> pointer.
810
811See also C<L</is_strict_utf8_string_loc>>.
812
813=cut
814*/
815
816PERL_STATIC_INLINE bool
33756530 817S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 818{
33756530 819 const U8 * first_variant;
9f2abfde
KW
820
821 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
822
33756530
KW
823 if (len == 0) {
824 len = strlen((const char *) s);
825 }
826
827 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
828 if (el)
829 *el = len;
830
831 if (ep) {
832 *ep = s + len;
833 }
834
835 return TRUE;
836 }
837
838 {
839 const U8* const send = s + len;
840 const U8* x = first_variant;
841 STRLEN outlen = first_variant - s;
842
a0d7f935
KW
843 while (x < send) {
844 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
845 if (UNLIKELY(! cur_len)) {
846 break;
847 }
848 x += cur_len;
849 outlen++;
9f2abfde 850 }
9f2abfde 851
a0d7f935
KW
852 if (el)
853 *el = outlen;
9f2abfde 854
a0d7f935
KW
855 if (ep) {
856 *ep = x;
857 }
9f2abfde 858
a0d7f935 859 return (x == send);
33756530 860 }
9f2abfde
KW
861}
862
863/*
864
865=for apidoc is_c9strict_utf8_string_loc
866
867Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
868the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
869"utf8ness success") in the C<ep> pointer.
870
871See also C<L</is_c9strict_utf8_string_loclen>>.
872
873=cut
874*/
875
876#define is_c9strict_utf8_string_loc(s, len, ep) \
877 is_c9strict_utf8_string_loclen(s, len, ep, 0)
878
879/*
880
881=for apidoc is_c9strict_utf8_string_loclen
882
883Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
884the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
885"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
886characters in the C<el> pointer.
887
888See also C<L</is_c9strict_utf8_string_loc>>.
889
890=cut
891*/
892
893PERL_STATIC_INLINE bool
33756530 894S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 895{
33756530 896 const U8 * first_variant;
9f2abfde
KW
897
898 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
899
33756530
KW
900 if (len == 0) {
901 len = strlen((const char *) s);
902 }
903
904 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
905 if (el)
906 *el = len;
907
908 if (ep) {
909 *ep = s + len;
910 }
911
912 return TRUE;
913 }
914
915 {
916 const U8* const send = s + len;
917 const U8* x = first_variant;
918 STRLEN outlen = first_variant - s;
919
a0d7f935
KW
920 while (x < send) {
921 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
922 if (UNLIKELY(! cur_len)) {
923 break;
924 }
925 x += cur_len;
926 outlen++;
9f2abfde 927 }
9f2abfde 928
a0d7f935
KW
929 if (el)
930 *el = outlen;
9f2abfde 931
a0d7f935
KW
932 if (ep) {
933 *ep = x;
934 }
9f2abfde 935
a0d7f935 936 return (x == send);
33756530 937 }
9f2abfde
KW
938}
939
940/*
941
942=for apidoc is_utf8_string_loc_flags
943
944Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
945case of "utf8ness failure") or the location C<s>+C<len> (in the case of
946"utf8ness success") in the C<ep> pointer.
947
948See also C<L</is_utf8_string_loclen_flags>>.
949
950=cut
951*/
952
953#define is_utf8_string_loc_flags(s, len, ep, flags) \
954 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
955
956
957/* The above 3 actual functions could have been moved into the more general one
958 * just below, and made #defines that call it with the right 'flags'. They are
959 * currently kept separate to increase their chances of getting inlined */
960
961/*
962
963=for apidoc is_utf8_string_loclen_flags
964
965Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
966case of "utf8ness failure") or the location C<s>+C<len> (in the case of
967"utf8ness success") in the C<ep> pointer, and the number of UTF-8
968encoded characters in the C<el> pointer.
969
970See also C<L</is_utf8_string_loc_flags>>.
971
972=cut
973*/
974
975PERL_STATIC_INLINE bool
f60f61fd 976S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 977{
33756530 978 const U8 * first_variant;
9f2abfde
KW
979
980 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
981 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 982 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 983
f60f61fd 984 if (len == 0) {
a0d7f935 985 len = strlen((const char *) s);
f60f61fd
KW
986 }
987
9f2abfde
KW
988 if (flags == 0) {
989 return is_utf8_string_loclen(s, len, ep, el);
990 }
991
d044b7a7 992 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
993 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
994 {
995 return is_strict_utf8_string_loclen(s, len, ep, el);
996 }
997
d044b7a7 998 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
999 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1000 {
1001 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1002 }
1003
33756530
KW
1004 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1005 if (el)
1006 *el = len;
1007
1008 if (ep) {
1009 *ep = s + len;
1010 }
1011
1012 return TRUE;
1013 }
1014
1015 {
1016 const U8* send = s + len;
1017 const U8* x = first_variant;
1018 STRLEN outlen = first_variant - s;
1019
a0d7f935
KW
1020 while (x < send) {
1021 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1022 if (UNLIKELY(! cur_len)) {
1023 break;
1024 }
1025 x += cur_len;
1026 outlen++;
9f2abfde 1027 }
9f2abfde 1028
a0d7f935
KW
1029 if (el)
1030 *el = outlen;
9f2abfde 1031
a0d7f935
KW
1032 if (ep) {
1033 *ep = x;
1034 }
9f2abfde 1035
a0d7f935 1036 return (x == send);
33756530 1037 }
9f2abfde
KW
1038}
1039
1040/*
7c93d8f0
KW
1041=for apidoc utf8_distance
1042
1043Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1044and C<b>.
1045
1046WARNING: use only if you *know* that the pointers point inside the
1047same UTF-8 buffer.
1048
1049=cut
1050*/
1051
1052PERL_STATIC_INLINE IV
1053Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1054{
1055 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1056
1057 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1058}
1059
1060/*
1061=for apidoc utf8_hop
1062
1063Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1064forward or backward.
1065
1066WARNING: do not use the following unless you *know* C<off> is within
1067the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1068on the first byte of character or just after the last byte of a character.
1069
1070=cut
1071*/
1072
1073PERL_STATIC_INLINE U8 *
1074Perl_utf8_hop(const U8 *s, SSize_t off)
1075{
1076 PERL_ARGS_ASSERT_UTF8_HOP;
1077
1078 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1079 * the bitops (especially ~) can create illegal UTF-8.
1080 * In other words: in Perl UTF-8 is not just for Unicode. */
1081
1082 if (off >= 0) {
1083 while (off--)
1084 s += UTF8SKIP(s);
1085 }
1086 else {
1087 while (off++) {
1088 s--;
1089 while (UTF8_IS_CONTINUATION(*s))
1090 s--;
1091 }
1092 }
de979548 1093 GCC_DIAG_IGNORE(-Wcast-qual);
7c93d8f0 1094 return (U8 *)s;
de979548 1095 GCC_DIAG_RESTORE;
7c93d8f0
KW
1096}
1097
4dab108f 1098/*
65df57a8
TC
1099=for apidoc utf8_hop_forward
1100
1101Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1102forward.
1103
1104C<off> must be non-negative.
1105
1106C<s> must be before or equal to C<end>.
1107
1108When moving forward it will not move beyond C<end>.
1109
1110Will not exceed this limit even if the string is not valid "UTF-8".
1111
1112=cut
1113*/
1114
1115PERL_STATIC_INLINE U8 *
1116Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1117{
1118 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1119
1120 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1121 * the bitops (especially ~) can create illegal UTF-8.
1122 * In other words: in Perl UTF-8 is not just for Unicode. */
1123
1124 assert(s <= end);
1125 assert(off >= 0);
1126
1127 while (off--) {
1128 STRLEN skip = UTF8SKIP(s);
de979548
P
1129 if ((STRLEN)(end - s) <= skip) {
1130 GCC_DIAG_IGNORE(-Wcast-qual);
65df57a8 1131 return (U8 *)end;
de979548
P
1132 GCC_DIAG_RESTORE;
1133 }
65df57a8
TC
1134 s += skip;
1135 }
1136
de979548 1137 GCC_DIAG_IGNORE(-Wcast-qual);
65df57a8 1138 return (U8 *)s;
de979548 1139 GCC_DIAG_RESTORE;
65df57a8
TC
1140}
1141
1142/*
1143=for apidoc utf8_hop_back
1144
1145Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1146backward.
1147
1148C<off> must be non-positive.
1149
1150C<s> must be after or equal to C<start>.
1151
1152When moving backward it will not move before C<start>.
1153
1154Will not exceed this limit even if the string is not valid "UTF-8".
1155
1156=cut
1157*/
1158
1159PERL_STATIC_INLINE U8 *
1160Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1161{
1162 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1163
1164 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1165 * the bitops (especially ~) can create illegal UTF-8.
1166 * In other words: in Perl UTF-8 is not just for Unicode. */
1167
1168 assert(start <= s);
1169 assert(off <= 0);
1170
1171 while (off++ && s > start) {
1172 s--;
1173 while (UTF8_IS_CONTINUATION(*s) && s > start)
1174 s--;
1175 }
1176
de979548 1177 GCC_DIAG_IGNORE(-Wcast-qual);
65df57a8 1178 return (U8 *)s;
de979548 1179 GCC_DIAG_RESTORE;
65df57a8
TC
1180}
1181
1182/*
1183=for apidoc utf8_hop_safe
1184
1185Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1186either forward or backward.
1187
1188When moving backward it will not move before C<start>.
1189
1190When moving forward it will not move beyond C<end>.
1191
1192Will not exceed those limits even if the string is not valid "UTF-8".
1193
1194=cut
1195*/
1196
1197PERL_STATIC_INLINE U8 *
1198Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1199{
1200 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1201
1202 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1203 * the bitops (especially ~) can create illegal UTF-8.
1204 * In other words: in Perl UTF-8 is not just for Unicode. */
1205
1206 assert(start <= s && s <= end);
1207
1208 if (off >= 0) {
1209 return utf8_hop_forward(s, off, end);
1210 }
1211 else {
1212 return utf8_hop_back(s, off, start);
1213 }
1214}
1215
1216/*
4dab108f
KW
1217
1218=for apidoc is_utf8_valid_partial_char
1219
6cbb9248
KW
1220Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1221S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1222points. Otherwise, it returns 1 if there exists at least one non-empty
1223sequence of bytes that when appended to sequence C<s>, starting at position
1224C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1225otherwise returns 0.
1226
1227In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1228point.
1229
1230This is useful when a fixed-length buffer is being tested for being well-formed
1231UTF-8, but the final few bytes in it don't comprise a full character; that is,
1232it is split somewhere in the middle of the final code point's UTF-8
1233representation. (Presumably when the buffer is refreshed with the next chunk
1234of data, the new first bytes will complete the partial code point.) This
1235function is used to verify that the final bytes in the current buffer are in
1236fact the legal beginning of some code point, so that if they aren't, the
1237failure can be signalled without having to wait for the next read.
4dab108f
KW
1238
1239=cut
1240*/
2717076a
KW
1241#define is_utf8_valid_partial_char(s, e) \
1242 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
1243
1244/*
1245
1246=for apidoc is_utf8_valid_partial_char_flags
1247
1248Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1249or not the input is a valid UTF-8 encoded partial character, but it takes an
1250extra parameter, C<flags>, which can further restrict which code points are
1251considered valid.
1252
1253If C<flags> is 0, this behaves identically to
1254C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1255of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1256there is any sequence of bytes that can complete the input partial character in
1257such a way that a non-prohibited character is formed, the function returns
2717076a
KW
1258TRUE; otherwise FALSE. Non character code points cannot be determined based on
1259partial character input. But many of the other possible excluded types can be
f1c999a7
KW
1260determined from just the first one or two bytes.
1261
1262=cut
1263 */
1264
56e4cf64 1265PERL_STATIC_INLINE bool
f1c999a7 1266S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
4dab108f 1267{
f1c999a7 1268 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
4dab108f 1269
f1c999a7 1270 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1271 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 1272
8875bd48 1273 if (s >= e || s + UTF8SKIP(s) <= e) {
4dab108f
KW
1274 return FALSE;
1275 }
1276
f1c999a7 1277 return cBOOL(_is_utf8_char_helper(s, e, flags));
4dab108f
KW
1278}
1279
8bc127bf
KW
1280/*
1281
1282=for apidoc is_utf8_fixed_width_buf_flags
1283
1284Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1285is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1286otherwise it returns FALSE.
1287
1288If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1289without restriction. If the final few bytes of the buffer do not form a
1290complete code point, this will return TRUE anyway, provided that
1291C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1292
1293If C<flags> in non-zero, it can be any combination of the
1294C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1295same meanings.
1296
1297This function differs from C<L</is_utf8_string_flags>> only in that the latter
1298returns FALSE if the final few bytes of the string don't form a complete code
1299point.
1300
1301=cut
1302 */
1303#define is_utf8_fixed_width_buf_flags(s, len, flags) \
1304 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1305
1306/*
1307
1308=for apidoc is_utf8_fixed_width_buf_loc_flags
1309
1310Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1311failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1312to the beginning of any partial character at the end of the buffer; if there is
1313no partial character C<*ep> will contain C<s>+C<len>.
1314
1315See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1316
1317=cut
1318*/
1319
1320#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1321 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1322
1323/*
1324
1325=for apidoc is_utf8_fixed_width_buf_loclen_flags
1326
1327Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1328complete, valid characters found in the C<el> pointer.
1329
1330=cut
1331*/
1332
1333PERL_STATIC_INLINE bool
1334S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
33756530 1335 STRLEN len,
8bc127bf
KW
1336 const U8 **ep,
1337 STRLEN *el,
1338 const U32 flags)
1339{
1340 const U8 * maybe_partial;
1341
1342 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1343
1344 if (! ep) {
1345 ep = &maybe_partial;
1346 }
1347
1348 /* If it's entirely valid, return that; otherwise see if the only error is
1349 * that the final few bytes are for a partial character */
1350 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1351 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1352}
1353
c8028aa6
TC
1354/* ------------------------------- perl.h ----------------------------- */
1355
1356/*
dcccc8ff
KW
1357=head1 Miscellaneous Functions
1358
41188aa0 1359=for apidoc AiR|bool|is_safe_syscall|const char *pv|STRLEN len|const char *what|const char *op_name
c8028aa6 1360
6602b933 1361Test that the given C<pv> doesn't contain any internal C<NUL> characters.
796b6530 1362If it does, set C<errno> to C<ENOENT>, optionally warn, and return FALSE.
c8028aa6
TC
1363
1364Return TRUE if the name is safe.
1365
796b6530 1366Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
1367
1368=cut
1369*/
1370
1371PERL_STATIC_INLINE bool
41188aa0 1372S_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name) {
c8028aa6
TC
1373 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1374 * perl itself uses xce*() functions which accept 8-bit strings.
1375 */
1376
1377 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1378
6c4650b3 1379 if (len > 1) {
c8028aa6 1380 char *null_at;
41188aa0 1381 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 1382 SETERRNO(ENOENT, LIB_INVARG);
1d505182 1383 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 1384 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 1385 what, op_name, pv, null_at+1);
c8028aa6
TC
1386 return FALSE;
1387 }
1388 }
1389
1390 return TRUE;
1391}
1392
1393/*
7cb3f959
TC
1394
1395Return true if the supplied filename has a newline character
fa6c7d00 1396immediately before the first (hopefully only) NUL.
7cb3f959
TC
1397
1398My original look at this incorrectly used the len from SvPV(), but
1399that's incorrect, since we allow for a NUL in pv[len-1].
1400
1401So instead, strlen() and work from there.
1402
1403This allow for the user reading a filename, forgetting to chomp it,
1404then calling:
1405
1406 open my $foo, "$file\0";
1407
1408*/
1409
1410#ifdef PERL_CORE
1411
1412PERL_STATIC_INLINE bool
1413S_should_warn_nl(const char *pv) {
1414 STRLEN len;
1415
1416 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1417
1418 len = strlen(pv);
1419
1420 return len > 0 && pv[len-1] == '\n';
1421}
1422
1423#endif
1424
81d52ecd
JH
1425/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
1426
1427#define MAX_CHARSET_NAME_LENGTH 2
1428
1429PERL_STATIC_INLINE const char *
1430get_regex_charset_name(const U32 flags, STRLEN* const lenp)
1431{
1432 /* Returns a string that corresponds to the name of the regex character set
1433 * given by 'flags', and *lenp is set the length of that string, which
1434 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
1435
1436 *lenp = 1;
1437 switch (get_regex_charset(flags)) {
1438 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
1439 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
1440 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
1441 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
1442 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
1443 *lenp = 2;
1444 return ASCII_MORE_RESTRICT_PAT_MODS;
1445 }
1446 /* The NOT_REACHED; hides an assert() which has a rather complex
1447 * definition in perl.h. */
1448 NOT_REACHED; /* NOTREACHED */
1449 return "?"; /* Unknown */
1450}
1451
7cb3f959 1452/*
ed382232
TC
1453
1454Return false if any get magic is on the SV other than taint magic.
1455
1456*/
1457
1458PERL_STATIC_INLINE bool
1459S_sv_only_taint_gmagic(SV *sv) {
1460 MAGIC *mg = SvMAGIC(sv);
1461
1462 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
1463
1464 while (mg) {
1465 if (mg->mg_type != PERL_MAGIC_taint
1466 && !(mg->mg_flags & MGf_GSKIP)
1467 && mg->mg_virtual->svt_get) {
1468 return FALSE;
1469 }
1470 mg = mg->mg_moremagic;
1471 }
1472
1473 return TRUE;
1474}
1475
ed8ff0f3
DM
1476/* ------------------ cop.h ------------------------------------------- */
1477
1478
1479/* Enter a block. Push a new base context and return its address. */
1480
1481PERL_STATIC_INLINE PERL_CONTEXT *
1482S_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
1483{
1484 PERL_CONTEXT * cx;
1485
1486 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
1487
1488 CXINC;
1489 cx = CX_CUR();
1490 cx->cx_type = type;
1491 cx->blk_gimme = gimme;
1492 cx->blk_oldsaveix = saveix;
4caf7d8c 1493 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 1494 cx->blk_oldcop = PL_curcop;
4caf7d8c 1495 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
1496 cx->blk_oldscopesp = PL_scopestack_ix;
1497 cx->blk_oldpm = PL_curpm;
ce8bb8d8 1498 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
1499
1500 PL_tmps_floor = PL_tmps_ix;
1501 CX_DEBUG(cx, "PUSH");
1502 return cx;
1503}
1504
1505
1506/* Exit a block (RETURN and LAST). */
1507
1508PERL_STATIC_INLINE void
1509S_cx_popblock(pTHX_ PERL_CONTEXT *cx)
1510{
1511 PERL_ARGS_ASSERT_CX_POPBLOCK;
1512
1513 CX_DEBUG(cx, "POP");
1514 /* these 3 are common to cx_popblock and cx_topblock */
1515 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
1516 PL_scopestack_ix = cx->blk_oldscopesp;
1517 PL_curpm = cx->blk_oldpm;
1518
1519 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
1520 * and leaves a CX entry lying around for repeated use, so
1521 * skip for multicall */ \
1522 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
1523 || PL_savestack_ix == cx->blk_oldsaveix);
1524 PL_curcop = cx->blk_oldcop;
ce8bb8d8 1525 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
1526}
1527
1528/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
1529 * Whereas cx_popblock() restores the state to the point just before
1530 * cx_pushblock() was called, cx_topblock() restores it to the point just
1531 * *after* cx_pushblock() was called. */
1532
1533PERL_STATIC_INLINE void
1534S_cx_topblock(pTHX_ PERL_CONTEXT *cx)
1535{
1536 PERL_ARGS_ASSERT_CX_TOPBLOCK;
1537
1538 CX_DEBUG(cx, "TOP");
1539 /* these 3 are common to cx_popblock and cx_topblock */
1540 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
1541 PL_scopestack_ix = cx->blk_oldscopesp;
1542 PL_curpm = cx->blk_oldpm;
1543
1544 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
1545}
1546
1547
a73d8813
DM
1548PERL_STATIC_INLINE void
1549S_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
1550{
1551 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
1552
1553 PERL_ARGS_ASSERT_CX_PUSHSUB;
1554
3f6bd23a 1555 PERL_DTRACE_PROBE_ENTRY(cv);
a73d8813
DM
1556 cx->blk_sub.cv = cv;
1557 cx->blk_sub.olddepth = CvDEPTH(cv);
1558 cx->blk_sub.prevcomppad = PL_comppad;
1559 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
1560 cx->blk_sub.retop = retop;
1561 SvREFCNT_inc_simple_void_NN(cv);
1562 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
1563}
1564
1565
1566/* subsets of cx_popsub() */
1567
1568PERL_STATIC_INLINE void
1569S_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
1570{
1571 CV *cv;
1572
1573 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
1574 assert(CxTYPE(cx) == CXt_SUB);
1575
1576 PL_comppad = cx->blk_sub.prevcomppad;
1577 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
1578 cv = cx->blk_sub.cv;
1579 CvDEPTH(cv) = cx->blk_sub.olddepth;
1580 cx->blk_sub.cv = NULL;
1581 SvREFCNT_dec(cv);
1582}
1583
1584
1585/* handle the @_ part of leaving a sub */
1586
1587PERL_STATIC_INLINE void
1588S_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
1589{
1590 AV *av;
1591
1592 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
1593 assert(CxTYPE(cx) == CXt_SUB);
1594 assert(AvARRAY(MUTABLE_AV(
1595 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
1596 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
1597
1598 CX_POP_SAVEARRAY(cx);
1599 av = MUTABLE_AV(PAD_SVl(0));
1600 if (UNLIKELY(AvREAL(av)))
1601 /* abandon @_ if it got reified */
1602 clear_defarray(av, 0);
1603 else {
1604 CLEAR_ARGARRAY(av);
1605 }
1606}
1607
1608
1609PERL_STATIC_INLINE void
1610S_cx_popsub(pTHX_ PERL_CONTEXT *cx)
1611{
1612 PERL_ARGS_ASSERT_CX_POPSUB;
1613 assert(CxTYPE(cx) == CXt_SUB);
1614
3f6bd23a 1615 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
1616
1617 if (CxHASARGS(cx))
1618 cx_popsub_args(cx);
1619 cx_popsub_common(cx);
1620}
1621
1622
6a7d52cc
DM
1623PERL_STATIC_INLINE void
1624S_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
1625{
1626 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
1627
1628 cx->blk_format.cv = cv;
1629 cx->blk_format.retop = retop;
1630 cx->blk_format.gv = gv;
1631 cx->blk_format.dfoutgv = PL_defoutgv;
1632 cx->blk_format.prevcomppad = PL_comppad;
1633 cx->blk_u16 = 0;
1634
1635 SvREFCNT_inc_simple_void_NN(cv);
1636 CvDEPTH(cv)++;
1637 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
1638}
1639
1640
1641PERL_STATIC_INLINE void
1642S_cx_popformat(pTHX_ PERL_CONTEXT *cx)
1643{
1644 CV *cv;
1645 GV *dfout;
1646
1647 PERL_ARGS_ASSERT_CX_POPFORMAT;
1648 assert(CxTYPE(cx) == CXt_FORMAT);
1649
1650 dfout = cx->blk_format.dfoutgv;
1651 setdefout(dfout);
1652 cx->blk_format.dfoutgv = NULL;
1653 SvREFCNT_dec_NN(dfout);
1654
1655 PL_comppad = cx->blk_format.prevcomppad;
1656 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
1657 cv = cx->blk_format.cv;
1658 cx->blk_format.cv = NULL;
1659 --CvDEPTH(cv);
1660 SvREFCNT_dec_NN(cv);
1661}
1662
1663
13febba5
DM
1664PERL_STATIC_INLINE void
1665S_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
1666{
1667 PERL_ARGS_ASSERT_CX_PUSHEVAL;
1668
1669 cx->blk_eval.retop = retop;
1670 cx->blk_eval.old_namesv = namesv;
1671 cx->blk_eval.old_eval_root = PL_eval_root;
1672 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
1673 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
1674 cx->blk_eval.cur_top_env = PL_top_env;
1675
4c57ced5 1676 assert(!(PL_in_eval & ~ 0x3F));
13febba5 1677 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 1678 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
1679}
1680
1681
1682PERL_STATIC_INLINE void
1683S_cx_popeval(pTHX_ PERL_CONTEXT *cx)
1684{
1685 SV *sv;
1686
1687 PERL_ARGS_ASSERT_CX_POPEVAL;
1688 assert(CxTYPE(cx) == CXt_EVAL);
1689
1690 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 1691 assert(!(PL_in_eval & 0xc0));
13febba5
DM
1692 PL_eval_root = cx->blk_eval.old_eval_root;
1693 sv = cx->blk_eval.cur_text;
4c57ced5 1694 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
1695 cx->blk_eval.cur_text = NULL;
1696 SvREFCNT_dec_NN(sv);
1697 }
1698
1699 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
1700 if (sv) {
1701 cx->blk_eval.old_namesv = NULL;
1702 SvREFCNT_dec_NN(sv);
1703 }
13febba5 1704}
6a7d52cc 1705
a73d8813 1706
d1b6bf72
DM
1707/* push a plain loop, i.e.
1708 * { block }
1709 * while (cond) { block }
1710 * for (init;cond;continue) { block }
1711 * This loop can be last/redo'ed etc.
1712 */
1713
1714PERL_STATIC_INLINE void
1715S_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
1716{
1717 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
1718 cx->blk_loop.my_op = cLOOP;
1719}
1720
1721
1722/* push a true for loop, i.e.
1723 * for var (list) { block }
1724 */
1725
1726PERL_STATIC_INLINE void
1727S_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
1728{
1729 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
1730
1731 /* this one line is common with cx_pushloop_plain */
1732 cx->blk_loop.my_op = cLOOP;
1733
1734 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
1735 cx->blk_loop.itersave = itersave;
1736#ifdef USE_ITHREADS
1737 cx->blk_loop.oldcomppad = PL_comppad;
1738#endif
1739}
1740
1741
1742/* pop all loop types, including plain */
1743
1744PERL_STATIC_INLINE void
1745S_cx_poploop(pTHX_ PERL_CONTEXT *cx)
1746{
1747 PERL_ARGS_ASSERT_CX_POPLOOP;
1748
1749 assert(CxTYPE_is_LOOP(cx));
1750 if ( CxTYPE(cx) == CXt_LOOP_ARY
1751 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
1752 {
1753 /* Free ary or cur. This assumes that state_u.ary.ary
1754 * aligns with state_u.lazysv.cur. See cx_dup() */
1755 SV *sv = cx->blk_loop.state_u.lazysv.cur;
1756 cx->blk_loop.state_u.lazysv.cur = NULL;
1757 SvREFCNT_dec_NN(sv);
1758 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
1759 sv = cx->blk_loop.state_u.lazysv.end;
1760 cx->blk_loop.state_u.lazysv.end = NULL;
1761 SvREFCNT_dec_NN(sv);
1762 }
1763 }
1764 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
1765 SV *cursv;
1766 SV **svp = (cx)->blk_loop.itervar_u.svp;
1767 if ((cx->cx_type & CXp_FOR_GV))
1768 svp = &GvSV((GV*)svp);
1769 cursv = *svp;
1770 *svp = cx->blk_loop.itersave;
1771 cx->blk_loop.itersave = NULL;
1772 SvREFCNT_dec(cursv);
1773 }
1774}
1775
2a7b7c61
DM
1776
1777PERL_STATIC_INLINE void
1778S_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
1779{
1780 PERL_ARGS_ASSERT_CX_PUSHWHEN;
1781
1782 cx->blk_givwhen.leave_op = cLOGOP->op_other;
1783}
1784
1785
1786PERL_STATIC_INLINE void
1787S_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
1788{
1789 PERL_ARGS_ASSERT_CX_POPWHEN;
1790 assert(CxTYPE(cx) == CXt_WHEN);
1791
1792 PERL_UNUSED_ARG(cx);
59a14f30 1793 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
1794 /* currently NOOP */
1795}
1796
1797
1798PERL_STATIC_INLINE void
1799S_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
1800{
1801 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
1802
1803 cx->blk_givwhen.leave_op = cLOGOP->op_other;
1804 cx->blk_givwhen.defsv_save = orig_defsv;
1805}
1806
1807
1808PERL_STATIC_INLINE void
1809S_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
1810{
1811 SV *sv;
1812
1813 PERL_ARGS_ASSERT_CX_POPGIVEN;
1814 assert(CxTYPE(cx) == CXt_GIVEN);
1815
1816 sv = GvSV(PL_defgv);
1817 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
1818 cx->blk_givwhen.defsv_save = NULL;
1819 SvREFCNT_dec(sv);
1820}
1821
ec2c235b
KW
1822/* ------------------ util.h ------------------------------------------- */
1823
1824/*
1825=head1 Miscellaneous Functions
1826
1827=for apidoc foldEQ
1828
1829Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
1830same
1831case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
1832match themselves and their opposite case counterparts. Non-cased and non-ASCII
1833range bytes match only themselves.
1834
1835=cut
1836*/
1837
1838PERL_STATIC_INLINE I32
1839Perl_foldEQ(const char *s1, const char *s2, I32 len)
1840{
1841 const U8 *a = (const U8 *)s1;
1842 const U8 *b = (const U8 *)s2;
1843
1844 PERL_ARGS_ASSERT_FOLDEQ;
1845
1846 assert(len >= 0);
1847
1848 while (len--) {
1849 if (*a != *b && *a != PL_fold[*b])
1850 return 0;
1851 a++,b++;
1852 }
1853 return 1;
1854}
1855
0f9cb40c 1856PERL_STATIC_INLINE I32
ec2c235b
KW
1857Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
1858{
1859 /* Compare non-utf8 using Unicode (Latin1) semantics. Does not work on
1860 * MICRO_SIGN, LATIN_SMALL_LETTER_SHARP_S, nor
1861 * LATIN_SMALL_LETTER_Y_WITH_DIAERESIS, and does not check for these. Nor
1862 * does it check that the strings each have at least 'len' characters */
1863
1864 const U8 *a = (const U8 *)s1;
1865 const U8 *b = (const U8 *)s2;
1866
1867 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
1868
1869 assert(len >= 0);
1870
1871 while (len--) {
1872 if (*a != *b && *a != PL_fold_latin1[*b]) {
1873 return 0;
1874 }
1875 a++, b++;
1876 }
1877 return 1;
1878}
1879
1880/*
1881=for apidoc foldEQ_locale
1882
1883Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
1884same case-insensitively in the current locale; false otherwise.
1885
1886=cut
1887*/
1888
0f9cb40c 1889PERL_STATIC_INLINE I32
ec2c235b
KW
1890Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
1891{
1892 dVAR;
1893 const U8 *a = (const U8 *)s1;
1894 const U8 *b = (const U8 *)s2;
1895
1896 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
1897
1898 assert(len >= 0);
1899
1900 while (len--) {
1901 if (*a != *b && *a != PL_fold_locale[*b])
1902 return 0;
1903 a++,b++;
1904 }
1905 return 1;
1906}
1907
6dba01e2
KW
1908#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
1909
1910PERL_STATIC_INLINE void *
1911S_my_memrchr(const char * s, const char c, const STRLEN len)
1912{
1913 /* memrchr(), since many platforms lack it */
1914
1915 const char * t = s + len - 1;
1916
1917 PERL_ARGS_ASSERT_MY_MEMRCHR;
1918
1919 while (t >= s) {
1920 if (*t == c) {
1921 return (void *) t;
1922 }
1923 t--;
1924 }
1925
1926 return NULL;
1927}
1928
1929#endif
1930
ed382232 1931/*
c8028aa6
TC
1932 * ex: set ts=8 sts=4 sw=4 et:
1933 */