This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
is_utf8_invariant_string(): small speed optimization
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 * This file is a home for static inline functions that cannot go in other
9 * headers files, because they depend on proto.h (included after most other
10 * headers) or struct definitions.
11 *
12 * Each section names the header file that the functions "belong" to.
13 */
27669aa4 14
be3a7a5d
KW
15/* ------------------------------- av.h ------------------------------- */
16
c70927a6 17PERL_STATIC_INLINE SSize_t
be3a7a5d
KW
18S_av_top_index(pTHX_ AV *av)
19{
20 PERL_ARGS_ASSERT_AV_TOP_INDEX;
21 assert(SvTYPE(av) == SVt_PVAV);
22
23 return AvFILL(av);
24}
25
1afe1db1
FC
26/* ------------------------------- cv.h ------------------------------- */
27
ae77754a
FC
28PERL_STATIC_INLINE GV *
29S_CvGV(pTHX_ CV *sv)
30{
31 return CvNAMED(sv)
32 ? Perl_cvgv_from_hek(aTHX_ sv)
33 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
34}
35
1afe1db1
FC
36PERL_STATIC_INLINE I32 *
37S_CvDEPTHp(const CV * const sv)
38{
39 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
8de47657 40 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
41}
42
d16269d8
PM
43/*
44 CvPROTO returns the prototype as stored, which is not necessarily what
45 the interpreter should be using. Specifically, the interpreter assumes
46 that spaces have been stripped, which has been the case if the prototype
47 was added by toke.c, but is generally not the case if it was added elsewhere.
48 Since we can't enforce the spacelessness at assignment time, this routine
49 provides a temporary copy at parse time with spaces removed.
50 I<orig> is the start of the original buffer, I<len> is the length of the
51 prototype and will be updated when this returns.
52 */
53
5b67adb8 54#ifdef PERL_CORE
d16269d8
PM
55PERL_STATIC_INLINE char *
56S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
57{
58 SV * tmpsv;
59 char * tmps;
60 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
61 tmps = SvPVX(tmpsv);
62 while ((*len)--) {
63 if (!isSPACE(*orig))
64 *tmps++ = *orig;
65 orig++;
66 }
67 *tmps = '\0';
68 *len = tmps - SvPVX(tmpsv);
69 return SvPVX(tmpsv);
70}
5b67adb8 71#endif
d16269d8 72
25fdce4a
FC
73/* ------------------------------- mg.h ------------------------------- */
74
75#if defined(PERL_CORE) || defined(PERL_EXT)
76/* assumes get-magic and stringification have already occurred */
77PERL_STATIC_INLINE STRLEN
78S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
79{
80 assert(mg->mg_type == PERL_MAGIC_regex_global);
81 assert(mg->mg_len != -1);
82 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
83 return (STRLEN)mg->mg_len;
84 else {
85 const STRLEN pos = (STRLEN)mg->mg_len;
86 /* Without this check, we may read past the end of the buffer: */
87 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
88 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
89 }
90}
91#endif
92
03414f05
FC
93/* ------------------------------- pad.h ------------------------------ */
94
95#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
96PERL_STATIC_INLINE bool
97PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
98{
99 /* is seq within the range _LOW to _HIGH ?
100 * This is complicated by the fact that PL_cop_seqmax
101 * may have wrapped around at some point */
102 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
103 return FALSE; /* not yet introduced */
104
105 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
106 /* in compiling scope */
107 if (
108 (seq > COP_SEQ_RANGE_LOW(pn))
109 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
110 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
111 )
112 return TRUE;
113 }
114 else if (
115 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
116 ?
117 ( seq > COP_SEQ_RANGE_LOW(pn)
118 || seq <= COP_SEQ_RANGE_HIGH(pn))
119
120 : ( seq > COP_SEQ_RANGE_LOW(pn)
121 && seq <= COP_SEQ_RANGE_HIGH(pn))
122 )
123 return TRUE;
124 return FALSE;
125}
126#endif
127
33a4312b
FC
128/* ------------------------------- pp.h ------------------------------- */
129
130PERL_STATIC_INLINE I32
131S_TOPMARK(pTHX)
132{
133 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 134 "MARK top %p %" IVdf "\n",
33a4312b
FC
135 PL_markstack_ptr,
136 (IV)*PL_markstack_ptr)));
137 return *PL_markstack_ptr;
138}
139
140PERL_STATIC_INLINE I32
141S_POPMARK(pTHX)
142{
143 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 144 "MARK pop %p %" IVdf "\n",
33a4312b
FC
145 (PL_markstack_ptr-1),
146 (IV)*(PL_markstack_ptr-1))));
147 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
148 return *PL_markstack_ptr--;
149}
150
8d919b0a
FC
151/* ----------------------------- regexp.h ----------------------------- */
152
153PERL_STATIC_INLINE struct regexp *
154S_ReANY(const REGEXP * const re)
155{
df6b4bd5 156 XPV* const p = (XPV*)SvANY(re);
8d919b0a 157 assert(isREGEXP(re));
df6b4bd5
DM
158 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
159 : (struct regexp *)p;
8d919b0a
FC
160}
161
27669aa4
FC
162/* ------------------------------- sv.h ------------------------------- */
163
164PERL_STATIC_INLINE SV *
165S_SvREFCNT_inc(SV *sv)
166{
2439e033 167 if (LIKELY(sv != NULL))
27669aa4
FC
168 SvREFCNT(sv)++;
169 return sv;
170}
171PERL_STATIC_INLINE SV *
172S_SvREFCNT_inc_NN(SV *sv)
173{
174 SvREFCNT(sv)++;
175 return sv;
176}
177PERL_STATIC_INLINE void
178S_SvREFCNT_inc_void(SV *sv)
179{
2439e033 180 if (LIKELY(sv != NULL))
27669aa4
FC
181 SvREFCNT(sv)++;
182}
75e16a44
FC
183PERL_STATIC_INLINE void
184S_SvREFCNT_dec(pTHX_ SV *sv)
185{
2439e033 186 if (LIKELY(sv != NULL)) {
75a9bf96 187 U32 rc = SvREFCNT(sv);
79e2a32a 188 if (LIKELY(rc > 1))
75a9bf96
DM
189 SvREFCNT(sv) = rc - 1;
190 else
191 Perl_sv_free2(aTHX_ sv, rc);
75e16a44
FC
192 }
193}
541377b1
FC
194
195PERL_STATIC_INLINE void
4a9a56a7
DM
196S_SvREFCNT_dec_NN(pTHX_ SV *sv)
197{
198 U32 rc = SvREFCNT(sv);
79e2a32a 199 if (LIKELY(rc > 1))
4a9a56a7
DM
200 SvREFCNT(sv) = rc - 1;
201 else
202 Perl_sv_free2(aTHX_ sv, rc);
203}
204
205PERL_STATIC_INLINE void
541377b1
FC
206SvAMAGIC_on(SV *sv)
207{
208 assert(SvROK(sv));
209 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
210}
211PERL_STATIC_INLINE void
212SvAMAGIC_off(SV *sv)
213{
214 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
215 HvAMAGIC_off(SvSTASH(SvRV(sv)));
216}
217
218PERL_STATIC_INLINE U32
541377b1
FC
219S_SvPADSTALE_on(SV *sv)
220{
c0683843 221 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
222 return SvFLAGS(sv) |= SVs_PADSTALE;
223}
224PERL_STATIC_INLINE U32
225S_SvPADSTALE_off(SV *sv)
226{
c0683843 227 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
228 return SvFLAGS(sv) &= ~SVs_PADSTALE;
229}
25fdce4a 230#if defined(PERL_CORE) || defined (PERL_EXT)
4ddea69a 231PERL_STATIC_INLINE STRLEN
6964422a 232S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
4ddea69a 233{
25fdce4a 234 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
4ddea69a
FC
235 if (SvGAMAGIC(sv)) {
236 U8 *hopped = utf8_hop((U8 *)pv, pos);
237 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
238 return (STRLEN)(hopped - (U8 *)pv);
239 }
240 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
241}
242#endif
f019c49e 243
d1decf2b
TC
244/* ------------------------------- handy.h ------------------------------- */
245
246/* saves machine code for a common noreturn idiom typically used in Newx*() */
c1d6452f 247#ifdef GCC_DIAG_PRAGMA
6ab56f1e 248GCC_DIAG_IGNORE(-Wunused-function) /* Intentionally left semicolonless. */
c1d6452f 249#endif
d1decf2b
TC
250static void
251S_croak_memory_wrap(void)
252{
253 Perl_croak_nocontext("%s",PL_memory_wrap);
254}
c1d6452f 255#ifdef GCC_DIAG_PRAGMA
6ab56f1e 256GCC_DIAG_RESTORE /* Intentionally left semicolonless. */
c1d6452f 257#endif
d1decf2b 258
a8a2ceaa
KW
259/* ------------------------------- utf8.h ------------------------------- */
260
2fe720e2
KW
261/*
262=head1 Unicode Support
263*/
264
55d09dc8
KW
265PERL_STATIC_INLINE void
266S_append_utf8_from_native_byte(const U8 byte, U8** dest)
267{
268 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
269 * encoded string at '*dest', updating '*dest' to include it */
270
55d09dc8
KW
271 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
272
6f2d5cbc 273 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 274 *((*dest)++) = byte;
55d09dc8 275 else {
a09ec51a
KW
276 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
277 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
278 }
279}
280
e123187a 281/*
2fe720e2 282=for apidoc valid_utf8_to_uvchr
2717076a 283Like C<L</utf8_to_uvchr_buf>>, but should only be called when it is known that
2fe720e2
KW
284the next character in the input UTF-8 string C<s> is well-formed (I<e.g.>,
285it passes C<L</isUTF8_CHAR>>. Surrogates, non-character code points, and
286non-Unicode code points are allowed.
287
288=cut
289
290 */
291
292PERL_STATIC_INLINE UV
293Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
294{
c41b2540 295 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
296 const U8* send = s + expectlen;
297 UV uv = *s;
298
299 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
300
301 if (retlen) {
302 *retlen = expectlen;
303 }
304
305 /* An invariant is trivially returned */
306 if (expectlen == 1) {
307 return uv;
308 }
309
310 /* Remove the leading bits that indicate the number of bytes, leaving just
311 * the bits that are part of the value */
312 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
313
314 /* Now, loop through the remaining bytes, accumulating each into the
315 * working total as we go. (I khw tried unrolling the loop for up to 4
316 * bytes, but there was no performance improvement) */
317 for (++s; s < send; s++) {
318 uv = UTF8_ACCUMULATE(uv, *s);
319 }
320
321 return UNI_TO_NATIVE(uv);
322
323}
324
1e599354
KW
325/*
326=for apidoc is_utf8_invariant_string
327
82c5d941 328Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 329regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
330EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
331are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
332the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
333characters are invariant, but so also are the C1 controls.
1e599354
KW
334
335If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
336use this option, that C<s> can't have embedded C<NUL> characters and has to
337have a terminating C<NUL> byte).
338
9f2abfde
KW
339See also
340C<L</is_utf8_string>>,
341C<L</is_utf8_string_flags>>,
342C<L</is_utf8_string_loc>>,
343C<L</is_utf8_string_loc_flags>>,
344C<L</is_utf8_string_loclen>>,
345C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
346C<L</is_utf8_fixed_width_buf_flags>>,
347C<L</is_utf8_fixed_width_buf_loc_flags>>,
348C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
349C<L</is_strict_utf8_string>>,
350C<L</is_strict_utf8_string_loc>>,
351C<L</is_strict_utf8_string_loclen>>,
352C<L</is_c9strict_utf8_string>>,
353C<L</is_c9strict_utf8_string_loc>>,
354and
355C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
356
357=cut
0cbf5865
KW
358
359*/
360
361#define is_utf8_invariant_string(s, len) \
362 is_utf8_invariant_string_loc(s, len, NULL)
363
364/*
365=for apidoc is_utf8_invariant_string_loc
366
367Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
368the first UTF-8 variant character in the C<ep> pointer; if all characters are
369UTF-8 invariant, this function does not change the contents of C<*ep>.
370
371=cut
372
1e599354
KW
373*/
374
375PERL_STATIC_INLINE bool
e17544a6 376S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 377{
e17544a6 378 const U8* send;
1e599354
KW
379 const U8* x = s;
380
0cbf5865
KW
381 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
382
e17544a6
KW
383 if (len == 0) {
384 len = strlen((const char *)s);
385 }
386
387 send = s + len;
388
389#ifndef EBCDIC
390 /* Try to get the widest word on this platform */
391# ifdef HAS_LONG_LONG
392# define PERL_WORDCAST unsigned long long
393# define PERL_WORDSIZE LONGLONGSIZE
394# else
395# define PERL_WORDCAST UV
396# define PERL_WORDSIZE UVSIZE
397# endif
398
399# if PERL_WORDSIZE == 4
400# define PERL_VARIANTS_WORD_MASK 0x80808080
401# define PERL_WORD_BOUNDARY_MASK 0x3
402# elif PERL_WORDSIZE == 8
57845a20 403# define PERL_VARIANTS_WORD_MASK UINT64_C(0x8080808080808080)
e17544a6
KW
404# define PERL_WORD_BOUNDARY_MASK 0x7
405# else
406# error Unexpected word size
407# endif
408
b40579ff
KW
409 if ((STRLEN) (send - x) >= PERL_WORDSIZE) {
410
e17544a6
KW
411 /* Process per-byte until reach word boundary. XXX This loop could be
412 * eliminated if we knew that this platform had fast unaligned reads */
b40579ff 413 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
e17544a6
KW
414 if (! UTF8_IS_INVARIANT(*x)) {
415 if (ep) {
416 *ep = x;
417 }
418
419 return FALSE;
420 }
421 x++;
422 }
423
424 /* Process per-word as long as we have at least a full word left */
425 while (x + PERL_WORDSIZE <= send) {
426 if ((* (PERL_WORDCAST *) x) & PERL_VARIANTS_WORD_MASK) {
427
428 /* Found a variant. Just return if caller doesn't want its exact
429 * position */
430 if (! ep) {
431 return FALSE;
432 }
433
434 /* Otherwise fall into final loop to find which byte it is */
435 break;
0cbf5865 436 }
e17544a6
KW
437 x += PERL_WORDSIZE;
438 }
b40579ff 439 }
e17544a6
KW
440
441# undef PERL_WORDCAST
442# undef PERL_WORDSIZE
443# undef PERL_WORD_BOUNDARY_MASK
444# undef PERL_VARIANTS_WORD_MASK
445#endif
446
447 /* Process per-byte */
448 while (x < send) {
449 if (! UTF8_IS_INVARIANT(*x)) {
450 if (ep) {
451 *ep = x;
452 }
0cbf5865 453
e17544a6 454 return FALSE;
0cbf5865 455 }
1e599354 456
e17544a6 457 x++;
1e599354
KW
458 }
459
460 return TRUE;
461}
462
7c93d8f0 463/*
5ff889fb
KW
464=for apidoc is_utf8_string
465
82c5d941
KW
466Returns TRUE if the first C<len> bytes of string C<s> form a valid
467Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
468be calculated using C<strlen(s)> (which means if you use this option, that C<s>
469can't have embedded C<NUL> characters and has to have a terminating C<NUL>
470byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
471
2717076a
KW
472This function considers Perl's extended UTF-8 to be valid. That means that
473code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
474considered valid by this function. Use C<L</is_strict_utf8_string>>,
475C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
476code points are considered valid.
5ff889fb 477
9f2abfde
KW
478See also
479C<L</is_utf8_invariant_string>>,
0cbf5865 480C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
481C<L</is_utf8_string_loc>>,
482C<L</is_utf8_string_loclen>>,
8bc127bf
KW
483C<L</is_utf8_fixed_width_buf_flags>>,
484C<L</is_utf8_fixed_width_buf_loc_flags>>,
485C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
486
487=cut
488*/
489
dd237e82 490#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
5ff889fb 491
c9cd936b
KW
492#if defined(PERL_CORE) || defined (PERL_EXT)
493
494/*
495=for apidoc is_utf8_non_invariant_string
496
497Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
498C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
499UTF-8; otherwise returns FALSE.
500
501A TRUE return means that at least one code point represented by the sequence
502either is a wide character not representable as a single byte, or the
503representation differs depending on whether the sequence is encoded in UTF-8 or
504not.
505
506See also
507C<L<perlapi/is_utf8_invariant_string>>,
508C<L<perlapi/is_utf8_string>>
509
510=cut
511
512This is commonly used to determine if a SV's UTF-8 flag should be turned on.
513It needn't be if its string is entirely UTF-8 invariant, and it shouldn't be if
514it otherwise contains invalid UTF-8.
515
516It is an internal function because khw thinks that XS code shouldn't be working
517at this low a level. A valid use case could change that.
518
519*/
520
521PERL_STATIC_INLINE bool
522S_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
523{
524 const U8 * first_variant;
525
526 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
527
528 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
529 return FALSE;
530 }
531
532 return is_utf8_string(first_variant, len - (first_variant - s));
533}
534
535#endif
536
5ff889fb 537/*
9f2abfde
KW
538=for apidoc is_strict_utf8_string
539
540Returns TRUE if the first C<len> bytes of string C<s> form a valid
541UTF-8-encoded string that is fully interchangeable by any application using
542Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
543calculated using C<strlen(s)> (which means if you use this option, that C<s>
544can't have embedded C<NUL> characters and has to have a terminating C<NUL>
545byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
546
547This function returns FALSE for strings containing any
548code points above the Unicode max of 0x10FFFF, surrogate code points, or
549non-character code points.
550
551See also
552C<L</is_utf8_invariant_string>>,
0cbf5865 553C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
554C<L</is_utf8_string>>,
555C<L</is_utf8_string_flags>>,
556C<L</is_utf8_string_loc>>,
557C<L</is_utf8_string_loc_flags>>,
558C<L</is_utf8_string_loclen>>,
559C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
560C<L</is_utf8_fixed_width_buf_flags>>,
561C<L</is_utf8_fixed_width_buf_loc_flags>>,
562C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
563C<L</is_strict_utf8_string_loc>>,
564C<L</is_strict_utf8_string_loclen>>,
565C<L</is_c9strict_utf8_string>>,
566C<L</is_c9strict_utf8_string_loc>>,
567and
568C<L</is_c9strict_utf8_string_loclen>>.
569
570=cut
571*/
572
dd237e82 573#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
574
575/*
576=for apidoc is_c9strict_utf8_string
577
578Returns TRUE if the first C<len> bytes of string C<s> form a valid
579UTF-8-encoded string that conforms to
580L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
581otherwise it returns FALSE. If C<len> is 0, it will be calculated using
582C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
583C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
584characters being ASCII constitute 'a valid UTF-8 string'.
585
586This function returns FALSE for strings containing any code points above the
587Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
588code points per
589L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
590
591See also
592C<L</is_utf8_invariant_string>>,
0cbf5865 593C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
594C<L</is_utf8_string>>,
595C<L</is_utf8_string_flags>>,
596C<L</is_utf8_string_loc>>,
597C<L</is_utf8_string_loc_flags>>,
598C<L</is_utf8_string_loclen>>,
599C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
600C<L</is_utf8_fixed_width_buf_flags>>,
601C<L</is_utf8_fixed_width_buf_loc_flags>>,
602C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
603C<L</is_strict_utf8_string>>,
604C<L</is_strict_utf8_string_loc>>,
605C<L</is_strict_utf8_string_loclen>>,
606C<L</is_c9strict_utf8_string_loc>>,
607and
608C<L</is_c9strict_utf8_string_loclen>>.
609
610=cut
611*/
612
dd237e82 613#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
9f2abfde
KW
614
615/*
616=for apidoc is_utf8_string_flags
617
618Returns TRUE if the first C<len> bytes of string C<s> form a valid
619UTF-8 string, subject to the restrictions imposed by C<flags>;
620returns FALSE otherwise. If C<len> is 0, it will be calculated
621using C<strlen(s)> (which means if you use this option, that C<s> can't have
622embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
623that all characters being ASCII constitute 'a valid UTF-8 string'.
624
625If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
626C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
627as C<L</is_strict_utf8_string>>; and if C<flags> is
628C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
629C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
630combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
631C<L</utf8n_to_uvchr>>, with the same meanings.
632
633See also
634C<L</is_utf8_invariant_string>>,
0cbf5865 635C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
636C<L</is_utf8_string>>,
637C<L</is_utf8_string_loc>>,
638C<L</is_utf8_string_loc_flags>>,
639C<L</is_utf8_string_loclen>>,
640C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
641C<L</is_utf8_fixed_width_buf_flags>>,
642C<L</is_utf8_fixed_width_buf_loc_flags>>,
643C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
644C<L</is_strict_utf8_string>>,
645C<L</is_strict_utf8_string_loc>>,
646C<L</is_strict_utf8_string_loclen>>,
647C<L</is_c9strict_utf8_string>>,
648C<L</is_c9strict_utf8_string_loc>>,
649and
650C<L</is_c9strict_utf8_string_loclen>>.
651
652=cut
653*/
654
655PERL_STATIC_INLINE bool
f60f61fd 656S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 657{
33756530 658 const U8 * first_variant;
9f2abfde
KW
659
660 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
661 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 662 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 663
f60f61fd
KW
664 if (len == 0) {
665 len = strlen((const char *)s);
666 }
667
9f2abfde
KW
668 if (flags == 0) {
669 return is_utf8_string(s, len);
670 }
671
d044b7a7 672 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
673 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
674 {
675 return is_strict_utf8_string(s, len);
676 }
677
d044b7a7 678 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
679 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
680 {
681 return is_c9strict_utf8_string(s, len);
682 }
683
33756530
KW
684 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
685 const U8* const send = s + len;
686 const U8* x = first_variant;
687
a0d7f935
KW
688 while (x < send) {
689 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
690 if (UNLIKELY(! cur_len)) {
691 return FALSE;
692 }
693 x += cur_len;
9f2abfde 694 }
33756530 695 }
9f2abfde
KW
696
697 return TRUE;
698}
699
700/*
5ff889fb
KW
701
702=for apidoc is_utf8_string_loc
703
2717076a 704Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 705case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 706"utf8ness success") in the C<ep> pointer.
5ff889fb 707
2717076a 708See also C<L</is_utf8_string_loclen>>.
5ff889fb 709
3964c812
KW
710=cut
711*/
712
713#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
714
715/*
716
5ff889fb
KW
717=for apidoc is_utf8_string_loclen
718
2717076a 719Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 720case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 721"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 722encoded characters in the C<el> pointer.
5ff889fb 723
2717076a 724See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
725
726=cut
727*/
728
56e4cf64 729PERL_STATIC_INLINE bool
33756530 730Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb 731{
33756530 732 const U8 * first_variant;
5ff889fb
KW
733
734 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
735
33756530
KW
736 if (len == 0) {
737 len = strlen((const char *) s);
738 }
739
740 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
741 if (el)
742 *el = len;
743
744 if (ep) {
745 *ep = s + len;
746 }
747
748 return TRUE;
749 }
750
751 {
752 const U8* const send = s + len;
753 const U8* x = first_variant;
754 STRLEN outlen = first_variant - s;
755
a0d7f935
KW
756 while (x < send) {
757 const STRLEN cur_len = isUTF8_CHAR(x, send);
758 if (UNLIKELY(! cur_len)) {
759 break;
760 }
761 x += cur_len;
762 outlen++;
5ff889fb 763 }
5ff889fb 764
a0d7f935
KW
765 if (el)
766 *el = outlen;
5ff889fb 767
a0d7f935
KW
768 if (ep) {
769 *ep = x;
770 }
5ff889fb 771
a0d7f935 772 return (x == send);
33756530 773 }
5ff889fb
KW
774}
775
776/*
9f2abfde
KW
777
778=for apidoc is_strict_utf8_string_loc
779
780Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
781case of "utf8ness failure") or the location C<s>+C<len> (in the case of
782"utf8ness success") in the C<ep> pointer.
783
784See also C<L</is_strict_utf8_string_loclen>>.
785
786=cut
787*/
788
789#define is_strict_utf8_string_loc(s, len, ep) \
790 is_strict_utf8_string_loclen(s, len, ep, 0)
791
792/*
793
794=for apidoc is_strict_utf8_string_loclen
795
796Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
797case of "utf8ness failure") or the location C<s>+C<len> (in the case of
798"utf8ness success") in the C<ep> pointer, and the number of UTF-8
799encoded characters in the C<el> pointer.
800
801See also C<L</is_strict_utf8_string_loc>>.
802
803=cut
804*/
805
806PERL_STATIC_INLINE bool
33756530 807S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 808{
33756530 809 const U8 * first_variant;
9f2abfde
KW
810
811 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
812
33756530
KW
813 if (len == 0) {
814 len = strlen((const char *) s);
815 }
816
817 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
818 if (el)
819 *el = len;
820
821 if (ep) {
822 *ep = s + len;
823 }
824
825 return TRUE;
826 }
827
828 {
829 const U8* const send = s + len;
830 const U8* x = first_variant;
831 STRLEN outlen = first_variant - s;
832
a0d7f935
KW
833 while (x < send) {
834 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
835 if (UNLIKELY(! cur_len)) {
836 break;
837 }
838 x += cur_len;
839 outlen++;
9f2abfde 840 }
9f2abfde 841
a0d7f935
KW
842 if (el)
843 *el = outlen;
9f2abfde 844
a0d7f935
KW
845 if (ep) {
846 *ep = x;
847 }
9f2abfde 848
a0d7f935 849 return (x == send);
33756530 850 }
9f2abfde
KW
851}
852
853/*
854
855=for apidoc is_c9strict_utf8_string_loc
856
857Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
858the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
859"utf8ness success") in the C<ep> pointer.
860
861See also C<L</is_c9strict_utf8_string_loclen>>.
862
863=cut
864*/
865
866#define is_c9strict_utf8_string_loc(s, len, ep) \
867 is_c9strict_utf8_string_loclen(s, len, ep, 0)
868
869/*
870
871=for apidoc is_c9strict_utf8_string_loclen
872
873Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
874the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
875"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
876characters in the C<el> pointer.
877
878See also C<L</is_c9strict_utf8_string_loc>>.
879
880=cut
881*/
882
883PERL_STATIC_INLINE bool
33756530 884S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 885{
33756530 886 const U8 * first_variant;
9f2abfde
KW
887
888 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
889
33756530
KW
890 if (len == 0) {
891 len = strlen((const char *) s);
892 }
893
894 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
895 if (el)
896 *el = len;
897
898 if (ep) {
899 *ep = s + len;
900 }
901
902 return TRUE;
903 }
904
905 {
906 const U8* const send = s + len;
907 const U8* x = first_variant;
908 STRLEN outlen = first_variant - s;
909
a0d7f935
KW
910 while (x < send) {
911 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
912 if (UNLIKELY(! cur_len)) {
913 break;
914 }
915 x += cur_len;
916 outlen++;
9f2abfde 917 }
9f2abfde 918
a0d7f935
KW
919 if (el)
920 *el = outlen;
9f2abfde 921
a0d7f935
KW
922 if (ep) {
923 *ep = x;
924 }
9f2abfde 925
a0d7f935 926 return (x == send);
33756530 927 }
9f2abfde
KW
928}
929
930/*
931
932=for apidoc is_utf8_string_loc_flags
933
934Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
935case of "utf8ness failure") or the location C<s>+C<len> (in the case of
936"utf8ness success") in the C<ep> pointer.
937
938See also C<L</is_utf8_string_loclen_flags>>.
939
940=cut
941*/
942
943#define is_utf8_string_loc_flags(s, len, ep, flags) \
944 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
945
946
947/* The above 3 actual functions could have been moved into the more general one
948 * just below, and made #defines that call it with the right 'flags'. They are
949 * currently kept separate to increase their chances of getting inlined */
950
951/*
952
953=for apidoc is_utf8_string_loclen_flags
954
955Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
956case of "utf8ness failure") or the location C<s>+C<len> (in the case of
957"utf8ness success") in the C<ep> pointer, and the number of UTF-8
958encoded characters in the C<el> pointer.
959
960See also C<L</is_utf8_string_loc_flags>>.
961
962=cut
963*/
964
965PERL_STATIC_INLINE bool
f60f61fd 966S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 967{
33756530 968 const U8 * first_variant;
9f2abfde
KW
969
970 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
971 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 972 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 973
f60f61fd 974 if (len == 0) {
a0d7f935 975 len = strlen((const char *) s);
f60f61fd
KW
976 }
977
9f2abfde
KW
978 if (flags == 0) {
979 return is_utf8_string_loclen(s, len, ep, el);
980 }
981
d044b7a7 982 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
983 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
984 {
985 return is_strict_utf8_string_loclen(s, len, ep, el);
986 }
987
d044b7a7 988 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
989 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
990 {
991 return is_c9strict_utf8_string_loclen(s, len, ep, el);
992 }
993
33756530
KW
994 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
995 if (el)
996 *el = len;
997
998 if (ep) {
999 *ep = s + len;
1000 }
1001
1002 return TRUE;
1003 }
1004
1005 {
1006 const U8* send = s + len;
1007 const U8* x = first_variant;
1008 STRLEN outlen = first_variant - s;
1009
a0d7f935
KW
1010 while (x < send) {
1011 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1012 if (UNLIKELY(! cur_len)) {
1013 break;
1014 }
1015 x += cur_len;
1016 outlen++;
9f2abfde 1017 }
9f2abfde 1018
a0d7f935
KW
1019 if (el)
1020 *el = outlen;
9f2abfde 1021
a0d7f935
KW
1022 if (ep) {
1023 *ep = x;
1024 }
9f2abfde 1025
a0d7f935 1026 return (x == send);
33756530 1027 }
9f2abfde
KW
1028}
1029
1030/*
7c93d8f0
KW
1031=for apidoc utf8_distance
1032
1033Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1034and C<b>.
1035
1036WARNING: use only if you *know* that the pointers point inside the
1037same UTF-8 buffer.
1038
1039=cut
1040*/
1041
1042PERL_STATIC_INLINE IV
1043Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1044{
1045 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1046
1047 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1048}
1049
1050/*
1051=for apidoc utf8_hop
1052
1053Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1054forward or backward.
1055
1056WARNING: do not use the following unless you *know* C<off> is within
1057the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1058on the first byte of character or just after the last byte of a character.
1059
1060=cut
1061*/
1062
1063PERL_STATIC_INLINE U8 *
1064Perl_utf8_hop(const U8 *s, SSize_t off)
1065{
1066 PERL_ARGS_ASSERT_UTF8_HOP;
1067
1068 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1069 * the bitops (especially ~) can create illegal UTF-8.
1070 * In other words: in Perl UTF-8 is not just for Unicode. */
1071
1072 if (off >= 0) {
1073 while (off--)
1074 s += UTF8SKIP(s);
1075 }
1076 else {
1077 while (off++) {
1078 s--;
1079 while (UTF8_IS_CONTINUATION(*s))
1080 s--;
1081 }
1082 }
de979548 1083 GCC_DIAG_IGNORE(-Wcast-qual);
7c93d8f0 1084 return (U8 *)s;
de979548 1085 GCC_DIAG_RESTORE;
7c93d8f0
KW
1086}
1087
4dab108f 1088/*
65df57a8
TC
1089=for apidoc utf8_hop_forward
1090
1091Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1092forward.
1093
1094C<off> must be non-negative.
1095
1096C<s> must be before or equal to C<end>.
1097
1098When moving forward it will not move beyond C<end>.
1099
1100Will not exceed this limit even if the string is not valid "UTF-8".
1101
1102=cut
1103*/
1104
1105PERL_STATIC_INLINE U8 *
1106Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1107{
1108 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1109
1110 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1111 * the bitops (especially ~) can create illegal UTF-8.
1112 * In other words: in Perl UTF-8 is not just for Unicode. */
1113
1114 assert(s <= end);
1115 assert(off >= 0);
1116
1117 while (off--) {
1118 STRLEN skip = UTF8SKIP(s);
de979548
P
1119 if ((STRLEN)(end - s) <= skip) {
1120 GCC_DIAG_IGNORE(-Wcast-qual);
65df57a8 1121 return (U8 *)end;
de979548
P
1122 GCC_DIAG_RESTORE;
1123 }
65df57a8
TC
1124 s += skip;
1125 }
1126
de979548 1127 GCC_DIAG_IGNORE(-Wcast-qual);
65df57a8 1128 return (U8 *)s;
de979548 1129 GCC_DIAG_RESTORE;
65df57a8
TC
1130}
1131
1132/*
1133=for apidoc utf8_hop_back
1134
1135Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1136backward.
1137
1138C<off> must be non-positive.
1139
1140C<s> must be after or equal to C<start>.
1141
1142When moving backward it will not move before C<start>.
1143
1144Will not exceed this limit even if the string is not valid "UTF-8".
1145
1146=cut
1147*/
1148
1149PERL_STATIC_INLINE U8 *
1150Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1151{
1152 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1153
1154 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1155 * the bitops (especially ~) can create illegal UTF-8.
1156 * In other words: in Perl UTF-8 is not just for Unicode. */
1157
1158 assert(start <= s);
1159 assert(off <= 0);
1160
1161 while (off++ && s > start) {
1162 s--;
1163 while (UTF8_IS_CONTINUATION(*s) && s > start)
1164 s--;
1165 }
1166
de979548 1167 GCC_DIAG_IGNORE(-Wcast-qual);
65df57a8 1168 return (U8 *)s;
de979548 1169 GCC_DIAG_RESTORE;
65df57a8
TC
1170}
1171
1172/*
1173=for apidoc utf8_hop_safe
1174
1175Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1176either forward or backward.
1177
1178When moving backward it will not move before C<start>.
1179
1180When moving forward it will not move beyond C<end>.
1181
1182Will not exceed those limits even if the string is not valid "UTF-8".
1183
1184=cut
1185*/
1186
1187PERL_STATIC_INLINE U8 *
1188Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1189{
1190 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1191
1192 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1193 * the bitops (especially ~) can create illegal UTF-8.
1194 * In other words: in Perl UTF-8 is not just for Unicode. */
1195
1196 assert(start <= s && s <= end);
1197
1198 if (off >= 0) {
1199 return utf8_hop_forward(s, off, end);
1200 }
1201 else {
1202 return utf8_hop_back(s, off, start);
1203 }
1204}
1205
1206/*
4dab108f
KW
1207
1208=for apidoc is_utf8_valid_partial_char
1209
6cbb9248
KW
1210Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1211S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1212points. Otherwise, it returns 1 if there exists at least one non-empty
1213sequence of bytes that when appended to sequence C<s>, starting at position
1214C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1215otherwise returns 0.
1216
1217In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1218point.
1219
1220This is useful when a fixed-length buffer is being tested for being well-formed
1221UTF-8, but the final few bytes in it don't comprise a full character; that is,
1222it is split somewhere in the middle of the final code point's UTF-8
1223representation. (Presumably when the buffer is refreshed with the next chunk
1224of data, the new first bytes will complete the partial code point.) This
1225function is used to verify that the final bytes in the current buffer are in
1226fact the legal beginning of some code point, so that if they aren't, the
1227failure can be signalled without having to wait for the next read.
4dab108f
KW
1228
1229=cut
1230*/
2717076a
KW
1231#define is_utf8_valid_partial_char(s, e) \
1232 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
1233
1234/*
1235
1236=for apidoc is_utf8_valid_partial_char_flags
1237
1238Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1239or not the input is a valid UTF-8 encoded partial character, but it takes an
1240extra parameter, C<flags>, which can further restrict which code points are
1241considered valid.
1242
1243If C<flags> is 0, this behaves identically to
1244C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1245of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1246there is any sequence of bytes that can complete the input partial character in
1247such a way that a non-prohibited character is formed, the function returns
2717076a
KW
1248TRUE; otherwise FALSE. Non character code points cannot be determined based on
1249partial character input. But many of the other possible excluded types can be
f1c999a7
KW
1250determined from just the first one or two bytes.
1251
1252=cut
1253 */
1254
56e4cf64 1255PERL_STATIC_INLINE bool
f1c999a7 1256S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
4dab108f 1257{
f1c999a7 1258 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
4dab108f 1259
f1c999a7 1260 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1261 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 1262
8875bd48 1263 if (s >= e || s + UTF8SKIP(s) <= e) {
4dab108f
KW
1264 return FALSE;
1265 }
1266
f1c999a7 1267 return cBOOL(_is_utf8_char_helper(s, e, flags));
4dab108f
KW
1268}
1269
8bc127bf
KW
1270/*
1271
1272=for apidoc is_utf8_fixed_width_buf_flags
1273
1274Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1275is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1276otherwise it returns FALSE.
1277
1278If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1279without restriction. If the final few bytes of the buffer do not form a
1280complete code point, this will return TRUE anyway, provided that
1281C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1282
1283If C<flags> in non-zero, it can be any combination of the
1284C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1285same meanings.
1286
1287This function differs from C<L</is_utf8_string_flags>> only in that the latter
1288returns FALSE if the final few bytes of the string don't form a complete code
1289point.
1290
1291=cut
1292 */
1293#define is_utf8_fixed_width_buf_flags(s, len, flags) \
1294 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1295
1296/*
1297
1298=for apidoc is_utf8_fixed_width_buf_loc_flags
1299
1300Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1301failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1302to the beginning of any partial character at the end of the buffer; if there is
1303no partial character C<*ep> will contain C<s>+C<len>.
1304
1305See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1306
1307=cut
1308*/
1309
1310#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1311 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1312
1313/*
1314
1315=for apidoc is_utf8_fixed_width_buf_loclen_flags
1316
1317Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1318complete, valid characters found in the C<el> pointer.
1319
1320=cut
1321*/
1322
1323PERL_STATIC_INLINE bool
1324S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
33756530 1325 STRLEN len,
8bc127bf
KW
1326 const U8 **ep,
1327 STRLEN *el,
1328 const U32 flags)
1329{
1330 const U8 * maybe_partial;
1331
1332 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1333
1334 if (! ep) {
1335 ep = &maybe_partial;
1336 }
1337
1338 /* If it's entirely valid, return that; otherwise see if the only error is
1339 * that the final few bytes are for a partial character */
1340 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1341 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1342}
1343
c8028aa6
TC
1344/* ------------------------------- perl.h ----------------------------- */
1345
1346/*
dcccc8ff
KW
1347=head1 Miscellaneous Functions
1348
41188aa0 1349=for apidoc AiR|bool|is_safe_syscall|const char *pv|STRLEN len|const char *what|const char *op_name
c8028aa6 1350
6602b933 1351Test that the given C<pv> doesn't contain any internal C<NUL> characters.
796b6530 1352If it does, set C<errno> to C<ENOENT>, optionally warn, and return FALSE.
c8028aa6
TC
1353
1354Return TRUE if the name is safe.
1355
796b6530 1356Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
1357
1358=cut
1359*/
1360
1361PERL_STATIC_INLINE bool
41188aa0 1362S_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name) {
c8028aa6
TC
1363 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1364 * perl itself uses xce*() functions which accept 8-bit strings.
1365 */
1366
1367 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1368
6c4650b3 1369 if (len > 1) {
c8028aa6 1370 char *null_at;
41188aa0 1371 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 1372 SETERRNO(ENOENT, LIB_INVARG);
1d505182 1373 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 1374 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 1375 what, op_name, pv, null_at+1);
c8028aa6
TC
1376 return FALSE;
1377 }
1378 }
1379
1380 return TRUE;
1381}
1382
1383/*
7cb3f959
TC
1384
1385Return true if the supplied filename has a newline character
fa6c7d00 1386immediately before the first (hopefully only) NUL.
7cb3f959
TC
1387
1388My original look at this incorrectly used the len from SvPV(), but
1389that's incorrect, since we allow for a NUL in pv[len-1].
1390
1391So instead, strlen() and work from there.
1392
1393This allow for the user reading a filename, forgetting to chomp it,
1394then calling:
1395
1396 open my $foo, "$file\0";
1397
1398*/
1399
1400#ifdef PERL_CORE
1401
1402PERL_STATIC_INLINE bool
1403S_should_warn_nl(const char *pv) {
1404 STRLEN len;
1405
1406 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1407
1408 len = strlen(pv);
1409
1410 return len > 0 && pv[len-1] == '\n';
1411}
1412
1413#endif
1414
81d52ecd
JH
1415/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
1416
1417#define MAX_CHARSET_NAME_LENGTH 2
1418
1419PERL_STATIC_INLINE const char *
1420get_regex_charset_name(const U32 flags, STRLEN* const lenp)
1421{
1422 /* Returns a string that corresponds to the name of the regex character set
1423 * given by 'flags', and *lenp is set the length of that string, which
1424 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
1425
1426 *lenp = 1;
1427 switch (get_regex_charset(flags)) {
1428 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
1429 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
1430 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
1431 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
1432 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
1433 *lenp = 2;
1434 return ASCII_MORE_RESTRICT_PAT_MODS;
1435 }
1436 /* The NOT_REACHED; hides an assert() which has a rather complex
1437 * definition in perl.h. */
1438 NOT_REACHED; /* NOTREACHED */
1439 return "?"; /* Unknown */
1440}
1441
7cb3f959 1442/*
ed382232
TC
1443
1444Return false if any get magic is on the SV other than taint magic.
1445
1446*/
1447
1448PERL_STATIC_INLINE bool
1449S_sv_only_taint_gmagic(SV *sv) {
1450 MAGIC *mg = SvMAGIC(sv);
1451
1452 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
1453
1454 while (mg) {
1455 if (mg->mg_type != PERL_MAGIC_taint
1456 && !(mg->mg_flags & MGf_GSKIP)
1457 && mg->mg_virtual->svt_get) {
1458 return FALSE;
1459 }
1460 mg = mg->mg_moremagic;
1461 }
1462
1463 return TRUE;
1464}
1465
ed8ff0f3
DM
1466/* ------------------ cop.h ------------------------------------------- */
1467
1468
1469/* Enter a block. Push a new base context and return its address. */
1470
1471PERL_STATIC_INLINE PERL_CONTEXT *
1472S_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
1473{
1474 PERL_CONTEXT * cx;
1475
1476 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
1477
1478 CXINC;
1479 cx = CX_CUR();
1480 cx->cx_type = type;
1481 cx->blk_gimme = gimme;
1482 cx->blk_oldsaveix = saveix;
4caf7d8c 1483 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 1484 cx->blk_oldcop = PL_curcop;
4caf7d8c 1485 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
1486 cx->blk_oldscopesp = PL_scopestack_ix;
1487 cx->blk_oldpm = PL_curpm;
ce8bb8d8 1488 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
1489
1490 PL_tmps_floor = PL_tmps_ix;
1491 CX_DEBUG(cx, "PUSH");
1492 return cx;
1493}
1494
1495
1496/* Exit a block (RETURN and LAST). */
1497
1498PERL_STATIC_INLINE void
1499S_cx_popblock(pTHX_ PERL_CONTEXT *cx)
1500{
1501 PERL_ARGS_ASSERT_CX_POPBLOCK;
1502
1503 CX_DEBUG(cx, "POP");
1504 /* these 3 are common to cx_popblock and cx_topblock */
1505 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
1506 PL_scopestack_ix = cx->blk_oldscopesp;
1507 PL_curpm = cx->blk_oldpm;
1508
1509 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
1510 * and leaves a CX entry lying around for repeated use, so
1511 * skip for multicall */ \
1512 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
1513 || PL_savestack_ix == cx->blk_oldsaveix);
1514 PL_curcop = cx->blk_oldcop;
ce8bb8d8 1515 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
1516}
1517
1518/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
1519 * Whereas cx_popblock() restores the state to the point just before
1520 * cx_pushblock() was called, cx_topblock() restores it to the point just
1521 * *after* cx_pushblock() was called. */
1522
1523PERL_STATIC_INLINE void
1524S_cx_topblock(pTHX_ PERL_CONTEXT *cx)
1525{
1526 PERL_ARGS_ASSERT_CX_TOPBLOCK;
1527
1528 CX_DEBUG(cx, "TOP");
1529 /* these 3 are common to cx_popblock and cx_topblock */
1530 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
1531 PL_scopestack_ix = cx->blk_oldscopesp;
1532 PL_curpm = cx->blk_oldpm;
1533
1534 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
1535}
1536
1537
a73d8813
DM
1538PERL_STATIC_INLINE void
1539S_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
1540{
1541 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
1542
1543 PERL_ARGS_ASSERT_CX_PUSHSUB;
1544
3f6bd23a 1545 PERL_DTRACE_PROBE_ENTRY(cv);
a73d8813
DM
1546 cx->blk_sub.cv = cv;
1547 cx->blk_sub.olddepth = CvDEPTH(cv);
1548 cx->blk_sub.prevcomppad = PL_comppad;
1549 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
1550 cx->blk_sub.retop = retop;
1551 SvREFCNT_inc_simple_void_NN(cv);
1552 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
1553}
1554
1555
1556/* subsets of cx_popsub() */
1557
1558PERL_STATIC_INLINE void
1559S_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
1560{
1561 CV *cv;
1562
1563 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
1564 assert(CxTYPE(cx) == CXt_SUB);
1565
1566 PL_comppad = cx->blk_sub.prevcomppad;
1567 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
1568 cv = cx->blk_sub.cv;
1569 CvDEPTH(cv) = cx->blk_sub.olddepth;
1570 cx->blk_sub.cv = NULL;
1571 SvREFCNT_dec(cv);
1572}
1573
1574
1575/* handle the @_ part of leaving a sub */
1576
1577PERL_STATIC_INLINE void
1578S_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
1579{
1580 AV *av;
1581
1582 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
1583 assert(CxTYPE(cx) == CXt_SUB);
1584 assert(AvARRAY(MUTABLE_AV(
1585 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
1586 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
1587
1588 CX_POP_SAVEARRAY(cx);
1589 av = MUTABLE_AV(PAD_SVl(0));
1590 if (UNLIKELY(AvREAL(av)))
1591 /* abandon @_ if it got reified */
1592 clear_defarray(av, 0);
1593 else {
1594 CLEAR_ARGARRAY(av);
1595 }
1596}
1597
1598
1599PERL_STATIC_INLINE void
1600S_cx_popsub(pTHX_ PERL_CONTEXT *cx)
1601{
1602 PERL_ARGS_ASSERT_CX_POPSUB;
1603 assert(CxTYPE(cx) == CXt_SUB);
1604
3f6bd23a 1605 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
1606
1607 if (CxHASARGS(cx))
1608 cx_popsub_args(cx);
1609 cx_popsub_common(cx);
1610}
1611
1612
6a7d52cc
DM
1613PERL_STATIC_INLINE void
1614S_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
1615{
1616 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
1617
1618 cx->blk_format.cv = cv;
1619 cx->blk_format.retop = retop;
1620 cx->blk_format.gv = gv;
1621 cx->blk_format.dfoutgv = PL_defoutgv;
1622 cx->blk_format.prevcomppad = PL_comppad;
1623 cx->blk_u16 = 0;
1624
1625 SvREFCNT_inc_simple_void_NN(cv);
1626 CvDEPTH(cv)++;
1627 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
1628}
1629
1630
1631PERL_STATIC_INLINE void
1632S_cx_popformat(pTHX_ PERL_CONTEXT *cx)
1633{
1634 CV *cv;
1635 GV *dfout;
1636
1637 PERL_ARGS_ASSERT_CX_POPFORMAT;
1638 assert(CxTYPE(cx) == CXt_FORMAT);
1639
1640 dfout = cx->blk_format.dfoutgv;
1641 setdefout(dfout);
1642 cx->blk_format.dfoutgv = NULL;
1643 SvREFCNT_dec_NN(dfout);
1644
1645 PL_comppad = cx->blk_format.prevcomppad;
1646 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
1647 cv = cx->blk_format.cv;
1648 cx->blk_format.cv = NULL;
1649 --CvDEPTH(cv);
1650 SvREFCNT_dec_NN(cv);
1651}
1652
1653
13febba5
DM
1654PERL_STATIC_INLINE void
1655S_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
1656{
1657 PERL_ARGS_ASSERT_CX_PUSHEVAL;
1658
1659 cx->blk_eval.retop = retop;
1660 cx->blk_eval.old_namesv = namesv;
1661 cx->blk_eval.old_eval_root = PL_eval_root;
1662 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
1663 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
1664 cx->blk_eval.cur_top_env = PL_top_env;
1665
4c57ced5 1666 assert(!(PL_in_eval & ~ 0x3F));
13febba5 1667 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 1668 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
1669}
1670
1671
1672PERL_STATIC_INLINE void
1673S_cx_popeval(pTHX_ PERL_CONTEXT *cx)
1674{
1675 SV *sv;
1676
1677 PERL_ARGS_ASSERT_CX_POPEVAL;
1678 assert(CxTYPE(cx) == CXt_EVAL);
1679
1680 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 1681 assert(!(PL_in_eval & 0xc0));
13febba5
DM
1682 PL_eval_root = cx->blk_eval.old_eval_root;
1683 sv = cx->blk_eval.cur_text;
4c57ced5 1684 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
1685 cx->blk_eval.cur_text = NULL;
1686 SvREFCNT_dec_NN(sv);
1687 }
1688
1689 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
1690 if (sv) {
1691 cx->blk_eval.old_namesv = NULL;
1692 SvREFCNT_dec_NN(sv);
1693 }
13febba5 1694}
6a7d52cc 1695
a73d8813 1696
d1b6bf72
DM
1697/* push a plain loop, i.e.
1698 * { block }
1699 * while (cond) { block }
1700 * for (init;cond;continue) { block }
1701 * This loop can be last/redo'ed etc.
1702 */
1703
1704PERL_STATIC_INLINE void
1705S_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
1706{
1707 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
1708 cx->blk_loop.my_op = cLOOP;
1709}
1710
1711
1712/* push a true for loop, i.e.
1713 * for var (list) { block }
1714 */
1715
1716PERL_STATIC_INLINE void
1717S_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
1718{
1719 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
1720
1721 /* this one line is common with cx_pushloop_plain */
1722 cx->blk_loop.my_op = cLOOP;
1723
1724 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
1725 cx->blk_loop.itersave = itersave;
1726#ifdef USE_ITHREADS
1727 cx->blk_loop.oldcomppad = PL_comppad;
1728#endif
1729}
1730
1731
1732/* pop all loop types, including plain */
1733
1734PERL_STATIC_INLINE void
1735S_cx_poploop(pTHX_ PERL_CONTEXT *cx)
1736{
1737 PERL_ARGS_ASSERT_CX_POPLOOP;
1738
1739 assert(CxTYPE_is_LOOP(cx));
1740 if ( CxTYPE(cx) == CXt_LOOP_ARY
1741 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
1742 {
1743 /* Free ary or cur. This assumes that state_u.ary.ary
1744 * aligns with state_u.lazysv.cur. See cx_dup() */
1745 SV *sv = cx->blk_loop.state_u.lazysv.cur;
1746 cx->blk_loop.state_u.lazysv.cur = NULL;
1747 SvREFCNT_dec_NN(sv);
1748 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
1749 sv = cx->blk_loop.state_u.lazysv.end;
1750 cx->blk_loop.state_u.lazysv.end = NULL;
1751 SvREFCNT_dec_NN(sv);
1752 }
1753 }
1754 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
1755 SV *cursv;
1756 SV **svp = (cx)->blk_loop.itervar_u.svp;
1757 if ((cx->cx_type & CXp_FOR_GV))
1758 svp = &GvSV((GV*)svp);
1759 cursv = *svp;
1760 *svp = cx->blk_loop.itersave;
1761 cx->blk_loop.itersave = NULL;
1762 SvREFCNT_dec(cursv);
1763 }
1764}
1765
2a7b7c61
DM
1766
1767PERL_STATIC_INLINE void
1768S_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
1769{
1770 PERL_ARGS_ASSERT_CX_PUSHWHEN;
1771
1772 cx->blk_givwhen.leave_op = cLOGOP->op_other;
1773}
1774
1775
1776PERL_STATIC_INLINE void
1777S_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
1778{
1779 PERL_ARGS_ASSERT_CX_POPWHEN;
1780 assert(CxTYPE(cx) == CXt_WHEN);
1781
1782 PERL_UNUSED_ARG(cx);
59a14f30 1783 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
1784 /* currently NOOP */
1785}
1786
1787
1788PERL_STATIC_INLINE void
1789S_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
1790{
1791 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
1792
1793 cx->blk_givwhen.leave_op = cLOGOP->op_other;
1794 cx->blk_givwhen.defsv_save = orig_defsv;
1795}
1796
1797
1798PERL_STATIC_INLINE void
1799S_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
1800{
1801 SV *sv;
1802
1803 PERL_ARGS_ASSERT_CX_POPGIVEN;
1804 assert(CxTYPE(cx) == CXt_GIVEN);
1805
1806 sv = GvSV(PL_defgv);
1807 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
1808 cx->blk_givwhen.defsv_save = NULL;
1809 SvREFCNT_dec(sv);
1810}
1811
ec2c235b
KW
1812/* ------------------ util.h ------------------------------------------- */
1813
1814/*
1815=head1 Miscellaneous Functions
1816
1817=for apidoc foldEQ
1818
1819Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
1820same
1821case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
1822match themselves and their opposite case counterparts. Non-cased and non-ASCII
1823range bytes match only themselves.
1824
1825=cut
1826*/
1827
1828PERL_STATIC_INLINE I32
1829Perl_foldEQ(const char *s1, const char *s2, I32 len)
1830{
1831 const U8 *a = (const U8 *)s1;
1832 const U8 *b = (const U8 *)s2;
1833
1834 PERL_ARGS_ASSERT_FOLDEQ;
1835
1836 assert(len >= 0);
1837
1838 while (len--) {
1839 if (*a != *b && *a != PL_fold[*b])
1840 return 0;
1841 a++,b++;
1842 }
1843 return 1;
1844}
1845
0f9cb40c 1846PERL_STATIC_INLINE I32
ec2c235b
KW
1847Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
1848{
1849 /* Compare non-utf8 using Unicode (Latin1) semantics. Does not work on
1850 * MICRO_SIGN, LATIN_SMALL_LETTER_SHARP_S, nor
1851 * LATIN_SMALL_LETTER_Y_WITH_DIAERESIS, and does not check for these. Nor
1852 * does it check that the strings each have at least 'len' characters */
1853
1854 const U8 *a = (const U8 *)s1;
1855 const U8 *b = (const U8 *)s2;
1856
1857 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
1858
1859 assert(len >= 0);
1860
1861 while (len--) {
1862 if (*a != *b && *a != PL_fold_latin1[*b]) {
1863 return 0;
1864 }
1865 a++, b++;
1866 }
1867 return 1;
1868}
1869
1870/*
1871=for apidoc foldEQ_locale
1872
1873Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
1874same case-insensitively in the current locale; false otherwise.
1875
1876=cut
1877*/
1878
0f9cb40c 1879PERL_STATIC_INLINE I32
ec2c235b
KW
1880Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
1881{
1882 dVAR;
1883 const U8 *a = (const U8 *)s1;
1884 const U8 *b = (const U8 *)s2;
1885
1886 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
1887
1888 assert(len >= 0);
1889
1890 while (len--) {
1891 if (*a != *b && *a != PL_fold_locale[*b])
1892 return 0;
1893 a++,b++;
1894 }
1895 return 1;
1896}
1897
6dba01e2
KW
1898#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
1899
1900PERL_STATIC_INLINE void *
1901S_my_memrchr(const char * s, const char c, const STRLEN len)
1902{
1903 /* memrchr(), since many platforms lack it */
1904
1905 const char * t = s + len - 1;
1906
1907 PERL_ARGS_ASSERT_MY_MEMRCHR;
1908
1909 while (t >= s) {
1910 if (*t == c) {
1911 return (void *) t;
1912 }
1913 t--;
1914 }
1915
1916 return NULL;
1917}
1918
1919#endif
1920
ed382232 1921/*
c8028aa6
TC
1922 * ex: set ts=8 sts=4 sw=4 et:
1923 */