perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* inline.h
	2	*
	3	* Copyright (C) 2012 by Larry Wall and others
	4	*
	5	* You may distribute under the terms of either the GNU General Public
	6	* License or the Artistic License, as specified in the README file.
	7	*
	8	* This file contains tables and code adapted from
	9	* https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
	10	* copyright notice:
	11
	12	Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
	13
	14	Permission is hereby granted, free of charge, to any person obtaining a copy of
	15	this software and associated documentation files (the "Software"), to deal in
	16	the Software without restriction, including without limitation the rights to
	17	use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
	18	of the Software, and to permit persons to whom the Software is furnished to do
	19	so, subject to the following conditions:
	20
	21	The above copyright notice and this permission notice shall be included in all
	22	copies or substantial portions of the Software.
	23
	24	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	25	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	26	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	27	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	28	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	29	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	30	SOFTWARE.
	31
	32	*
	33	* This file is a home for static inline functions that cannot go in other
	34	* header files, because they depend on proto.h (included after most other
	35	* headers) or struct definitions.
	36	*
	37	* Each section names the header file that the functions "belong" to.
	38	*/
	39
	40	/* ------------------------------- av.h ------------------------------- */
	41
	42	PERL_STATIC_INLINE SSize_t
	43	Perl_av_top_index(pTHX_ AV *av)
	44	{
	45	PERL_ARGS_ASSERT_AV_TOP_INDEX;
	46	assert(SvTYPE(av) == SVt_PVAV);
	47
	48	return AvFILL(av);
	49	}
	50
	51	/* ------------------------------- cv.h ------------------------------- */
	52
	53	PERL_STATIC_INLINE GV *
	54	Perl_CvGV(pTHX_ CV *sv)
	55	{
	56	PERL_ARGS_ASSERT_CVGV;
	57
	58	return CvNAMED(sv)
	59	? Perl_cvgv_from_hek(aTHX_ sv)
	60	: ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
	61	}
	62
	63	PERL_STATIC_INLINE I32 *
	64	Perl_CvDEPTH(const CV * const sv)
	65	{
	66	PERL_ARGS_ASSERT_CVDEPTH;
	67	assert(SvTYPE(sv) == SVt_PVCV \|\| SvTYPE(sv) == SVt_PVFM);
	68
	69	return &((XPVCV*)SvANY(sv))->xcv_depth;
	70	}
	71
	72	/*
	73	CvPROTO returns the prototype as stored, which is not necessarily what
	74	the interpreter should be using. Specifically, the interpreter assumes
	75	that spaces have been stripped, which has been the case if the prototype
	76	was added by toke.c, but is generally not the case if it was added elsewhere.
	77	Since we can't enforce the spacelessness at assignment time, this routine
	78	provides a temporary copy at parse time with spaces removed.
	79	I<orig> is the start of the original buffer, I<len> is the length of the
	80	prototype and will be updated when this returns.
	81	*/
	82
	83	#ifdef PERL_CORE
	84	PERL_STATIC_INLINE char *
	85	S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
	86	{
	87	SV * tmpsv;
	88	char * tmps;
	89	tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
	90	tmps = SvPVX(tmpsv);
	91	while ((*len)--) {
	92	if (!isSPACE(*orig))
	93	tmps++ = orig;
	94	orig++;
	95	}
	96	*tmps = '\0';
	97	*len = tmps - SvPVX(tmpsv);
	98	return SvPVX(tmpsv);
	99	}
	100	#endif
	101
	102	/* ------------------------------- mg.h ------------------------------- */
	103
	104	#if defined(PERL_CORE) \|\| defined(PERL_EXT)
	105	/* assumes get-magic and stringification have already occurred */
	106	PERL_STATIC_INLINE STRLEN
	107	S_MgBYTEPOS(pTHX_ MAGIC mg, SV sv, const char *s, STRLEN len)
	108	{
	109	assert(mg->mg_type == PERL_MAGIC_regex_global);
	110	assert(mg->mg_len != -1);
	111	if (mg->mg_flags & MGf_BYTES \|\| !DO_UTF8(sv))
	112	return (STRLEN)mg->mg_len;
	113	else {
	114	const STRLEN pos = (STRLEN)mg->mg_len;
	115	/* Without this check, we may read past the end of the buffer: */
	116	if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
	117	return sv_or_pv_pos_u2b(sv, s, pos, NULL);
	118	}
	119	}
	120	#endif
	121
	122	/* ------------------------------- pad.h ------------------------------ */
	123
	124	#if defined(PERL_IN_PAD_C) \|\| defined(PERL_IN_OP_C)
	125	PERL_STATIC_INLINE bool
	126	S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
	127	{
	128	PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
	129
	130	/* is seq within the range _LOW to _HIGH ?
	131	* This is complicated by the fact that PL_cop_seqmax
	132	* may have wrapped around at some point */
	133	if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
	134	return FALSE; /* not yet introduced */
	135
	136	if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
	137	/* in compiling scope */
	138	if (
	139	(seq > COP_SEQ_RANGE_LOW(pn))
	140	? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
	141	: (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
	142	)
	143	return TRUE;
	144	}
	145	else if (
	146	(COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
	147	?
	148	( seq > COP_SEQ_RANGE_LOW(pn)
	149	\|\| seq <= COP_SEQ_RANGE_HIGH(pn))
	150
	151	: ( seq > COP_SEQ_RANGE_LOW(pn)
	152	&& seq <= COP_SEQ_RANGE_HIGH(pn))
	153	)
	154	return TRUE;
	155	return FALSE;
	156	}
	157	#endif
	158
	159	/* ------------------------------- pp.h ------------------------------- */
	160
	161	PERL_STATIC_INLINE I32
	162	Perl_TOPMARK(pTHX)
	163	{
	164	DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
	165	"MARK top %p %" IVdf "\n",
	166	PL_markstack_ptr,
	167	(IV)*PL_markstack_ptr)));
	168	return *PL_markstack_ptr;
	169	}
	170
	171	PERL_STATIC_INLINE I32
	172	Perl_POPMARK(pTHX)
	173	{
	174	DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
	175	"MARK pop %p %" IVdf "\n",
	176	(PL_markstack_ptr-1),
	177	(IV)*(PL_markstack_ptr-1))));
	178	assert((PL_markstack_ptr > PL_markstack) \|\| !"MARK underflow");
	179	return *PL_markstack_ptr--;
	180	}
	181
	182	/* ----------------------------- regexp.h ----------------------------- */
	183
	184	PERL_STATIC_INLINE struct regexp *
	185	Perl_ReANY(const REGEXP * const re)
	186	{
	187	XPV* const p = (XPV*)SvANY(re);
	188
	189	PERL_ARGS_ASSERT_REANY;
	190	assert(isREGEXP(re));
	191
	192	return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
	193	: (struct regexp *)p;
	194	}
	195
	196	/* ------------------------------- sv.h ------------------------------- */
	197
	198	PERL_STATIC_INLINE bool
	199	Perl_SvTRUE(pTHX_ SV *sv) {
	200	if (!LIKELY(sv))
	201	return FALSE;
	202	SvGETMAGIC(sv);
	203	return SvTRUE_nomg_NN(sv);
	204	}
	205
	206	PERL_STATIC_INLINE SV *
	207	Perl_SvREFCNT_inc(SV *sv)
	208	{
	209	if (LIKELY(sv != NULL))
	210	SvREFCNT(sv)++;
	211	return sv;
	212	}
	213	PERL_STATIC_INLINE SV *
	214	Perl_SvREFCNT_inc_NN(SV *sv)
	215	{
	216	PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
	217
	218	SvREFCNT(sv)++;
	219	return sv;
	220	}
	221	PERL_STATIC_INLINE void
	222	Perl_SvREFCNT_inc_void(SV *sv)
	223	{
	224	if (LIKELY(sv != NULL))
	225	SvREFCNT(sv)++;
	226	}
	227	PERL_STATIC_INLINE void
	228	Perl_SvREFCNT_dec(pTHX_ SV *sv)
	229	{
	230	if (LIKELY(sv != NULL)) {
	231	U32 rc = SvREFCNT(sv);
	232	if (LIKELY(rc > 1))
	233	SvREFCNT(sv) = rc - 1;
	234	else
	235	Perl_sv_free2(aTHX_ sv, rc);
	236	}
	237	}
	238
	239	PERL_STATIC_INLINE void
	240	Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
	241	{
	242	U32 rc = SvREFCNT(sv);
	243
	244	PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
	245
	246	if (LIKELY(rc > 1))
	247	SvREFCNT(sv) = rc - 1;
	248	else
	249	Perl_sv_free2(aTHX_ sv, rc);
	250	}
	251
	252	PERL_STATIC_INLINE void
	253	Perl_SvAMAGIC_on(SV *sv)
	254	{
	255	PERL_ARGS_ASSERT_SVAMAGIC_ON;
	256	assert(SvROK(sv));
	257
	258	if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
	259	}
	260	PERL_STATIC_INLINE void
	261	Perl_SvAMAGIC_off(SV *sv)
	262	{
	263	PERL_ARGS_ASSERT_SVAMAGIC_OFF;
	264
	265	if (SvROK(sv) && SvOBJECT(SvRV(sv)))
	266	HvAMAGIC_off(SvSTASH(SvRV(sv)));
	267	}
	268
	269	PERL_STATIC_INLINE U32
	270	Perl_SvPADSTALE_on(SV *sv)
	271	{
	272	assert(!(SvFLAGS(sv) & SVs_PADTMP));
	273	return SvFLAGS(sv) \|= SVs_PADSTALE;
	274	}
	275	PERL_STATIC_INLINE U32
	276	Perl_SvPADSTALE_off(SV *sv)
	277	{
	278	assert(!(SvFLAGS(sv) & SVs_PADTMP));
	279	return SvFLAGS(sv) &= ~SVs_PADSTALE;
	280	}
	281	#if defined(PERL_CORE) \|\| defined (PERL_EXT)
	282	PERL_STATIC_INLINE STRLEN
	283	S_sv_or_pv_pos_u2b(pTHX_ SV sv, const char pv, STRLEN pos, STRLEN *lenp)
	284	{
	285	PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
	286	if (SvGAMAGIC(sv)) {
	287	U8 hopped = utf8_hop((U8 )pv, pos);
	288	if (lenp) lenp = (STRLEN)(utf8_hop(hopped, lenp) - hopped);
	289	return (STRLEN)(hopped - (U8 *)pv);
	290	}
	291	return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
	292	}
	293	#endif
	294
	295	/* ------------------------------- utf8.h ------------------------------- */
	296
	297	/*
	298	=head1 Unicode Support
	299	*/
	300
	301	PERL_STATIC_INLINE void
	302	Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
	303	{
	304	/* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
	305	* encoded string at 'dest', updating 'dest' to include it */
	306
	307	PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
	308
	309	if (NATIVE_BYTE_IS_INVARIANT(byte))
	310	((dest)++) = byte;
	311	else {
	312	((dest)++) = UTF8_EIGHT_BIT_HI(byte);
	313	((dest)++) = UTF8_EIGHT_BIT_LO(byte);
	314	}
	315	}
	316
	317	/*
	318	=for apidoc valid_utf8_to_uvchr
	319	Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
	320	known that the next character in the input UTF-8 string C<s> is well-formed
	321	(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
	322	points, and non-Unicode code points are allowed.
	323
	324	=cut
	325
	326	*/
	327
	328	PERL_STATIC_INLINE UV
	329	Perl_valid_utf8_to_uvchr(const U8 s, STRLEN retlen)
	330	{
	331	const UV expectlen = UTF8SKIP(s);
	332	const U8* send = s + expectlen;
	333	UV uv = *s;
	334
	335	PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
	336
	337	if (retlen) {
	338	*retlen = expectlen;
	339	}
	340
	341	/* An invariant is trivially returned */
	342	if (expectlen == 1) {
	343	return uv;
	344	}
	345
	346	/* Remove the leading bits that indicate the number of bytes, leaving just
	347	* the bits that are part of the value */
	348	uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
	349
	350	/* Now, loop through the remaining bytes, accumulating each into the
	351	* working total as we go. (I khw tried unrolling the loop for up to 4
	352	* bytes, but there was no performance improvement) */
	353	for (++s; s < send; s++) {
	354	uv = UTF8_ACCUMULATE(uv, *s);
	355	}
	356
	357	return UNI_TO_NATIVE(uv);
	358
	359	}
	360
	361	/*
	362	=for apidoc is_utf8_invariant_string
	363
	364	Returns TRUE if the first C<len> bytes of the string C<s> are the same
	365	regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
	366	EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
	367	are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
	368	the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
	369	characters are invariant, but so also are the C1 controls.
	370
	371	If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
	372	use this option, that C<s> can't have embedded C<NUL> characters and has to
	373	have a terminating C<NUL> byte).
	374
	375	See also
	376	C<L</is_utf8_string>>,
	377	C<L</is_utf8_string_flags>>,
	378	C<L</is_utf8_string_loc>>,
	379	C<L</is_utf8_string_loc_flags>>,
	380	C<L</is_utf8_string_loclen>>,
	381	C<L</is_utf8_string_loclen_flags>>,
	382	C<L</is_utf8_fixed_width_buf_flags>>,
	383	C<L</is_utf8_fixed_width_buf_loc_flags>>,
	384	C<L</is_utf8_fixed_width_buf_loclen_flags>>,
	385	C<L</is_strict_utf8_string>>,
	386	C<L</is_strict_utf8_string_loc>>,
	387	C<L</is_strict_utf8_string_loclen>>,
	388	C<L</is_c9strict_utf8_string>>,
	389	C<L</is_c9strict_utf8_string_loc>>,
	390	and
	391	C<L</is_c9strict_utf8_string_loclen>>.
	392
	393	=cut
	394
	395	*/
	396
	397	#define is_utf8_invariant_string(s, len) \
	398	is_utf8_invariant_string_loc(s, len, NULL)
	399
	400	/*
	401	=for apidoc is_utf8_invariant_string_loc
	402
	403	Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
	404	the first UTF-8 variant character in the C<ep> pointer; if all characters are
	405	UTF-8 invariant, this function does not change the contents of C<*ep>.
	406
	407	=cut
	408
	409	*/
	410
	411	PERL_STATIC_INLINE bool
	412	Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
	413	{
	414	const U8* send;
	415	const U8* x = s;
	416
	417	PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
	418
	419	if (len == 0) {
	420	len = strlen((const char *)s);
	421	}
	422
	423	send = s + len;
	424
	425	/* This looks like 0x010101... */
	426	# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
	427
	428	/* This looks like 0x808080... */
	429	# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
	430	# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
	431	# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
	432
	433	/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
	434	* or'ing together the lowest bits of 'x'. Hopefully the final term gets
	435	* optimized out completely on a 32-bit system, and its mask gets optimized out
	436	* on a 64-bit system */
	437	# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
	438	\| ( PTR2nat(x) >> 1) \
	439	\| ( ( (PTR2nat(x) \
	440	& PERL_WORD_BOUNDARY_MASK) >> 2))))
	441
	442	#ifndef EBCDIC
	443
	444	/* Do the word-at-a-time iff there is at least one usable full word. That
	445	* means that after advancing to a word boundary, there still is at least a
	446	* full word left. The number of bytes needed to advance is 'wordsize -
	447	* offset' unless offset is 0. */
	448	if ((STRLEN) (send - x) >= PERL_WORDSIZE
	449
	450	/* This term is wordsize if subword; 0 if not */
	451	+ PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
	452
	453	/* 'offset' */
	454	- (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
	455	{
	456
	457	/* Process per-byte until reach word boundary. XXX This loop could be
	458	* eliminated if we knew that this platform had fast unaligned reads */
	459	while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
	460	if (! UTF8_IS_INVARIANT(*x)) {
	461	if (ep) {
	462	*ep = x;
	463	}
	464
	465	return FALSE;
	466	}
	467	x++;
	468	}
	469
	470	/* Here, we know we have at least one full word to process. Process
	471	* per-word as long as we have at least a full word left */
	472	do {
	473	if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
	474
	475	/* Found a variant. Just return if caller doesn't want its
	476	* exact position */
	477	if (! ep) {
	478	return FALSE;
	479	}
	480
	481	# if BYTEORDER == 0x1234 \|\| BYTEORDER == 0x12345678 \
	482	\|\| BYTEORDER == 0x4321 \|\| BYTEORDER == 0x87654321
	483
	484	ep = x + variant_byte_number( (PERL_UINTMAX_T *) x);
	485	assert(ep >= s && ep < send);
	486
	487	return FALSE;
	488
	489	# else /* If weird byte order, drop into next loop to do byte-at-a-time
	490	checks. */
	491
	492	break;
	493	# endif
	494	}
	495
	496	x += PERL_WORDSIZE;
	497
	498	} while (x + PERL_WORDSIZE <= send);
	499	}
	500
	501	#endif /* End of ! EBCDIC */
	502
	503	/* Process per-byte */
	504	while (x < send) {
	505	if (! UTF8_IS_INVARIANT(*x)) {
	506	if (ep) {
	507	*ep = x;
	508	}
	509
	510	return FALSE;
	511	}
	512
	513	x++;
	514	}
	515
	516	return TRUE;
	517	}
	518
	519	#ifndef EBCDIC
	520
	521	PERL_STATIC_INLINE unsigned int
	522	Perl_variant_byte_number(PERL_UINTMAX_T word)
	523	{
	524
	525	/* This returns the position in a word (0..7) of the first variant byte in
	526	* it. This is a helper function. Note that there are no branches */
	527
	528	assert(word);
	529
	530	/* Get just the msb bits of each byte */
	531	word &= PERL_VARIANTS_WORD_MASK;
	532
	533	# if BYTEORDER == 0x1234 \|\| BYTEORDER == 0x12345678
	534
	535	/* Bytes are stored like
	536	* Byte8 ... Byte2 Byte1
	537	* 63..56...15...8 7...0
	538	*
	539	* Isolate the lsb;
	540	* https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
	541	*
	542	* The word will look like this, with a rightmost set bit in position 's':
	543	* ('x's are don't cares)
	544	* s
	545	* x..x100..0
	546	* x..xx10..0 Right shift (rightmost 0 is shifted off)
	547	* x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
	548	* the 1 just to their left into a 0; the remainder is
	549	* untouched
	550	* 0..0011..1 The xor with the original, x..xx10..0, clears that
	551	* remainder, sets the bottom to all 1
	552	* 0..0100..0 Add 1 to clear the word except for the bit in 's'
	553	*
	554	* Another method is to do 'word &= -word'; but it generates a compiler
	555	* message on some platforms about taking the negative of an unsigned */
	556
	557	word >>= 1;
	558	word = 1 + (word ^ (word - 1));
	559
	560	# elif BYTEORDER == 0x4321 \|\| BYTEORDER == 0x87654321
	561
	562	/* Bytes are stored like
	563	* Byte1 Byte2 ... Byte8
	564	* 63..56 55..47 ... 7...0
	565	*
	566	* Isolate the msb; http://codeforces.com/blog/entry/10330
	567	*
	568	* Only the most significant set bit matters. Or'ing word with its right
	569	* shift of 1 makes that bit and the next one to its right both 1. Then
	570	* right shifting by 2 makes for 4 1-bits in a row. ... We end with the
	571	* msb and all to the right being 1. */
	572	word \|= word >> 1;
	573	word \|= word >> 2;
	574	word \|= word >> 4;
	575	word \|= word >> 8;
	576	word \|= word >> 16;
	577	word \|= word >> 32; /* This should get optimized out on 32-bit systems. */
	578
	579	/* Then subtracting the right shift by 1 clears all but the left-most of
	580	* the 1 bits, which is our desired result */
	581	word -= (word >> 1);
	582
	583	# else
	584	# error Unexpected byte order
	585	# endif
	586
	587	/* Here 'word' has a single bit set: the msb of the first byte in which it
	588	* is set. Calculate that position in the word. We can use this
	589	* specialized solution: https://stackoverflow.com/a/32339674/1626653,
	590	* assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
	591	* just get shifted off at compile time) */
	592	word = (word >> 7) * ((UINTMAX_C( 7) << 56) \| (UINTMAX_C(15) << 48)
	593	\| (UINTMAX_C(23) << 40) \| (UINTMAX_C(31) << 32)
	594	\| (39 << 24) \| (47 << 16)
	595	\| (55 << 8) \| (63 << 0));
	596	word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
	597
	598	/* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
	599	word = ((word + 1) >> 3) - 1;
	600
	601	# if BYTEORDER == 0x4321 \|\| BYTEORDER == 0x87654321
	602
	603	/* And invert the result */
	604	word = CHARBITS - word - 1;
	605
	606	# endif
	607
	608	return (unsigned int) word;
	609	}
	610
	611	#endif
	612	#if defined(PERL_CORE) \|\| defined(PERL_EXT)
	613
	614	/*
	615	=for apidoc variant_under_utf8_count
	616
	617	This function looks at the sequence of bytes between C<s> and C<e>, which are
	618	assumed to be encoded in ASCII/Latin1, and returns how many of them would
	619	change should the string be translated into UTF-8. Due to the nature of UTF-8,
	620	each of these would occupy two bytes instead of the single one in the input
	621	string. Thus, this function returns the precise number of bytes the string
	622	would expand by when translated to UTF-8.
	623
	624	Unlike most of the other functions that have C<utf8> in their name, the input
	625	to this function is NOT a UTF-8-encoded string. The function name is slightly
	626	I<odd> to emphasize this.
	627
	628	This function is internal to Perl because khw thinks that any XS code that
	629	would want this is probably operating too close to the internals. Presenting a
	630	valid use case could change that.
	631
	632	See also
	633	C<L<perlapi/is_utf8_invariant_string>>
	634	and
	635	C<L<perlapi/is_utf8_invariant_string_loc>>,
	636
	637	=cut
	638
	639	*/
	640
	641	PERL_STATIC_INLINE Size_t
	642	S_variant_under_utf8_count(const U8* const s, const U8* const e)
	643	{
	644	const U8* x = s;
	645	Size_t count = 0;
	646
	647	PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
	648
	649	# ifndef EBCDIC
	650
	651	/* Test if the string is long enough to use word-at-a-time. (Logic is the
	652	* same as for is_utf8_invariant_string()) */
	653	if ((STRLEN) (e - x) >= PERL_WORDSIZE
	654	+ PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
	655	- (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
	656	{
	657
	658	/* Process per-byte until reach word boundary. XXX This loop could be
	659	* eliminated if we knew that this platform had fast unaligned reads */
	660	while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
	661	count += ! UTF8_IS_INVARIANT(*x++);
	662	}
	663
	664	/* Process per-word as long as we have at least a full word left */
	665	do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
	666	explanation of how this works */
	667	PERL_UINTMAX_T increment
	668	= ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
	669	* PERL_COUNT_MULTIPLIER)
	670	>> ((PERL_WORDSIZE - 1) * CHARBITS);
	671	count += (Size_t) increment;
	672	x += PERL_WORDSIZE;
	673	} while (x + PERL_WORDSIZE <= e);
	674	}
	675
	676	# endif
	677
	678	/* Process per-byte */
	679	while (x < e) {
	680	if (! UTF8_IS_INVARIANT(*x)) {
	681	count++;
	682	}
	683
	684	x++;
	685	}
	686
	687	return count;
	688	}
	689
	690	#endif
	691
	692	#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
	693	# undef PERL_WORDSIZE
	694	# undef PERL_COUNT_MULTIPLIER
	695	# undef PERL_WORD_BOUNDARY_MASK
	696	# undef PERL_VARIANTS_WORD_MASK
	697	#endif
	698
	699	/*
	700	=for apidoc is_utf8_string
	701
	702	Returns TRUE if the first C<len> bytes of string C<s> form a valid
	703	Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
	704	be calculated using C<strlen(s)> (which means if you use this option, that C<s>
	705	can't have embedded C<NUL> characters and has to have a terminating C<NUL>
	706	byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
	707
	708	This function considers Perl's extended UTF-8 to be valid. That means that
	709	code points above Unicode, surrogates, and non-character code points are
	710	considered valid by this function. Use C<L</is_strict_utf8_string>>,
	711	C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
	712	code points are considered valid.
	713
	714	See also
	715	C<L</is_utf8_invariant_string>>,
	716	C<L</is_utf8_invariant_string_loc>>,
	717	C<L</is_utf8_string_loc>>,
	718	C<L</is_utf8_string_loclen>>,
	719	C<L</is_utf8_fixed_width_buf_flags>>,
	720	C<L</is_utf8_fixed_width_buf_loc_flags>>,
	721	C<L</is_utf8_fixed_width_buf_loclen_flags>>,
	722
	723	=cut
	724	*/
	725
	726	#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
	727
	728	#if defined(PERL_CORE) \|\| defined (PERL_EXT)
	729
	730	/*
	731	=for apidoc is_utf8_non_invariant_string
	732
	733	Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
	734	C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
	735	UTF-8; otherwise returns FALSE.
	736
	737	A TRUE return means that at least one code point represented by the sequence
	738	either is a wide character not representable as a single byte, or the
	739	representation differs depending on whether the sequence is encoded in UTF-8 or
	740	not.
	741
	742	See also
	743	C<L<perlapi/is_utf8_invariant_string>>,
	744	C<L<perlapi/is_utf8_string>>
	745
	746	=cut
	747
	748	This is commonly used to determine if a SV's UTF-8 flag should be turned on.
	749	It generally needn't be if its string is entirely UTF-8 invariant, and it
	750	shouldn't be if it otherwise contains invalid UTF-8.
	751
	752	It is an internal function because khw thinks that XS code shouldn't be working
	753	at this low a level. A valid use case could change that.
	754
	755	*/
	756
	757	PERL_STATIC_INLINE bool
	758	Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
	759	{
	760	const U8 * first_variant;
	761
	762	PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
	763
	764	if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
	765	return FALSE;
	766	}
	767
	768	return is_utf8_string(first_variant, len - (first_variant - s));
	769	}
	770
	771	#endif
	772
	773	/*
	774	=for apidoc is_strict_utf8_string
	775
	776	Returns TRUE if the first C<len> bytes of string C<s> form a valid
	777	UTF-8-encoded string that is fully interchangeable by any application using
	778	Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
	779	calculated using C<strlen(s)> (which means if you use this option, that C<s>
	780	can't have embedded C<NUL> characters and has to have a terminating C<NUL>
	781	byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
	782
	783	This function returns FALSE for strings containing any
	784	code points above the Unicode max of 0x10FFFF, surrogate code points, or
	785	non-character code points.
	786
	787	See also
	788	C<L</is_utf8_invariant_string>>,
	789	C<L</is_utf8_invariant_string_loc>>,
	790	C<L</is_utf8_string>>,
	791	C<L</is_utf8_string_flags>>,
	792	C<L</is_utf8_string_loc>>,
	793	C<L</is_utf8_string_loc_flags>>,
	794	C<L</is_utf8_string_loclen>>,
	795	C<L</is_utf8_string_loclen_flags>>,
	796	C<L</is_utf8_fixed_width_buf_flags>>,
	797	C<L</is_utf8_fixed_width_buf_loc_flags>>,
	798	C<L</is_utf8_fixed_width_buf_loclen_flags>>,
	799	C<L</is_strict_utf8_string_loc>>,
	800	C<L</is_strict_utf8_string_loclen>>,
	801	C<L</is_c9strict_utf8_string>>,
	802	C<L</is_c9strict_utf8_string_loc>>,
	803	and
	804	C<L</is_c9strict_utf8_string_loclen>>.
	805
	806	=cut
	807	*/
	808
	809	#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
	810
	811	/*
	812	=for apidoc is_c9strict_utf8_string
	813
	814	Returns TRUE if the first C<len> bytes of string C<s> form a valid
	815	UTF-8-encoded string that conforms to
	816	L<Unicode Corrigendum #9\|http://www.unicode.org/versions/corrigendum9.html>;
	817	otherwise it returns FALSE. If C<len> is 0, it will be calculated using
	818	C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
	819	C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
	820	characters being ASCII constitute 'a valid UTF-8 string'.
	821
	822	This function returns FALSE for strings containing any code points above the
	823	Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
	824	code points per
	825	L<Corrigendum #9\|http://www.unicode.org/versions/corrigendum9.html>.
	826
	827	See also
	828	C<L</is_utf8_invariant_string>>,
	829	C<L</is_utf8_invariant_string_loc>>,
	830	C<L</is_utf8_string>>,
	831	C<L</is_utf8_string_flags>>,
	832	C<L</is_utf8_string_loc>>,
	833	C<L</is_utf8_string_loc_flags>>,
	834	C<L</is_utf8_string_loclen>>,
	835	C<L</is_utf8_string_loclen_flags>>,
	836	C<L</is_utf8_fixed_width_buf_flags>>,
	837	C<L</is_utf8_fixed_width_buf_loc_flags>>,
	838	C<L</is_utf8_fixed_width_buf_loclen_flags>>,
	839	C<L</is_strict_utf8_string>>,
	840	C<L</is_strict_utf8_string_loc>>,
	841	C<L</is_strict_utf8_string_loclen>>,
	842	C<L</is_c9strict_utf8_string_loc>>,
	843	and
	844	C<L</is_c9strict_utf8_string_loclen>>.
	845
	846	=cut
	847	*/
	848
	849	#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
	850
	851	/*
	852	=for apidoc is_utf8_string_flags
	853
	854	Returns TRUE if the first C<len> bytes of string C<s> form a valid
	855	UTF-8 string, subject to the restrictions imposed by C<flags>;
	856	returns FALSE otherwise. If C<len> is 0, it will be calculated
	857	using C<strlen(s)> (which means if you use this option, that C<s> can't have
	858	embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
	859	that all characters being ASCII constitute 'a valid UTF-8 string'.
	860
	861	If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
	862	C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
	863	as C<L</is_strict_utf8_string>>; and if C<flags> is
	864	C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
	865	C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
	866	combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
	867	C<L</utf8n_to_uvchr>>, with the same meanings.
	868
	869	See also
	870	C<L</is_utf8_invariant_string>>,
	871	C<L</is_utf8_invariant_string_loc>>,
	872	C<L</is_utf8_string>>,
	873	C<L</is_utf8_string_loc>>,
	874	C<L</is_utf8_string_loc_flags>>,
	875	C<L</is_utf8_string_loclen>>,
	876	C<L</is_utf8_string_loclen_flags>>,
	877	C<L</is_utf8_fixed_width_buf_flags>>,
	878	C<L</is_utf8_fixed_width_buf_loc_flags>>,
	879	C<L</is_utf8_fixed_width_buf_loclen_flags>>,
	880	C<L</is_strict_utf8_string>>,
	881	C<L</is_strict_utf8_string_loc>>,
	882	C<L</is_strict_utf8_string_loclen>>,
	883	C<L</is_c9strict_utf8_string>>,
	884	C<L</is_c9strict_utf8_string_loc>>,
	885	and
	886	C<L</is_c9strict_utf8_string_loclen>>.
	887
	888	=cut
	889	*/
	890
	891	PERL_STATIC_INLINE bool
	892	Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
	893	{
	894	const U8 * first_variant;
	895
	896	PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
	897	assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
	898	\|UTF8_DISALLOW_PERL_EXTENDED)));
	899
	900	if (len == 0) {
	901	len = strlen((const char *)s);
	902	}
	903
	904	if (flags == 0) {
	905	return is_utf8_string(s, len);
	906	}
	907
	908	if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
	909	== UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
	910	{
	911	return is_strict_utf8_string(s, len);
	912	}
	913
	914	if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
	915	== UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
	916	{
	917	return is_c9strict_utf8_string(s, len);
	918	}
	919
	920	if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
	921	const U8* const send = s + len;
	922	const U8* x = first_variant;
	923
	924	while (x < send) {
	925	STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
	926	if (UNLIKELY(! cur_len)) {
	927	return FALSE;
	928	}
	929	x += cur_len;
	930	}
	931	}
	932
	933	return TRUE;
	934	}
	935
	936	/*
	937
	938	=for apidoc is_utf8_string_loc
	939
	940	Like C<L</is_utf8_string>> but stores the location of the failure (in the
	941	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	942	"utf8ness success") in the C<ep> pointer.
	943
	944	See also C<L</is_utf8_string_loclen>>.
	945
	946	=cut
	947	*/
	948
	949	#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
	950
	951	/*
	952
	953	=for apidoc is_utf8_string_loclen
	954
	955	Like C<L</is_utf8_string>> but stores the location of the failure (in the
	956	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	957	"utf8ness success") in the C<ep> pointer, and the number of UTF-8
	958	encoded characters in the C<el> pointer.
	959
	960	See also C<L</is_utf8_string_loc>>.
	961
	962	=cut
	963	*/
	964
	965	PERL_STATIC_INLINE bool
	966	Perl_is_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	967	{
	968	const U8 * first_variant;
	969
	970	PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
	971
	972	if (len == 0) {
	973	len = strlen((const char *) s);
	974	}
	975
	976	if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
	977	if (el)
	978	*el = len;
	979
	980	if (ep) {
	981	*ep = s + len;
	982	}
	983
	984	return TRUE;
	985	}
	986
	987	{
	988	const U8* const send = s + len;
	989	const U8* x = first_variant;
	990	STRLEN outlen = first_variant - s;
	991
	992	while (x < send) {
	993	const STRLEN cur_len = isUTF8_CHAR(x, send);
	994	if (UNLIKELY(! cur_len)) {
	995	break;
	996	}
	997	x += cur_len;
	998	outlen++;
	999	}
	1000
	1001	if (el)
	1002	*el = outlen;
	1003
	1004	if (ep) {
	1005	*ep = x;
	1006	}
	1007
	1008	return (x == send);
	1009	}
	1010	}
	1011
	1012	/*
	1013
	1014	=for apidoc isUTF8_CHAR
	1015
	1016	Evaluates to non-zero if the first few bytes of the string starting at C<s> and
	1017	looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
	1018	that represents some code point; otherwise it evaluates to 0. If non-zero, the
	1019	value gives how many bytes starting at C<s> comprise the code point's
	1020	representation. Any bytes remaining before C<e>, but beyond the ones needed to
	1021	form the first code point in C<s>, are not examined.
	1022
	1023	The code point can be any that will fit in an IV on this machine, using Perl's
	1024	extension to official UTF-8 to represent those higher than the Unicode maximum
	1025	of 0x10FFFF. That means that this macro is used to efficiently decide if the
	1026	next few bytes in C<s> is legal UTF-8 for a single character.
	1027
	1028	Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
	1029	defined by Unicode to be fully interchangeable across applications;
	1030	C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
	1031	#9\|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
	1032	code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
	1033
	1034	Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
	1035	C<L</is_utf8_string_loclen>> to check entire strings.
	1036
	1037	Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
	1038	machines) is a valid UTF-8 character.
	1039
	1040	=cut
	1041
	1042	This uses an adaptation of the table and algorithm given in
	1043	https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
	1044	documentation of the original version. A copyright notice for the original
	1045	version is given at the beginning of this file. The Perl adapation is
	1046	documented at the definition of PL_extended_utf8_dfa_tab[].
	1047
	1048	*/
	1049
	1050	PERL_STATIC_INLINE Size_t
	1051	Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
	1052	{
	1053	const U8 * s = s0;
	1054	UV state = 0;
	1055
	1056	PERL_ARGS_ASSERT_ISUTF8_CHAR;
	1057
	1058	/* This dfa is fast. If it accepts the input, it was for a well-formed,
	1059	* code point, which can be returned immediately. Otherwise, it is either
	1060	* malformed, or for the start byte FF which the dfa doesn't handle (except
	1061	* on 32-bit ASCII platforms where it trivially is an error). Call a
	1062	* helper function for the other platforms. */
	1063
	1064	while (s < e && LIKELY(state != 1)) {
	1065	state = PL_extended_utf8_dfa_tab[256
	1066	+ state
	1067	+ PL_extended_utf8_dfa_tab[*s]];
	1068	if (state != 0) {
	1069	s++;
	1070	continue;
	1071	}
	1072
	1073	return s - s0 + 1;
	1074	}
	1075
	1076	#if defined(UV_IS_QUAD) \|\| defined(EBCDIC)
	1077
	1078	if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
	1079	return is_utf8_char_helper(s0, e, 0);
	1080	}
	1081
	1082	#endif
	1083
	1084	return 0;
	1085	}
	1086
	1087	/*
	1088
	1089	=for apidoc isSTRICT_UTF8_CHAR
	1090
	1091	Evaluates to non-zero if the first few bytes of the string starting at C<s> and
	1092	looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
	1093	Unicode code point completely acceptable for open interchange between all
	1094	applications; otherwise it evaluates to 0. If non-zero, the value gives how
	1095	many bytes starting at C<s> comprise the code point's representation. Any
	1096	bytes remaining before C<e>, but beyond the ones needed to form the first code
	1097	point in C<s>, are not examined.
	1098
	1099	The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
	1100	be a surrogate nor a non-character code point. Thus this excludes any code
	1101	point from Perl's extended UTF-8.
	1102
	1103	This is used to efficiently decide if the next few bytes in C<s> is
	1104	legal Unicode-acceptable UTF-8 for a single character.
	1105
	1106	Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
	1107	#9\|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
	1108	code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
	1109	and C<L</isUTF8_CHAR_flags>> for a more customized definition.
	1110
	1111	Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
	1112	C<L</is_strict_utf8_string_loclen>> to check entire strings.
	1113
	1114	=cut
	1115
	1116	This uses an adaptation of the tables and algorithm given in
	1117	https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
	1118	documentation of the original version. A copyright notice for the original
	1119	version is given at the beginning of this file. The Perl adapation is
	1120	documented at the definition of strict_extended_utf8_dfa_tab[].
	1121
	1122	*/
	1123
	1124	PERL_STATIC_INLINE Size_t
	1125	Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
	1126	{
	1127	const U8 * s = s0;
	1128	UV state = 0;
	1129
	1130	PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
	1131
	1132	while (s < e && LIKELY(state != 1)) {
	1133	state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
	1134
	1135	if (state != 0) {
	1136	s++;
	1137	continue;
	1138	}
	1139
	1140	return s - s0 + 1;
	1141	}
	1142
	1143	#ifndef EBCDIC
	1144
	1145	/* The dfa above drops out for certain Hanguls; handle them specially */
	1146	if (is_HANGUL_ED_utf8_safe(s0, e)) {
	1147	return 3;
	1148	}
	1149
	1150	#endif
	1151
	1152	return 0;
	1153	}
	1154
	1155	/*
	1156
	1157	=for apidoc isC9_STRICT_UTF8_CHAR
	1158
	1159	Evaluates to non-zero if the first few bytes of the string starting at C<s> and
	1160	looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
	1161	Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
	1162	the value gives how many bytes starting at C<s> comprise the code point's
	1163	representation. Any bytes remaining before C<e>, but beyond the ones needed to
	1164	form the first code point in C<s>, are not examined.
	1165
	1166	The largest acceptable code point is the Unicode maximum 0x10FFFF. This
	1167	differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
	1168	code points. This corresponds to
	1169	L<Unicode Corrigendum #9\|http://www.unicode.org/versions/corrigendum9.html>.
	1170	which said that non-character code points are merely discouraged rather than
	1171	completely forbidden in open interchange. See
	1172	L<perlunicode/Noncharacter code points>.
	1173
	1174	Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
	1175	C<L</isUTF8_CHAR_flags>> for a more customized definition.
	1176
	1177	Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
	1178	C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
	1179
	1180	=cut
	1181
	1182	This uses an adaptation of the tables and algorithm given in
	1183	https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
	1184	documentation of the original version. A copyright notice for the original
	1185	version is given at the beginning of this file. The Perl adapation is
	1186	documented at the definition of PL_c9_utf8_dfa_tab[].
	1187
	1188	*/
	1189
	1190	PERL_STATIC_INLINE Size_t
	1191	Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
	1192	{
	1193	const U8 * s = s0;
	1194	UV state = 0;
	1195
	1196	PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
	1197
	1198	while (s < e && LIKELY(state != 1)) {
	1199	state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
	1200
	1201	if (state != 0) {
	1202	s++;
	1203	continue;
	1204	}
	1205
	1206	return s - s0 + 1;
	1207	}
	1208
	1209	return 0;
	1210	}
	1211
	1212	/*
	1213
	1214	=for apidoc is_strict_utf8_string_loc
	1215
	1216	Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
	1217	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	1218	"utf8ness success") in the C<ep> pointer.
	1219
	1220	See also C<L</is_strict_utf8_string_loclen>>.
	1221
	1222	=cut
	1223	*/
	1224
	1225	#define is_strict_utf8_string_loc(s, len, ep) \
	1226	is_strict_utf8_string_loclen(s, len, ep, 0)
	1227
	1228	/*
	1229
	1230	=for apidoc is_strict_utf8_string_loclen
	1231
	1232	Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
	1233	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	1234	"utf8ness success") in the C<ep> pointer, and the number of UTF-8
	1235	encoded characters in the C<el> pointer.
	1236
	1237	See also C<L</is_strict_utf8_string_loc>>.
	1238
	1239	=cut
	1240	*/
	1241
	1242	PERL_STATIC_INLINE bool
	1243	Perl_is_strict_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	1244	{
	1245	const U8 * first_variant;
	1246
	1247	PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
	1248
	1249	if (len == 0) {
	1250	len = strlen((const char *) s);
	1251	}
	1252
	1253	if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
	1254	if (el)
	1255	*el = len;
	1256
	1257	if (ep) {
	1258	*ep = s + len;
	1259	}
	1260
	1261	return TRUE;
	1262	}
	1263
	1264	{
	1265	const U8* const send = s + len;
	1266	const U8* x = first_variant;
	1267	STRLEN outlen = first_variant - s;
	1268
	1269	while (x < send) {
	1270	const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
	1271	if (UNLIKELY(! cur_len)) {
	1272	break;
	1273	}
	1274	x += cur_len;
	1275	outlen++;
	1276	}
	1277
	1278	if (el)
	1279	*el = outlen;
	1280
	1281	if (ep) {
	1282	*ep = x;
	1283	}
	1284
	1285	return (x == send);
	1286	}
	1287	}
	1288
	1289	/*
	1290
	1291	=for apidoc is_c9strict_utf8_string_loc
	1292
	1293	Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
	1294	the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	1295	"utf8ness success") in the C<ep> pointer.
	1296
	1297	See also C<L</is_c9strict_utf8_string_loclen>>.
	1298
	1299	=cut
	1300	*/
	1301
	1302	#define is_c9strict_utf8_string_loc(s, len, ep) \
	1303	is_c9strict_utf8_string_loclen(s, len, ep, 0)
	1304
	1305	/*
	1306
	1307	=for apidoc is_c9strict_utf8_string_loclen
	1308
	1309	Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
	1310	the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	1311	"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
	1312	characters in the C<el> pointer.
	1313
	1314	See also C<L</is_c9strict_utf8_string_loc>>.
	1315
	1316	=cut
	1317	*/
	1318
	1319	PERL_STATIC_INLINE bool
	1320	Perl_is_c9strict_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	1321	{
	1322	const U8 * first_variant;
	1323
	1324	PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
	1325
	1326	if (len == 0) {
	1327	len = strlen((const char *) s);
	1328	}
	1329
	1330	if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
	1331	if (el)
	1332	*el = len;
	1333
	1334	if (ep) {
	1335	*ep = s + len;
	1336	}
	1337
	1338	return TRUE;
	1339	}
	1340
	1341	{
	1342	const U8* const send = s + len;
	1343	const U8* x = first_variant;
	1344	STRLEN outlen = first_variant - s;
	1345
	1346	while (x < send) {
	1347	const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
	1348	if (UNLIKELY(! cur_len)) {
	1349	break;
	1350	}
	1351	x += cur_len;
	1352	outlen++;
	1353	}
	1354
	1355	if (el)
	1356	*el = outlen;
	1357
	1358	if (ep) {
	1359	*ep = x;
	1360	}
	1361
	1362	return (x == send);
	1363	}
	1364	}
	1365
	1366	/*
	1367
	1368	=for apidoc is_utf8_string_loc_flags
	1369
	1370	Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
	1371	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	1372	"utf8ness success") in the C<ep> pointer.
	1373
	1374	See also C<L</is_utf8_string_loclen_flags>>.
	1375
	1376	=cut
	1377	*/
	1378
	1379	#define is_utf8_string_loc_flags(s, len, ep, flags) \
	1380	is_utf8_string_loclen_flags(s, len, ep, 0, flags)
	1381
	1382
	1383	/* The above 3 actual functions could have been moved into the more general one
	1384	* just below, and made #defines that call it with the right 'flags'. They are
	1385	* currently kept separate to increase their chances of getting inlined */
	1386
	1387	/*
	1388
	1389	=for apidoc is_utf8_string_loclen_flags
	1390
	1391	Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
	1392	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	1393	"utf8ness success") in the C<ep> pointer, and the number of UTF-8
	1394	encoded characters in the C<el> pointer.
	1395
	1396	See also C<L</is_utf8_string_loc_flags>>.
	1397
	1398	=cut
	1399	*/
	1400
	1401	PERL_STATIC_INLINE bool
	1402	Perl_is_utf8_string_loclen_flags(const U8 s, STRLEN len, const U8 ep, STRLEN el, const U32 flags)
	1403	{
	1404	const U8 * first_variant;
	1405
	1406	PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
	1407	assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
	1408	\|UTF8_DISALLOW_PERL_EXTENDED)));
	1409
	1410	if (len == 0) {
	1411	len = strlen((const char *) s);
	1412	}
	1413
	1414	if (flags == 0) {
	1415	return is_utf8_string_loclen(s, len, ep, el);
	1416	}
	1417
	1418	if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
	1419	== UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
	1420	{
	1421	return is_strict_utf8_string_loclen(s, len, ep, el);
	1422	}
	1423
	1424	if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
	1425	== UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
	1426	{
	1427	return is_c9strict_utf8_string_loclen(s, len, ep, el);
	1428	}
	1429
	1430	if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
	1431	if (el)
	1432	*el = len;
	1433
	1434	if (ep) {
	1435	*ep = s + len;
	1436	}
	1437
	1438	return TRUE;
	1439	}
	1440
	1441	{
	1442	const U8* send = s + len;
	1443	const U8* x = first_variant;
	1444	STRLEN outlen = first_variant - s;
	1445
	1446	while (x < send) {
	1447	const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
	1448	if (UNLIKELY(! cur_len)) {
	1449	break;
	1450	}
	1451	x += cur_len;
	1452	outlen++;
	1453	}
	1454
	1455	if (el)
	1456	*el = outlen;
	1457
	1458	if (ep) {
	1459	*ep = x;
	1460	}
	1461
	1462	return (x == send);
	1463	}
	1464	}
	1465
	1466	/*
	1467	=for apidoc utf8_distance
	1468
	1469	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	1470	and C<b>.
	1471
	1472	WARNING: use only if you know that the pointers point inside the
	1473	same UTF-8 buffer.
	1474
	1475	=cut
	1476	*/
	1477
	1478	PERL_STATIC_INLINE IV
	1479	Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
	1480	{
	1481	PERL_ARGS_ASSERT_UTF8_DISTANCE;
	1482
	1483	return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
	1484	}
	1485
	1486	/*
	1487	=for apidoc utf8_hop
	1488
	1489	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	1490	forward or backward.
	1491
	1492	WARNING: do not use the following unless you know C<off> is within
	1493	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	1494	on the first byte of character or just after the last byte of a character.
	1495
	1496	=cut
	1497	*/
	1498
	1499	PERL_STATIC_INLINE U8 *
	1500	Perl_utf8_hop(const U8 *s, SSize_t off)
	1501	{
	1502	PERL_ARGS_ASSERT_UTF8_HOP;
	1503
	1504	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	1505	* the bitops (especially ~) can create illegal UTF-8.
	1506	* In other words: in Perl UTF-8 is not just for Unicode. */
	1507
	1508	if (off >= 0) {
	1509	while (off--)
	1510	s += UTF8SKIP(s);
	1511	}
	1512	else {
	1513	while (off++) {
	1514	s--;
	1515	while (UTF8_IS_CONTINUATION(*s))
	1516	s--;
	1517	}
	1518	}
	1519	GCC_DIAG_IGNORE(-Wcast-qual)
	1520	return (U8 *)s;
	1521	GCC_DIAG_RESTORE
	1522	}
	1523
	1524	/*
	1525	=for apidoc utf8_hop_forward
	1526
	1527	Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
	1528	forward.
	1529
	1530	C<off> must be non-negative.
	1531
	1532	C<s> must be before or equal to C<end>.
	1533
	1534	When moving forward it will not move beyond C<end>.
	1535
	1536	Will not exceed this limit even if the string is not valid "UTF-8".
	1537
	1538	=cut
	1539	*/
	1540
	1541	PERL_STATIC_INLINE U8 *
	1542	Perl_utf8_hop_forward(const U8 s, SSize_t off, const U8 end)
	1543	{
	1544	PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
	1545
	1546	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	1547	* the bitops (especially ~) can create illegal UTF-8.
	1548	* In other words: in Perl UTF-8 is not just for Unicode. */
	1549
	1550	assert(s <= end);
	1551	assert(off >= 0);
	1552
	1553	while (off--) {
	1554	STRLEN skip = UTF8SKIP(s);
	1555	if ((STRLEN)(end - s) <= skip) {
	1556	GCC_DIAG_IGNORE(-Wcast-qual)
	1557	return (U8 *)end;
	1558	GCC_DIAG_RESTORE
	1559	}
	1560	s += skip;
	1561	}
	1562
	1563	GCC_DIAG_IGNORE(-Wcast-qual)
	1564	return (U8 *)s;
	1565	GCC_DIAG_RESTORE
	1566	}
	1567
	1568	/*
	1569	=for apidoc utf8_hop_back
	1570
	1571	Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
	1572	backward.
	1573
	1574	C<off> must be non-positive.
	1575
	1576	C<s> must be after or equal to C<start>.
	1577
	1578	When moving backward it will not move before C<start>.
	1579
	1580	Will not exceed this limit even if the string is not valid "UTF-8".
	1581
	1582	=cut
	1583	*/
	1584
	1585	PERL_STATIC_INLINE U8 *
	1586	Perl_utf8_hop_back(const U8 s, SSize_t off, const U8 start)
	1587	{
	1588	PERL_ARGS_ASSERT_UTF8_HOP_BACK;
	1589
	1590	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	1591	* the bitops (especially ~) can create illegal UTF-8.
	1592	* In other words: in Perl UTF-8 is not just for Unicode. */
	1593
	1594	assert(start <= s);
	1595	assert(off <= 0);
	1596
	1597	while (off++ && s > start) {
	1598	do {
	1599	s--;
	1600	} while (UTF8_IS_CONTINUATION(*s) && s > start);
	1601	}
	1602
	1603	GCC_DIAG_IGNORE(-Wcast-qual)
	1604	return (U8 *)s;
	1605	GCC_DIAG_RESTORE
	1606	}
	1607
	1608	/*
	1609	=for apidoc utf8_hop_safe
	1610
	1611	Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
	1612	either forward or backward.
	1613
	1614	When moving backward it will not move before C<start>.
	1615
	1616	When moving forward it will not move beyond C<end>.
	1617
	1618	Will not exceed those limits even if the string is not valid "UTF-8".
	1619
	1620	=cut
	1621	*/
	1622
	1623	PERL_STATIC_INLINE U8 *
	1624	Perl_utf8_hop_safe(const U8 s, SSize_t off, const U8 start, const U8 *end)
	1625	{
	1626	PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
	1627
	1628	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	1629	* the bitops (especially ~) can create illegal UTF-8.
	1630	* In other words: in Perl UTF-8 is not just for Unicode. */
	1631
	1632	assert(start <= s && s <= end);
	1633
	1634	if (off >= 0) {
	1635	return utf8_hop_forward(s, off, end);
	1636	}
	1637	else {
	1638	return utf8_hop_back(s, off, start);
	1639	}
	1640	}
	1641
	1642	/*
	1643
	1644	=for apidoc is_utf8_valid_partial_char
	1645
	1646	Returns 0 if the sequence of bytes starting at C<s> and looking no further than
	1647	S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
	1648	points. Otherwise, it returns 1 if there exists at least one non-empty
	1649	sequence of bytes that when appended to sequence C<s>, starting at position
	1650	C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
	1651	otherwise returns 0.
	1652
	1653	In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
	1654	point.
	1655
	1656	This is useful when a fixed-length buffer is being tested for being well-formed
	1657	UTF-8, but the final few bytes in it don't comprise a full character; that is,
	1658	it is split somewhere in the middle of the final code point's UTF-8
	1659	representation. (Presumably when the buffer is refreshed with the next chunk
	1660	of data, the new first bytes will complete the partial code point.) This
	1661	function is used to verify that the final bytes in the current buffer are in
	1662	fact the legal beginning of some code point, so that if they aren't, the
	1663	failure can be signalled without having to wait for the next read.
	1664
	1665	=cut
	1666	*/
	1667	#define is_utf8_valid_partial_char(s, e) \
	1668	is_utf8_valid_partial_char_flags(s, e, 0)
	1669
	1670	/*
	1671
	1672	=for apidoc is_utf8_valid_partial_char_flags
	1673
	1674	Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
	1675	or not the input is a valid UTF-8 encoded partial character, but it takes an
	1676	extra parameter, C<flags>, which can further restrict which code points are
	1677	considered valid.
	1678
	1679	If C<flags> is 0, this behaves identically to
	1680	C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
	1681	of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
	1682	there is any sequence of bytes that can complete the input partial character in
	1683	such a way that a non-prohibited character is formed, the function returns
	1684	TRUE; otherwise FALSE. Non character code points cannot be determined based on
	1685	partial character input. But many of the other possible excluded types can be
	1686	determined from just the first one or two bytes.
	1687
	1688	=cut
	1689	*/
	1690
	1691	PERL_STATIC_INLINE bool
	1692	Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
	1693	{
	1694	PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
	1695
	1696	assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
	1697	\|UTF8_DISALLOW_PERL_EXTENDED)));
	1698
	1699	if (s >= e \|\| s + UTF8SKIP(s) <= e) {
	1700	return FALSE;
	1701	}
	1702
	1703	return cBOOL(is_utf8_char_helper(s, e, flags));
	1704	}
	1705
	1706	/*
	1707
	1708	=for apidoc is_utf8_fixed_width_buf_flags
	1709
	1710	Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
	1711	is entirely valid UTF-8, subject to the restrictions given by C<flags>;
	1712	otherwise it returns FALSE.
	1713
	1714	If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
	1715	without restriction. If the final few bytes of the buffer do not form a
	1716	complete code point, this will return TRUE anyway, provided that
	1717	C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
	1718
	1719	If C<flags> in non-zero, it can be any combination of the
	1720	C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
	1721	same meanings.
	1722
	1723	This function differs from C<L</is_utf8_string_flags>> only in that the latter
	1724	returns FALSE if the final few bytes of the string don't form a complete code
	1725	point.
	1726
	1727	=cut
	1728	*/
	1729	#define is_utf8_fixed_width_buf_flags(s, len, flags) \
	1730	is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
	1731
	1732	/*
	1733
	1734	=for apidoc is_utf8_fixed_width_buf_loc_flags
	1735
	1736	Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
	1737	failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
	1738	to the beginning of any partial character at the end of the buffer; if there is
	1739	no partial character C<*ep> will contain C<s>+C<len>.
	1740
	1741	See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
	1742
	1743	=cut
	1744	*/
	1745
	1746	#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
	1747	is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
	1748
	1749	/*
	1750
	1751	=for apidoc is_utf8_fixed_width_buf_loclen_flags
	1752
	1753	Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
	1754	complete, valid characters found in the C<el> pointer.
	1755
	1756	=cut
	1757	*/
	1758
	1759	PERL_STATIC_INLINE bool
	1760	Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
	1761	STRLEN len,
	1762	const U8 **ep,
	1763	STRLEN *el,
	1764	const U32 flags)
	1765	{
	1766	const U8 * maybe_partial;
	1767
	1768	PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
	1769
	1770	if (! ep) {
	1771	ep = &maybe_partial;
	1772	}
	1773
	1774	/* If it's entirely valid, return that; otherwise see if the only error is
	1775	* that the final few bytes are for a partial character */
	1776	return is_utf8_string_loclen_flags(s, len, ep, el, flags)
	1777	\|\| is_utf8_valid_partial_char_flags(*ep, s + len, flags);
	1778	}
	1779
	1780	PERL_STATIC_INLINE UV
	1781	Perl_utf8n_to_uvchr_msgs(const U8 *s,
	1782	STRLEN curlen,
	1783	STRLEN *retlen,
	1784	const U32 flags,
	1785	U32 * errors,
	1786	AV ** msgs)
	1787	{
	1788	/* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
	1789	* simple cases, and, if necessary calls a helper function to deal with the
	1790	* more complex ones. Almost all well-formed non-problematic code points
	1791	* are considered simple, so that it's unlikely that the helper function
	1792	* will need to be called.
	1793	*
	1794	* This is an adaptation of the tables and algorithm given in
	1795	* https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
	1796	* comprehensive documentation of the original version. A copyright notice
	1797	* for the original version is given at the beginning of this file. The
	1798	* Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
	1799	*/
	1800
	1801	const U8 * const s0 = s;
	1802	const U8 * send = s0 + curlen;
	1803	UV uv = 0; /* The 0 silences some stupid compilers */
	1804	UV state = 0;
	1805
	1806	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
	1807
	1808	/* This dfa is fast. If it accepts the input, it was for a well-formed,
	1809	* non-problematic code point, which can be returned immediately.
	1810	* Otherwise we call a helper function to figure out the more complicated
	1811	* cases. */
	1812
	1813	while (s < send && LIKELY(state != 1)) {
	1814	UV type = PL_strict_utf8_dfa_tab[*s];
	1815
	1816	uv = (state == 0)
	1817	? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
	1818	: UTF8_ACCUMULATE(uv, *s);
	1819	state = PL_strict_utf8_dfa_tab[256 + state + type];
	1820
	1821	if (state != 0) {
	1822	s++;
	1823	continue;
	1824	}
	1825
	1826	if (retlen) {
	1827	*retlen = s - s0 + 1;
	1828	}
	1829	if (errors) {
	1830	*errors = 0;
	1831	}
	1832	if (msgs) {
	1833	*msgs = NULL;
	1834	}
	1835
	1836	return UNI_TO_NATIVE(uv);
	1837	}
	1838
	1839	/* Here is potentially problematic. Use the full mechanism */
	1840	return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
	1841	}
	1842
	1843	PERL_STATIC_INLINE UV
	1844	Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
	1845	{
	1846	PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
	1847
	1848	assert(s < send);
	1849
	1850	if (! ckWARN_d(WARN_UTF8)) {
	1851
	1852	/* EMPTY is not really allowed, and asserts on debugging builds. But
	1853	* on non-debugging we have to deal with it, and this causes it to
	1854	* return the REPLACEMENT CHARACTER, as the documentation indicates */
	1855	return utf8n_to_uvchr(s, send - s, retlen,
	1856	(UTF8_ALLOW_ANY \| UTF8_ALLOW_EMPTY));
	1857	}
	1858	else {
	1859	UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
	1860	if (retlen && ret == 0 && *s != '\0') {
	1861	*retlen = (STRLEN) -1;
	1862	}
	1863
	1864	return ret;
	1865	}
	1866	}
	1867
	1868	/* ------------------------------- perl.h ----------------------------- */
	1869
	1870	/*
	1871	=head1 Miscellaneous Functions
	1872
	1873	=for apidoc is_safe_syscall
	1874
	1875	Test that the given C<pv> (with length C<len>) doesn't contain any internal
	1876	C<NUL> characters.
	1877	If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
	1878	category, and return FALSE.
	1879
	1880	Return TRUE if the name is safe.
	1881
	1882	C<what> and C<op_name> are used in any warning.
	1883
	1884	Used by the C<IS_SAFE_SYSCALL()> macro.
	1885
	1886	=cut
	1887	*/
	1888
	1889	PERL_STATIC_INLINE bool
	1890	Perl_is_safe_syscall(pTHX_ const char pv, STRLEN len, const char what, const char *op_name)
	1891	{
	1892	/* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
	1893	* perl itself uses xce*() functions which accept 8-bit strings.
	1894	*/
	1895
	1896	PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
	1897
	1898	if (len > 1) {
	1899	char *null_at;
	1900	if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
	1901	SETERRNO(ENOENT, LIB_INVARG);
	1902	Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
	1903	"Invalid \\0 character in %s for %s: %s\\0%s",
	1904	what, op_name, pv, null_at+1);
	1905	return FALSE;
	1906	}
	1907	}
	1908
	1909	return TRUE;
	1910	}
	1911
	1912	/*
	1913
	1914	Return true if the supplied filename has a newline character
	1915	immediately before the first (hopefully only) NUL.
	1916
	1917	My original look at this incorrectly used the len from SvPV(), but
	1918	that's incorrect, since we allow for a NUL in pv[len-1].
	1919
	1920	So instead, strlen() and work from there.
	1921
	1922	This allow for the user reading a filename, forgetting to chomp it,
	1923	then calling:
	1924
	1925	open my $foo, "$file\0";
	1926
	1927	*/
	1928
	1929	#ifdef PERL_CORE
	1930
	1931	PERL_STATIC_INLINE bool
	1932	S_should_warn_nl(const char *pv)
	1933	{
	1934	STRLEN len;
	1935
	1936	PERL_ARGS_ASSERT_SHOULD_WARN_NL;
	1937
	1938	len = strlen(pv);
	1939
	1940	return len > 0 && pv[len-1] == '\n';
	1941	}
	1942
	1943	#endif
	1944
	1945	#if defined(PERL_IN_PP_C) \|\| defined(PERL_IN_PP_HOT_C)
	1946
	1947	PERL_STATIC_INLINE bool
	1948	S_lossless_NV_to_IV(const NV nv, IV *ivp)
	1949	{
	1950	/* This function determines if the input NV 'nv' may be converted without
	1951	* loss of data to an IV. If not, it returns FALSE taking no other action.
	1952	* But if it is possible, it does the conversion, returning TRUE, and
	1953	* storing the converted result in 'ivp' /
	1954
	1955	PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
	1956
	1957	# if defined(Perl_isnan)
	1958
	1959	if (UNLIKELY(Perl_isnan(nv))) {
	1960	return FALSE;
	1961	}
	1962
	1963	# endif
	1964
	1965	if (UNLIKELY(nv < IV_MIN) \|\| UNLIKELY(nv > IV_MAX)) {
	1966	return FALSE;
	1967	}
	1968
	1969	if ((IV) nv != nv) {
	1970	return FALSE;
	1971	}
	1972
	1973	*ivp = (IV) nv;
	1974	return TRUE;
	1975	}
	1976
	1977	#endif
	1978
	1979	/* ------------------ regcomp.c, toke.c ------------ */
	1980
	1981	#if defined(PERL_IN_REGCOMP_C) \|\| defined(PERL_IN_TOKE_C)
	1982
	1983	/*
	1984	- regcurly - a little FSA that accepts {\d+,?\d*}
	1985	Pulled from reg.c.
	1986	*/
	1987	PERL_STATIC_INLINE bool
	1988	S_regcurly(const char *s)
	1989	{
	1990	PERL_ARGS_ASSERT_REGCURLY;
	1991
	1992	if (*s++ != '{')
	1993	return FALSE;
	1994	if (!isDIGIT(*s))
	1995	return FALSE;
	1996	while (isDIGIT(*s))
	1997	s++;
	1998	if (*s == ',') {
	1999	s++;
	2000	while (isDIGIT(*s))
	2001	s++;
	2002	}
	2003
	2004	return *s == '}';
	2005	}
	2006
	2007	#endif
	2008
	2009	/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
	2010
	2011	#if defined(PERL_IN_PP_C) \|\| defined(PERL_IN_REGCOMP_C) \|\| defined(PERL_IN_TOKE_C) \|\| defined(PERL_IN_UNIVERSAL_C)
	2012
	2013	#define MAX_CHARSET_NAME_LENGTH 2
	2014
	2015	PERL_STATIC_INLINE const char *
	2016	S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
	2017	{
	2018	PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
	2019
	2020	/* Returns a string that corresponds to the name of the regex character set
	2021	* given by 'flags', and *lenp is set the length of that string, which
	2022	* cannot exceed MAX_CHARSET_NAME_LENGTH characters */
	2023
	2024	*lenp = 1;
	2025	switch (get_regex_charset(flags)) {
	2026	case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
	2027	case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
	2028	case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
	2029	case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
	2030	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	2031	*lenp = 2;
	2032	return ASCII_MORE_RESTRICT_PAT_MODS;
	2033	}
	2034	/* The NOT_REACHED; hides an assert() which has a rather complex
	2035	* definition in perl.h. */
	2036	NOT_REACHED; /* NOTREACHED */
	2037	return "?"; /* Unknown */
	2038	}
	2039
	2040	#endif
	2041
	2042	/*
	2043
	2044	Return false if any get magic is on the SV other than taint magic.
	2045
	2046	*/
	2047
	2048	PERL_STATIC_INLINE bool
	2049	Perl_sv_only_taint_gmagic(SV *sv)
	2050	{
	2051	MAGIC *mg = SvMAGIC(sv);
	2052
	2053	PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
	2054
	2055	while (mg) {
	2056	if (mg->mg_type != PERL_MAGIC_taint
	2057	&& !(mg->mg_flags & MGf_GSKIP)
	2058	&& mg->mg_virtual->svt_get) {
	2059	return FALSE;
	2060	}
	2061	mg = mg->mg_moremagic;
	2062	}
	2063
	2064	return TRUE;
	2065	}
	2066
	2067	/* ------------------ cop.h ------------------------------------------- */
	2068
	2069	/* implement GIMME_V() macro */
	2070
	2071	PERL_STATIC_INLINE U8
	2072	Perl_gimme_V(pTHX)
	2073	{
	2074	I32 cxix;
	2075	U8 gimme = (PL_op->op_flags & OPf_WANT);
	2076
	2077	if (gimme)
	2078	return gimme;
	2079	cxix = PL_curstackinfo->si_cxsubix;
	2080	if (cxix < 0)
	2081	return G_VOID;
	2082	assert(cxstack[cxix].blk_gimme & G_WANT);
	2083	return (cxstack[cxix].blk_gimme & G_WANT);
	2084	}
	2085
	2086
	2087	/* Enter a block. Push a new base context and return its address. */
	2088
	2089	PERL_STATIC_INLINE PERL_CONTEXT *
	2090	Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
	2091	{
	2092	PERL_CONTEXT * cx;
	2093
	2094	PERL_ARGS_ASSERT_CX_PUSHBLOCK;
	2095
	2096	CXINC;
	2097	cx = CX_CUR();
	2098	cx->cx_type = type;
	2099	cx->blk_gimme = gimme;
	2100	cx->blk_oldsaveix = saveix;
	2101	cx->blk_oldsp = (I32)(sp - PL_stack_base);
	2102	cx->blk_oldcop = PL_curcop;
	2103	cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
	2104	cx->blk_oldscopesp = PL_scopestack_ix;
	2105	cx->blk_oldpm = PL_curpm;
	2106	cx->blk_old_tmpsfloor = PL_tmps_floor;
	2107
	2108	PL_tmps_floor = PL_tmps_ix;
	2109	CX_DEBUG(cx, "PUSH");
	2110	return cx;
	2111	}
	2112
	2113
	2114	/* Exit a block (RETURN and LAST). */
	2115
	2116	PERL_STATIC_INLINE void
	2117	Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
	2118	{
	2119	PERL_ARGS_ASSERT_CX_POPBLOCK;
	2120
	2121	CX_DEBUG(cx, "POP");
	2122	/* these 3 are common to cx_popblock and cx_topblock */
	2123	PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
	2124	PL_scopestack_ix = cx->blk_oldscopesp;
	2125	PL_curpm = cx->blk_oldpm;
	2126
	2127	/* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
	2128	* and leaves a CX entry lying around for repeated use, so
	2129	* skip for multicall */ \
	2130	assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
	2131	\|\| PL_savestack_ix == cx->blk_oldsaveix);
	2132	PL_curcop = cx->blk_oldcop;
	2133	PL_tmps_floor = cx->blk_old_tmpsfloor;
	2134	}
	2135
	2136	/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
	2137	* Whereas cx_popblock() restores the state to the point just before
	2138	* cx_pushblock() was called, cx_topblock() restores it to the point just
	2139	* after cx_pushblock() was called. */
	2140
	2141	PERL_STATIC_INLINE void
	2142	Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
	2143	{
	2144	PERL_ARGS_ASSERT_CX_TOPBLOCK;
	2145
	2146	CX_DEBUG(cx, "TOP");
	2147	/* these 3 are common to cx_popblock and cx_topblock */
	2148	PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
	2149	PL_scopestack_ix = cx->blk_oldscopesp;
	2150	PL_curpm = cx->blk_oldpm;
	2151
	2152	PL_stack_sp = PL_stack_base + cx->blk_oldsp;
	2153	}
	2154
	2155
	2156	PERL_STATIC_INLINE void
	2157	Perl_cx_pushsub(pTHX_ PERL_CONTEXT cx, CV cv, OP *retop, bool hasargs)
	2158	{
	2159	U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
	2160
	2161	PERL_ARGS_ASSERT_CX_PUSHSUB;
	2162
	2163	PERL_DTRACE_PROBE_ENTRY(cv);
	2164	cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
	2165	PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
	2166	cx->blk_sub.cv = cv;
	2167	cx->blk_sub.olddepth = CvDEPTH(cv);
	2168	cx->blk_sub.prevcomppad = PL_comppad;
	2169	cx->cx_type \|= (hasargs) ? CXp_HASARGS : 0;
	2170	cx->blk_sub.retop = retop;
	2171	SvREFCNT_inc_simple_void_NN(cv);
	2172	cx->blk_u16 = PL_op->op_private & (phlags\|OPpDEREF);
	2173	}
	2174
	2175
	2176	/* subsets of cx_popsub() */
	2177
	2178	PERL_STATIC_INLINE void
	2179	Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
	2180	{
	2181	CV *cv;
	2182
	2183	PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
	2184	assert(CxTYPE(cx) == CXt_SUB);
	2185
	2186	PL_comppad = cx->blk_sub.prevcomppad;
	2187	PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
	2188	cv = cx->blk_sub.cv;
	2189	CvDEPTH(cv) = cx->blk_sub.olddepth;
	2190	cx->blk_sub.cv = NULL;
	2191	SvREFCNT_dec(cv);
	2192	PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
	2193	}
	2194
	2195
	2196	/* handle the @_ part of leaving a sub */
	2197
	2198	PERL_STATIC_INLINE void
	2199	Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
	2200	{
	2201	AV *av;
	2202
	2203	PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
	2204	assert(CxTYPE(cx) == CXt_SUB);
	2205	assert(AvARRAY(MUTABLE_AV(
	2206	PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
	2207	CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
	2208
	2209	CX_POP_SAVEARRAY(cx);
	2210	av = MUTABLE_AV(PAD_SVl(0));
	2211	if (UNLIKELY(AvREAL(av)))
	2212	/* abandon @_ if it got reified */
	2213	clear_defarray(av, 0);
	2214	else {
	2215	CLEAR_ARGARRAY(av);
	2216	}
	2217	}
	2218
	2219
	2220	PERL_STATIC_INLINE void
	2221	Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
	2222	{
	2223	PERL_ARGS_ASSERT_CX_POPSUB;
	2224	assert(CxTYPE(cx) == CXt_SUB);
	2225
	2226	PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
	2227
	2228	if (CxHASARGS(cx))
	2229	cx_popsub_args(cx);
	2230	cx_popsub_common(cx);
	2231	}
	2232
	2233
	2234	PERL_STATIC_INLINE void
	2235	Perl_cx_pushformat(pTHX_ PERL_CONTEXT cx, CV cv, OP retop, GV gv)
	2236	{
	2237	PERL_ARGS_ASSERT_CX_PUSHFORMAT;
	2238
	2239	cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
	2240	PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
	2241	cx->blk_format.cv = cv;
	2242	cx->blk_format.retop = retop;
	2243	cx->blk_format.gv = gv;
	2244	cx->blk_format.dfoutgv = PL_defoutgv;
	2245	cx->blk_format.prevcomppad = PL_comppad;
	2246	cx->blk_u16 = 0;
	2247
	2248	SvREFCNT_inc_simple_void_NN(cv);
	2249	CvDEPTH(cv)++;
	2250	SvREFCNT_inc_void(cx->blk_format.dfoutgv);
	2251	}
	2252
	2253
	2254	PERL_STATIC_INLINE void
	2255	Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
	2256	{
	2257	CV *cv;
	2258	GV *dfout;
	2259
	2260	PERL_ARGS_ASSERT_CX_POPFORMAT;
	2261	assert(CxTYPE(cx) == CXt_FORMAT);
	2262
	2263	dfout = cx->blk_format.dfoutgv;
	2264	setdefout(dfout);
	2265	cx->blk_format.dfoutgv = NULL;
	2266	SvREFCNT_dec_NN(dfout);
	2267
	2268	PL_comppad = cx->blk_format.prevcomppad;
	2269	PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
	2270	cv = cx->blk_format.cv;
	2271	cx->blk_format.cv = NULL;
	2272	--CvDEPTH(cv);
	2273	SvREFCNT_dec_NN(cv);
	2274	PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
	2275	}
	2276
	2277
	2278	PERL_STATIC_INLINE void
	2279	Perl_cx_pusheval(pTHX_ PERL_CONTEXT cx, OP retop, SV *namesv)
	2280	{
	2281	PERL_ARGS_ASSERT_CX_PUSHEVAL;
	2282
	2283	cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
	2284	PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
	2285	cx->blk_eval.retop = retop;
	2286	cx->blk_eval.old_namesv = namesv;
	2287	cx->blk_eval.old_eval_root = PL_eval_root;
	2288	cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
	2289	cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
	2290	cx->blk_eval.cur_top_env = PL_top_env;
	2291
	2292	assert(!(PL_in_eval & ~ 0x3F));
	2293	assert(!(PL_op->op_type & ~0x1FF));
	2294	cx->blk_u16 = (PL_in_eval & 0x3F) \| ((U16)PL_op->op_type << 7);
	2295	}
	2296
	2297
	2298	PERL_STATIC_INLINE void
	2299	Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
	2300	{
	2301	SV *sv;
	2302
	2303	PERL_ARGS_ASSERT_CX_POPEVAL;
	2304	assert(CxTYPE(cx) == CXt_EVAL);
	2305
	2306	PL_in_eval = CxOLD_IN_EVAL(cx);
	2307	assert(!(PL_in_eval & 0xc0));
	2308	PL_eval_root = cx->blk_eval.old_eval_root;
	2309	sv = cx->blk_eval.cur_text;
	2310	if (sv && CxEVAL_TXT_REFCNTED(cx)) {
	2311	cx->blk_eval.cur_text = NULL;
	2312	SvREFCNT_dec_NN(sv);
	2313	}
	2314
	2315	sv = cx->blk_eval.old_namesv;
	2316	if (sv) {
	2317	cx->blk_eval.old_namesv = NULL;
	2318	SvREFCNT_dec_NN(sv);
	2319	}
	2320	PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
	2321	}
	2322
	2323
	2324	/* push a plain loop, i.e.
	2325	* { block }
	2326	* while (cond) { block }
	2327	* for (init;cond;continue) { block }
	2328	* This loop can be last/redo'ed etc.
	2329	*/
	2330
	2331	PERL_STATIC_INLINE void
	2332	Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
	2333	{
	2334	PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
	2335	cx->blk_loop.my_op = cLOOP;
	2336	}
	2337
	2338
	2339	/* push a true for loop, i.e.
	2340	* for var (list) { block }
	2341	*/
	2342
	2343	PERL_STATIC_INLINE void
	2344	Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT cx, void itervarp, SV* itersave)
	2345	{
	2346	PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
	2347
	2348	/* this one line is common with cx_pushloop_plain */
	2349	cx->blk_loop.my_op = cLOOP;
	2350
	2351	cx->blk_loop.itervar_u.svp = (SV**)itervarp;
	2352	cx->blk_loop.itersave = itersave;
	2353	#ifdef USE_ITHREADS
	2354	cx->blk_loop.oldcomppad = PL_comppad;
	2355	#endif
	2356	}
	2357
	2358
	2359	/* pop all loop types, including plain */
	2360
	2361	PERL_STATIC_INLINE void
	2362	Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
	2363	{
	2364	PERL_ARGS_ASSERT_CX_POPLOOP;
	2365
	2366	assert(CxTYPE_is_LOOP(cx));
	2367	if ( CxTYPE(cx) == CXt_LOOP_ARY
	2368	\|\| CxTYPE(cx) == CXt_LOOP_LAZYSV)
	2369	{
	2370	/* Free ary or cur. This assumes that state_u.ary.ary
	2371	* aligns with state_u.lazysv.cur. See cx_dup() */
	2372	SV *sv = cx->blk_loop.state_u.lazysv.cur;
	2373	cx->blk_loop.state_u.lazysv.cur = NULL;
	2374	SvREFCNT_dec_NN(sv);
	2375	if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
	2376	sv = cx->blk_loop.state_u.lazysv.end;
	2377	cx->blk_loop.state_u.lazysv.end = NULL;
	2378	SvREFCNT_dec_NN(sv);
	2379	}
	2380	}
	2381	if (cx->cx_type & (CXp_FOR_PAD\|CXp_FOR_GV)) {
	2382	SV *cursv;
	2383	SV **svp = (cx)->blk_loop.itervar_u.svp;
	2384	if ((cx->cx_type & CXp_FOR_GV))
	2385	svp = &GvSV((GV*)svp);
	2386	cursv = *svp;
	2387	*svp = cx->blk_loop.itersave;
	2388	cx->blk_loop.itersave = NULL;
	2389	SvREFCNT_dec(cursv);
	2390	}
	2391	}
	2392
	2393
	2394	PERL_STATIC_INLINE void
	2395	Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
	2396	{
	2397	PERL_ARGS_ASSERT_CX_PUSHWHEN;
	2398
	2399	cx->blk_givwhen.leave_op = cLOGOP->op_other;
	2400	}
	2401
	2402
	2403	PERL_STATIC_INLINE void
	2404	Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
	2405	{
	2406	PERL_ARGS_ASSERT_CX_POPWHEN;
	2407	assert(CxTYPE(cx) == CXt_WHEN);
	2408
	2409	PERL_UNUSED_ARG(cx);
	2410	PERL_UNUSED_CONTEXT;
	2411	/* currently NOOP */
	2412	}
	2413
	2414
	2415	PERL_STATIC_INLINE void
	2416	Perl_cx_pushgiven(pTHX_ PERL_CONTEXT cx, SV orig_defsv)
	2417	{
	2418	PERL_ARGS_ASSERT_CX_PUSHGIVEN;
	2419
	2420	cx->blk_givwhen.leave_op = cLOGOP->op_other;
	2421	cx->blk_givwhen.defsv_save = orig_defsv;
	2422	}
	2423
	2424
	2425	PERL_STATIC_INLINE void
	2426	Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
	2427	{
	2428	SV *sv;
	2429
	2430	PERL_ARGS_ASSERT_CX_POPGIVEN;
	2431	assert(CxTYPE(cx) == CXt_GIVEN);
	2432
	2433	sv = GvSV(PL_defgv);
	2434	GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
	2435	cx->blk_givwhen.defsv_save = NULL;
	2436	SvREFCNT_dec(sv);
	2437	}
	2438
	2439	/* ------------------ util.h ------------------------------------------- */
	2440
	2441	/*
	2442	=head1 Miscellaneous Functions
	2443
	2444	=for apidoc foldEQ
	2445
	2446	Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
	2447	same
	2448	case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
	2449	match themselves and their opposite case counterparts. Non-cased and non-ASCII
	2450	range bytes match only themselves.
	2451
	2452	=cut
	2453	*/
	2454
	2455	PERL_STATIC_INLINE I32
	2456	Perl_foldEQ(const char s1, const char s2, I32 len)
	2457	{
	2458	const U8 a = (const U8 )s1;
	2459	const U8 b = (const U8 )s2;
	2460
	2461	PERL_ARGS_ASSERT_FOLDEQ;
	2462
	2463	assert(len >= 0);
	2464
	2465	while (len--) {
	2466	if (a != b && a != PL_fold[b])
	2467	return 0;
	2468	a++,b++;
	2469	}
	2470	return 1;
	2471	}
	2472
	2473	PERL_STATIC_INLINE I32
	2474	Perl_foldEQ_latin1(const char s1, const char s2, I32 len)
	2475	{
	2476	/* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
	2477	* representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
	2478	* does not check for this. Nor does it check that the strings each have
	2479	* at least 'len' characters. */
	2480
	2481	const U8 a = (const U8 )s1;
	2482	const U8 b = (const U8 )s2;
	2483
	2484	PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
	2485
	2486	assert(len >= 0);
	2487
	2488	while (len--) {
	2489	if (a != b && a != PL_fold_latin1[b]) {
	2490	return 0;
	2491	}
	2492	a++, b++;
	2493	}
	2494	return 1;
	2495	}
	2496
	2497	/*
	2498	=for apidoc foldEQ_locale
	2499
	2500	Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
	2501	same case-insensitively in the current locale; false otherwise.
	2502
	2503	=cut
	2504	*/
	2505
	2506	PERL_STATIC_INLINE I32
	2507	Perl_foldEQ_locale(const char s1, const char s2, I32 len)
	2508	{
	2509	dVAR;
	2510	const U8 a = (const U8 )s1;
	2511	const U8 b = (const U8 )s2;
	2512
	2513	PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
	2514
	2515	assert(len >= 0);
	2516
	2517	while (len--) {
	2518	if (a != b && a != PL_fold_locale[b])
	2519	return 0;
	2520	a++,b++;
	2521	}
	2522	return 1;
	2523	}
	2524
	2525	/*
	2526	=for apidoc my_strnlen
	2527
	2528	The C library C<strnlen> if available, or a Perl implementation of it.
	2529
	2530	C<my_strnlen()> computes the length of the string, up to C<maxlen>
	2531	characters. It will never attempt to address more than C<maxlen>
	2532	characters, making it suitable for use with strings that are not
	2533	guaranteed to be NUL-terminated.
	2534
	2535	=cut
	2536
	2537	Description stolen from http://man.openbsd.org/strnlen.3,
	2538	implementation stolen from PostgreSQL.
	2539	*/
	2540	#ifndef HAS_STRNLEN
	2541
	2542	PERL_STATIC_INLINE Size_t
	2543	Perl_my_strnlen(const char *str, Size_t maxlen)
	2544	{
	2545	const char end = (char ) memchr(str, '\0', maxlen);
	2546
	2547	PERL_ARGS_ASSERT_MY_STRNLEN;
	2548
	2549	if (end == NULL) return maxlen;
	2550	return end - str;
	2551	}
	2552
	2553	#endif
	2554
	2555	#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) \|\| defined(PERL_EXT))
	2556
	2557	PERL_STATIC_INLINE void *
	2558	S_my_memrchr(const char * s, const char c, const STRLEN len)
	2559	{
	2560	/* memrchr(), since many platforms lack it */
	2561
	2562	const char * t = s + len - 1;
	2563
	2564	PERL_ARGS_ASSERT_MY_MEMRCHR;
	2565
	2566	while (t >= s) {
	2567	if (*t == c) {
	2568	return (void *) t;
	2569	}
	2570	t--;
	2571	}
	2572
	2573	return NULL;
	2574	}
	2575
	2576	#endif
	2577
	2578	PERL_STATIC_INLINE char *
	2579	Perl_mortal_getenv(const char * str)
	2580	{
	2581	/* This implements a (mostly) thread-safe, sequential-call-safe getenv().
	2582	*
	2583	* It's (mostly) thread-safe because it uses a mutex to prevent
	2584	* simultaneous access from other threads that use the same mutex, and
	2585	* makes a copy of the result before releasing that mutex. All of the Perl
	2586	* core uses that mutex, but, like all mutexes, everything has to cooperate
	2587	* for it to completely work. It is possible for code from, say XS, to not
	2588	* use this mutex, defeating the safety.
	2589	*
	2590	* On some platforms, getenv() is not sequential-call-safe, because
	2591	* subsequent calls destroy the static storage inside the C library
	2592	* returned by an earlier call. The result must be copied or completely
	2593	* acted upon before a subsequent getenv call. Those calls could come from
	2594	* another thread. Again, making a copy while controlling the mutex
	2595	* prevents these problems..
	2596	*
	2597	* To prevent leaks, the copy is made by creating a new SV containing it,
	2598	* mortalizing the SV, and returning the SV's string (the copy). Thus this
	2599	* is a drop-in replacement for getenv().
	2600	*
	2601	* A complication is that this can be called during phases where the
	2602	* mortalization process isn't available. These are in interpreter
	2603	* destruction or early in construction. khw believes that at these times
	2604	* there shouldn't be anything else going on, so plain getenv is safe AS
	2605	* LONG AS the caller acts on the return before calling it again. */
	2606
	2607	char * ret;
	2608	dTHX;
	2609
	2610	PERL_ARGS_ASSERT_MORTAL_GETENV;
	2611
	2612	/* Can't mortalize without stacks. khw believes that no other threads
	2613	* should be running, so no need to lock things, and this may be during a
	2614	* phase when locking isn't even available */
	2615	if (UNLIKELY(PL_scopestack_ix == 0)) {
	2616	return getenv(str);
	2617	}
	2618
	2619	ENV_LOCK;
	2620
	2621	ret = getenv(str);
	2622
	2623	if (ret != NULL) {
	2624	ret = SvPVX(sv_2mortal(newSVpv(ret, 0)));
	2625	}
	2626
	2627	ENV_UNLOCK;
	2628	return ret;
	2629	}
	2630
	2631	/*
	2632	* ex: set ts=8 sts=4 sw=4 et:
	2633	*/