perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* inline.h
	2	*
	3	* Copyright (C) 2012 by Larry Wall and others
	4	*
	5	* You may distribute under the terms of either the GNU General Public
	6	* License or the Artistic License, as specified in the README file.
	7	*
	8	* This file contains tables and code adapted from
	9	* https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
	10	* copyright notice:
	11
	12	Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
	13
	14	Permission is hereby granted, free of charge, to any person obtaining a copy of
	15	this software and associated documentation files (the "Software"), to deal in
	16	the Software without restriction, including without limitation the rights to
	17	use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
	18	of the Software, and to permit persons to whom the Software is furnished to do
	19	so, subject to the following conditions:
	20
	21	The above copyright notice and this permission notice shall be included in all
	22	copies or substantial portions of the Software.
	23
	24	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	25	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	26	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	27	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	28	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	29	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	30	SOFTWARE.
	31
	32	*
	33	* This file is a home for static inline functions that cannot go in other
	34	* header files, because they depend on proto.h (included after most other
	35	* headers) or struct definitions.
	36	*
	37	* Each section names the header file that the functions "belong" to.
	38	*/
	39
	40	/* ------------------------------- av.h ------------------------------- */
	41
	42	/*
	43	=for apidoc_section AV Handling
	44	=for apidoc av_count
	45	Returns the number of elements in the array C<av>. This is the true length of
	46	the array, including any undefined elements. It is always the same as
	47	S<C<av_top_index(av) + 1>>.
	48
	49	=cut
	50	*/
	51	PERL_STATIC_INLINE Size_t
	52	Perl_av_count(pTHX_ AV *av)
	53	{
	54	PERL_ARGS_ASSERT_AV_COUNT;
	55	assert(SvTYPE(av) == SVt_PVAV);
	56
	57	return AvFILL(av) + 1;
	58	}
	59
	60	/* ------------------------------- cv.h ------------------------------- */
	61
	62	PERL_STATIC_INLINE GV *
	63	Perl_CvGV(pTHX_ CV *sv)
	64	{
	65	PERL_ARGS_ASSERT_CVGV;
	66
	67	return CvNAMED(sv)
	68	? Perl_cvgv_from_hek(aTHX_ sv)
	69	: ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
	70	}
	71
	72	PERL_STATIC_INLINE I32 *
	73	Perl_CvDEPTH(const CV * const sv)
	74	{
	75	PERL_ARGS_ASSERT_CVDEPTH;
	76	assert(SvTYPE(sv) == SVt_PVCV \|\| SvTYPE(sv) == SVt_PVFM);
	77
	78	return &((XPVCV*)SvANY(sv))->xcv_depth;
	79	}
	80
	81	/*
	82	CvPROTO returns the prototype as stored, which is not necessarily what
	83	the interpreter should be using. Specifically, the interpreter assumes
	84	that spaces have been stripped, which has been the case if the prototype
	85	was added by toke.c, but is generally not the case if it was added elsewhere.
	86	Since we can't enforce the spacelessness at assignment time, this routine
	87	provides a temporary copy at parse time with spaces removed.
	88	I<orig> is the start of the original buffer, I<len> is the length of the
	89	prototype and will be updated when this returns.
	90	*/
	91
	92	#ifdef PERL_CORE
	93	PERL_STATIC_INLINE char *
	94	S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
	95	{
	96	SV * tmpsv;
	97	char * tmps;
	98	tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
	99	tmps = SvPVX(tmpsv);
	100	while ((*len)--) {
	101	if (!isSPACE(*orig))
	102	tmps++ = orig;
	103	orig++;
	104	}
	105	*tmps = '\0';
	106	*len = tmps - SvPVX(tmpsv);
	107	return SvPVX(tmpsv);
	108	}
	109	#endif
	110
	111	/* ------------------------------- mg.h ------------------------------- */
	112
	113	#if defined(PERL_CORE) \|\| defined(PERL_EXT)
	114	/* assumes get-magic and stringification have already occurred */
	115	PERL_STATIC_INLINE STRLEN
	116	S_MgBYTEPOS(pTHX_ MAGIC mg, SV sv, const char *s, STRLEN len)
	117	{
	118	assert(mg->mg_type == PERL_MAGIC_regex_global);
	119	assert(mg->mg_len != -1);
	120	if (mg->mg_flags & MGf_BYTES \|\| !DO_UTF8(sv))
	121	return (STRLEN)mg->mg_len;
	122	else {
	123	const STRLEN pos = (STRLEN)mg->mg_len;
	124	/* Without this check, we may read past the end of the buffer: */
	125	if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
	126	return sv_or_pv_pos_u2b(sv, s, pos, NULL);
	127	}
	128	}
	129	#endif
	130
	131	/* ------------------------------- pad.h ------------------------------ */
	132
	133	#if defined(PERL_IN_PAD_C) \|\| defined(PERL_IN_OP_C)
	134	PERL_STATIC_INLINE bool
	135	S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
	136	{
	137	PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
	138
	139	/* is seq within the range _LOW to _HIGH ?
	140	* This is complicated by the fact that PL_cop_seqmax
	141	* may have wrapped around at some point */
	142	if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
	143	return FALSE; /* not yet introduced */
	144
	145	if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
	146	/* in compiling scope */
	147	if (
	148	(seq > COP_SEQ_RANGE_LOW(pn))
	149	? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
	150	: (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
	151	)
	152	return TRUE;
	153	}
	154	else if (
	155	(COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
	156	?
	157	( seq > COP_SEQ_RANGE_LOW(pn)
	158	\|\| seq <= COP_SEQ_RANGE_HIGH(pn))
	159
	160	: ( seq > COP_SEQ_RANGE_LOW(pn)
	161	&& seq <= COP_SEQ_RANGE_HIGH(pn))
	162	)
	163	return TRUE;
	164	return FALSE;
	165	}
	166	#endif
	167
	168	/* ------------------------------- pp.h ------------------------------- */
	169
	170	PERL_STATIC_INLINE I32
	171	Perl_TOPMARK(pTHX)
	172	{
	173	DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
	174	"MARK top %p %" IVdf "\n",
	175	PL_markstack_ptr,
	176	(IV)*PL_markstack_ptr)));
	177	return *PL_markstack_ptr;
	178	}
	179
	180	PERL_STATIC_INLINE I32
	181	Perl_POPMARK(pTHX)
	182	{
	183	DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
	184	"MARK pop %p %" IVdf "\n",
	185	(PL_markstack_ptr-1),
	186	(IV)*(PL_markstack_ptr-1))));
	187	assert((PL_markstack_ptr > PL_markstack) \|\| !"MARK underflow");
	188	return *PL_markstack_ptr--;
	189	}
	190
	191	/* ----------------------------- regexp.h ----------------------------- */
	192
	193	PERL_STATIC_INLINE struct regexp *
	194	Perl_ReANY(const REGEXP * const re)
	195	{
	196	XPV* const p = (XPV*)SvANY(re);
	197
	198	PERL_ARGS_ASSERT_REANY;
	199	assert(isREGEXP(re));
	200
	201	return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
	202	: (struct regexp *)p;
	203	}
	204
	205	/* ------------------------------- sv.h ------------------------------- */
	206
	207	PERL_STATIC_INLINE bool
	208	Perl_SvTRUE(pTHX_ SV *sv) {
	209	if (!LIKELY(sv))
	210	return FALSE;
	211	SvGETMAGIC(sv);
	212	return SvTRUE_nomg_NN(sv);
	213	}
	214
	215	PERL_STATIC_INLINE SV *
	216	Perl_SvREFCNT_inc(SV *sv)
	217	{
	218	if (LIKELY(sv != NULL))
	219	SvREFCNT(sv)++;
	220	return sv;
	221	}
	222	PERL_STATIC_INLINE SV *
	223	Perl_SvREFCNT_inc_NN(SV *sv)
	224	{
	225	PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
	226
	227	SvREFCNT(sv)++;
	228	return sv;
	229	}
	230	PERL_STATIC_INLINE void
	231	Perl_SvREFCNT_inc_void(SV *sv)
	232	{
	233	if (LIKELY(sv != NULL))
	234	SvREFCNT(sv)++;
	235	}
	236	PERL_STATIC_INLINE void
	237	Perl_SvREFCNT_dec(pTHX_ SV *sv)
	238	{
	239	if (LIKELY(sv != NULL)) {
	240	U32 rc = SvREFCNT(sv);
	241	if (LIKELY(rc > 1))
	242	SvREFCNT(sv) = rc - 1;
	243	else
	244	Perl_sv_free2(aTHX_ sv, rc);
	245	}
	246	}
	247
	248	PERL_STATIC_INLINE void
	249	Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
	250	{
	251	U32 rc = SvREFCNT(sv);
	252
	253	PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
	254
	255	if (LIKELY(rc > 1))
	256	SvREFCNT(sv) = rc - 1;
	257	else
	258	Perl_sv_free2(aTHX_ sv, rc);
	259	}
	260
	261	PERL_STATIC_INLINE void
	262	Perl_SvAMAGIC_on(SV *sv)
	263	{
	264	PERL_ARGS_ASSERT_SVAMAGIC_ON;
	265	assert(SvROK(sv));
	266
	267	if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
	268	}
	269	PERL_STATIC_INLINE void
	270	Perl_SvAMAGIC_off(SV *sv)
	271	{
	272	PERL_ARGS_ASSERT_SVAMAGIC_OFF;
	273
	274	if (SvROK(sv) && SvOBJECT(SvRV(sv)))
	275	HvAMAGIC_off(SvSTASH(SvRV(sv)));
	276	}
	277
	278	PERL_STATIC_INLINE U32
	279	Perl_SvPADSTALE_on(SV *sv)
	280	{
	281	assert(!(SvFLAGS(sv) & SVs_PADTMP));
	282	return SvFLAGS(sv) \|= SVs_PADSTALE;
	283	}
	284	PERL_STATIC_INLINE U32
	285	Perl_SvPADSTALE_off(SV *sv)
	286	{
	287	assert(!(SvFLAGS(sv) & SVs_PADTMP));
	288	return SvFLAGS(sv) &= ~SVs_PADSTALE;
	289	}
	290	#if defined(PERL_CORE) \|\| defined (PERL_EXT)
	291	PERL_STATIC_INLINE STRLEN
	292	S_sv_or_pv_pos_u2b(pTHX_ SV sv, const char pv, STRLEN pos, STRLEN *lenp)
	293	{
	294	PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
	295	if (SvGAMAGIC(sv)) {
	296	U8 hopped = utf8_hop((U8 )pv, pos);
	297	if (lenp) lenp = (STRLEN)(utf8_hop(hopped, lenp) - hopped);
	298	return (STRLEN)(hopped - (U8 *)pv);
	299	}
	300	return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
	301	}
	302	#endif
	303
	304	/* ------------------------------- utf8.h ------------------------------- */
	305
	306	/*
	307	=for apidoc_section Unicode Support
	308	*/
	309
	310	PERL_STATIC_INLINE void
	311	Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
	312	{
	313	/* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
	314	* encoded string at 'dest', updating 'dest' to include it */
	315
	316	PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
	317
	318	if (NATIVE_BYTE_IS_INVARIANT(byte))
	319	((dest)++) = byte;
	320	else {
	321	((dest)++) = UTF8_EIGHT_BIT_HI(byte);
	322	((dest)++) = UTF8_EIGHT_BIT_LO(byte);
	323	}
	324	}
	325
	326	/*
	327	=for apidoc valid_utf8_to_uvchr
	328	Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
	329	known that the next character in the input UTF-8 string C<s> is well-formed
	330	(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>). Surrogates, non-character code
	331	points, and non-Unicode code points are allowed.
	332
	333	=cut
	334
	335	*/
	336
	337	PERL_STATIC_INLINE UV
	338	Perl_valid_utf8_to_uvchr(const U8 s, STRLEN retlen)
	339	{
	340	const UV expectlen = UTF8SKIP(s);
	341	const U8* send = s + expectlen;
	342	UV uv = *s;
	343
	344	PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
	345
	346	if (retlen) {
	347	*retlen = expectlen;
	348	}
	349
	350	/* An invariant is trivially returned */
	351	if (expectlen == 1) {
	352	return uv;
	353	}
	354
	355	/* Remove the leading bits that indicate the number of bytes, leaving just
	356	* the bits that are part of the value */
	357	uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
	358
	359	/* Now, loop through the remaining bytes, accumulating each into the
	360	* working total as we go. (I khw tried unrolling the loop for up to 4
	361	* bytes, but there was no performance improvement) */
	362	for (++s; s < send; s++) {
	363	uv = UTF8_ACCUMULATE(uv, *s);
	364	}
	365
	366	return UNI_TO_NATIVE(uv);
	367
	368	}
	369
	370	/*
	371	=for apidoc is_utf8_invariant_string
	372
	373	Returns TRUE if the first C<len> bytes of the string C<s> are the same
	374	regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
	375	EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
	376	are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
	377	the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
	378	characters are invariant, but so also are the C1 controls.
	379
	380	If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
	381	use this option, that C<s> can't have embedded C<NUL> characters and has to
	382	have a terminating C<NUL> byte).
	383
	384	See also
	385	C<L</is_utf8_string>>,
	386	C<L</is_utf8_string_flags>>,
	387	C<L</is_utf8_string_loc>>,
	388	C<L</is_utf8_string_loc_flags>>,
	389	C<L</is_utf8_string_loclen>>,
	390	C<L</is_utf8_string_loclen_flags>>,
	391	C<L</is_utf8_fixed_width_buf_flags>>,
	392	C<L</is_utf8_fixed_width_buf_loc_flags>>,
	393	C<L</is_utf8_fixed_width_buf_loclen_flags>>,
	394	C<L</is_strict_utf8_string>>,
	395	C<L</is_strict_utf8_string_loc>>,
	396	C<L</is_strict_utf8_string_loclen>>,
	397	C<L</is_c9strict_utf8_string>>,
	398	C<L</is_c9strict_utf8_string_loc>>,
	399	and
	400	C<L</is_c9strict_utf8_string_loclen>>.
	401
	402	=cut
	403
	404	*/
	405
	406	#define is_utf8_invariant_string(s, len) \
	407	is_utf8_invariant_string_loc(s, len, NULL)
	408
	409	/*
	410	=for apidoc is_utf8_invariant_string_loc
	411
	412	Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
	413	the first UTF-8 variant character in the C<ep> pointer; if all characters are
	414	UTF-8 invariant, this function does not change the contents of C<*ep>.
	415
	416	=cut
	417
	418	*/
	419
	420	PERL_STATIC_INLINE bool
	421	Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
	422	{
	423	const U8* send;
	424	const U8* x = s;
	425
	426	PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
	427
	428	if (len == 0) {
	429	len = strlen((const char *)s);
	430	}
	431
	432	send = s + len;
	433
	434	/* This looks like 0x010101... */
	435	# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
	436
	437	/* This looks like 0x808080... */
	438	# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
	439	# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
	440	# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
	441
	442	/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
	443	* or'ing together the lowest bits of 'x'. Hopefully the final term gets
	444	* optimized out completely on a 32-bit system, and its mask gets optimized out
	445	* on a 64-bit system */
	446	# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
	447	\| ( PTR2nat(x) >> 1) \
	448	\| ( ( (PTR2nat(x) \
	449	& PERL_WORD_BOUNDARY_MASK) >> 2))))
	450
	451	#ifndef EBCDIC
	452
	453	/* Do the word-at-a-time iff there is at least one usable full word. That
	454	* means that after advancing to a word boundary, there still is at least a
	455	* full word left. The number of bytes needed to advance is 'wordsize -
	456	* offset' unless offset is 0. */
	457	if ((STRLEN) (send - x) >= PERL_WORDSIZE
	458
	459	/* This term is wordsize if subword; 0 if not */
	460	+ PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
	461
	462	/* 'offset' */
	463	- (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
	464	{
	465
	466	/* Process per-byte until reach word boundary. XXX This loop could be
	467	* eliminated if we knew that this platform had fast unaligned reads */
	468	while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
	469	if (! UTF8_IS_INVARIANT(*x)) {
	470	if (ep) {
	471	*ep = x;
	472	}
	473
	474	return FALSE;
	475	}
	476	x++;
	477	}
	478
	479	/* Here, we know we have at least one full word to process. Process
	480	* per-word as long as we have at least a full word left */
	481	do {
	482	if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
	483
	484	/* Found a variant. Just return if caller doesn't want its
	485	* exact position */
	486	if (! ep) {
	487	return FALSE;
	488	}
	489
	490	# if BYTEORDER == 0x1234 \|\| BYTEORDER == 0x12345678 \
	491	\|\| BYTEORDER == 0x4321 \|\| BYTEORDER == 0x87654321
	492
	493	ep = x + variant_byte_number( (PERL_UINTMAX_T *) x);
	494	assert(ep >= s && ep < send);
	495
	496	return FALSE;
	497
	498	# else /* If weird byte order, drop into next loop to do byte-at-a-time
	499	checks. */
	500
	501	break;
	502	# endif
	503	}
	504
	505	x += PERL_WORDSIZE;
	506
	507	} while (x + PERL_WORDSIZE <= send);
	508	}
	509
	510	#endif /* End of ! EBCDIC */
	511
	512	/* Process per-byte */
	513	while (x < send) {
	514	if (! UTF8_IS_INVARIANT(*x)) {
	515	if (ep) {
	516	*ep = x;
	517	}
	518
	519	return FALSE;
	520	}
	521
	522	x++;
	523	}
	524
	525	return TRUE;
	526	}
	527
	528	#ifndef EBCDIC
	529
	530	PERL_STATIC_INLINE unsigned int
	531	Perl_variant_byte_number(PERL_UINTMAX_T word)
	532	{
	533
	534	/* This returns the position in a word (0..7) of the first variant byte in
	535	* it. This is a helper function. Note that there are no branches */
	536
	537	assert(word);
	538
	539	/* Get just the msb bits of each byte */
	540	word &= PERL_VARIANTS_WORD_MASK;
	541
	542	# if BYTEORDER == 0x1234 \|\| BYTEORDER == 0x12345678
	543
	544	/* Bytes are stored like
	545	* Byte8 ... Byte2 Byte1
	546	* 63..56...15...8 7...0
	547	*
	548	* Isolate the lsb;
	549	* https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
	550	*
	551	* The word will look like this, with a rightmost set bit in position 's':
	552	* ('x's are don't cares)
	553	* s
	554	* x..x100..0
	555	* x..xx10..0 Right shift (rightmost 0 is shifted off)
	556	* x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
	557	* the 1 just to their left into a 0; the remainder is
	558	* untouched
	559	* 0..0011..1 The xor with the original, x..xx10..0, clears that
	560	* remainder, sets the bottom to all 1
	561	* 0..0100..0 Add 1 to clear the word except for the bit in 's'
	562	*
	563	* Another method is to do 'word &= -word'; but it generates a compiler
	564	* message on some platforms about taking the negative of an unsigned */
	565
	566	word >>= 1;
	567	word = 1 + (word ^ (word - 1));
	568
	569	# elif BYTEORDER == 0x4321 \|\| BYTEORDER == 0x87654321
	570
	571	/* Bytes are stored like
	572	* Byte1 Byte2 ... Byte8
	573	* 63..56 55..47 ... 7...0
	574	*
	575	* Isolate the msb; http://codeforces.com/blog/entry/10330
	576	*
	577	* Only the most significant set bit matters. Or'ing word with its right
	578	* shift of 1 makes that bit and the next one to its right both 1. Then
	579	* right shifting by 2 makes for 4 1-bits in a row. ... We end with the
	580	* msb and all to the right being 1. */
	581	word \|= word >> 1;
	582	word \|= word >> 2;
	583	word \|= word >> 4;
	584	word \|= word >> 8;
	585	word \|= word >> 16;
	586	word \|= word >> 32; /* This should get optimized out on 32-bit systems. */
	587
	588	/* Then subtracting the right shift by 1 clears all but the left-most of
	589	* the 1 bits, which is our desired result */
	590	word -= (word >> 1);
	591
	592	# else
	593	# error Unexpected byte order
	594	# endif
	595
	596	/* Here 'word' has a single bit set: the msb of the first byte in which it
	597	* is set. Calculate that position in the word. We can use this
	598	* specialized solution: https://stackoverflow.com/a/32339674/1626653,
	599	* assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
	600	* just get shifted off at compile time) */
	601	word = (word >> 7) * ((UINTMAX_C( 7) << 56) \| (UINTMAX_C(15) << 48)
	602	\| (UINTMAX_C(23) << 40) \| (UINTMAX_C(31) << 32)
	603	\| (39 << 24) \| (47 << 16)
	604	\| (55 << 8) \| (63 << 0));
	605	word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
	606
	607	/* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
	608	word = ((word + 1) >> 3) - 1;
	609
	610	# if BYTEORDER == 0x4321 \|\| BYTEORDER == 0x87654321
	611
	612	/* And invert the result */
	613	word = CHARBITS - word - 1;
	614
	615	# endif
	616
	617	return (unsigned int) word;
	618	}
	619
	620	#endif
	621	#if defined(PERL_CORE) \|\| defined(PERL_EXT)
	622
	623	/*
	624	=for apidoc variant_under_utf8_count
	625
	626	This function looks at the sequence of bytes between C<s> and C<e>, which are
	627	assumed to be encoded in ASCII/Latin1, and returns how many of them would
	628	change should the string be translated into UTF-8. Due to the nature of UTF-8,
	629	each of these would occupy two bytes instead of the single one in the input
	630	string. Thus, this function returns the precise number of bytes the string
	631	would expand by when translated to UTF-8.
	632
	633	Unlike most of the other functions that have C<utf8> in their name, the input
	634	to this function is NOT a UTF-8-encoded string. The function name is slightly
	635	I<odd> to emphasize this.
	636
	637	This function is internal to Perl because khw thinks that any XS code that
	638	would want this is probably operating too close to the internals. Presenting a
	639	valid use case could change that.
	640
	641	See also
	642	C<L<perlapi/is_utf8_invariant_string>>
	643	and
	644	C<L<perlapi/is_utf8_invariant_string_loc>>,
	645
	646	=cut
	647
	648	*/
	649
	650	PERL_STATIC_INLINE Size_t
	651	S_variant_under_utf8_count(const U8* const s, const U8* const e)
	652	{
	653	const U8* x = s;
	654	Size_t count = 0;
	655
	656	PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
	657
	658	# ifndef EBCDIC
	659
	660	/* Test if the string is long enough to use word-at-a-time. (Logic is the
	661	* same as for is_utf8_invariant_string()) */
	662	if ((STRLEN) (e - x) >= PERL_WORDSIZE
	663	+ PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
	664	- (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
	665	{
	666
	667	/* Process per-byte until reach word boundary. XXX This loop could be
	668	* eliminated if we knew that this platform had fast unaligned reads */
	669	while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
	670	count += ! UTF8_IS_INVARIANT(*x++);
	671	}
	672
	673	/* Process per-word as long as we have at least a full word left */
	674	do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
	675	explanation of how this works */
	676	PERL_UINTMAX_T increment
	677	= ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
	678	* PERL_COUNT_MULTIPLIER)
	679	>> ((PERL_WORDSIZE - 1) * CHARBITS);
	680	count += (Size_t) increment;
	681	x += PERL_WORDSIZE;
	682	} while (x + PERL_WORDSIZE <= e);
	683	}
	684
	685	# endif
	686
	687	/* Process per-byte */
	688	while (x < e) {
	689	if (! UTF8_IS_INVARIANT(*x)) {
	690	count++;
	691	}
	692
	693	x++;
	694	}
	695
	696	return count;
	697	}
	698
	699	#endif
	700
	701	#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
	702	# undef PERL_WORDSIZE
	703	# undef PERL_COUNT_MULTIPLIER
	704	# undef PERL_WORD_BOUNDARY_MASK
	705	# undef PERL_VARIANTS_WORD_MASK
	706	#endif
	707
	708	/*
	709	=for apidoc is_utf8_string
	710
	711	Returns TRUE if the first C<len> bytes of string C<s> form a valid
	712	Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
	713	be calculated using C<strlen(s)> (which means if you use this option, that C<s>
	714	can't have embedded C<NUL> characters and has to have a terminating C<NUL>
	715	byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
	716
	717	This function considers Perl's extended UTF-8 to be valid. That means that
	718	code points above Unicode, surrogates, and non-character code points are
	719	considered valid by this function. Use C<L</is_strict_utf8_string>>,
	720	C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
	721	code points are considered valid.
	722
	723	See also
	724	C<L</is_utf8_invariant_string>>,
	725	C<L</is_utf8_invariant_string_loc>>,
	726	C<L</is_utf8_string_loc>>,
	727	C<L</is_utf8_string_loclen>>,
	728	C<L</is_utf8_fixed_width_buf_flags>>,
	729	C<L</is_utf8_fixed_width_buf_loc_flags>>,
	730	C<L</is_utf8_fixed_width_buf_loclen_flags>>,
	731
	732	=cut
	733	*/
	734
	735	#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
	736
	737	#if defined(PERL_CORE) \|\| defined (PERL_EXT)
	738
	739	/*
	740	=for apidoc is_utf8_non_invariant_string
	741
	742	Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
	743	C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
	744	UTF-8; otherwise returns FALSE.
	745
	746	A TRUE return means that at least one code point represented by the sequence
	747	either is a wide character not representable as a single byte, or the
	748	representation differs depending on whether the sequence is encoded in UTF-8 or
	749	not.
	750
	751	See also
	752	C<L<perlapi/is_utf8_invariant_string>>,
	753	C<L<perlapi/is_utf8_string>>
	754
	755	=cut
	756
	757	This is commonly used to determine if a SV's UTF-8 flag should be turned on.
	758	It generally needn't be if its string is entirely UTF-8 invariant, and it
	759	shouldn't be if it otherwise contains invalid UTF-8.
	760
	761	It is an internal function because khw thinks that XS code shouldn't be working
	762	at this low a level. A valid use case could change that.
	763
	764	*/
	765
	766	PERL_STATIC_INLINE bool
	767	Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
	768	{
	769	const U8 * first_variant;
	770
	771	PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
	772
	773	if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
	774	return FALSE;
	775	}
	776
	777	return is_utf8_string(first_variant, len - (first_variant - s));
	778	}
	779
	780	#endif
	781
	782	/*
	783	=for apidoc is_strict_utf8_string
	784
	785	Returns TRUE if the first C<len> bytes of string C<s> form a valid
	786	UTF-8-encoded string that is fully interchangeable by any application using
	787	Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
	788	calculated using C<strlen(s)> (which means if you use this option, that C<s>
	789	can't have embedded C<NUL> characters and has to have a terminating C<NUL>
	790	byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
	791
	792	This function returns FALSE for strings containing any
	793	code points above the Unicode max of 0x10FFFF, surrogate code points, or
	794	non-character code points.
	795
	796	See also
	797	C<L</is_utf8_invariant_string>>,
	798	C<L</is_utf8_invariant_string_loc>>,
	799	C<L</is_utf8_string>>,
	800	C<L</is_utf8_string_flags>>,
	801	C<L</is_utf8_string_loc>>,
	802	C<L</is_utf8_string_loc_flags>>,
	803	C<L</is_utf8_string_loclen>>,
	804	C<L</is_utf8_string_loclen_flags>>,
	805	C<L</is_utf8_fixed_width_buf_flags>>,
	806	C<L</is_utf8_fixed_width_buf_loc_flags>>,
	807	C<L</is_utf8_fixed_width_buf_loclen_flags>>,
	808	C<L</is_strict_utf8_string_loc>>,
	809	C<L</is_strict_utf8_string_loclen>>,
	810	C<L</is_c9strict_utf8_string>>,
	811	C<L</is_c9strict_utf8_string_loc>>,
	812	and
	813	C<L</is_c9strict_utf8_string_loclen>>.
	814
	815	=cut
	816	*/
	817
	818	#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
	819
	820	/*
	821	=for apidoc is_c9strict_utf8_string
	822
	823	Returns TRUE if the first C<len> bytes of string C<s> form a valid
	824	UTF-8-encoded string that conforms to
	825	L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
	826	otherwise it returns FALSE. If C<len> is 0, it will be calculated using
	827	C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
	828	C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
	829	characters being ASCII constitute 'a valid UTF-8 string'.
	830
	831	This function returns FALSE for strings containing any code points above the
	832	Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
	833	code points per
	834	L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
	835
	836	See also
	837	C<L</is_utf8_invariant_string>>,
	838	C<L</is_utf8_invariant_string_loc>>,
	839	C<L</is_utf8_string>>,
	840	C<L</is_utf8_string_flags>>,
	841	C<L</is_utf8_string_loc>>,
	842	C<L</is_utf8_string_loc_flags>>,
	843	C<L</is_utf8_string_loclen>>,
	844	C<L</is_utf8_string_loclen_flags>>,
	845	C<L</is_utf8_fixed_width_buf_flags>>,
	846	C<L</is_utf8_fixed_width_buf_loc_flags>>,
	847	C<L</is_utf8_fixed_width_buf_loclen_flags>>,
	848	C<L</is_strict_utf8_string>>,
	849	C<L</is_strict_utf8_string_loc>>,
	850	C<L</is_strict_utf8_string_loclen>>,
	851	C<L</is_c9strict_utf8_string_loc>>,
	852	and
	853	C<L</is_c9strict_utf8_string_loclen>>.
	854
	855	=cut
	856	*/
	857
	858	#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
	859
	860	/*
	861	=for apidoc is_utf8_string_flags
	862
	863	Returns TRUE if the first C<len> bytes of string C<s> form a valid
	864	UTF-8 string, subject to the restrictions imposed by C<flags>;
	865	returns FALSE otherwise. If C<len> is 0, it will be calculated
	866	using C<strlen(s)> (which means if you use this option, that C<s> can't have
	867	embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
	868	that all characters being ASCII constitute 'a valid UTF-8 string'.
	869
	870	If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
	871	C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
	872	as C<L</is_strict_utf8_string>>; and if C<flags> is
	873	C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
	874	C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
	875	combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
	876	C<L</utf8n_to_uvchr>>, with the same meanings.
	877
	878	See also
	879	C<L</is_utf8_invariant_string>>,
	880	C<L</is_utf8_invariant_string_loc>>,
	881	C<L</is_utf8_string>>,
	882	C<L</is_utf8_string_loc>>,
	883	C<L</is_utf8_string_loc_flags>>,
	884	C<L</is_utf8_string_loclen>>,
	885	C<L</is_utf8_string_loclen_flags>>,
	886	C<L</is_utf8_fixed_width_buf_flags>>,
	887	C<L</is_utf8_fixed_width_buf_loc_flags>>,
	888	C<L</is_utf8_fixed_width_buf_loclen_flags>>,
	889	C<L</is_strict_utf8_string>>,
	890	C<L</is_strict_utf8_string_loc>>,
	891	C<L</is_strict_utf8_string_loclen>>,
	892	C<L</is_c9strict_utf8_string>>,
	893	C<L</is_c9strict_utf8_string_loc>>,
	894	and
	895	C<L</is_c9strict_utf8_string_loclen>>.
	896
	897	=cut
	898	*/
	899
	900	PERL_STATIC_INLINE bool
	901	Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
	902	{
	903	const U8 * first_variant;
	904
	905	PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
	906	assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
	907	\|UTF8_DISALLOW_PERL_EXTENDED)));
	908
	909	if (len == 0) {
	910	len = strlen((const char *)s);
	911	}
	912
	913	if (flags == 0) {
	914	return is_utf8_string(s, len);
	915	}
	916
	917	if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
	918	== UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
	919	{
	920	return is_strict_utf8_string(s, len);
	921	}
	922
	923	if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
	924	== UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
	925	{
	926	return is_c9strict_utf8_string(s, len);
	927	}
	928
	929	if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
	930	const U8* const send = s + len;
	931	const U8* x = first_variant;
	932
	933	while (x < send) {
	934	STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
	935	if (UNLIKELY(! cur_len)) {
	936	return FALSE;
	937	}
	938	x += cur_len;
	939	}
	940	}
	941
	942	return TRUE;
	943	}
	944
	945	/*
	946
	947	=for apidoc is_utf8_string_loc
	948
	949	Like C<L</is_utf8_string>> but stores the location of the failure (in the
	950	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	951	"utf8ness success") in the C<ep> pointer.
	952
	953	See also C<L</is_utf8_string_loclen>>.
	954
	955	=cut
	956	*/
	957
	958	#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
	959
	960	/*
	961
	962	=for apidoc is_utf8_string_loclen
	963
	964	Like C<L</is_utf8_string>> but stores the location of the failure (in the
	965	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	966	"utf8ness success") in the C<ep> pointer, and the number of UTF-8
	967	encoded characters in the C<el> pointer.
	968
	969	See also C<L</is_utf8_string_loc>>.
	970
	971	=cut
	972	*/
	973
	974	PERL_STATIC_INLINE bool
	975	Perl_is_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	976	{
	977	const U8 * first_variant;
	978
	979	PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
	980
	981	if (len == 0) {
	982	len = strlen((const char *) s);
	983	}
	984
	985	if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
	986	if (el)
	987	*el = len;
	988
	989	if (ep) {
	990	*ep = s + len;
	991	}
	992
	993	return TRUE;
	994	}
	995
	996	{
	997	const U8* const send = s + len;
	998	const U8* x = first_variant;
	999	STRLEN outlen = first_variant - s;
	1000
	1001	while (x < send) {
	1002	const STRLEN cur_len = isUTF8_CHAR(x, send);
	1003	if (UNLIKELY(! cur_len)) {
	1004	break;
	1005	}
	1006	x += cur_len;
	1007	outlen++;
	1008	}
	1009
	1010	if (el)
	1011	*el = outlen;
	1012
	1013	if (ep) {
	1014	*ep = x;
	1015	}
	1016
	1017	return (x == send);
	1018	}
	1019	}
	1020
	1021	/*
	1022
	1023	=for apidoc isUTF8_CHAR
	1024
	1025	Evaluates to non-zero if the first few bytes of the string starting at C<s> and
	1026	looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
	1027	that represents some code point; otherwise it evaluates to 0. If non-zero, the
	1028	value gives how many bytes starting at C<s> comprise the code point's
	1029	representation. Any bytes remaining before C<e>, but beyond the ones needed to
	1030	form the first code point in C<s>, are not examined.
	1031
	1032	The code point can be any that will fit in an IV on this machine, using Perl's
	1033	extension to official UTF-8 to represent those higher than the Unicode maximum
	1034	of 0x10FFFF. That means that this macro is used to efficiently decide if the
	1035	next few bytes in C<s> is legal UTF-8 for a single character.
	1036
	1037	Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
	1038	defined by Unicode to be fully interchangeable across applications;
	1039	C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
	1040	#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
	1041	code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
	1042
	1043	Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
	1044	C<L</is_utf8_string_loclen>> to check entire strings.
	1045
	1046	Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
	1047	machines) is a valid UTF-8 character.
	1048
	1049	=cut
	1050
	1051	This uses an adaptation of the table and algorithm given in
	1052	https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
	1053	documentation of the original version. A copyright notice for the original
	1054	version is given at the beginning of this file. The Perl adaptation is
	1055	documented at the definition of PL_extended_utf8_dfa_tab[].
	1056
	1057	*/
	1058
	1059	PERL_STATIC_INLINE Size_t
	1060	Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
	1061	{
	1062	const U8 * s = s0;
	1063	UV state = 0;
	1064
	1065	PERL_ARGS_ASSERT_ISUTF8_CHAR;
	1066
	1067	/* This dfa is fast. If it accepts the input, it was for a well-formed,
	1068	* code point, which can be returned immediately. Otherwise, it is either
	1069	* malformed, or for the start byte FF which the dfa doesn't handle (except
	1070	* on 32-bit ASCII platforms where it trivially is an error). Call a
	1071	* helper function for the other platforms. */
	1072
	1073	while (s < e && LIKELY(state != 1)) {
	1074	state = PL_extended_utf8_dfa_tab[256
	1075	+ state
	1076	+ PL_extended_utf8_dfa_tab[*s]];
	1077	if (state != 0) {
	1078	s++;
	1079	continue;
	1080	}
	1081
	1082	return s - s0 + 1;
	1083	}
	1084
	1085	#if defined(UV_IS_QUAD) \|\| defined(EBCDIC)
	1086
	1087	if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
	1088	return is_utf8_char_helper(s0, e, 0);
	1089	}
	1090
	1091	#endif
	1092
	1093	return 0;
	1094	}
	1095
	1096	/*
	1097
	1098	=for apidoc isSTRICT_UTF8_CHAR
	1099
	1100	Evaluates to non-zero if the first few bytes of the string starting at C<s> and
	1101	looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
	1102	Unicode code point completely acceptable for open interchange between all
	1103	applications; otherwise it evaluates to 0. If non-zero, the value gives how
	1104	many bytes starting at C<s> comprise the code point's representation. Any
	1105	bytes remaining before C<e>, but beyond the ones needed to form the first code
	1106	point in C<s>, are not examined.
	1107
	1108	The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
	1109	be a surrogate nor a non-character code point. Thus this excludes any code
	1110	point from Perl's extended UTF-8.
	1111
	1112	This is used to efficiently decide if the next few bytes in C<s> is
	1113	legal Unicode-acceptable UTF-8 for a single character.
	1114
	1115	Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
	1116	#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
	1117	code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
	1118	and C<L</isUTF8_CHAR_flags>> for a more customized definition.
	1119
	1120	Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
	1121	C<L</is_strict_utf8_string_loclen>> to check entire strings.
	1122
	1123	=cut
	1124
	1125	This uses an adaptation of the tables and algorithm given in
	1126	https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
	1127	documentation of the original version. A copyright notice for the original
	1128	version is given at the beginning of this file. The Perl adaptation is
	1129	documented at the definition of PL_strict_utf8_dfa_tab[].
	1130
	1131	*/
	1132
	1133	PERL_STATIC_INLINE Size_t
	1134	Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
	1135	{
	1136	const U8 * s = s0;
	1137	UV state = 0;
	1138
	1139	PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
	1140
	1141	while (s < e && LIKELY(state != 1)) {
	1142	state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
	1143
	1144	if (state != 0) {
	1145	s++;
	1146	continue;
	1147	}
	1148
	1149	return s - s0 + 1;
	1150	}
	1151
	1152	#ifndef EBCDIC
	1153
	1154	/* The dfa above drops out for certain Hanguls; handle them specially */
	1155	if (is_HANGUL_ED_utf8_safe(s0, e)) {
	1156	return 3;
	1157	}
	1158
	1159	#endif
	1160
	1161	return 0;
	1162	}
	1163
	1164	/*
	1165
	1166	=for apidoc isC9_STRICT_UTF8_CHAR
	1167
	1168	Evaluates to non-zero if the first few bytes of the string starting at C<s> and
	1169	looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
	1170	Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
	1171	the value gives how many bytes starting at C<s> comprise the code point's
	1172	representation. Any bytes remaining before C<e>, but beyond the ones needed to
	1173	form the first code point in C<s>, are not examined.
	1174
	1175	The largest acceptable code point is the Unicode maximum 0x10FFFF. This
	1176	differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
	1177	code points. This corresponds to
	1178	L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>,
	1179	which said that non-character code points are merely discouraged rather than
	1180	completely forbidden in open interchange. See
	1181	L<perlunicode/Noncharacter code points>.
	1182
	1183	Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
	1184	C<L</isUTF8_CHAR_flags>> for a more customized definition.
	1185
	1186	Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
	1187	C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
	1188
	1189	=cut
	1190
	1191	This uses an adaptation of the tables and algorithm given in
	1192	https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
	1193	documentation of the original version. A copyright notice for the original
	1194	version is given at the beginning of this file. The Perl adaptation is
	1195	documented at the definition of PL_c9_utf8_dfa_tab[].
	1196
	1197	*/
	1198
	1199	PERL_STATIC_INLINE Size_t
	1200	Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
	1201	{
	1202	const U8 * s = s0;
	1203	UV state = 0;
	1204
	1205	PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
	1206
	1207	while (s < e && LIKELY(state != 1)) {
	1208	state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
	1209
	1210	if (state != 0) {
	1211	s++;
	1212	continue;
	1213	}
	1214
	1215	return s - s0 + 1;
	1216	}
	1217
	1218	return 0;
	1219	}
	1220
	1221	/*
	1222
	1223	=for apidoc is_strict_utf8_string_loc
	1224
	1225	Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
	1226	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	1227	"utf8ness success") in the C<ep> pointer.
	1228
	1229	See also C<L</is_strict_utf8_string_loclen>>.
	1230
	1231	=cut
	1232	*/
	1233
	1234	#define is_strict_utf8_string_loc(s, len, ep) \
	1235	is_strict_utf8_string_loclen(s, len, ep, 0)
	1236
	1237	/*
	1238
	1239	=for apidoc is_strict_utf8_string_loclen
	1240
	1241	Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
	1242	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	1243	"utf8ness success") in the C<ep> pointer, and the number of UTF-8
	1244	encoded characters in the C<el> pointer.
	1245
	1246	See also C<L</is_strict_utf8_string_loc>>.
	1247
	1248	=cut
	1249	*/
	1250
	1251	PERL_STATIC_INLINE bool
	1252	Perl_is_strict_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	1253	{
	1254	const U8 * first_variant;
	1255
	1256	PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
	1257
	1258	if (len == 0) {
	1259	len = strlen((const char *) s);
	1260	}
	1261
	1262	if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
	1263	if (el)
	1264	*el = len;
	1265
	1266	if (ep) {
	1267	*ep = s + len;
	1268	}
	1269
	1270	return TRUE;
	1271	}
	1272
	1273	{
	1274	const U8* const send = s + len;
	1275	const U8* x = first_variant;
	1276	STRLEN outlen = first_variant - s;
	1277
	1278	while (x < send) {
	1279	const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
	1280	if (UNLIKELY(! cur_len)) {
	1281	break;
	1282	}
	1283	x += cur_len;
	1284	outlen++;
	1285	}
	1286
	1287	if (el)
	1288	*el = outlen;
	1289
	1290	if (ep) {
	1291	*ep = x;
	1292	}
	1293
	1294	return (x == send);
	1295	}
	1296	}
	1297
	1298	/*
	1299
	1300	=for apidoc is_c9strict_utf8_string_loc
	1301
	1302	Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
	1303	the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	1304	"utf8ness success") in the C<ep> pointer.
	1305
	1306	See also C<L</is_c9strict_utf8_string_loclen>>.
	1307
	1308	=cut
	1309	*/
	1310
	1311	#define is_c9strict_utf8_string_loc(s, len, ep) \
	1312	is_c9strict_utf8_string_loclen(s, len, ep, 0)
	1313
	1314	/*
	1315
	1316	=for apidoc is_c9strict_utf8_string_loclen
	1317
	1318	Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
	1319	the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	1320	"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
	1321	characters in the C<el> pointer.
	1322
	1323	See also C<L</is_c9strict_utf8_string_loc>>.
	1324
	1325	=cut
	1326	*/
	1327
	1328	PERL_STATIC_INLINE bool
	1329	Perl_is_c9strict_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	1330	{
	1331	const U8 * first_variant;
	1332
	1333	PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
	1334
	1335	if (len == 0) {
	1336	len = strlen((const char *) s);
	1337	}
	1338
	1339	if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
	1340	if (el)
	1341	*el = len;
	1342
	1343	if (ep) {
	1344	*ep = s + len;
	1345	}
	1346
	1347	return TRUE;
	1348	}
	1349
	1350	{
	1351	const U8* const send = s + len;
	1352	const U8* x = first_variant;
	1353	STRLEN outlen = first_variant - s;
	1354
	1355	while (x < send) {
	1356	const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
	1357	if (UNLIKELY(! cur_len)) {
	1358	break;
	1359	}
	1360	x += cur_len;
	1361	outlen++;
	1362	}
	1363
	1364	if (el)
	1365	*el = outlen;
	1366
	1367	if (ep) {
	1368	*ep = x;
	1369	}
	1370
	1371	return (x == send);
	1372	}
	1373	}
	1374
	1375	/*
	1376
	1377	=for apidoc is_utf8_string_loc_flags
	1378
	1379	Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
	1380	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	1381	"utf8ness success") in the C<ep> pointer.
	1382
	1383	See also C<L</is_utf8_string_loclen_flags>>.
	1384
	1385	=cut
	1386	*/
	1387
	1388	#define is_utf8_string_loc_flags(s, len, ep, flags) \
	1389	is_utf8_string_loclen_flags(s, len, ep, 0, flags)
	1390
	1391
	1392	/* The above 3 actual functions could have been moved into the more general one
	1393	* just below, and made #defines that call it with the right 'flags'. They are
	1394	* currently kept separate to increase their chances of getting inlined */
	1395
	1396	/*
	1397
	1398	=for apidoc is_utf8_string_loclen_flags
	1399
	1400	Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
	1401	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	1402	"utf8ness success") in the C<ep> pointer, and the number of UTF-8
	1403	encoded characters in the C<el> pointer.
	1404
	1405	See also C<L</is_utf8_string_loc_flags>>.
	1406
	1407	=cut
	1408	*/
	1409
	1410	PERL_STATIC_INLINE bool
	1411	Perl_is_utf8_string_loclen_flags(const U8 s, STRLEN len, const U8 ep, STRLEN el, const U32 flags)
	1412	{
	1413	const U8 * first_variant;
	1414
	1415	PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
	1416	assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
	1417	\|UTF8_DISALLOW_PERL_EXTENDED)));
	1418
	1419	if (len == 0) {
	1420	len = strlen((const char *) s);
	1421	}
	1422
	1423	if (flags == 0) {
	1424	return is_utf8_string_loclen(s, len, ep, el);
	1425	}
	1426
	1427	if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
	1428	== UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
	1429	{
	1430	return is_strict_utf8_string_loclen(s, len, ep, el);
	1431	}
	1432
	1433	if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
	1434	== UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
	1435	{
	1436	return is_c9strict_utf8_string_loclen(s, len, ep, el);
	1437	}
	1438
	1439	if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
	1440	if (el)
	1441	*el = len;
	1442
	1443	if (ep) {
	1444	*ep = s + len;
	1445	}
	1446
	1447	return TRUE;
	1448	}
	1449
	1450	{
	1451	const U8* send = s + len;
	1452	const U8* x = first_variant;
	1453	STRLEN outlen = first_variant - s;
	1454
	1455	while (x < send) {
	1456	const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
	1457	if (UNLIKELY(! cur_len)) {
	1458	break;
	1459	}
	1460	x += cur_len;
	1461	outlen++;
	1462	}
	1463
	1464	if (el)
	1465	*el = outlen;
	1466
	1467	if (ep) {
	1468	*ep = x;
	1469	}
	1470
	1471	return (x == send);
	1472	}
	1473	}
	1474
	1475	/*
	1476	=for apidoc utf8_distance
	1477
	1478	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	1479	and C<b>.
	1480
	1481	WARNING: use only if you know that the pointers point inside the
	1482	same UTF-8 buffer.
	1483
	1484	=cut
	1485	*/
	1486
	1487	PERL_STATIC_INLINE IV
	1488	Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
	1489	{
	1490	PERL_ARGS_ASSERT_UTF8_DISTANCE;
	1491
	1492	return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
	1493	}
	1494
	1495	/*
	1496	=for apidoc utf8_hop
	1497
	1498	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	1499	forward or backward.
	1500
	1501	WARNING: do not use the following unless you know C<off> is within
	1502	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	1503	on the first byte of character or just after the last byte of a character.
	1504
	1505	=cut
	1506	*/
	1507
	1508	PERL_STATIC_INLINE U8 *
	1509	Perl_utf8_hop(const U8 *s, SSize_t off)
	1510	{
	1511	PERL_ARGS_ASSERT_UTF8_HOP;
	1512
	1513	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	1514	* the bitops (especially ~) can create illegal UTF-8.
	1515	* In other words: in Perl UTF-8 is not just for Unicode. */
	1516
	1517	if (off >= 0) {
	1518	while (off--)
	1519	s += UTF8SKIP(s);
	1520	}
	1521	else {
	1522	while (off++) {
	1523	s--;
	1524	while (UTF8_IS_CONTINUATION(*s))
	1525	s--;
	1526	}
	1527	}
	1528	GCC_DIAG_IGNORE(-Wcast-qual)
	1529	return (U8 *)s;
	1530	GCC_DIAG_RESTORE
	1531	}
	1532
	1533	/*
	1534	=for apidoc utf8_hop_forward
	1535
	1536	Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
	1537	forward.
	1538
	1539	C<off> must be non-negative.
	1540
	1541	C<s> must be before or equal to C<end>.
	1542
	1543	When moving forward it will not move beyond C<end>.
	1544
	1545	Will not exceed this limit even if the string is not valid "UTF-8".
	1546
	1547	=cut
	1548	*/
	1549
	1550	PERL_STATIC_INLINE U8 *
	1551	Perl_utf8_hop_forward(const U8 s, SSize_t off, const U8 end)
	1552	{
	1553	PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
	1554
	1555	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	1556	* the bitops (especially ~) can create illegal UTF-8.
	1557	* In other words: in Perl UTF-8 is not just for Unicode. */
	1558
	1559	assert(s <= end);
	1560	assert(off >= 0);
	1561
	1562	while (off--) {
	1563	STRLEN skip = UTF8SKIP(s);
	1564	if ((STRLEN)(end - s) <= skip) {
	1565	GCC_DIAG_IGNORE(-Wcast-qual)
	1566	return (U8 *)end;
	1567	GCC_DIAG_RESTORE
	1568	}
	1569	s += skip;
	1570	}
	1571
	1572	GCC_DIAG_IGNORE(-Wcast-qual)
	1573	return (U8 *)s;
	1574	GCC_DIAG_RESTORE
	1575	}
	1576
	1577	/*
	1578	=for apidoc utf8_hop_back
	1579
	1580	Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
	1581	backward.
	1582
	1583	C<off> must be non-positive.
	1584
	1585	C<s> must be after or equal to C<start>.
	1586
	1587	When moving backward it will not move before C<start>.
	1588
	1589	Will not exceed this limit even if the string is not valid "UTF-8".
	1590
	1591	=cut
	1592	*/
	1593
	1594	PERL_STATIC_INLINE U8 *
	1595	Perl_utf8_hop_back(const U8 s, SSize_t off, const U8 start)
	1596	{
	1597	PERL_ARGS_ASSERT_UTF8_HOP_BACK;
	1598
	1599	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	1600	* the bitops (especially ~) can create illegal UTF-8.
	1601	* In other words: in Perl UTF-8 is not just for Unicode. */
	1602
	1603	assert(start <= s);
	1604	assert(off <= 0);
	1605
	1606	while (off++ && s > start) {
	1607	do {
	1608	s--;
	1609	} while (UTF8_IS_CONTINUATION(*s) && s > start);
	1610	}
	1611
	1612	GCC_DIAG_IGNORE(-Wcast-qual)
	1613	return (U8 *)s;
	1614	GCC_DIAG_RESTORE
	1615	}
	1616
	1617	/*
	1618	=for apidoc utf8_hop_safe
	1619
	1620	Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
	1621	either forward or backward.
	1622
	1623	When moving backward it will not move before C<start>.
	1624
	1625	When moving forward it will not move beyond C<end>.
	1626
	1627	Will not exceed those limits even if the string is not valid "UTF-8".
	1628
	1629	=cut
	1630	*/
	1631
	1632	PERL_STATIC_INLINE U8 *
	1633	Perl_utf8_hop_safe(const U8 s, SSize_t off, const U8 start, const U8 *end)
	1634	{
	1635	PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
	1636
	1637	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	1638	* the bitops (especially ~) can create illegal UTF-8.
	1639	* In other words: in Perl UTF-8 is not just for Unicode. */
	1640
	1641	assert(start <= s && s <= end);
	1642
	1643	if (off >= 0) {
	1644	return utf8_hop_forward(s, off, end);
	1645	}
	1646	else {
	1647	return utf8_hop_back(s, off, start);
	1648	}
	1649	}
	1650
	1651	/*
	1652
	1653	=for apidoc is_utf8_valid_partial_char
	1654
	1655	Returns 0 if the sequence of bytes starting at C<s> and looking no further than
	1656	S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
	1657	points. Otherwise, it returns 1 if there exists at least one non-empty
	1658	sequence of bytes that when appended to sequence C<s>, starting at position
	1659	C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
	1660	otherwise returns 0.
	1661
	1662	In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
	1663	point.
	1664
	1665	This is useful when a fixed-length buffer is being tested for being well-formed
	1666	UTF-8, but the final few bytes in it don't comprise a full character; that is,
	1667	it is split somewhere in the middle of the final code point's UTF-8
	1668	representation. (Presumably when the buffer is refreshed with the next chunk
	1669	of data, the new first bytes will complete the partial code point.) This
	1670	function is used to verify that the final bytes in the current buffer are in
	1671	fact the legal beginning of some code point, so that if they aren't, the
	1672	failure can be signalled without having to wait for the next read.
	1673
	1674	=cut
	1675	*/
	1676	#define is_utf8_valid_partial_char(s, e) \
	1677	is_utf8_valid_partial_char_flags(s, e, 0)
	1678
	1679	/*
	1680
	1681	=for apidoc is_utf8_valid_partial_char_flags
	1682
	1683	Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
	1684	or not the input is a valid UTF-8 encoded partial character, but it takes an
	1685	extra parameter, C<flags>, which can further restrict which code points are
	1686	considered valid.
	1687
	1688	If C<flags> is 0, this behaves identically to
	1689	C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
	1690	of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
	1691	there is any sequence of bytes that can complete the input partial character in
	1692	such a way that a non-prohibited character is formed, the function returns
	1693	TRUE; otherwise FALSE. Non character code points cannot be determined based on
	1694	partial character input. But many of the other possible excluded types can be
	1695	determined from just the first one or two bytes.
	1696
	1697	=cut
	1698	*/
	1699
	1700	PERL_STATIC_INLINE bool
	1701	Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
	1702	{
	1703	PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
	1704
	1705	assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
	1706	\|UTF8_DISALLOW_PERL_EXTENDED)));
	1707
	1708	if (s >= e \|\| s + UTF8SKIP(s) <= e) {
	1709	return FALSE;
	1710	}
	1711
	1712	return cBOOL(is_utf8_char_helper(s, e, flags));
	1713	}
	1714
	1715	/*
	1716
	1717	=for apidoc is_utf8_fixed_width_buf_flags
	1718
	1719	Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
	1720	is entirely valid UTF-8, subject to the restrictions given by C<flags>;
	1721	otherwise it returns FALSE.
	1722
	1723	If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
	1724	without restriction. If the final few bytes of the buffer do not form a
	1725	complete code point, this will return TRUE anyway, provided that
	1726	C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
	1727
	1728	If C<flags> is non-zero, it can be any combination of the
	1729	C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
	1730	same meanings.
	1731
	1732	This function differs from C<L</is_utf8_string_flags>> only in that the latter
	1733	returns FALSE if the final few bytes of the string don't form a complete code
	1734	point.
	1735
	1736	=cut
	1737	*/
	1738	#define is_utf8_fixed_width_buf_flags(s, len, flags) \
	1739	is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
	1740
	1741	/*
	1742
	1743	=for apidoc is_utf8_fixed_width_buf_loc_flags
	1744
	1745	Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
	1746	failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
	1747	to the beginning of any partial character at the end of the buffer; if there is
	1748	no partial character C<*ep> will contain C<s>+C<len>.
	1749
	1750	See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
	1751
	1752	=cut
	1753	*/
	1754
	1755	#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
	1756	is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
	1757
	1758	/*
	1759
	1760	=for apidoc is_utf8_fixed_width_buf_loclen_flags
	1761
	1762	Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
	1763	complete, valid characters found in the C<el> pointer.
	1764
	1765	=cut
	1766	*/
	1767
	1768	PERL_STATIC_INLINE bool
	1769	Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
	1770	STRLEN len,
	1771	const U8 **ep,
	1772	STRLEN *el,
	1773	const U32 flags)
	1774	{
	1775	const U8 * maybe_partial;
	1776
	1777	PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
	1778
	1779	if (! ep) {
	1780	ep = &maybe_partial;
	1781	}
	1782
	1783	/* If it's entirely valid, return that; otherwise see if the only error is
	1784	* that the final few bytes are for a partial character */
	1785	return is_utf8_string_loclen_flags(s, len, ep, el, flags)
	1786	\|\| is_utf8_valid_partial_char_flags(*ep, s + len, flags);
	1787	}
	1788
	1789	PERL_STATIC_INLINE UV
	1790	Perl_utf8n_to_uvchr_msgs(const U8 *s,
	1791	STRLEN curlen,
	1792	STRLEN *retlen,
	1793	const U32 flags,
	1794	U32 * errors,
	1795	AV ** msgs)
	1796	{
	1797	/* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
	1798	* simple cases, and, if necessary calls a helper function to deal with the
	1799	* more complex ones. Almost all well-formed non-problematic code points
	1800	* are considered simple, so that it's unlikely that the helper function
	1801	* will need to be called.
	1802	*
	1803	* This is an adaptation of the tables and algorithm given in
	1804	* https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
	1805	* comprehensive documentation of the original version. A copyright notice
	1806	* for the original version is given at the beginning of this file. The
	1807	* Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
	1808	*/
	1809
	1810	const U8 * const s0 = s;
	1811	const U8 * send = s0 + curlen;
	1812	UV uv = 0; /* The 0 silences some stupid compilers */
	1813	UV state = 0;
	1814
	1815	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
	1816
	1817	/* This dfa is fast. If it accepts the input, it was for a well-formed,
	1818	* non-problematic code point, which can be returned immediately.
	1819	* Otherwise we call a helper function to figure out the more complicated
	1820	* cases. */
	1821
	1822	while (s < send && LIKELY(state != 1)) {
	1823	UV type = PL_strict_utf8_dfa_tab[*s];
	1824
	1825	uv = (state == 0)
	1826	? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
	1827	: UTF8_ACCUMULATE(uv, *s);
	1828	state = PL_strict_utf8_dfa_tab[256 + state + type];
	1829
	1830	if (state != 0) {
	1831	s++;
	1832	continue;
	1833	}
	1834
	1835	if (retlen) {
	1836	*retlen = s - s0 + 1;
	1837	}
	1838	if (errors) {
	1839	*errors = 0;
	1840	}
	1841	if (msgs) {
	1842	*msgs = NULL;
	1843	}
	1844
	1845	return UNI_TO_NATIVE(uv);
	1846	}
	1847
	1848	/* Here is potentially problematic. Use the full mechanism */
	1849	return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
	1850	}
	1851
	1852	PERL_STATIC_INLINE UV
	1853	Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
	1854	{
	1855	PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
	1856
	1857	assert(s < send);
	1858
	1859	if (! ckWARN_d(WARN_UTF8)) {
	1860
	1861	/* EMPTY is not really allowed, and asserts on debugging builds. But
	1862	* on non-debugging we have to deal with it, and this causes it to
	1863	* return the REPLACEMENT CHARACTER, as the documentation indicates */
	1864	return utf8n_to_uvchr(s, send - s, retlen,
	1865	(UTF8_ALLOW_ANY \| UTF8_ALLOW_EMPTY));
	1866	}
	1867	else {
	1868	UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
	1869	if (retlen && ret == 0 && *s != '\0') {
	1870	*retlen = (STRLEN) -1;
	1871	}
	1872
	1873	return ret;
	1874	}
	1875	}
	1876
	1877	/* ------------------------------- perl.h ----------------------------- */
	1878
	1879	/*
	1880	=for apidoc_section Utility Functions
	1881
	1882	=for apidoc is_safe_syscall
	1883
	1884	Test that the given C<pv> (with length C<len>) doesn't contain any internal
	1885	C<NUL> characters.
	1886	If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
	1887	category, and return FALSE.
	1888
	1889	Return TRUE if the name is safe.
	1890
	1891	C<what> and C<op_name> are used in any warning.
	1892
	1893	Used by the C<IS_SAFE_SYSCALL()> macro.
	1894
	1895	=cut
	1896	*/
	1897
	1898	PERL_STATIC_INLINE bool
	1899	Perl_is_safe_syscall(pTHX_ const char pv, STRLEN len, const char what, const char *op_name)
	1900	{
	1901	/* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
	1902	* perl itself uses xce*() functions which accept 8-bit strings.
	1903	*/
	1904
	1905	PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
	1906
	1907	if (len > 1) {
	1908	char *null_at;
	1909	if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
	1910	SETERRNO(ENOENT, LIB_INVARG);
	1911	Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
	1912	"Invalid \\0 character in %s for %s: %s\\0%s",
	1913	what, op_name, pv, null_at+1);
	1914	return FALSE;
	1915	}
	1916	}
	1917
	1918	return TRUE;
	1919	}
	1920
	1921	/*
	1922
	1923	Return true if the supplied filename has a newline character
	1924	immediately before the first (hopefully only) NUL.
	1925
	1926	My original look at this incorrectly used the len from SvPV(), but
	1927	that's incorrect, since we allow for a NUL in pv[len-1].
	1928
	1929	So instead, strlen() and work from there.
	1930
	1931	This allows for the user reading a filename, forgetting to chomp it,
	1932	then calling:
	1933
	1934	open my $foo, "$file\0";
	1935
	1936	*/
	1937
	1938	#ifdef PERL_CORE
	1939
	1940	PERL_STATIC_INLINE bool
	1941	S_should_warn_nl(const char *pv)
	1942	{
	1943	STRLEN len;
	1944
	1945	PERL_ARGS_ASSERT_SHOULD_WARN_NL;
	1946
	1947	len = strlen(pv);
	1948
	1949	return len > 0 && pv[len-1] == '\n';
	1950	}
	1951
	1952	#endif
	1953
	1954	#if defined(PERL_IN_PP_C) \|\| defined(PERL_IN_PP_HOT_C)
	1955
	1956	PERL_STATIC_INLINE bool
	1957	S_lossless_NV_to_IV(const NV nv, IV *ivp)
	1958	{
	1959	/* This function determines if the input NV 'nv' may be converted without
	1960	* loss of data to an IV. If not, it returns FALSE taking no other action.
	1961	* But if it is possible, it does the conversion, returning TRUE, and
	1962	* storing the converted result in 'ivp' /
	1963
	1964	PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
	1965
	1966	# if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
	1967	/* Normally any comparison with a NaN returns false; if we can't rely
	1968	* on that behaviour, check explicitly */
	1969	if (UNLIKELY(Perl_isnan(nv))) {
	1970	return FALSE;
	1971	}
	1972	# endif
	1973
	1974	/* Written this way so that with an always-false NaN comparison we
	1975	* return false */
	1976	if (!(LIKELY(nv >= IV_MIN) && LIKELY(nv <= IV_MAX))) {
	1977	return FALSE;
	1978	}
	1979
	1980	if ((IV) nv != nv) {
	1981	return FALSE;
	1982	}
	1983
	1984	*ivp = (IV) nv;
	1985	return TRUE;
	1986	}
	1987
	1988	#endif
	1989
	1990	/* ------------------ regcomp.c, toke.c ------------ */
	1991
	1992	#if defined(PERL_IN_REGCOMP_C) \|\| defined(PERL_IN_TOKE_C)
	1993
	1994	/*
	1995	- regcurly - a little FSA that accepts {\d+,?\d*}
	1996	Pulled from reg.c.
	1997	*/
	1998	PERL_STATIC_INLINE bool
	1999	S_regcurly(const char *s)
	2000	{
	2001	PERL_ARGS_ASSERT_REGCURLY;
	2002
	2003	if (*s++ != '{')
	2004	return FALSE;
	2005	if (!isDIGIT(*s))
	2006	return FALSE;
	2007	while (isDIGIT(*s))
	2008	s++;
	2009	if (*s == ',') {
	2010	s++;
	2011	while (isDIGIT(*s))
	2012	s++;
	2013	}
	2014
	2015	return *s == '}';
	2016	}
	2017
	2018	#endif
	2019
	2020	/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
	2021
	2022	#if defined(PERL_IN_PP_C) \|\| defined(PERL_IN_REGCOMP_C) \|\| defined(PERL_IN_TOKE_C) \|\| defined(PERL_IN_UNIVERSAL_C)
	2023
	2024	#define MAX_CHARSET_NAME_LENGTH 2
	2025
	2026	PERL_STATIC_INLINE const char *
	2027	S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
	2028	{
	2029	PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
	2030
	2031	/* Returns a string that corresponds to the name of the regex character set
	2032	* given by 'flags', and *lenp is set the length of that string, which
	2033	* cannot exceed MAX_CHARSET_NAME_LENGTH characters */
	2034
	2035	*lenp = 1;
	2036	switch (get_regex_charset(flags)) {
	2037	case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
	2038	case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
	2039	case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
	2040	case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
	2041	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
	2042	*lenp = 2;
	2043	return ASCII_MORE_RESTRICT_PAT_MODS;
	2044	}
	2045	/* The NOT_REACHED; hides an assert() which has a rather complex
	2046	* definition in perl.h. */
	2047	NOT_REACHED; /* NOTREACHED */
	2048	return "?"; /* Unknown */
	2049	}
	2050
	2051	#endif
	2052
	2053	/*
	2054
	2055	Return false if any get magic is on the SV other than taint magic.
	2056
	2057	*/
	2058
	2059	PERL_STATIC_INLINE bool
	2060	Perl_sv_only_taint_gmagic(SV *sv)
	2061	{
	2062	MAGIC *mg = SvMAGIC(sv);
	2063
	2064	PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
	2065
	2066	while (mg) {
	2067	if (mg->mg_type != PERL_MAGIC_taint
	2068	&& !(mg->mg_flags & MGf_GSKIP)
	2069	&& mg->mg_virtual->svt_get) {
	2070	return FALSE;
	2071	}
	2072	mg = mg->mg_moremagic;
	2073	}
	2074
	2075	return TRUE;
	2076	}
	2077
	2078	/* ------------------ cop.h ------------------------------------------- */
	2079
	2080	/* implement GIMME_V() macro */
	2081
	2082	PERL_STATIC_INLINE U8
	2083	Perl_gimme_V(pTHX)
	2084	{
	2085	I32 cxix;
	2086	U8 gimme = (PL_op->op_flags & OPf_WANT);
	2087
	2088	if (gimme)
	2089	return gimme;
	2090	cxix = PL_curstackinfo->si_cxsubix;
	2091	if (cxix < 0)
	2092	return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
	2093	assert(cxstack[cxix].blk_gimme & G_WANT);
	2094	return (cxstack[cxix].blk_gimme & G_WANT);
	2095	}
	2096
	2097
	2098	/* Enter a block. Push a new base context and return its address. */
	2099
	2100	PERL_STATIC_INLINE PERL_CONTEXT *
	2101	Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
	2102	{
	2103	PERL_CONTEXT * cx;
	2104
	2105	PERL_ARGS_ASSERT_CX_PUSHBLOCK;
	2106
	2107	CXINC;
	2108	cx = CX_CUR();
	2109	cx->cx_type = type;
	2110	cx->blk_gimme = gimme;
	2111	cx->blk_oldsaveix = saveix;
	2112	cx->blk_oldsp = (I32)(sp - PL_stack_base);
	2113	cx->blk_oldcop = PL_curcop;
	2114	cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
	2115	cx->blk_oldscopesp = PL_scopestack_ix;
	2116	cx->blk_oldpm = PL_curpm;
	2117	cx->blk_old_tmpsfloor = PL_tmps_floor;
	2118
	2119	PL_tmps_floor = PL_tmps_ix;
	2120	CX_DEBUG(cx, "PUSH");
	2121	return cx;
	2122	}
	2123
	2124
	2125	/* Exit a block (RETURN and LAST). */
	2126
	2127	PERL_STATIC_INLINE void
	2128	Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
	2129	{
	2130	PERL_ARGS_ASSERT_CX_POPBLOCK;
	2131
	2132	CX_DEBUG(cx, "POP");
	2133	/* these 3 are common to cx_popblock and cx_topblock */
	2134	PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
	2135	PL_scopestack_ix = cx->blk_oldscopesp;
	2136	PL_curpm = cx->blk_oldpm;
	2137
	2138	/* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
	2139	* and leaves a CX entry lying around for repeated use, so
	2140	* skip for multicall */ \
	2141	assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
	2142	\|\| PL_savestack_ix == cx->blk_oldsaveix);
	2143	PL_curcop = cx->blk_oldcop;
	2144	PL_tmps_floor = cx->blk_old_tmpsfloor;
	2145	}
	2146
	2147	/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
	2148	* Whereas cx_popblock() restores the state to the point just before
	2149	* cx_pushblock() was called, cx_topblock() restores it to the point just
	2150	* after cx_pushblock() was called. */
	2151
	2152	PERL_STATIC_INLINE void
	2153	Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
	2154	{
	2155	PERL_ARGS_ASSERT_CX_TOPBLOCK;
	2156
	2157	CX_DEBUG(cx, "TOP");
	2158	/* these 3 are common to cx_popblock and cx_topblock */
	2159	PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
	2160	PL_scopestack_ix = cx->blk_oldscopesp;
	2161	PL_curpm = cx->blk_oldpm;
	2162
	2163	PL_stack_sp = PL_stack_base + cx->blk_oldsp;
	2164	}
	2165
	2166
	2167	PERL_STATIC_INLINE void
	2168	Perl_cx_pushsub(pTHX_ PERL_CONTEXT cx, CV cv, OP *retop, bool hasargs)
	2169	{
	2170	U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
	2171
	2172	PERL_ARGS_ASSERT_CX_PUSHSUB;
	2173
	2174	PERL_DTRACE_PROBE_ENTRY(cv);
	2175	cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
	2176	PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
	2177	cx->blk_sub.cv = cv;
	2178	cx->blk_sub.olddepth = CvDEPTH(cv);
	2179	cx->blk_sub.prevcomppad = PL_comppad;
	2180	cx->cx_type \|= (hasargs) ? CXp_HASARGS : 0;
	2181	cx->blk_sub.retop = retop;
	2182	SvREFCNT_inc_simple_void_NN(cv);
	2183	cx->blk_u16 = PL_op->op_private & (phlags\|OPpDEREF);
	2184	}
	2185
	2186
	2187	/* subsets of cx_popsub() */
	2188
	2189	PERL_STATIC_INLINE void
	2190	Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
	2191	{
	2192	CV *cv;
	2193
	2194	PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
	2195	assert(CxTYPE(cx) == CXt_SUB);
	2196
	2197	PL_comppad = cx->blk_sub.prevcomppad;
	2198	PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
	2199	cv = cx->blk_sub.cv;
	2200	CvDEPTH(cv) = cx->blk_sub.olddepth;
	2201	cx->blk_sub.cv = NULL;
	2202	SvREFCNT_dec(cv);
	2203	PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
	2204	}
	2205
	2206
	2207	/* handle the @_ part of leaving a sub */
	2208
	2209	PERL_STATIC_INLINE void
	2210	Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
	2211	{
	2212	AV *av;
	2213
	2214	PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
	2215	assert(CxTYPE(cx) == CXt_SUB);
	2216	assert(AvARRAY(MUTABLE_AV(
	2217	PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
	2218	CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
	2219
	2220	CX_POP_SAVEARRAY(cx);
	2221	av = MUTABLE_AV(PAD_SVl(0));
	2222	if (UNLIKELY(AvREAL(av)))
	2223	/* abandon @_ if it got reified */
	2224	clear_defarray(av, 0);
	2225	else {
	2226	CLEAR_ARGARRAY(av);
	2227	}
	2228	}
	2229
	2230
	2231	PERL_STATIC_INLINE void
	2232	Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
	2233	{
	2234	PERL_ARGS_ASSERT_CX_POPSUB;
	2235	assert(CxTYPE(cx) == CXt_SUB);
	2236
	2237	PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
	2238
	2239	if (CxHASARGS(cx))
	2240	cx_popsub_args(cx);
	2241	cx_popsub_common(cx);
	2242	}
	2243
	2244
	2245	PERL_STATIC_INLINE void
	2246	Perl_cx_pushformat(pTHX_ PERL_CONTEXT cx, CV cv, OP retop, GV gv)
	2247	{
	2248	PERL_ARGS_ASSERT_CX_PUSHFORMAT;
	2249
	2250	cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
	2251	PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
	2252	cx->blk_format.cv = cv;
	2253	cx->blk_format.retop = retop;
	2254	cx->blk_format.gv = gv;
	2255	cx->blk_format.dfoutgv = PL_defoutgv;
	2256	cx->blk_format.prevcomppad = PL_comppad;
	2257	cx->blk_u16 = 0;
	2258
	2259	SvREFCNT_inc_simple_void_NN(cv);
	2260	CvDEPTH(cv)++;
	2261	SvREFCNT_inc_void(cx->blk_format.dfoutgv);
	2262	}
	2263
	2264
	2265	PERL_STATIC_INLINE void
	2266	Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
	2267	{
	2268	CV *cv;
	2269	GV *dfout;
	2270
	2271	PERL_ARGS_ASSERT_CX_POPFORMAT;
	2272	assert(CxTYPE(cx) == CXt_FORMAT);
	2273
	2274	dfout = cx->blk_format.dfoutgv;
	2275	setdefout(dfout);
	2276	cx->blk_format.dfoutgv = NULL;
	2277	SvREFCNT_dec_NN(dfout);
	2278
	2279	PL_comppad = cx->blk_format.prevcomppad;
	2280	PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
	2281	cv = cx->blk_format.cv;
	2282	cx->blk_format.cv = NULL;
	2283	--CvDEPTH(cv);
	2284	SvREFCNT_dec_NN(cv);
	2285	PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
	2286	}
	2287
	2288
	2289	PERL_STATIC_INLINE void
	2290	Perl_cx_pusheval(pTHX_ PERL_CONTEXT cx, OP retop, SV *namesv)
	2291	{
	2292	PERL_ARGS_ASSERT_CX_PUSHEVAL;
	2293
	2294	cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
	2295	PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
	2296	cx->blk_eval.retop = retop;
	2297	cx->blk_eval.old_namesv = namesv;
	2298	cx->blk_eval.old_eval_root = PL_eval_root;
	2299	cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
	2300	cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
	2301	cx->blk_eval.cur_top_env = PL_top_env;
	2302
	2303	assert(!(PL_in_eval & ~ 0x3F));
	2304	assert(!(PL_op->op_type & ~0x1FF));
	2305	cx->blk_u16 = (PL_in_eval & 0x3F) \| ((U16)PL_op->op_type << 7);
	2306	}
	2307
	2308
	2309	PERL_STATIC_INLINE void
	2310	Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
	2311	{
	2312	SV *sv;
	2313
	2314	PERL_ARGS_ASSERT_CX_POPEVAL;
	2315	assert(CxTYPE(cx) == CXt_EVAL);
	2316
	2317	PL_in_eval = CxOLD_IN_EVAL(cx);
	2318	assert(!(PL_in_eval & 0xc0));
	2319	PL_eval_root = cx->blk_eval.old_eval_root;
	2320	sv = cx->blk_eval.cur_text;
	2321	if (sv && CxEVAL_TXT_REFCNTED(cx)) {
	2322	cx->blk_eval.cur_text = NULL;
	2323	SvREFCNT_dec_NN(sv);
	2324	}
	2325
	2326	sv = cx->blk_eval.old_namesv;
	2327	if (sv) {
	2328	cx->blk_eval.old_namesv = NULL;
	2329	SvREFCNT_dec_NN(sv);
	2330	}
	2331	PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
	2332	}
	2333
	2334
	2335	/* push a plain loop, i.e.
	2336	* { block }
	2337	* while (cond) { block }
	2338	* for (init;cond;continue) { block }
	2339	* This loop can be last/redo'ed etc.
	2340	*/
	2341
	2342	PERL_STATIC_INLINE void
	2343	Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
	2344	{
	2345	PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
	2346	cx->blk_loop.my_op = cLOOP;
	2347	}
	2348
	2349
	2350	/* push a true for loop, i.e.
	2351	* for var (list) { block }
	2352	*/
	2353
	2354	PERL_STATIC_INLINE void
	2355	Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT cx, void itervarp, SV* itersave)
	2356	{
	2357	PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
	2358
	2359	/* this one line is common with cx_pushloop_plain */
	2360	cx->blk_loop.my_op = cLOOP;
	2361
	2362	cx->blk_loop.itervar_u.svp = (SV**)itervarp;
	2363	cx->blk_loop.itersave = itersave;
	2364	#ifdef USE_ITHREADS
	2365	cx->blk_loop.oldcomppad = PL_comppad;
	2366	#endif
	2367	}
	2368
	2369
	2370	/* pop all loop types, including plain */
	2371
	2372	PERL_STATIC_INLINE void
	2373	Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
	2374	{
	2375	PERL_ARGS_ASSERT_CX_POPLOOP;
	2376
	2377	assert(CxTYPE_is_LOOP(cx));
	2378	if ( CxTYPE(cx) == CXt_LOOP_ARY
	2379	\|\| CxTYPE(cx) == CXt_LOOP_LAZYSV)
	2380	{
	2381	/* Free ary or cur. This assumes that state_u.ary.ary
	2382	* aligns with state_u.lazysv.cur. See cx_dup() */
	2383	SV *sv = cx->blk_loop.state_u.lazysv.cur;
	2384	cx->blk_loop.state_u.lazysv.cur = NULL;
	2385	SvREFCNT_dec_NN(sv);
	2386	if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
	2387	sv = cx->blk_loop.state_u.lazysv.end;
	2388	cx->blk_loop.state_u.lazysv.end = NULL;
	2389	SvREFCNT_dec_NN(sv);
	2390	}
	2391	}
	2392	if (cx->cx_type & (CXp_FOR_PAD\|CXp_FOR_GV)) {
	2393	SV *cursv;
	2394	SV **svp = (cx)->blk_loop.itervar_u.svp;
	2395	if ((cx->cx_type & CXp_FOR_GV))
	2396	svp = &GvSV((GV*)svp);
	2397	cursv = *svp;
	2398	*svp = cx->blk_loop.itersave;
	2399	cx->blk_loop.itersave = NULL;
	2400	SvREFCNT_dec(cursv);
	2401	}
	2402	}
	2403
	2404
	2405	PERL_STATIC_INLINE void
	2406	Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
	2407	{
	2408	PERL_ARGS_ASSERT_CX_PUSHWHEN;
	2409
	2410	cx->blk_givwhen.leave_op = cLOGOP->op_other;
	2411	}
	2412
	2413
	2414	PERL_STATIC_INLINE void
	2415	Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
	2416	{
	2417	PERL_ARGS_ASSERT_CX_POPWHEN;
	2418	assert(CxTYPE(cx) == CXt_WHEN);
	2419
	2420	PERL_UNUSED_ARG(cx);
	2421	PERL_UNUSED_CONTEXT;
	2422	/* currently NOOP */
	2423	}
	2424
	2425
	2426	PERL_STATIC_INLINE void
	2427	Perl_cx_pushgiven(pTHX_ PERL_CONTEXT cx, SV orig_defsv)
	2428	{
	2429	PERL_ARGS_ASSERT_CX_PUSHGIVEN;
	2430
	2431	cx->blk_givwhen.leave_op = cLOGOP->op_other;
	2432	cx->blk_givwhen.defsv_save = orig_defsv;
	2433	}
	2434
	2435
	2436	PERL_STATIC_INLINE void
	2437	Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
	2438	{
	2439	SV *sv;
	2440
	2441	PERL_ARGS_ASSERT_CX_POPGIVEN;
	2442	assert(CxTYPE(cx) == CXt_GIVEN);
	2443
	2444	sv = GvSV(PL_defgv);
	2445	GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
	2446	cx->blk_givwhen.defsv_save = NULL;
	2447	SvREFCNT_dec(sv);
	2448	}
	2449
	2450	/* ------------------ util.h ------------------------------------------- */
	2451
	2452	/*
	2453	=for apidoc_section String Handling
	2454
	2455	=for apidoc foldEQ
	2456
	2457	Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
	2458	same
	2459	case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
	2460	match themselves and their opposite case counterparts. Non-cased and non-ASCII
	2461	range bytes match only themselves.
	2462
	2463	=cut
	2464	*/
	2465
	2466	PERL_STATIC_INLINE I32
	2467	Perl_foldEQ(const char s1, const char s2, I32 len)
	2468	{
	2469	const U8 a = (const U8 )s1;
	2470	const U8 b = (const U8 )s2;
	2471
	2472	PERL_ARGS_ASSERT_FOLDEQ;
	2473
	2474	assert(len >= 0);
	2475
	2476	while (len--) {
	2477	if (a != b && a != PL_fold[b])
	2478	return 0;
	2479	a++,b++;
	2480	}
	2481	return 1;
	2482	}
	2483
	2484	PERL_STATIC_INLINE I32
	2485	Perl_foldEQ_latin1(const char s1, const char s2, I32 len)
	2486	{
	2487	/* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
	2488	* representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
	2489	* does not check for this. Nor does it check that the strings each have
	2490	* at least 'len' characters. */
	2491
	2492	const U8 a = (const U8 )s1;
	2493	const U8 b = (const U8 )s2;
	2494
	2495	PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
	2496
	2497	assert(len >= 0);
	2498
	2499	while (len--) {
	2500	if (a != b && a != PL_fold_latin1[b]) {
	2501	return 0;
	2502	}
	2503	a++, b++;
	2504	}
	2505	return 1;
	2506	}
	2507
	2508	/*
	2509	=for apidoc_section Locales
	2510	=for apidoc foldEQ_locale
	2511
	2512	Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
	2513	same case-insensitively in the current locale; false otherwise.
	2514
	2515	=cut
	2516	*/
	2517
	2518	PERL_STATIC_INLINE I32
	2519	Perl_foldEQ_locale(const char s1, const char s2, I32 len)
	2520	{
	2521	const U8 a = (const U8 )s1;
	2522	const U8 b = (const U8 )s2;
	2523
	2524	PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
	2525
	2526	assert(len >= 0);
	2527
	2528	while (len--) {
	2529	if (a != b && a != PL_fold_locale[b])
	2530	return 0;
	2531	a++,b++;
	2532	}
	2533	return 1;
	2534	}
	2535
	2536	/*
	2537	=for apidoc_section String Handling
	2538	=for apidoc my_strnlen
	2539
	2540	The C library C<strnlen> if available, or a Perl implementation of it.
	2541
	2542	C<my_strnlen()> computes the length of the string, up to C<maxlen>
	2543	characters. It will never attempt to address more than C<maxlen>
	2544	characters, making it suitable for use with strings that are not
	2545	guaranteed to be NUL-terminated.
	2546
	2547	=cut
	2548
	2549	Description stolen from http://man.openbsd.org/strnlen.3,
	2550	implementation stolen from PostgreSQL.
	2551	*/
	2552	#ifndef HAS_STRNLEN
	2553
	2554	PERL_STATIC_INLINE Size_t
	2555	Perl_my_strnlen(const char *str, Size_t maxlen)
	2556	{
	2557	const char end = (char ) memchr(str, '\0', maxlen);
	2558
	2559	PERL_ARGS_ASSERT_MY_STRNLEN;
	2560
	2561	if (end == NULL) return maxlen;
	2562	return end - str;
	2563	}
	2564
	2565	#endif
	2566
	2567	#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) \|\| defined(PERL_EXT))
	2568
	2569	PERL_STATIC_INLINE void *
	2570	S_my_memrchr(const char * s, const char c, const STRLEN len)
	2571	{
	2572	/* memrchr(), since many platforms lack it */
	2573
	2574	const char * t = s + len - 1;
	2575
	2576	PERL_ARGS_ASSERT_MY_MEMRCHR;
	2577
	2578	while (t >= s) {
	2579	if (*t == c) {
	2580	return (void *) t;
	2581	}
	2582	t--;
	2583	}
	2584
	2585	return NULL;
	2586	}
	2587
	2588	#endif
	2589
	2590	PERL_STATIC_INLINE char *
	2591	Perl_mortal_getenv(const char * str)
	2592	{
	2593	/* This implements a (mostly) thread-safe, sequential-call-safe getenv().
	2594	*
	2595	* It's (mostly) thread-safe because it uses a mutex to prevent
	2596	* simultaneous access from other threads that use the same mutex, and
	2597	* makes a copy of the result before releasing that mutex. All of the Perl
	2598	* core uses that mutex, but, like all mutexes, everything has to cooperate
	2599	* for it to completely work. It is possible for code from, say XS, to not
	2600	* use this mutex, defeating the safety.
	2601	*
	2602	* On some platforms, getenv() is not sequential-call-safe, because
	2603	* subsequent calls destroy the static storage inside the C library
	2604	* returned by an earlier call. The result must be copied or completely
	2605	* acted upon before a subsequent getenv call. Those calls could come from
	2606	* another thread. Again, making a copy while controlling the mutex
	2607	* prevents these problems..
	2608	*
	2609	* To prevent leaks, the copy is made by creating a new SV containing it,
	2610	* mortalizing the SV, and returning the SV's string (the copy). Thus this
	2611	* is a drop-in replacement for getenv().
	2612	*
	2613	* A complication is that this can be called during phases where the
	2614	* mortalization process isn't available. These are in interpreter
	2615	* destruction or early in construction. khw believes that at these times
	2616	* there shouldn't be anything else going on, so plain getenv is safe AS
	2617	* LONG AS the caller acts on the return before calling it again. */
	2618
	2619	char * ret;
	2620	dTHX;
	2621
	2622	PERL_ARGS_ASSERT_MORTAL_GETENV;
	2623
	2624	/* Can't mortalize without stacks. khw believes that no other threads
	2625	* should be running, so no need to lock things, and this may be during a
	2626	* phase when locking isn't even available */
	2627	if (UNLIKELY(PL_scopestack_ix == 0)) {
	2628	return getenv(str);
	2629	}
	2630
	2631	ENV_LOCK;
	2632
	2633	ret = getenv(str);
	2634
	2635	if (ret != NULL) {
	2636	ret = SvPVX(sv_2mortal(newSVpv(ret, 0)));
	2637	}
	2638
	2639	ENV_UNLOCK;
	2640	return ret;
	2641	}
	2642
	2643	/*
	2644	* ex: set ts=8 sts=4 sw=4 et:
	2645	*/