perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
	17	*
	18	* 'Well do I understand your speech,' he answered in the same language;
	19	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	20	* as is the custom in the West, if you wish to be answered?'
	21	* --Gandalf, addressing Théoden's door wardens
	22	*
	23	* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	24	*
	25	* ...the travellers perceived that the floor was paved with stones of many
	26	* hues; branching runes and strange devices intertwined beneath their feet.
	27	*
	28	* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	29	*/
	30
	31	#include "EXTERN.h"
	32	#define PERL_IN_UTF8_C
	33	#include "perl.h"
	34	#include "invlist_inline.h"
	35
	36	static const char malformed_text[] = "Malformed UTF-8 character";
	37	static const char unees[] =
	38	"Malformed UTF-8 character (unexpected end of string)";
	39	static const char cp_above_legal_max[] =
	40	"Use of code point 0x%" UVXf " is not allowed; the"
	41	" permissible max is 0x%" UVXf;
	42
	43	#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
	44
	45	/*
	46	=head1 Unicode Support
	47	These are various utility functions for manipulating UTF8-encoded
	48	strings. For the uninitiated, this is a method of representing arbitrary
	49	Unicode characters as a variable number of bytes, in such a way that
	50	characters in the ASCII range are unmodified, and a zero byte never appears
	51	within non-zero characters.
	52
	53	=cut
	54	*/
	55
	56	void
	57	Perl__force_out_malformed_utf8_message(pTHX_
	58	const U8 const p, / First byte in UTF-8 sequence */
	59	const U8 * const e, /* Final byte in sequence (may include
	60	multiple chars */
	61	const U32 flags, /* Flags to pass to utf8n_to_uvchr(),
	62	usually 0, or some DISALLOW flags */
	63	const bool die_here) /* If TRUE, this function does not return */
	64	{
	65	/* This core-only function is to be called when a malformed UTF-8 character
	66	* is found, in order to output the detailed information about the
	67	* malformation before dieing. The reason it exists is for the occasions
	68	* when such a malformation is fatal, but warnings might be turned off, so
	69	* that normally they would not be actually output. This ensures that they
	70	* do get output. Because a sequence may be malformed in more than one
	71	* way, multiple messages may be generated, so we can't make them fatal, as
	72	* that would cause the first one to die.
	73	*
	74	* Instead we pretend -W was passed to perl, then die afterwards. The
	75	* flexibility is here to return to the caller so they can finish up and
	76	* die themselves */
	77	U32 errors;
	78
	79	PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
	80
	81	ENTER;
	82	SAVEI8(PL_dowarn);
	83	SAVESPTR(PL_curcop);
	84
	85	PL_dowarn = G_WARN_ALL_ON\|G_WARN_ON;
	86	if (PL_curcop) {
	87	PL_curcop->cop_warnings = pWARN_ALL;
	88	}
	89
	90	(void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
	91
	92	LEAVE;
	93
	94	if (! errors) {
	95	Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
	96	" be called only when there are errors found");
	97	}
	98
	99	if (die_here) {
	100	Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
	101	}
	102	}
	103
	104	/*
	105	=for apidoc uvoffuni_to_utf8_flags
	106
	107	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	108	Instead, B<Almost all code should use L</uvchr_to_utf8> or
	109	L</uvchr_to_utf8_flags>>.
	110
	111	This function is like them, but the input is a strict Unicode
	112	(as opposed to native) code point. Only in very rare circumstances should code
	113	not be using the native code point.
	114
	115	For details, see the description for L</uvchr_to_utf8_flags>.
	116
	117	=cut
	118	*/
	119
	120	/* All these formats take a single UV code point argument */
	121	const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
	122	const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
	123	" is not recommended for open interchange";
	124	const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
	125	" may not be portable";
	126	const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \
	127	" Unicode, requires a Perl extension," \
	128	" and so is not portable";
	129
	130	#define HANDLE_UNICODE_SURROGATE(uv, flags) \
	131	STMT_START { \
	132	if (flags & UNICODE_WARN_SURROGATE) { \
	133	Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
	134	surrogate_cp_format, uv); \
	135	} \
	136	if (flags & UNICODE_DISALLOW_SURROGATE) { \
	137	return NULL; \
	138	} \
	139	} STMT_END;
	140
	141	#define HANDLE_UNICODE_NONCHAR(uv, flags) \
	142	STMT_START { \
	143	if (flags & UNICODE_WARN_NONCHAR) { \
	144	Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
	145	nonchar_cp_format, uv); \
	146	} \
	147	if (flags & UNICODE_DISALLOW_NONCHAR) { \
	148	return NULL; \
	149	} \
	150	} STMT_END;
	151
	152	/* Use shorter names internally in this file */
	153	#define SHIFT UTF_ACCUMULATION_SHIFT
	154	#undef MARK
	155	#define MARK UTF_CONTINUATION_MARK
	156	#define MASK UTF_CONTINUATION_MASK
	157
	158	U8 *
	159	Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
	160	{
	161	PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
	162
	163	if (OFFUNI_IS_INVARIANT(uv)) {
	164	*d++ = LATIN1_TO_NATIVE(uv);
	165	return d;
	166	}
	167
	168	if (uv <= MAX_UTF8_TWO_BYTE) {
	169	*d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) \| UTF_START_MARK(2));
	170	*d++ = I8_TO_NATIVE_UTF8(( uv & MASK) \| MARK);
	171	return d;
	172	}
	173
	174	/* Not 2-byte; test for and handle 3-byte result. In the test immediately
	175	* below, the 16 is for start bytes E0-EF (which are all the possible ones
	176	* for 3 byte characters). The 2 is for 2 continuation bytes; these each
	177	* contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
	178	* on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
	179	* 0x800-0xFFFF on ASCII */
	180	if (uv < (16 * (1U << (2 * SHIFT)))) {
	181	d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) SHIFT)) \| UTF_START_MARK(3));
	182	d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) SHIFT)) & MASK) \| MARK);
	183	d++ = I8_TO_NATIVE_UTF8(( uv / (1 - 1) */ & MASK) \| MARK);
	184
	185	#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
	186	aren't tested here */
	187	/* The most likely code points in this range are below the surrogates.
	188	* Do an extra test to quickly exclude those. */
	189	if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
	190	if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
	191	\|\| UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
	192	{
	193	HANDLE_UNICODE_NONCHAR(uv, flags);
	194	}
	195	else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
	196	HANDLE_UNICODE_SURROGATE(uv, flags);
	197	}
	198	}
	199	#endif
	200	return d;
	201	}
	202
	203	/* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
	204	* platforms, and 0x4000 on EBCDIC. There are problematic cases that can
	205	* happen starting with 4-byte characters on ASCII platforms. We unify the
	206	* code for these with EBCDIC, even though some of them require 5-bytes on
	207	* those, because khw believes the code saving is worth the very slight
	208	* performance hit on these high EBCDIC code points. */
	209
	210	if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
	211	if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
	212	Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
	213	}
	214	if ( (flags & UNICODE_WARN_SUPER)
	215	\|\| ( (flags & UNICODE_WARN_PERL_EXTENDED)
	216	&& UNICODE_IS_PERL_EXTENDED(uv)))
	217	{
	218	Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
	219
	220	/* Choose the more dire applicable warning */
	221	(UNICODE_IS_PERL_EXTENDED(uv))
	222	? perl_extended_cp_format
	223	: super_cp_format,
	224	uv);
	225	}
	226	if ( (flags & UNICODE_DISALLOW_SUPER)
	227	\|\| ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
	228	&& UNICODE_IS_PERL_EXTENDED(uv)))
	229	{
	230	return NULL;
	231	}
	232	}
	233	else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
	234	HANDLE_UNICODE_NONCHAR(uv, flags);
	235	}
	236
	237	/* Test for and handle 4-byte result. In the test immediately below, the
	238	* 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
	239	* characters). The 3 is for 3 continuation bytes; these each contribute
	240	* SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
	241	* ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
	242	* 0x1_0000-0x1F_FFFF on ASCII */
	243	if (uv < (8 * (1U << (3 * SHIFT)))) {
	244	d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) SHIFT)) \| UTF_START_MARK(4));
	245	d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) SHIFT)) & MASK) \| MARK);
	246	d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) SHIFT)) & MASK) \| MARK);
	247	d++ = I8_TO_NATIVE_UTF8(( uv / (1 - 1) */ & MASK) \| MARK);
	248
	249	#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
	250	characters. The end-plane non-characters for EBCDIC were
	251	handled just above */
	252	if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
	253	HANDLE_UNICODE_NONCHAR(uv, flags);
	254	}
	255	else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
	256	HANDLE_UNICODE_SURROGATE(uv, flags);
	257	}
	258	#endif
	259
	260	return d;
	261	}
	262
	263	/* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
	264	* platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
	265	* format. The unrolled version above turns out to not save all that much
	266	* time, and at these high code points (well above the legal Unicode range
	267	* on ASCII platforms, and well above anything in common use in EBCDIC),
	268	* khw believes that less code outweighs slight performance gains. */
	269
	270	{
	271	STRLEN len = OFFUNISKIP(uv);
	272	U8 *p = d+len-1;
	273	while (p > d) {
	274	*p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	275	uv >>= UTF_ACCUMULATION_SHIFT;
	276	}
	277	*p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	278	return d+len;
	279	}
	280	}
	281
	282	/*
	283	=for apidoc uvchr_to_utf8
	284
	285	Adds the UTF-8 representation of the native code point C<uv> to the end
	286	of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
	287	C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
	288	the byte after the end of the new character. In other words,
	289
	290	d = uvchr_to_utf8(d, uv);
	291
	292	is the recommended wide native character-aware way of saying
	293
	294	*(d++) = uv;
	295
	296	This function accepts any code point from 0..C<IV_MAX> as input.
	297	C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
	298
	299	It is possible to forbid or warn on non-Unicode code points, or those that may
	300	be problematic by using L</uvchr_to_utf8_flags>.
	301
	302	=cut
	303	*/
	304
	305	/* This is also a macro */
	306	PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
	307
	308	U8 *
	309	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	310	{
	311	return uvchr_to_utf8(d, uv);
	312	}
	313
	314	/*
	315	=for apidoc uvchr_to_utf8_flags
	316
	317	Adds the UTF-8 representation of the native code point C<uv> to the end
	318	of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
	319	C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
	320	the byte after the end of the new character. In other words,
	321
	322	d = uvchr_to_utf8_flags(d, uv, flags);
	323
	324	or, in most cases,
	325
	326	d = uvchr_to_utf8_flags(d, uv, 0);
	327
	328	This is the Unicode-aware way of saying
	329
	330	*(d++) = uv;
	331
	332	If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
	333	input. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
	334
	335	Specifying C<flags> can further restrict what is allowed and not warned on, as
	336	follows:
	337
	338	If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
	339	the function will raise a warning, provided UTF8 warnings are enabled. If
	340	instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
	341	NULL. If both flags are set, the function will both warn and return NULL.
	342
	343	Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
	344	affect how the function handles a Unicode non-character.
	345
	346	And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
	347	affect the handling of code points that are above the Unicode maximum of
	348	0x10FFFF. Languages other than Perl may not be able to accept files that
	349	contain these.
	350
	351	The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
	352	the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
	353	three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
	354	allowed inputs to the strict UTF-8 traditionally defined by Unicode.
	355	Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
	356	C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
	357	above-Unicode and surrogate flags, but not the non-character ones, as
	358	defined in
	359	L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
	360	See L<perlunicode/Noncharacter code points>.
	361
	362	Extremely high code points were never specified in any standard, and require an
	363	extension to UTF-8 to express, which Perl does. It is likely that programs
	364	written in something other than Perl would not be able to read files that
	365	contain these; nor would Perl understand files written by something that uses a
	366	different extension. For these reasons, there is a separate set of flags that
	367	can warn and/or disallow these extremely high code points, even if other
	368	above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
	369	and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
	370	L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
	371	treat all above-Unicode code points, including these, as malformations. (Note
	372	that the Unicode standard considers anything above 0x10FFFF to be illegal, but
	373	there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
	374
	375	A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
	376	retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly,
	377	C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
	378	C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because these
	379	flags can apply to code points that actually do fit in 31 bits. This happens
	380	on EBCDIC platforms, and sometimes when the L<overlong
	381	malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
	382	describe the situation in all cases.
	383
	384	=cut
	385	*/
	386
	387	/* This is also a macro */
	388	PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
	389
	390	U8 *
	391	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	392	{
	393	return uvchr_to_utf8_flags(d, uv, flags);
	394	}
	395
	396	#ifndef UV_IS_QUAD
	397
	398	STATIC int
	399	S_is_utf8_cp_above_31_bits(const U8 * const s,
	400	const U8 * const e,
	401	const bool consider_overlongs)
	402	{
	403	/* Returns TRUE if the first code point represented by the Perl-extended-
	404	* UTF-8-encoded string starting at 's', and looking no further than 'e -
	405	* 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
	406	*
	407	* The function handles the case where the input bytes do not include all
	408	* the ones necessary to represent a full character. That is, they may be
	409	* the intial bytes of the representation of a code point, but possibly
	410	* the final ones necessary for the complete representation may be beyond
	411	* 'e - 1'.
	412	*
	413	* The function also can handle the case where the input is an overlong
	414	* sequence. If 'consider_overlongs' is 0, the function assumes the
	415	* input is not overlong, without checking, and will return based on that
	416	* assumption. If this parameter is 1, the function will go to the trouble
	417	* of figuring out if it actually evaluates to above or below 31 bits.
	418	*
	419	* The sequence is otherwise assumed to be well-formed, without checking.
	420	*/
	421
	422	const STRLEN len = e - s;
	423	int is_overlong;
	424
	425	PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
	426
	427	assert(! UTF8_IS_INVARIANT(*s) && e > s);
	428
	429	#ifdef EBCDIC
	430
	431	PERL_UNUSED_ARG(consider_overlongs);
	432
	433	/* On the EBCDIC code pages we handle, only the native start byte 0xFE can
	434	* mean a 32-bit or larger code point (0xFF is an invariant). 0xFE can
	435	* also be the start byte for a 31-bit code point; we need at least 2
	436	* bytes, and maybe up through 8 bytes, to determine that. (It can also be
	437	* the start byte for an overlong sequence, but for 30-bit or smaller code
	438	* points, so we don't have to worry about overlongs on EBCDIC.) */
	439	if (*s != 0xFE) {
	440	return 0;
	441	}
	442
	443	if (len == 1) {
	444	return -1;
	445	}
	446
	447	#else
	448
	449	/* On ASCII, FE and FF are the only start bytes that can evaluate to
	450	* needing more than 31 bits. */
	451	if (LIKELY(*s < 0xFE)) {
	452	return 0;
	453	}
	454
	455	/* What we have left are FE and FF. Both of these require more than 31
	456	* bits unless they are for overlongs. */
	457	if (! consider_overlongs) {
	458	return 1;
	459	}
	460
	461	/* Here, we have FE or FF. If the input isn't overlong, it evaluates to
	462	* above 31 bits. But we need more than one byte to discern this, so if
	463	* passed just the start byte, it could be an overlong evaluating to
	464	* smaller */
	465	if (len == 1) {
	466	return -1;
	467	}
	468
	469	/* Having excluded len==1, and knowing that FE and FF are both valid start
	470	* bytes, we can call the function below to see if the sequence is
	471	* overlong. (We don't need the full generality of the called function,
	472	* but for these huge code points, speed shouldn't be a consideration, and
	473	* the compiler does have enough information, since it's static to this
	474	* file, to optimize to just the needed parts.) */
	475	is_overlong = is_utf8_overlong_given_start_byte_ok(s, len);
	476
	477	/* If it isn't overlong, more than 31 bits are required. */
	478	if (is_overlong == 0) {
	479	return 1;
	480	}
	481
	482	/* If it is indeterminate if it is overlong, return that */
	483	if (is_overlong < 0) {
	484	return -1;
	485	}
	486
	487	/* Here is overlong. Such a sequence starting with FE is below 31 bits, as
	488	* the max it can be is 2*31 - 1 /
	489	if (*s == 0xFE) {
	490	return 0;
	491	}
	492
	493	#endif
	494
	495	/* Here, ASCII and EBCDIC rejoin:
	496	* On ASCII: We have an overlong sequence starting with FF
	497	* On EBCDIC: We have a sequence starting with FE. */
	498
	499	{ /* For C89, use a block so the declaration can be close to its use */
	500
	501	#ifdef EBCDIC
	502
	503	/* U+7FFFFFFF (2 ** 31 - 1)
	504	* [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
	505	* IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
	506	* IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
	507	* POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
	508	* I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
	509	* U+80000000 (2 ** 31):
	510	* IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
	511	* IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
	512	* POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
	513	* I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
	514	*
	515	* and since we know that *s = \xfe, any continuation sequcence
	516	* following it that is gt the below is above 31 bits
	517	[0] [1] [2] [3] [4] [5] [6] */
	518	const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
	519
	520	#else
	521
	522	/* FF overlong for U+7FFFFFFF (2 ** 31 - 1)
	523	* ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF
	524	* FF overlong for U+80000000 (2 ** 31):
	525	* ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80
	526	* and since we know that *s = \xff, any continuation sequcence
	527	* following it that is gt the below is above 30 bits
	528	[0] [1] [2] [3] [4] [5] [6] */
	529	const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
	530
	531
	532	#endif
	533	const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
	534	const STRLEN cmp_len = MIN(conts_len, len - 1);
	535
	536	/* Now compare the continuation bytes in s with the ones we have
	537	* compiled in that are for the largest 30 bit code point. If we have
	538	* enough bytes available to determine the answer, or the bytes we do
	539	* have differ from them, we can compare the two to get a definitive
	540	* answer (Note that in UTF-EBCDIC, the two lowest possible
	541	* continuation bytes are \x41 and \x42.) */
	542	if (cmp_len >= conts_len \|\| memNE(s + 1,
	543	conts_for_highest_30_bit,
	544	cmp_len))
	545	{
	546	return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len));
	547	}
	548
	549	/* Here, all the bytes we have are the same as the highest 30-bit code
	550	* point, but we are missing so many bytes that we can't make the
	551	* determination */
	552	return -1;
	553	}
	554	}
	555
	556	#endif
	557
	558	PERL_STATIC_INLINE int
	559	S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
	560	{
	561	/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
	562	* 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
	563	* it isn't, and -1 if there isn't enough information to tell. This last
	564	* return value can happen if the sequence is incomplete, missing some
	565	* trailing bytes that would form a complete character. If there are
	566	* enough bytes to make a definitive decision, this function does so.
	567	* Usually 2 bytes sufficient.
	568	*
	569	* Overlongs can occur whenever the number of continuation bytes changes.
	570	* That means whenever the number of leading 1 bits in a start byte
	571	* increases from the next lower start byte. That happens for start bytes
	572	* C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following illegal
	573	* start bytes have already been excluded, so don't need to be tested here;
	574	* ASCII platforms: C0, C1
	575	* EBCDIC platforms C0, C1, C2, C3, C4, E0
	576	*/
	577
	578	const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
	579	const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
	580
	581	PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
	582	assert(len > 1 && UTF8_IS_START(*s));
	583
	584	/* Each platform has overlongs after the start bytes given above (expressed
	585	* in I8 for EBCDIC). What constitutes an overlong varies by platform, but
	586	* the logic is the same, except the E0 overlong has already been excluded
	587	* on EBCDIC platforms. The values below were found by manually
	588	* inspecting the UTF-8 patterns. See the tables in utf8.h and
	589	* utfebcdic.h. */
	590
	591	# ifdef EBCDIC
	592	# define F0_ABOVE_OVERLONG 0xB0
	593	# define F8_ABOVE_OVERLONG 0xA8
	594	# define FC_ABOVE_OVERLONG 0xA4
	595	# define FE_ABOVE_OVERLONG 0xA2
	596	# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
	597	/* I8(0xfe) is FF */
	598	# else
	599
	600	if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
	601	return 1;
	602	}
	603
	604	# define F0_ABOVE_OVERLONG 0x90
	605	# define F8_ABOVE_OVERLONG 0x88
	606	# define FC_ABOVE_OVERLONG 0x84
	607	# define FE_ABOVE_OVERLONG 0x82
	608	# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
	609	# endif
	610
	611
	612	if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
	613	\|\| (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
	614	\|\| (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
	615	\|\| (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
	616	{
	617	return 1;
	618	}
	619
	620	/* Check for the FF overlong */
	621	return isFF_OVERLONG(s, len);
	622	}
	623
	624	PERL_STATIC_INLINE int
	625	S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
	626	{
	627	/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
	628	* 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if
	629	* it isn't, and -1 if there isn't enough information to tell. This last
	630	* return value can happen if the sequence is incomplete, missing some
	631	* trailing bytes that would form a complete character. If there are
	632	* enough bytes to make a definitive decision, this function does so. */
	633
	634	PERL_ARGS_ASSERT_ISFF_OVERLONG;
	635
	636	/* To be an FF overlong, all the available bytes must match */
	637	if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
	638	MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
	639	{
	640	return 0;
	641	}
	642
	643	/* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
	644	* be there; what comes after them doesn't matter. See tables in utf8.h,
	645	* utfebcdic.h. */
	646	if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
	647	return 1;
	648	}
	649
	650	/* The missing bytes could cause the result to go one way or the other, so
	651	* the result is indeterminate */
	652	return -1;
	653	}
	654
	655	#if defined(UV_IS_QUAD) /* These assume IV_MAX is 2*63-1 /
	656	# ifdef EBCDIC /* Actually is I8 */
	657	# define HIGHEST_REPRESENTABLE_UTF8 \
	658	"\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	659	# else
	660	# define HIGHEST_REPRESENTABLE_UTF8 \
	661	"\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	662	# endif
	663	#endif
	664
	665	PERL_STATIC_INLINE int
	666	S_does_utf8_overflow(const U8 * const s,
	667	const U8 * e,
	668	const bool consider_overlongs)
	669	{
	670	/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
	671	* 'e' - 1 would overflow an IV on this platform; that is if it represents
	672	* a code point larger than the highest representable code point. It
	673	* returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
	674	* enough information to tell. This last return value can happen if the
	675	* sequence is incomplete, missing some trailing bytes that would form a
	676	* complete character. If there are enough bytes to make a definitive
	677	* decision, this function does so.
	678	*
	679	* If 'consider_overlongs' is TRUE, the function checks for the possibility
	680	* that the sequence is an overlong that doesn't overflow. Otherwise, it
	681	* assumes the sequence is not an overlong. This can give different
	682	* results only on ASCII 32-bit platforms.
	683	*
	684	* (For ASCII platforms, we could use memcmp() because we don't have to
	685	* convert each byte to I8, but it's very rare input indeed that would
	686	* approach overflow, so the loop below will likely only get executed once.)
	687	*
	688	* 'e' - 1 must not be beyond a full character. */
	689
	690
	691	PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
	692	assert(s <= e && s + UTF8SKIP(s) >= e);
	693
	694	#if ! defined(UV_IS_QUAD)
	695
	696	return is_utf8_cp_above_31_bits(s, e, consider_overlongs);
	697
	698	#else
	699
	700	PERL_UNUSED_ARG(consider_overlongs);
	701
	702	{
	703	const STRLEN len = e - s;
	704	const U8 *x;
	705	const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
	706
	707	for (x = s; x < e; x++, y++) {
	708
	709	if (UNLIKELY(NATIVE_UTF8_TO_I8(x) == y)) {
	710	continue;
	711	}
	712
	713	/* If this byte is larger than the corresponding highest UTF-8
	714	* byte, the sequence overflow; otherwise the byte is less than,
	715	* and so the sequence doesn't overflow */
	716	return NATIVE_UTF8_TO_I8(x) > y;
	717
	718	}
	719
	720	/* Got to the end and all bytes are the same. If the input is a whole
	721	* character, it doesn't overflow. And if it is a partial character,
	722	* there's not enough information to tell */
	723	if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
	724	return -1;
	725	}
	726
	727	return 0;
	728	}
	729
	730	#endif
	731
	732	}
	733
	734	#if 0
	735
	736	/* This is the portions of the above function that deal with UV_MAX instead of
	737	* IV_MAX. They are left here in case we want to combine them so that internal
	738	* uses can have larger code points. The only logic difference is that the
	739	* 32-bit EBCDIC platform is treated like the 64-bit, and the 32-bit ASCII has
	740	* different logic.
	741	*/
	742
	743	/* Anything larger than this will overflow the word if it were converted into a UV */
	744	#if defined(UV_IS_QUAD)
	745	# ifdef EBCDIC /* Actually is I8 */
	746	# define HIGHEST_REPRESENTABLE_UTF8 \
	747	"\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	748	# else
	749	# define HIGHEST_REPRESENTABLE_UTF8 \
	750	"\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	751	# endif
	752	#else /* 32-bit */
	753	# ifdef EBCDIC
	754	# define HIGHEST_REPRESENTABLE_UTF8 \
	755	"\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
	756	# else
	757	# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
	758	# endif
	759	#endif
	760
	761	#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
	762
	763	/* On 32 bit ASCII machines, many overlongs that start with FF don't
	764	* overflow */
	765	if (consider_overlongs && isFF_OVERLONG(s, len) > 0) {
	766
	767	/* To be such an overlong, the first bytes of 's' must match
	768	* FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80". If we
	769	* don't have any additional bytes available, the sequence, when
	770	* completed might or might not fit in 32 bits. But if we have that
	771	* next byte, we can tell for sure. If it is <= 0x83, then it does
	772	* fit. */
	773	if (len <= sizeof(FF_OVERLONG_PREFIX) - 1) {
	774	return -1;
	775	}
	776
	777	return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83;
	778	}
	779
	780	/* Starting with the #else, the rest of the function is identical except
	781	* 1. we need to move the 'len' declaration to be global to the function
	782	* 2. the endif move to just after the UNUSED_ARG.
	783	* An empty endif is given just below to satisfy the preprocessor
	784	*/
	785	#endif
	786
	787	#endif
	788
	789	#undef F0_ABOVE_OVERLONG
	790	#undef F8_ABOVE_OVERLONG
	791	#undef FC_ABOVE_OVERLONG
	792	#undef FE_ABOVE_OVERLONG
	793	#undef FF_OVERLONG_PREFIX
	794
	795	STRLEN
	796	Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
	797	{
	798	STRLEN len;
	799	const U8 *x;
	800
	801	/* A helper function that should not be called directly.
	802	*
	803	* This function returns non-zero if the string beginning at 's' and
	804	* looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
	805	* code point; otherwise it returns 0. The examination stops after the
	806	* first code point in 's' is validated, not looking at the rest of the
	807	* input. If 'e' is such that there are not enough bytes to represent a
	808	* complete code point, this function will return non-zero anyway, if the
	809	* bytes it does have are well-formed UTF-8 as far as they go, and aren't
	810	* excluded by 'flags'.
	811	*
	812	* A non-zero return gives the number of bytes required to represent the
	813	* code point. Be aware that if the input is for a partial character, the
	814	* return will be larger than 'e - s'.
	815	*
	816	* This function assumes that the code point represented is UTF-8 variant.
	817	* The caller should have excluded the possibility of it being invariant
	818	* before calling this function.
	819	*
	820	* 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
	821	* accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
	822	* 0 if the code point represented is well-formed Perl-extended-UTF-8, but
	823	* disallowed by the flags. If the input is only for a partial character,
	824	* the function will return non-zero if there is any sequence of
	825	* well-formed UTF-8 that, when appended to the input sequence, could
	826	* result in an allowed code point; otherwise it returns 0. Non characters
	827	* cannot be determined based on partial character input. But many of the
	828	* other excluded types can be determined with just the first one or two
	829	* bytes.
	830	*
	831	*/
	832
	833	PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
	834
	835	assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
	836	\|UTF8_DISALLOW_PERL_EXTENDED)));
	837	assert(! UTF8_IS_INVARIANT(*s));
	838
	839	/* A variant char must begin with a start byte */
	840	if (UNLIKELY(! UTF8_IS_START(*s))) {
	841	return 0;
	842	}
	843
	844	/* Examine a maximum of a single whole code point */
	845	if (e - s > UTF8SKIP(s)) {
	846	e = s + UTF8SKIP(s);
	847	}
	848
	849	len = e - s;
	850
	851	if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
	852	const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
	853
	854	/* Here, we are disallowing some set of largish code points, and the
	855	* first byte indicates the sequence is for a code point that could be
	856	* in the excluded set. We generally don't have to look beyond this or
	857	* the second byte to see if the sequence is actually for one of the
	858	* excluded classes. The code below is derived from this table:
	859	*
	860	* UTF-8 UTF-EBCDIC I8
	861	* U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
	862	* U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
	863	* U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
	864	*
	865	* Keep in mind that legal continuation bytes range between \x80..\xBF
	866	* for UTF-8, and \xA0..\xBF for I8. Anything above those aren't
	867	* continuation bytes. Hence, we don't have to test the upper edge
	868	* because if any of those is encountered, the sequence is malformed,
	869	* and would fail elsewhere in this function.
	870	*
	871	* The code here likewise assumes that there aren't other
	872	* malformations; again the function should fail elsewhere because of
	873	* these. For example, an overlong beginning with FC doesn't actually
	874	* have to be a super; it could actually represent a small code point,
	875	* even U+0000. But, since overlongs (and other malformations) are
	876	* illegal, the function should return FALSE in either case.
	877	*/
	878
	879	#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
	880	# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
	881	# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
	882
	883	# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
	884	/* B6 and B7 */ \
	885	&& ((s1) & 0xFE ) == 0xB6)
	886	# define isUTF8_PERL_EXTENDED(s) (*s == I8_TO_NATIVE_UTF8(0xFF))
	887	#else
	888	# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
	889	# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
	890	# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
	891	# define isUTF8_PERL_EXTENDED(s) (*s >= 0xFE)
	892	#endif
	893
	894	if ( (flags & UTF8_DISALLOW_SUPER)
	895	&& UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
	896	{
	897	return 0; /* Above Unicode */
	898	}
	899
	900	if ( (flags & UTF8_DISALLOW_PERL_EXTENDED)
	901	&& UNLIKELY(isUTF8_PERL_EXTENDED(s)))
	902	{
	903	return 0;
	904	}
	905
	906	if (len > 1) {
	907	const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
	908
	909	if ( (flags & UTF8_DISALLOW_SUPER)
	910	&& UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
	911	{
	912	return 0; /* Above Unicode */
	913	}
	914
	915	if ( (flags & UTF8_DISALLOW_SURROGATE)
	916	&& UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
	917	{
	918	return 0; /* Surrogate */
	919	}
	920
	921	if ( (flags & UTF8_DISALLOW_NONCHAR)
	922	&& UNLIKELY(UTF8_IS_NONCHAR(s, e)))
	923	{
	924	return 0; /* Noncharacter code point */
	925	}
	926	}
	927	}
	928
	929	/* Make sure that all that follows are continuation bytes */
	930	for (x = s + 1; x < e; x++) {
	931	if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
	932	return 0;
	933	}
	934	}
	935
	936	/* Here is syntactically valid. Next, make sure this isn't the start of an
	937	* overlong. */
	938	if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
	939	return 0;
	940	}
	941
	942	/* And finally, that the code point represented fits in a word on this
	943	* platform */
	944	if (0 < does_utf8_overflow(s, e,
	945	0 /* Don't consider overlongs */
	946	))
	947	{
	948	return 0;
	949	}
	950
	951	return UTF8SKIP(s);
	952	}
	953
	954	char *
	955	Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
	956	{
	957	/* Returns a mortalized C string that is a displayable copy of the 'len'
	958	* bytes starting at 's'. 'format' gives how to display each byte.
	959	* Currently, there are only two formats, so it is currently a bool:
	960	* 0 \xab
	961	* 1 ab (that is a space between two hex digit bytes)
	962	*/
	963
	964	const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
	965	trailing NUL */
	966	const U8 * const e = s + len;
	967	char * output;
	968	char * d;
	969
	970	PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
	971
	972	Newx(output, output_len, char);
	973	SAVEFREEPV(output);
	974
	975	d = output;
	976	for (; s < e; s++) {
	977	const unsigned high_nibble = (*s & 0xF0) >> 4;
	978	const unsigned low_nibble = (*s & 0x0F);
	979
	980	if (format) {
	981	*d++ = ' ';
	982	}
	983	else {
	984	*d++ = '\\';
	985	*d++ = 'x';
	986	}
	987
	988	if (high_nibble < 10) {
	989	*d++ = high_nibble + '0';
	990	}
	991	else {
	992	*d++ = high_nibble - 10 + 'a';
	993	}
	994
	995	if (low_nibble < 10) {
	996	*d++ = low_nibble + '0';
	997	}
	998	else {
	999	*d++ = low_nibble - 10 + 'a';
	1000	}
	1001	}
	1002
	1003	*d = '\0';
	1004	return output;
	1005	}
	1006
	1007	PERL_STATIC_INLINE char *
	1008	S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
	1009
	1010	/* How many bytes to print */
	1011	STRLEN print_len,
	1012
	1013	/* Which one is the non-continuation */
	1014	const STRLEN non_cont_byte_pos,
	1015
	1016	/* How many bytes should there be? */
	1017	const STRLEN expect_len)
	1018	{
	1019	/* Return the malformation warning text for an unexpected continuation
	1020	* byte. */
	1021
	1022	const char * const where = (non_cont_byte_pos == 1)
	1023	? "immediately"
	1024	: Perl_form(aTHX_ "%d bytes",
	1025	(int) non_cont_byte_pos);
	1026
	1027	PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
	1028
	1029	/* We don't need to pass this parameter, but since it has already been
	1030	* calculated, it's likely faster to pass it; verify under DEBUGGING */
	1031	assert(expect_len == UTF8SKIP(s));
	1032
	1033	return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
	1034	" %s after start byte 0x%02x; need %d bytes, got %d)",
	1035	malformed_text,
	1036	_byte_dump_string(s, print_len, 0),
	1037	*(s + non_cont_byte_pos),
	1038	where,
	1039	*s,
	1040	(int) expect_len,
	1041	(int) non_cont_byte_pos);
	1042	}
	1043
	1044	/*
	1045
	1046	=for apidoc utf8n_to_uvchr
	1047
	1048	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	1049	Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
	1050
	1051	Bottom level UTF-8 decode routine.
	1052	Returns the native code point value of the first character in the string C<s>,
	1053	which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
	1054	C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
	1055	the length, in bytes, of that character.
	1056
	1057	The value of C<flags> determines the behavior when C<s> does not point to a
	1058	well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
	1059	causes zero to be returned and C<retlen> is set so that (S<C<s> + C<retlen>>)
	1060	is the next possible position in C<s> that could begin a non-malformed
	1061	character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
	1062	is raised. Some UTF-8 input sequences may contain multiple malformations.
	1063	This function tries to find every possible one in each call, so multiple
	1064	warnings can be raised for the same sequence.
	1065
	1066	Various ALLOW flags can be set in C<flags> to allow (and not warn on)
	1067	individual types of malformations, such as the sequence being overlong (that
	1068	is, when there is a shorter sequence that can express the same code point;
	1069	overlong sequences are expressly forbidden in the UTF-8 standard due to
	1070	potential security issues). Another malformation example is the first byte of
	1071	a character not being a legal first byte. See F<utf8.h> for the list of such
	1072	flags. Even if allowed, this function generally returns the Unicode
	1073	REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
	1074	F<utf8.h> to override this behavior for the overlong malformations, but don't
	1075	do that except for very specialized purposes.
	1076
	1077	The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
	1078	flags) malformation is found. If this flag is set, the routine assumes that
	1079	the caller will raise a warning, and this function will silently just set
	1080	C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
	1081
	1082	Note that this API requires disambiguation between successfully decoding a C<NUL>
	1083	character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
	1084	in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
	1085	be set to 1. To disambiguate, upon a zero return, see if the first byte of
	1086	C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
	1087	error. Or you can use C<L</utf8n_to_uvchr_error>>.
	1088
	1089	Certain code points are considered problematic. These are Unicode surrogates,
	1090	Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
	1091	By default these are considered regular code points, but certain situations
	1092	warrant special handling for them, which can be specified using the C<flags>
	1093	parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
	1094	three classes are treated as malformations and handled as such. The flags
	1095	C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
	1096	C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
	1097	disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
	1098	restricts the allowed inputs to the strict UTF-8 traditionally defined by
	1099	Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
	1100	definition given by
	1101	L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
	1102	The difference between traditional strictness and C9 strictness is that the
	1103	latter does not forbid non-character code points. (They are still discouraged,
	1104	however.) For more discussion see L<perlunicode/Noncharacter code points>.
	1105
	1106	The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
	1107	C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
	1108	C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
	1109	raised for their respective categories, but otherwise the code points are
	1110	considered valid (not malformations). To get a category to both be treated as
	1111	a malformation and raise a warning, specify both the WARN and DISALLOW flags.
	1112	(But note that warnings are not raised if lexically disabled nor if
	1113	C<UTF8_CHECK_ONLY> is also specified.)
	1114
	1115	Extremely high code points were never specified in any standard, and require an
	1116	extension to UTF-8 to express, which Perl does. It is likely that programs
	1117	written in something other than Perl would not be able to read files that
	1118	contain these; nor would Perl understand files written by something that uses a
	1119	different extension. For these reasons, there is a separate set of flags that
	1120	can warn and/or disallow these extremely high code points, even if other
	1121	above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
	1122	C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
	1123	L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
	1124	above-Unicode code points, including these, as malformations.
	1125	(Note that the Unicode standard considers anything above 0x10FFFF to be
	1126	illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
	1127	(2**31 -1))
	1128
	1129	A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
	1130	retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly,
	1131	C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
	1132	C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags
	1133	can apply to code points that actually do fit in 31 bits. This happens on
	1134	EBCDIC platforms, and sometimes when the L<overlong
	1135	malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
	1136	describe the situation in all cases.
	1137
	1138
	1139	All other code points corresponding to Unicode characters, including private
	1140	use and those yet to be assigned, are never considered malformed and never
	1141	warn.
	1142
	1143	=cut
	1144
	1145	Also implemented as a macro in utf8.h
	1146	*/
	1147
	1148	UV
	1149	Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
	1150	STRLEN curlen,
	1151	STRLEN *retlen,
	1152	const U32 flags)
	1153	{
	1154	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
	1155
	1156	return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
	1157	}
	1158
	1159	/*
	1160
	1161	=for apidoc utf8n_to_uvchr_error
	1162
	1163	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	1164	Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
	1165
	1166	This function is for code that needs to know what the precise malformation(s)
	1167	are when an error is found.
	1168
	1169	It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
	1170	all the others, C<errors>. If this parameter is 0, this function behaves
	1171	identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
	1172	to a C<U32> variable, which this function sets to indicate any errors found.
	1173	Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
	1174	C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
	1175	of these bits will be set if a malformation is found, even if the input
	1176	C<flags> parameter indicates that the given malformation is allowed; those
	1177	exceptions are noted:
	1178
	1179	=over 4
	1180
	1181	=item C<UTF8_GOT_PERL_EXTENDED>
	1182
	1183	The input sequence is not standard UTF-8, but a Perl extension. This bit is
	1184	set only if the input C<flags> parameter contains either the
	1185	C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
	1186
	1187	Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
	1188	and so some extension must be used to express them. Perl uses a natural
	1189	extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
	1190	extension to represent even higher ones, so that any code point that fits in a
	1191	64-bit word can be represented. Text using these extensions is not likely to
	1192	be portable to non-Perl code. We lump both of these extensions together and
	1193	refer to them as Perl extended UTF-8. There exist other extensions that people
	1194	have invented, incompatible with Perl's.
	1195
	1196	On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
	1197	extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
	1198	than on ASCII. Prior to that, code points 2**31 and higher were simply
	1199	unrepresentable, and a different, incompatible method was used to represent
	1200	code points between 2**30 and 2**31 - 1.
	1201
	1202	On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
	1203	Perl extended UTF-8 is used.
	1204
	1205	In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
	1206	may use for backward compatibility. That name is misleading, as this flag may
	1207	be set when the code point actually does fit in 31 bits. This happens on
	1208	EBCDIC platforms, and sometimes when the L<overlong
	1209	malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately
	1210	describes the situation in all cases.
	1211
	1212	=item C<UTF8_GOT_CONTINUATION>
	1213
	1214	The input sequence was malformed in that the first byte was a UTF-8
	1215	continuation byte.
	1216
	1217	=item C<UTF8_GOT_EMPTY>
	1218
	1219	The input C<curlen> parameter was 0.
	1220
	1221	=item C<UTF8_GOT_LONG>
	1222
	1223	The input sequence was malformed in that there is some other sequence that
	1224	evaluates to the same code point, but that sequence is shorter than this one.
	1225
	1226	Until Unicode 3.1, it was legal for programs to accept this malformation, but
	1227	it was discovered that this created security issues.
	1228
	1229	=item C<UTF8_GOT_NONCHAR>
	1230
	1231	The code point represented by the input UTF-8 sequence is for a Unicode
	1232	non-character code point.
	1233	This bit is set only if the input C<flags> parameter contains either the
	1234	C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
	1235
	1236	=item C<UTF8_GOT_NON_CONTINUATION>
	1237
	1238	The input sequence was malformed in that a non-continuation type byte was found
	1239	in a position where only a continuation type one should be.
	1240
	1241	=item C<UTF8_GOT_OVERFLOW>
	1242
	1243	The input sequence was malformed in that it is for a code point that is not
	1244	representable in the number of bits available in an IV on the current platform.
	1245
	1246	=item C<UTF8_GOT_SHORT>
	1247
	1248	The input sequence was malformed in that C<curlen> is smaller than required for
	1249	a complete sequence. In other words, the input is for a partial character
	1250	sequence.
	1251
	1252	=item C<UTF8_GOT_SUPER>
	1253
	1254	The input sequence was malformed in that it is for a non-Unicode code point;
	1255	that is, one above the legal Unicode maximum.
	1256	This bit is set only if the input C<flags> parameter contains either the
	1257	C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
	1258
	1259	=item C<UTF8_GOT_SURROGATE>
	1260
	1261	The input sequence was malformed in that it is for a Unicode UTF-16 surrogate
	1262	code point.
	1263	This bit is set only if the input C<flags> parameter contains either the
	1264	C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
	1265
	1266	=back
	1267
	1268	To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
	1269	flag to suppress any warnings, and then examine the C<*errors> return.
	1270
	1271	=cut
	1272	*/
	1273
	1274	UV
	1275	Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
	1276	STRLEN curlen,
	1277	STRLEN *retlen,
	1278	const U32 flags,
	1279	U32 * errors)
	1280	{
	1281	const U8 * const s0 = s;
	1282	U8 * send = NULL; /* (initialized to silence compilers' wrong
	1283	warning) */
	1284	U32 possible_problems = 0; /* A bit is set here for each potential problem
	1285	found as we go along */
	1286	UV uv = *s;
	1287	STRLEN expectlen = 0; /* How long should this sequence be?
	1288	(initialized to silence compilers' wrong
	1289	warning) */
	1290	STRLEN avail_len = 0; /* When input is too short, gives what that is */
	1291	U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
	1292	this gets set and discarded */
	1293
	1294	/* The below are used only if there is both an overlong malformation and a
	1295	* too short one. Otherwise the first two are set to 's0' and 'send', and
	1296	* the third not used at all */
	1297	U8 * adjusted_s0 = (U8 *) s0;
	1298	U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
	1299	routine; see [perl #130921] */
	1300	UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
	1301
	1302	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
	1303
	1304	if (errors) {
	1305	*errors = 0;
	1306	}
	1307	else {
	1308	errors = &discard_errors;
	1309	}
	1310
	1311	/* The order of malformation tests here is important. We should consume as
	1312	* few bytes as possible in order to not skip any valid character. This is
	1313	* required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
	1314	* http://unicode.org/reports/tr36 for more discussion as to why. For
	1315	* example, once we've done a UTF8SKIP, we can tell the expected number of
	1316	* bytes, and could fail right off the bat if the input parameters indicate
	1317	* that there are too few available. But it could be that just that first
	1318	* byte is garbled, and the intended character occupies fewer bytes. If we
	1319	* blindly assumed that the first byte is correct, and skipped based on
	1320	* that number, we could skip over a valid input character. So instead, we
	1321	* always examine the sequence byte-by-byte.
	1322	*
	1323	* We also should not consume too few bytes, otherwise someone could inject
	1324	* things. For example, an input could be deliberately designed to
	1325	* overflow, and if this code bailed out immediately upon discovering that,
	1326	* returning to the caller C<*retlen> pointing to the very next byte (one
	1327	* which is actually part of of the overflowing sequence), that could look
	1328	* legitimate to the caller, which could discard the initial partial
	1329	* sequence and process the rest, inappropriately.
	1330	*
	1331	* Some possible input sequences are malformed in more than one way. This
	1332	* function goes to lengths to try to find all of them. This is necessary
	1333	* for correctness, as the inputs may allow one malformation but not
	1334	* another, and if we abandon searching for others after finding the
	1335	* allowed one, we could allow in something that shouldn't have been.
	1336	*/
	1337
	1338	if (UNLIKELY(curlen == 0)) {
	1339	possible_problems \|= UTF8_GOT_EMPTY;
	1340	curlen = 0;
	1341	uv = UNICODE_REPLACEMENT;
	1342	goto ready_to_handle_errors;
	1343	}
	1344
	1345	expectlen = UTF8SKIP(s);
	1346
	1347	/* A well-formed UTF-8 character, as the vast majority of calls to this
	1348	* function will be for, has this expected length. For efficiency, set
	1349	* things up here to return it. It will be overriden only in those rare
	1350	* cases where a malformation is found */
	1351	if (retlen) {
	1352	*retlen = expectlen;
	1353	}
	1354
	1355	/* An invariant is trivially well-formed */
	1356	if (UTF8_IS_INVARIANT(uv)) {
	1357	return uv;
	1358	}
	1359
	1360	/* A continuation character can't start a valid sequence */
	1361	if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
	1362	possible_problems \|= UTF8_GOT_CONTINUATION;
	1363	curlen = 1;
	1364	uv = UNICODE_REPLACEMENT;
	1365	goto ready_to_handle_errors;
	1366	}
	1367
	1368	/* Here is not a continuation byte, nor an invariant. The only thing left
	1369	* is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
	1370	* because it excludes start bytes like \xC0 that always lead to
	1371	* overlongs.) */
	1372
	1373	/* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
	1374	* that indicate the number of bytes in the character's whole UTF-8
	1375	* sequence, leaving just the bits that are part of the value. */
	1376	uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
	1377
	1378	/* Setup the loop end point, making sure to not look past the end of the
	1379	* input string, and flag it as too short if the size isn't big enough. */
	1380	send = (U8*) s0;
	1381	if (UNLIKELY(curlen < expectlen)) {
	1382	possible_problems \|= UTF8_GOT_SHORT;
	1383	avail_len = curlen;
	1384	send += curlen;
	1385	}
	1386	else {
	1387	send += expectlen;
	1388	}
	1389
	1390	/* Now, loop through the remaining bytes in the character's sequence,
	1391	* accumulating each into the working value as we go. */
	1392	for (s = s0 + 1; s < send; s++) {
	1393	if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
	1394	uv = UTF8_ACCUMULATE(uv, *s);
	1395	continue;
	1396	}
	1397
	1398	/* Here, found a non-continuation before processing all expected bytes.
	1399	* This byte indicates the beginning of a new character, so quit, even
	1400	* if allowing this malformation. */
	1401	possible_problems \|= UTF8_GOT_NON_CONTINUATION;
	1402	break;
	1403	} /* End of loop through the character's bytes */
	1404
	1405	/* Save how many bytes were actually in the character */
	1406	curlen = s - s0;
	1407
	1408	/* Note that there are two types of too-short malformation. One is when
	1409	* there is actual wrong data before the normal termination of the
	1410	* sequence. The other is that the sequence wasn't complete before the end
	1411	* of the data we are allowed to look at, based on the input 'curlen'.
	1412	* This means that we were passed data for a partial character, but it is
	1413	* valid as far as we saw. The other is definitely invalid. This
	1414	* distinction could be important to a caller, so the two types are kept
	1415	* separate.
	1416	*
	1417	* A convenience macro that matches either of the too-short conditions. */
	1418	# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT\|UTF8_GOT_NON_CONTINUATION)
	1419
	1420	if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
	1421	uv_so_far = uv;
	1422	uv = UNICODE_REPLACEMENT;
	1423	}
	1424
	1425	/* Check for overflow. The algorithm requires us to not look past the end
	1426	* of the current character, even if partial, so the upper limit is 's' */
	1427	if (UNLIKELY(0 < does_utf8_overflow(s0, s,
	1428	1 /* Do consider overlongs */
	1429	)))
	1430	{
	1431	possible_problems \|= UTF8_GOT_OVERFLOW;
	1432	uv = UNICODE_REPLACEMENT;
	1433	}
	1434
	1435	/* Check for overlong. If no problems so far, 'uv' is the correct code
	1436	* point value. Simply see if it is expressible in fewer bytes. Otherwise
	1437	* we must look at the UTF-8 byte sequence itself to see if it is for an
	1438	* overlong */
	1439	if ( ( LIKELY(! possible_problems)
	1440	&& UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
	1441	\|\| ( UNLIKELY(possible_problems)
	1442	&& ( UNLIKELY(! UTF8_IS_START(*s0))
	1443	\|\| ( curlen > 1
	1444	&& UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0,
	1445	s - s0))))))
	1446	{
	1447	possible_problems \|= UTF8_GOT_LONG;
	1448
	1449	if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
	1450
	1451	/* The calculation in the 'true' branch of this 'if'
	1452	* below won't work if overflows, and isn't needed
	1453	* anyway. Further below we handle all overflow
	1454	* cases */
	1455	&& LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
	1456	{
	1457	UV min_uv = uv_so_far;
	1458	STRLEN i;
	1459
	1460	/* Here, the input is both overlong and is missing some trailing
	1461	* bytes. There is no single code point it could be for, but there
	1462	* may be enough information present to determine if what we have
	1463	* so far is for an unallowed code point, such as for a surrogate.
	1464	* The code further below has the intelligence to determine this,
	1465	* but just for non-overlong UTF-8 sequences. What we do here is
	1466	* calculate the smallest code point the input could represent if
	1467	* there were no too short malformation. Then we compute and save
	1468	* the UTF-8 for that, which is what the code below looks at
	1469	* instead of the raw input. It turns out that the smallest such
	1470	* code point is all we need. */
	1471	for (i = curlen; i < expectlen; i++) {
	1472	min_uv = UTF8_ACCUMULATE(min_uv,
	1473	I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
	1474	}
	1475
	1476	adjusted_s0 = temp_char_buf;
	1477	(void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
	1478	}
	1479	}
	1480
	1481	/* Here, we have found all the possible problems, except for when the input
	1482	* is for a problematic code point not allowed by the input parameters. */
	1483
	1484	/* uv is valid for overlongs */
	1485	if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
	1486
	1487	/* isn't problematic if < this */
	1488	&& uv >= UNICODE_SURROGATE_FIRST)
	1489	\|\| ( UNLIKELY(possible_problems)
	1490
	1491	/* if overflow, we know without looking further
	1492	* precisely which of the problematic types it is,
	1493	* and we deal with those in the overflow handling
	1494	* code */
	1495	&& LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
	1496	&& ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
	1497	\|\| UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
	1498	&& ((flags & ( UTF8_DISALLOW_NONCHAR
	1499	\|UTF8_DISALLOW_SURROGATE
	1500	\|UTF8_DISALLOW_SUPER
	1501	\|UTF8_DISALLOW_PERL_EXTENDED
	1502	\|UTF8_WARN_NONCHAR
	1503	\|UTF8_WARN_SURROGATE
	1504	\|UTF8_WARN_SUPER
	1505	\|UTF8_WARN_PERL_EXTENDED))))
	1506	{
	1507	/* If there were no malformations, or the only malformation is an
	1508	* overlong, 'uv' is valid */
	1509	if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
	1510	if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
	1511	possible_problems \|= UTF8_GOT_SURROGATE;
	1512	}
	1513	else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
	1514	possible_problems \|= UTF8_GOT_SUPER;
	1515	}
	1516	else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
	1517	possible_problems \|= UTF8_GOT_NONCHAR;
	1518	}
	1519	}
	1520	else { /* Otherwise, need to look at the source UTF-8, possibly
	1521	adjusted to be non-overlong */
	1522
	1523	if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
	1524	>= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
	1525	{
	1526	possible_problems \|= UTF8_GOT_SUPER;
	1527	}
	1528	else if (curlen > 1) {
	1529	if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
	1530	NATIVE_UTF8_TO_I8(*adjusted_s0),
	1531	NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
	1532	{
	1533	possible_problems \|= UTF8_GOT_SUPER;
	1534	}
	1535	else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
	1536	NATIVE_UTF8_TO_I8(*adjusted_s0),
	1537	NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
	1538	{
	1539	possible_problems \|= UTF8_GOT_SURROGATE;
	1540	}
	1541	}
	1542
	1543	/* We need a complete well-formed UTF-8 character to discern
	1544	* non-characters, so can't look for them here */
	1545	}
	1546	}
	1547
	1548	ready_to_handle_errors:
	1549
	1550	/* At this point:
	1551	* curlen contains the number of bytes in the sequence that
	1552	* this call should advance the input by.
	1553	* avail_len gives the available number of bytes passed in, but
	1554	* only if this is less than the expected number of
	1555	* bytes, based on the code point's start byte.
	1556	* possible_problems' is 0 if there weren't any problems; otherwise a bit
	1557	* is set in it for each potential problem found.
	1558	* uv contains the code point the input sequence
	1559	* represents; or if there is a problem that prevents
	1560	* a well-defined value from being computed, it is
	1561	* some subsitute value, typically the REPLACEMENT
	1562	* CHARACTER.
	1563	* s0 points to the first byte of the character
	1564	* s points to just after were we left off processing
	1565	* the character
	1566	* send points to just after where that character should
	1567	* end, based on how many bytes the start byte tells
	1568	* us should be in it, but no further than s0 +
	1569	* avail_len
	1570	*/
	1571
	1572	if (UNLIKELY(possible_problems)) {
	1573	bool disallowed = FALSE;
	1574	const U32 orig_problems = possible_problems;
	1575
	1576	while (possible_problems) { /* Handle each possible problem */
	1577	UV pack_warn = 0;
	1578	char * message = NULL;
	1579
	1580	/* Each 'if' clause handles one problem. They are ordered so that
	1581	* the first ones' messages will be displayed before the later
	1582	* ones; this is kinda in decreasing severity order. But the
	1583	* overlong must come last, as it changes 'uv' looked at by the
	1584	* others */
	1585	if (possible_problems & UTF8_GOT_OVERFLOW) {
	1586
	1587	/* Overflow means also got a super and are using Perl's
	1588	* extended UTF-8, but we handle all three cases here */
	1589	possible_problems
	1590	&= ~(UTF8_GOT_OVERFLOW\|UTF8_GOT_SUPER\|UTF8_GOT_PERL_EXTENDED);
	1591	*errors \|= UTF8_GOT_OVERFLOW;
	1592
	1593	/* But the API says we flag all errors found */
	1594	if (flags & (UTF8_WARN_SUPER\|UTF8_DISALLOW_SUPER)) {
	1595	*errors \|= UTF8_GOT_SUPER;
	1596	}
	1597	if (flags
	1598	& (UTF8_WARN_PERL_EXTENDED\|UTF8_DISALLOW_PERL_EXTENDED))
	1599	{
	1600	*errors \|= UTF8_GOT_PERL_EXTENDED;
	1601	}
	1602
	1603	/* Disallow if any of the three categories say to */
	1604	if ( ! (flags & UTF8_ALLOW_OVERFLOW)
	1605	\|\| (flags & ( UTF8_DISALLOW_SUPER
	1606	\|UTF8_DISALLOW_PERL_EXTENDED)))
	1607	{
	1608	disallowed = TRUE;
	1609	}
	1610
	1611	/* Likewise, warn if any say to */
	1612	if ( ! (flags & UTF8_ALLOW_OVERFLOW)
	1613	\|\| (flags & (UTF8_WARN_SUPER\|UTF8_WARN_PERL_EXTENDED)))
	1614	{
	1615
	1616	/* The warnings code explicitly says it doesn't handle the
	1617	* case of packWARN2 and two categories which have
	1618	* parent-child relationship. Even if it works now to
	1619	* raise the warning if either is enabled, it wouldn't
	1620	* necessarily do so in the future. We output (only) the
	1621	* most dire warning */
	1622	if (! (flags & UTF8_CHECK_ONLY)) {
	1623	if (ckWARN_d(WARN_UTF8)) {
	1624	pack_warn = packWARN(WARN_UTF8);
	1625	}
	1626	else if (ckWARN_d(WARN_NON_UNICODE)) {
	1627	pack_warn = packWARN(WARN_NON_UNICODE);
	1628	}
	1629	if (pack_warn) {
	1630	message = Perl_form(aTHX_ "%s: %s (overflows)",
	1631	malformed_text,
	1632	_byte_dump_string(s0, curlen, 0));
	1633	}
	1634	}
	1635	}
	1636	}
	1637	else if (possible_problems & UTF8_GOT_EMPTY) {
	1638	possible_problems &= ~UTF8_GOT_EMPTY;
	1639	*errors \|= UTF8_GOT_EMPTY;
	1640
	1641	if (! (flags & UTF8_ALLOW_EMPTY)) {
	1642
	1643	/* This so-called malformation is now treated as a bug in
	1644	* the caller. If you have nothing to decode, skip calling
	1645	* this function */
	1646	assert(0);
	1647
	1648	disallowed = TRUE;
	1649	if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
	1650	pack_warn = packWARN(WARN_UTF8);
	1651	message = Perl_form(aTHX_ "%s (empty string)",
	1652	malformed_text);
	1653	}
	1654	}
	1655	}
	1656	else if (possible_problems & UTF8_GOT_CONTINUATION) {
	1657	possible_problems &= ~UTF8_GOT_CONTINUATION;
	1658	*errors \|= UTF8_GOT_CONTINUATION;
	1659
	1660	if (! (flags & UTF8_ALLOW_CONTINUATION)) {
	1661	disallowed = TRUE;
	1662	if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
	1663	pack_warn = packWARN(WARN_UTF8);
	1664	message = Perl_form(aTHX_
	1665	"%s: %s (unexpected continuation byte 0x%02x,"
	1666	" with no preceding start byte)",
	1667	malformed_text,
	1668	_byte_dump_string(s0, 1, 0), *s0);
	1669	}
	1670	}
	1671	}
	1672	else if (possible_problems & UTF8_GOT_SHORT) {
	1673	possible_problems &= ~UTF8_GOT_SHORT;
	1674	*errors \|= UTF8_GOT_SHORT;
	1675
	1676	if (! (flags & UTF8_ALLOW_SHORT)) {
	1677	disallowed = TRUE;
	1678	if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
	1679	pack_warn = packWARN(WARN_UTF8);
	1680	message = Perl_form(aTHX_
	1681	"%s: %s (too short; %d byte%s available, need %d)",
	1682	malformed_text,
	1683	_byte_dump_string(s0, send - s0, 0),
	1684	(int)avail_len,
	1685	avail_len == 1 ? "" : "s",
	1686	(int)expectlen);
	1687	}
	1688	}
	1689
	1690	}
	1691	else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
	1692	possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
	1693	*errors \|= UTF8_GOT_NON_CONTINUATION;
	1694
	1695	if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
	1696	disallowed = TRUE;
	1697	if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
	1698
	1699	/* If we don't know for sure that the input length is
	1700	* valid, avoid as much as possible reading past the
	1701	* end of the buffer */
	1702	int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
	1703	? s - s0
	1704	: send - s0;
	1705	pack_warn = packWARN(WARN_UTF8);
	1706	message = Perl_form(aTHX_ "%s",
	1707	unexpected_non_continuation_text(s0,
	1708	printlen,
	1709	s - s0,
	1710	(int) expectlen));
	1711	}
	1712	}
	1713	}
	1714	else if (possible_problems & UTF8_GOT_SURROGATE) {
	1715	possible_problems &= ~UTF8_GOT_SURROGATE;
	1716
	1717	if (flags & UTF8_WARN_SURROGATE) {
	1718	*errors \|= UTF8_GOT_SURROGATE;
	1719
	1720	if ( ! (flags & UTF8_CHECK_ONLY)
	1721	&& ckWARN_d(WARN_SURROGATE))
	1722	{
	1723	pack_warn = packWARN(WARN_SURROGATE);
	1724
	1725	/* These are the only errors that can occur with a
	1726	* surrogate when the 'uv' isn't valid */
	1727	if (orig_problems & UTF8_GOT_TOO_SHORT) {
	1728	message = Perl_form(aTHX_
	1729	"UTF-16 surrogate (any UTF-8 sequence that"
	1730	" starts with \"%s\" is for a surrogate)",
	1731	_byte_dump_string(s0, curlen, 0));
	1732	}
	1733	else {
	1734	message = Perl_form(aTHX_ surrogate_cp_format, uv);
	1735	}
	1736	}
	1737	}
	1738
	1739	if (flags & UTF8_DISALLOW_SURROGATE) {
	1740	disallowed = TRUE;
	1741	*errors \|= UTF8_GOT_SURROGATE;
	1742	}
	1743	}
	1744	else if (possible_problems & UTF8_GOT_SUPER) {
	1745	possible_problems &= ~UTF8_GOT_SUPER;
	1746
	1747	if (flags & UTF8_WARN_SUPER) {
	1748	*errors \|= UTF8_GOT_SUPER;
	1749
	1750	if ( ! (flags & UTF8_CHECK_ONLY)
	1751	&& ckWARN_d(WARN_NON_UNICODE))
	1752	{
	1753	pack_warn = packWARN(WARN_NON_UNICODE);
	1754
	1755	if (orig_problems & UTF8_GOT_TOO_SHORT) {
	1756	message = Perl_form(aTHX_
	1757	"Any UTF-8 sequence that starts with"
	1758	" \"%s\" is for a non-Unicode code point,"
	1759	" may not be portable",
	1760	_byte_dump_string(s0, curlen, 0));
	1761	}
	1762	else {
	1763	message = Perl_form(aTHX_ super_cp_format, uv);
	1764	}
	1765	}
	1766	}
	1767
	1768	/* Test for Perl's extended UTF-8 after the regular SUPER ones,
	1769	* and before possibly bailing out, so that the more dire
	1770	* warning will override the regular one. */
	1771	if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
	1772	if ( ! (flags & UTF8_CHECK_ONLY)
	1773	&& (flags & (UTF8_WARN_PERL_EXTENDED\|UTF8_WARN_SUPER))
	1774	&& ckWARN_d(WARN_NON_UNICODE))
	1775	{
	1776	pack_warn = packWARN(WARN_NON_UNICODE);
	1777
	1778	/* If it is an overlong that evaluates to a code point
	1779	* that doesn't have to use the Perl extended UTF-8, it
	1780	* still used it, and so we output a message that
	1781	* doesn't refer to the code point. The same is true
	1782	* if there was a SHORT malformation where the code
	1783	* point is not valid. In that case, 'uv' will have
	1784	* been set to the REPLACEMENT CHAR, and the message
	1785	* below without the code point in it will be selected
	1786	* */
	1787	if (UNICODE_IS_PERL_EXTENDED(uv)) {
	1788	message = Perl_form(aTHX_
	1789	perl_extended_cp_format, uv);
	1790	}
	1791	else {
	1792	message = Perl_form(aTHX_
	1793	"Any UTF-8 sequence that starts with"
	1794	" \"%s\" is a Perl extension, and"
	1795	" so is not portable",
	1796	_byte_dump_string(s0, curlen, 0));
	1797	}
	1798	}
	1799
	1800	if (flags & ( UTF8_WARN_PERL_EXTENDED
	1801	\|UTF8_DISALLOW_PERL_EXTENDED))
	1802	{
	1803	*errors \|= UTF8_GOT_PERL_EXTENDED;
	1804
	1805	if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
	1806	disallowed = TRUE;
	1807	}
	1808	}
	1809	}
	1810
	1811	if (flags & UTF8_DISALLOW_SUPER) {
	1812	*errors \|= UTF8_GOT_SUPER;
	1813	disallowed = TRUE;
	1814	}
	1815	}
	1816	else if (possible_problems & UTF8_GOT_NONCHAR) {
	1817	possible_problems &= ~UTF8_GOT_NONCHAR;
	1818
	1819	if (flags & UTF8_WARN_NONCHAR) {
	1820	*errors \|= UTF8_GOT_NONCHAR;
	1821
	1822	if ( ! (flags & UTF8_CHECK_ONLY)
	1823	&& ckWARN_d(WARN_NONCHAR))
	1824	{
	1825	/* The code above should have guaranteed that we don't
	1826	* get here with errors other than overlong */
	1827	assert (! (orig_problems
	1828	& ~(UTF8_GOT_LONG\|UTF8_GOT_NONCHAR)));
	1829
	1830	pack_warn = packWARN(WARN_NONCHAR);
	1831	message = Perl_form(aTHX_ nonchar_cp_format, uv);
	1832	}
	1833	}
	1834
	1835	if (flags & UTF8_DISALLOW_NONCHAR) {
	1836	disallowed = TRUE;
	1837	*errors \|= UTF8_GOT_NONCHAR;
	1838	}
	1839	}
	1840	else if (possible_problems & UTF8_GOT_LONG) {
	1841	possible_problems &= ~UTF8_GOT_LONG;
	1842	*errors \|= UTF8_GOT_LONG;
	1843
	1844	if (flags & UTF8_ALLOW_LONG) {
	1845
	1846	/* We don't allow the actual overlong value, unless the
	1847	* special extra bit is also set */
	1848	if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
	1849	& ~UTF8_ALLOW_LONG)))
	1850	{
	1851	uv = UNICODE_REPLACEMENT;
	1852	}
	1853	}
	1854	else {
	1855	disallowed = TRUE;
	1856
	1857	if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
	1858	pack_warn = packWARN(WARN_UTF8);
	1859
	1860	/* These error types cause 'uv' to be something that
	1861	* isn't what was intended, so can't use it in the
	1862	* message. The other error types either can't
	1863	* generate an overlong, or else the 'uv' is valid */
	1864	if (orig_problems &
	1865	(UTF8_GOT_TOO_SHORT\|UTF8_GOT_OVERFLOW))
	1866	{
	1867	message = Perl_form(aTHX_
	1868	"%s: %s (any UTF-8 sequence that starts"
	1869	" with \"%s\" is overlong which can and"
	1870	" should be represented with a"
	1871	" different, shorter sequence)",
	1872	malformed_text,
	1873	_byte_dump_string(s0, send - s0, 0),
	1874	_byte_dump_string(s0, curlen, 0));
	1875	}
	1876	else {
	1877	U8 tmpbuf[UTF8_MAXBYTES+1];
	1878	const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
	1879	uv, 0);
	1880	const char * preface = (uv <= PERL_UNICODE_MAX)
	1881	? "U+"
	1882	: "0x";
	1883	message = Perl_form(aTHX_
	1884	"%s: %s (overlong; instead use %s to represent"
	1885	" %s%0*" UVXf ")",
	1886	malformed_text,
	1887	_byte_dump_string(s0, send - s0, 0),
	1888	_byte_dump_string(tmpbuf, e - tmpbuf, 0),
	1889	preface,
	1890	((uv < 256) ? 2 : 4), /* Field width of 2 for
	1891	small code points */
	1892	uv);
	1893	}
	1894	}
	1895	}
	1896	} /* End of looking through the possible flags */
	1897
	1898	/* Display the message (if any) for the problem being handled in
	1899	* this iteration of the loop */
	1900	if (message) {
	1901	if (PL_op)
	1902	Perl_warner(aTHX_ pack_warn, "%s in %s", message,
	1903	OP_DESC(PL_op));
	1904	else
	1905	Perl_warner(aTHX_ pack_warn, "%s", message);
	1906	}
	1907	} /* End of 'while (possible_problems)' */
	1908
	1909	/* Since there was a possible problem, the returned length may need to
	1910	* be changed from the one stored at the beginning of this function.
	1911	* Instead of trying to figure out if that's needed, just do it. */
	1912	if (retlen) {
	1913	*retlen = curlen;
	1914	}
	1915
	1916	if (disallowed) {
	1917	if (flags & UTF8_CHECK_ONLY && retlen) {
	1918	*retlen = ((STRLEN) -1);
	1919	}
	1920	return 0;
	1921	}
	1922	}
	1923
	1924	return UNI_TO_NATIVE(uv);
	1925	}
	1926
	1927	/*
	1928	=for apidoc utf8_to_uvchr_buf
	1929
	1930	Returns the native code point of the first character in the string C<s> which
	1931	is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
	1932	C<*retlen> will be set to the length, in bytes, of that character.
	1933
	1934	If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
	1935	enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
	1936	C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
	1937	(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
	1938	C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
	1939	the next possible position in C<s> that could begin a non-malformed character.
	1940	See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
	1941	returned.
	1942
	1943	=cut
	1944
	1945	Also implemented as a macro in utf8.h
	1946
	1947	*/
	1948
	1949
	1950	UV
	1951	Perl_utf8_to_uvchr_buf(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
	1952	{
	1953	PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
	1954
	1955	assert(s < send);
	1956
	1957	return utf8n_to_uvchr(s, send - s, retlen,
	1958	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	1959	}
	1960
	1961	/* This is marked as deprecated
	1962	*
	1963	=for apidoc utf8_to_uvuni_buf
	1964
	1965	Only in very rare circumstances should code need to be dealing in Unicode
	1966	(as opposed to native) code points. In those few cases, use
	1967	C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
	1968
	1969	Returns the Unicode (not-native) code point of the first character in the
	1970	string C<s> which
	1971	is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
	1972	C<retlen> will be set to the length, in bytes, of that character.
	1973
	1974	If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
	1975	enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
	1976	NULL) to -1. If those warnings are off, the computed value if well-defined (or
	1977	the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
	1978	is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
	1979	next possible position in C<s> that could begin a non-malformed character.
	1980	See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
	1981
	1982	=cut
	1983	*/
	1984
	1985	UV
	1986	Perl_utf8_to_uvuni_buf(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
	1987	{
	1988	PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
	1989
	1990	assert(send > s);
	1991
	1992	/* Call the low level routine, asking for checks */
	1993	return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
	1994	}
	1995
	1996	/*
	1997	=for apidoc utf8_length
	1998
	1999	Return the length of the UTF-8 char encoded string C<s> in characters.
	2000	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	2001	up past C<e>, it raises a UTF8 warning and returns the number of valid characters.
	2002
	2003	=cut
	2004	*/
	2005
	2006	STRLEN
	2007	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	2008	{
	2009	STRLEN len = 0;
	2010
	2011	PERL_ARGS_ASSERT_UTF8_LENGTH;
	2012
	2013	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	2014	* the bitops (especially ~) can create illegal UTF-8.
	2015	* In other words: in Perl UTF-8 is not just for Unicode. */
	2016
	2017	if (e < s)
	2018	goto warn_and_return;
	2019	while (s < e) {
	2020	s += UTF8SKIP(s);
	2021	len++;
	2022	}
	2023
	2024	if (e != s) {
	2025	len--;
	2026	warn_and_return:
	2027	if (PL_op)
	2028	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	2029	"%s in %s", unees, OP_DESC(PL_op));
	2030	else
	2031	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	2032	}
	2033
	2034	return len;
	2035	}
	2036
	2037	/*
	2038	=for apidoc bytes_cmp_utf8
	2039
	2040	Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
	2041	sequence of characters (stored as UTF-8)
	2042	in C<u>, C<ulen>. Returns 0 if they are
	2043	equal, -1 or -2 if the first string is less than the second string, +1 or +2
	2044	if the first string is greater than the second string.
	2045
	2046	-1 or +1 is returned if the shorter string was identical to the start of the
	2047	longer string. -2 or +2 is returned if
	2048	there was a difference between characters
	2049	within the strings.
	2050
	2051	=cut
	2052	*/
	2053
	2054	int
	2055	Perl_bytes_cmp_utf8(pTHX_ const U8 b, STRLEN blen, const U8 u, STRLEN ulen)
	2056	{
	2057	const U8 *const bend = b + blen;
	2058	const U8 *const uend = u + ulen;
	2059
	2060	PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
	2061
	2062	while (b < bend && u < uend) {
	2063	U8 c = *u++;
	2064	if (!UTF8_IS_INVARIANT(c)) {
	2065	if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	2066	if (u < uend) {
	2067	U8 c1 = *u++;
	2068	if (UTF8_IS_CONTINUATION(c1)) {
	2069	c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
	2070	} else {
	2071	/* diag_listed_as: Malformed UTF-8 character%s */
	2072	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	2073	"%s %s%s",
	2074	unexpected_non_continuation_text(u - 2, 2, 1, 2),
	2075	PL_op ? " in " : "",
	2076	PL_op ? OP_DESC(PL_op) : "");
	2077	return -2;
	2078	}
	2079	} else {
	2080	if (PL_op)
	2081	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	2082	"%s in %s", unees, OP_DESC(PL_op));
	2083	else
	2084	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	2085	return -2; /* Really want to return undef :-) */
	2086	}
	2087	} else {
	2088	return -2;
	2089	}
	2090	}
	2091	if (*b != c) {
	2092	return *b < c ? -2 : +2;
	2093	}
	2094	++b;
	2095	}
	2096
	2097	if (b == bend && u == uend)
	2098	return 0;
	2099
	2100	return b < bend ? +1 : -1;
	2101	}
	2102
	2103	/*
	2104	=for apidoc utf8_to_bytes
	2105
	2106	Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
	2107	Unlike L</bytes_to_utf8>, this over-writes the original string, and
	2108	updates C<*lenp> to contain the new length.
	2109	Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
	2110
	2111	Upon successful return, the number of variants in the string can be computed by
	2112	having saved the value of C<*lenp> before the call, and subtracting the
	2113	after-call value of C<*lenp> from it.
	2114
	2115	If you need a copy of the string, see L</bytes_from_utf8>.
	2116
	2117	=cut
	2118	*/
	2119
	2120	U8 *
	2121	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN lenp)
	2122	{
	2123	U8 * first_variant;
	2124
	2125	PERL_ARGS_ASSERT_UTF8_TO_BYTES;
	2126	PERL_UNUSED_CONTEXT;
	2127
	2128	/* This is a no-op if no variants at all in the input */
	2129	if (is_utf8_invariant_string_loc(s, lenp, (const U8 *) &first_variant)) {
	2130	return s;
	2131	}
	2132
	2133	{
	2134	U8 * const save = s;
	2135	U8 * const send = s + *lenp;
	2136	U8 * d;
	2137
	2138	/* Nothing before the first variant needs to be changed, so start the real
	2139	* work there */
	2140	s = first_variant;
	2141	while (s < send) {
	2142	if (! UTF8_IS_INVARIANT(*s)) {
	2143	if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
	2144	*lenp = ((STRLEN) -1);
	2145	return 0;
	2146	}
	2147	s++;
	2148	}
	2149	s++;
	2150	}
	2151
	2152	/* Is downgradable, so do it */
	2153	d = s = first_variant;
	2154	while (s < send) {
	2155	U8 c = *s++;
	2156	if (! UVCHR_IS_INVARIANT(c)) {
	2157	/* Then it is two-byte encoded */
	2158	c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
	2159	s++;
	2160	}
	2161	*d++ = c;
	2162	}
	2163	*d = '\0';
	2164	*lenp = d - save;
	2165
	2166	return save;
	2167	}
	2168	}
	2169
	2170	/*
	2171	=for apidoc bytes_from_utf8
	2172
	2173	Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
	2174	byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
	2175	actually encoded in UTF-8.
	2176
	2177	Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
	2178	the input string.
	2179
	2180	Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
	2181	not expressible in native byte encoding. In these cases, C<*is_utf8p> and
	2182	C<*lenp> are unchanged, and the return value is the original C<s>.
	2183
	2184	Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
	2185	newly created string containing a downgraded copy of C<s>, and whose length is
	2186	returned in C<*lenp>, updated. The new string is C<NUL>-terminated.
	2187
	2188	Upon successful return, the number of variants in the string can be computed by
	2189	having saved the value of C<*lenp> before the call, and subtracting the
	2190	after-call value of C<*lenp> from it.
	2191
	2192	=cut
	2193
	2194	There is a macro that avoids this function call, but this is retained for
	2195	anyone who calls it with the Perl_ prefix */
	2196
	2197	U8 *
	2198	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN lenp, bool *is_utf8p)
	2199	{
	2200	PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
	2201	PERL_UNUSED_CONTEXT;
	2202
	2203	return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
	2204	}
	2205
	2206	/*
	2207	No = here because currently externally undocumented
	2208	for apidoc bytes_from_utf8_loc
	2209
	2210	Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where
	2211	to store the location of the first character in C<"s"> that cannot be
	2212	converted to non-UTF8.
	2213
	2214	If that parameter is C<NULL>, this function behaves identically to
	2215	C<bytes_from_utf8>.
	2216
	2217	Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
	2218	C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
	2219
	2220	Otherwise, the function returns a newly created C<NUL>-terminated string
	2221	containing the non-UTF8 equivalent of the convertible first portion of
	2222	C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>.
	2223	If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
	2224	and C<*first_non_downgradable> is set to C<NULL>.
	2225
	2226	Otherwise, C<*first_non_downgradable> set to point to the first byte of the
	2227	first character in the original string that wasn't converted. C<*is_utf8p> is
	2228	unchanged. Note that the new string may have length 0.
	2229
	2230	Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
	2231	C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
	2232	converts as many characters in it as possible stopping at the first one it
	2233	finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is
	2234	set to point to that. The function returns the portion that could be converted
	2235	in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
	2236	not including the terminating C<NUL>. If the very first character in the
	2237	original could not be converted, C<*lenp> will be 0, and the new string will
	2238	contain just a single C<NUL>. If the entire input string was converted,
	2239	C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
	2240
	2241	Upon successful return, the number of variants in the converted portion of the
	2242	string can be computed by having saved the value of C<*lenp> before the call,
	2243	and subtracting the after-call value of C<*lenp> from it.
	2244
	2245	=cut
	2246
	2247
	2248	*/
	2249
	2250	U8 *
	2251	Perl_bytes_from_utf8_loc(const U8 s, STRLEN lenp, bool is_utf8p, const U8* first_unconverted)
	2252	{
	2253	U8 *d;
	2254	const U8 *original = s;
	2255	U8 *converted_start;
	2256	const U8 send = s + lenp;
	2257
	2258	PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
	2259
	2260	if (! *is_utf8p) {
	2261	if (first_unconverted) {
	2262	*first_unconverted = NULL;
	2263	}
	2264
	2265	return (U8 *) original;
	2266	}
	2267
	2268	Newx(d, (*lenp) + 1, U8);
	2269
	2270	converted_start = d;
	2271	while (s < send) {
	2272	U8 c = *s++;
	2273	if (! UTF8_IS_INVARIANT(c)) {
	2274
	2275	/* Then it is multi-byte encoded. If the code point is above 0xFF,
	2276	* have to stop now */
	2277	if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
	2278	if (first_unconverted) {
	2279	*first_unconverted = s - 1;
	2280	goto finish_and_return;
	2281	}
	2282	else {
	2283	Safefree(converted_start);
	2284	return (U8 *) original;
	2285	}
	2286	}
	2287
	2288	c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
	2289	s++;
	2290	}
	2291	*d++ = c;
	2292	}
	2293
	2294	/* Here, converted the whole of the input */
	2295	*is_utf8p = FALSE;
	2296	if (first_unconverted) {
	2297	*first_unconverted = NULL;
	2298	}
	2299
	2300	finish_and_return:
	2301	*d = '\0';
	2302	*lenp = d - converted_start;
	2303
	2304	/* Trim unused space */
	2305	Renew(converted_start, *lenp + 1, U8);
	2306
	2307	return converted_start;
	2308	}
	2309
	2310	/*
	2311	=for apidoc bytes_to_utf8
	2312
	2313	Converts a string C<s> of length C<*lenp> bytes from the native encoding into
	2314	UTF-8.
	2315	Returns a pointer to the newly-created string, and sets C<*lenp> to
	2316	reflect the new length in bytes.
	2317
	2318	Upon successful return, the number of variants in the string can be computed by
	2319	having saved the value of C<*lenp> before the call, and subtracting it from the
	2320	after-call value of C<*lenp>.
	2321
	2322	A C<NUL> character will be written after the end of the string.
	2323
	2324	If you want to convert to UTF-8 from encodings other than
	2325	the native (Latin1 or EBCDIC),
	2326	see L</sv_recode_to_utf8>().
	2327
	2328	=cut
	2329	*/
	2330
	2331	U8*
	2332	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN lenp)
	2333	{
	2334	const U8 * const send = s + (*lenp);
	2335	U8 *d;
	2336	U8 *dst;
	2337
	2338	PERL_ARGS_ASSERT_BYTES_TO_UTF8;
	2339	PERL_UNUSED_CONTEXT;
	2340
	2341	Newx(d, (lenp) 2 + 1, U8);
	2342	dst = d;
	2343
	2344	while (s < send) {
	2345	append_utf8_from_native_byte(*s, &d);
	2346	s++;
	2347	}
	2348	*d = '\0';
	2349	*lenp = d-dst;
	2350	return dst;
	2351	}
	2352
	2353	/*
	2354	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	2355	*
	2356	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	2357	* We optimize for native, for obvious reasons. */
	2358
	2359	U8*
	2360	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	2361	{
	2362	U8* pend;
	2363	U8* dstart = d;
	2364
	2365	PERL_ARGS_ASSERT_UTF16_TO_UTF8;
	2366
	2367	if (bytelen & 1)
	2368	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf,
	2369	(UV)bytelen);
	2370
	2371	pend = p + bytelen;
	2372
	2373	while (p < pend) {
	2374	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	2375	p += 2;
	2376	if (OFFUNI_IS_INVARIANT(uv)) {
	2377	*d++ = LATIN1_TO_NATIVE((U8) uv);
	2378	continue;
	2379	}
	2380	if (uv <= MAX_UTF8_TWO_BYTE) {
	2381	*d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
	2382	*d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
	2383	continue;
	2384	}
	2385	#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
	2386	#define LAST_HIGH_SURROGATE 0xDBFF
	2387	#define FIRST_LOW_SURROGATE 0xDC00
	2388	#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
	2389
	2390	/* This assumes that most uses will be in the first Unicode plane, not
	2391	* needing surrogates */
	2392	if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
	2393	&& uv <= UNICODE_SURROGATE_LAST))
	2394	{
	2395	if (UNLIKELY(p >= pend) \|\| UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
	2396	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	2397	}
	2398	else {
	2399	UV low = (p[0] << 8) + p[1];
	2400	if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
	2401	\|\| UNLIKELY(low > LAST_LOW_SURROGATE))
	2402	{
	2403	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	2404	}
	2405	p += 2;
	2406	uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
	2407	+ (low - FIRST_LOW_SURROGATE) + 0x10000;
	2408	}
	2409	}
	2410	#ifdef EBCDIC
	2411	d = uvoffuni_to_utf8_flags(d, uv, 0);
	2412	#else
	2413	if (uv < 0x10000) {
	2414	*d++ = (U8)(( uv >> 12) \| 0xe0);
	2415	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	2416	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	2417	continue;
	2418	}
	2419	else {
	2420	*d++ = (U8)(( uv >> 18) \| 0xf0);
	2421	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	2422	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	2423	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	2424	continue;
	2425	}
	2426	#endif
	2427	}
	2428	*newlen = d - dstart;
	2429	return d;
	2430	}
	2431
	2432	/* Note: this one is slightly destructive of the source. */
	2433
		/* Convert byte-swapped UTF-16 at 'p' ('bytelen' bytes) to UTF-8 at 'd'.
		 * Croaks if 'bytelen' is odd.  Otherwise swaps the two bytes of every
		 * 16-bit unit of the input in place (which is why the source is modified),
		 * then hands the same arguments to utf16_to_utf8() and returns its result. */
	2434	U8*
	2435	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	2436	{
	2437	U8* s = (U8*)p;
	2438	U8* const send = s + bytelen;
	2439
	2440	PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
	2441
	2442	if (bytelen & 1)
	2443	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
	2444	(UV)bytelen);
	2445
		/* Swap each byte pair in the caller's buffer */
	2446	while (s < send) {
	2447	const U8 tmp = s[0];
	2448	s[0] = s[1];
	2449	s[1] = tmp;
	2450	s += 2;
	2451	}
	2452	return utf16_to_utf8(p, d, bytelen, newlen);
	2453	}
	2454
		/* Test code point 'c' against the class 'classnum': 'c' is encoded into a
		 * local buffer with uvchr_to_utf8(), and that buffer is passed to
		 * _is_utf8_FOO_with_len().  The end pointer given is the end of the whole
		 * buffer, not the end of the encoded character. */
	2455	bool
	2456	Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
	2457	{
	2458	U8 tmpbuf[UTF8_MAXBYTES+1];
	2459	uvchr_to_utf8(tmpbuf, c);
	2460	return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf));
	2461	}
	2462
	2463	/* Internal function so we can deprecate the external one, and call
	2464	this one from other deprecated functions in this file */
	2465
		/* Returns TRUE if the first byte at 'p' is an underscore; otherwise returns
		 * whatever is_utf8_common() reports for 'p' using PL_utf8_idstart and the
		 * name "IdStart". */
	2466	bool
	2467	Perl__is_utf8_idstart(pTHX_ const U8 *p)
	2468	{
	2469	PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
	2470
	2471	if (*p == '_')
	2472	return TRUE;
	2473	return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
	2474	}
	2475
		/* Code-point form of the perl_idcont test: encodes 'c' into a local buffer
		 * with uvchr_to_utf8() and returns the result of
		 * _is_utf8_perl_idcont_with_len() on it.  The end pointer given is the end
		 * of the whole buffer. */
	2476	bool
	2477	Perl__is_uni_perl_idcont(pTHX_ UV c)
	2478	{
	2479	U8 tmpbuf[UTF8_MAXBYTES+1];
	2480	uvchr_to_utf8(tmpbuf, c);
	2481	return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
	2482	}
	2483
		/* Code-point form of the perl_idstart test: encodes 'c' into a local buffer
		 * with uvchr_to_utf8() and returns the result of
		 * _is_utf8_perl_idstart_with_len() on it.  The end pointer given is the end
		 * of the whole buffer. */
	2484	bool
	2485	Perl__is_uni_perl_idstart(pTHX_ UV c)
	2486	{
	2487	U8 tmpbuf[UTF8_MAXBYTES+1];
	2488	uvchr_to_utf8(tmpbuf, c);
	2489	return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
	2490	}
	2491
	2492	UV
	2493	Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
	2494	const char S_or_s)
	2495	{
	2496	/* We have the latin1-range values compiled into the core, so just use
	2497	* those, converting the result to UTF-8. The only difference between upper
	2498	* and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
	2499	* either "SS" or "Ss". Which one to use is passed into the routine in
	2500	* 'S_or_s' to avoid a test */
	2501
	2502	UV converted = toUPPER_LATIN1_MOD(c);
	2503
	2504	PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
	2505
	2506	assert(S_or_s == 'S' \|\| S_or_s == 's');
	2507
	2508	if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
	2509	characters in this range */
	2510	*p = (U8) converted;
	2511	*lenp = 1;
	2512	return converted;
	2513	}
	2514
	2515	/* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
	2516	* which it maps to one of them, so as to only have to have one check for
	2517	* it in the main case */
	2518	if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
	2519	switch (c) {
	2520	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	2521	converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
	2522	break;
	2523	case MICRO_SIGN:
	2524	converted = GREEK_CAPITAL_LETTER_MU;
	2525	break;
	2526	#if UNICODE_MAJOR_VERSION > 2 \
	2527	\|\| (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
	2528	&& UNICODE_DOT_DOT_VERSION >= 8)
	2529	case LATIN_SMALL_LETTER_SHARP_S:
	2530	*(p)++ = 'S';
	2531	*p = S_or_s;
	2532	*lenp = 2;
	2533	return 'S';
	2534	#endif
	2535	default:
	2536	Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
	2537	" '%c' to map to '%c'",
	2538	c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
	2539	NOT_REACHED; /* NOTREACHED */
	2540	}
	2541	}
	2542
	2543	*(p)++ = UTF8_TWO_BYTE_HI(converted);
	2544	*p = UTF8_TWO_BYTE_LO(converted);
	2545	*lenp = 2;
	2546
	2547	return converted;
	2548	}
	2549
	2550	/* Call the function to convert a UTF-8 encoded character to the specified case.
	2551	* Note that there may be more than one character in the result.
	2552	* INP is a pointer to the first byte of the input character
	2553	* OUTP will be set to the first byte of the string of changed characters. It
	2554	* needs to have space for UTF8_MAXBYTES_CASE+1 bytes
	2555	* LENP will be set to the length in bytes of the string of changed characters
	2556	*
	2557	* The functions return the ordinal of the first character in the string of
	2558	* OUTP */
		/* Upper case: calls _to_utf8_case() with &PL_utf8_toupper and "ToUc"; the
		 * final argument is an empty string */
	2559	#define CALL_UPPER_CASE(uv, s, d, lenp) \
	2560	_to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
		/* Title case: same call shape, with &PL_utf8_totitle and "ToTc" */
	2561	#define CALL_TITLE_CASE(uv, s, d, lenp) \
	2562	_to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
		/* Lower case: same call shape, with &PL_utf8_tolower and "ToLc" */
	2563	#define CALL_LOWER_CASE(uv, s, d, lenp) \
	2564	_to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
	2565
	2566	/* This additionally has the input parameter 'specials', which if non-zero will
	2567	* cause this to use the specials hash for folding (meaning get full case
	2568	* folding); otherwise, when zero, this implies a simple case fold */
		/* The final argument is "" when 'specials' is non-zero and NULL when it is
		 * zero */
	2569	#define CALL_FOLD_CASE(uv, s, d, lenp, specials) \
	2570	_to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
	2571
	2572	UV
	2573	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	2574	{
	2575	/* Convert the Unicode character whose ordinal is <c> to its uppercase
	2576	* version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
	2577	* Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	2578	* the changed version may be longer than the original character.
	2579	*
	2580	* The ordinal of the first character of the changed version is returned
	2581	* (but note, as explained above, that there may be more.) */
	2582
	2583	PERL_ARGS_ASSERT_TO_UNI_UPPER;
	2584
		/* Code points below 256 go to the Latin-1 helper; 'S' is the character it
		 * will use as the second output character for sharp s */
	2585	if (c < 256) {
	2586	return _to_upper_title_latin1((U8) c, p, lenp, 'S');
	2587	}
	2588
		/* Otherwise encode <c> into <p>, then use <p> as both the input and the
		 * output buffer of the case-mapping call */
	2589	uvchr_to_utf8(p, c);
	2590	return CALL_UPPER_CASE(c, p, p, lenp);
	2591	}
	2592
	2593	UV
	2594	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	2595	{
		/* Titlecase counterpart of to_uni_upper(): converts the character whose
		* ordinal is <c>, stores the result as UTF-8 in <p> and its length in
		* bytes in <lenp>, and returns the ordinal of the first character of the
		* result. */
	2596	PERL_ARGS_ASSERT_TO_UNI_TITLE;
	2597
		/* Code points below 256 go to the Latin-1 helper; its last argument is
		* the second character of the result for U+DF, which is 's' for
		* titlecase */
	2598	if (c < 256) {
	2599	return _to_upper_title_latin1((U8) c, p, lenp, 's');
	2600	}
	2601
		/* <p> holds the UTF-8 of the input and is also the output buffer */
	2602	uvchr_to_utf8(p, c);
	2603	return CALL_TITLE_CASE(c, p, p, lenp);
	2604	}
	2605
	2606	STATIC U8
	2607	S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
	2608	{
	2609	/* We have the latin1-range values compiled into the core, so just use
	2610	* those, converting the result to UTF-8. Since the result is always just
	2611	* one character, we allow <p> to be NULL
		*
		* Returns the lowercase of <c>. When <p> is NULL nothing is stored and
		* <lenp> is not touched; otherwise the UTF-8 of the result goes to <p>
		* and its length (1 or 2) to <*lenp>. <dummy> is ignored. */
	2612
	2613	U8 converted = toLOWER_LATIN1(c);
	2614
	2615	PERL_UNUSED_ARG(dummy);
	2616
	2617	if (p != NULL) {
	2618	if (NATIVE_BYTE_IS_INVARIANT(converted)) {
	2619	*p = converted;
	2620	*lenp = 1;
	2621	}
	2622	else {
	2623	/* Result is known to always be < 256, so can use the EIGHT_BIT
	2624	* macros */
	2625	*p = UTF8_EIGHT_BIT_HI(converted);
	2626	*(p+1) = UTF8_EIGHT_BIT_LO(converted);
	2627	*lenp = 2;
	2628	}
	2629	}
	2630	return converted;
	2631	}
	2632
	2633	UV
	2634	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	2635	{
		/* Lowercase counterpart of to_uni_upper(): converts the character whose
		* ordinal is <c>, stores the result as UTF-8 in <p> and its length in
		* bytes in <lenp>, and returns the ordinal of the first character of the
		* result. */
	2636	PERL_ARGS_ASSERT_TO_UNI_LOWER;
	2637
		/* Code points below 256 are handled from the compiled-in tables */
	2638	if (c < 256) {
	2639	return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
	2640	}
	2641
		/* <p> holds the UTF-8 of the input and is also the output buffer */
	2642	uvchr_to_utf8(p, c);
	2643	return CALL_LOWER_CASE(c, p, p, lenp);
	2644	}
	2645
	2646	UV
	2647	Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
	2648	const unsigned int flags)
	2649	{
	2650	/* Corresponds to to_lower_latin1(); <flags> bits meanings:
	2651	* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
	2652	* FOLD_FLAGS_FULL iff full folding is to be used;
	2653	*
	2654	* Not to be used for locale folds
		*
		* The fold of <c> is stored as UTF-8 in <p>, its length in bytes in
		* <*lenp>, and the first code point of the fold is returned. Unlike
		* to_lower_latin1(), <p> and <lenp> are written unconditionally, so
		* neither may be NULL.
	2655	*/
	2656
	2657	UV converted;
	2658
	2659	PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
	2660	PERL_UNUSED_CONTEXT;
	2661
	2662	assert (! (flags & FOLD_FLAGS_LOCALE));
	2663
		/* The MICRO SIGN is the one character in this range whose fold lies
		* above Latin-1 */
	2664	if (UNLIKELY(c == MICRO_SIGN)) {
	2665	converted = GREEK_SMALL_LETTER_MU;
	2666	}
	2667	#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
	2668	|| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
	2669	|| UNICODE_DOT_DOT_VERSION > 0)
	2670	else if ( (flags & FOLD_FLAGS_FULL)
	2671	&& UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
	2672	{
	2673	/* If can't cross 127/128 boundary, can't return "ss"; instead return
	2674	* two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
	2675	* under those circumstances. */
	2676	if (flags & FOLD_FLAGS_NOMIX_ASCII) {
		/* Length of two copies of the UTF-8 for U+017F; each sizeof()
		* counts a trailing NUL, hence the "- 2" */
	2677	*lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
	2678	Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
	2679	p, *lenp, U8);
	2680	return LATIN_SMALL_LETTER_LONG_S;
	2681	}
	2682	else {
	2683	*(p)++ = 's';
	2684	*p = 's';
	2685	*lenp = 2;
	2686	return 's';
	2687	}
	2688	}
	2689	#endif
	2690	else { /* In this range the fold of all other characters is their lower
	2691	case */
	2692	converted = toLOWER_LATIN1(c);
	2693	}
	2694
		/* Store the single-character result as one or two bytes of UTF-8 */
	2695	if (UVCHR_IS_INVARIANT(converted)) {
	2696	*p = (U8) converted;
	2697	*lenp = 1;
	2698	}
	2699	else {
	2700	*(p)++ = UTF8_TWO_BYTE_HI(converted);
	2701	*p = UTF8_TWO_BYTE_LO(converted);
	2702	*lenp = 2;
	2703	}
	2704
	2705	return converted;
	2706	}
	2707
	2708	UV
	2709	Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
	2710	{
	2711
	2712	/* Not currently externally documented, and subject to change
	2713	* <flags> bits meanings:
	2714	* FOLD_FLAGS_FULL iff full folding is to be used;
	2715	* FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
	2716	* locale are to be used.
	2717	* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
		*
		* Folds the code point <c>, storing the result as UTF-8 in <p> and its
		* length in bytes in <*lenp>; returns the first code point of the fold.
	2718	*/
	2719
	2720	PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
	2721
	2722	if (flags & FOLD_FLAGS_LOCALE) {
	2723	/* Treat a UTF-8 locale as not being in locale at all */
	2724	if (IN_UTF8_CTYPE_LOCALE) {
	2725	flags &= ~FOLD_FLAGS_LOCALE;
	2726	}
	2727	else {
	2728	_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
		/* Locale folds of every code point, including those below 256,
		* are left to _toFOLD_utf8_flags() */
	2729	goto needs_full_generality;
	2730	}
	2731	}
	2732
	2733	if (c < 256) {
	2734	return _to_fold_latin1((U8) c, p, lenp,
	2735	flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
	2736	}
	2737
	2738	/* Here, above 255. If no special needs, just use the macro */
	2739	if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
	2740	uvchr_to_utf8(p, c);
	2741	return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
	2742	}
	2743	else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
	2744	the special flags. */
	2745	U8 utf8_c[UTF8_MAXBYTES + 1];
	2746
	2747	needs_full_generality:
	2748	uvchr_to_utf8(utf8_c, c);
	2749	return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c),
	2750	p, lenp, flags);
	2751	}
	2752	}
	2753
	2754	PERL_STATIC_INLINE bool
	2755	S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
	2756	const char *const swashname, SV* const invlist)
	2757	{
	2758	/* returns a boolean giving whether or not the UTF8-encoded character that
	2759	* starts at <p> is in the swash indicated by <swashname>. <swash>
	2760	* contains a pointer to where the swash indicated by <swashname>
	2761	* is to be stored; which this routine will do, so that future calls will
	2762	* look at <*swash> and only generate a swash if it is not null. <invlist>
	2763	* is NULL or an inversion list that defines the swash. If not null, it
	2764	* saves time during initialization of the swash.
	2765	*
	2766	* Note that it is assumed that the buffer length of <p> is enough to
	2767	* contain all the bytes that comprise the character. Thus, <*p> should
	2768	* have been checked before this call for mal-formedness enough to assure
	2769	* that. */
	2770
	2771	PERL_ARGS_ASSERT_IS_UTF8_COMMON;
	2772
	2773	/* The API should have included a length for the UTF-8 character in <p>,
	2774	* but it doesn't. We therefore assume that p has been validated at least
	2775	* as far as there being enough bytes available in it to accommodate the
	2776	* character without reading beyond the end, and pass that number on to the
	2777	* validating routine */
	2778	if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
	2779	_force_out_malformed_utf8_message(p, p + UTF8SKIP(p),
	2780	_UTF8_NO_CONFIDENCE_IN_CURLEN,
	2781	1 /* Die */ );
	2782	NOT_REACHED; /* NOTREACHED */
	2783	}
	2784
		/* Create the swash the first time through and cache it in <*swash> */
	2785	if (!*swash) {
	2786	U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
	2787	*swash = _core_swash_init("utf8",
	2788
	2789	/* Only use the name if there is no inversion
	2790	* list; otherwise will go out to disk */
	2791	(invlist) ? "" : swashname,
	2792
	2793	&PL_sv_undef, 1, 0, invlist, &flags);
	2794	}
	2795
	2796	return swash_fetch(*swash, p, TRUE) != 0;
	2797	}
	2798
	2799	PERL_STATIC_INLINE bool
	2800	S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 *const e,
	2801	SV **swash, const char *const swashname,
	2802	SV* const invlist)
	2803	{
	2804	/* returns a boolean giving whether or not the UTF8-encoded character that
	2805	* starts at <p>, and extending no further than <e - 1> is in the swash
	2806	* indicated by <swashname>. <swash> contains a pointer to where the swash
	2807	* indicated by <swashname> is to be stored; which this routine will do, so
	2808	* that future calls will look at <*swash> and only generate a swash if it
	2809	* is not null. <invlist> is NULL or an inversion list that defines the
	2810	* swash. If not null, it saves time during initialization of the swash.
	2811	*/
	2812
	2813	PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
	2814
		/* Unlike is_utf8_common(), the true end of the buffer is known here, so
		* validation is bounded by <e> */
	2815	if (! isUTF8_CHAR(p, e)) {
	2816	_force_out_malformed_utf8_message(p, e, 0, 1);
	2817	NOT_REACHED; /* NOTREACHED */
	2818	}
	2819
		/* Create the swash the first time through and cache it in <*swash> */
	2820	if (!*swash) {
	2821	U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
	2822	*swash = _core_swash_init("utf8",
	2823
	2824	/* Only use the name if there is no inversion
	2825	* list; otherwise will go out to disk */
	2826	(invlist) ? "" : swashname,
	2827
	2828	&PL_sv_undef, 1, 0, invlist, &flags);
	2829	}
	2830
	2831	return swash_fetch(*swash, p, TRUE) != 0;
	2832	}
	2833
	2834	STATIC void
	2835	S_warn_on_first_deprecated_use(pTHX_ const char * const name,
	2836	const char * const alternative,
	2837	const bool use_locale,
	2838	const char * const file,
	2839	const unsigned line)
	2840	{
		/* Raises a deprecation warning that <name>() should be replaced by
		* <alternative>(), but only the first time it is called for a given
		* combination of <name>, <use_locale>, <file> and <line>. The
		* combinations already seen are recorded as keys in
		* PL_seen_deprecated_macro. Nothing is done or recorded unless
		* 'deprecated' warnings are enabled. */
	2841	const char * key;
	2842
	2843	PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
	2844
	2845	if (ckWARN_d(WARN_DEPRECATED)) {
	2846
	2847	key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
		/* NOTE(review): this hv_fetch() is reached while
		* PL_seen_deprecated_macro may still be NULL; presumably it
		* tolerates a NULL hash -- confirm */
	2848	if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
		/* The hash is created lazily on the first warning */
	2849	if (! PL_seen_deprecated_macro) {
	2850	PL_seen_deprecated_macro = newHV();
	2851	}
	2852	if (! hv_store(PL_seen_deprecated_macro, key,
	2853	strlen(key), &PL_sv_undef, 0))
	2854	{
	2855	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	2856	}
	2857
		/* The wording depends on whether the call came through
		* mathoms.c or from elsewhere */
	2858	if (instr(file, "mathoms.c")) {
	2859	Perl_warner(aTHX_ WARN_DEPRECATED,
	2860	"In %s, line %d, starting in Perl v5.30, %s()"
	2861	" will be removed. Avoid this message by"
	2862	" converting to use %s().\n",
	2863	file, line, name, alternative);
	2864	}
	2865	else {
	2866	Perl_warner(aTHX_ WARN_DEPRECATED,
	2867	"In %s, line %d, starting in Perl v5.30, %s() will"
	2868	" require an additional parameter. Avoid this"
	2869	" message by converting to use %s().\n",
	2870	file, line, name, alternative);
	2871	}
	2872	}
	2873	}
	2874	}
	2875
	2876	bool
	2877	Perl__is_utf8_FOO(pTHX_ U8 classnum,
	2878	const U8 * const p,
	2879	const char * const name,
	2880	const char * const alternative,
	2881	const bool use_utf8,
	2882	const bool use_locale,
	2883	const char * const file,
	2884	const unsigned line)
	2885	{
		/* Returns whether the character starting at <p> is in the character
		* class <classnum>. <p> is treated as UTF-8 iff <use_utf8> is true, and
		* for code points below 256 locale rules are used iff <use_locale> is
		* true. <name>, <alternative>, <file> and <line> are used only for the
		* deprecation warning, which is raised on the first call from each
		* distinct place. */
	2886	PERL_ARGS_ASSERT__IS_UTF8_FOO;
	2887
	2888	warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
	2889
		/* Above Latin-1: use the swash or the hard-coded matcher for the class.
		* A class not listed in the switch falls out of it to the below-256
		* code further down. */
	2890	if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
	2891
	2892	switch (classnum) {
	2893	case _CC_WORDCHAR:
	2894	case _CC_DIGIT:
	2895	case _CC_ALPHA:
	2896	case _CC_LOWER:
	2897	case _CC_UPPER:
	2898	case _CC_PUNCT:
	2899	case _CC_PRINT:
	2900	case _CC_ALPHANUMERIC:
	2901	case _CC_GRAPH:
	2902	case _CC_CASED:
	2903
	2904	return is_utf8_common(p,
	2905	&PL_utf8_swash_ptrs[classnum],
	2906	swash_property_names[classnum],
	2907	PL_XPosix_ptrs[classnum]);
	2908
	2909	case _CC_SPACE:
	2910	return is_XPERLSPACE_high(p);
	2911	case _CC_BLANK:
	2912	return is_HORIZWS_high(p);
	2913	case _CC_XDIGIT:
	2914	return is_XDIGIT_high(p);
	2915	case _CC_CNTRL:
	2916	return 0;
	2917	case _CC_ASCII:
	2918	return 0;
	2919	case _CC_VERTSPACE:
	2920	return is_VERTWS_high(p);
	2921	case _CC_IDFIRST:
		/* NOTE(review): _is_utf8_perl_idstart_with_len() hands the new
		* inversion list to the common routine as its <invlist>
		* argument, whereas here it is stored directly in the swash
		* cache slot -- confirm this difference is intended */
	2922	if (! PL_utf8_perl_idstart) {
	2923	PL_utf8_perl_idstart
	2924	= _new_invlist_C_array(_Perl_IDStart_invlist);
	2925	}
	2926	return is_utf8_common(p, &PL_utf8_perl_idstart,
	2927	"_Perl_IDStart", NULL);
	2928	case _CC_IDCONT:
	2929	if (! PL_utf8_perl_idcont) {
	2930	PL_utf8_perl_idcont
	2931	= _new_invlist_C_array(_Perl_IDCont_invlist);
	2932	}
	2933	return is_utf8_common(p, &PL_utf8_perl_idcont,
	2934	"_Perl_IDCont", NULL);
	2935	}
	2936	}
	2937
	2938	/* idcont is the same as wordchar below 256 */
	2939	if (classnum == _CC_IDCONT) {
	2940	classnum = _CC_WORDCHAR;
	2941	}
	2942	else if (classnum == _CC_IDFIRST) {
	2943	if (*p == '_') {
	2944	return TRUE;
	2945	}
	2946	classnum = _CC_ALPHA;
	2947	}
	2948
		/* Here the code point is below 256. It is the byte <*p> itself unless
		* the input is UTF-8 and the character occupies two bytes, in which case
		* it is assembled from the values of those two bytes. */
	2949	if (! use_locale) {
	2950	if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
	2951	return _generic_isCC(*p, classnum);
	2952	}
	2953
	2954	return _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )), classnum);
	2955	}
	2956	else {
	2957	if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
	2958	return isFOO_lc(classnum, *p);
	2959	}
	2960
	2961	return isFOO_lc(classnum, EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )));
	2962	}
	2963
	2964	NOT_REACHED; /* NOTREACHED */
	2965	}
	2966
	2967	bool
	2968	Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
	2969	const U8 * const e)
	2970	{
		/* Returns whether the UTF-8 encoded character starting at <p>, and
		* extending no further than <e - 1>, is in the character class
		* <classnum>. Only classes numbered below _FIRST_NON_SWASH_CC are
		* accepted, as the assert below checks. */
	2971	PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
	2972
	2973	assert(classnum < _FIRST_NON_SWASH_CC);
	2974
	2975	return is_utf8_common_with_len(p,
	2976	e,
	2977	&PL_utf8_swash_ptrs[classnum],
	2978	swash_property_names[classnum],
	2979	PL_XPosix_ptrs[classnum]);
	2980	}
	2981
	2982	bool
	2983	Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
	2984	{
		/* Returns whether the UTF-8 encoded character starting at <p>, and
		* extending no further than <e - 1>, matches the "_Perl_IDStart"
		* property. */
	2985	SV* invlist = NULL;
	2986
	2987	PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
	2988
		/* The inversion list is needed only to build the cached swash, so it is
		* created only while that cache is still empty */
	2989	if (! PL_utf8_perl_idstart) {
	2990	invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
	2991	}
	2992	return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart,
	2993	"_Perl_IDStart", invlist);
	2994	}
	2995
	2996	bool
	2997	Perl__is_utf8_xidstart(pTHX_ const U8 *p)
	2998	{
		/* Returns whether the UTF-8 encoded character starting at <p> is an
		* underscore or matches the "XIdStart" property. No end-of-buffer
		* pointer is taken, so <p> must already be known to hold a complete
		* character (see is_utf8_common()). */
	2999	PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
	3000
	3001	if (*p == '_')
	3002	return TRUE;
	3003	return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
	3004	}
	3005
	3006	bool
	3007	Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
	3008	{
		/* Returns whether the UTF-8 encoded character starting at <p>, and
		* extending no further than <e - 1>, matches the "_Perl_IDCont"
		* property. */
	3009	SV* invlist = NULL;
	3010
	3011	PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
	3012
		/* The inversion list is needed only to build the cached swash, so it is
		* created only while that cache is still empty */
	3013	if (! PL_utf8_perl_idcont) {
	3014	invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
	3015	}
	3016	return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont,
	3017	"_Perl_IDCont", invlist);
	3018	}
	3019
	3020	bool
	3021	Perl__is_utf8_idcont(pTHX_ const U8 *p)
	3022	{
		/* Returns whether the UTF-8 encoded character starting at <p> matches
		* the "IdContinue" property; the swash is cached in PL_utf8_idcont. <p>
		* must already be known to hold a complete character (see
		* is_utf8_common()). */
	3023	PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
	3024
	3025	return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
	3026	}
	3027
	3028	bool
	3029	Perl__is_utf8_xidcont(pTHX_ const U8 *p)
	3030	{
		/* Returns whether the UTF-8 encoded character starting at <p> matches
		* the "XIdContinue" property. <p> must already be known to hold a
		* complete character (see is_utf8_common()).
		*
		* The swash is cached in its own slot, PL_utf8_xidcont. Sharing
		* PL_utf8_idcont with _is_utf8_idcont() would make whichever of the two
		* functions runs first decide which property both of them test. */
	3031	PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
	3032
	3033	return is_utf8_common(p, &PL_utf8_xidcont, "XIdContinue", NULL);
	3034	}
	3035
	3036	bool
	3037	Perl__is_utf8_mark(pTHX_ const U8 *p)
	3038	{
		/* Returns whether the UTF-8 encoded character starting at <p> is in the
		* swash named "IsM", which is cached in PL_utf8_mark. <p> must already
		* be known to hold a complete character (see is_utf8_common()). */
	3039	PERL_ARGS_ASSERT__IS_UTF8_MARK;
	3040
	3041	return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
	3042	}
	3043
	3044	/* TODO: change name uv1 to 'from' */
	3045	STATIC UV
	3046	S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
	3047	SV **swashp, const char *normal, const char *special)
	3048	{
		/* Maps the code point <uv1>, whose UTF-8 encoding starts at <p>,
		* according to the mapping named <normal>. The UTF-8 of the result is
		* written to <ustrp> and, if <lenp> is not NULL, its length in bytes to
		* <*lenp>. <swashp> points to the cached swash for the mapping, which is
		* created here on first need. If <special> is not NULL, a "special"
		* mapping is looked for first: in the hash named by <special> if that is
		* not the empty string, otherwise in the swash's "SPECIALS" entry.
		*
		* Returns the first code point of the result, which is <uv1> itself when
		* there is no mapping. */
	3049	STRLEN len = 0;
	3050
	3051	PERL_ARGS_ASSERT__TO_UTF8_CASE;
	3052
	3053	/* For code points that don't change case, we already know that the output
	3054	* of this function is the unchanged input, so we can skip doing look-ups
	3055	* for them. Unfortunately the case-changing code points are scattered
	3056	* around. But there are some long consecutive ranges where there are no
	3057	* case changing code points. By adding tests, we can eliminate the lookup
	3058	* for all the ones in such ranges. This is currently done here only for
	3059	* just a few cases where the scripts are in common use in modern commerce
	3060	* (and scripts adjacent to those which can be included without additional
	3061	* tests). */
	3062
	3063	if (uv1 >= 0x0590) {
	3064	/* This keeps from needing further processing the code points most
	3065	* likely to be used in the following non-cased scripts: Hebrew,
	3066	* Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
	3067	* Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
	3068	* Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
	3069	if (uv1 < 0x10A0) {
	3070	goto cases_to_self;
	3071	}
	3072
	3073	/* The following largish code point ranges also don't have case
	3074	* changes, but khw didn't think they warranted extra tests to speed
	3075	* them up (which would slightly slow down everything else above them):
	3076	* 1100..139F Hangul Jamo, Ethiopic
	3077	* 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
	3078	* Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
	3079	* Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
	3080	* Combining Diacritical Marks Extended, Balinese,
	3081	* Sundanese, Batak, Lepcha, Ol Chiki
	3082	* 2000..206F General Punctuation
	3083	*/
	3084
	3085	if (uv1 >= 0x2D30) {
	3086
	3087	/* This keeps from needing further processing the code points
	3088	* most likely to be used in the following non-cased major scripts:
	3089	* CJK, Katakana, Hiragana, plus some less-likely scripts.
	3090	*
	3091	* (0x2D30 above might have to be changed to 2F00 in the unlikely
	3092	* event that Unicode eventually allocates the unused block as of
	3093	* v8.0 2FE0..2FEF to code points that are cased. khw has verified
	3094	* that the test suite will start having failures to alert you
	3095	* should that happen) */
	3096	if (uv1 < 0xA640) {
	3097	goto cases_to_self;
	3098	}
	3099
	3100	if (uv1 >= 0xAC00) {
	3101	if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
	3102	if (ckWARN_d(WARN_SURROGATE)) {
	3103	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	3104	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	3105	"Operation \"%s\" returns its argument for"
	3106	" UTF-16 surrogate U+%04" UVXf, desc, uv1);
	3107	}
	3108	goto cases_to_self;
	3109	}
	3110
	3111	/* AC00..FAFF Catches Hangul syllables and private use, plus
	3112	* some others */
	3113	if (uv1 < 0xFB00) {
	3114	goto cases_to_self;
	3115
	3116	}
	3117
	3118	if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
	3119	if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) {
	3120	Perl_croak(aTHX_ cp_above_legal_max, uv1,
	3121	MAX_EXTERNALLY_LEGAL_CP);
	3122	}
	3123	if (ckWARN_d(WARN_NON_UNICODE)) {
	3124	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	3125	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	3126	"Operation \"%s\" returns its argument for"
	3127	" non-Unicode code point 0x%04" UVXf, desc, uv1);
	3128	}
	3129	goto cases_to_self;
	3130	}
	3131	#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
	3132	if (UNLIKELY(uv1
	3133	> HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
	3134	{
	3135
	3136	/* As of Unicode 10.0, this means we avoid swash creation
	3137	* for anything beyond high Plane 1 (below emojis) */
	3138	goto cases_to_self;
	3139	}
	3140	#endif
	3141	}
	3142	}
	3143
	3144	/* Note that non-characters are perfectly legal, so no warning should
	3145	* be given. There are so few of them, that it isn't worth the extra
	3146	* tests to avoid swash creation */
	3147	}
	3148
	3149	if (!*swashp) /* load on-demand */
	3150	*swashp = _core_swash_init("utf8", normal, &PL_sv_undef,
	3151	4, 0, NULL, NULL);
	3152
	3153	if (special) {
	3154	/* It might be "special" (sometimes, but not always,
	3155	* a multicharacter mapping) */
	3156	HV *hv = NULL;
	3157	SV **svp;
	3158
	3159	/* If passed in the specials name, use that; otherwise use any
	3160	* given in the swash */
	3161	if (*special != '\0') {
	3162	hv = get_hv(special, 0);
	3163	}
	3164	else {
	3165	svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
	3166	if (svp) {
	3167	hv = MUTABLE_HV(SvRV(*svp));
	3168	}
	3169	}
	3170
		/* The specials hash is keyed by the UTF-8 bytes of the input
		* character */
	3171	if (hv
	3172	&& (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
	3173	&& (*svp))
	3174	{
	3175	const char *s;
	3176
	3177	s = SvPV_const(*svp, len);
	3178	if (len == 1)
	3179	/* EIGHTBIT */
		/* A one-byte entry is the code point itself, not UTF-8, so
		* it is encoded here */
	3180	len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
	3181	else {
	3182	Copy(s, ustrp, len, U8);
	3183	}
	3184	}
	3185	}
	3186
		/* No special mapping was found (or none was asked for): try the
		* ordinary single-character mapping in the swash */
	3187	if (!len && *swashp) {
	3188	const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
	3189
	3190	if (uv2) {
	3191	/* It was "normal" (a single character mapping). */
	3192	len = uvchr_to_utf8(ustrp, uv2) - ustrp;
	3193	}
	3194	}
	3195
	3196	if (len) {
	3197	if (lenp) {
	3198	*lenp = len;
	3199	}
	3200	return valid_utf8_to_uvchr(ustrp, 0);
	3201	}
	3202
	3203	/* Here, there was no mapping defined, which means that the code point maps
	3204	* to itself. Return the inputs */
	3205	cases_to_self:
	3206	len = UTF8SKIP(p);
	3207	if (p != ustrp) { /* Don't copy onto itself */
	3208	Copy(p, ustrp, len, U8);
	3209	}
	3210
	3211	if (lenp)
	3212	*lenp = len;
	3213
	3214	return uv1;
	3215
	3216	}
	3217
	3218	STATIC UV
	3219	S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
	3220	U8* const ustrp, STRLEN *lenp)
	3221	{
	3222	/* This is called when changing the case of a UTF-8-encoded character above
	3223	* the Latin1 range, and the operation is in a non-UTF-8 locale. If the
	3224	* result contains a character that crosses the 255/256 boundary, disallow
	3225	* the change, and return the original code point. See L<perlfunc/lc> for
	3226	* why;
	3227	*
	3228	* p points to the original string whose case was changed; assumed
	3229	* by this routine to be well-formed
	3230	* result the code point of the first character in the changed-case string
	3231	* ustrp points to the changed-case string (<result> represents its
	3232	* first char)
	3233	* lenp points to the length of <ustrp>
		*
		* Returns <result> when the change is allowed. Otherwise <ustrp> is
		* overwritten with the original character from <p>, and that
		* character's code point is returned. */
	3234
	3235	UV original; /* To store the first code point of <p> */
	3236
	3237	PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
	3238
	3239	assert(UTF8_IS_ABOVE_LATIN1(*p));
	3240
	3241	/* We know immediately if the first character in the string crosses the
	3242	* boundary, so can skip */
		/* (that is, a <result> of 255 or less goes straight to 'bad_crossing') */
	3243	if (result > 255) {
	3244
	3245	/* Look at every character in the result; if any cross the
	3246	* boundary, the whole thing is disallowed */
	3247	U8* s = ustrp + UTF8SKIP(ustrp);
	3248	U8* e = ustrp + *lenp;
	3249	while (s < e) {
	3250	if (! UTF8_IS_ABOVE_LATIN1(*s)) {
	3251	goto bad_crossing;
	3252	}
	3253	s += UTF8SKIP(s);
	3254	}
	3255
	3256	/* Here, no characters crossed, result is ok as-is, but we warn. */
	3257	_CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
	3258	return result;
	3259	}
	3260
	3261	bad_crossing:
	3262
	3263	/* Failed, have to return the original */
		/* <lenp> is handed to valid_utf8_to_uvchr(), and the Copy() below then
		* uses <*lenp> as the number of bytes of <p> to copy */
	3264	original = valid_utf8_to_uvchr(p, lenp);
	3265
	3266	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3267	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3268	"Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
	3269	" locale; resolved to \"\\x{%" UVXf "}\".",
	3270	OP_DESC(PL_op),
	3271	original,
	3272	original);
	3273	Copy(p, ustrp, *lenp, char);
	3274	return original;
	3275	}
	3276
	3277	STATIC U32
	3278	S_check_and_deprecate(pTHX_ const U8 *p,
	3279	const U8 **e,
	3280	const unsigned int type, /* See below */
	3281	const bool use_locale, /* Is this a 'LC_'
	3282	macro call? */
	3283	const char * const file,
	3284	const unsigned line)
	3285	{
	3286	/* This is a temporary function to deprecate the unsafe calls to the case
	3287	* changing macros and functions. It keeps all the special stuff in just
	3288	* one place.
	3289	*
	3290	* It updates *e with the pointer to the end of the input string. If using
	3291	* the old-style macros, *e is NULL on input, and so this function assumes
	3292	* the input string is long enough to hold the entire UTF-8 sequence, and
	3293	* sets *e accordingly, but it then returns a flag to pass the
	3294	* utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
	3295	* using the full length if possible.
	3296	*
	3297	* It also does the assert that e > p when e is not NULL. This should be
	3298	* migrated to the callers when this function gets deleted.
	3299	*
		* The return value is _UTF8_NO_CONFIDENCE_IN_CURLEN when *e was NULL on
		* input, and 0 otherwise. <file> and <line> are passed on for use in the
		* deprecation warning.
		*
	3300	* The 'type' parameter is used for the caller to specify which case
	3301	* changing function this is called from: */
	3302
	3303	# define DEPRECATE_TO_UPPER 0
	3304	# define DEPRECATE_TO_TITLE 1
	3305	# define DEPRECATE_TO_LOWER 2
	3306	# define DEPRECATE_TO_FOLD 3
	3307
	3308	U32 utf8n_flags = 0;
	3309	const char * name;
	3310	const char * alternative;
	3311
	3312	PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
	3313
		/* A NULL *e marks an old-style call, which supplies no end pointer */
	3314	if (*e == NULL) {
	3315	utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
	3316	*e = p + UTF8SKIP(p);
	3317
	3318	/* For mathoms.c calls, we use the function name we know is stored
	3319	* there. It could be part of a larger path */
	3320	if (type == DEPRECATE_TO_UPPER) {
	3321	name = instr(file, "mathoms.c")
	3322	? "to_utf8_upper"
	3323	: "toUPPER_utf8";
	3324	alternative = "toUPPER_utf8_safe";
	3325	}
	3326	else if (type == DEPRECATE_TO_TITLE) {
	3327	name = instr(file, "mathoms.c")
	3328	? "to_utf8_title"
	3329	: "toTITLE_utf8";
	3330	alternative = "toTITLE_utf8_safe";
	3331	}
	3332	else if (type == DEPRECATE_TO_LOWER) {
	3333	name = instr(file, "mathoms.c")
	3334	? "to_utf8_lower"
	3335	: "toLOWER_utf8";
	3336	alternative = "toLOWER_utf8_safe";
	3337	}
	3338	else if (type == DEPRECATE_TO_FOLD) {
	3339	name = instr(file, "mathoms.c")
	3340	? "to_utf8_fold"
	3341	: "toFOLD_utf8";
	3342	alternative = "toFOLD_utf8_safe";
	3343	}
	3344	else Perl_croak(aTHX_ "panic: Unexpected case change type");
	3345
	3346	warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
	3347	}
	3348	else {
	3349	assert (p < *e);
	3350	}
	3351
	3352	return utf8n_flags;
	3353	}
	3354
	3355	/* The process for changing the case is essentially the same for the four case
	3356	* change types, except there are complications for folding. Otherwise the
	3357	* difference is only which case to change to. To make sure that they all do
	3358	* the same thing, the bodies of the functions are extracted out into the
	3359	* following two macros. The functions are written with the same variable
	3360	* names, and these are known and used inside these macros. It would be
	3361	* better, of course, to have inline functions to do it, but since different
	3362	* macros are called, depending on which case is being changed to, this is not
	3363	* feasible in C (to khw's knowledge). Two macros are created so that the fold
	3364	* function can start with the common start macro, then finish with its special
	3365	* handling; while the other three cases can just use the common end macro.
	3366	*
	3367	* The algorithm is to use the proper (passed in) macro or function to change
	3368	* the case for code points that are below 256. The macro is used if using
	3369	* locale rules for the case change; the function if not. If the code point is
	3370	* above 255, it is computed from the input UTF-8, and another macro is called
	3371	* to do the conversion. If necessary, the output is converted to UTF-8. If
	3372	* using a locale, we have to check that the change did not cross the 255/256
	3373	* boundary, see check_locale_boundary_crossing() for further details.
	3374	*
	3375	* The macros are split with the correct case change for the below-256 case
	3376	* stored into 'result', and in the middle of an else clause for the above-255
	3377	* case. At that point in the 'else', 'result' is not the final result, but is
	3378	* the input code point calculated from the UTF-8. The fold code needs to
	3379	* realize all this and take it from there.
	3380	*
		* The variables the macros expect the enclosing function to have in scope
		* are 'p', 'e', 'ustrp', 'lenp', 'flags', 'result' and 'utf8n_flags'.
		*
	3381	* If you read the two macros as sequential, it's easier to understand what's
	3382	* going on. */
	3383	#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \
	3384	L1_func_extra_param) \
	3385	\
	3386	if (flags & (locale_flags)) { \
	3387	/* Treat a UTF-8 locale as not being in locale at all */ \
	3388	if (IN_UTF8_CTYPE_LOCALE) { \
	3389	flags &= ~(locale_flags); \
	3390	} \
	3391	else { \
	3392	_CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
	3393	} \
	3394	} \
	3395	\
	3396	if (UTF8_IS_INVARIANT(*p)) { \
	3397	if (flags & (locale_flags)) { \
	3398	result = LC_L1_change_macro(*p); \
	3399	} \
	3400	else { \
	3401	return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
	3402	} \
	3403	} \
	3404	else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \
	3405	if (flags & (locale_flags)) { \
	3406	result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \
	3407	*(p+1))); \
	3408	} \
	3409	else { \
	3410	return L1_func(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), \
	3411	ustrp, lenp, L1_func_extra_param); \
	3412	} \
	3413	} \
	3414	else { /* malformed UTF-8 or ord above 255 */ \
	3415	STRLEN len_result; \
	3416	result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
	3417	if (len_result == (STRLEN) -1) { \
	3418	_force_out_malformed_utf8_message(p, e, utf8n_flags, \
	3419	1 /* Die */ ); \
	3420	}
	3421
		/* Finishes what CASE_CHANGE_BODY_START began. It first completes the
		* 'else' clause that macro leaves open, where 'result' is the input code
		* point above 255: that is mapped with <change_macro>, checked for a
		* boundary crossing if locale rules are in use, and returned. The code
		* after the closing brace is reached only from the below-256 locale
		* branches; it converts 'result' to UTF-8 in 'ustrp', sets '*lenp', and
		* returns 'result'. */
	3422	#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
	3423	result = change_macro(result, p, ustrp, lenp); \
	3424	\
	3425	if (flags & (locale_flags)) { \
	3426	result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
	3427	} \
	3428	return result; \
	3429	} \
	3430	\
	3431	/* Here, used locale rules. Convert back to UTF-8 */ \
	3432	if (UTF8_IS_INVARIANT(result)) { \
	3433	*ustrp = (U8) result; \
	3434	*lenp = 1; \
	3435	} \
	3436	else { \
	3437	*ustrp = UTF8_EIGHT_BIT_HI((U8) result); \
	3438	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \
	3439	*lenp = 2; \
	3440	} \
	3441	\
	3442	return result;
	3443
	3444	/*
	3445	=for apidoc to_utf8_upper
	3446
	3447	Instead use L</toUPPER_utf8_safe>.
	3448
	3449	=cut */
	3450
	3451	/* Not currently externally documented, and subject to change:
	3452	* <flags> is set iff the rules from the current underlying locale are to
	3453	* be used.
		*
		* Uppercases the UTF-8 encoded character starting at <p>, storing the
		* result as UTF-8 in <ustrp> and its length in bytes in <*lenp>, and
		* returns the first code point of the result. <e> points to the end of
		* the input, or is NULL for a deprecated old-style call, in which case
		* check_and_deprecate() supplies an end and raises the deprecation
		* warning using <file> and <line>. */
	3454
	3455	UV
	3456	Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
	3457	const U8 *e,
	3458	U8* ustrp,
	3459	STRLEN *lenp,
	3460	bool flags,
	3461	const char * const file,
	3462	const int line)
	3463	{
	3464	UV result;
	3465	const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
	3466	cBOOL(flags), file, line);
	3467
	3468	PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
	3469
	3470	/* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
	3471	/* 2nd char of uc(U+DF) is 'S' */
	3472	CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S');
	3473	CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
	3474	}
	3475
	3476	/*
	3477	=for apidoc to_utf8_title
	3478
	3479	Instead use L</toTITLE_utf8_safe>.
	3480
	3481	=cut */
	3482
	3483	/* Not currently externally documented, and subject to change:
	3484	* <flags> is set iff the rules from the current underlying locale are to be
	3485	* used. Since titlecase is not defined in POSIX, for other than a
	3486	* UTF-8 locale, uppercase is used instead for code points < 256.
	3487	*/
	3488
	3489	UV
	3490	Perl__to_utf8_title_flags(pTHX_ const U8 *p,
	3491	const U8 *e,
	3492	U8* ustrp,
	3493	STRLEN *lenp,
	3494	bool flags,
	3495	const char * const file,
	3496	const int line)
	3497	{
		/* Titlecases the UTF-8 encoded character starting at <p>, storing the
		* result as UTF-8 in <ustrp> and its length in bytes in <*lenp>, and
		* returns the first code point of the result. <e> points to the end of
		* the input, or is NULL for a deprecated old-style call, in which case
		* check_and_deprecate() supplies an end and raises the deprecation
		* warning using <file> and <line>. */
	3498	UV result;
	3499	const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
	3500	cBOOL(flags), file, line);
	3501
	3502	PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
	3503
		/* ~0 makes anything non-zero in 'flags' mean we are using locale rules,
		* under which toUPPER_LC is applied to code points below 256 */
	3504	/* 2nd char of ucfirst(U+DF) is 's' */
	3505	CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's');
	3506	CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
	3507	}
	3508
	3509	/*
	3510	=for apidoc to_utf8_lower
	3511
	3512	Instead use L</toLOWER_utf8_safe>.
	3513
	3514	=cut */
	3515
	3516	/* Not currently externally documented, and subject to change:
	3517	* <flags> is set iff the rules from the current underlying locale are to
	3518	* be used.
		*
		* Lowercases the UTF-8 encoded character starting at <p>, storing the
		* result as UTF-8 in <ustrp> and its length in bytes in <*lenp>, and
		* returns the first code point of the result. <e> points to the end of
		* the input, or is NULL for a deprecated old-style call, in which case
		* check_and_deprecate() supplies an end and raises the deprecation
		* warning using <file> and <line>.
	3519	*/
	3520
	3521	UV
	3522	Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
	3523	const U8 *e,
	3524	U8* ustrp,
	3525	STRLEN *lenp,
	3526	bool flags,
	3527	const char * const file,
	3528	const int line)
	3529	{
	3530	UV result;
	3531	const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
	3532	cBOOL(flags), file, line);
	3533
	3534	PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
	3535
		/* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
	3536	CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */)
	3537	CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
	3538	}
	3539
	3540	/*
	3541	=for apidoc to_utf8_fold
	3542
	3543	Instead use L</toFOLD_utf8_safe>.
	3544
	3545	=cut */
	3546
	3547	/* Not currently externally documented, and subject to change,
	3548	* in <flags>
	3549	* bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
	3550	* locale are to be used.
	3551	* bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
	3552	* otherwise simple folds
	3553	* bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
	3554	* prohibited
	3555	*/
	3556
	3557	UV
	3558	Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
	3559	const U8 *e,
	3560	U8* ustrp,
	3561	STRLEN *lenp,
	3562	U8 flags,
	3563	const char * const file,
	3564	const int line)
	3565	{
	3566	UV result;
	3567	const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
	3568	cBOOL(flags), file, line);
	3569
	3570	PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
	3571
	3572	/* These are mutually exclusive */
	3573	assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
	3574
	3575	assert(p != ustrp); /* Otherwise overwrites */
	3576
	3577	CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
	3578	((flags) & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII)));
	3579
	3580	result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
	3581
	3582	if (flags & FOLD_FLAGS_LOCALE) {
	3583
	3584	# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
	3585	const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
	3586
	3587	# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
	3588	# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
	3589
	3590	const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
	3591
	3592	/* Special case these two characters, as what normally gets
	3593	* returned under locale doesn't work */
	3594	if (UTF8SKIP(p) == cap_sharp_s_len
	3595	&& memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
	3596	{
	3597	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3598	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3599	"Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
	3600	"resolved to \"\\x{17F}\\x{17F}\".");
	3601	goto return_long_s;
	3602	}
	3603	else
	3604	#endif
	3605	if (UTF8SKIP(p) == long_s_t_len
	3606	&& memEQ((char *) p, LONG_S_T, long_s_t_len))
	3607	{
	3608	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3609	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3610	"Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
	3611	"resolved to \"\\x{FB06}\".");
	3612	goto return_ligature_st;
	3613	}
	3614
	3615	#if UNICODE_MAJOR_VERSION == 3 \
	3616	&& UNICODE_DOT_VERSION == 0 \
	3617	&& UNICODE_DOT_DOT_VERSION == 1
	3618	# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
	3619
	3620	/* And special case this on this Unicode version only, for the same
	3621	* reaons the other two are special cased. They would cross the
	3622	* 255/256 boundary which is forbidden under /l, and so the code
	3623	* wouldn't catch that they are equivalent (which they are only in
	3624	* this release) */
	3625	else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
	3626	&& memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
	3627	{
	3628	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3629	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3630	"Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
	3631	"resolved to \"\\x{0131}\".");
	3632	goto return_dotless_i;
	3633	}
	3634	#endif
	3635
	3636	return check_locale_boundary_crossing(p, result, ustrp, lenp);
	3637	}
	3638	else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
	3639	return result;
	3640	}
	3641	else {
	3642	/* This is called when changing the case of a UTF-8-encoded
	3643	* character above the ASCII range, and the result should not
	3644	* contain an ASCII character. */
	3645
	3646	UV original; /* To store the first code point of <p> */
	3647
	3648	/* Look at every character in the result; if any cross the
	3649	* boundary, the whole thing is disallowed */
	3650	U8* s = ustrp;
	3651	U8* e = ustrp + *lenp;
	3652	while (s < e) {
	3653	if (isASCII(*s)) {
	3654	/* Crossed, have to return the original */
	3655	original = valid_utf8_to_uvchr(p, lenp);
	3656
	3657	/* But in these instances, there is an alternative we can
	3658	* return that is valid */
	3659	if (original == LATIN_SMALL_LETTER_SHARP_S
	3660	#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
	3661	\|\| original == LATIN_CAPITAL_LETTER_SHARP_S
	3662	#endif
	3663	) {
	3664	goto return_long_s;
	3665	}
	3666	else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
	3667	goto return_ligature_st;
	3668	}
	3669	#if UNICODE_MAJOR_VERSION == 3 \
	3670	&& UNICODE_DOT_VERSION == 0 \
	3671	&& UNICODE_DOT_DOT_VERSION == 1
	3672
	3673	else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
	3674	goto return_dotless_i;
	3675	}
	3676	#endif
	3677	Copy(p, ustrp, *lenp, char);
	3678	return original;
	3679	}
	3680	s += UTF8SKIP(s);
	3681	}
	3682
	3683	/* Here, no characters crossed, result is ok as-is */
	3684	return result;
	3685	}
	3686	}
	3687
	3688	/* Here, used locale rules. Convert back to UTF-8 */
	3689	if (UTF8_IS_INVARIANT(result)) {
	3690	*ustrp = (U8) result;
	3691	*lenp = 1;
	3692	}
	3693	else {
	3694	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
	3695	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
	3696	*lenp = 2;
	3697	}
	3698
	3699	return result;
	3700
	3701	return_long_s:
	3702	/* Certain folds to 'ss' are prohibited by the options, but they do allow
	3703	* folds to a string of two of these characters. By returning this
	3704	* instead, then, e.g.,
	3705	* fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
	3706	* works. */
	3707
	3708	lenp = 2 sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
	3709	Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
	3710	ustrp, *lenp, U8);
	3711	return LATIN_SMALL_LETTER_LONG_S;
	3712
	3713	return_ligature_st:
	3714	/* Two folds to 'st' are prohibited by the options; instead we pick one and
	3715	* have the other one fold to it */
	3716
	3717	*lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
	3718	Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
	3719	return LATIN_SMALL_LIGATURE_ST;
	3720
	3721	#if UNICODE_MAJOR_VERSION == 3 \
	3722	&& UNICODE_DOT_VERSION == 0 \
	3723	&& UNICODE_DOT_DOT_VERSION == 1
	3724
	3725	return_dotless_i:
	3726	*lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
	3727	Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
	3728	return LATIN_SMALL_LETTER_DOTLESS_I;
	3729
	3730	#endif
	3731
	3732	}
	3733
	3734	/* Note:
	3735	* Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
	3736	* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
	3737	* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
	3738	*/
	3739
	3740	SV*
	3741	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
	3742	I32 minbits, I32 none)
	3743	{
	3744	PERL_ARGS_ASSERT_SWASH_INIT;
	3745
	3746	/* Returns a copy of a swash initiated by the called function. This is the
	3747	* public interface, and returning a copy prevents others from doing
	3748	* mischief on the original */
	3749
	3750	return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none,
	3751	NULL, NULL));
	3752	}
	3753
	3754	SV*
	3755	Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
	3756	I32 minbits, I32 none, SV* invlist,
	3757	U8* const flags_p)
	3758	{
	3759
	3760	/*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
	3761	* use the following define */
	3762
	3763	#define CORE_SWASH_INIT_RETURN(x) \
	3764	PL_curpm= old_PL_curpm; \
	3765	return x
	3766
	3767	/* Initialize and return a swash, creating it if necessary. It does this
	3768	* by calling utf8_heavy.pl in the general case. The returned value may be
	3769	* the swash's inversion list instead if the input parameters allow it.
	3770	* Which is returned should be immaterial to callers, as the only
	3771	* operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
	3772	* and swash_to_invlist() handle both these transparently.
	3773	*
	3774	* This interface should only be used by functions that won't destroy or
	3775	* adversely change the swash, as doing so affects all other uses of the
	3776	* swash in the program; the general public should use 'Perl_swash_init'
	3777	* instead.
	3778	*
	3779	* pkg is the name of the package that <name> should be in.
	3780	* name is the name of the swash to find. Typically it is a Unicode
	3781	* property name, including user-defined ones
	3782	* listsv is a string to initialize the swash with. It must be of the form
	3783	* documented as the subroutine return value in
	3784	* L<perlunicode/User-Defined Character Properties>
	3785	* minbits is the number of bits required to represent each data element.
	3786	* It is '1' for binary properties.
	3787	* none I (khw) do not understand this one, but it is used only in tr///.
	3788	* invlist is an inversion list to initialize the swash with (or NULL)
	3789	* flags_p if non-NULL is the address of various input and output flag bits
	3790	* to the routine, as follows: ('I' means is input to the routine;
	3791	* 'O' means output from the routine. Only flags marked O are
	3792	* meaningful on return.)
	3793	* _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
	3794	* came from a user-defined property. (I O)
	3795	* _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
	3796	* when the swash cannot be located, to simply return NULL. (I)
	3797	* _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
	3798	* return of an inversion list instead of a swash hash if this routine
	3799	* thinks that would result in faster execution of swash_fetch() later
	3800	* on. (I)
	3801	*
	3802	* Thus there are three possible inputs to find the swash: <name>,
	3803	* <listsv>, and <invlist>. At least one must be specified. The result
	3804	* will be the union of the specified ones, although <listsv>'s various
	3805	* actions can intersect, etc. what <name> gives. To avoid going out to
	3806	* disk at all, <invlist> should specify completely what the swash should
	3807	* have, and <listsv> should be &PL_sv_undef and <name> should be "".
	3808	*
	3809	* <invlist> is only valid for binary properties */
	3810
	3811	PMOP old_PL_curpm= PL_curpm; / save away the old PL_curpm */
	3812
	3813	SV* retval = &PL_sv_undef;
	3814	HV* swash_hv = NULL;
	3815	const int invlist_swash_boundary =
	3816	(flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
	3817	? 512 /* Based on some benchmarking, but not extensive, see commit
	3818	message */
	3819	: -1; /* Never return just an inversion list */
	3820
	3821	assert(listsv != &PL_sv_undef \|\| strNE(name, "") \|\| invlist);
	3822	assert(! invlist \|\| minbits == 1);
	3823
	3824	PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the
	3825	regex that triggered the swash init and the swash init
	3826	perl logic itself. See perl #122747 */
	3827
	3828	/* If data was passed in to go out to utf8_heavy to find the swash of, do
	3829	* so */
	3830	if (listsv != &PL_sv_undef \|\| strNE(name, "")) {
	3831	dSP;
	3832	const size_t pkg_len = strlen(pkg);
	3833	const size_t name_len = strlen(name);
	3834	HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
	3835	SV* errsv_save;
	3836	GV *method;
	3837
	3838	PERL_ARGS_ASSERT__CORE_SWASH_INIT;
	3839
	3840	PUSHSTACKi(PERLSI_MAGIC);
	3841	ENTER;
	3842	SAVEHINTS();
	3843	save_re_context();
	3844	/* We might get here via a subroutine signature which uses a utf8
	3845	* parameter name, at which point PL_subname will have been set
	3846	* but not yet used. */
	3847	save_item(PL_subname);
	3848	if (PL_parser && PL_parser->error_count)
	3849	SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
	3850	method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
	3851	if (!method) { /* demand load UTF-8 */
	3852	ENTER;
	3853	if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
	3854	GvSV(PL_errgv) = NULL;
	3855	#ifndef NO_TAINT_SUPPORT
	3856	/* It is assumed that callers of this routine are not passing in
	3857	* any user derived data. */
	3858	/* Need to do this after save_re_context() as it will set
	3859	* PL_tainted to 1 while saving $1 etc (see the code after getrx:
	3860	* in Perl_magic_get). Even line to create errsv_save can turn on
	3861	* PL_tainted. */
	3862	SAVEBOOL(TAINT_get);
	3863	TAINT_NOT;
	3864	#endif
	3865	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	3866	NULL);
	3867	{
	3868	/* Not ERRSV, as there is no need to vivify a scalar we are
	3869	about to discard. */
	3870	SV * const errsv = GvSV(PL_errgv);
	3871	if (!SvTRUE(errsv)) {
	3872	GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
	3873	SvREFCNT_dec(errsv);
	3874	}
	3875	}
	3876	LEAVE;
	3877	}
	3878	SPAGAIN;
	3879	PUSHMARK(SP);
	3880	EXTEND(SP,5);
	3881	mPUSHp(pkg, pkg_len);
	3882	mPUSHp(name, name_len);
	3883	PUSHs(listsv);
	3884	mPUSHi(minbits);
	3885	mPUSHi(none);
	3886	PUTBACK;
	3887	if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
	3888	GvSV(PL_errgv) = NULL;
	3889	/* If we already have a pointer to the method, no need to use
	3890	* call_method() to repeat the lookup. */
	3891	if (method
	3892	? call_sv(MUTABLE_SV(method), G_SCALAR)
	3893	: call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR \| G_METHOD))
	3894	{
	3895	retval = *PL_stack_sp--;
	3896	SvREFCNT_inc(retval);
	3897	}
	3898	{
	3899	/* Not ERRSV. See above. */
	3900	SV * const errsv = GvSV(PL_errgv);
	3901	if (!SvTRUE(errsv)) {
	3902	GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
	3903	SvREFCNT_dec(errsv);
	3904	}
	3905	}
	3906	LEAVE;
	3907	POPSTACK;
	3908	if (IN_PERL_COMPILETIME) {
	3909	CopHINTS_set(PL_curcop, PL_hints);
	3910	}
	3911	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	3912	if (SvPOK(retval)) {
	3913
	3914	/* If caller wants to handle missing properties, let them */
	3915	if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
	3916	CORE_SWASH_INIT_RETURN(NULL);
	3917	}
	3918	Perl_croak(aTHX_
	3919	"Can't find Unicode property definition \"%" SVf "\"",
	3920	SVfARG(retval));
	3921	NOT_REACHED; /* NOTREACHED */
	3922	}
	3923	}
	3924	} /* End of calling the module to find the swash */
	3925
	3926	/* If this operation fetched a swash, and we will need it later, get it */
	3927	if (retval != &PL_sv_undef
	3928	&& (minbits == 1 \|\| (flags_p
	3929	&& ! (*flags_p
	3930	& _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
	3931	{
	3932	swash_hv = MUTABLE_HV(SvRV(retval));
	3933
	3934	/* If we don't already know that there is a user-defined component to
	3935	* this swash, and the user has indicated they wish to know if there is
	3936	* one (by passing <flags_p>), find out */
	3937	if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
	3938	SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
	3939	if (user_defined && SvUV(*user_defined)) {
	3940	*flags_p \|= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
	3941	}
	3942	}
	3943	}
	3944
	3945	/* Make sure there is an inversion list for binary properties */
	3946	if (minbits == 1) {
	3947	SV** swash_invlistsvp = NULL;
	3948	SV* swash_invlist = NULL;
	3949	bool invlist_in_swash_is_valid = FALSE;
	3950	bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
	3951	an unclaimed reference count */
	3952
	3953	/* If this operation fetched a swash, get its already existing
	3954	* inversion list, or create one for it */
	3955
	3956	if (swash_hv) {
	3957	swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
	3958	if (swash_invlistsvp) {
	3959	swash_invlist = *swash_invlistsvp;
	3960	invlist_in_swash_is_valid = TRUE;
	3961	}
	3962	else {
	3963	swash_invlist = _swash_to_invlist(retval);
	3964	swash_invlist_unclaimed = TRUE;
	3965	}
	3966	}
	3967
	3968	/* If an inversion list was passed in, have to include it */
	3969	if (invlist) {
	3970
	3971	/* Any fetched swash will by now have an inversion list in it;
	3972	* otherwise <swash_invlist> will be NULL, indicating that we
	3973	* didn't fetch a swash */
	3974	if (swash_invlist) {
	3975
	3976	/* Add the passed-in inversion list, which invalidates the one
	3977	* already stored in the swash */
	3978	invlist_in_swash_is_valid = FALSE;
	3979	SvREADONLY_off(swash_invlist); /* Turned on again below */
	3980	_invlist_union(invlist, swash_invlist, &swash_invlist);
	3981	}
	3982	else {
	3983
	3984	/* Here, there is no swash already. Set up a minimal one, if
	3985	* we are going to return a swash */
	3986	if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
	3987	swash_hv = newHV();
	3988	retval = newRV_noinc(MUTABLE_SV(swash_hv));
	3989	}
	3990	swash_invlist = invlist;
	3991	}
	3992	}
	3993
	3994	/* Here, we have computed the union of all the passed-in data. It may
	3995	* be that there was an inversion list in the swash which didn't get
	3996	* touched; otherwise save the computed one */
	3997	if (! invlist_in_swash_is_valid
	3998	&& (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
	3999	{
	4000	if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
	4001	{
	4002	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	4003	}
	4004	/* We just stole a reference count. */
	4005	if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
	4006	else SvREFCNT_inc_simple_void_NN(swash_invlist);
	4007	}
	4008
	4009	/* The result is immutable. Forbid attempts to change it. */
	4010	SvREADONLY_on(swash_invlist);
	4011
	4012	/* Use the inversion list stand-alone if small enough */
	4013	if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
	4014	SvREFCNT_dec(retval);
	4015	if (!swash_invlist_unclaimed)
	4016	SvREFCNT_inc_simple_void_NN(swash_invlist);
	4017	retval = newRV_noinc(swash_invlist);
	4018	}
	4019	}
	4020
	4021	CORE_SWASH_INIT_RETURN(retval);
	4022	#undef CORE_SWASH_INIT_RETURN
	4023	}
	4024
	4025
	4026	/* This API is wrong for special case conversions since we may need to
	4027	* return several Unicode characters for a single Unicode character
	4028	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	4029	* the lower-level routine, and it is similarly broken for returning
	4030	* multiple values. --jhi
	4031	* For those, you should use S__to_utf8_case() instead */
	4032	/* Now SWASHGET is recast as S_swatch_get in this file. */
	4033
	4034	/* Note:
	4035	* Returns the value of property/mapping C<swash> for the first character
	4036	* of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
	4037	* assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
	4038	* is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
	4039	*
	4040	* A "swash" is a hash which contains initially the keys/values set up by
	4041	* SWASHNEW. The purpose is to be able to completely represent a Unicode
	4042	* property for all possible code points. Things are stored in a compact form
	4043	* (see utf8_heavy.pl) so that calculation is required to find the actual
	4044	* property value for a given code point. As code points are looked up, new
	4045	* key/value pairs are added to the hash, so that the calculation doesn't have
	4046	* to ever be re-done. Further, each calculation is done, not just for the
	4047	* desired one, but for a whole block of code points adjacent to that one.
	4048	* For binary properties on ASCII machines, the block is usually for 64 code
	4049	* points, starting with a code point evenly divisible by 64. Thus if the
	4050	* property value for code point 257 is requested, the code goes out and
	4051	* calculates the property values for all 64 code points between 256 and 319,
	4052	* and stores these as a single 64-bit long bit vector, called a "swatch",
	4053	* under the key for code point 256. The key is the UTF-8 encoding for code
	4054	* point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
	4055	* for a code point is 13 bytes, the key will be 12 bytes long. If the value
	4056	* for code point 258 is then requested, this code realizes that it would be
	4057	* stored under the key for 256, and would find that value and extract the
	4058	* relevant bit, offset from 256.
	4059	*
	4060	* Non-binary properties are stored in as many bits as necessary to represent
	4061	* their values (32 currently, though the code is more general than that), not
	4062	* as single bits, but the principle is the same: the value for each key is a
	4063	* vector that encompasses the property values for all code points whose UTF-8
	4064	* representations are represented by the key. That is, for all code points
	4065	* whose UTF-8 representations are length N bytes, and the key is the first N-1
	4066	* bytes of that.
	4067	*/
	4068	UV
	4069	Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
	4070	{
	4071	HV *const hv = MUTABLE_HV(SvRV(swash));
	4072	U32 klen;
	4073	U32 off;
	4074	STRLEN slen = 0;
	4075	STRLEN needents;
	4076	const U8 *tmps = NULL;
	4077	SV *swatch;
	4078	const U8 c = *ptr;
	4079
	4080	PERL_ARGS_ASSERT_SWASH_FETCH;
	4081
	4082	/* If it really isn't a hash, it isn't really swash; must be an inversion
	4083	* list */
	4084	if (SvTYPE(hv) != SVt_PVHV) {
	4085	return _invlist_contains_cp((SV*)hv,
	4086	(do_utf8)
	4087	? valid_utf8_to_uvchr(ptr, NULL)
	4088	: c);
	4089	}
	4090
	4091	/* We store the values in a "swatch" which is a vec() value in a swash
	4092	* hash. Code points 0-255 are a single vec() stored with key length
	4093	* (klen) 0. All other code points have a UTF-8 representation
	4094	* 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
	4095	* share 0xAA..0xYY, which is the key in the hash to that vec. So the key
	4096	* length for them is the length of the encoded char - 1. ptr[klen] is the
	4097	* final byte in the sequence representing the character */
	4098	if (!do_utf8 \|\| UTF8_IS_INVARIANT(c)) {
	4099	klen = 0;
	4100	needents = 256;
	4101	off = c;
	4102	}
	4103	else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	4104	klen = 0;
	4105	needents = 256;
	4106	off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
	4107	}
	4108	else {
	4109	klen = UTF8SKIP(ptr) - 1;
	4110
	4111	/* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
	4112	* the vec is the final byte in the sequence. (In EBCDIC this is
	4113	* converted to I8 to get consecutive values.) To help you visualize
	4114	* all this:
	4115	* Straight 1047 After final byte
	4116	* UTF-8 UTF-EBCDIC I8 transform
	4117	* U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
	4118	* U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
	4119	* ...
	4120	* U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
	4121	* U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
	4122	* ...
	4123	* U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
	4124	* U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
	4125	* ...
	4126	* U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
	4127	* U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
	4128	* ...
	4129	* U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
	4130	* U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
	4131	*
	4132	* (There are no discontinuities in the elided (...) entries.)
	4133	* The UTF-8 key for these 33 code points is '\xD0' (which also is the
	4134	* key for the next 31, up through U+043F, whose UTF-8 final byte is
	4135	* \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
	4136	* The final UTF-8 byte, which ranges between \x80 and \xBF, is an
	4137	* index into the vec() swatch (after subtracting 0x80, which we
	4138	* actually do with an '&').
	4139	* In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
	4140	* code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
	4141	* dicontinuities which go away by transforming it into I8, and we
	4142	* effectively subtract 0xA0 to get the index. */
	4143	needents = (1 << UTF_ACCUMULATION_SHIFT);
	4144	off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
	4145	}
	4146
	4147	/*
	4148	* This single-entry cache saves about 1/3 of the UTF-8 overhead in test
	4149	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	4150	* it's nothing to sniff at.) Pity we usually come through at least
	4151	* two function calls to get here...
	4152	*
	4153	* NB: this code assumes that swatches are never modified, once generated!
	4154	*/
	4155
	4156	if (hv == PL_last_swash_hv &&
	4157	klen == PL_last_swash_klen &&
	4158	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	4159	{
	4160	tmps = PL_last_swash_tmps;
	4161	slen = PL_last_swash_slen;
	4162	}
	4163	else {
	4164	/* Try our second-level swatch cache, kept in a hash. */
	4165	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	4166
	4167	/* If not cached, generate it via swatch_get */
	4168	if (!svp \|\| !SvPOK(*svp)
	4169	\|\| !(tmps = (const U8)SvPV_const(svp, slen)))
	4170	{
	4171	if (klen) {
	4172	const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
	4173	swatch = swatch_get(swash,
	4174	code_point & ~((UV)needents - 1),
	4175	needents);
	4176	}
	4177	else { /* For the first 256 code points, the swatch has a key of
	4178	length 0 */
	4179	swatch = swatch_get(swash, 0, needents);
	4180	}
	4181
	4182	if (IN_PERL_COMPILETIME)
	4183	CopHINTS_set(PL_curcop, PL_hints);
	4184
	4185	svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
	4186
	4187	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
	4188	\|\| (slen << 3) < needents)
	4189	Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
	4190	"svp=%p, tmps=%p, slen=%" UVuf ", needents=%" UVuf,
	4191	svp, tmps, (UV)slen, (UV)needents);
	4192	}
	4193
	4194	PL_last_swash_hv = hv;
	4195	assert(klen <= sizeof(PL_last_swash_key));
	4196	PL_last_swash_klen = (U8)klen;
	4197	/* FIXME change interpvar.h? */
	4198	PL_last_swash_tmps = (U8 *) tmps;
	4199	PL_last_swash_slen = slen;
	4200	if (klen)
	4201	Copy(ptr, PL_last_swash_key, klen, U8);
	4202	}
	4203
	4204	switch ((int)((slen << 3) / needents)) {
	4205	case 1:
	4206	return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
	4207	case 8:
	4208	return ((UV) tmps[off]);
	4209	case 16:
	4210	off <<= 1;
	4211	return
	4212	((UV) tmps[off ] << 8) +
	4213	((UV) tmps[off + 1]);
	4214	case 32:
	4215	off <<= 2;
	4216	return
	4217	((UV) tmps[off ] << 24) +
	4218	((UV) tmps[off + 1] << 16) +
	4219	((UV) tmps[off + 2] << 8) +
	4220	((UV) tmps[off + 3]);
	4221	}
	4222	Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
	4223	"slen=%" UVuf ", needents=%" UVuf, (UV)slen, (UV)needents);
	4224	NORETURN_FUNCTION_END;
	4225	}
	4226
	4227	/* Read a single line of the main body of the swash input text. These are of
	4228	* the form:
	4229	* 0053 0056 0073
	4230	* where each number is hex. The first two numbers form the minimum and
	4231	* maximum of a range, and the third is the value associated with the range.
	4232	* Not all swashes should have a third number
	4233	*
	4234	* On input: l points to the beginning of the line to be examined; it points
	4235	* to somewhere in the string of the whole input text, and is
	4236	* terminated by a \n or the null string terminator.
	4237	* lend points to the null terminator of that string
	4238	* wants_value is non-zero if the swash expects a third number
	4239	* typestr is the name of the swash's mapping, like 'ToLower'
	4240	* On output: *min, *max, and *val are set to the values read from the line.
	4241	* returns a pointer just beyond the line examined. If there was no
	4242	* valid min number on the line, returns lend+1
	4243	*/
	4244
	4245	STATIC U8*
	4246	S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
	4247	const bool wants_value, const U8* const typestr)
	4248	{
	4249	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
	4250	STRLEN numlen; /* Length of the number */
	4251	I32 flags = PERL_SCAN_SILENT_ILLDIGIT
	4252	\| PERL_SCAN_DISALLOW_PREFIX
	4253	\| PERL_SCAN_SILENT_NON_PORTABLE;
	4254
	4255	/* nl points to the next \n in the scan */
	4256	U8* const nl = (U8*)memchr(l, '\n', lend - l);
	4257
	4258	PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
	4259
	4260	/* Get the first number on the line: the range minimum */
	4261	numlen = lend - l;
	4262	min = grok_hex((char )l, &numlen, &flags, NULL);
	4263	max = min; /* So can never return without setting max */
	4264	if (numlen) /* If found a hex number, position past it */
	4265	l += numlen;
	4266	else if (nl) { /* Else, go handle next line, if any */
	4267	return nl + 1; /* 1 is length of "\n" */
	4268	}
	4269	else { /* Else, no next line */
	4270	return lend + 1; /* to LIST's end at which \n is not found */
	4271	}
	4272
	4273	/* The max range value follows, separated by a BLANK */
	4274	if (isBLANK(*l)) {
	4275	++l;
	4276	flags = PERL_SCAN_SILENT_ILLDIGIT
	4277	\| PERL_SCAN_DISALLOW_PREFIX
	4278	\| PERL_SCAN_SILENT_NON_PORTABLE;
	4279	numlen = lend - l;
	4280	max = grok_hex((char )l, &numlen, &flags, NULL);
	4281	if (numlen)
	4282	l += numlen;
	4283	else /* If no value here, it is a single element range */
	4284	max = min;
	4285
	4286	/* Non-binary tables have a third entry: what the first element of the
	4287	* range maps to. The map for those currently read here is in hex */
	4288	if (wants_value) {
	4289	if (isBLANK(*l)) {
	4290	++l;
	4291	flags = PERL_SCAN_SILENT_ILLDIGIT
	4292	\| PERL_SCAN_DISALLOW_PREFIX
	4293	\| PERL_SCAN_SILENT_NON_PORTABLE;
	4294	numlen = lend - l;
	4295	val = grok_hex((char )l, &numlen, &flags, NULL);
	4296	if (numlen)
	4297	l += numlen;
	4298	else
	4299	*val = 0;
	4300	}
	4301	else {
	4302	*val = 0;
	4303	if (typeto) {
	4304	/* diag_listed_as: To%s: illegal mapping '%s' */
	4305	Perl_croak(aTHX_ "%s: illegal mapping '%s'",
	4306	typestr, l);
	4307	}
	4308	}
	4309	}
	4310	else
	4311	val = 0; / bits == 1, then any val should be ignored */
	4312	}
	4313	else { /* Nothing following range min, should be single element with no
	4314	mapping expected */
	4315	if (wants_value) {
	4316	*val = 0;
	4317	if (typeto) {
	4318	/* diag_listed_as: To%s: illegal mapping '%s' */
	4319	Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
	4320	}
	4321	}
	4322	else
	4323	val = 0; / bits == 1, then val should be ignored */
	4324	}
	4325
	4326	/* Position to next line if any, or EOF */
	4327	if (nl)
	4328	l = nl + 1;
	4329	else
	4330	l = lend;
	4331
	4332	return l;
	4333	}
	4334
	4335	/* Note:
	4336	* Returns a swatch (a bit vector string) for a code point sequence
	4337	* that starts from the value C<start> and comprises the number C<span>.
	4338	* A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
	4339	* Should be used via swash_fetch, which will cache the swatch in C<swash>.
	4340	*/
	4341	STATIC SV*
	4342	S_swatch_get(pTHX_ SV* swash, UV start, UV span)
	4343	{
	4344	SV *swatch;
	4345	U8 l, lend, x, xend, s, send;
	4346	STRLEN lcur, xcur, scur;
	4347	HV *const hv = MUTABLE_HV(SvRV(swash));
	4348	SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
	4349
	4350	SV** listsvp = NULL; /* The string containing the main body of the table */
	4351	SV** extssvp = NULL;
	4352	SV** invert_it_svp = NULL;
	4353	U8* typestr = NULL;
	4354	STRLEN bits;
	4355	STRLEN octets; /* if bits == 1, then octets == 0 */
	4356	UV none;
	4357	UV end = start + span;
	4358
	4359	if (invlistsvp == NULL) {
	4360	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	4361	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	4362	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	4363	extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	4364	listsvp = hv_fetchs(hv, "LIST", FALSE);
	4365	invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	4366
	4367	bits = SvUV(*bitssvp);
	4368	none = SvUV(*nonesvp);
	4369	typestr = (U8)SvPV_nolen(typesvp);
	4370	}
	4371	else {
	4372	bits = 1;
	4373	none = 0;
	4374	}
	4375	octets = bits >> 3; /* if bits == 1, then octets == 0 */
	4376
	4377	PERL_ARGS_ASSERT_SWATCH_GET;
	4378
	4379	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	4380	Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %" UVuf,
	4381	(UV)bits);
	4382	}
	4383
	4384	/* If overflowed, use the max possible */
	4385	if (end < start) {
	4386	end = UV_MAX;
	4387	span = end - start;
	4388	}
	4389
	4390	/* create and initialize $swatch */
	4391	scur = octets ? (span * octets) : (span + 7) / 8;
	4392	swatch = newSV(scur);
	4393	SvPOK_on(swatch);
	4394	s = (U8*)SvPVX(swatch);
	4395	if (octets && none) {
	4396	const U8* const e = s + scur;
	4397	while (s < e) {
	4398	if (bits == 8)
	4399	*s++ = (U8)(none & 0xff);
	4400	else if (bits == 16) {
	4401	*s++ = (U8)((none >> 8) & 0xff);
	4402	*s++ = (U8)( none & 0xff);
	4403	}
	4404	else if (bits == 32) {
	4405	*s++ = (U8)((none >> 24) & 0xff);
	4406	*s++ = (U8)((none >> 16) & 0xff);
	4407	*s++ = (U8)((none >> 8) & 0xff);
	4408	*s++ = (U8)( none & 0xff);
	4409	}
	4410	}
	4411	*s = '\0';
	4412	}
	4413	else {
	4414	(void)memzero((U8*)s, scur + 1);
	4415	}
	4416	SvCUR_set(swatch, scur);
	4417	s = (U8*)SvPVX(swatch);
	4418
	4419	if (invlistsvp) { /* If has an inversion list set up use that */
	4420	_invlist_populate_swatch(*invlistsvp, start, end, s);
	4421	return swatch;
	4422	}
	4423
	4424	/* read $swash->{LIST} */
	4425	l = (U8)SvPV(listsvp, lcur);
	4426	lend = l + lcur;
	4427	while (l < lend) {
	4428	UV min, max, val, upper;
	4429	l = swash_scan_list_line(l, lend, &min, &max, &val,
	4430	cBOOL(octets), typestr);
	4431	if (l > lend) {
	4432	break;
	4433	}
	4434
	4435	/* If looking for something beyond this range, go try the next one */
	4436	if (max < start)
	4437	continue;
	4438
	4439	/* <end> is generally 1 beyond where we want to set things, but at the
	4440	* platform's infinity, where we can't go any higher, we want to
	4441	* include the code point at <end> */
	4442	upper = (max < end)
	4443	? max
	4444	: (max != UV_MAX \|\| end != UV_MAX)
	4445	? end - 1
	4446	: end;
	4447
	4448	if (octets) {
	4449	UV key;
	4450	if (min < start) {
	4451	if (!none \|\| val < none) {
	4452	val += start - min;
	4453	}
	4454	min = start;
	4455	}
	4456	for (key = min; key <= upper; key++) {
	4457	STRLEN offset;
	4458	/* offset must be non-negative (start <= min <= key < end) */
	4459	offset = octets * (key - start);
	4460	if (bits == 8)
	4461	s[offset] = (U8)(val & 0xff);
	4462	else if (bits == 16) {
	4463	s[offset ] = (U8)((val >> 8) & 0xff);
	4464	s[offset + 1] = (U8)( val & 0xff);
	4465	}
	4466	else if (bits == 32) {
	4467	s[offset ] = (U8)((val >> 24) & 0xff);
	4468	s[offset + 1] = (U8)((val >> 16) & 0xff);
	4469	s[offset + 2] = (U8)((val >> 8) & 0xff);
	4470	s[offset + 3] = (U8)( val & 0xff);
	4471	}
	4472
	4473	if (!none \|\| val < none)
	4474	++val;
	4475	}
	4476	}
	4477	else { /* bits == 1, then val should be ignored */
	4478	UV key;
	4479	if (min < start)
	4480	min = start;
	4481
	4482	for (key = min; key <= upper; key++) {
	4483	const STRLEN offset = (STRLEN)(key - start);
	4484	s[offset >> 3] \|= 1 << (offset & 7);
	4485	}
	4486	}
	4487	} /* while */
	4488
	4489	/* Invert if the data says it should be. Assumes that bits == 1 */
	4490	if (invert_it_svp && SvUV(*invert_it_svp)) {
	4491
	4492	/* Unicode properties should come with all bits above PERL_UNICODE_MAX
	4493	* be 0, and their inversion should also be 0, as we don't succeed any
	4494	* Unicode property matches for non-Unicode code points */
	4495	if (start <= PERL_UNICODE_MAX) {
	4496
	4497	/* The code below assumes that we never cross the
	4498	* Unicode/above-Unicode boundary in a range, as otherwise we would
	4499	* have to figure out where to stop flipping the bits. Since this
	4500	* boundary is divisible by a large power of 2, and swatches comes
	4501	* in small powers of 2, this should be a valid assumption */
	4502	assert(start + span - 1 <= PERL_UNICODE_MAX);
	4503
	4504	send = s + scur;
	4505	while (s < send) {
	4506	s = ~(s);
	4507	s++;
	4508	}
	4509	}
	4510	}
	4511
	4512	/* read $swash->{EXTRAS}
	4513	* This code also copied to swash_to_invlist() below */
	4514	x = (U8)SvPV(extssvp, xcur);
	4515	xend = x + xcur;
	4516	while (x < xend) {
	4517	STRLEN namelen;
	4518	U8 *namestr;
	4519	SV** othersvp;
	4520	HV* otherhv;
	4521	STRLEN otherbits;
	4522	SV *otherbitssvp, other;
	4523	U8 s, o, *nl;
	4524	STRLEN slen, olen;
	4525
	4526	const U8 opc = *x++;
	4527	if (opc == '\n')
	4528	continue;
	4529
	4530	nl = (U8*)memchr(x, '\n', xend - x);
	4531
	4532	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	4533	if (nl) {
	4534	x = nl + 1; /* 1 is length of "\n" */
	4535	continue;
	4536	}
	4537	else {
	4538	x = xend; /* to EXTRAS' end at which \n is not found */
	4539	break;
	4540	}
	4541	}
	4542
	4543	namestr = x;
	4544	if (nl) {
	4545	namelen = nl - namestr;
	4546	x = nl + 1;
	4547	}
	4548	else {
	4549	namelen = xend - namestr;
	4550	x = xend;
	4551	}
	4552
	4553	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	4554	otherhv = MUTABLE_HV(SvRV(*othersvp));
	4555	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	4556	otherbits = (STRLEN)SvUV(*otherbitssvp);
	4557	if (bits < otherbits)
	4558	Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
	4559	"bits=%" UVuf ", otherbits=%" UVuf, (UV)bits, (UV)otherbits);
	4560
	4561	/* The "other" swatch must be destroyed after. */
	4562	other = swatch_get(*othersvp, start, span);
	4563	o = (U8*)SvPV(other, olen);
	4564
	4565	if (!olen)
	4566	Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
	4567
	4568	s = (U8*)SvPV(swatch, slen);
	4569	if (bits == 1 && otherbits == 1) {
	4570	if (slen != olen)
	4571	Perl_croak(aTHX_ "panic: swatch_get found swatch length "
	4572	"mismatch, slen=%" UVuf ", olen=%" UVuf,
	4573	(UV)slen, (UV)olen);
	4574
	4575	switch (opc) {
	4576	case '+':
	4577	while (slen--)
	4578	s++ \|= o++;
	4579	break;
	4580	case '!':
	4581	while (slen--)
	4582	s++ \|= ~o++;
	4583	break;
	4584	case '-':
	4585	while (slen--)
	4586	s++ &= ~o++;
	4587	break;
	4588	case '&':
	4589	while (slen--)
	4590	s++ &= o++;
	4591	break;
	4592	default:
	4593	break;
	4594	}
	4595	}
	4596	else {
	4597	STRLEN otheroctets = otherbits >> 3;
	4598	STRLEN offset = 0;
	4599	U8* const send = s + slen;
	4600
	4601	while (s < send) {
	4602	UV otherval = 0;
	4603
	4604	if (otherbits == 1) {
	4605	otherval = (o[offset >> 3] >> (offset & 7)) & 1;
	4606	++offset;
	4607	}
	4608	else {
	4609	STRLEN vlen = otheroctets;
	4610	otherval = *o++;
	4611	while (--vlen) {
	4612	otherval <<= 8;
	4613	otherval \|= *o++;
	4614	}
	4615	}
	4616
	4617	if (opc == '+' && otherval)
	4618	NOOP; /* replace with otherval */
	4619	else if (opc == '!' && !otherval)
	4620	otherval = 1;
	4621	else if (opc == '-' && otherval)
	4622	otherval = 0;
	4623	else if (opc == '&' && !otherval)
	4624	otherval = 0;
	4625	else {
	4626	s += octets; /* no replacement */
	4627	continue;
	4628	}
	4629
	4630	if (bits == 8)
	4631	*s++ = (U8)( otherval & 0xff);
	4632	else if (bits == 16) {
	4633	*s++ = (U8)((otherval >> 8) & 0xff);
	4634	*s++ = (U8)( otherval & 0xff);
	4635	}
	4636	else if (bits == 32) {
	4637	*s++ = (U8)((otherval >> 24) & 0xff);
	4638	*s++ = (U8)((otherval >> 16) & 0xff);
	4639	*s++ = (U8)((otherval >> 8) & 0xff);
	4640	*s++ = (U8)( otherval & 0xff);
	4641	}
	4642	}
	4643	}
	4644	sv_free(other); /* through with it! */
	4645	} /* while */
	4646	return swatch;
	4647	}
	4648
	4649	HV*
	4650	Perl__swash_inversion_hash(pTHX_ SV* const swash)
	4651	{
	4652
	4653	/* Subject to change or removal. For use only in regcomp.c and regexec.c
	4654	* Can't be used on a property that is subject to user override, as it
	4655	* relies on the value of SPECIALS in the swash which would be set by
	4656	* utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
	4657	* for overridden properties
	4658	*
	4659	* Returns a hash which is the inversion and closure of a swash mapping.
	4660	* For example, consider the input lines:
	4661	* 004B 006B
	4662	* 004C 006C
	4663	* 212A 006B
	4664	*
	4665	* The returned hash would have two keys, the UTF-8 for 006B and the UTF-8 for
	4666	* 006C. The value for each key is an array. For 006C, the array would
	4667	* have two elements, the UTF-8 for itself, and for 004C. For 006B, there
	4668	* would be three elements in its array, the UTF-8 for 006B, 004B and 212A.
	4669	*
	4670	* Note that there are no elements in the hash for 004B, 004C, 212A. The
	4671	* keys are only code points that are folded-to, so it isn't a full closure.
	4672	*
	4673	* Essentially, for any code point, it gives all the code points that map to
	4674	* it, or the list of 'froms' for that point.
	4675	*
	4676	* Currently it ignores any additions or deletions from other swashes,
	4677	* looking at just the main body of the swash, and if there are SPECIALS
	4678	* in the swash, at that hash
	4679	*
	4680	* The specials hash can be extra code points, and most likely consists of
	4681	* maps from single code points to multiple ones (each expressed as a string
	4682	* of UTF-8 characters). This function currently returns only 1-1 mappings.
	4683	* However consider this possible input in the specials hash:
	4684	* "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074
	4685	* "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074
	4686	*
	4687	* Both FB05 and FB06 map to the same multi-char sequence, which we don't
	4688	* currently handle. But it also means that FB05 and FB06 are equivalent in
	4689	* a 1-1 mapping which we should handle, and this relationship may not be in
	4690	* the main table. Therefore this function examines all the multi-char
	4691	* sequences and adds the 1-1 mappings that come out of that.
	4692	*
	4693	* XXX This function was originally intended to be multipurpose, but its
	4694	* only use is quite likely to remain for constructing the inversion of
	4695	* the CaseFolding (//i) property. If it were more general purpose for
	4696	* regex patterns, it would have to do the FB05/FB06 game for simple folds,
	4697	* because certain folds are prohibited under /iaa and /il. As an example,
	4698	* in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both
	4699	* equivalent under /i. But under /iaa and /il, the folds to 'i' are
	4700	* prohibited, so we would not figure out that they fold to each other.
	4701	* Code could be written to automatically figure this out, similar to the
	4702	* code that does this for multi-character folds, but this is the only case
	4703	* where something like this is ever likely to happen, as all the single
	4704	* char folds to the 0-255 range are now quite settled. Instead there is a
	4705	* little special code that is compiled only for this Unicode version. This
	4706	* is smaller and didn't require much coding time to do. But this makes
	4707	* this routine strongly tied to being used just for CaseFolding. If ever
	4708	* it should be generalized, this would have to be fixed */
	4709
	4710	U8 l, lend;
	4711	STRLEN lcur;
	4712	HV *const hv = MUTABLE_HV(SvRV(swash));
	4713
	4714	/* The string containing the main body of the table. This will have its
	4715	* assertion fail if the swash has been converted to its inversion list */
	4716	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	4717
	4718	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	4719	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	4720	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	4721	/SV* const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
	4722	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	4723	const STRLEN bits = SvUV(*bitssvp);
	4724	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	4725	const UV none = SvUV(*nonesvp);
	4726	SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
	4727
	4728	HV* ret = newHV();
	4729
	4730	PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
	4731
	4732	/* Must have at least 8 bits to get the mappings */
	4733	if (bits != 8 && bits != 16 && bits != 32) {
	4734	Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"
	4735	UVuf, (UV)bits);
	4736	}
	4737
	4738	if (specials_p) { /* It might be "special" (sometimes, but not always, a
	4739	mapping to more than one character */
	4740
	4741	/* Construct an inverse mapping hash for the specials */
	4742	HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
	4743	HV * specials_inverse = newHV();
	4744	char char_from; / the lhs of the map */
	4745	I32 from_len; /* its byte length */
	4746	char char_to; / the rhs of the map */
	4747	I32 to_len; /* its byte length */
	4748	SV sv_to; / and in a sv */
	4749	AV* from_list; /* list of things that map to each 'to' */
	4750
	4751	hv_iterinit(specials_hv);
	4752
	4753	/* The keys are the characters (in UTF-8) that map to the corresponding
	4754	* UTF-8 string value. Iterate through the list creating the inverse
	4755	* list. */
	4756	while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
	4757	SV** listp;
	4758	if (! SvPOK(sv_to)) {
	4759	Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
	4760	"unexpectedly is not a string, flags=%lu",
	4761	(unsigned long)SvFLAGS(sv_to));
	4762	}
	4763	/DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %" UVXf ", First char of to is %" UVXf "\n", valid_utf8_to_uvchr((U8) char_from, 0), valid_utf8_to_uvchr((U8) SvPVX(sv_to), 0)));/
	4764
	4765	/* Each key in the inverse list is a mapped-to value, and the key's
	4766	* hash value is a list of the strings (each in UTF-8) that map to
	4767	* it. Those strings are all one character long */
	4768	if ((listp = hv_fetch(specials_inverse,
	4769	SvPVX(sv_to),
	4770	SvCUR(sv_to), 0)))
	4771	{
	4772	from_list = (AV) listp;
	4773	}
	4774	else { /* No entry yet for it: create one */
	4775	from_list = newAV();
	4776	if (! hv_store(specials_inverse,
	4777	SvPVX(sv_to),
	4778	SvCUR(sv_to),
	4779	(SV*) from_list, 0))
	4780	{
	4781	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	4782	}
	4783	}
	4784
	4785	/* Here have the list associated with this 'to' (perhaps newly
	4786	* created and empty). Just add to it. Note that we ASSUME that
	4787	* the input is guaranteed to not have duplications, so we don't
	4788	* check for that. Duplications just slow down execution time. */
	4789	av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
	4790	}
	4791
	4792	/* Here, 'specials_inverse' contains the inverse mapping. Go through
	4793	* it looking for cases like the FB05/FB06 examples above. There would
	4794	* be an entry in the hash like
	4795	* 'st' => [ FB05, FB06 ]
	4796	* In this example we will create two lists that get stored in the
	4797	* returned hash, 'ret':
	4798	* FB05 => [ FB05, FB06 ]
	4799	* FB06 => [ FB05, FB06 ]
	4800	*
	4801	* Note that there is nothing to do if the array only has one element.
	4802	* (In the normal 1-1 case handled below, we don't have to worry about
	4803	* two lists, as everything gets tied to the single list that is
	4804	* generated for the single character 'to'. But here, we are omitting
	4805	* that list, ('st' in the example), so must have multiple lists.) */
	4806	while ((from_list = (AV *) hv_iternextsv(specials_inverse,
	4807	&char_to, &to_len)))
	4808	{
	4809	if (av_tindex_skip_len_mg(from_list) > 0) {
	4810	SSize_t i;
	4811
	4812	/* We iterate over all combinations of i,j to place each code
	4813	* point on each list */
	4814	for (i = 0; i <= av_tindex_skip_len_mg(from_list); i++) {
	4815	SSize_t j;
	4816	AV* i_list = newAV();
	4817	SV** entryp = av_fetch(from_list, i, FALSE);
	4818	if (entryp == NULL) {
	4819	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly"
	4820	" failed");
	4821	}
	4822	if (hv_fetch(ret, SvPVX(entryp), SvCUR(entryp), FALSE)) {
	4823	Perl_croak(aTHX_ "panic: unexpected entry for %s",
	4824	SvPVX(*entryp));
	4825	}
	4826	if (! hv_store(ret, SvPVX(entryp), SvCUR(entryp),
	4827	(SV*) i_list, FALSE))
	4828	{
	4829	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	4830	}
	4831
	4832	/* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8) SvPVX(entryp), 0);*/
	4833	for (j = 0; j <= av_tindex_skip_len_mg(from_list); j++) {
	4834	entryp = av_fetch(from_list, j, FALSE);
	4835	if (entryp == NULL) {
	4836	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	4837	}
	4838
	4839	/* When i==j this adds itself to the list */
	4840	av_push(i_list, newSVuv(utf8_to_uvchr_buf(
	4841	(U8) SvPVX(entryp),
	4842	(U8) SvPVX(entryp) + SvCUR(*entryp),
	4843	0)));
	4844	/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8) SvPVX(entryp), 0), u));/
	4845	}
	4846	}
	4847	}
	4848	}
	4849	SvREFCNT_dec(specials_inverse); /* done with it */
	4850	} /* End of specials */
	4851
	4852	/* read $swash->{LIST} */
	4853
	4854	#if UNICODE_MAJOR_VERSION == 3 \
	4855	&& UNICODE_DOT_VERSION == 0 \
	4856	&& UNICODE_DOT_DOT_VERSION == 1
	4857
	4858	/* For this version only U+130 and U+131 are equivalent under qr//i. Add a
	4859	* rule so that things work under /iaa and /il */
	4860
	4861	SV * mod_listsv = sv_mortalcopy(*listsvp);
	4862	sv_catpv(mod_listsv, "130\t130\t131\n");
	4863	l = (U8*)SvPV(mod_listsv, lcur);
	4864
	4865	#else
	4866
	4867	l = (U8)SvPV(listsvp, lcur);
	4868
	4869	#endif
	4870
	4871	lend = l + lcur;
	4872
	4873	/* Go through each input line */
	4874	while (l < lend) {
	4875	UV min, max, val;
	4876	UV inverse;
	4877	l = swash_scan_list_line(l, lend, &min, &max, &val,
	4878	cBOOL(octets), typestr);
	4879	if (l > lend) {
	4880	break;
	4881	}
	4882
	4883	/* Each element in the range is to be inverted */
	4884	for (inverse = min; inverse <= max; inverse++) {
	4885	AV* list;
	4886	SV** listp;
	4887	IV i;
	4888	bool found_key = FALSE;
	4889	bool found_inverse = FALSE;
	4890
	4891	/* The key is the inverse mapping */
	4892	char key[UTF8_MAXBYTES+1];
	4893	char* key_end = (char ) uvchr_to_utf8((U8) key, val);
	4894	STRLEN key_len = key_end - key;
	4895
	4896	/* Get the list for the map */
	4897	if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
	4898	list = (AV) listp;
	4899	}
	4900	else { /* No entry yet for it: create one */
	4901	list = newAV();
	4902	if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
	4903	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	4904	}
	4905	}
	4906
	4907	/* Look through list to see if this inverse mapping already is
	4908	* listed, or if there is a mapping to itself already */
	4909	for (i = 0; i <= av_tindex_skip_len_mg(list); i++) {
	4910	SV** entryp = av_fetch(list, i, FALSE);
	4911	SV* entry;
	4912	UV uv;
	4913	if (entryp == NULL) {
	4914	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	4915	}
	4916	entry = *entryp;
	4917	uv = SvUV(entry);
	4918	/DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %" UVXf " contains %" UVXf "\n", val, uv));/
	4919	if (uv == val) {
	4920	found_key = TRUE;
	4921	}
	4922	if (uv == inverse) {
	4923	found_inverse = TRUE;
	4924	}
	4925
	4926	/* No need to continue searching if found everything we are
	4927	* looking for */
	4928	if (found_key && found_inverse) {
	4929	break;
	4930	}
	4931	}
	4932
	4933	/* Make sure there is a mapping to itself on the list */
	4934	if (! found_key) {
	4935	av_push(list, newSVuv(val));
	4936	/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, val, val));/
	4937	}
	4938
	4939
	4940	/* Simply add the value to the list */
	4941	if (! found_inverse) {
	4942	av_push(list, newSVuv(inverse));
	4943	/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, inverse, val));/
	4944	}
	4945
	4946	/* swatch_get() increments the value of val for each element in the
	4947	* range. That makes more compact tables possible. You can
	4948	* express the capitalization, for example, of all consecutive
	4949	* letters with a single line: 0061\t007A\t0041 This maps 0061 to
	4950	* 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
	4951	* and it's not documented; it appears to be used only in
	4952	* implementing tr//; I copied the semantics from swatch_get(), just
	4953	* in case */
	4954	if (!none \|\| val < none) {
	4955	++val;
	4956	}
	4957	}
	4958	}
	4959
	4960	return ret;
	4961	}
	4962
	4963	SV*
	4964	Perl__swash_to_invlist(pTHX_ SV* const swash)
	4965	{
	4966
	4967	/* Subject to change or removal. For use only in one place in regcomp.c.
	4968	* Ownership is given to one reference count in the returned SV* */
	4969
	4970	U8 l, lend;
	4971	char *loc;
	4972	STRLEN lcur;
	4973	HV *const hv = MUTABLE_HV(SvRV(swash));
	4974	UV elements = 0; /* Number of elements in the inversion list */
	4975	U8 empty[] = "";
	4976	SV** listsvp;
	4977	SV** typesvp;
	4978	SV** bitssvp;
	4979	SV** extssvp;
	4980	SV** invert_it_svp;
	4981
	4982	U8* typestr;
	4983	STRLEN bits;
	4984	STRLEN octets; /* if bits == 1, then octets == 0 */
	4985	U8 x, xend;
	4986	STRLEN xcur;
	4987
	4988	SV* invlist;
	4989
	4990	PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
	4991
	4992	/* If not a hash, it must be the swash's inversion list instead */
	4993	if (SvTYPE(hv) != SVt_PVHV) {
	4994	return SvREFCNT_inc_simple_NN((SV*) hv);
	4995	}
	4996
	4997	/* The string containing the main body of the table */
	4998	listsvp = hv_fetchs(hv, "LIST", FALSE);
	4999	typesvp = hv_fetchs(hv, "TYPE", FALSE);
	5000	bitssvp = hv_fetchs(hv, "BITS", FALSE);
	5001	extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	5002	invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	5003
	5004	typestr = (U8)SvPV_nolen(typesvp);
	5005	bits = SvUV(*bitssvp);
	5006	octets = bits >> 3; /* if bits == 1, then octets == 0 */
	5007
	5008	/* read $swash->{LIST} */
	5009	if (SvPOK(*listsvp)) {
	5010	l = (U8)SvPV(listsvp, lcur);
	5011	}
	5012	else {
	5013	/* LIST legitimately doesn't contain a string during compilation phases
	5014	* of Perl itself, before the Unicode tables are generated. In this
	5015	* case, just fake things up by creating an empty list */
	5016	l = empty;
	5017	lcur = 0;
	5018	}
	5019	loc = (char *) l;
	5020	lend = l + lcur;
	5021
	5022	if (l == 'V') { / Inversion list format */
	5023	const char after_atou = (char ) lend;
	5024	UV element0;
	5025	UV* other_elements_ptr;
	5026
	5027	/* The first number is a count of the rest */
	5028	l++;
	5029	if (!grok_atoUV((const char *)l, &elements, &after_atou)) {
	5030	Perl_croak(aTHX_ "panic: Expecting a valid count of elements"
	5031	" at start of inversion list");
	5032	}
	5033	if (elements == 0) {
	5034	invlist = _new_invlist(0);
	5035	}
	5036	else {
	5037	l = (U8 *) after_atou;
	5038
	5039	/* Get the 0th element, which is needed to setup the inversion list
	5040	* */
	5041	while (isSPACE(*l)) l++;
	5042	if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
	5043	Perl_croak(aTHX_ "panic: Expecting a valid 0th element for"
	5044	" inversion list");
	5045	}
	5046	l = (U8 *) after_atou;
	5047	invlist = _setup_canned_invlist(elements, element0,
	5048	&other_elements_ptr);
	5049	elements--;
	5050
	5051	/* Then just populate the rest of the input */
	5052	while (elements-- > 0) {
	5053	if (l > lend) {
	5054	Perl_croak(aTHX_ "panic: Expecting %" UVuf " more"
	5055	" elements than available", elements);
	5056	}
	5057	while (isSPACE(*l)) l++;
	5058	if (!grok_atoUV((const char *)l, other_elements_ptr++,
	5059	&after_atou))
	5060	{
	5061	Perl_croak(aTHX_ "panic: Expecting a valid element"
	5062	" in inversion list");
	5063	}
	5064	l = (U8 *) after_atou;
	5065	}
	5066	}
	5067	}
	5068	else {
	5069
	5070	/* Scan the input to count the number of lines to preallocate array
	5071	* size based on worst possible case, which is each line in the input
	5072	* creates 2 elements in the inversion list: 1) the beginning of a
	5073	* range in the list; 2) the beginning of a range not in the list. */
	5074	while ((loc = (strchr(loc, '\n'))) != NULL) {
	5075	elements += 2;
	5076	loc++;
	5077	}
	5078
	5079	/* If the ending is somehow corrupt and isn't a new line, add another
	5080	* element for the final range that isn't in the inversion list */
	5081	if (! (*lend == '\n'
	5082	\|\| (lend == '\0' && (lcur == 0 \|\| (lend - 1) == '\n'))))
	5083	{
	5084	elements++;
	5085	}
	5086
	5087	invlist = _new_invlist(elements);
	5088
	5089	/* Now go through the input again, adding each range to the list */
	5090	while (l < lend) {
	5091	UV start, end;
	5092	UV val; /* Not used by this function */
	5093
	5094	l = swash_scan_list_line(l, lend, &start, &end, &val,
	5095	cBOOL(octets), typestr);
	5096
	5097	if (l > lend) {
	5098	break;
	5099	}
	5100
	5101	invlist = _add_range_to_invlist(invlist, start, end);
	5102	}
	5103	}
	5104
	5105	/* Invert if the data says it should be */
	5106	if (invert_it_svp && SvUV(*invert_it_svp)) {
	5107	_invlist_invert(invlist);
	5108	}
	5109
	5110	/* This code is copied from swatch_get()
	5111	* read $swash->{EXTRAS} */
	5112	x = (U8)SvPV(extssvp, xcur);
	5113	xend = x + xcur;
	5114	while (x < xend) {
	5115	STRLEN namelen;
	5116	U8 *namestr;
	5117	SV** othersvp;
	5118	HV* otherhv;
	5119	STRLEN otherbits;
	5120	SV *otherbitssvp, other;
	5121	U8 *nl;
	5122
	5123	const U8 opc = *x++;
	5124	if (opc == '\n')
	5125	continue;
	5126
	5127	nl = (U8*)memchr(x, '\n', xend - x);
	5128
	5129	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	5130	if (nl) {
	5131	x = nl + 1; /* 1 is length of "\n" */
	5132	continue;
	5133	}
	5134	else {
	5135	x = xend; /* to EXTRAS' end at which \n is not found */
	5136	break;
	5137	}
	5138	}
	5139
	5140	namestr = x;
	5141	if (nl) {
	5142	namelen = nl - namestr;
	5143	x = nl + 1;
	5144	}
	5145	else {
	5146	namelen = xend - namestr;
	5147	x = xend;
	5148	}
	5149
	5150	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	5151	otherhv = MUTABLE_HV(SvRV(*othersvp));
	5152	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	5153	otherbits = (STRLEN)SvUV(*otherbitssvp);
	5154
	5155	if (bits != otherbits \|\| bits != 1) {
	5156	Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
	5157	"properties, bits=%" UVuf ", otherbits=%" UVuf,
	5158	(UV)bits, (UV)otherbits);
	5159	}
	5160
	5161	/* The "other" swatch must be destroyed after. */
	5162	other = _swash_to_invlist((SV )othersvp);
	5163
	5164	/* End of code copied from swatch_get() */
	5165	switch (opc) {
	5166	case '+':
	5167	_invlist_union(invlist, other, &invlist);
	5168	break;
	5169	case '!':
	5170	_invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
	5171	break;
	5172	case '-':
	5173	_invlist_subtract(invlist, other, &invlist);
	5174	break;
	5175	case '&':
	5176	_invlist_intersection(invlist, other, &invlist);
	5177	break;
	5178	default:
	5179	break;
	5180	}
	5181	sv_free(other); /* through with it! */
	5182	}
	5183
	5184	SvREADONLY_on(invlist);
	5185	return invlist;
	5186	}
	5187
	5188	SV*
	5189	Perl__get_swash_invlist(pTHX_ SV* const swash)
	5190	{
	5191	SV** ptr;
	5192
	5193	PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
	5194
	5195	if (! SvROK(swash)) {
	5196	return NULL;
	5197	}
	5198
	5199	/* If it really isn't a hash, it isn't really swash; must be an inversion
	5200	* list */
	5201	if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
	5202	return SvRV(swash);
	5203	}
	5204
	5205	ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
	5206	if (! ptr) {
	5207	return NULL;
	5208	}
	5209
	5210	return *ptr;
	5211	}
	5212
	5213	bool
	5214	Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
	5215	{
	5216	/* May change: warns if surrogates, non-character code points, or
	5217	* non-Unicode code points are in 's' which has length 'len' bytes.
	5218	* Returns TRUE if none found; FALSE otherwise. The only other validity
	5219	* check is to make sure that this won't exceed the string's length nor
	5220	* overflow */
	5221
	5222	const U8* const e = s + len;
	5223	bool ok = TRUE;
	5224
	5225	PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
	5226
	5227	while (s < e) {
	5228	if (UTF8SKIP(s) > len) {
	5229	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	5230	"%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
	5231	return FALSE;
	5232	}
	5233	if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
	5234	if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
	5235	if ( ckWARN_d(WARN_NON_UNICODE)
	5236	\|\| UNLIKELY(0 < does_utf8_overflow(s, s + len,
	5237	0 /* Don't consider overlongs */
	5238	)))
	5239	{
	5240	/* A side effect of this function will be to warn */
	5241	(void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
	5242	ok = FALSE;
	5243	}
	5244	}
	5245	else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
	5246	if (ckWARN_d(WARN_SURROGATE)) {
	5247	/* This has a different warning than the one the called
	5248	* function would output, so can't just call it, unlike we
	5249	* do for the non-chars and above-unicodes */
	5250	UV uv = utf8_to_uvchr_buf(s, e, NULL);
	5251	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	5252	"Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
	5253	uv);
	5254	ok = FALSE;
	5255	}
	5256	}
	5257	else if ( UNLIKELY(UTF8_IS_NONCHAR(s, e))
	5258	&& (ckWARN_d(WARN_NONCHAR)))
	5259	{
	5260	/* A side effect of this function will be to warn */
	5261	(void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
	5262	ok = FALSE;
	5263	}
	5264	}
	5265	s += UTF8SKIP(s);
	5266	}
	5267
	5268	return ok;
	5269	}
	5270
	5271	/*
	5272	=for apidoc pv_uni_display
	5273
	5274	Build to the scalar C<dsv> a displayable version of the string C<spv>,
	5275	length C<len>, the displayable version being at most C<pvlim> bytes long
	5276	(if longer, the rest is truncated and C<"..."> will be appended).
	5277
	5278	The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
	5279	C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
	5280	to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
	5281	(C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
	5282	C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
	5283	C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
	5284
	5285	The pointer to the PV of the C<dsv> is returned.
	5286
	5287	See also L</sv_uni_display>.
	5288
	5289	=cut */
	5290	char *
	5291	Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim,
	5292	UV flags)
	5293	{
	5294	int truncated = 0;
	5295	const char s, e;
	5296
	5297	PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
	5298
	5299	SvPVCLEAR(dsv);
	5300	SvUTF8_off(dsv);
	5301	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	5302	UV u;
	5303	/* This serves double duty as a flag and a character to print after
	5304	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	5305	*/
	5306	char ok = 0;
	5307
	5308	if (pvlim && SvCUR(dsv) >= pvlim) {
	5309	truncated++;
	5310	break;
	5311	}
	5312	u = utf8_to_uvchr_buf((U8)s, (U8)e, 0);
	5313	if (u < 256) {
	5314	const unsigned char c = (unsigned char)u & 0xFF;
	5315	if (flags & UNI_DISPLAY_BACKSLASH) {
	5316	switch (c) {
	5317	case '\n':
	5318	ok = 'n'; break;
	5319	case '\r':
	5320	ok = 'r'; break;
	5321	case '\t':
	5322	ok = 't'; break;
	5323	case '\f':
	5324	ok = 'f'; break;
	5325	case '\a':
	5326	ok = 'a'; break;
	5327	case '\\':
	5328	ok = '\\'; break;
	5329	default: break;
	5330	}
	5331	if (ok) {
	5332	const char string = ok;
	5333	sv_catpvs(dsv, "\\");
	5334	sv_catpvn(dsv, &string, 1);
	5335	}
	5336	}
	5337	/* isPRINT() is the locale-blind version. */
	5338	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	5339	const char string = c;
	5340	sv_catpvn(dsv, &string, 1);
	5341	ok = 1;
	5342	}
	5343	}
	5344	if (!ok)
	5345	Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u);
	5346	}
	5347	if (truncated)
	5348	sv_catpvs(dsv, "...");
	5349
	5350	return SvPVX(dsv);
	5351	}
	5352
	5353	/*
	5354	=for apidoc sv_uni_display
	5355
	5356	Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
	5357	the displayable version being at most C<pvlim> bytes long
	5358	(if longer, the rest is truncated and "..." will be appended).
	5359
	5360	The C<flags> argument is as in L</pv_uni_display>().
	5361
	5362	The pointer to the PV of the C<dsv> is returned.
	5363
	5364	=cut
	5365	*/
	5366	char *
	5367	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	5368	{
	5369	const char * const ptr =
	5370	isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
	5371
	5372	PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
	5373
	5374	return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
	5375	SvCUR(ssv), pvlim, flags);
	5376	}
	5377
	5378	/*
	5379	=for apidoc foldEQ_utf8
	5380
	5381	Returns true if the leading portions of the strings C<s1> and C<s2> (either or
	5382	both of which may be in UTF-8) are the same case-insensitively; false
	5383	otherwise. How far into the strings to compare is determined by other input
	5384	parameters.
	5385
	5386	If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
	5387	otherwise it is assumed to be in native 8-bit encoding. Correspondingly for
	5388	C<u2> with respect to C<s2>.
	5389
	5390	If the byte length C<l1> is non-zero, it says how far into C<s1> to check for
	5391	fold equality. In other words, C<s1>+C<l1> will be used as a goal to reach.
	5392	The scan will not be considered to be a match unless the goal is reached, and
	5393	scanning won't continue past that goal. Correspondingly for C<l2> with respect
	5394	to C<s2>.
	5395
	5396	If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that
	5397	pointer is considered an end pointer to the position 1 byte past the maximum
	5398	point in C<s1> beyond which scanning will not continue under any circumstances.
	5399	(This routine assumes that UTF-8 encoded input strings are not malformed;
	5400	malformed input can cause it to read past C<pe1>). This means that if both
	5401	C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match
	5402	will never be successful because it can never
	5403	get as far as its goal (and in fact is asserted against). Correspondingly for
	5404	C<pe2> with respect to C<s2>.
	5405
	5406	At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
	5407	C<l2> must be non-zero), and if both do, both have to be
	5408	reached for a successful match. Also, if the fold of a character is multiple
	5409	characters, all of them must be matched (see tr21 reference below for
	5410	'folding').
	5411
	5412	Upon a successful match, if C<pe1> is non-C<NULL>,
	5413	it will be set to point to the beginning of the I<next> character of C<s1>
	5414	beyond what was matched. Correspondingly for C<pe2> and C<s2>.
	5415
	5416	For case-insensitiveness, the "casefolding" of Unicode is used
	5417	instead of upper/lowercasing both the characters, see
	5418	L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
	5419
	5420	=cut */
	5421
	5422	/* A flags parameter has been added which may change, and hence isn't
	5423	* externally documented. Currently it is:
	5424	* 0 for as-documented above
	5425	* FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
	5426	ASCII one, to not match
	5427	* FOLDEQ_LOCALE is set iff the rules from the current underlying
	5428	* locale are to be used.
	5429	* FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
	5430	* routine. This allows that step to be skipped.
	5431	* Currently, this requires s1 to be encoded as UTF-8
	5432	* (u1 must be true), which is asserted for.
	5433	* FOLDEQ_S1_FOLDS_SANE With either NOMIX_ASCII or LOCALE, no folds may
	5434	* cross certain boundaries. Hence, the caller should
	5435	* let this function do the folding instead of
	5436	* pre-folding. This code contains an assertion to
	5437	* that effect. However, if the caller knows what
	5438	* it's doing, it can pass this flag to indicate that,
	5439	* and the assertion is skipped.
	5440	* FOLDEQ_S2_ALREADY_FOLDED Similarly.
	5441	* FOLDEQ_S2_FOLDS_SANE
	5442	*/
	5443	I32
	5444	Perl_foldEQ_utf8_flags(pTHX_ const char s1, char *pe1, UV l1, bool u1,
	5445	const char s2, char *pe2, UV l2, bool u2,
	5446	U32 flags)
	5447	{
	5448	const U8 p1 = (const U8)s1; /* Point to current char */
	5449	const U8 p2 = (const U8)s2;
	5450	const U8 g1 = NULL; / goal for s1 */
	5451	const U8 *g2 = NULL;
	5452	const U8 e1 = NULL; / Don't scan s1 past this */
	5453	U8 f1 = NULL; / Point to current folded */
	5454	const U8 *e2 = NULL;
	5455	U8 *f2 = NULL;
	5456	STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
	5457	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	5458	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	5459	U8 flags_for_folder = FOLD_FLAGS_FULL;
	5460
	5461	PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
	5462
	5463	assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII \| FOLDEQ_LOCALE))
	5464	&& (((flags & FOLDEQ_S1_ALREADY_FOLDED)
	5465	&& !(flags & FOLDEQ_S1_FOLDS_SANE))
	5466	\|\| ((flags & FOLDEQ_S2_ALREADY_FOLDED)
	5467	&& !(flags & FOLDEQ_S2_FOLDS_SANE)))));
	5468	/* The algorithm is to trial the folds without regard to the flags on
	5469	* the first line of the above assert(), and then see if the result
	5470	* violates them. This means that the inputs can't be pre-folded to a
	5471	* violating result, hence the assert. This could be changed, with the
	5472	* addition of extra tests here for the already-folded case, which would
	5473	* slow it down. That cost is more than any possible gain for when these
	5474	* flags are specified, as the flags indicate /il or /iaa matching which
	5475	* is less common than /iu, and I (khw) also believe that real-world /il
	5476	* and /iaa matches are most likely to involve code points 0-255, and this
	5477	* function only under rare conditions gets called for 0-255. */
	5478
	5479	if (flags & FOLDEQ_LOCALE) {
	5480	if (IN_UTF8_CTYPE_LOCALE) {
	5481	flags &= ~FOLDEQ_LOCALE;
	5482	}
	5483	else {
	5484	flags_for_folder \|= FOLD_FLAGS_LOCALE;
	5485	}
	5486	}
	5487
	5488	if (pe1) {
	5489	e1 = (U8*)pe1;
	5490	}
	5491
	5492	if (l1) {
	5493	g1 = (const U8*)s1 + l1;
	5494	}
	5495
	5496	if (pe2) {
	5497	e2 = (U8*)pe2;
	5498	}
	5499
	5500	if (l2) {
	5501	g2 = (const U8*)s2 + l2;
	5502	}
	5503
	5504	/* Must have at least one goal */
	5505	assert(g1 \|\| g2);
	5506
	5507	if (g1) {
	5508
	5509	/* Will never match if goal is out-of-bounds */
	5510	assert(! e1 \|\| e1 >= g1);
	5511
	5512	/* Here, there isn't an end pointer, or it is beyond the goal. We
	5513	* only go as far as the goal */
	5514	e1 = g1;
	5515	}
	5516	else {
	5517	assert(e1); /* Must have an end for looking at s1 */
	5518	}
	5519
	5520	/* Same for goal for s2 */
	5521	if (g2) {
	5522	assert(! e2 \|\| e2 >= g2);
	5523	e2 = g2;
	5524	}
	5525	else {
	5526	assert(e2);
	5527	}
	5528
	5529	/* If both operands are already folded, we could just do a memEQ on the
	5530	* whole strings at once, but it would be better if the caller realized
	5531	* this and didn't even call us */
	5532
	5533	/* Look through both strings, a character at a time */
	5534	while (p1 < e1 && p2 < e2) {
	5535
	5536	/* If at the beginning of a new character in s1, get its fold to use
	5537	* and the length of the fold. */
	5538	if (n1 == 0) {
	5539	if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
	5540	f1 = (U8 *) p1;
	5541	assert(u1);
	5542	n1 = UTF8SKIP(f1);
	5543	}
	5544	else {
	5545	if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
	5546
	5547	/* We have to forbid mixing ASCII with non-ASCII if the
	5548	* flags so indicate. And, we can short circuit having to
	5549	* call the general functions for this common ASCII case,
	5550	* all of whose non-locale folds are also ASCII, and hence
	5551	* UTF-8 invariants, so the UTF8ness of the strings is not
	5552	* relevant. */
	5553	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
	5554	return 0;
	5555	}
	5556	n1 = 1;
	5557	foldbuf1 = toFOLD(p1);
	5558	}
	5559	else if (u1) {
	5560	_toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
	5561	}
	5562	else { /* Not UTF-8, get UTF-8 fold */
	5563	_to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
	5564	}
	5565	f1 = foldbuf1;
	5566	}
	5567	}
	5568
	5569	if (n2 == 0) { /* Same for s2 */
	5570	if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
	5571	f2 = (U8 *) p2;
	5572	assert(u2);
	5573	n2 = UTF8SKIP(f2);
	5574	}
	5575	else {
	5576	if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
	5577	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
	5578	return 0;
	5579	}
	5580	n2 = 1;
	5581	foldbuf2 = toFOLD(p2);
	5582	}
	5583	else if (u2) {
	5584	_toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
	5585	}
	5586	else {
	5587	_to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
	5588	}
	5589	f2 = foldbuf2;
	5590	}
	5591	}
	5592
	5593	/* Here f1 and f2 point to the beginning of the strings to compare.
	5594	* These strings are the folds of the next character from each input
	5595	* string, stored in UTF-8. */
	5596
	5597	/* While there is more to look for in both folds, see if they
	5598	* continue to match */
	5599	while (n1 && n2) {
	5600	U8 fold_length = UTF8SKIP(f1);
	5601	if (fold_length != UTF8SKIP(f2)
	5602	\|\| (fold_length == 1 && f1 != f2) /* Short circuit memNE
	5603	function call for single
	5604	byte */
	5605	\|\| memNE((char)f1, (char)f2, fold_length))
	5606	{
	5607	return 0; /* mismatch */
	5608	}
	5609
	5610	/* Here, they matched, advance past them */
	5611	n1 -= fold_length;
	5612	f1 += fold_length;
	5613	n2 -= fold_length;
	5614	f2 += fold_length;
	5615	}
	5616
	5617	/* When reach the end of any fold, advance the input past it */
	5618	if (n1 == 0) {
	5619	p1 += u1 ? UTF8SKIP(p1) : 1;
	5620	}
	5621	if (n2 == 0) {
	5622	p2 += u2 ? UTF8SKIP(p2) : 1;
	5623	}
	5624	} /* End of loop through both strings */
	5625
	5626	/* A match is defined by each scan that specified an explicit length
	5627	* reaching its final goal, and the other not having matched a partial
	5628	* character (which can happen when the fold of a character is more than one
	5629	* character). */
	5630	if (! ((g1 == 0 \|\| p1 == g1) && (g2 == 0 \|\| p2 == g2)) \|\| n1 \|\| n2) {
	5631	return 0;
	5632	}
	5633
	5634	/* Successful match. Set output pointers */
	5635	if (pe1) {
	5636	pe1 = (char)p1;
	5637	}
	5638	if (pe2) {
	5639	pe2 = (char)p2;
	5640	}
	5641	return 1;
	5642	}
	5643
	5644	/* XXX The next two functions should likely be moved to mathoms.c once all
	5645	* occurrences of them are removed from the core; some cpan-upstream modules
	5646	* still use them */
	5647
	5648	U8 *
	5649	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
	5650	{
	5651	PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
	5652
	5653	return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0);
	5654	}
	5655
	5656	/*
	5657	=for apidoc utf8n_to_uvuni
	5658
	5659	Instead use L</utf8_to_uvchr_buf>, or rarely, L</utf8n_to_uvchr>.
	5660
	5661	This function was useful for code that wanted to handle both EBCDIC and
	5662	ASCII platforms with Unicode properties, but starting in Perl v5.20, the
	5663	distinctions between the platforms have mostly been made invisible to most
	5664	code, so this function is quite unlikely to be what you want. If you do need
	5665	this precise functionality, use instead
	5666	C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))\|/utf8_to_uvchr_buf>>
	5667	or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))\|/utf8n_to_uvchr>>.
	5668
	5669	=cut
	5670	*/
	5671
	5672	UV
	5673	Perl_utf8n_to_uvuni(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	5674	{
	5675	PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
	5676
	5677	return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
	5678	}
	5679
	5680	/*
	5681	=for apidoc uvuni_to_utf8_flags
	5682
	5683	Instead you almost certainly want to use L</uvchr_to_utf8> or
	5684	L</uvchr_to_utf8_flags>.
	5685
	5686	This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
	5687	which itself, while not deprecated, should be used only in isolated
	5688	circumstances. These functions were useful for code that wanted to handle
	5689	both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
	5690	v5.20, the distinctions between the platforms have mostly been made invisible
	5691	to most code, so this function is quite unlikely to be what you want.
	5692
	5693	=cut
	5694	*/
	5695
	5696	U8 *
	5697	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	5698	{
	5699	PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
	5700
	5701	return uvoffuni_to_utf8_flags(d, uv, flags);
	5702	}
	5703
	5704	/*
	5705	* ex: set ts=8 sts=4 sw=4 et:
	5706	*/