perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
	17	*
	18	* 'Well do I understand your speech,' he answered in the same language;
	19	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	20	* as is the custom in the West, if you wish to be answered?'
	21	* --Gandalf, addressing Théoden's door wardens
	22	*
	23	* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	24	*
	25	* ...the travellers perceived that the floor was paved with stones of many
	26	* hues; branching runes and strange devices intertwined beneath their feet.
	27	*
	28	* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	29	*/
	30
	31	#include "EXTERN.h"
	32	#define PERL_IN_UTF8_C
	33	#include "perl.h"
	34	#include "invlist_inline.h"
	35
	36	static const char malformed_text[] = "Malformed UTF-8 character";
	37	static const char unees[] =
	38	"Malformed UTF-8 character (unexpected end of string)";
	39	static const char cp_above_legal_max[] =
	40	"Use of code point 0x%" UVXf " is not allowed; the"
	41	" permissible max is 0x%" UVXf;
	42
	43	#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
	44
	45	/*
	46	=head1 Unicode Support
	47	These are various utility functions for manipulating UTF8-encoded
	48	strings. For the uninitiated, this is a method of representing arbitrary
	49	Unicode characters as a variable number of bytes, in such a way that
	50	characters in the ASCII range are unmodified, and a zero byte never appears
	51	within non-zero characters.
	52
	53	=cut
	54	*/
	55
	56	void
	57	Perl__force_out_malformed_utf8_message(pTHX_
	58	const U8 const p, / First byte in UTF-8 sequence */
	59	const U8 * const e, /* Final byte in sequence (may include
	60	multiple chars */
	61	const U32 flags, /* Flags to pass to utf8n_to_uvchr(),
	62	usually 0, or some DISALLOW flags */
	63	const bool die_here) /* If TRUE, this function does not return */
	64	{
	65	/* This core-only function is to be called when a malformed UTF-8 character
	66	* is found, in order to output the detailed information about the
	67	* malformation before dieing. The reason it exists is for the occasions
	68	* when such a malformation is fatal, but warnings might be turned off, so
	69	* that normally they would not be actually output. This ensures that they
	70	* do get output. Because a sequence may be malformed in more than one
	71	* way, multiple messages may be generated, so we can't make them fatal, as
	72	* that would cause the first one to die.
	73	*
	74	* Instead we pretend -W was passed to perl, then die afterwards. The
	75	* flexibility is here to return to the caller so they can finish up and
	76	* die themselves */
	77	U32 errors;
	78
	79	PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
	80
	81	ENTER;
	82	SAVEI8(PL_dowarn);
	83	SAVESPTR(PL_curcop);
	84
	85	PL_dowarn = G_WARN_ALL_ON\|G_WARN_ON;
	86	if (PL_curcop) {
	87	PL_curcop->cop_warnings = pWARN_ALL;
	88	}
	89
	90	(void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
	91
	92	LEAVE;
	93
	94	if (! errors) {
	95	Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
	96	" be called only when there are errors found");
	97	}
	98
	99	if (die_here) {
	100	Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
	101	}
	102	}
	103
	104	/*
	105	=for apidoc uvoffuni_to_utf8_flags
	106
	107	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	108	Instead, B<Almost all code should use L</uvchr_to_utf8> or
	109	L</uvchr_to_utf8_flags>>.
	110
	111	This function is like them, but the input is a strict Unicode
	112	(as opposed to native) code point. Only in very rare circumstances should code
	113	not be using the native code point.
	114
	115	For details, see the description for L</uvchr_to_utf8_flags>.
	116
	117	=cut
	118	*/
	119
	120	/* All these formats take a single UV code point argument */
	121	const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
	122	const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
	123	" is not recommended for open interchange";
	124	const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
	125	" may not be portable";
	126	const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \
	127	" Unicode, requires a Perl extension," \
	128	" and so is not portable";
	129
	130	#define HANDLE_UNICODE_SURROGATE(uv, flags) \
	131	STMT_START { \
	132	if (flags & UNICODE_WARN_SURROGATE) { \
	133	Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
	134	surrogate_cp_format, uv); \
	135	} \
	136	if (flags & UNICODE_DISALLOW_SURROGATE) { \
	137	return NULL; \
	138	} \
	139	} STMT_END;
	140
	141	#define HANDLE_UNICODE_NONCHAR(uv, flags) \
	142	STMT_START { \
	143	if (flags & UNICODE_WARN_NONCHAR) { \
	144	Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
	145	nonchar_cp_format, uv); \
	146	} \
	147	if (flags & UNICODE_DISALLOW_NONCHAR) { \
	148	return NULL; \
	149	} \
	150	} STMT_END;
	151
	152	/* Use shorter names internally in this file */
	153	#define SHIFT UTF_ACCUMULATION_SHIFT
	154	#undef MARK
	155	#define MARK UTF_CONTINUATION_MARK
	156	#define MASK UTF_CONTINUATION_MASK
	157
	158	U8 *
	159	Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
	160	{
	161	PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
	162
	163	if (OFFUNI_IS_INVARIANT(uv)) {
	164	*d++ = LATIN1_TO_NATIVE(uv);
	165	return d;
	166	}
	167
	168	if (uv <= MAX_UTF8_TWO_BYTE) {
	169	*d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) \| UTF_START_MARK(2));
	170	*d++ = I8_TO_NATIVE_UTF8(( uv & MASK) \| MARK);
	171	return d;
	172	}
	173
	174	/* Not 2-byte; test for and handle 3-byte result. In the test immediately
	175	* below, the 16 is for start bytes E0-EF (which are all the possible ones
	176	* for 3 byte characters). The 2 is for 2 continuation bytes; these each
	177	* contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
	178	* on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
	179	* 0x800-0xFFFF on ASCII */
	180	if (uv < (16 * (1U << (2 * SHIFT)))) {
	181	d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) SHIFT)) \| UTF_START_MARK(3));
	182	d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) SHIFT)) & MASK) \| MARK);
	183	d++ = I8_TO_NATIVE_UTF8(( uv / (1 - 1) */ & MASK) \| MARK);
	184
	185	#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
	186	aren't tested here */
	187	/* The most likely code points in this range are below the surrogates.
	188	* Do an extra test to quickly exclude those. */
	189	if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
	190	if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
	191	\|\| UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
	192	{
	193	HANDLE_UNICODE_NONCHAR(uv, flags);
	194	}
	195	else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
	196	HANDLE_UNICODE_SURROGATE(uv, flags);
	197	}
	198	}
	199	#endif
	200	return d;
	201	}
	202
	203	/* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
	204	* platforms, and 0x4000 on EBCDIC. There are problematic cases that can
	205	* happen starting with 4-byte characters on ASCII platforms. We unify the
	206	* code for these with EBCDIC, even though some of them require 5-bytes on
	207	* those, because khw believes the code saving is worth the very slight
	208	* performance hit on these high EBCDIC code points. */
	209
	210	if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
	211	if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
	212	Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
	213	}
	214	if ( (flags & UNICODE_WARN_SUPER)
	215	\|\| ( (flags & UNICODE_WARN_PERL_EXTENDED)
	216	&& UNICODE_IS_PERL_EXTENDED(uv)))
	217	{
	218	Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
	219
	220	/* Choose the more dire applicable warning */
	221	(UNICODE_IS_PERL_EXTENDED(uv))
	222	? perl_extended_cp_format
	223	: super_cp_format,
	224	uv);
	225	}
	226	if ( (flags & UNICODE_DISALLOW_SUPER)
	227	\|\| ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
	228	&& UNICODE_IS_PERL_EXTENDED(uv)))
	229	{
	230	return NULL;
	231	}
	232	}
	233	else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
	234	HANDLE_UNICODE_NONCHAR(uv, flags);
	235	}
	236
	237	/* Test for and handle 4-byte result. In the test immediately below, the
	238	* 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
	239	* characters). The 3 is for 3 continuation bytes; these each contribute
	240	* SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
	241	* ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
	242	* 0x1_0000-0x1F_FFFF on ASCII */
	243	if (uv < (8 * (1U << (3 * SHIFT)))) {
	244	d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) SHIFT)) \| UTF_START_MARK(4));
	245	d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) SHIFT)) & MASK) \| MARK);
	246	d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) SHIFT)) & MASK) \| MARK);
	247	d++ = I8_TO_NATIVE_UTF8(( uv / (1 - 1) */ & MASK) \| MARK);
	248
	249	#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
	250	characters. The end-plane non-characters for EBCDIC were
	251	handled just above */
	252	if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
	253	HANDLE_UNICODE_NONCHAR(uv, flags);
	254	}
	255	else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
	256	HANDLE_UNICODE_SURROGATE(uv, flags);
	257	}
	258	#endif
	259
	260	return d;
	261	}
	262
	263	/* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
	264	* platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
	265	* format. The unrolled version above turns out to not save all that much
	266	* time, and at these high code points (well above the legal Unicode range
	267	* on ASCII platforms, and well above anything in common use in EBCDIC),
	268	* khw believes that less code outweighs slight performance gains. */
	269
	270	{
	271	STRLEN len = OFFUNISKIP(uv);
	272	U8 *p = d+len-1;
	273	while (p > d) {
	274	*p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	275	uv >>= UTF_ACCUMULATION_SHIFT;
	276	}
	277	*p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	278	return d+len;
	279	}
	280	}
	281
	282	/*
	283	=for apidoc uvchr_to_utf8
	284
	285	Adds the UTF-8 representation of the native code point C<uv> to the end
	286	of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
	287	C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
	288	the byte after the end of the new character. In other words,
	289
	290	d = uvchr_to_utf8(d, uv);
	291
	292	is the recommended wide native character-aware way of saying
	293
	294	*(d++) = uv;
	295
	296	This function accepts any code point from 0..C<IV_MAX> as input.
	297	C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
	298
	299	It is possible to forbid or warn on non-Unicode code points, or those that may
	300	be problematic by using L</uvchr_to_utf8_flags>.
	301
	302	=cut
	303	*/
	304
	305	/* This is also a macro */
	306	PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
	307
	308	U8 *
	309	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	310	{
	311	return uvchr_to_utf8(d, uv);
	312	}
	313
	314	/*
	315	=for apidoc uvchr_to_utf8_flags
	316
	317	Adds the UTF-8 representation of the native code point C<uv> to the end
	318	of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
	319	C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
	320	the byte after the end of the new character. In other words,
	321
	322	d = uvchr_to_utf8_flags(d, uv, flags);
	323
	324	or, in most cases,
	325
	326	d = uvchr_to_utf8_flags(d, uv, 0);
	327
	328	This is the Unicode-aware way of saying
	329
	330	*(d++) = uv;
	331
	332	If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
	333	input. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
	334
	335	Specifying C<flags> can further restrict what is allowed and not warned on, as
	336	follows:
	337
	338	If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
	339	the function will raise a warning, provided UTF8 warnings are enabled. If
	340	instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
	341	NULL. If both flags are set, the function will both warn and return NULL.
	342
	343	Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
	344	affect how the function handles a Unicode non-character.
	345
	346	And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
	347	affect the handling of code points that are above the Unicode maximum of
	348	0x10FFFF. Languages other than Perl may not be able to accept files that
	349	contain these.
	350
	351	The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
	352	the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
	353	three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
	354	allowed inputs to the strict UTF-8 traditionally defined by Unicode.
	355	Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
	356	C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
	357	above-Unicode and surrogate flags, but not the non-character ones, as
	358	defined in
	359	L<Unicode Corrigendum #9\|http://www.unicode.org/versions/corrigendum9.html>.
	360	See L<perlunicode/Noncharacter code points>.
	361
	362	Extremely high code points were never specified in any standard, and require an
	363	extension to UTF-8 to express, which Perl does. It is likely that programs
	364	written in something other than Perl would not be able to read files that
	365	contain these; nor would Perl understand files written by something that uses a
	366	different extension. For these reasons, there is a separate set of flags that
	367	can warn and/or disallow these extremely high code points, even if other
	368	above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
	369	and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
	370	L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
	371	treat all above-Unicode code points, including these, as malformations. (Note
	372	that the Unicode standard considers anything above 0x10FFFF to be illegal, but
	373	there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
	374
	375	A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
	376	retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly,
	377	C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
	378	C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because these
	379	flags can apply to code points that actually do fit in 31 bits. This happens
	380	on EBCDIC platforms, and sometimes when the L<overlong
	381	malformation\|/C<UTF8_GOT_LONG>> is also present. The new names accurately
	382	describe the situation in all cases.
	383
	384	=cut
	385	*/
	386
	387	/* This is also a macro */
	388	PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
	389
	390	U8 *
	391	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	392	{
	393	return uvchr_to_utf8_flags(d, uv, flags);
	394	}
	395
	396	#ifndef UV_IS_QUAD
	397
	398	STATIC int
	399	S_is_utf8_cp_above_31_bits(const U8 * const s,
	400	const U8 * const e,
	401	const bool consider_overlongs)
	402	{
	403	/* Returns TRUE if the first code point represented by the Perl-extended-
	404	* UTF-8-encoded string starting at 's', and looking no further than 'e -
	405	* 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
	406	*
	407	* The function handles the case where the input bytes do not include all
	408	* the ones necessary to represent a full character. That is, they may be
	409	* the intial bytes of the representation of a code point, but possibly
	410	* the final ones necessary for the complete representation may be beyond
	411	* 'e - 1'.
	412	*
	413	* The function also can handle the case where the input is an overlong
	414	* sequence. If 'consider_overlongs' is 0, the function assumes the
	415	* input is not overlong, without checking, and will return based on that
	416	* assumption. If this parameter is 1, the function will go to the trouble
	417	* of figuring out if it actually evaluates to above or below 31 bits.
	418	*
	419	* The sequence is otherwise assumed to be well-formed, without checking.
	420	*/
	421
	422	const STRLEN len = e - s;
	423	int is_overlong;
	424
	425	PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
	426
	427	assert(! UTF8_IS_INVARIANT(*s) && e > s);
	428
	429	#ifdef EBCDIC
	430
	431	PERL_UNUSED_ARG(consider_overlongs);
	432
	433	/* On the EBCDIC code pages we handle, only the native start byte 0xFE can
	434	* mean a 32-bit or larger code point (0xFF is an invariant). 0xFE can
	435	* also be the start byte for a 31-bit code point; we need at least 2
	436	* bytes, and maybe up through 8 bytes, to determine that. (It can also be
	437	* the start byte for an overlong sequence, but for 30-bit or smaller code
	438	* points, so we don't have to worry about overlongs on EBCDIC.) */
	439	if (*s != 0xFE) {
	440	return 0;
	441	}
	442
	443	if (len == 1) {
	444	return -1;
	445	}
	446
	447	#else
	448
	449	/* On ASCII, FE and FF are the only start bytes that can evaluate to
	450	* needing more than 31 bits. */
	451	if (LIKELY(*s < 0xFE)) {
	452	return 0;
	453	}
	454
	455	/* What we have left are FE and FF. Both of these require more than 31
	456	* bits unless they are for overlongs. */
	457	if (! consider_overlongs) {
	458	return 1;
	459	}
	460
	461	/* Here, we have FE or FF. If the input isn't overlong, it evaluates to
	462	* above 31 bits. But we need more than one byte to discern this, so if
	463	* passed just the start byte, it could be an overlong evaluating to
	464	* smaller */
	465	if (len == 1) {
	466	return -1;
	467	}
	468
	469	/* Having excluded len==1, and knowing that FE and FF are both valid start
	470	* bytes, we can call the function below to see if the sequence is
	471	* overlong. (We don't need the full generality of the called function,
	472	* but for these huge code points, speed shouldn't be a consideration, and
	473	* the compiler does have enough information, since it's static to this
	474	* file, to optimize to just the needed parts.) */
	475	is_overlong = is_utf8_overlong_given_start_byte_ok(s, len);
	476
	477	/* If it isn't overlong, more than 31 bits are required. */
	478	if (is_overlong == 0) {
	479	return 1;
	480	}
	481
	482	/* If it is indeterminate if it is overlong, return that */
	483	if (is_overlong < 0) {
	484	return -1;
	485	}
	486
	487	/* Here is overlong. Such a sequence starting with FE is below 31 bits, as
	488	* the max it can be is 2*31 - 1 /
	489	if (*s == 0xFE) {
	490	return 0;
	491	}
	492
	493	#endif
	494
	495	/* Here, ASCII and EBCDIC rejoin:
	496	* On ASCII: We have an overlong sequence starting with FF
	497	* On EBCDIC: We have a sequence starting with FE. */
	498
	499	{ /* For C89, use a block so the declaration can be close to its use */
	500
	501	#ifdef EBCDIC
	502
	503	/* U+7FFFFFFF (2 ** 31 - 1)
	504	* [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
	505	* IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
	506	* IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
	507	* POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
	508	* I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
	509	* U+80000000 (2 ** 31):
	510	* IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
	511	* IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
	512	* POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
	513	* I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
	514	*
	515	* and since we know that *s = \xfe, any continuation sequcence
	516	* following it that is gt the below is above 31 bits
	517	[0] [1] [2] [3] [4] [5] [6] */
	518	const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
	519
	520	#else
	521
	522	/* FF overlong for U+7FFFFFFF (2 ** 31 - 1)
	523	* ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF
	524	* FF overlong for U+80000000 (2 ** 31):
	525	* ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80
	526	* and since we know that *s = \xff, any continuation sequcence
	527	* following it that is gt the below is above 30 bits
	528	[0] [1] [2] [3] [4] [5] [6] */
	529	const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
	530
	531
	532	#endif
	533	const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
	534	const STRLEN cmp_len = MIN(conts_len, len - 1);
	535
	536	/* Now compare the continuation bytes in s with the ones we have
	537	* compiled in that are for the largest 30 bit code point. If we have
	538	* enough bytes available to determine the answer, or the bytes we do
	539	* have differ from them, we can compare the two to get a definitive
	540	* answer (Note that in UTF-EBCDIC, the two lowest possible
	541	* continuation bytes are \x41 and \x42.) */
	542	if (cmp_len >= conts_len \|\| memNE(s + 1,
	543	conts_for_highest_30_bit,
	544	cmp_len))
	545	{
	546	return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len));
	547	}
	548
	549	/* Here, all the bytes we have are the same as the highest 30-bit code
	550	* point, but we are missing so many bytes that we can't make the
	551	* determination */
	552	return -1;
	553	}
	554	}
	555
	556	#endif
	557
	558	PERL_STATIC_INLINE int
	559	S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
	560	{
	561	/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
	562	* 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
	563	* it isn't, and -1 if there isn't enough information to tell. This last
	564	* return value can happen if the sequence is incomplete, missing some
	565	* trailing bytes that would form a complete character. If there are
	566	* enough bytes to make a definitive decision, this function does so.
	567	* Usually 2 bytes sufficient.
	568	*
	569	* Overlongs can occur whenever the number of continuation bytes changes.
	570	* That means whenever the number of leading 1 bits in a start byte
	571	* increases from the next lower start byte. That happens for start bytes
	572	* C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following illegal
	573	* start bytes have already been excluded, so don't need to be tested here;
	574	* ASCII platforms: C0, C1
	575	* EBCDIC platforms C0, C1, C2, C3, C4, E0
	576	*/
	577
	578	const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
	579	const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
	580
	581	PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
	582	assert(len > 1 && UTF8_IS_START(*s));
	583
	584	/* Each platform has overlongs after the start bytes given above (expressed
	585	* in I8 for EBCDIC). What constitutes an overlong varies by platform, but
	586	* the logic is the same, except the E0 overlong has already been excluded
	587	* on EBCDIC platforms. The values below were found by manually
	588	* inspecting the UTF-8 patterns. See the tables in utf8.h and
	589	* utfebcdic.h. */
	590
	591	# ifdef EBCDIC
	592	# define F0_ABOVE_OVERLONG 0xB0
	593	# define F8_ABOVE_OVERLONG 0xA8
	594	# define FC_ABOVE_OVERLONG 0xA4
	595	# define FE_ABOVE_OVERLONG 0xA2
	596	# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
	597	/* I8(0xfe) is FF */
	598	# else
	599
	600	if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
	601	return 1;
	602	}
	603
	604	# define F0_ABOVE_OVERLONG 0x90
	605	# define F8_ABOVE_OVERLONG 0x88
	606	# define FC_ABOVE_OVERLONG 0x84
	607	# define FE_ABOVE_OVERLONG 0x82
	608	# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
	609	# endif
	610
	611
	612	if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
	613	\|\| (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
	614	\|\| (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
	615	\|\| (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
	616	{
	617	return 1;
	618	}
	619
	620	/* Check for the FF overlong */
	621	return isFF_OVERLONG(s, len);
	622	}
	623
	624	PERL_STATIC_INLINE int
	625	S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
	626	{
	627	/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
	628	* 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if
	629	* it isn't, and -1 if there isn't enough information to tell. This last
	630	* return value can happen if the sequence is incomplete, missing some
	631	* trailing bytes that would form a complete character. If there are
	632	* enough bytes to make a definitive decision, this function does so. */
	633
	634	PERL_ARGS_ASSERT_ISFF_OVERLONG;
	635
	636	/* To be an FF overlong, all the available bytes must match */
	637	if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
	638	MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
	639	{
	640	return 0;
	641	}
	642
	643	/* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
	644	* be there; what comes after them doesn't matter. See tables in utf8.h,
	645	* utfebcdic.h. */
	646	if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
	647	return 1;
	648	}
	649
	650	/* The missing bytes could cause the result to go one way or the other, so
	651	* the result is indeterminate */
	652	return -1;
	653	}
	654
	655	#if defined(UV_IS_QUAD) /* These assume IV_MAX is 2*63-1 /
	656	# ifdef EBCDIC /* Actually is I8 */
	657	# define HIGHEST_REPRESENTABLE_UTF8 \
	658	"\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	659	# else
	660	# define HIGHEST_REPRESENTABLE_UTF8 \
	661	"\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	662	# endif
	663	#endif
	664
	665	PERL_STATIC_INLINE int
	666	S_does_utf8_overflow(const U8 * const s,
	667	const U8 * e,
	668	const bool consider_overlongs)
	669	{
	670	/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
	671	* 'e' - 1 would overflow an IV on this platform; that is if it represents
	672	* a code point larger than the highest representable code point. It
	673	* returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
	674	* enough information to tell. This last return value can happen if the
	675	* sequence is incomplete, missing some trailing bytes that would form a
	676	* complete character. If there are enough bytes to make a definitive
	677	* decision, this function does so.
	678	*
	679	* If 'consider_overlongs' is TRUE, the function checks for the possibility
	680	* that the sequence is an overlong that doesn't overflow. Otherwise, it
	681	* assumes the sequence is not an overlong. This can give different
	682	* results only on ASCII 32-bit platforms.
	683	*
	684	* (For ASCII platforms, we could use memcmp() because we don't have to
	685	* convert each byte to I8, but it's very rare input indeed that would
	686	* approach overflow, so the loop below will likely only get executed once.)
	687	*
	688	* 'e' - 1 must not be beyond a full character. */
	689
	690
	691	PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
	692	assert(s <= e && s + UTF8SKIP(s) >= e);
	693
	694	#if ! defined(UV_IS_QUAD)
	695
	696	return is_utf8_cp_above_31_bits(s, e, consider_overlongs);
	697
	698	#else
	699
	700	PERL_UNUSED_ARG(consider_overlongs);
	701
	702	{
	703	const STRLEN len = e - s;
	704	const U8 *x;
	705	const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
	706
	707	for (x = s; x < e; x++, y++) {
	708
	709	if (UNLIKELY(NATIVE_UTF8_TO_I8(x) == y)) {
	710	continue;
	711	}
	712
	713	/* If this byte is larger than the corresponding highest UTF-8
	714	* byte, the sequence overflow; otherwise the byte is less than,
	715	* and so the sequence doesn't overflow */
	716	return NATIVE_UTF8_TO_I8(x) > y;
	717
	718	}
	719
	720	/* Got to the end and all bytes are the same. If the input is a whole
	721	* character, it doesn't overflow. And if it is a partial character,
	722	* there's not enough information to tell */
	723	if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
	724	return -1;
	725	}
	726
	727	return 0;
	728	}
	729
	730	#endif
	731
	732	}
	733
	734	#if 0
	735
	736	/* This is the portions of the above function that deal with UV_MAX instead of
	737	* IV_MAX. They are left here in case we want to combine them so that internal
	738	* uses can have larger code points. The only logic difference is that the
	739	* 32-bit EBCDIC platform is treated like the 64-bit, and the 32-bit ASCII has
	740	* different logic.
	741	*/
	742
	743	/* Anything larger than this will overflow the word if it were converted into a UV */
	744	#if defined(UV_IS_QUAD)
	745	# ifdef EBCDIC /* Actually is I8 */
	746	# define HIGHEST_REPRESENTABLE_UTF8 \
	747	"\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	748	# else
	749	# define HIGHEST_REPRESENTABLE_UTF8 \
	750	"\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	751	# endif
	752	#else /* 32-bit */
	753	# ifdef EBCDIC
	754	# define HIGHEST_REPRESENTABLE_UTF8 \
	755	"\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
	756	# else
	757	# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
	758	# endif
	759	#endif
	760
	761	#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
	762
	763	/* On 32 bit ASCII machines, many overlongs that start with FF don't
	764	* overflow */
	765	if (consider_overlongs && isFF_OVERLONG(s, len) > 0) {
	766
	767	/* To be such an overlong, the first bytes of 's' must match
	768	* FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80". If we
	769	* don't have any additional bytes available, the sequence, when
	770	* completed might or might not fit in 32 bits. But if we have that
	771	* next byte, we can tell for sure. If it is <= 0x83, then it does
	772	* fit. */
	773	if (len <= sizeof(FF_OVERLONG_PREFIX) - 1) {
	774	return -1;
	775	}
	776
	777	return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83;
	778	}
	779
	780	/* Starting with the #else, the rest of the function is identical except
	781	* 1. we need to move the 'len' declaration to be global to the function
	782	* 2. the endif move to just after the UNUSED_ARG.
	783	* An empty endif is given just below to satisfy the preprocessor
	784	*/
	785	#endif
	786
	787	#endif
	788
	789	#undef F0_ABOVE_OVERLONG
	790	#undef F8_ABOVE_OVERLONG
	791	#undef FC_ABOVE_OVERLONG
	792	#undef FE_ABOVE_OVERLONG
	793	#undef FF_OVERLONG_PREFIX
	794
	795	STRLEN
	796	Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
	797	{
	798	STRLEN len;
	799	const U8 *x;
	800
	801	/* A helper function that should not be called directly.
	802	*
	803	* This function returns non-zero if the string beginning at 's' and
	804	* looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
	805	* code point; otherwise it returns 0. The examination stops after the
	806	* first code point in 's' is validated, not looking at the rest of the
	807	* input. If 'e' is such that there are not enough bytes to represent a
	808	* complete code point, this function will return non-zero anyway, if the
	809	* bytes it does have are well-formed UTF-8 as far as they go, and aren't
	810	* excluded by 'flags'.
	811	*
	812	* A non-zero return gives the number of bytes required to represent the
	813	* code point. Be aware that if the input is for a partial character, the
	814	* return will be larger than 'e - s'.
	815	*
	816	* This function assumes that the code point represented is UTF-8 variant.
	817	* The caller should have excluded the possibility of it being invariant
	818	* before calling this function.
	819	*
	820	* 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
	821	* accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
	822	* 0 if the code point represented is well-formed Perl-extended-UTF-8, but
	823	* disallowed by the flags. If the input is only for a partial character,
	824	* the function will return non-zero if there is any sequence of
	825	* well-formed UTF-8 that, when appended to the input sequence, could
	826	* result in an allowed code point; otherwise it returns 0. Non characters
	827	* cannot be determined based on partial character input. But many of the
	828	* other excluded types can be determined with just the first one or two
	829	* bytes.
	830	*
	831	*/
	832
	833	PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
	834
	835	assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
	836	\|UTF8_DISALLOW_PERL_EXTENDED)));
	837	assert(! UTF8_IS_INVARIANT(*s));
	838
	839	/* A variant char must begin with a start byte */
	840	if (UNLIKELY(! UTF8_IS_START(*s))) {
	841	return 0;
	842	}
	843
	844	/* Examine a maximum of a single whole code point */
	845	if (e - s > UTF8SKIP(s)) {
	846	e = s + UTF8SKIP(s);
	847	}
	848
	849	len = e - s;
	850
	851	if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
	852	const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
	853
	854	/* Here, we are disallowing some set of largish code points, and the
	855	* first byte indicates the sequence is for a code point that could be
	856	* in the excluded set. We generally don't have to look beyond this or
	857	* the second byte to see if the sequence is actually for one of the
	858	* excluded classes. The code below is derived from this table:
	859	*
	860	* UTF-8 UTF-EBCDIC I8
	861	* U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
	862	* U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
	863	* U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
	864	*
	865	* Keep in mind that legal continuation bytes range between \x80..\xBF
	866	* for UTF-8, and \xA0..\xBF for I8. Anything above those aren't
	867	* continuation bytes. Hence, we don't have to test the upper edge
	868	* because if any of those is encountered, the sequence is malformed,
	869	* and would fail elsewhere in this function.
	870	*
	871	* The code here likewise assumes that there aren't other
	872	* malformations; again the function should fail elsewhere because of
	873	* these. For example, an overlong beginning with FC doesn't actually
	874	* have to be a super; it could actually represent a small code point,
	875	* even U+0000. But, since overlongs (and other malformations) are
	876	* illegal, the function should return FALSE in either case.
	877	*/
	878
	879	#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
	880	# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
	881	# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
	882
	883	# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
	884	/* B6 and B7 */ \
	885	&& ((s1) & 0xFE ) == 0xB6)
	886	# define isUTF8_PERL_EXTENDED(s) (*s == I8_TO_NATIVE_UTF8(0xFF))
	887	#else
	888	# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
	889	# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
	890	# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
	891	# define isUTF8_PERL_EXTENDED(s) (*s >= 0xFE)
	892	#endif
	893
	894	if ( (flags & UTF8_DISALLOW_SUPER)
	895	&& UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
	896	{
	897	return 0; /* Above Unicode */
	898	}
	899
	900	if ( (flags & UTF8_DISALLOW_PERL_EXTENDED)
	901	&& UNLIKELY(isUTF8_PERL_EXTENDED(s)))
	902	{
	903	return 0;
	904	}
	905
	906	if (len > 1) {
	907	const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
	908
	909	if ( (flags & UTF8_DISALLOW_SUPER)
	910	&& UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
	911	{
	912	return 0; /* Above Unicode */
	913	}
	914
	915	if ( (flags & UTF8_DISALLOW_SURROGATE)
	916	&& UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
	917	{
	918	return 0; /* Surrogate */
	919	}
	920
	921	if ( (flags & UTF8_DISALLOW_NONCHAR)
	922	&& UNLIKELY(UTF8_IS_NONCHAR(s, e)))
	923	{
	924	return 0; /* Noncharacter code point */
	925	}
	926	}
	927	}
	928
	929	/* Make sure that all that follows are continuation bytes */
	930	for (x = s + 1; x < e; x++) {
	931	if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
	932	return 0;
	933	}
	934	}
	935
	936	/* Here is syntactically valid. Next, make sure this isn't the start of an
	937	* overlong. */
	938	if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
	939	return 0;
	940	}
	941
	942	/* And finally, that the code point represented fits in a word on this
	943	* platform */
	944	if (0 < does_utf8_overflow(s, e,
	945	0 /* Don't consider overlongs */
	946	))
	947	{
	948	return 0;
	949	}
	950
	951	return UTF8SKIP(s);
	952	}
	953
	954	char *
	955	Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
	956	{
	957	/* Returns a mortalized C string that is a displayable copy of the 'len'
	958	* bytes starting at 'start'. 'format' gives how to display each byte.
	959	* Currently, there are only two formats, so it is currently a bool:
	960	* 0 \xab
	961	* 1 ab (that is a space between two hex digit bytes)
	962	*/
	963
	964	const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
	965	trailing NUL */
	966	const U8 * s = start;
	967	const U8 * const e = start + len;
	968	char * output;
	969	char * d;
	970
	971	PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
	972
	973	Newx(output, output_len, char);
	974	SAVEFREEPV(output);
	975
	976	d = output;
	977	for (s = start; s < e; s++) {
	978	const unsigned high_nibble = (*s & 0xF0) >> 4;
	979	const unsigned low_nibble = (*s & 0x0F);
	980
	981	if (format) {
	982	if (s > start) {
	983	*d++ = ' ';
	984	}
	985	}
	986	else {
	987	*d++ = '\\';
	988	*d++ = 'x';
	989	}
	990
	991	if (high_nibble < 10) {
	992	*d++ = high_nibble + '0';
	993	}
	994	else {
	995	*d++ = high_nibble - 10 + 'a';
	996	}
	997
	998	if (low_nibble < 10) {
	999	*d++ = low_nibble + '0';
	1000	}
	1001	else {
	1002	*d++ = low_nibble - 10 + 'a';
	1003	}
	1004	}
	1005
	1006	*d = '\0';
	1007	return output;
	1008	}
	1009
	1010	PERL_STATIC_INLINE char *
	1011	S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
	1012
	1013	/* How many bytes to print */
	1014	STRLEN print_len,
	1015
	1016	/* Which one is the non-continuation */
	1017	const STRLEN non_cont_byte_pos,
	1018
	1019	/* How many bytes should there be? */
	1020	const STRLEN expect_len)
	1021	{
	1022	/* Return the malformation warning text for an unexpected continuation
	1023	* byte. */
	1024
	1025	const char * const where = (non_cont_byte_pos == 1)
	1026	? "immediately"
	1027	: Perl_form(aTHX_ "%d bytes",
	1028	(int) non_cont_byte_pos);
	1029
	1030	PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
	1031
	1032	/* We don't need to pass this parameter, but since it has already been
	1033	* calculated, it's likely faster to pass it; verify under DEBUGGING */
	1034	assert(expect_len == UTF8SKIP(s));
	1035
	1036	return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
	1037	" %s after start byte 0x%02x; need %d bytes, got %d)",
	1038	malformed_text,
	1039	_byte_dump_string(s, print_len, 0),
	1040	*(s + non_cont_byte_pos),
	1041	where,
	1042	*s,
	1043	(int) expect_len,
	1044	(int) non_cont_byte_pos);
	1045	}
	1046
	1047	/*
	1048
	1049	=for apidoc utf8n_to_uvchr
	1050
	1051	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	1052	Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
	1053
	1054	Bottom level UTF-8 decode routine.
	1055	Returns the native code point value of the first character in the string C<s>,
	1056	which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
	1057	C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
	1058	the length, in bytes, of that character.
	1059
	1060	The value of C<flags> determines the behavior when C<s> does not point to a
	1061	well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
	1062	causes zero to be returned and C<retlen> is set so that (S<C<s> + C<retlen>>)
	1063	is the next possible position in C<s> that could begin a non-malformed
	1064	character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
	1065	is raised. Some UTF-8 input sequences may contain multiple malformations.
	1066	This function tries to find every possible one in each call, so multiple
	1067	warnings can be raised for the same sequence.
	1068
	1069	Various ALLOW flags can be set in C<flags> to allow (and not warn on)
	1070	individual types of malformations, such as the sequence being overlong (that
	1071	is, when there is a shorter sequence that can express the same code point;
	1072	overlong sequences are expressly forbidden in the UTF-8 standard due to
	1073	potential security issues). Another malformation example is the first byte of
	1074	a character not being a legal first byte. See F<utf8.h> for the list of such
	1075	flags. Even if allowed, this function generally returns the Unicode
	1076	REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
	1077	F<utf8.h> to override this behavior for the overlong malformations, but don't
	1078	do that except for very specialized purposes.
	1079
	1080	The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
	1081	flags) malformation is found. If this flag is set, the routine assumes that
	1082	the caller will raise a warning, and this function will silently just set
	1083	C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
	1084
	1085	Note that this API requires disambiguation between successful decoding a C<NUL>
	1086	character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
	1087	in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
	1088	be set to 1. To disambiguate, upon a zero return, see if the first byte of
	1089	C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
	1090	error. Or you can use C<L</utf8n_to_uvchr_error>>.
	1091
	1092	Certain code points are considered problematic. These are Unicode surrogates,
	1093	Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
	1094	By default these are considered regular code points, but certain situations
	1095	warrant special handling for them, which can be specified using the C<flags>
	1096	parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
	1097	three classes are treated as malformations and handled as such. The flags
	1098	C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
	1099	C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
	1100	disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
	1101	restricts the allowed inputs to the strict UTF-8 traditionally defined by
	1102	Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
	1103	definition given by
	1104	L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
	1105	The difference between traditional strictness and C9 strictness is that the
	1106	latter does not forbid non-character code points. (They are still discouraged,
	1107	however.) For more discussion see L<perlunicode/Noncharacter code points>.
	1108
	1109	The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
	1110	C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
	1111	C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
	1112	raised for their respective categories, but otherwise the code points are
	1113	considered valid (not malformations). To get a category to both be treated as
	1114	a malformation and raise a warning, specify both the WARN and DISALLOW flags.
	1115	(But note that warnings are not raised if lexically disabled nor if
	1116	C<UTF8_CHECK_ONLY> is also specified.)
	1117
	1118	Extremely high code points were never specified in any standard, and require an
	1119	extension to UTF-8 to express, which Perl does. It is likely that programs
	1120	written in something other than Perl would not be able to read files that
	1121	contain these; nor would Perl understand files written by something that uses a
	1122	different extension. For these reasons, there is a separate set of flags that
	1123	can warn and/or disallow these extremely high code points, even if other
	1124	above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
	1125	C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
	1126	L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
	1127	above-Unicode code points, including these, as malformations.
	1128	(Note that the Unicode standard considers anything above 0x10FFFF to be
	1129	illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
	1130	(2**31 -1))
	1131
	1132	A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
	1133	retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly,
	1134	C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
	1135	C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags
	1136	can apply to code points that actually do fit in 31 bits. This happens on
	1137	EBCDIC platforms, and sometimes when the L<overlong
	1138	malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
	1139	describe the situation in all cases.
	1140
	1141
	1142	All other code points corresponding to Unicode characters, including private
	1143	use and those yet to be assigned, are never considered malformed and never
	1144	warn.
	1145
	1146	=cut
	1147
	1148	Also implemented as a macro in utf8.h
	1149	*/
	1150
	1151	UV
	1152	Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
	1153	STRLEN curlen,
	1154	STRLEN *retlen,
	1155	const U32 flags)
	1156	{
	1157	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
	1158
	1159	return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
	1160	}
	1161
	1162	/*
	1163
	1164	=for apidoc utf8n_to_uvchr_error
	1165
	1166	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	1167	Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
	1168
	1169	This function is for code that needs to know what the precise malformation(s)
	1170	are when an error is found.
	1171
	1172	It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
	1173	all the others, C<errors>. If this parameter is 0, this function behaves
	1174	identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
	1175	to a C<U32> variable, which this function sets to indicate any errors found.
	1176	Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
	1177	C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
	1178	of these bits will be set if a malformation is found, even if the input
	1179	C<flags> parameter indicates that the given malformation is allowed; those
	1180	exceptions are noted:
	1181
	1182	=over 4
	1183
	1184	=item C<UTF8_GOT_PERL_EXTENDED>
	1185
	1186	The input sequence is not standard UTF-8, but a Perl extension. This bit is
	1187	set only if the input C<flags> parameter contains either the
	1188	C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
	1189
	1190	Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
	1191	and so some extension must be used to express them. Perl uses a natural
	1192	extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
	1193	extension to represent even higher ones, so that any code point that fits in a
	1194	64-bit word can be represented. Text using these extensions is not likely to
	1195	be portable to non-Perl code. We lump both of these extensions together and
	1196	refer to them as Perl extended UTF-8. There exist other extensions that people
	1197	have invented, incompatible with Perl's.
	1198
	1199	On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
	1200	extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
	1201	than on ASCII. Prior to that, code points 2**31 and higher were simply
	1202	unrepresentable, and a different, incompatible method was used to represent
	1203	code points between 2**30 and 2**31 - 1.
	1204
	1205	On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
	1206	Perl extended UTF-8 is used.
	1207
	1208	In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
	1209	may use for backward compatibility. That name is misleading, as this flag may
	1210	be set when the code point actually does fit in 31 bits. This happens on
	1211	EBCDIC platforms, and sometimes when the L<overlong
	1212	malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately
	1213	describes the situation in all cases.
	1214
	1215	=item C<UTF8_GOT_CONTINUATION>
	1216
	1217	The input sequence was malformed in that the first byte was a UTF-8
	1218	continuation byte.
	1219
	1220	=item C<UTF8_GOT_EMPTY>
	1221
	1222	The input C<curlen> parameter was 0.
	1223
	1224	=item C<UTF8_GOT_LONG>
	1225
	1226	The input sequence was malformed in that there is some other sequence that
	1227	evaluates to the same code point, but that sequence is shorter than this one.
	1228
	1229	Until Unicode 3.1, it was legal for programs to accept this malformation, but
	1230	it was discovered that this created security issues.
	1231
	1232	=item C<UTF8_GOT_NONCHAR>
	1233
	1234	The code point represented by the input UTF-8 sequence is for a Unicode
	1235	non-character code point.
	1236	This bit is set only if the input C<flags> parameter contains either the
	1237	C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
	1238
	1239	=item C<UTF8_GOT_NON_CONTINUATION>
	1240
	1241	The input sequence was malformed in that a non-continuation type byte was found
	1242	in a position where only a continuation type one should be.
	1243
	1244	=item C<UTF8_GOT_OVERFLOW>
	1245
	1246	The input sequence was malformed in that it is for a code point that is not
	1247	representable in the number of bits available in an IV on the current platform.
	1248
	1249	=item C<UTF8_GOT_SHORT>
	1250
	1251	The input sequence was malformed in that C<curlen> is smaller than required for
	1252	a complete sequence. In other words, the input is for a partial character
	1253	sequence.
	1254
	1255	=item C<UTF8_GOT_SUPER>
	1256
	1257	The input sequence was malformed in that it is for a non-Unicode code point;
	1258	that is, one above the legal Unicode maximum.
	1259	This bit is set only if the input C<flags> parameter contains either the
	1260	C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
	1261
	1262	=item C<UTF8_GOT_SURROGATE>
	1263
	1264	The input sequence was malformed in that it is for a Unicode UTF-16 surrogate
	1265	code point.
	1266	This bit is set only if the input C<flags> parameter contains either the
	1267	C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
	1268
	1269	=back
	1270
	1271	To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
	1272	flag to suppress any warnings, and then examine the C<*errors> return.
	1273
	1274	=cut
	1275	*/
	1276
	1277	UV
	1278	Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
	1279	STRLEN curlen,
	1280	STRLEN *retlen,
	1281	const U32 flags,
	1282	U32 * errors)
	1283	{
	1284	const U8 * const s0 = s;
	1285	U8 * send = NULL; /* (initialized to silence compilers' wrong
	1286	warning) */
	1287	U32 possible_problems = 0; /* A bit is set here for each potential problem
	1288	found as we go along */
	1289	UV uv = *s;
	1290	STRLEN expectlen = 0; /* How long should this sequence be?
	1291	(initialized to silence compilers' wrong
	1292	warning) */
	1293	STRLEN avail_len = 0; /* When input is too short, gives what that is */
	1294	U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
	1295	this gets set and discarded */
	1296
	1297	/* The below are used only if there is both an overlong malformation and a
	1298	* too short one. Otherwise the first two are set to 's0' and 'send', and
	1299	* the third not used at all */
	1300	U8 * adjusted_s0 = (U8 *) s0;
	1301	U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
	1302	routine; see [perl #130921] */
	1303	UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
	1304
	1305	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
	1306
	1307	if (errors) {
	1308	*errors = 0;
	1309	}
	1310	else {
	1311	errors = &discard_errors;
	1312	}
	1313
	1314	/* The order of malformation tests here is important. We should consume as
	1315	* few bytes as possible in order to not skip any valid character. This is
	1316	* required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
	1317	* http://unicode.org/reports/tr36 for more discussion as to why. For
	1318	* example, once we've done a UTF8SKIP, we can tell the expected number of
	1319	* bytes, and could fail right off the bat if the input parameters indicate
	1320	* that there are too few available. But it could be that just that first
	1321	* byte is garbled, and the intended character occupies fewer bytes. If we
	1322	* blindly assumed that the first byte is correct, and skipped based on
	1323	* that number, we could skip over a valid input character. So instead, we
	1324	* always examine the sequence byte-by-byte.
	1325	*
	1326	* We also should not consume too few bytes, otherwise someone could inject
	1327	* things. For example, an input could be deliberately designed to
	1328	* overflow, and if this code bailed out immediately upon discovering that,
	1329	* returning to the caller C<*retlen> pointing to the very next byte (one
	1330	* which is actually part of the overflowing sequence), that could look
	1331	* legitimate to the caller, which could discard the initial partial
	1332	* sequence and process the rest, inappropriately.
	1333	*
	1334	* Some possible input sequences are malformed in more than one way. This
	1335	* function goes to lengths to try to find all of them. This is necessary
	1336	* for correctness, as the inputs may allow one malformation but not
	1337	* another, and if we abandon searching for others after finding the
	1338	* allowed one, we could allow in something that shouldn't have been.
	1339	*/
	1340
	1341	if (UNLIKELY(curlen == 0)) {
	1342	possible_problems \|= UTF8_GOT_EMPTY;
	1343	curlen = 0;
	1344	uv = UNICODE_REPLACEMENT;
	1345	goto ready_to_handle_errors;
	1346	}
	1347
	1348	expectlen = UTF8SKIP(s);
	1349
	1350	/* A well-formed UTF-8 character, as the vast majority of calls to this
	1351	* function will be for, has this expected length. For efficiency, set
	1352	* things up here to return it. It will be overridden only in those rare
	1353	* cases where a malformation is found */
	1354	if (retlen) {
	1355	*retlen = expectlen;
	1356	}
	1357
	1358	/* An invariant is trivially well-formed */
	1359	if (UTF8_IS_INVARIANT(uv)) {
	1360	return uv;
	1361	}
	1362
	1363	/* A continuation character can't start a valid sequence */
	1364	if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
	1365	possible_problems \|= UTF8_GOT_CONTINUATION;
	1366	curlen = 1;
	1367	uv = UNICODE_REPLACEMENT;
	1368	goto ready_to_handle_errors;
	1369	}
	1370
	1371	/* Here is not a continuation byte, nor an invariant. The only thing left
	1372	* is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
	1373	* because it excludes start bytes like \xC0 that always lead to
	1374	* overlongs.) */
	1375
	1376	/* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
	1377	* that indicate the number of bytes in the character's whole UTF-8
	1378	* sequence, leaving just the bits that are part of the value. */
	1379	uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
	1380
	1381	/* Setup the loop end point, making sure to not look past the end of the
	1382	* input string, and flag it as too short if the size isn't big enough. */
	1383	send = (U8*) s0;
	1384	if (UNLIKELY(curlen < expectlen)) {
	1385	possible_problems \|= UTF8_GOT_SHORT;
	1386	avail_len = curlen;
	1387	send += curlen;
	1388	}
	1389	else {
	1390	send += expectlen;
	1391	}
	1392
	1393	/* Now, loop through the remaining bytes in the character's sequence,
	1394	* accumulating each into the working value as we go. */
	1395	for (s = s0 + 1; s < send; s++) {
	1396	if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
	1397	uv = UTF8_ACCUMULATE(uv, *s);
	1398	continue;
	1399	}
	1400
	1401	/* Here, found a non-continuation before processing all expected bytes.
	1402	* This byte indicates the beginning of a new character, so quit, even
	1403	* if allowing this malformation. */
	1404	possible_problems \|= UTF8_GOT_NON_CONTINUATION;
	1405	break;
	1406	} /* End of loop through the character's bytes */
	1407
	1408	/* Save how many bytes were actually in the character */
	1409	curlen = s - s0;
	1410
	1411	/* Note that there are two types of too-short malformation. One is when
	1412	* there is actual wrong data before the normal termination of the
	1413	* sequence. The other is that the sequence wasn't complete before the end
	1414	* of the data we are allowed to look at, based on the input 'curlen'.
	1415	* This means that we were passed data for a partial character, but it is
	1416	* valid as far as we saw. The other is definitely invalid. This
	1417	* distinction could be important to a caller, so the two types are kept
	1418	* separate.
	1419	*
	1420	* A convenience macro that matches either of the too-short conditions. */
	1421	# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT\|UTF8_GOT_NON_CONTINUATION)
	1422
	1423	if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
	1424	uv_so_far = uv;
	1425	uv = UNICODE_REPLACEMENT;
	1426	}
	1427
	1428	/* Check for overflow. The algorithm requires us to not look past the end
	1429	* of the current character, even if partial, so the upper limit is 's' */
	1430	if (UNLIKELY(0 < does_utf8_overflow(s0, s,
	1431	1 /* Do consider overlongs */
	1432	)))
	1433	{
	1434	possible_problems \|= UTF8_GOT_OVERFLOW;
	1435	uv = UNICODE_REPLACEMENT;
	1436	}
	1437
	1438	/* Check for overlong. If no problems so far, 'uv' is the correct code
	1439	* point value. Simply see if it is expressible in fewer bytes. Otherwise
	1440	* we must look at the UTF-8 byte sequence itself to see if it is for an
	1441	* overlong */
	1442	if ( ( LIKELY(! possible_problems)
	1443	&& UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
	1444	\|\| ( UNLIKELY(possible_problems)
	1445	&& ( UNLIKELY(! UTF8_IS_START(*s0))
	1446	\|\| ( curlen > 1
	1447	&& UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0,
	1448	s - s0))))))
	1449	{
	1450	possible_problems \|= UTF8_GOT_LONG;
	1451
	1452	if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
	1453
	1454	/* The calculation in the 'true' branch of this 'if'
	1455	* below won't work if overflows, and isn't needed
	1456	* anyway. Further below we handle all overflow
	1457	* cases */
	1458	&& LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
	1459	{
	1460	UV min_uv = uv_so_far;
	1461	STRLEN i;
	1462
	1463	/* Here, the input is both overlong and is missing some trailing
	1464	* bytes. There is no single code point it could be for, but there
	1465	* may be enough information present to determine if what we have
	1466	* so far is for an unallowed code point, such as for a surrogate.
	1467	* The code further below has the intelligence to determine this,
	1468	* but just for non-overlong UTF-8 sequences. What we do here is
	1469	* calculate the smallest code point the input could represent if
	1470	* there were no too short malformation. Then we compute and save
	1471	* the UTF-8 for that, which is what the code below looks at
	1472	* instead of the raw input. It turns out that the smallest such
	1473	* code point is all we need. */
	1474	for (i = curlen; i < expectlen; i++) {
	1475	min_uv = UTF8_ACCUMULATE(min_uv,
	1476	I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
	1477	}
	1478
	1479	adjusted_s0 = temp_char_buf;
	1480	(void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
	1481	}
	1482	}
	1483
	1484	/* Here, we have found all the possible problems, except for when the input
	1485	* is for a problematic code point not allowed by the input parameters. */
	1486
	1487	/* uv is valid for overlongs */
	1488	if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
	1489
	1490	/* isn't problematic if < this */
	1491	&& uv >= UNICODE_SURROGATE_FIRST)
	1492	\|\| ( UNLIKELY(possible_problems)
	1493
	1494	/* if overflow, we know without looking further
	1495	* precisely which of the problematic types it is,
	1496	* and we deal with those in the overflow handling
	1497	* code */
	1498	&& LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
	1499	&& ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
	1500	\|\| UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
	1501	&& ((flags & ( UTF8_DISALLOW_NONCHAR
	1502	\|UTF8_DISALLOW_SURROGATE
	1503	\|UTF8_DISALLOW_SUPER
	1504	\|UTF8_DISALLOW_PERL_EXTENDED
	1505	\|UTF8_WARN_NONCHAR
	1506	\|UTF8_WARN_SURROGATE
	1507	\|UTF8_WARN_SUPER
	1508	\|UTF8_WARN_PERL_EXTENDED))))
	1509	{
	1510	/* If there were no malformations, or the only malformation is an
	1511	* overlong, 'uv' is valid */
	1512	if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
	1513	if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
	1514	possible_problems \|= UTF8_GOT_SURROGATE;
	1515	}
	1516	else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
	1517	possible_problems \|= UTF8_GOT_SUPER;
	1518	}
	1519	else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
	1520	possible_problems \|= UTF8_GOT_NONCHAR;
	1521	}
	1522	}
	1523	else { /* Otherwise, need to look at the source UTF-8, possibly
	1524	adjusted to be non-overlong */
	1525
	1526	if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
	1527	>= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
	1528	{
	1529	possible_problems \|= UTF8_GOT_SUPER;
	1530	}
	1531	else if (curlen > 1) {
	1532	if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
	1533	NATIVE_UTF8_TO_I8(*adjusted_s0),
	1534	NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
	1535	{
	1536	possible_problems \|= UTF8_GOT_SUPER;
	1537	}
	1538	else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
	1539	NATIVE_UTF8_TO_I8(*adjusted_s0),
	1540	NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
	1541	{
	1542	possible_problems \|= UTF8_GOT_SURROGATE;
	1543	}
	1544	}
	1545
	1546	/* We need a complete well-formed UTF-8 character to discern
	1547	* non-characters, so can't look for them here */
	1548	}
	1549	}
	1550
	1551	ready_to_handle_errors:
	1552
	1553	/* At this point:
	1554	* curlen contains the number of bytes in the sequence that
	1555	* this call should advance the input by.
	1556	* avail_len gives the available number of bytes passed in, but
	1557	* only if this is less than the expected number of
	1558	* bytes, based on the code point's start byte.
	1559	* possible_problems is 0 if there weren't any problems; otherwise a bit
	1560	* is set in it for each potential problem found.
	1561	* uv contains the code point the input sequence
	1562	* represents; or if there is a problem that prevents
	1563	* a well-defined value from being computed, it is
	1564	* some substitute value, typically the REPLACEMENT
	1565	* CHARACTER.
	1566	* s0 points to the first byte of the character
	1567	* s points to just after where we left off processing
	1568	* the character
	1569	* send points to just after where that character should
	1570	* end, based on how many bytes the start byte tells
	1571	* us should be in it, but no further than s0 +
	1572	* avail_len
	1573	*/
	1574
	1575	if (UNLIKELY(possible_problems)) {
	1576	bool disallowed = FALSE;
	1577	const U32 orig_problems = possible_problems;
	1578
	1579	while (possible_problems) { /* Handle each possible problem */
	1580	UV pack_warn = 0;
	1581	char * message = NULL;
	1582
	1583	/* Each 'if' clause handles one problem. They are ordered so that
	1584	* the first ones' messages will be displayed before the later
	1585	* ones; this is kinda in decreasing severity order. But the
	1586	* overlong must come last, as it changes 'uv' looked at by the
	1587	* others */
	1588	if (possible_problems & UTF8_GOT_OVERFLOW) {
	1589
	1590	/* Overflow means also got a super and are using Perl's
	1591	* extended UTF-8, but we handle all three cases here */
	1592	possible_problems
	1593	&= ~(UTF8_GOT_OVERFLOW\|UTF8_GOT_SUPER\|UTF8_GOT_PERL_EXTENDED);
	1594	*errors \|= UTF8_GOT_OVERFLOW;
	1595
	1596	/* But the API says we flag all errors found */
	1597	if (flags & (UTF8_WARN_SUPER\|UTF8_DISALLOW_SUPER)) {
	1598	*errors \|= UTF8_GOT_SUPER;
	1599	}
	1600	if (flags
	1601	& (UTF8_WARN_PERL_EXTENDED\|UTF8_DISALLOW_PERL_EXTENDED))
	1602	{
	1603	*errors \|= UTF8_GOT_PERL_EXTENDED;
	1604	}
	1605
	1606	/* Disallow if any of the three categories say to */
	1607	if ( ! (flags & UTF8_ALLOW_OVERFLOW)
	1608	\|\| (flags & ( UTF8_DISALLOW_SUPER
	1609	\|UTF8_DISALLOW_PERL_EXTENDED)))
	1610	{
	1611	disallowed = TRUE;
	1612	}
	1613
	1614	/* Likewise, warn if any say to */
	1615	if ( ! (flags & UTF8_ALLOW_OVERFLOW)
	1616	\|\| (flags & (UTF8_WARN_SUPER\|UTF8_WARN_PERL_EXTENDED)))
	1617	{
	1618
	1619	/* The warnings code explicitly says it doesn't handle the
	1620	* case of packWARN2 and two categories which have
	1621	* parent-child relationship. Even if it works now to
	1622	* raise the warning if either is enabled, it wouldn't
	1623	* necessarily do so in the future. We output (only) the
	1624	* most dire warning */
	1625	if (! (flags & UTF8_CHECK_ONLY)) {
	1626	if (ckWARN_d(WARN_UTF8)) {
	1627	pack_warn = packWARN(WARN_UTF8);
	1628	}
	1629	else if (ckWARN_d(WARN_NON_UNICODE)) {
	1630	pack_warn = packWARN(WARN_NON_UNICODE);
	1631	}
	1632	if (pack_warn) {
	1633	message = Perl_form(aTHX_ "%s: %s (overflows)",
	1634	malformed_text,
	1635	_byte_dump_string(s0, curlen, 0));
	1636	}
	1637	}
	1638	}
	1639	}
	1640	else if (possible_problems & UTF8_GOT_EMPTY) {
	1641	possible_problems &= ~UTF8_GOT_EMPTY;
	1642	*errors \|= UTF8_GOT_EMPTY;
	1643
	1644	if (! (flags & UTF8_ALLOW_EMPTY)) {
	1645
	1646	/* This so-called malformation is now treated as a bug in
	1647	* the caller. If you have nothing to decode, skip calling
	1648	* this function */
	1649	assert(0);
	1650
	1651	disallowed = TRUE;
	1652	if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
	1653	pack_warn = packWARN(WARN_UTF8);
	1654	message = Perl_form(aTHX_ "%s (empty string)",
	1655	malformed_text);
	1656	}
	1657	}
	1658	}
	1659	else if (possible_problems & UTF8_GOT_CONTINUATION) {
	1660	possible_problems &= ~UTF8_GOT_CONTINUATION;
	1661	*errors \|= UTF8_GOT_CONTINUATION;
	1662
	1663	if (! (flags & UTF8_ALLOW_CONTINUATION)) {
	1664	disallowed = TRUE;
	1665	if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
	1666	pack_warn = packWARN(WARN_UTF8);
	1667	message = Perl_form(aTHX_
	1668	"%s: %s (unexpected continuation byte 0x%02x,"
	1669	" with no preceding start byte)",
	1670	malformed_text,
	1671	_byte_dump_string(s0, 1, 0), *s0);
	1672	}
	1673	}
	1674	}
	1675	else if (possible_problems & UTF8_GOT_SHORT) {
	1676	possible_problems &= ~UTF8_GOT_SHORT;
	1677	*errors \|= UTF8_GOT_SHORT;
	1678
	1679	if (! (flags & UTF8_ALLOW_SHORT)) {
	1680	disallowed = TRUE;
	1681	if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
	1682	pack_warn = packWARN(WARN_UTF8);
	1683	message = Perl_form(aTHX_
	1684	"%s: %s (too short; %d byte%s available, need %d)",
	1685	malformed_text,
	1686	_byte_dump_string(s0, send - s0, 0),
	1687	(int)avail_len,
	1688	avail_len == 1 ? "" : "s",
	1689	(int)expectlen);
	1690	}
	1691	}
	1692
	1693	}
	1694	else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
	1695	possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
	1696	*errors \|= UTF8_GOT_NON_CONTINUATION;
	1697
	1698	if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
	1699	disallowed = TRUE;
	1700	if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
	1701
	1702	/* If we don't know for sure that the input length is
	1703	* valid, avoid as much as possible reading past the
	1704	* end of the buffer */
	1705	int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
	1706	? s - s0
	1707	: send - s0;
	1708	pack_warn = packWARN(WARN_UTF8);
	1709	message = Perl_form(aTHX_ "%s",
	1710	unexpected_non_continuation_text(s0,
	1711	printlen,
	1712	s - s0,
	1713	(int) expectlen));
	1714	}
	1715	}
	1716	}
	1717	else if (possible_problems & UTF8_GOT_SURROGATE) {
	1718	possible_problems &= ~UTF8_GOT_SURROGATE;
	1719
	1720	if (flags & UTF8_WARN_SURROGATE) {
	1721	*errors \|= UTF8_GOT_SURROGATE;
	1722
	1723	if ( ! (flags & UTF8_CHECK_ONLY)
	1724	&& ckWARN_d(WARN_SURROGATE))
	1725	{
	1726	pack_warn = packWARN(WARN_SURROGATE);
	1727
	1728	/* These are the only errors that can occur with a
	1729	* surrogate when the 'uv' isn't valid */
	1730	if (orig_problems & UTF8_GOT_TOO_SHORT) {
	1731	message = Perl_form(aTHX_
	1732	"UTF-16 surrogate (any UTF-8 sequence that"
	1733	" starts with \"%s\" is for a surrogate)",
	1734	_byte_dump_string(s0, curlen, 0));
	1735	}
	1736	else {
	1737	message = Perl_form(aTHX_ surrogate_cp_format, uv);
	1738	}
	1739	}
	1740	}
	1741
	1742	if (flags & UTF8_DISALLOW_SURROGATE) {
	1743	disallowed = TRUE;
	1744	*errors \|= UTF8_GOT_SURROGATE;
	1745	}
	1746	}
	1747	else if (possible_problems & UTF8_GOT_SUPER) {
	1748	possible_problems &= ~UTF8_GOT_SUPER;
	1749
	1750	if (flags & UTF8_WARN_SUPER) {
	1751	*errors \|= UTF8_GOT_SUPER;
	1752
	1753	if ( ! (flags & UTF8_CHECK_ONLY)
	1754	&& ckWARN_d(WARN_NON_UNICODE))
	1755	{
	1756	pack_warn = packWARN(WARN_NON_UNICODE);
	1757
	1758	if (orig_problems & UTF8_GOT_TOO_SHORT) {
	1759	message = Perl_form(aTHX_
	1760	"Any UTF-8 sequence that starts with"
	1761	" \"%s\" is for a non-Unicode code point,"
	1762	" may not be portable",
	1763	_byte_dump_string(s0, curlen, 0));
	1764	}
	1765	else {
	1766	message = Perl_form(aTHX_ super_cp_format, uv);
	1767	}
	1768	}
	1769	}
	1770
	1771	/* Test for Perl's extended UTF-8 after the regular SUPER ones,
	1772	* and before possibly bailing out, so that the more dire
	1773	* warning will override the regular one. */
	1774	if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
	1775	if ( ! (flags & UTF8_CHECK_ONLY)
	1776	&& (flags & (UTF8_WARN_PERL_EXTENDED\|UTF8_WARN_SUPER))
	1777	&& ckWARN_d(WARN_NON_UNICODE))
	1778	{
	1779	pack_warn = packWARN(WARN_NON_UNICODE);
	1780
	1781	/* If it is an overlong that evaluates to a code point
	1782	* that doesn't have to use the Perl extended UTF-8, it
	1783	* still used it, and so we output a message that
	1784	* doesn't refer to the code point. The same is true
	1785	* if there was a SHORT malformation where the code
	1786	* point is not valid. In that case, 'uv' will have
	1787	* been set to the REPLACEMENT CHAR, and the message
	1788	* below without the code point in it will be selected
	1789	* */
	1790	if (UNICODE_IS_PERL_EXTENDED(uv)) {
	1791	message = Perl_form(aTHX_
	1792	perl_extended_cp_format, uv);
	1793	}
	1794	else {
	1795	message = Perl_form(aTHX_
	1796	"Any UTF-8 sequence that starts with"
	1797	" \"%s\" is a Perl extension, and"
	1798	" so is not portable",
	1799	_byte_dump_string(s0, curlen, 0));
	1800	}
	1801	}
	1802
	1803	if (flags & ( UTF8_WARN_PERL_EXTENDED
	1804	\|UTF8_DISALLOW_PERL_EXTENDED))
	1805	{
	1806	*errors \|= UTF8_GOT_PERL_EXTENDED;
	1807
	1808	if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
	1809	disallowed = TRUE;
	1810	}
	1811	}
	1812	}
	1813
	1814	if (flags & UTF8_DISALLOW_SUPER) {
	1815	*errors \|= UTF8_GOT_SUPER;
	1816	disallowed = TRUE;
	1817	}
	1818	}
	1819	else if (possible_problems & UTF8_GOT_NONCHAR) {
	1820	possible_problems &= ~UTF8_GOT_NONCHAR;
	1821
	1822	if (flags & UTF8_WARN_NONCHAR) {
	1823	*errors \|= UTF8_GOT_NONCHAR;
	1824
	1825	if ( ! (flags & UTF8_CHECK_ONLY)
	1826	&& ckWARN_d(WARN_NONCHAR))
	1827	{
	1828	/* The code above should have guaranteed that we don't
	1829	* get here with errors other than overlong */
	1830	assert (! (orig_problems
	1831	& ~(UTF8_GOT_LONG\|UTF8_GOT_NONCHAR)));
	1832
	1833	pack_warn = packWARN(WARN_NONCHAR);
	1834	message = Perl_form(aTHX_ nonchar_cp_format, uv);
	1835	}
	1836	}
	1837
	1838	if (flags & UTF8_DISALLOW_NONCHAR) {
	1839	disallowed = TRUE;
	1840	*errors \|= UTF8_GOT_NONCHAR;
	1841	}
	1842	}
	1843	else if (possible_problems & UTF8_GOT_LONG) {
	1844	possible_problems &= ~UTF8_GOT_LONG;
	1845	*errors \|= UTF8_GOT_LONG;
	1846
	1847	if (flags & UTF8_ALLOW_LONG) {
	1848
	1849	/* We don't allow the actual overlong value, unless the
	1850	* special extra bit is also set */
	1851	if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
	1852	& ~UTF8_ALLOW_LONG)))
	1853	{
	1854	uv = UNICODE_REPLACEMENT;
	1855	}
	1856	}
	1857	else {
	1858	disallowed = TRUE;
	1859
	1860	if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
	1861	pack_warn = packWARN(WARN_UTF8);
	1862
	1863	/* These error types cause 'uv' to be something that
	1864	* isn't what was intended, so can't use it in the
	1865	* message. The other error types either can't
	1866	* generate an overlong, or else the 'uv' is valid */
	1867	if (orig_problems &
	1868	(UTF8_GOT_TOO_SHORT\|UTF8_GOT_OVERFLOW))
	1869	{
	1870	message = Perl_form(aTHX_
	1871	"%s: %s (any UTF-8 sequence that starts"
	1872	" with \"%s\" is overlong which can and"
	1873	" should be represented with a"
	1874	" different, shorter sequence)",
	1875	malformed_text,
	1876	_byte_dump_string(s0, send - s0, 0),
	1877	_byte_dump_string(s0, curlen, 0));
	1878	}
	1879	else {
	1880	U8 tmpbuf[UTF8_MAXBYTES+1];
	1881	const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
	1882	uv, 0);
	1883	/* Don't use U+ for non-Unicode code points, which
	1884	* includes those in the Latin1 range */
	1885	const char * preface = ( uv > PERL_UNICODE_MAX
	1886	#ifdef EBCDIC
	1887	\|\| uv <= 0xFF
	1888	#endif
	1889	)
	1890	? "0x"
	1891	: "U+";
	1892	message = Perl_form(aTHX_
	1893	"%s: %s (overlong; instead use %s to represent"
	1894	" %s%0*" UVXf ")",
	1895	malformed_text,
	1896	_byte_dump_string(s0, send - s0, 0),
	1897	_byte_dump_string(tmpbuf, e - tmpbuf, 0),
	1898	preface,
	1899	((uv < 256) ? 2 : 4), /* Field width of 2 for
	1900	small code points */
	1901	UNI_TO_NATIVE(uv));
	1902	}
	1903	}
	1904	}
	1905	} /* End of looking through the possible flags */
	1906
	1907	/* Display the message (if any) for the problem being handled in
	1908	* this iteration of the loop */
	1909	if (message) {
	1910	if (PL_op)
	1911	Perl_warner(aTHX_ pack_warn, "%s in %s", message,
	1912	OP_DESC(PL_op));
	1913	else
	1914	Perl_warner(aTHX_ pack_warn, "%s", message);
	1915	}
	1916	} /* End of 'while (possible_problems)' */
	1917
	1918	/* Since there was a possible problem, the returned length may need to
	1919	* be changed from the one stored at the beginning of this function.
	1920	* Instead of trying to figure out if that's needed, just do it. */
	1921	if (retlen) {
	1922	*retlen = curlen;
	1923	}
	1924
	1925	if (disallowed) {
	1926	if (flags & UTF8_CHECK_ONLY && retlen) {
	1927	*retlen = ((STRLEN) -1);
	1928	}
	1929	return 0;
	1930	}
	1931	}
	1932
	1933	return UNI_TO_NATIVE(uv);
	1934	}
	1935
	1936	/*
	1937	=for apidoc utf8_to_uvchr_buf
	1938
	1939	Returns the native code point of the first character in the string C<s> which
	1940	is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
	1941	C<*retlen> will be set to the length, in bytes, of that character.
	1942
	1943	If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
	1944	enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
	1945	C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
	1946	(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
	1947	C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
	1948	the next possible position in C<s> that could begin a non-malformed character.
	1949	See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
	1950	returned.
	1951
	1952	=cut
	1953
	1954	Also implemented as a macro in utf8.h
	1955
	1956	*/
	1957
	1958
	1959	UV
	1960	Perl_utf8_to_uvchr_buf(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
	1961	{
	1962	PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
	1963
	1964	assert(s < send);
	1965
	1966	return utf8n_to_uvchr(s, send - s, retlen,
	1967	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	1968	}
	1969
	1970	/* This is marked as deprecated
	1971	*
	1972	=for apidoc utf8_to_uvuni_buf
	1973
	1974	Only in very rare circumstances should code need to be dealing in Unicode
	1975	(as opposed to native) code points. In those few cases, use
	1976	C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
	1977
	1978	Returns the Unicode (not-native) code point of the first character in the
	1979	string C<s> which
	1980	is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
	1981	C<retlen> will be set to the length, in bytes, of that character.
	1982
	1983	If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
	1984	enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
	1985	NULL) to -1. If those warnings are off, the computed value if well-defined (or
	1986	the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
	1987	is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
	1988	next possible position in C<s> that could begin a non-malformed character.
	1989	See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
	1990
	1991	=cut
	1992	*/
	1993
	1994	UV
	1995	Perl_utf8_to_uvuni_buf(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
	1996	{
	1997	PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
	1998
	1999	assert(send > s);
	2000
	2001	return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
	2002	}
	2003
	2004	/*
	2005	=for apidoc utf8_length
	2006
	2007	Return the length of the UTF-8 char encoded string C<s> in characters.
	2008	Stops at C<e> (i.e. the C<*e> byte does not form part of the character).
	2009	If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning and returns the number of complete characters counted before the problem.
	2010
	2011	=cut
	2012	*/
	2013
	2014	STRLEN
	2015	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	2016	{
	2017	STRLEN len = 0;
	2018
	2019	PERL_ARGS_ASSERT_UTF8_LENGTH;
	2020
	2021	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	2022	* the bitops (especially ~) can create illegal UTF-8.
	2023	* In other words: in Perl UTF-8 is not just for Unicode. */
	2024
	2025	if (e < s)
	2026	goto warn_and_return;
	2027	while (s < e) {
	2028	s += UTF8SKIP(s);
	2029	len++;
	2030	}
	2031
	2032	if (e != s) {
	2033	len--;
	2034	warn_and_return:
	2035	if (PL_op)
	2036	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	2037	"%s in %s", unees, OP_DESC(PL_op));
	2038	else
	2039	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	2040	}
	2041
	2042	return len;
	2043	}
	2044
	2045	/*
	2046	=for apidoc bytes_cmp_utf8
	2047
	2048	Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
	2049	sequence of characters (stored as UTF-8)
	2050	in C<u>, C<ulen>. Returns 0 if they are
	2051	equal, -1 or -2 if the first string is less than the second string, +1 or +2
	2052	if the first string is greater than the second string.
	2053
	2054	-1 or +1 is returned if the shorter string was identical to the start of the
	2055	longer string. -2 or +2 is returned if
	2056	there was a difference between characters
	2057	within the strings.
	2058
	2059	=cut
	2060	*/
	2061
	2062	int
	2063	Perl_bytes_cmp_utf8(pTHX_ const U8 b, STRLEN blen, const U8 u, STRLEN ulen)
	2064	{
	2065	const U8 *const bend = b + blen;
	2066	const U8 *const uend = u + ulen;
	2067
	2068	PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
	2069
	2070	while (b < bend && u < uend) {
	2071	U8 c = *u++;
	2072	if (!UTF8_IS_INVARIANT(c)) {
	2073	if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	2074	if (u < uend) {
	2075	U8 c1 = *u++;
	2076	if (UTF8_IS_CONTINUATION(c1)) {
	2077	c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
	2078	} else {
	2079	/* diag_listed_as: Malformed UTF-8 character%s */
	2080	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	2081	"%s %s%s",
	2082	unexpected_non_continuation_text(u - 2, 2, 1, 2),
	2083	PL_op ? " in " : "",
	2084	PL_op ? OP_DESC(PL_op) : "");
	2085	return -2;
	2086	}
	2087	} else {
	2088	if (PL_op)
	2089	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	2090	"%s in %s", unees, OP_DESC(PL_op));
	2091	else
	2092	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	2093	return -2; /* Really want to return undef :-) */
	2094	}
	2095	} else {
	2096	return -2;
	2097	}
	2098	}
	2099	if (*b != c) {
	2100	return *b < c ? -2 : +2;
	2101	}
	2102	++b;
	2103	}
	2104
	2105	if (b == bend && u == uend)
	2106	return 0;
	2107
	2108	return b < bend ? +1 : -1;
	2109	}
	2110
	2111	/*
	2112	=for apidoc utf8_to_bytes
	2113
	2114	Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
	2115	Unlike L</bytes_to_utf8>, this over-writes the original string, and
	2116	updates C<*lenp> to contain the new length.
	2117	Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
	2118
	2119	Upon successful return, the number of variants in the string can be computed by
	2120	having saved the value of C<*lenp> before the call, and subtracting the
	2121	after-call value of C<*lenp> from it.
	2122
	2123	If you need a copy of the string, see L</bytes_from_utf8>.
	2124
	2125	=cut
	2126	*/
	2127
	2128	U8 *
	2129	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN lenp)
	2130	{
	2131	U8 * first_variant;
	2132
	2133	PERL_ARGS_ASSERT_UTF8_TO_BYTES;
	2134	PERL_UNUSED_CONTEXT;
	2135
	2136	/* This is a no-op if no variants at all in the input */
	2137	if (is_utf8_invariant_string_loc(s, lenp, (const U8 *) &first_variant)) {
	2138	return s;
	2139	}
	2140
	2141	{
	2142	U8 * const save = s;
	2143	U8 * const send = s + *lenp;
	2144	U8 * d;
	2145
	2146	/* Nothing before the first variant needs to be changed, so start the real
	2147	* work there */
	2148	s = first_variant;
	2149	while (s < send) {
	2150	if (! UTF8_IS_INVARIANT(*s)) {
	2151	if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
	2152	*lenp = ((STRLEN) -1);
	2153	return 0;
	2154	}
	2155	s++;
	2156	}
	2157	s++;
	2158	}
	2159
	2160	/* Is downgradable, so do it */
	2161	d = s = first_variant;
	2162	while (s < send) {
	2163	U8 c = *s++;
	2164	if (! UVCHR_IS_INVARIANT(c)) {
	2165	/* Then it is two-byte encoded */
	2166	c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
	2167	s++;
	2168	}
	2169	*d++ = c;
	2170	}
	2171	*d = '\0';
	2172	*lenp = d - save;
	2173
	2174	return save;
	2175	}
	2176	}
	2177
	2178	/*
	2179	=for apidoc bytes_from_utf8
	2180
	2181	Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
	2182	byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
	2183	actually encoded in UTF-8.
	2184
	2185	Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
	2186	the input string.
	2187
	2188	Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
	2189	not expressible in native byte encoding. In these cases, C<*is_utf8p> and
	2190	C<*lenp> are unchanged, and the return value is the original C<s>.
	2191
	2192	Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
	2193	newly created string containing a downgraded copy of C<s>, and whose length is
	2194	returned in C<*lenp>, updated. The new string is C<NUL>-terminated.
	2195
	2196	Upon successful return, the number of variants in the string can be computed by
	2197	having saved the value of C<*lenp> before the call, and subtracting the
	2198	after-call value of C<*lenp> from it.
	2199
	2200	=cut
	2201
	2202	There is a macro that avoids this function call, but this is retained for
	2203	anyone who calls it with the Perl_ prefix */
	2204
	2205	U8 *
	2206	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN lenp, bool *is_utf8p)
	2207	{
	2208	PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
	2209	PERL_UNUSED_CONTEXT;
	2210
	2211	return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
	2212	}
	2213
	2214	/*
	2215	No = here because currently externally undocumented
	2216	for apidoc bytes_from_utf8_loc
	2217
	2218	Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where
	2219	to store the location of the first character in C<"s"> that cannot be
	2220	converted to non-UTF8.
	2221
	2222	If that parameter is C<NULL>, this function behaves identically to
	2223	C<bytes_from_utf8>.
	2224
	2225	Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
	2226	C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
	2227
	2228	Otherwise, the function returns a newly created C<NUL>-terminated string
	2229	containing the non-UTF8 equivalent of the convertible first portion of
	2230	C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>.
	2231	If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
	2232	and C<*first_non_downgradable> is set to C<NULL>.
	2233
	2234	Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
	2235	first character in the original string that wasn't converted. C<*is_utf8p> is
	2236	unchanged. Note that the new string may have length 0.
	2237
	2238	Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
	2239	C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
	2240	converts as many characters in it as possible stopping at the first one it
	2241	finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is
	2242	set to point to that. The function returns the portion that could be converted
	2243	in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
	2244	not including the terminating C<NUL>. If the very first character in the
	2245	original could not be converted, C<*lenp> will be 0, and the new string will
	2246	contain just a single C<NUL>. If the entire input string was converted,
	2247	C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
	2248
	2249	Upon successful return, the number of variants in the converted portion of the
	2250	string can be computed by having saved the value of C<*lenp> before the call,
	2251	and subtracting the after-call value of C<*lenp> from it.
	2252
	2253	=cut
	2254
	2255
	2256	*/
	2257
	2258	U8 *
	2259	Perl_bytes_from_utf8_loc(const U8 s, STRLEN lenp, bool is_utf8p, const U8* first_unconverted)
	2260	{
	2261	U8 *d;
	2262	const U8 *original = s;
	2263	U8 *converted_start;
	2264	const U8 send = s + lenp;
	2265
	2266	PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
	2267
	2268	if (! *is_utf8p) {
	2269	if (first_unconverted) {
	2270	*first_unconverted = NULL;
	2271	}
	2272
	2273	return (U8 *) original;
	2274	}
	2275
	2276	Newx(d, (*lenp) + 1, U8);
	2277
	2278	converted_start = d;
	2279	while (s < send) {
	2280	U8 c = *s++;
	2281	if (! UTF8_IS_INVARIANT(c)) {
	2282
	2283	/* Then it is multi-byte encoded. If the code point is above 0xFF,
	2284	* have to stop now */
	2285	if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
	2286	if (first_unconverted) {
	2287	*first_unconverted = s - 1;
	2288	goto finish_and_return;
	2289	}
	2290	else {
	2291	Safefree(converted_start);
	2292	return (U8 *) original;
	2293	}
	2294	}
	2295
	2296	c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
	2297	s++;
	2298	}
	2299	*d++ = c;
	2300	}
	2301
	2302	/* Here, converted the whole of the input */
	2303	*is_utf8p = FALSE;
	2304	if (first_unconverted) {
	2305	*first_unconverted = NULL;
	2306	}
	2307
	2308	finish_and_return:
	2309	*d = '\0';
	2310	*lenp = d - converted_start;
	2311
	2312	/* Trim unused space */
	2313	Renew(converted_start, *lenp + 1, U8);
	2314
	2315	return converted_start;
	2316	}
	2317
	2318	/*
	2319	=for apidoc bytes_to_utf8
	2320
	2321	Converts a string C<s> of length C<*lenp> bytes from the native encoding into
	2322	UTF-8.
	2323	Returns a pointer to the newly-created string, and sets C<*lenp> to
	2324	reflect the new length in bytes.
	2325
	2326	Upon successful return, the number of variants in the string can be computed by
	2327	having saved the value of C<*lenp> before the call, and subtracting it from the
	2328	after-call value of C<*lenp>.
	2329
	2330	A C<NUL> character will be written after the end of the string.
	2331
	2332	If you want to convert to UTF-8 from encodings other than
	2333	the native (Latin1 or EBCDIC),
	2334	see L</sv_recode_to_utf8>().
	2335
	2336	=cut
	2337	*/
	2338
	2339	U8*
	2340	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN lenp)
	2341	{
	2342	const U8 * const send = s + (*lenp);
	2343	U8 *d;
	2344	U8 *dst;
	2345
	2346	PERL_ARGS_ASSERT_BYTES_TO_UTF8;
	2347	PERL_UNUSED_CONTEXT;
	2348
	2349	Newx(d, (lenp) 2 + 1, U8);
	2350	dst = d;
	2351
	2352	while (s < send) {
	2353	append_utf8_from_native_byte(*s, &d);
	2354	s++;
	2355	}
	2356
	2357	*d = '\0';
	2358	*lenp = d-dst;
	2359
	2360	/* Trim unused space */
	2361	Renew(dst, *lenp + 1, U8);
	2362
	2363	return dst;
	2364	}
	2365
	2366	/*
	2367	* Convert native (big-endian) UTF-16 to UTF-8. For reversed (little-endian),
	2368	* use utf16_to_utf8_reversed().
	2369	*
	2370	* UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes.
	2371	* UTF-8 requires 1-3 bytes for every code point below 0x10000; otherwise 4 bytes.
	2372	* UTF-EBCDIC requires 1-4 bytes for every code point below 0x1000; otherwise 4-5 bytes.
	2373	*
	2374	* These functions don't check for overflow. The worst case is every code
	2375	* point in the input is 2 bytes, and requires 4 bytes on output. (If the code
	2376	* is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.) Therefore the
	2377	* destination must be pre-extended to 2 times the source length.
	2378	*
	2379	* Do not use in-place. We optimize for native, for obvious reasons. */
	2380
	2381	U8*
	2382	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	2383	{
	2384	U8* pend;
	2385	U8* dstart = d;
	2386
	2387	PERL_ARGS_ASSERT_UTF16_TO_UTF8;
	2388
	2389	if (bytelen & 1)
	2390	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf,
	2391	(UV)bytelen);
	2392
	2393	pend = p + bytelen;
	2394
	2395	while (p < pend) {
	2396	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	2397	p += 2;
	2398	if (OFFUNI_IS_INVARIANT(uv)) {
	2399	*d++ = LATIN1_TO_NATIVE((U8) uv);
	2400	continue;
	2401	}
	2402	if (uv <= MAX_UTF8_TWO_BYTE) {
	2403	*d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
	2404	*d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
	2405	continue;
	2406	}
	2407
	2408	#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
	2409	#define LAST_HIGH_SURROGATE 0xDBFF
	2410	#define FIRST_LOW_SURROGATE 0xDC00
	2411	#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
	2412	#define FIRST_IN_PLANE1 0x10000
	2413
	2414	/* This assumes that most uses will be in the first Unicode plane, not
	2415	* needing surrogates */
	2416	if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
	2417	&& uv <= UNICODE_SURROGATE_LAST))
	2418	{
	2419	if (UNLIKELY(p >= pend) \|\| UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
	2420	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	2421	}
	2422	else {
	2423	UV low = (p[0] << 8) + p[1];
	2424	if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
	2425	\|\| UNLIKELY(low > LAST_LOW_SURROGATE))
	2426	{
	2427	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	2428	}
	2429	p += 2;
	2430	uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
	2431	+ (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1;
	2432	}
	2433	}
	2434	#ifdef EBCDIC
	2435	d = uvoffuni_to_utf8_flags(d, uv, 0);
	2436	#else
	2437	if (uv < FIRST_IN_PLANE1) {
	2438	*d++ = (U8)(( uv >> 12) \| 0xe0);
	2439	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	2440	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	2441	continue;
	2442	}
	2443	else {
	2444	*d++ = (U8)(( uv >> 18) \| 0xf0);
	2445	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	2446	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	2447	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	2448	continue;
	2449	}
	2450	#endif
	2451	}
	2452	*newlen = d - dstart;
	2453	return d;
	2454	}
	2455
	2456	/* Note: this one is slightly destructive of the source. */
	2457
	2458	U8*
	2459	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	2460	{
	2461	U8* s = (U8*)p;
	2462	U8* const send = s + bytelen;
	2463
	2464	PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
	2465
	2466	if (bytelen & 1)
	2467	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
	2468	(UV)bytelen);
	2469
	2470	while (s < send) {
	2471	const U8 tmp = s[0];
	2472	s[0] = s[1];
	2473	s[1] = tmp;
	2474	s += 2;
	2475	}
	2476	return utf16_to_utf8(p, d, bytelen, newlen);
	2477	}
	2478
	2479	bool
	2480	Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
	2481	{
	2482	U8 tmpbuf[UTF8_MAXBYTES+1];
	2483	uvchr_to_utf8(tmpbuf, c);
	2484	return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf));
	2485	}
	2486
	2487	/* Internal function so we can deprecate the external one, and call
	2488	this one from other deprecated functions in this file */
	2489
	2490	bool
	2491	Perl__is_utf8_idstart(pTHX_ const U8 *p)
	2492	{
	2493	PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
	2494
	2495	if (*p == '_')
	2496	return TRUE;
	2497	return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
	2498	}
	2499
	2500	bool
	2501	Perl__is_uni_perl_idcont(pTHX_ UV c)
	2502	{
	2503	U8 tmpbuf[UTF8_MAXBYTES+1];
	2504	uvchr_to_utf8(tmpbuf, c);
	2505	return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
	2506	}
	2507
	2508	bool
	2509	Perl__is_uni_perl_idstart(pTHX_ UV c)
	2510	{
	2511	U8 tmpbuf[UTF8_MAXBYTES+1];
	2512	uvchr_to_utf8(tmpbuf, c);
	2513	return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
	2514	}
	2515
	2516	UV
	2517	Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
	2518	const char S_or_s)
	2519	{
	2520	/* We have the latin1-range values compiled into the core, so just use
	2521	* those, converting the result to UTF-8. The only difference between upper
	2522	* and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
	2523	* either "SS" or "Ss". Which one to use is passed into the routine in
	2524	* 'S_or_s' to avoid a test */
	2525
	2526	UV converted = toUPPER_LATIN1_MOD(c);
	2527
	2528	PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
	2529
	2530	assert(S_or_s == 'S' \|\| S_or_s == 's');
	2531
	2532	if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
	2533	characters in this range */
	2534	*p = (U8) converted;
	2535	*lenp = 1;
	2536	return converted;
	2537	}
	2538
	2539	/* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
	2540	* which it maps to one of them, so as to only have to have one check for
	2541	* it in the main case */
	2542	if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
	2543	switch (c) {
	2544	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	2545	converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
	2546	break;
	2547	case MICRO_SIGN:
	2548	converted = GREEK_CAPITAL_LETTER_MU;
	2549	break;
	2550	#if UNICODE_MAJOR_VERSION > 2 \
	2551	\|\| (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
	2552	&& UNICODE_DOT_DOT_VERSION >= 8)
	2553	case LATIN_SMALL_LETTER_SHARP_S:
	2554	*(p)++ = 'S';
	2555	*p = S_or_s;
	2556	*lenp = 2;
	2557	return 'S';
	2558	#endif
	2559	default:
	2560	Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
	2561	" '%c' to map to '%c'",
	2562	c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
	2563	NOT_REACHED; /* NOTREACHED */
	2564	}
	2565	}
	2566
	2567	*(p)++ = UTF8_TWO_BYTE_HI(converted);
	2568	*p = UTF8_TWO_BYTE_LO(converted);
	2569	*lenp = 2;
	2570
	2571	return converted;
	2572	}
	2573
	2574	/* Call the function to convert a UTF-8 encoded character to the specified case.
	2575	* Note that there may be more than one character in the result.
	2576	* INP is a pointer to the first byte of the input character
	2577	* OUTP will be set to the first byte of the string of changed characters. It
	2578	* needs to have space for UTF8_MAXBYTES_CASE+1 bytes
	2579	* LENP will be set to the length in bytes of the string of changed characters
	* NOTE(review): the macro parameters are actually named (uv, s, d, lenp);
	* presumably INP is 's' and OUTP is 'd' -- confirm against _to_utf8_case().
	* 'uv' is forwarded as the first argument of _to_utf8_case(); to_uni_upper()
	* and to_uni_title() pass the code point whose UTF-8 encoding they have
	* just stored in 's'.
	2580	*
	2581	* The functions return the ordinal of the first character in the string of
	2582	* OUTP */
	2583	#define CALL_UPPER_CASE(uv, s, d, lenp) \
	2584	_to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
	2585	#define CALL_TITLE_CASE(uv, s, d, lenp) \
	2586	_to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
	2587	#define CALL_LOWER_CASE(uv, s, d, lenp) \
	2588	_to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
	2589
	2590	/* This additionally has the input parameter 'specials', which if non-zero will
	2591	* cause this to use the specials hash for folding (meaning get full case
	2592	* folding); otherwise, when zero, this implies a simple case fold.  In terms
	* of the expansion: a true 'specials' passes "" as the last argument of
	* _to_utf8_case(), a false one passes NULL */
	2593	#define CALL_FOLD_CASE(uv, s, d, lenp, specials) \
	2594	_to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
	2595
	2596	UV
	2597	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	2598	{
	2599	/* Convert the Unicode character whose ordinal is <c> to its uppercase
	2600	* version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
	2601	* Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	2602	* the changed version may be longer than the original character.
	2603	*
	2604	* The ordinal of the first character of the changed version is returned
	2605	* (but note, as explained above, that there may be more.) */
	2606
	2607	PERL_ARGS_ASSERT_TO_UNI_UPPER;
	2608
	2609	if (c < 256) {
	2610	return _to_upper_title_latin1((U8) c, p, lenp, 'S');
	2611	}
	2612
	2613	uvchr_to_utf8(p, c);
	2614	return CALL_UPPER_CASE(c, p, p, lenp);
	2615	}
	2616
	2617	UV
	2618	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	2619	{
	2620	PERL_ARGS_ASSERT_TO_UNI_TITLE;
	2621
	2622	if (c < 256) {
	2623	return _to_upper_title_latin1((U8) c, p, lenp, 's');
	2624	}
	2625
	2626	uvchr_to_utf8(p, c);
	2627	return CALL_TITLE_CASE(c, p, p, lenp);
	2628	}
	2629
	2630	STATIC U8
	2631	S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
	2632	{
	2633	/* We have the latin1-range values compiled into the core, so just use
	2634	* those, converting the result to UTF-8. Since the result is always just
	2635	* one character, we allow <p> to be NULL */
	2636
	2637	U8 converted = toLOWER_LATIN1(c);
	2638
	2639	PERL_UNUSED_ARG(dummy);
	2640
	2641	if (p != NULL) {
	2642	if (NATIVE_BYTE_IS_INVARIANT(converted)) {
	2643	*p = converted;
	2644	*lenp = 1;
	2645	}
	2646	else {
	2647	/* Result is known to always be < 256, so can use the EIGHT_BIT
	2648	* macros */
	2649	*p = UTF8_EIGHT_BIT_HI(converted);
	2650	*(p+1) = UTF8_EIGHT_BIT_LO(converted);
	2651	*lenp = 2;
	2652	}
	2653	}
	2654	return converted;
	2655	}
	2656
	2657	UV
	2658	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	2659	{
	2660	PERL_ARGS_ASSERT_TO_UNI_LOWER;
	2661
	2662	if (c < 256) {
	2663	return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
	2664	}
	2665
	2666	uvchr_to_utf8(p, c);
	2667	return CALL_LOWER_CASE(c, p, p, lenp);
	2668	}
	2669
	2670	UV
	2671	Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
	2672	const unsigned int flags)
	2673	{
	2674	/* Corresponds to to_lower_latin1(); <flags> bits meanings:
	2675	* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
	2676	* FOLD_FLAGS_FULL iff full folding is to be used;
	2677	*
	2678	* Not to be used for locale folds
	2679	*/
	2680
	2681	UV converted;
	2682
	2683	PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
	2684	PERL_UNUSED_CONTEXT;
	2685
	2686	assert (! (flags & FOLD_FLAGS_LOCALE));
	2687
	2688	if (UNLIKELY(c == MICRO_SIGN)) {
	2689	converted = GREEK_SMALL_LETTER_MU;
	2690	}
	2691	#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
	2692	\|\| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
	2693	\|\| UNICODE_DOT_DOT_VERSION > 0)
	2694	else if ( (flags & FOLD_FLAGS_FULL)
	2695	&& UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
	2696	{
	2697	/* If can't cross 127/128 boundary, can't return "ss"; instead return
	2698	* two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
	2699	* under those circumstances. */
	2700	if (flags & FOLD_FLAGS_NOMIX_ASCII) {
	2701	lenp = 2 sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
	2702	Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
	2703	p, *lenp, U8);
	2704	return LATIN_SMALL_LETTER_LONG_S;
	2705	}
	2706	else {
	2707	*(p)++ = 's';
	2708	*p = 's';
	2709	*lenp = 2;
	2710	return 's';
	2711	}
	2712	}
	2713	#endif
	2714	else { /* In this range the fold of all other characters is their lower
	2715	case */
	2716	converted = toLOWER_LATIN1(c);
	2717	}
	2718
	2719	if (UVCHR_IS_INVARIANT(converted)) {
	2720	*p = (U8) converted;
	2721	*lenp = 1;
	2722	}
	2723	else {
	2724	*(p)++ = UTF8_TWO_BYTE_HI(converted);
	2725	*p = UTF8_TWO_BYTE_LO(converted);
	2726	*lenp = 2;
	2727	}
	2728
	2729	return converted;
	2730	}
	2731
	2732	UV
	2733	Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
	2734	{
	2735
	2736	/* Not currently externally documented, and subject to change
	2737	* <flags> bits meanings:
	2738	* FOLD_FLAGS_FULL iff full folding is to be used;
	2739	* FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
	2740	* locale are to be used.
	2741	* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
	2742	*/
	2743
	2744	PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
	2745
	2746	if (flags & FOLD_FLAGS_LOCALE) {
	2747	/* Treat a UTF-8 locale as not being in locale at all */
	2748	if (IN_UTF8_CTYPE_LOCALE) {
	2749	flags &= ~FOLD_FLAGS_LOCALE;
	2750	}
	2751	else {
	2752	_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
	2753	goto needs_full_generality;
	2754	}
	2755	}
	2756
	2757	if (c < 256) {
	2758	return _to_fold_latin1((U8) c, p, lenp,
	2759	flags & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII));
	2760	}
	2761
	2762	/* Here, above 255. If no special needs, just use the macro */
	2763	if ( ! (flags & (FOLD_FLAGS_LOCALE\|FOLD_FLAGS_NOMIX_ASCII))) {
	2764	uvchr_to_utf8(p, c);
	2765	return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
	2766	}
	2767	else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
	2768	the special flags. */
	2769	U8 utf8_c[UTF8_MAXBYTES + 1];
	2770
	2771	needs_full_generality:
	2772	uvchr_to_utf8(utf8_c, c);
	2773	return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c),
	2774	p, lenp, flags);
	2775	}
	2776	}
	2777
	2778	PERL_STATIC_INLINE bool
	2779	S_is_utf8_common(pTHX_ const U8 const p, SV *swash,
	2780	const char const swashname, SV const invlist)
	2781	{
	2782	/* returns a boolean giving whether or not the UTF8-encoded character that
	2783	* starts at <p> is in the swash indicated by <swashname>. <swash>
	2784	* contains a pointer to where the swash indicated by <swashname>
	2785	* is to be stored; which this routine will do, so that future calls will
	2786	* look at <*swash> and only generate a swash if it is not null. <invlist>
	2787	* is NULL or an inversion list that defines the swash. If not null, it
	2788	* saves time during initialization of the swash.
	2789	*
	2790	* Note that it is assumed that the buffer length of <p> is enough to
	2791	* contain all the bytes that comprise the character. Thus, <*p> should
	2792	* have been checked before this call for mal-formedness enough to assure
	2793	* that. */
	2794
	2795	PERL_ARGS_ASSERT_IS_UTF8_COMMON;
	2796
	2797	/* The API should have included a length for the UTF-8 character in <p>,
	2798	* but it doesn't. We therefore assume that p has been validated at least
	2799	* as far as there being enough bytes available in it to accommodate the
	2800	* character without reading beyond the end, and pass that number on to the
	2801	* validating routine */
	2802	if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
	2803	_force_out_malformed_utf8_message(p, p + UTF8SKIP(p),
	2804	_UTF8_NO_CONFIDENCE_IN_CURLEN,
	2805	1 /* Die */ );
	2806	NOT_REACHED; /* NOTREACHED */
	2807	}
	2808
	2809	if (!*swash) {
	2810	U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
	2811	*swash = _core_swash_init("utf8",
	2812
	2813	/* Only use the name if there is no inversion
	2814	* list; otherwise will go out to disk */
	2815	(invlist) ? "" : swashname,
	2816
	2817	&PL_sv_undef, 1, 0, invlist, &flags);
	2818	}
	2819
	2820	return swash_fetch(*swash, p, TRUE) != 0;
	2821	}
	2822
	2823	PERL_STATIC_INLINE bool
	2824	S_is_utf8_common_with_len(pTHX_ const U8 const p, const U8 const e,
	2825	SV *swash, const char const swashname,
	2826	SV* const invlist)
	2827	{
	2828	/* returns a boolean giving whether or not the UTF8-encoded character that
	2829	* starts at <p>, and extending no further than <e - 1> is in the swash
	2830	* indicated by <swashname>. <swash> contains a pointer to where the swash
	2831	* indicated by <swashname> is to be stored; which this routine will do, so
	2832	* that future calls will look at <*swash> and only generate a swash if it
	2833	* is not null. <invlist> is NULL or an inversion list that defines the
	2834	* swash. If not null, it saves time during initialization of the swash.
	2835	*/
	2836
	2837	PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
	2838
	2839	if (! isUTF8_CHAR(p, e)) {
	2840	_force_out_malformed_utf8_message(p, e, 0, 1);
	2841	NOT_REACHED; /* NOTREACHED */
	2842	}
	2843
	2844	if (!*swash) {
	2845	U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
	2846	*swash = _core_swash_init("utf8",
	2847
	2848	/* Only use the name if there is no inversion
	2849	* list; otherwise will go out to disk */
	2850	(invlist) ? "" : swashname,
	2851
	2852	&PL_sv_undef, 1, 0, invlist, &flags);
	2853	}
	2854
	2855	return swash_fetch(*swash, p, TRUE) != 0;
	2856	}
	2857
	2858	STATIC void
	2859	S_warn_on_first_deprecated_use(pTHX_ const char * const name,
	2860	const char * const alternative,
	2861	const bool use_locale,
	2862	const char * const file,
	2863	const unsigned line)
	2864	{
	2865	const char * key;
	2866
	2867	PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
	2868
	2869	if (ckWARN_d(WARN_DEPRECATED)) {
	2870
	2871	key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
	2872	if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
	2873	if (! PL_seen_deprecated_macro) {
	2874	PL_seen_deprecated_macro = newHV();
	2875	}
	2876	if (! hv_store(PL_seen_deprecated_macro, key,
	2877	strlen(key), &PL_sv_undef, 0))
	2878	{
	2879	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	2880	}
	2881
	2882	if (instr(file, "mathoms.c")) {
	2883	Perl_warner(aTHX_ WARN_DEPRECATED,
	2884	"In %s, line %d, starting in Perl v5.30, %s()"
	2885	" will be removed. Avoid this message by"
	2886	" converting to use %s().\n",
	2887	file, line, name, alternative);
	2888	}
	2889	else {
	2890	Perl_warner(aTHX_ WARN_DEPRECATED,
	2891	"In %s, line %d, starting in Perl v5.30, %s() will"
	2892	" require an additional parameter. Avoid this"
	2893	" message by converting to use %s().\n",
	2894	file, line, name, alternative);
	2895	}
	2896	}
	2897	}
	2898	}
	2899
	2900	bool
	2901	Perl__is_utf8_FOO(pTHX_ U8 classnum,
	2902	const U8 * const p,
	2903	const char * const name,
	2904	const char * const alternative,
	2905	const bool use_utf8,
	2906	const bool use_locale,
	2907	const char * const file,
	2908	const unsigned line)
	2909	{
	2910	PERL_ARGS_ASSERT__IS_UTF8_FOO;
	2911
	2912	warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
	2913
	2914	if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
	2915
	2916	switch (classnum) {
	2917	case _CC_WORDCHAR:
	2918	case _CC_DIGIT:
	2919	case _CC_ALPHA:
	2920	case _CC_LOWER:
	2921	case _CC_UPPER:
	2922	case _CC_PUNCT:
	2923	case _CC_PRINT:
	2924	case _CC_ALPHANUMERIC:
	2925	case _CC_GRAPH:
	2926	case _CC_CASED:
	2927
	2928	return is_utf8_common(p,
	2929	&PL_utf8_swash_ptrs[classnum],
	2930	swash_property_names[classnum],
	2931	PL_XPosix_ptrs[classnum]);
	2932
	2933	case _CC_SPACE:
	2934	return is_XPERLSPACE_high(p);
	2935	case _CC_BLANK:
	2936	return is_HORIZWS_high(p);
	2937	case _CC_XDIGIT:
	2938	return is_XDIGIT_high(p);
	2939	case _CC_CNTRL:
	2940	return 0;
	2941	case _CC_ASCII:
	2942	return 0;
	2943	case _CC_VERTSPACE:
	2944	return is_VERTWS_high(p);
	2945	case _CC_IDFIRST:
	2946	if (! PL_utf8_perl_idstart) {
	2947	PL_utf8_perl_idstart
	2948	= _new_invlist_C_array(_Perl_IDStart_invlist);
	2949	}
	2950	return is_utf8_common(p, &PL_utf8_perl_idstart,
	2951	"_Perl_IDStart", NULL);
	2952	case _CC_IDCONT:
	2953	if (! PL_utf8_perl_idcont) {
	2954	PL_utf8_perl_idcont
	2955	= _new_invlist_C_array(_Perl_IDCont_invlist);
	2956	}
	2957	return is_utf8_common(p, &PL_utf8_perl_idcont,
	2958	"_Perl_IDCont", NULL);
	2959	}
	2960	}
	2961
	2962	/* idcont is the same as wordchar below 256 */
	2963	if (classnum == _CC_IDCONT) {
	2964	classnum = _CC_WORDCHAR;
	2965	}
	2966	else if (classnum == _CC_IDFIRST) {
	2967	if (*p == '_') {
	2968	return TRUE;
	2969	}
	2970	classnum = _CC_ALPHA;
	2971	}
	2972
	2973	if (! use_locale) {
	2974	if (! use_utf8 \|\| UTF8_IS_INVARIANT(*p)) {
	2975	return _generic_isCC(*p, classnum);
	2976	}
	2977
	2978	return _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(p, (p + 1 )), classnum);
	2979	}
	2980	else {
	2981	if (! use_utf8 \|\| UTF8_IS_INVARIANT(*p)) {
	2982	return isFOO_lc(classnum, *p);
	2983	}
	2984
	2985	return isFOO_lc(classnum, EIGHT_BIT_UTF8_TO_NATIVE(p, (p + 1 )));
	2986	}
	2987
	2988	NOT_REACHED; /* NOTREACHED */
	2989	}
	2990
	2991	bool
	2992	Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
	2993	const U8 * const e)
	2994	{
	2995	PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
	2996
	2997	assert(classnum < _FIRST_NON_SWASH_CC);
	2998
	2999	return is_utf8_common_with_len(p,
	3000	e,
	3001	&PL_utf8_swash_ptrs[classnum],
	3002	swash_property_names[classnum],
	3003	PL_XPosix_ptrs[classnum]);
	3004	}
	3005
	3006	bool
	3007	Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 p, const U8 const e)
	3008	{
	3009	SV* invlist = NULL;
	3010
	3011	PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
	3012
	3013	if (! PL_utf8_perl_idstart) {
	3014	invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
	3015	}
	3016	return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart,
	3017	"_Perl_IDStart", invlist);
	3018	}
	3019
	3020	bool
	3021	Perl__is_utf8_xidstart(pTHX_ const U8 *p)
	3022	{
	3023	PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
	3024
	3025	if (*p == '_')
	3026	return TRUE;
	3027	return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
	3028	}
	3029
	3030	bool
	3031	Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 p, const U8 const e)
	3032	{
	3033	SV* invlist = NULL;
	3034
	3035	PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
	3036
	3037	if (! PL_utf8_perl_idcont) {
	3038	invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
	3039	}
	3040	return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont,
	3041	"_Perl_IDCont", invlist);
	3042	}
	3043
	3044	bool
	3045	Perl__is_utf8_idcont(pTHX_ const U8 *p)
	3046	{
	3047	PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
	3048
	3049	return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
	3050	}
	3051
	3052	bool
	3053	Perl__is_utf8_xidcont(pTHX_ const U8 *p)
	3054	{
	3055	PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
	3056
	3057	return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
	3058	}
	3059
	3060	bool
	3061	Perl__is_utf8_mark(pTHX_ const U8 *p)
	3062	{
	3063	PERL_ARGS_ASSERT__IS_UTF8_MARK;
	3064
	3065	return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
	3066	}
	3067
	3068	/* change namve uv1 to 'from' */
	3069	STATIC UV
	3070	S__to_utf8_case(pTHX_ const UV uv1, const U8 p, U8 ustrp, STRLEN *lenp,
	3071	SV *swashp, const char normal, const char *special)
	3072	{
	3073	STRLEN len = 0;
	3074
	3075	PERL_ARGS_ASSERT__TO_UTF8_CASE;
	3076
	3077	/* For code points that don't change case, we already know that the output
	3078	* of this function is the unchanged input, so we can skip doing look-ups
	3079	* for them. Unfortunately the case-changing code points are scattered
	3080	* around. But there are some long consecutive ranges where there are no
	3081	* case changing code points. By adding tests, we can eliminate the lookup
	3082	* for all the ones in such ranges. This is currently done here only for
	3083	* just a few cases where the scripts are in common use in modern commerce
	3084	* (and scripts adjacent to those which can be included without additional
	3085	* tests). */
	3086
	3087	if (uv1 >= 0x0590) {
	3088	/* This keeps from needing further processing the code points most
	3089	* likely to be used in the following non-cased scripts: Hebrew,
	3090	* Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
	3091	* Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
	3092	* Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
	3093	if (uv1 < 0x10A0) {
	3094	goto cases_to_self;
	3095	}
	3096
	3097	/* The following largish code point ranges also don't have case
	3098	* changes, but khw didn't think they warranted extra tests to speed
	3099	* them up (which would slightly slow down everything else above them):
	3100	* 1100..139F Hangul Jamo, Ethiopic
	3101	* 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
	3102	* Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
	3103	* Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
	3104	* Combining Diacritical Marks Extended, Balinese,
	3105	* Sundanese, Batak, Lepcha, Ol Chiki
	3106	* 2000..206F General Punctuation
	3107	*/
	3108
	3109	if (uv1 >= 0x2D30) {
	3110
	3111	/* This keeps the from needing further processing the code points
	3112	* most likely to be used in the following non-cased major scripts:
	3113	* CJK, Katakana, Hiragana, plus some less-likely scripts.
	3114	*
	3115	* (0x2D30 above might have to be changed to 2F00 in the unlikely
	3116	* event that Unicode eventually allocates the unused block as of
	3117	* v8.0 2FE0..2FEF to code points that are cased. khw has verified
	3118	* that the test suite will start having failures to alert you
	3119	* should that happen) */
	3120	if (uv1 < 0xA640) {
	3121	goto cases_to_self;
	3122	}
	3123
	3124	if (uv1 >= 0xAC00) {
	3125	if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
	3126	if (ckWARN_d(WARN_SURROGATE)) {
	3127	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	3128	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	3129	"Operation \"%s\" returns its argument for"
	3130	" UTF-16 surrogate U+%04" UVXf, desc, uv1);
	3131	}
	3132	goto cases_to_self;
	3133	}
	3134
	3135	/* AC00..FAFF Catches Hangul syllables and private use, plus
	3136	* some others */
	3137	if (uv1 < 0xFB00) {
	3138	goto cases_to_self;
	3139
	3140	}
	3141
	3142	if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
	3143	if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) {
	3144	Perl_croak(aTHX_ cp_above_legal_max, uv1,
	3145	MAX_EXTERNALLY_LEGAL_CP);
	3146	}
	3147	if (ckWARN_d(WARN_NON_UNICODE)) {
	3148	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	3149	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	3150	"Operation \"%s\" returns its argument for"
	3151	" non-Unicode code point 0x%04" UVXf, desc, uv1);
	3152	}
	3153	goto cases_to_self;
	3154	}
	3155	#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
	3156	if (UNLIKELY(uv1
	3157	> HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
	3158	{
	3159
	3160	/* As of Unicode 10.0, this means we avoid swash creation
	3161	* for anything beyond high Plane 1 (below emojis) */
	3162	goto cases_to_self;
	3163	}
	3164	#endif
	3165	}
	3166	}
	3167
	3168	/* Note that non-characters are perfectly legal, so no warning should
	3169	* be given. There are so few of them, that it isn't worth the extra
	3170	* tests to avoid swash creation */
	3171	}
	3172
	3173	if (!swashp) / load on-demand */
	3174	*swashp = _core_swash_init("utf8", normal, &PL_sv_undef,
	3175	4, 0, NULL, NULL);
	3176
	3177	if (special) {
	3178	/* It might be "special" (sometimes, but not always,
	3179	* a multicharacter mapping) */
	3180	HV *hv = NULL;
	3181	SV **svp;
	3182
	3183	/* If passed in the specials name, use that; otherwise use any
	3184	* given in the swash */
	3185	if (*special != '\0') {
	3186	hv = get_hv(special, 0);
	3187	}
	3188	else {
	3189	svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
	3190	if (svp) {
	3191	hv = MUTABLE_HV(SvRV(*svp));
	3192	}
	3193	}
	3194
	3195	if (hv
	3196	&& (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
	3197	&& (*svp))
	3198	{
	3199	const char *s;
	3200
	3201	s = SvPV_const(*svp, len);
	3202	if (len == 1)
	3203	/* EIGHTBIT */
	3204	len = uvchr_to_utf8(ustrp, (U8)s) - ustrp;
	3205	else {
	3206	Copy(s, ustrp, len, U8);
	3207	}
	3208	}
	3209	}
	3210
	3211	if (!len && *swashp) {
	3212	const UV uv2 = swash_fetch(swashp, p, TRUE / => is UTF-8 */);
	3213
	3214	if (uv2) {
	3215	/* It was "normal" (a single character mapping). */
	3216	len = uvchr_to_utf8(ustrp, uv2) - ustrp;
	3217	}
	3218	}
	3219
	3220	if (len) {
	3221	if (lenp) {
	3222	*lenp = len;
	3223	}
	3224	return valid_utf8_to_uvchr(ustrp, 0);
	3225	}
	3226
	3227	/* Here, there was no mapping defined, which means that the code point maps
	3228	* to itself. Return the inputs */
	3229	cases_to_self:
	3230	len = UTF8SKIP(p);
	3231	if (p != ustrp) { /* Don't copy onto itself */
	3232	Copy(p, ustrp, len, U8);
	3233	}
	3234
	3235	if (lenp)
	3236	*lenp = len;
	3237
	3238	return uv1;
	3239
	3240	}
	3241
	3242	STATIC UV
	3243	S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
	3244	U8* const ustrp, STRLEN *lenp)
	3245	{
	3246	/* This is called when changing the case of a UTF-8-encoded character above
	3247	* the Latin1 range, and the operation is in a non-UTF-8 locale. If the
	3248	* result contains a character that crosses the 255/256 boundary, disallow
	3249	* the change, and return the original code point. See L<perlfunc/lc> for
	3250	* why;
	3251	*
	3252	* p points to the original string whose case was changed; assumed
	3253	* by this routine to be well-formed
	3254	* result the code point of the first character in the changed-case string
	3255	* ustrp points to the changed-case string (<result> represents its
	3256	* first char)
	3257	* lenp points to the length of <ustrp> */
	3258
	3259	UV original; /* To store the first code point of <p> */
	3260
	3261	PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
	3262
	3263	assert(UTF8_IS_ABOVE_LATIN1(*p));
	3264
	3265	/* We know immediately if the first character in the string crosses the
	3266	* boundary, so can skip */
	3267	if (result > 255) {
	3268
	3269	/* Look at every character in the result; if any cross the
	3270	* boundary, the whole thing is disallowed */
	3271	U8* s = ustrp + UTF8SKIP(ustrp);
	3272	U8* e = ustrp + *lenp;
	3273	while (s < e) {
	3274	if (! UTF8_IS_ABOVE_LATIN1(*s)) {
	3275	goto bad_crossing;
	3276	}
	3277	s += UTF8SKIP(s);
	3278	}
	3279
	3280	/* Here, no characters crossed, result is ok as-is, but we warn. */
	3281	_CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
	3282	return result;
	3283	}
	3284
	3285	bad_crossing:
	3286
	3287	/* Failed, have to return the original */
	3288	original = valid_utf8_to_uvchr(p, lenp);
	3289
	3290	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3291	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3292	"Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
	3293	" locale; resolved to \"\\x{%" UVXf "}\".",
	3294	OP_DESC(PL_op),
	3295	original,
	3296	original);
	3297	Copy(p, ustrp, *lenp, char);
	3298	return original;
	3299	}
	3300
	3301	STATIC U32
	3302	S_check_and_deprecate(pTHX_ const U8 *p,
	3303	const U8 **e,
	3304	const unsigned int type, /* See below */
	3305	const bool use_locale, /* Is this a 'LC_'
	3306	macro call? */
	3307	const char * const file,
	3308	const unsigned line)
	3309	{
	3310	/* This is a temporary function to deprecate the unsafe calls to the case
	3311	* changing macros and functions. It keeps all the special stuff in just
	3312	* one place.
	3313	*
	3314	* It updates *e with the pointer to the end of the input string. If using
	3315	* the old-style macros, *e is NULL on input, and so this function assumes
	3316	* the input string is long enough to hold the entire UTF-8 sequence, and
	3317	* sets *e accordingly, but it then returns a flag to pass the
	3318	* utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
	3319	* using the full length if possible.
	3320	*
	3321	* It also does the assert that e > p when e is not NULL. This should be
	3322	* migrated to the callers when this function gets deleted.
	3323	*
	3324	* The 'type' parameter is used for the caller to specify which case
	3325	* changing function this is called from: */
	3326
	3327	# define DEPRECATE_TO_UPPER 0
	3328	# define DEPRECATE_TO_TITLE 1
	3329	# define DEPRECATE_TO_LOWER 2
	3330	# define DEPRECATE_TO_FOLD 3
	3331
	3332	U32 utf8n_flags = 0;
	3333	const char * name;
	3334	const char * alternative;
	3335
	3336	PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
	3337
	3338	if (*e == NULL) {
	3339	utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
	3340	*e = p + UTF8SKIP(p);
	3341
	3342	/* For mathoms.c calls, we use the function name we know is stored
	3343	* there. It could be part of a larger path */
	3344	if (type == DEPRECATE_TO_UPPER) {
	3345	name = instr(file, "mathoms.c")
	3346	? "to_utf8_upper"
	3347	: "toUPPER_utf8";
	3348	alternative = "toUPPER_utf8_safe";
	3349	}
	3350	else if (type == DEPRECATE_TO_TITLE) {
	3351	name = instr(file, "mathoms.c")
	3352	? "to_utf8_title"
	3353	: "toTITLE_utf8";
	3354	alternative = "toTITLE_utf8_safe";
	3355	}
	3356	else if (type == DEPRECATE_TO_LOWER) {
	3357	name = instr(file, "mathoms.c")
	3358	? "to_utf8_lower"
	3359	: "toLOWER_utf8";
	3360	alternative = "toLOWER_utf8_safe";
	3361	}
	3362	else if (type == DEPRECATE_TO_FOLD) {
	3363	name = instr(file, "mathoms.c")
	3364	? "to_utf8_fold"
	3365	: "toFOLD_utf8";
	3366	alternative = "toFOLD_utf8_safe";
	3367	}
	3368	else Perl_croak(aTHX_ "panic: Unexpected case change type");
	3369
	3370	warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
	3371	}
	3372	else {
	3373	assert (p < *e);
	3374	}
	3375
	3376	return utf8n_flags;
	3377	}
	3378
	3379	/* The process for changing the case is essentially the same for the four case
	3380	* change types, except there are complications for folding. Otherwise the
	3381	* difference is only which case to change to. To make sure that they all do
	3382	* the same thing, the bodies of the functions are extracted out into the
	3383	* following two macros. The functions are written with the same variable
	3384	* names, and these are known and used inside these macros. It would be
	3385	* better, of course, to have inline functions to do it, but since different
	3386	* macros are called, depending on which case is being changed to, this is not
	3387	* feasible in C (to khw's knowledge). Two macros are created so that the fold
	3388	* function can start with the common start macro, then finish with its special
	3389	* handling; while the other three cases can just use the common end macro.
	3390	*
	3391	* The algorithm is to use the proper (passed in) macro or function to change
	3392	* the case for code points that are below 256. The macro is used if using
	3393	* locale rules for the case change; the function if not. If the code point is
	3394	* above 255, it is computed from the input UTF-8, and another macro is called
	3395	* to do the conversion. If necessary, the output is converted to UTF-8. If
	3396	* using a locale, we have to check that the change did not cross the 255/256
	3397	* boundary, see check_locale_boundary_crossing() for further details.
	3398	*
	3399	* The macros are split with the correct case change for the below-256 case
	3400	* stored into 'result', and in the middle of an else clause for the above-255
	3401	* case. At that point in the 'else', 'result' is not the final result, but is
	3402	* the input code point calculated from the UTF-8. The fold code needs to
	3403	* realize all this and take it from there.
	3404	*
	3405	* If you read the two macros as sequential, it's easier to understand what's
	3406	* going on. */
	3407	#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \
	3408	L1_func_extra_param) \
	3409	\
	3410	if (flags & (locale_flags)) { \
	3411	/* Treat a UTF-8 locale as not being in locale at all */ \
	3412	if (IN_UTF8_CTYPE_LOCALE) { \
	3413	flags &= ~(locale_flags); \
	3414	} \
	3415	else { \
	3416	_CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
	3417	} \
	3418	} \
	3419	\
	3420	if (UTF8_IS_INVARIANT(*p)) { \
	3421	if (flags & (locale_flags)) { \
	3422	result = LC_L1_change_macro(*p); \
	3423	} \
	3424	else { \
	3425	return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
	3426	} \
	3427	} \
	3428	else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \
	3429	if (flags & (locale_flags)) { \
	3430	result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \
	3431	*(p+1))); \
	3432	} \
	3433	else { \
	3434	return L1_func(EIGHT_BIT_UTF8_TO_NATIVE(p, (p+1)), \
	3435	ustrp, lenp, L1_func_extra_param); \
	3436	} \
	3437	} \
	3438	else { /* malformed UTF-8 or ord above 255 */ \
	3439	STRLEN len_result; \
	3440	result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
	3441	if (len_result == (STRLEN) -1) { \
	3442	_force_out_malformed_utf8_message(p, e, utf8n_flags, \
	3443	1 /* Die */ ); \
	3444	}
	3445
	3446	#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
	3447	result = change_macro(result, p, ustrp, lenp); \
	3448	\
	3449	if (flags & (locale_flags)) { \
	3450	result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
	3451	} \
	3452	return result; \
	3453	} \
	3454	\
	3455	/* Here, used locale rules. Convert back to UTF-8 */ \
	3456	if (UTF8_IS_INVARIANT(result)) { \
	3457	*ustrp = (U8) result; \
	3458	*lenp = 1; \
	3459	} \
	3460	else { \
	3461	*ustrp = UTF8_EIGHT_BIT_HI((U8) result); \
	3462	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \
	3463	*lenp = 2; \
	3464	} \
	3465	\
	3466	return result;
	3467
	3468	/*
	3469	=for apidoc to_utf8_upper
	3470
	3471	Instead use L</toUPPER_utf8_safe>.
	3472
	3473	=cut */
	3474
/* Not currently externally documented, and subject to change:
 * <flags> is set iff the rules from the current underlying locale are to
 * be used. */
	3478
	3479	UV
	3480	Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
	3481	const U8 *e,
	3482	U8* ustrp,
	3483	STRLEN *lenp,
	3484	bool flags,
	3485	const char * const file,
	3486	const int line)
	3487	{
	3488	UV result;
	3489	const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
	3490	cBOOL(flags), file, line);
	3491
	3492	PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
	3493
	3494	/* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
	3495	/* 2nd char of uc(U+DF) is 'S' */
	3496	CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S');
	3497	CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
	3498	}
	3499
	3500	/*
	3501	=for apidoc to_utf8_title
	3502
	3503	Instead use L</toTITLE_utf8_safe>.
	3504
	3505	=cut */
	3506
	3507	/* Not currently externally documented, and subject to change:
	3508	* <flags> is set iff the rules from the current underlying locale are to be
	3509	* used. Since titlecase is not defined in POSIX, for other than a
	3510	* UTF-8 locale, uppercase is used instead for code points < 256.
	3511	*/
	3512
	3513	UV
	3514	Perl__to_utf8_title_flags(pTHX_ const U8 *p,
	3515	const U8 *e,
	3516	U8* ustrp,
	3517	STRLEN *lenp,
	3518	bool flags,
	3519	const char * const file,
	3520	const int line)
	3521	{
	3522	UV result;
	3523	const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
	3524	cBOOL(flags), file, line);
	3525
	3526	PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
	3527
	3528	/* 2nd char of ucfirst(U+DF) is 's' */
	3529	CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's');
	3530	CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
	3531	}
	3532
	3533	/*
	3534	=for apidoc to_utf8_lower
	3535
	3536	Instead use L</toLOWER_utf8_safe>.
	3537
	3538	=cut */
	3539
/* Not currently externally documented, and subject to change:
 * <flags> is set iff the rules from the current underlying locale are to
 * be used.
 */
	3544
	3545	UV
	3546	Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
	3547	const U8 *e,
	3548	U8* ustrp,
	3549	STRLEN *lenp,
	3550	bool flags,
	3551	const char * const file,
	3552	const int line)
	3553	{
	3554	UV result;
	3555	const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
	3556	cBOOL(flags), file, line);
	3557
	3558	PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
	3559
	3560	CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */)
	3561	CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
	3562	}
	3563
	3564	/*
	3565	=for apidoc to_utf8_fold
	3566
	3567	Instead use L</toFOLD_utf8_safe>.
	3568
	3569	=cut */
	3570
	3571	/* Not currently externally documented, and subject to change,
	3572	* in <flags>
	3573	* bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
	3574	* locale are to be used.
	3575	* bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
	3576	* otherwise simple folds
	3577	* bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
	3578	* prohibited
	3579	*/
	3580
	3581	UV
	3582	Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
	3583	const U8 *e,
	3584	U8* ustrp,
	3585	STRLEN *lenp,
	3586	U8 flags,
	3587	const char * const file,
	3588	const int line)
	3589	{
	3590	UV result;
	3591	const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
	3592	cBOOL(flags), file, line);
	3593
	3594	PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
	3595
	3596	/* These are mutually exclusive */
	3597	assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
	3598
	3599	assert(p != ustrp); /* Otherwise overwrites */
	3600
	3601	CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
	3602	((flags) & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII)));
	3603
	3604	result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
	3605
	3606	if (flags & FOLD_FLAGS_LOCALE) {
	3607
	3608	# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
	3609	# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
	3610	# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
	3611
	3612	/* Special case these two characters, as what normally gets
	3613	* returned under locale doesn't work */
	3614	if (memEQs((char *) p, UTF8SKIP(p), CAP_SHARP_S))
	3615	{
	3616	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3617	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3618	"Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
	3619	"resolved to \"\\x{17F}\\x{17F}\".");
	3620	goto return_long_s;
	3621	}
	3622	else
	3623	#endif
	3624	if (memEQs((char *) p, UTF8SKIP(p), LONG_S_T))
	3625	{
	3626	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3627	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3628	"Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
	3629	"resolved to \"\\x{FB06}\".");
	3630	goto return_ligature_st;
	3631	}
	3632
	3633	#if UNICODE_MAJOR_VERSION == 3 \
	3634	&& UNICODE_DOT_VERSION == 0 \
	3635	&& UNICODE_DOT_DOT_VERSION == 1
	3636	# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
	3637
	3638	/* And special case this on this Unicode version only, for the same
	3639	* reaons the other two are special cased. They would cross the
	3640	* 255/256 boundary which is forbidden under /l, and so the code
	3641	* wouldn't catch that they are equivalent (which they are only in
	3642	* this release) */
	3643	else if (memEQs((char *) p, UTF8SKIP(p), DOTTED_I)) {
	3644	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3645	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3646	"Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
	3647	"resolved to \"\\x{0131}\".");
	3648	goto return_dotless_i;
	3649	}
	3650	#endif
	3651
	3652	return check_locale_boundary_crossing(p, result, ustrp, lenp);
	3653	}
	3654	else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
	3655	return result;
	3656	}
	3657	else {
	3658	/* This is called when changing the case of a UTF-8-encoded
	3659	* character above the ASCII range, and the result should not
	3660	* contain an ASCII character. */
	3661
	3662	UV original; /* To store the first code point of <p> */
	3663
	3664	/* Look at every character in the result; if any cross the
	3665	* boundary, the whole thing is disallowed */
	3666	U8* s = ustrp;
	3667	U8* e = ustrp + *lenp;
	3668	while (s < e) {
	3669	if (isASCII(*s)) {
	3670	/* Crossed, have to return the original */
	3671	original = valid_utf8_to_uvchr(p, lenp);
	3672
	3673	/* But in these instances, there is an alternative we can
	3674	* return that is valid */
	3675	if (original == LATIN_SMALL_LETTER_SHARP_S
	3676	#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
	3677	\|\| original == LATIN_CAPITAL_LETTER_SHARP_S
	3678	#endif
	3679	) {
	3680	goto return_long_s;
	3681	}
	3682	else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
	3683	goto return_ligature_st;
	3684	}
	3685	#if UNICODE_MAJOR_VERSION == 3 \
	3686	&& UNICODE_DOT_VERSION == 0 \
	3687	&& UNICODE_DOT_DOT_VERSION == 1
	3688
	3689	else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
	3690	goto return_dotless_i;
	3691	}
	3692	#endif
	3693	Copy(p, ustrp, *lenp, char);
	3694	return original;
	3695	}
	3696	s += UTF8SKIP(s);
	3697	}
	3698
	3699	/* Here, no characters crossed, result is ok as-is */
	3700	return result;
	3701	}
	3702	}
	3703
	3704	/* Here, used locale rules. Convert back to UTF-8 */
	3705	if (UTF8_IS_INVARIANT(result)) {
	3706	*ustrp = (U8) result;
	3707	*lenp = 1;
	3708	}
	3709	else {
	3710	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
	3711	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
	3712	*lenp = 2;
	3713	}
	3714
	3715	return result;
	3716
	3717	return_long_s:
	3718	/* Certain folds to 'ss' are prohibited by the options, but they do allow
	3719	* folds to a string of two of these characters. By returning this
	3720	* instead, then, e.g.,
	3721	* fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
	3722	* works. */
	3723
	3724	lenp = 2 sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
	3725	Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
	3726	ustrp, *lenp, U8);
	3727	return LATIN_SMALL_LETTER_LONG_S;
	3728
	3729	return_ligature_st:
	3730	/* Two folds to 'st' are prohibited by the options; instead we pick one and
	3731	* have the other one fold to it */
	3732
	3733	*lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
	3734	Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
	3735	return LATIN_SMALL_LIGATURE_ST;
	3736
	3737	#if UNICODE_MAJOR_VERSION == 3 \
	3738	&& UNICODE_DOT_VERSION == 0 \
	3739	&& UNICODE_DOT_DOT_VERSION == 1
	3740
	3741	return_dotless_i:
	3742	*lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
	3743	Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
	3744	return LATIN_SMALL_LETTER_DOTLESS_I;
	3745
	3746	#endif
	3747
	3748	}
	3749
	3750	/* Note:
	3751	* Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
	3752	* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
	3753	* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
	3754	*/
	3755
	3756	SV*
	3757	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
	3758	I32 minbits, I32 none)
	3759	{
	3760	PERL_ARGS_ASSERT_SWASH_INIT;
	3761
	3762	/* Returns a copy of a swash initiated by the called function. This is the
	3763	* public interface, and returning a copy prevents others from doing
	3764	* mischief on the original */
	3765
	3766	return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none,
	3767	NULL, NULL));
	3768	}
	3769
	3770	SV*
	3771	Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
	3772	I32 minbits, I32 none, SV* invlist,
	3773	U8* const flags_p)
	3774	{
	3775
	3776	/*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
	3777	* use the following define */
	3778
	3779	#define CORE_SWASH_INIT_RETURN(x) \
	3780	PL_curpm= old_PL_curpm; \
	3781	return x
	3782
	3783	/* Initialize and return a swash, creating it if necessary. It does this
	3784	* by calling utf8_heavy.pl in the general case. The returned value may be
	3785	* the swash's inversion list instead if the input parameters allow it.
	3786	* Which is returned should be immaterial to callers, as the only
	3787	* operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
	3788	* and swash_to_invlist() handle both these transparently.
	3789	*
	3790	* This interface should only be used by functions that won't destroy or
	3791	* adversely change the swash, as doing so affects all other uses of the
	3792	* swash in the program; the general public should use 'Perl_swash_init'
	3793	* instead.
	3794	*
	3795	* pkg is the name of the package that <name> should be in.
	3796	* name is the name of the swash to find. Typically it is a Unicode
	3797	* property name, including user-defined ones
	3798	* listsv is a string to initialize the swash with. It must be of the form
	3799	* documented as the subroutine return value in
	3800	* L<perlunicode/User-Defined Character Properties>
	3801	* minbits is the number of bits required to represent each data element.
	3802	* It is '1' for binary properties.
	3803	* none I (khw) do not understand this one, but it is used only in tr///.
	3804	* invlist is an inversion list to initialize the swash with (or NULL)
	3805	* flags_p if non-NULL is the address of various input and output flag bits
	3806	* to the routine, as follows: ('I' means is input to the routine;
	3807	* 'O' means output from the routine. Only flags marked O are
	3808	* meaningful on return.)
	3809	* _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
	3810	* came from a user-defined property. (I O)
	3811	* _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
	3812	* when the swash cannot be located, to simply return NULL. (I)
	3813	* _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
	3814	* return of an inversion list instead of a swash hash if this routine
	3815	* thinks that would result in faster execution of swash_fetch() later
	3816	* on. (I)
	3817	*
	3818	* Thus there are three possible inputs to find the swash: <name>,
	3819	* <listsv>, and <invlist>. At least one must be specified. The result
	3820	* will be the union of the specified ones, although <listsv>'s various
	3821	* actions can intersect, etc. what <name> gives. To avoid going out to
	3822	* disk at all, <invlist> should specify completely what the swash should
	3823	* have, and <listsv> should be &PL_sv_undef and <name> should be "".
	3824	*
	3825	* <invlist> is only valid for binary properties */
	3826
	3827	PMOP old_PL_curpm= PL_curpm; / save away the old PL_curpm */
	3828
	3829	SV* retval = &PL_sv_undef;
	3830	HV* swash_hv = NULL;
	3831	const int invlist_swash_boundary =
	3832	(flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
	3833	? 512 /* Based on some benchmarking, but not extensive, see commit
	3834	message */
	3835	: -1; /* Never return just an inversion list */
	3836
	3837	assert(listsv != &PL_sv_undef \|\| strNE(name, "") \|\| invlist);
	3838	assert(! invlist \|\| minbits == 1);
	3839
	3840	PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the
	3841	regex that triggered the swash init and the swash init
	3842	perl logic itself. See perl #122747 */
	3843
	3844	/* If data was passed in to go out to utf8_heavy to find the swash of, do
	3845	* so */
	3846	if (listsv != &PL_sv_undef \|\| strNE(name, "")) {
	3847	dSP;
	3848	const size_t pkg_len = strlen(pkg);
	3849	const size_t name_len = strlen(name);
	3850	HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
	3851	SV* errsv_save;
	3852	GV *method;
	3853
	3854	PERL_ARGS_ASSERT__CORE_SWASH_INIT;
	3855
	3856	PUSHSTACKi(PERLSI_MAGIC);
	3857	ENTER;
	3858	SAVEHINTS();
	3859	save_re_context();
	3860	/* We might get here via a subroutine signature which uses a utf8
	3861	* parameter name, at which point PL_subname will have been set
	3862	* but not yet used. */
	3863	save_item(PL_subname);
	3864	if (PL_parser && PL_parser->error_count)
	3865	SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
	3866	method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
	3867	if (!method) { /* demand load UTF-8 */
	3868	ENTER;
	3869	if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
	3870	GvSV(PL_errgv) = NULL;
	3871	#ifndef NO_TAINT_SUPPORT
	3872	/* It is assumed that callers of this routine are not passing in
	3873	* any user derived data. */
	3874	/* Need to do this after save_re_context() as it will set
	3875	* PL_tainted to 1 while saving $1 etc (see the code after getrx:
	3876	* in Perl_magic_get). Even line to create errsv_save can turn on
	3877	* PL_tainted. */
	3878	SAVEBOOL(TAINT_get);
	3879	TAINT_NOT;
	3880	#endif
	3881	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	3882	NULL);
	3883	{
	3884	/* Not ERRSV, as there is no need to vivify a scalar we are
	3885	about to discard. */
	3886	SV * const errsv = GvSV(PL_errgv);
	3887	if (!SvTRUE(errsv)) {
	3888	GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
	3889	SvREFCNT_dec(errsv);
	3890	}
	3891	}
	3892	LEAVE;
	3893	}
	3894	SPAGAIN;
	3895	PUSHMARK(SP);
	3896	EXTEND(SP,5);
	3897	mPUSHp(pkg, pkg_len);
	3898	mPUSHp(name, name_len);
	3899	PUSHs(listsv);
	3900	mPUSHi(minbits);
	3901	mPUSHi(none);
	3902	PUTBACK;
	3903	if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
	3904	GvSV(PL_errgv) = NULL;
	3905	/* If we already have a pointer to the method, no need to use
	3906	* call_method() to repeat the lookup. */
	3907	if (method
	3908	? call_sv(MUTABLE_SV(method), G_SCALAR)
	3909	: call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR \| G_METHOD))
	3910	{
	3911	retval = *PL_stack_sp--;
	3912	SvREFCNT_inc(retval);
	3913	}
	3914	{
	3915	/* Not ERRSV. See above. */
	3916	SV * const errsv = GvSV(PL_errgv);
	3917	if (!SvTRUE(errsv)) {
	3918	GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
	3919	SvREFCNT_dec(errsv);
	3920	}
	3921	}
	3922	LEAVE;
	3923	POPSTACK;
	3924	if (IN_PERL_COMPILETIME) {
	3925	CopHINTS_set(PL_curcop, PL_hints);
	3926	}
	3927	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	3928	if (SvPOK(retval)) {
	3929
	3930	/* If caller wants to handle missing properties, let them */
	3931	if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
	3932	CORE_SWASH_INIT_RETURN(NULL);
	3933	}
	3934	Perl_croak(aTHX_
	3935	"Can't find Unicode property definition \"%" SVf "\"",
	3936	SVfARG(retval));
	3937	NOT_REACHED; /* NOTREACHED */
	3938	}
	3939	}
	3940	} /* End of calling the module to find the swash */
	3941
	3942	/* If this operation fetched a swash, and we will need it later, get it */
	3943	if (retval != &PL_sv_undef
	3944	&& (minbits == 1 \|\| (flags_p
	3945	&& ! (*flags_p
	3946	& _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
	3947	{
	3948	swash_hv = MUTABLE_HV(SvRV(retval));
	3949
	3950	/* If we don't already know that there is a user-defined component to
	3951	* this swash, and the user has indicated they wish to know if there is
	3952	* one (by passing <flags_p>), find out */
	3953	if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
	3954	SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
	3955	if (user_defined && SvUV(*user_defined)) {
	3956	*flags_p \|= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
	3957	}
	3958	}
	3959	}
	3960
	3961	/* Make sure there is an inversion list for binary properties */
	3962	if (minbits == 1) {
	3963	SV** swash_invlistsvp = NULL;
	3964	SV* swash_invlist = NULL;
	3965	bool invlist_in_swash_is_valid = FALSE;
	3966	bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
	3967	an unclaimed reference count */
	3968
	3969	/* If this operation fetched a swash, get its already existing
	3970	* inversion list, or create one for it */
	3971
	3972	if (swash_hv) {
	3973	swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
	3974	if (swash_invlistsvp) {
	3975	swash_invlist = *swash_invlistsvp;
	3976	invlist_in_swash_is_valid = TRUE;
	3977	}
	3978	else {
	3979	swash_invlist = _swash_to_invlist(retval);
	3980	swash_invlist_unclaimed = TRUE;
	3981	}
	3982	}
	3983
	3984	/* If an inversion list was passed in, have to include it */
	3985	if (invlist) {
	3986
	3987	/* Any fetched swash will by now have an inversion list in it;
	3988	* otherwise <swash_invlist> will be NULL, indicating that we
	3989	* didn't fetch a swash */
	3990	if (swash_invlist) {
	3991
	3992	/* Add the passed-in inversion list, which invalidates the one
	3993	* already stored in the swash */
	3994	invlist_in_swash_is_valid = FALSE;
	3995	SvREADONLY_off(swash_invlist); /* Turned on again below */
	3996	_invlist_union(invlist, swash_invlist, &swash_invlist);
	3997	}
	3998	else {
	3999
	4000	/* Here, there is no swash already. Set up a minimal one, if
	4001	* we are going to return a swash */
	4002	if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
	4003	swash_hv = newHV();
	4004	retval = newRV_noinc(MUTABLE_SV(swash_hv));
	4005	}
	4006	swash_invlist = invlist;
	4007	}
	4008	}
	4009
	4010	/* Here, we have computed the union of all the passed-in data. It may
	4011	* be that there was an inversion list in the swash which didn't get
	4012	* touched; otherwise save the computed one */
	4013	if (! invlist_in_swash_is_valid
	4014	&& (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
	4015	{
	4016	if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
	4017	{
	4018	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	4019	}
	4020	/* We just stole a reference count. */
	4021	if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
	4022	else SvREFCNT_inc_simple_void_NN(swash_invlist);
	4023	}
	4024
	4025	/* The result is immutable. Forbid attempts to change it. */
	4026	SvREADONLY_on(swash_invlist);
	4027
	4028	/* Use the inversion list stand-alone if small enough */
	4029	if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
	4030	SvREFCNT_dec(retval);
	4031	if (!swash_invlist_unclaimed)
	4032	SvREFCNT_inc_simple_void_NN(swash_invlist);
	4033	retval = newRV_noinc(swash_invlist);
	4034	}
	4035	}
	4036
	4037	CORE_SWASH_INIT_RETURN(retval);
	4038	#undef CORE_SWASH_INIT_RETURN
	4039	}
	4040
	4041
	4042	/* This API is wrong for special case conversions since we may need to
	4043	* return several Unicode characters for a single Unicode character
	4044	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	4045	* the lower-level routine, and it is similarly broken for returning
	4046	* multiple values. --jhi
	4047	* For those, you should use S__to_utf8_case() instead */
	4048	/* SWASHGET has since been recast as S_swatch_get in this file. */
	4049
	4050	/* Note:
	4051	* Returns the value of property/mapping C<swash> for the first character
	4052	* of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
	4053	* assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
	4054	* is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
	4055	*
	4056	* A "swash" is a hash which contains initially the keys/values set up by
	4057	* SWASHNEW. The purpose is to be able to completely represent a Unicode
	4058	* property for all possible code points. Things are stored in a compact form
	4059	* (see utf8_heavy.pl) so that calculation is required to find the actual
	4060	* property value for a given code point. As code points are looked up, new
	4061	* key/value pairs are added to the hash, so that the calculation doesn't have
	4062	* to ever be re-done. Further, each calculation is done, not just for the
	4063	* desired one, but for a whole block of code points adjacent to that one.
	4064	* For binary properties on ASCII machines, the block is usually for 64 code
	4065	* points, starting with a code point evenly divisible by 64. Thus if the
	4066	* property value for code point 257 is requested, the code goes out and
	4067	* calculates the property values for all 64 code points between 256 and 319,
	4068	* and stores these as a single 64-bit long bit vector, called a "swatch",
	4069	* under the key for code point 256. The key is the UTF-8 encoding for code
	4070	* point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
	4071	* for a code point is 13 bytes, the key will be 12 bytes long. If the value
	4072	* for code point 258 is then requested, this code realizes that it would be
	4073	* stored under the key for 256, and would find that value and extract the
	4074	* relevant bit, offset from 256.
	4075	*
	4076	* Non-binary properties are stored in as many bits as necessary to represent
	4077	* their values (32 currently, though the code is more general than that), not
	4078	* as single bits, but the principle is the same: the value for each key is a
	4079	* vector that encompasses the property values for all code points whose UTF-8
	4080	* representations are represented by the key. That is, for all code points
	4081	* whose UTF-8 representations are length N bytes, and the key is the first N-1
	4082	* bytes of that.
	4083	*/
	4084	UV
	4085	Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
	4086	{
	4087	HV *const hv = MUTABLE_HV(SvRV(swash));
	4088	U32 klen;
	4089	U32 off;
	4090	STRLEN slen = 0;
	4091	STRLEN needents;
	4092	const U8 *tmps = NULL;
	4093	SV *swatch;
	4094	const U8 c = *ptr;
	4095
	4096	PERL_ARGS_ASSERT_SWASH_FETCH;
	4097
	4098	/* If it really isn't a hash, it isn't really swash; must be an inversion
	4099	* list */
	4100	if (SvTYPE(hv) != SVt_PVHV) {
	4101	return _invlist_contains_cp((SV*)hv,
	4102	(do_utf8)
	4103	? valid_utf8_to_uvchr(ptr, NULL)
	4104	: c);
	4105	}
	4106
	4107	/* We store the values in a "swatch" which is a vec() value in a swash
	4108	* hash. Code points 0-255 are a single vec() stored with key length
	4109	* (klen) 0. All other code points have a UTF-8 representation
	4110	* 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
	4111	* share 0xAA..0xYY, which is the key in the hash to that vec. So the key
	4112	* length for them is the length of the encoded char - 1. ptr[klen] is the
	4113	* final byte in the sequence representing the character */
	4114	if (!do_utf8 \|\| UTF8_IS_INVARIANT(c)) {
	4115	klen = 0;
	4116	needents = 256;
	4117	off = c;
	4118	}
	4119	else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	4120	klen = 0;
	4121	needents = 256;
	4122	off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
	4123	}
	4124	else {
	4125	klen = UTF8SKIP(ptr) - 1;
	4126
	4127	/* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
	4128	* the vec is the final byte in the sequence. (In EBCDIC this is
	4129	* converted to I8 to get consecutive values.) To help you visualize
	4130	* all this:
	4131	* Straight 1047 After final byte
	4132	* UTF-8 UTF-EBCDIC I8 transform
	4133	* U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
	4134	* U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
	4135	* ...
	4136	* U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
	4137	* U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
	4138	* ...
	4139	* U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
	4140	* U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
	4141	* ...
	4142	* U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
	4143	* U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
	4144	* ...
	4145	* U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
	4146	* U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
	4147	*
	4148	* (There are no discontinuities in the elided (...) entries.)
	4149	* The UTF-8 key for these 33 code points is '\xD0' (which also is the
	4150	* key for the next 31, up through U+043F, whose UTF-8 final byte is
	4151	* \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
	4152	* The final UTF-8 byte, which ranges between \x80 and \xBF, is an
	4153	* index into the vec() swatch (after subtracting 0x80, which we
	4154	* actually do with an '&').
	4155	* In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
	4156	* code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
	4157	* dicontinuities which go away by transforming it into I8, and we
	4158	* effectively subtract 0xA0 to get the index. */
	4159	needents = (1 << UTF_ACCUMULATION_SHIFT);
	4160	off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
	4161	}
	4162
	4163	/*
	4164	* This single-entry cache saves about 1/3 of the UTF-8 overhead in test
	4165	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	4166	* it's nothing to sniff at.) Pity we usually come through at least
	4167	* two function calls to get here...
	4168	*
	4169	* NB: this code assumes that swatches are never modified, once generated!
	4170	*/
	4171
	4172	if (hv == PL_last_swash_hv &&
	4173	klen == PL_last_swash_klen &&
	4174	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	4175	{
	4176	tmps = PL_last_swash_tmps;
	4177	slen = PL_last_swash_slen;
	4178	}
	4179	else {
	4180	/* Try our second-level swatch cache, kept in a hash. */
	4181	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	4182
	4183	/* If not cached, generate it via swatch_get */
	4184	if (!svp \|\| !SvPOK(*svp)
	4185	\|\| !(tmps = (const U8)SvPV_const(svp, slen)))
	4186	{
	4187	if (klen) {
	4188	const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
	4189	swatch = swatch_get(swash,
	4190	code_point & ~((UV)needents - 1),
	4191	needents);
	4192	}
	4193	else { /* For the first 256 code points, the swatch has a key of
	4194	length 0 */
	4195	swatch = swatch_get(swash, 0, needents);
	4196	}
	4197
	4198	if (IN_PERL_COMPILETIME)
	4199	CopHINTS_set(PL_curcop, PL_hints);
	4200
	4201	svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
	4202
	4203	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
	4204	\|\| (slen << 3) < needents)
	4205	Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
	4206	"svp=%p, tmps=%p, slen=%" UVuf ", needents=%" UVuf,
	4207	svp, tmps, (UV)slen, (UV)needents);
	4208	}
	4209
	4210	PL_last_swash_hv = hv;
	4211	assert(klen <= sizeof(PL_last_swash_key));
	4212	PL_last_swash_klen = (U8)klen;
	4213	/* FIXME change interpvar.h? */
	4214	PL_last_swash_tmps = (U8 *) tmps;
	4215	PL_last_swash_slen = slen;
	4216	if (klen)
	4217	Copy(ptr, PL_last_swash_key, klen, U8);
	4218	}
	4219
	4220	switch ((int)((slen << 3) / needents)) {
	4221	case 1:
	4222	return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
	4223	case 8:
	4224	return ((UV) tmps[off]);
	4225	case 16:
	4226	off <<= 1;
	4227	return
	4228	((UV) tmps[off ] << 8) +
	4229	((UV) tmps[off + 1]);
	4230	case 32:
	4231	off <<= 2;
	4232	return
	4233	((UV) tmps[off ] << 24) +
	4234	((UV) tmps[off + 1] << 16) +
	4235	((UV) tmps[off + 2] << 8) +
	4236	((UV) tmps[off + 3]);
	4237	}
	4238	Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
	4239	"slen=%" UVuf ", needents=%" UVuf, (UV)slen, (UV)needents);
	4240	NORETURN_FUNCTION_END;
	4241	}
	4242
	4243	/* Read a single line of the main body of the swash input text. These are of
	4244	* the form:
	4245	* 0053 0056 0073
	4246	* where each number is hex. The first two numbers form the minimum and
	4247	* maximum of a range, and the third is the value associated with the range.
	4248	* Not all swashes should have a third number
	4249	*
	4250	* On input: l points to the beginning of the line to be examined; it points
	4251	* to somewhere in the string of the whole input text, and is
	4252	* terminated by a \n or the null string terminator.
	4253	* lend points to the null terminator of that string
	4254	* wants_value is non-zero if the swash expects a third number
	4255	* typestr is the name of the swash's mapping, like 'ToLower'
	4256	* On output: min, max, and *val are set to the values read from the line.
	4257	* returns a pointer just beyond the line examined. If there was no
	4258	* valid min number on the line, returns lend+1
	4259	*/
	4260
	4261	STATIC U8*
	4262	S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
	4263	const bool wants_value, const U8* const typestr)
	4264	{
	4265	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
	4266	STRLEN numlen; /* Length of the number */
	4267	I32 flags = PERL_SCAN_SILENT_ILLDIGIT
	4268	\| PERL_SCAN_DISALLOW_PREFIX
	4269	\| PERL_SCAN_SILENT_NON_PORTABLE;
	4270
	4271	/* nl points to the next \n in the scan */
	4272	U8* const nl = (U8*)memchr(l, '\n', lend - l);
	4273
	4274	PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
	4275
	4276	/* Get the first number on the line: the range minimum */
	4277	numlen = lend - l;
	4278	min = grok_hex((char )l, &numlen, &flags, NULL);
	4279	max = min; /* So can never return without setting max */
	4280	if (numlen) /* If found a hex number, position past it */
	4281	l += numlen;
	4282	else if (nl) { /* Else, go handle next line, if any */
	4283	return nl + 1; /* 1 is length of "\n" */
	4284	}
	4285	else { /* Else, no next line */
	4286	return lend + 1; /* to LIST's end at which \n is not found */
	4287	}
	4288
	4289	/* The max range value follows, separated by a BLANK */
	4290	if (isBLANK(*l)) {
	4291	++l;
	4292	flags = PERL_SCAN_SILENT_ILLDIGIT
	4293	\| PERL_SCAN_DISALLOW_PREFIX
	4294	\| PERL_SCAN_SILENT_NON_PORTABLE;
	4295	numlen = lend - l;
	4296	max = grok_hex((char )l, &numlen, &flags, NULL);
	4297	if (numlen)
	4298	l += numlen;
	4299	else /* If no value here, it is a single element range */
	4300	max = min;
	4301
	4302	/* Non-binary tables have a third entry: what the first element of the
	4303	* range maps to. The map for those currently read here is in hex */
	4304	if (wants_value) {
	4305	if (isBLANK(*l)) {
	4306	++l;
	4307	flags = PERL_SCAN_SILENT_ILLDIGIT
	4308	\| PERL_SCAN_DISALLOW_PREFIX
	4309	\| PERL_SCAN_SILENT_NON_PORTABLE;
	4310	numlen = lend - l;
	4311	val = grok_hex((char )l, &numlen, &flags, NULL);
	4312	if (numlen)
	4313	l += numlen;
	4314	else
	4315	*val = 0;
	4316	}
	4317	else {
	4318	*val = 0;
	4319	if (typeto) {
	4320	/* diag_listed_as: To%s: illegal mapping '%s' */
	4321	Perl_croak(aTHX_ "%s: illegal mapping '%s'",
	4322	typestr, l);
	4323	}
	4324	}
	4325	}
	4326	else
	4327	val = 0; / bits == 1, then any val should be ignored */
	4328	}
	4329	else { /* Nothing following range min, should be single element with no
	4330	mapping expected */
	4331	if (wants_value) {
	4332	*val = 0;
	4333	if (typeto) {
	4334	/* diag_listed_as: To%s: illegal mapping '%s' */
	4335	Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
	4336	}
	4337	}
	4338	else
	4339	val = 0; / bits == 1, then val should be ignored */
	4340	}
	4341
	4342	/* Position to next line if any, or EOF */
	4343	if (nl)
	4344	l = nl + 1;
	4345	else
	4346	l = lend;
	4347
	4348	return l;
	4349	}
	4350
	4351	/* Note:
	4352	* Returns a swatch (a bit vector string) for a code point sequence
	4353	* that starts from the value C<start> and comprises the number C<span>.
	4354	* A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
	4355	* Should be used via swash_fetch, which will cache the swatch in C<swash>.
	4356	*/
	4357	STATIC SV*
	4358	S_swatch_get(pTHX_ SV* swash, UV start, UV span)
	4359	{
	4360	SV *swatch;
	4361	U8 l, lend, x, xend, s, send;
	4362	STRLEN lcur, xcur, scur;
	4363	HV *const hv = MUTABLE_HV(SvRV(swash));
	4364	SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
	4365
	4366	SV** listsvp = NULL; /* The string containing the main body of the table */
	4367	SV** extssvp = NULL;
	4368	SV** invert_it_svp = NULL;
	4369	U8* typestr = NULL;
	4370	STRLEN bits;
	4371	STRLEN octets; /* if bits == 1, then octets == 0 */
	4372	UV none;
	4373	UV end = start + span;
	4374
	4375	if (invlistsvp == NULL) {
	4376	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	4377	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	4378	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	4379	extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	4380	listsvp = hv_fetchs(hv, "LIST", FALSE);
	4381	invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	4382
	4383	bits = SvUV(*bitssvp);
	4384	none = SvUV(*nonesvp);
	4385	typestr = (U8)SvPV_nolen(typesvp);
	4386	}
	4387	else {
	4388	bits = 1;
	4389	none = 0;
	4390	}
	4391	octets = bits >> 3; /* if bits == 1, then octets == 0 */
	4392
	4393	PERL_ARGS_ASSERT_SWATCH_GET;
	4394
	4395	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	4396	Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %" UVuf,
	4397	(UV)bits);
	4398	}
	4399
	4400	/* If overflowed, use the max possible */
	4401	if (end < start) {
	4402	end = UV_MAX;
	4403	span = end - start;
	4404	}
	4405
	4406	/* create and initialize $swatch */
	4407	scur = octets ? (span * octets) : (span + 7) / 8;
	4408	swatch = newSV(scur);
	4409	SvPOK_on(swatch);
	4410	s = (U8*)SvPVX(swatch);
	4411	if (octets && none) {
	4412	const U8* const e = s + scur;
	4413	while (s < e) {
	4414	if (bits == 8)
	4415	*s++ = (U8)(none & 0xff);
	4416	else if (bits == 16) {
	4417	*s++ = (U8)((none >> 8) & 0xff);
	4418	*s++ = (U8)( none & 0xff);
	4419	}
	4420	else if (bits == 32) {
	4421	*s++ = (U8)((none >> 24) & 0xff);
	4422	*s++ = (U8)((none >> 16) & 0xff);
	4423	*s++ = (U8)((none >> 8) & 0xff);
	4424	*s++ = (U8)( none & 0xff);
	4425	}
	4426	}
	4427	*s = '\0';
	4428	}
	4429	else {
	4430	(void)memzero((U8*)s, scur + 1);
	4431	}
	4432	SvCUR_set(swatch, scur);
	4433	s = (U8*)SvPVX(swatch);
	4434
	4435	if (invlistsvp) { /* If has an inversion list set up use that */
	4436	_invlist_populate_swatch(*invlistsvp, start, end, s);
	4437	return swatch;
	4438	}
	4439
	4440	/* read $swash->{LIST} */
	4441	l = (U8)SvPV(listsvp, lcur);
	4442	lend = l + lcur;
	4443	while (l < lend) {
	4444	UV min, max, val, upper;
	4445	l = swash_scan_list_line(l, lend, &min, &max, &val,
	4446	cBOOL(octets), typestr);
	4447	if (l > lend) {
	4448	break;
	4449	}
	4450
	4451	/* If looking for something beyond this range, go try the next one */
	4452	if (max < start)
	4453	continue;
	4454
	4455	/* <end> is generally 1 beyond where we want to set things, but at the
	4456	* platform's infinity, where we can't go any higher, we want to
	4457	* include the code point at <end> */
	4458	upper = (max < end)
	4459	? max
	4460	: (max != UV_MAX \|\| end != UV_MAX)
	4461	? end - 1
	4462	: end;
	4463
	4464	if (octets) {
	4465	UV key;
	4466	if (min < start) {
	4467	if (!none \|\| val < none) {
	4468	val += start - min;
	4469	}
	4470	min = start;
	4471	}
	4472	for (key = min; key <= upper; key++) {
	4473	STRLEN offset;
	4474	/* offset must be non-negative (start <= min <= key < end) */
	4475	offset = octets * (key - start);
	4476	if (bits == 8)
	4477	s[offset] = (U8)(val & 0xff);
	4478	else if (bits == 16) {
	4479	s[offset ] = (U8)((val >> 8) & 0xff);
	4480	s[offset + 1] = (U8)( val & 0xff);
	4481	}
	4482	else if (bits == 32) {
	4483	s[offset ] = (U8)((val >> 24) & 0xff);
	4484	s[offset + 1] = (U8)((val >> 16) & 0xff);
	4485	s[offset + 2] = (U8)((val >> 8) & 0xff);
	4486	s[offset + 3] = (U8)( val & 0xff);
	4487	}
	4488
	4489	if (!none \|\| val < none)
	4490	++val;
	4491	}
	4492	}
	4493	else { /* bits == 1, then val should be ignored */
	4494	UV key;
	4495	if (min < start)
	4496	min = start;
	4497
	4498	for (key = min; key <= upper; key++) {
	4499	const STRLEN offset = (STRLEN)(key - start);
	4500	s[offset >> 3] \|= 1 << (offset & 7);
	4501	}
	4502	}
	4503	} /* while */
	4504
	4505	/* Invert if the data says it should be. Assumes that bits == 1 */
	4506	if (invert_it_svp && SvUV(*invert_it_svp)) {
	4507
	4508	/* Unicode properties should come with all bits above PERL_UNICODE_MAX
	4509	* be 0, and their inversion should also be 0, as we don't succeed any
	4510	* Unicode property matches for non-Unicode code points */
	4511	if (start <= PERL_UNICODE_MAX) {
	4512
	4513	/* The code below assumes that we never cross the
	4514	* Unicode/above-Unicode boundary in a range, as otherwise we would
	4515	* have to figure out where to stop flipping the bits. Since this
	4516	* boundary is divisible by a large power of 2, and swatches comes
	4517	* in small powers of 2, this should be a valid assumption */
	4518	assert(start + span - 1 <= PERL_UNICODE_MAX);
	4519
	4520	send = s + scur;
	4521	while (s < send) {
	4522	s = ~(s);
	4523	s++;
	4524	}
	4525	}
	4526	}
	4527
	4528	/* read $swash->{EXTRAS}
	4529	* This code also copied to swash_to_invlist() below */
	4530	x = (U8)SvPV(extssvp, xcur);
	4531	xend = x + xcur;
	4532	while (x < xend) {
	4533	STRLEN namelen;
	4534	U8 *namestr;
	4535	SV** othersvp;
	4536	HV* otherhv;
	4537	STRLEN otherbits;
	4538	SV *otherbitssvp, other;
	4539	U8 s, o, *nl;
	4540	STRLEN slen, olen;
	4541
	4542	const U8 opc = *x++;
	4543	if (opc == '\n')
	4544	continue;
	4545
	4546	nl = (U8*)memchr(x, '\n', xend - x);
	4547
	4548	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	4549	if (nl) {
	4550	x = nl + 1; /* 1 is length of "\n" */
	4551	continue;
	4552	}
	4553	else {
	4554	x = xend; /* to EXTRAS' end at which \n is not found */
	4555	break;
	4556	}
	4557	}
	4558
	4559	namestr = x;
	4560	if (nl) {
	4561	namelen = nl - namestr;
	4562	x = nl + 1;
	4563	}
	4564	else {
	4565	namelen = xend - namestr;
	4566	x = xend;
	4567	}
	4568
	4569	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	4570	otherhv = MUTABLE_HV(SvRV(*othersvp));
	4571	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	4572	otherbits = (STRLEN)SvUV(*otherbitssvp);
	4573	if (bits < otherbits)
	4574	Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
	4575	"bits=%" UVuf ", otherbits=%" UVuf, (UV)bits, (UV)otherbits);
	4576
	4577	/* The "other" swatch must be destroyed after. */
	4578	other = swatch_get(*othersvp, start, span);
	4579	o = (U8*)SvPV(other, olen);
	4580
	4581	if (!olen)
	4582	Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
	4583
	4584	s = (U8*)SvPV(swatch, slen);
	4585	if (bits == 1 && otherbits == 1) {
	4586	if (slen != olen)
	4587	Perl_croak(aTHX_ "panic: swatch_get found swatch length "
	4588	"mismatch, slen=%" UVuf ", olen=%" UVuf,
	4589	(UV)slen, (UV)olen);
	4590
	4591	switch (opc) {
	4592	case '+':
	4593	while (slen--)
	4594	s++ \|= o++;
	4595	break;
	4596	case '!':
	4597	while (slen--)
	4598	s++ \|= ~o++;
	4599	break;
	4600	case '-':
	4601	while (slen--)
	4602	s++ &= ~o++;
	4603	break;
	4604	case '&':
	4605	while (slen--)
	4606	s++ &= o++;
	4607	break;
	4608	default:
	4609	break;
	4610	}
	4611	}
	4612	else {
	4613	STRLEN otheroctets = otherbits >> 3;
	4614	STRLEN offset = 0;
	4615	U8* const send = s + slen;
	4616
	4617	while (s < send) {
	4618	UV otherval = 0;
	4619
	4620	if (otherbits == 1) {
	4621	otherval = (o[offset >> 3] >> (offset & 7)) & 1;
	4622	++offset;
	4623	}
	4624	else {
	4625	STRLEN vlen = otheroctets;
	4626	otherval = *o++;
	4627	while (--vlen) {
	4628	otherval <<= 8;
	4629	otherval \|= *o++;
	4630	}
	4631	}
	4632
	4633	if (opc == '+' && otherval)
	4634	NOOP; /* replace with otherval */
	4635	else if (opc == '!' && !otherval)
	4636	otherval = 1;
	4637	else if (opc == '-' && otherval)
	4638	otherval = 0;
	4639	else if (opc == '&' && !otherval)
	4640	otherval = 0;
	4641	else {
	4642	s += octets; /* no replacement */
	4643	continue;
	4644	}
	4645
	4646	if (bits == 8)
	4647	*s++ = (U8)( otherval & 0xff);
	4648	else if (bits == 16) {
	4649	*s++ = (U8)((otherval >> 8) & 0xff);
	4650	*s++ = (U8)( otherval & 0xff);
	4651	}
	4652	else if (bits == 32) {
	4653	*s++ = (U8)((otherval >> 24) & 0xff);
	4654	*s++ = (U8)((otherval >> 16) & 0xff);
	4655	*s++ = (U8)((otherval >> 8) & 0xff);
	4656	*s++ = (U8)( otherval & 0xff);
	4657	}
	4658	}
	4659	}
	4660	sv_free(other); /* through with it! */
	4661	} /* while */
	4662	return swatch;
	4663	}
	4664
	4665	HV*
	4666	Perl__swash_inversion_hash(pTHX_ SV* const swash)
	4667	{
	4668
	4669	/* Subject to change or removal. For use only in regcomp.c and regexec.c
	4670	* Can't be used on a property that is subject to user override, as it
	4671	* relies on the value of SPECIALS in the swash which would be set by
	4672	* utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
	4673	* for overridden properties
	4674	*
	4675	* Returns a hash which is the inversion and closure of a swash mapping.
	4676	* For example, consider the input lines:
	4677	* 004B 006B
	4678	* 004C 006C
	4679	* 212A 006B
	4680	*
	4681	* The returned hash would have two keys, the UTF-8 for 006B and the UTF-8 for
	4682	* 006C. The value for each key is an array. For 006C, the array would
	4683	* have two elements, the UTF-8 for itself, and for 004C. For 006B, there
	4684	* would be three elements in its array, the UTF-8 for 006B, 004B and 212A.
	4685	*
	4686	* Note that there are no elements in the hash for 004B, 004C, 212A. The
	4687	* keys are only code points that are folded-to, so it isn't a full closure.
	4688	*
	4689	* Essentially, for any code point, it gives all the code points that map to
	4690	* it, or the list of 'froms' for that point.
	4691	*
	4692	* Currently it ignores any additions or deletions from other swashes,
	4693	* looking at just the main body of the swash, and if there are SPECIALS
	4694	* in the swash, at that hash
	4695	*
	4696	* The specials hash can be extra code points, and most likely consists of
	4697	* maps from single code points to multiple ones (each expressed as a string
	4698	* of UTF-8 characters). This function currently returns only 1-1 mappings.
	4699	* However consider this possible input in the specials hash:
	4700	* "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074
	4701	* "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074
	4702	*
	4703	* Both FB05 and FB06 map to the same multi-char sequence, which we don't
	4704	* currently handle. But it also means that FB05 and FB06 are equivalent in
	4705	* a 1-1 mapping which we should handle, and this relationship may not be in
	4706	* the main table. Therefore this function examines all the multi-char
	4707	* sequences and adds the 1-1 mappings that come out of that.
	4708	*
	4709	* XXX This function was originally intended to be multipurpose, but its
	4710	* only use is quite likely to remain for constructing the inversion of
	4711	* the CaseFolding (//i) property. If it were more general purpose for
	4712	* regex patterns, it would have to do the FB05/FB06 game for simple folds,
	4713	* because certain folds are prohibited under /iaa and /il. As an example,
	4714	* in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both
	4715	* equivalent under /i. But under /iaa and /il, the folds to 'i' are
	4716	* prohibited, so we would not figure out that they fold to each other.
	4717	* Code could be written to automatically figure this out, similar to the
	4718	* code that does this for multi-character folds, but this is the only case
	4719	* where something like this is ever likely to happen, as all the single
	4720	* char folds to the 0-255 range are now quite settled. Instead there is a
	4721	* little special code that is compiled only for this Unicode version. This
	4722	* is smaller and didn't require much coding time to do. But this makes
	4723	* this routine strongly tied to being used just for CaseFolding. If ever
	4724	* it should be generalized, this would have to be fixed */
	4725
	4726	U8 l, lend;
	4727	STRLEN lcur;
	4728	HV *const hv = MUTABLE_HV(SvRV(swash));
	4729
	4730	/* The string containing the main body of the table. This will have its
	4731	* assertion fail if the swash has been converted to its inversion list */
	4732	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	4733
	4734	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	4735	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	4736	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	4737	/SV* const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
	4738	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	4739	const STRLEN bits = SvUV(*bitssvp);
	4740	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	4741	const UV none = SvUV(*nonesvp);
	4742	SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
	4743
	4744	HV* ret = newHV();
	4745
	4746	PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
	4747
	4748	/* Must have at least 8 bits to get the mappings */
	4749	if (bits != 8 && bits != 16 && bits != 32) {
	4750	Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"
	4751	UVuf, (UV)bits);
	4752	}
	4753
	4754	if (specials_p) { /* It might be "special" (sometimes, but not always, a
	4755	mapping to more than one character */
	4756
	4757	/* Construct an inverse mapping hash for the specials */
	4758	HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
	4759	HV * specials_inverse = newHV();
	4760	char char_from; / the lhs of the map */
	4761	I32 from_len; /* its byte length */
	4762	char char_to; / the rhs of the map */
	4763	I32 to_len; /* its byte length */
	4764	SV sv_to; / and in a sv */
	4765	AV* from_list; /* list of things that map to each 'to' */
	4766
	4767	hv_iterinit(specials_hv);
	4768
	4769	/* The keys are the characters (in UTF-8) that map to the corresponding
	4770	* UTF-8 string value. Iterate through the list creating the inverse
	4771	* list. */
	4772	while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
	4773	SV** listp;
	4774	if (! SvPOK(sv_to)) {
	4775	Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
	4776	"unexpectedly is not a string, flags=%lu",
	4777	(unsigned long)SvFLAGS(sv_to));
	4778	}
	4779	/DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %" UVXf ", First char of to is %" UVXf "\n", valid_utf8_to_uvchr((U8) char_from, 0), valid_utf8_to_uvchr((U8) SvPVX(sv_to), 0)));/
	4780
	4781	/* Each key in the inverse list is a mapped-to value, and the key's
	4782	* hash value is a list of the strings (each in UTF-8) that map to
	4783	* it. Those strings are all one character long */
	4784	if ((listp = hv_fetch(specials_inverse,
	4785	SvPVX(sv_to),
	4786	SvCUR(sv_to), 0)))
	4787	{
	4788	from_list = (AV) listp;
	4789	}
	4790	else { /* No entry yet for it: create one */
	4791	from_list = newAV();
	4792	if (! hv_store(specials_inverse,
	4793	SvPVX(sv_to),
	4794	SvCUR(sv_to),
	4795	(SV*) from_list, 0))
	4796	{
	4797	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	4798	}
	4799	}
	4800
	4801	/* Here have the list associated with this 'to' (perhaps newly
	4802	* created and empty). Just add to it. Note that we ASSUME that
	4803	* the input is guaranteed to not have duplications, so we don't
	4804	* check for that. Duplications just slow down execution time. */
	4805	av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
	4806	}
	4807
	4808	/* Here, 'specials_inverse' contains the inverse mapping. Go through
	4809	* it looking for cases like the FB05/FB06 examples above. There would
	4810	* be an entry in the hash like
	4811	* 'st' => [ FB05, FB06 ]
	4812	* In this example we will create two lists that get stored in the
	4813	* returned hash, 'ret':
	4814	* FB05 => [ FB05, FB06 ]
	4815	* FB06 => [ FB05, FB06 ]
	4816	*
	4817	* Note that there is nothing to do if the array only has one element.
	4818	* (In the normal 1-1 case handled below, we don't have to worry about
	4819	* two lists, as everything gets tied to the single list that is
	4820	* generated for the single character 'to'. But here, we are omitting
	4821	* that list, ('st' in the example), so must have multiple lists.) */
	4822	while ((from_list = (AV *) hv_iternextsv(specials_inverse,
	4823	&char_to, &to_len)))
	4824	{
	4825	if (av_tindex_skip_len_mg(from_list) > 0) {
	4826	SSize_t i;
	4827
	4828	/* We iterate over all combinations of i,j to place each code
	4829	* point on each list */
	4830	for (i = 0; i <= av_tindex_skip_len_mg(from_list); i++) {
	4831	SSize_t j;
	4832	AV* i_list = newAV();
	4833	SV** entryp = av_fetch(from_list, i, FALSE);
	4834	if (entryp == NULL) {
	4835	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly"
	4836	" failed");
	4837	}
	4838	if (hv_fetch(ret, SvPVX(entryp), SvCUR(entryp), FALSE)) {
	4839	Perl_croak(aTHX_ "panic: unexpected entry for %s",
	4840	SvPVX(*entryp));
	4841	}
	4842	if (! hv_store(ret, SvPVX(entryp), SvCUR(entryp),
	4843	(SV*) i_list, FALSE))
	4844	{
	4845	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	4846	}
	4847
	4848	/* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8) SvPVX(entryp), 0);*/
	4849	for (j = 0; j <= av_tindex_skip_len_mg(from_list); j++) {
	4850	entryp = av_fetch(from_list, j, FALSE);
	4851	if (entryp == NULL) {
	4852	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	4853	}
	4854
	4855	/* When i==j this adds itself to the list */
	4856	av_push(i_list, newSVuv(utf8_to_uvchr_buf(
	4857	(U8) SvPVX(entryp),
	4858	(U8) SvPVX(entryp) + SvCUR(*entryp),
	4859	0)));
	4860	/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8) SvPVX(entryp), 0), u));/
	4861	}
	4862	}
	4863	}
	4864	}
	4865	SvREFCNT_dec(specials_inverse); /* done with it */
	4866	} /* End of specials */
	4867
	4868	/* read $swash->{LIST} */
	4869
	4870	#if UNICODE_MAJOR_VERSION == 3 \
	4871	&& UNICODE_DOT_VERSION == 0 \
	4872	&& UNICODE_DOT_DOT_VERSION == 1
	4873
	4874	/* For this version only U+130 and U+131 are equivalent under qr//i. Add a
	4875	* rule so that things work under /iaa and /il */
	4876
	4877	SV * mod_listsv = sv_mortalcopy(*listsvp);
	4878	sv_catpv(mod_listsv, "130\t130\t131\n");
	4879	l = (U8*)SvPV(mod_listsv, lcur);
	4880
	4881	#else
	4882
	4883	l = (U8)SvPV(listsvp, lcur);
	4884
	4885	#endif
	4886
	4887	lend = l + lcur;
	4888
	4889	/* Go through each input line */
	4890	while (l < lend) {
	4891	UV min, max, val;
	4892	UV inverse;
	4893	l = swash_scan_list_line(l, lend, &min, &max, &val,
	4894	cBOOL(octets), typestr);
	4895	if (l > lend) {
	4896	break;
	4897	}
	4898
	4899	/* Each element in the range is to be inverted */
	4900	for (inverse = min; inverse <= max; inverse++) {
	4901	AV* list;
	4902	SV** listp;
	4903	IV i;
	4904	bool found_key = FALSE;
	4905	bool found_inverse = FALSE;
	4906
	4907	/* The key is the inverse mapping */
	4908	char key[UTF8_MAXBYTES+1];
	4909	char* key_end = (char ) uvchr_to_utf8((U8) key, val);
	4910	STRLEN key_len = key_end - key;
	4911
	4912	/* Get the list for the map */
	4913	if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
	4914	list = (AV) listp;
	4915	}
	4916	else { /* No entry yet for it: create one */
	4917	list = newAV();
	4918	if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
	4919	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	4920	}
	4921	}
	4922
	4923	/* Look through list to see if this inverse mapping already is
	4924	* listed, or if there is a mapping to itself already */
	4925	for (i = 0; i <= av_tindex_skip_len_mg(list); i++) {
	4926	SV** entryp = av_fetch(list, i, FALSE);
	4927	SV* entry;
	4928	UV uv;
	4929	if (entryp == NULL) {
	4930	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	4931	}
	4932	entry = *entryp;
	4933	uv = SvUV(entry);
	4934	/DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %" UVXf " contains %" UVXf "\n", val, uv));/
	4935	if (uv == val) {
	4936	found_key = TRUE;
	4937	}
	4938	if (uv == inverse) {
	4939	found_inverse = TRUE;
	4940	}
	4941
	4942	/* No need to continue searching if found everything we are
	4943	* looking for */
	4944	if (found_key && found_inverse) {
	4945	break;
	4946	}
	4947	}
	4948
	4949	/* Make sure there is a mapping to itself on the list */
	4950	if (! found_key) {
	4951	av_push(list, newSVuv(val));
	4952	/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, val, val));/
	4953	}
	4954
	4955
	4956	/* Simply add the value to the list */
	4957	if (! found_inverse) {
	4958	av_push(list, newSVuv(inverse));
	4959	/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, inverse, val));/
	4960	}
	4961
	4962	/* swatch_get() increments the value of val for each element in the
	4963	* range. That makes more compact tables possible. You can
	4964	* express the capitalization, for example, of all consecutive
	4965	* letters with a single line: 0061\t007A\t0041 This maps 0061 to
	4966	* 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
	4967	* and it's not documented; it appears to be used only in
	4968	* implementing tr//; I copied the semantics from swatch_get(), just
	4969	* in case */
	4970	if (!none \|\| val < none) {
	4971	++val;
	4972	}
	4973	}
	4974	}
	4975
	4976	return ret;
	4977	}
	4978
	4979	SV*
	4980	Perl__swash_to_invlist(pTHX_ SV* const swash)
	4981	{
	4982
	4983	/* Subject to change or removal. For use only in one place in regcomp.c.
	4984	* Ownership is given to one reference count in the returned SV* */
	4985
	4986	U8 l, lend;
	4987	char *loc;
	4988	STRLEN lcur;
	4989	HV *const hv = MUTABLE_HV(SvRV(swash));
	4990	UV elements = 0; /* Number of elements in the inversion list */
	4991	U8 empty[] = "";
	4992	SV** listsvp;
	4993	SV** typesvp;
	4994	SV** bitssvp;
	4995	SV** extssvp;
	4996	SV** invert_it_svp;
	4997
	4998	U8* typestr;
	4999	STRLEN bits;
	5000	STRLEN octets; /* if bits == 1, then octets == 0 */
	5001	U8 x, xend;
	5002	STRLEN xcur;
	5003
	5004	SV* invlist;
	5005
	5006	PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
	5007
	5008	/* If not a hash, it must be the swash's inversion list instead */
	5009	if (SvTYPE(hv) != SVt_PVHV) {
	5010	return SvREFCNT_inc_simple_NN((SV*) hv);
	5011	}
	5012
	5013	/* The string containing the main body of the table */
	5014	listsvp = hv_fetchs(hv, "LIST", FALSE);
	5015	typesvp = hv_fetchs(hv, "TYPE", FALSE);
	5016	bitssvp = hv_fetchs(hv, "BITS", FALSE);
	5017	extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	5018	invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	5019
	5020	typestr = (U8)SvPV_nolen(typesvp);
	5021	bits = SvUV(*bitssvp);
	5022	octets = bits >> 3; /* if bits == 1, then octets == 0 */
	5023
	5024	/* read $swash->{LIST} */
	5025	if (SvPOK(*listsvp)) {
	5026	l = (U8)SvPV(listsvp, lcur);
	5027	}
	5028	else {
	5029	/* LIST legitimately doesn't contain a string during compilation phases
	5030	* of Perl itself, before the Unicode tables are generated. In this
	5031	* case, just fake things up by creating an empty list */
	5032	l = empty;
	5033	lcur = 0;
	5034	}
	5035	loc = (char *) l;
	5036	lend = l + lcur;
	5037
	5038	if (l == 'V') { / Inversion list format */
	5039	const char after_atou = (char ) lend;
	5040	UV element0;
	5041	UV* other_elements_ptr;
	5042
	5043	/* The first number is a count of the rest */
	5044	l++;
	5045	if (!grok_atoUV((const char *)l, &elements, &after_atou)) {
	5046	Perl_croak(aTHX_ "panic: Expecting a valid count of elements"
	5047	" at start of inversion list");
	5048	}
	5049	if (elements == 0) {
	5050	invlist = _new_invlist(0);
	5051	}
	5052	else {
	5053	l = (U8 *) after_atou;
	5054
	5055	/* Get the 0th element, which is needed to setup the inversion list
	5056	* */
	5057	while (isSPACE(*l)) l++;
	5058	if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
	5059	Perl_croak(aTHX_ "panic: Expecting a valid 0th element for"
	5060	" inversion list");
	5061	}
	5062	l = (U8 *) after_atou;
	5063	invlist = _setup_canned_invlist(elements, element0,
	5064	&other_elements_ptr);
	5065	elements--;
	5066
	5067	/* Then just populate the rest of the input */
	5068	while (elements-- > 0) {
	5069	if (l > lend) {
	5070	Perl_croak(aTHX_ "panic: Expecting %" UVuf " more"
	5071	" elements than available", elements);
	5072	}
	5073	while (isSPACE(*l)) l++;
	5074	if (!grok_atoUV((const char *)l, other_elements_ptr++,
	5075	&after_atou))
	5076	{
	5077	Perl_croak(aTHX_ "panic: Expecting a valid element"
	5078	" in inversion list");
	5079	}
	5080	l = (U8 *) after_atou;
	5081	}
	5082	}
	5083	}
	5084	else {
	5085
	5086	/* Scan the input to count the number of lines to preallocate array
	5087	* size based on worst possible case, which is each line in the input
	5088	* creates 2 elements in the inversion list: 1) the beginning of a
	5089	* range in the list; 2) the beginning of a range not in the list. */
	5090	while ((loc = (char ) memchr(loc, '\n', lend - (U8 ) loc)) != NULL) {
	5091	elements += 2;
	5092	loc++;
	5093	}
	5094
	5095	/* If the ending is somehow corrupt and isn't a new line, add another
	5096	* element for the final range that isn't in the inversion list */
	5097	if (! (*lend == '\n'
	5098	\|\| (lend == '\0' && (lcur == 0 \|\| (lend - 1) == '\n'))))
	5099	{
	5100	elements++;
	5101	}
	5102
	5103	invlist = _new_invlist(elements);
	5104
	5105	/* Now go through the input again, adding each range to the list */
	5106	while (l < lend) {
	5107	UV start, end;
	5108	UV val; /* Not used by this function */
	5109
	5110	l = swash_scan_list_line(l, lend, &start, &end, &val,
	5111	cBOOL(octets), typestr);
	5112
	5113	if (l > lend) {
	5114	break;
	5115	}
	5116
	5117	invlist = _add_range_to_invlist(invlist, start, end);
	5118	}
	5119	}
	5120
	5121	/* Invert if the data says it should be */
	5122	if (invert_it_svp && SvUV(*invert_it_svp)) {
	5123	_invlist_invert(invlist);
	5124	}
	5125
	5126	/* This code is copied from swatch_get()
	5127	* read $swash->{EXTRAS} */
	5128	x = (U8)SvPV(extssvp, xcur);
	5129	xend = x + xcur;
	5130	while (x < xend) {
	5131	STRLEN namelen;
	5132	U8 *namestr;
	5133	SV** othersvp;
	5134	HV* otherhv;
	5135	STRLEN otherbits;
	5136	SV *otherbitssvp, other;
	5137	U8 *nl;
	5138
	5139	const U8 opc = *x++;
	5140	if (opc == '\n')
	5141	continue;
	5142
	5143	nl = (U8*)memchr(x, '\n', xend - x);
	5144
	5145	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	5146	if (nl) {
	5147	x = nl + 1; /* 1 is length of "\n" */
	5148	continue;
	5149	}
	5150	else {
	5151	x = xend; /* to EXTRAS' end at which \n is not found */
	5152	break;
	5153	}
	5154	}
	5155
	5156	namestr = x;
	5157	if (nl) {
	5158	namelen = nl - namestr;
	5159	x = nl + 1;
	5160	}
	5161	else {
	5162	namelen = xend - namestr;
	5163	x = xend;
	5164	}
	5165
	5166	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	5167	otherhv = MUTABLE_HV(SvRV(*othersvp));
	5168	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	5169	otherbits = (STRLEN)SvUV(*otherbitssvp);
	5170
	5171	if (bits != otherbits \|\| bits != 1) {
	5172	Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
	5173	"properties, bits=%" UVuf ", otherbits=%" UVuf,
	5174	(UV)bits, (UV)otherbits);
	5175	}
	5176
	5177	/* The "other" swatch must be destroyed after. */
	5178	other = _swash_to_invlist((SV )othersvp);
	5179
	5180	/* End of code copied from swatch_get() */
	5181	switch (opc) {
	5182	case '+':
	5183	_invlist_union(invlist, other, &invlist);
	5184	break;
	5185	case '!':
	5186	_invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
	5187	break;
	5188	case '-':
	5189	_invlist_subtract(invlist, other, &invlist);
	5190	break;
	5191	case '&':
	5192	_invlist_intersection(invlist, other, &invlist);
	5193	break;
	5194	default:
	5195	break;
	5196	}
	5197	sv_free(other); /* through with it! */
	5198	}
	5199
	5200	SvREADONLY_on(invlist);
	5201	return invlist;
	5202	}
	5203
	5204	SV*
	5205	Perl__get_swash_invlist(pTHX_ SV* const swash)
	5206	{
	5207	SV** ptr;
	5208
	5209	PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
	5210
	5211	if (! SvROK(swash)) {
	5212	return NULL;
	5213	}
	5214
	5215	/* If it really isn't a hash, it isn't really swash; must be an inversion
	5216	* list */
	5217	if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
	5218	return SvRV(swash);
	5219	}
	5220
	5221	ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
	5222	if (! ptr) {
	5223	return NULL;
	5224	}
	5225
	5226	return *ptr;
	5227	}
	5228
	5229	bool
	5230	Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
	5231	{
	5232	/* May change: warns if surrogates, non-character code points, or
	5233	* non-Unicode code points are in 's' which has length 'len' bytes.
	5234	* Returns TRUE if none found; FALSE otherwise. The only other validity
	5235	* check is to make sure that this won't exceed the string's length nor
	5236	* overflow */
	5237
	5238	const U8* const e = s + len;
	5239	bool ok = TRUE;
	5240
	5241	PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
	5242
	5243	while (s < e) {
	5244	if (UTF8SKIP(s) > len) {
	5245	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	5246	"%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
	5247	return FALSE;
	5248	}
	5249	if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
	5250	if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
	5251	if ( ckWARN_d(WARN_NON_UNICODE)
	5252	\|\| UNLIKELY(0 < does_utf8_overflow(s, s + len,
	5253	0 /* Don't consider overlongs */
	5254	)))
	5255	{
	5256	/* A side effect of this function will be to warn */
	5257	(void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
	5258	ok = FALSE;
	5259	}
	5260	}
	5261	else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
	5262	if (ckWARN_d(WARN_SURROGATE)) {
	5263	/* This has a different warning than the one the called
	5264	* function would output, so can't just call it, unlike we
	5265	* do for the non-chars and above-unicodes */
	5266	UV uv = utf8_to_uvchr_buf(s, e, NULL);
	5267	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	5268	"Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
	5269	uv);
	5270	ok = FALSE;
	5271	}
	5272	}
	5273	else if ( UNLIKELY(UTF8_IS_NONCHAR(s, e))
	5274	&& (ckWARN_d(WARN_NONCHAR)))
	5275	{
	5276	/* A side effect of this function will be to warn */
	5277	(void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
	5278	ok = FALSE;
	5279	}
	5280	}
	5281	s += UTF8SKIP(s);
	5282	}
	5283
	5284	return ok;
	5285	}
	5286
	5287	/*
	5288	=for apidoc pv_uni_display
	5289
	5290	Build to the scalar C<dsv> a displayable version of the string C<spv>,
	5291	length C<len>, the displayable version being at most C<pvlim> bytes long
	5292	(if longer, the rest is truncated and C<"..."> will be appended).
	5293
	5294	The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
	5295	C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
	5296	to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
	5297	(C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
	5298	C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
	5299	C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
	5300
	5301	The pointer to the PV of the C<dsv> is returned.
	5302
	5303	See also L</sv_uni_display>.
	5304
	5305	=cut */
	5306	char *
	5307	Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim,
	5308	UV flags)
	5309	{
	5310	int truncated = 0;
	5311	const char s, e;
	5312
	5313	PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
	5314
	5315	SvPVCLEAR(dsv);
	5316	SvUTF8_off(dsv);
	5317	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	5318	UV u;
	5319	/* This serves double duty as a flag and a character to print after
	5320	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	5321	*/
	5322	char ok = 0;
	5323
	5324	if (pvlim && SvCUR(dsv) >= pvlim) {
	5325	truncated++;
	5326	break;
	5327	}
	5328	u = utf8_to_uvchr_buf((U8)s, (U8)e, 0);
	5329	if (u < 256) {
	5330	const unsigned char c = (unsigned char)u & 0xFF;
	5331	if (flags & UNI_DISPLAY_BACKSLASH) {
	5332	switch (c) {
	5333	case '\n':
	5334	ok = 'n'; break;
	5335	case '\r':
	5336	ok = 'r'; break;
	5337	case '\t':
	5338	ok = 't'; break;
	5339	case '\f':
	5340	ok = 'f'; break;
	5341	case '\a':
	5342	ok = 'a'; break;
	5343	case '\\':
	5344	ok = '\\'; break;
	5345	default: break;
	5346	}
	5347	if (ok) {
	5348	const char string = ok;
	5349	sv_catpvs(dsv, "\\");
	5350	sv_catpvn(dsv, &string, 1);
	5351	}
	5352	}
	5353	/* isPRINT() is the locale-blind version. */
	5354	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	5355	const char string = c;
	5356	sv_catpvn(dsv, &string, 1);
	5357	ok = 1;
	5358	}
	5359	}
	5360	if (!ok)
	5361	Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u);
	5362	}
	5363	if (truncated)
	5364	sv_catpvs(dsv, "...");
	5365
	5366	return SvPVX(dsv);
	5367	}
	5368
	5369	/*
	5370	=for apidoc sv_uni_display
	5371
	5372	Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
	5373	the displayable version being at most C<pvlim> bytes long
	5374	(if longer, the rest is truncated and "..." will be appended).
	5375
	5376	The C<flags> argument is as in L</pv_uni_display>().
	5377
	5378	The pointer to the PV of the C<dsv> is returned.
	5379
	5380	=cut
	5381	*/
	5382	char *
	5383	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	5384	{
	5385	const char * const ptr =
	5386	isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
	5387
	5388	PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
	5389
	5390	return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
	5391	SvCUR(ssv), pvlim, flags);
	5392	}
	5393
	5394	/*
	5395	=for apidoc foldEQ_utf8
	5396
	5397	Returns true if the leading portions of the strings C<s1> and C<s2> (either or
	5398	both of which may be in UTF-8) are the same case-insensitively; false
	5399	otherwise. How far into the strings to compare is determined by other input
	5400	parameters.
	5401
	5402	If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
	5403	otherwise it is assumed to be in native 8-bit encoding. Correspondingly for
	5404	C<u2> with respect to C<s2>.
	5405
	5406	If the byte length C<l1> is non-zero, it says how far into C<s1> to check for
	5407	fold equality. In other words, C<s1>+C<l1> will be used as a goal to reach.
	5408	The scan will not be considered to be a match unless the goal is reached, and
	5409	scanning won't continue past that goal. Correspondingly for C<l2> with respect
	5410	to C<s2>.
	5411
	5412	If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that
	5413	pointer is considered an end pointer to the position 1 byte past the maximum
	5414	point in C<s1> beyond which scanning will not continue under any circumstances.
	5415	(This routine assumes that UTF-8 encoded input strings are not malformed;
	5416	malformed input can cause it to read past C<pe1>). This means that if both
	5417	C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match
	5418	will never be successful because it can never
	5419	get as far as its goal (and in fact is asserted against). Correspondingly for
	5420	C<pe2> with respect to C<s2>.
	5421
	5422	At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
	5423	C<l2> must be non-zero), and if both do, both have to be
	5424	reached for a successful match. Also, if the fold of a character is multiple
	5425	characters, all of them must be matched (see tr21 reference below for
	5426	'folding').
	5427
	5428	Upon a successful match, if C<pe1> is non-C<NULL>,
	5429	it will be set to point to the beginning of the I<next> character of C<s1>
	5430	beyond what was matched. Correspondingly for C<pe2> and C<s2>.
	5431
	5432	For case-insensitiveness, the "casefolding" of Unicode is used
	5433	instead of upper/lowercasing both the characters, see
	5434	L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
	5435
	5436	=cut */
	5437
	5438	/* A flags parameter has been added which may change, and hence isn't
	5439	* externally documented. Currently it is:
	5440	* 0 for as-documented above
	5441	* FOLDEQ_UTF8_NOMIX_ASCII meaning that a non-ASCII character whose fold
	5442	* is an ASCII one is not allowed to match
	5443	* FOLDEQ_LOCALE is set iff the rules from the current underlying
	5444	* locale are to be used.
	5445	* FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
	5446	* routine. This allows that step to be skipped.
	5447	* Currently, this requires s1 to be encoded as UTF-8
	5448	* (u1 must be true), which is asserted for.
	5449	* FOLDEQ_S1_FOLDS_SANE With either NOMIX_ASCII or LOCALE, no folds may
	5450	* cross certain boundaries. Hence, the caller should
	5451	* let this function do the folding instead of
	5452	* pre-folding. This code contains an assertion to
	5453	* that effect. However, if the caller knows what
	5454	* it's doing, it can pass this flag to indicate that,
	5455	* and the assertion is skipped.
	5456	* FOLDEQ_S2_ALREADY_FOLDED Similarly.
	5457	* FOLDEQ_S2_FOLDS_SANE
	5458	*/
	5459	I32
	5460	Perl_foldEQ_utf8_flags(pTHX_ const char s1, char *pe1, UV l1, bool u1,
	5461	const char s2, char *pe2, UV l2, bool u2,
	5462	U32 flags)
	5463	{
	5464	const U8 p1 = (const U8)s1; /* Point to current char */
	5465	const U8 p2 = (const U8)s2;
	5466	const U8 g1 = NULL; / goal for s1 */
	5467	const U8 *g2 = NULL;
	5468	const U8 e1 = NULL; / Don't scan s1 past this */
	5469	U8 f1 = NULL; / Point to current folded */
	5470	const U8 *e2 = NULL;
	5471	U8 *f2 = NULL;
	5472	STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
	5473	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	5474	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	5475	U8 flags_for_folder = FOLD_FLAGS_FULL;
	5476
	5477	PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
	5478
	5479	assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII \| FOLDEQ_LOCALE))
	5480	&& (((flags & FOLDEQ_S1_ALREADY_FOLDED)
	5481	&& !(flags & FOLDEQ_S1_FOLDS_SANE))
	5482	\|\| ((flags & FOLDEQ_S2_ALREADY_FOLDED)
	5483	&& !(flags & FOLDEQ_S2_FOLDS_SANE)))));
	5484	/* The algorithm is to trial the folds without regard to the flags on
	5485	* the first line of the above assert(), and then see if the result
	5486	* violates them. This means that the inputs can't be pre-folded to a
	5487	* violating result, hence the assert. This could be changed, with the
	5488	* addition of extra tests here for the already-folded case, which would
	5489	* slow it down. That cost is more than any possible gain for when these
	5490	* flags are specified, as the flags indicate /il or /iaa matching which
	5491	* is less common than /iu, and I (khw) also believe that real-world /il
	5492	* and /iaa matches are most likely to involve code points 0-255, and this
	5493	* function only under rare conditions gets called for 0-255. */
	5494
	5495	if (flags & FOLDEQ_LOCALE) {
	5496	if (IN_UTF8_CTYPE_LOCALE) {
	5497	flags &= ~FOLDEQ_LOCALE;
	5498	}
	5499	else {
	5500	flags_for_folder \|= FOLD_FLAGS_LOCALE;
	5501	}
	5502	}
	5503
	5504	if (pe1) {
	5505	e1 = (U8*)pe1;
	5506	}
	5507
	5508	if (l1) {
	5509	g1 = (const U8*)s1 + l1;
	5510	}
	5511
	5512	if (pe2) {
	5513	e2 = (U8*)pe2;
	5514	}
	5515
	5516	if (l2) {
	5517	g2 = (const U8*)s2 + l2;
	5518	}
	5519
	5520	/* Must have at least one goal */
	5521	assert(g1 \|\| g2);
	5522
	5523	if (g1) {
	5524
	5525	/* Will never match if goal is out-of-bounds */
	5526	assert(! e1 \|\| e1 >= g1);
	5527
	5528	/* Here, there isn't an end pointer, or it is beyond the goal. We
	5529	* only go as far as the goal */
	5530	e1 = g1;
	5531	}
	5532	else {
	5533	assert(e1); /* Must have an end for looking at s1 */
	5534	}
	5535
	5536	/* Same for goal for s2 */
	5537	if (g2) {
	5538	assert(! e2 \|\| e2 >= g2);
	5539	e2 = g2;
	5540	}
	5541	else {
	5542	assert(e2);
	5543	}
	5544
	5545	/* If both operands are already folded, we could just do a memEQ on the
	5546	* whole strings at once, but it would be better if the caller realized
	5547	* this and didn't even call us */
	5548
	5549	/* Look through both strings, a character at a time */
	5550	while (p1 < e1 && p2 < e2) {
	5551
	5552	/* If at the beginning of a new character in s1, get its fold to use
	5553	* and the length of the fold. */
	5554	if (n1 == 0) {
	5555	if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
	5556	f1 = (U8 *) p1;
	5557	assert(u1);
	5558	n1 = UTF8SKIP(f1);
	5559	}
	5560	else {
	5561	if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
	5562
	5563	/* We have to forbid mixing ASCII with non-ASCII if the
	5564	* flags so indicate. And, we can short circuit having to
	5565	* call the general functions for this common ASCII case,
	5566	* all of whose non-locale folds are also ASCII, and hence
	5567	* UTF-8 invariants, so the UTF8ness of the strings is not
	5568	* relevant. */
	5569	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
	5570	return 0;
	5571	}
	5572	n1 = 1;
	5573	foldbuf1 = toFOLD(p1);
	5574	}
	5575	else if (u1) {
	5576	_toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
	5577	}
	5578	else { /* Not UTF-8, get UTF-8 fold */
	5579	_to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
	5580	}
	5581	f1 = foldbuf1;
	5582	}
	5583	}
	5584
	5585	if (n2 == 0) { /* Same for s2 */
	5586	if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
	5587	f2 = (U8 *) p2;
	5588	assert(u2);
	5589	n2 = UTF8SKIP(f2);
	5590	}
	5591	else {
	5592	if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
	5593	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
	5594	return 0;
	5595	}
	5596	n2 = 1;
	5597	foldbuf2 = toFOLD(p2);
	5598	}
	5599	else if (u2) {
	5600	_toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
	5601	}
	5602	else {
	5603	_to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
	5604	}
	5605	f2 = foldbuf2;
	5606	}
	5607	}
	5608
	5609	/* Here f1 and f2 point to the beginning of the strings to compare.
	5610	* These strings are the folds of the next character from each input
	5611	* string, stored in UTF-8. */
	5612
	5613	/* While there is more to look for in both folds, see if they
	5614	* continue to match */
	5615	while (n1 && n2) {
	5616	U8 fold_length = UTF8SKIP(f1);
	5617	if (fold_length != UTF8SKIP(f2)
	5618	\|\| (fold_length == 1 && f1 != f2) /* Short circuit memNE
	5619	function call for single
	5620	byte */
	5621	\|\| memNE((char)f1, (char)f2, fold_length))
	5622	{
	5623	return 0; /* mismatch */
	5624	}
	5625
	5626	/* Here, they matched, advance past them */
	5627	n1 -= fold_length;
	5628	f1 += fold_length;
	5629	n2 -= fold_length;
	5630	f2 += fold_length;
	5631	}
	5632
	5633	/* When reach the end of any fold, advance the input past it */
	5634	if (n1 == 0) {
	5635	p1 += u1 ? UTF8SKIP(p1) : 1;
	5636	}
	5637	if (n2 == 0) {
	5638	p2 += u2 ? UTF8SKIP(p2) : 1;
	5639	}
	5640	} /* End of loop through both strings */
	5641
	5642	/* A match is defined by each scan that specified an explicit length
	5643	* reaching its final goal, and the other not having matched a partial
	5644	* character (which can happen when the fold of a character is more than one
	5645	* character). */
	5646	if (! ((g1 == 0 \|\| p1 == g1) && (g2 == 0 \|\| p2 == g2)) \|\| n1 \|\| n2) {
	5647	return 0;
	5648	}
	5649
	5650	/* Successful match. Set output pointers */
	5651	if (pe1) {
	5652	pe1 = (char)p1;
	5653	}
	5654	if (pe2) {
	5655	pe2 = (char)p2;
	5656	}
	5657	return 1;
	5658	}
	5659
	5660	/* XXX The next two functions should likely be moved to mathoms.c once all
	5661	* occurrences of them are removed from the core; some cpan-upstream modules
	5662	* still use them */
	5663
	5664	U8 *
	5665	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
	5666	{
	5667	PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
	5668
	5669	return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0);
	5670	}
	5671
	5672	/*
	5673	=for apidoc utf8n_to_uvuni
	5674
	5675	Instead use L</utf8_to_uvchr_buf>, or rarely, L</utf8n_to_uvchr>.
	5676
	5677	This function was useful for code that wanted to handle both EBCDIC and
	5678	ASCII platforms with Unicode properties, but starting in Perl v5.20, the
	5679	distinctions between the platforms have mostly been made invisible to most
	5680	code, so this function is quite unlikely to be what you want. If you do need
	5681	this precise functionality, use instead
	5682	C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>>
	5683	or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|/utf8n_to_uvchr>>.
	5684
	5685	=cut
	5686	*/
	5687
	5688	UV
	5689	Perl_utf8n_to_uvuni(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	5690	{
	5691	PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
	5692
	5693	return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
	5694	}
	5695
	5696	/*
	5697	=for apidoc uvuni_to_utf8_flags
	5698
	5699	Instead you almost certainly want to use L</uvchr_to_utf8> or
	5700	L</uvchr_to_utf8_flags>.
	5701
	5702	This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
	5703	which itself, while not deprecated, should be used only in isolated
	5704	circumstances. These functions were useful for code that wanted to handle
	5705	both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
	5706	v5.20, the distinctions between the platforms have mostly been made invisible
	5707	to most code, so this function is quite unlikely to be what you want.
	5708
	5709	=cut
	5710	*/
	5711
	5712	U8 *
	5713	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	5714	{
	5715	PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
	5716
	5717	return uvoffuni_to_utf8_flags(d, uv, flags);
	5718	}
	5719
	5720	/*
	5721	* ex: set ts=8 sts=4 sw=4 et:
	5722	*/