perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
	17	*
	18	* 'Well do I understand your speech,' he answered in the same language;
	19	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	20	* as is the custom in the West, if you wish to be answered?'
	21	* --Gandalf, addressing Théoden's door wardens
	22	*
	23	* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	24	*
	25	* ...the travellers perceived that the floor was paved with stones of many
	26	* hues; branching runes and strange devices intertwined beneath their feet.
	27	*
	28	* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	29	*/
	30
	31	#include "EXTERN.h"
	32	#define PERL_IN_UTF8_C
	33	#include "perl.h"
	34	#include "invlist_inline.h"
	35
	36	static const char malformed_text[] = "Malformed UTF-8 character";
	37	static const char unees[] =
	38	"Malformed UTF-8 character (unexpected end of string)";
	39
	40	/*
	41	These are various utility functions for manipulating UTF8-encoded
	42	strings. For the uninitiated, this is a method of representing arbitrary
	43	Unicode characters as a variable number of bytes, in such a way that
	44	characters in the ASCII range are unmodified, and a zero byte never appears
	45	within non-zero characters.
	46	*/
	47
	48	/* helper for Perl__force_out_malformed_utf8_message(). Like
	49	* SAVECOMPILEWARNINGS(), but works with PL_curcop rather than
	50	* PL_compiling */
	51
	52	static void
	53	S_restore_cop_warnings(pTHX_ void *p)
	54	{
	55	free_and_set_cop_warnings(PL_curcop, (STRLEN*) p);
	56	}
	57
	58
	59	void
	60	Perl__force_out_malformed_utf8_message(pTHX_
	61	const U8 const p, / First byte in UTF-8 sequence */
	62	const U8 * const e, /* Final byte in sequence (may include
	63	multiple chars */
	64	const U32 flags, /* Flags to pass to utf8n_to_uvchr(),
	65	usually 0, or some DISALLOW flags */
	66	const bool die_here) /* If TRUE, this function does not return */
	67	{
	68	/* This core-only function is to be called when a malformed UTF-8 character
	69	* is found, in order to output the detailed information about the
	70	* malformation before dieing. The reason it exists is for the occasions
	71	* when such a malformation is fatal, but warnings might be turned off, so
	72	* that normally they would not be actually output. This ensures that they
	73	* do get output. Because a sequence may be malformed in more than one
	74	* way, multiple messages may be generated, so we can't make them fatal, as
	75	* that would cause the first one to die.
	76	*
	77	* Instead we pretend -W was passed to perl, then die afterwards. The
	78	* flexibility is here to return to the caller so they can finish up and
	79	* die themselves */
	80	U32 errors;
	81
	82	PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
	83
	84	ENTER;
	85	SAVEI8(PL_dowarn);
	86	SAVESPTR(PL_curcop);
	87
	88	PL_dowarn = G_WARN_ALL_ON\|G_WARN_ON;
	89	if (PL_curcop) {
	90	/* this is like SAVECOMPILEWARNINGS() except with PL_curcop rather
	91	* than PL_compiling */
	92	SAVEDESTRUCTOR_X(S_restore_cop_warnings,
	93	(void*)PL_curcop->cop_warnings);
	94	PL_curcop->cop_warnings = pWARN_ALL;
	95	}
	96
	97	(void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
	98
	99	LEAVE;
	100
	101	if (! errors) {
	102	Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
	103	" be called only when there are errors found");
	104	}
	105
	106	if (die_here) {
	107	Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
	108	}
	109	}
	110
	111	STATIC HV *
	112	S_new_msg_hv(pTHX_ const char * const message, /* The message text */
	113	U32 categories, /* Packed warning categories */
	114	U32 flag) /* Flag associated with this message */
	115	{
	116	/* Creates, populates, and returns an HV* that describes an error message
	117	* for the translators between UTF8 and code point */
	118
	119	SV* msg_sv = newSVpv(message, 0);
	120	SV* category_sv = newSVuv(categories);
	121	SV* flag_bit_sv = newSVuv(flag);
	122
	123	HV* msg_hv = newHV();
	124
	125	PERL_ARGS_ASSERT_NEW_MSG_HV;
	126
	127	(void) hv_stores(msg_hv, "text", msg_sv);
	128	(void) hv_stores(msg_hv, "warn_categories", category_sv);
	129	(void) hv_stores(msg_hv, "flag_bit", flag_bit_sv);
	130
	131	return msg_hv;
	132	}
	133
	134	/*
	135	=for apidoc uvoffuni_to_utf8_flags
	136
	137	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	138	Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or
	139	L<perlapi/uvchr_to_utf8_flags>>.
	140
	141	This function is like them, but the input is a strict Unicode
	142	(as opposed to native) code point. Only in very rare circumstances should code
	143	not be using the native code point.
	144
	145	For details, see the description for L<perlapi/uvchr_to_utf8_flags>.
	146
	147	=cut
	148	*/
	149
	150	U8 *
	151	Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
	152	{
	153	PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
	154
	155	return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
	156	}
	157
	158	/* All these formats take a single UV code point argument */
	159	const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
	160	const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
	161	" is not recommended for open interchange";
	162	const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
	163	" may not be portable";
	164
	165	#define HANDLE_UNICODE_SURROGATE(uv, flags, msgs) \
	166	STMT_START { \
	167	if (flags & UNICODE_WARN_SURROGATE) { \
	168	U32 category = packWARN(WARN_SURROGATE); \
	169	const char * format = surrogate_cp_format; \
	170	if (msgs) { \
	171	*msgs = new_msg_hv(Perl_form(aTHX_ format, uv), \
	172	category, \
	173	UNICODE_GOT_SURROGATE); \
	174	} \
	175	else { \
	176	Perl_ck_warner_d(aTHX_ category, format, uv); \
	177	} \
	178	} \
	179	if (flags & UNICODE_DISALLOW_SURROGATE) { \
	180	return NULL; \
	181	} \
	182	} STMT_END;
	183
	184	#define HANDLE_UNICODE_NONCHAR(uv, flags, msgs) \
	185	STMT_START { \
	186	if (flags & UNICODE_WARN_NONCHAR) { \
	187	U32 category = packWARN(WARN_NONCHAR); \
	188	const char * format = nonchar_cp_format; \
	189	if (msgs) { \
	190	*msgs = new_msg_hv(Perl_form(aTHX_ format, uv), \
	191	category, \
	192	UNICODE_GOT_NONCHAR); \
	193	} \
	194	else { \
	195	Perl_ck_warner_d(aTHX_ category, format, uv); \
	196	} \
	197	} \
	198	if (flags & UNICODE_DISALLOW_NONCHAR) { \
	199	return NULL; \
	200	} \
	201	} STMT_END;
	202
	203	/* Use shorter names internally in this file */
	204	#define SHIFT UTF_ACCUMULATION_SHIFT
	205	#undef MARK
	206	#define MARK UTF_CONTINUATION_MARK
	207	#define MASK UTF_CONTINUATION_MASK
	208
	209	/*
	210	=for apidoc uvchr_to_utf8_flags_msgs
	211
	212	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	213
	214	Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
	215
	216	This function is for code that wants any warning and/or error messages to be
	217	returned to the caller rather than be displayed. All messages that would have
	218	been displayed if all lexical warnings are enabled will be returned.
	219
	220	It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
	221	placed after all the others, C<msgs>. If this parameter is 0, this function
	222	behaves identically to C<L</uvchr_to_utf8_flags>>. Otherwise, C<msgs> should
	223	be a pointer to an C<HV *> variable, in which this function creates a new HV to
	224	contain any appropriate messages. The hash has three key-value pairs, as
	225	follows:
	226
	227	=over 4
	228
	229	=item C<text>
	230
	231	The text of the message as a C<SVpv>.
	232
	233	=item C<warn_categories>
	234
	235	The warning category (or categories) packed into a C<SVuv>.
	236
	237	=item C<flag_bit>
	238
	239	A single flag bit associated with this message, in a C<SVuv>.
	240	The bit corresponds to some bit in the C<*errors> return value,
	241	such as C<UNICODE_GOT_SURROGATE>.
	242
	243	=back
	244
	245	It's important to note that specifying this parameter as non-null will cause
	246	any warnings this function would otherwise generate to be suppressed, and
	247	instead be placed in C<*msgs>. The caller can check the lexical warnings state
	248	(or not) when choosing what to do with the returned messages.
	249
	250	The caller, of course, is responsible for freeing any returned HV.
	251
	252	=cut
	253	*/
	254
	255	/* Undocumented; we don't want people using this. Instead they should use
	256	* uvchr_to_utf8_flags_msgs() */
	257	U8 *
	258	Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 d, UV uv, const UV flags, HV* msgs)
	259	{
	260	PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
	261
	262	if (msgs) {
	263	*msgs = NULL;
	264	}
	265
	266	if (OFFUNI_IS_INVARIANT(uv)) {
	267	*d++ = LATIN1_TO_NATIVE(uv);
	268	return d;
	269	}
	270
	271	if (uv <= MAX_UTF8_TWO_BYTE) {
	272	*d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) \| UTF_START_MARK(2));
	273	*d++ = I8_TO_NATIVE_UTF8(( uv & MASK) \| MARK);
	274	return d;
	275	}
	276
	277	/* Not 2-byte; test for and handle 3-byte result. In the test immediately
	278	* below, the 16 is for start bytes E0-EF (which are all the possible ones
	279	* for 3 byte characters). The 2 is for 2 continuation bytes; these each
	280	* contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
	281	* on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
	282	* 0x800-0xFFFF on ASCII */
	283	if (uv < (16 * (1U << (2 * SHIFT)))) {
	284	d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) SHIFT)) \| UTF_START_MARK(3));
	285	d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) SHIFT)) & MASK) \| MARK);
	286	d++ = I8_TO_NATIVE_UTF8(( uv / (1 - 1) */ & MASK) \| MARK);
	287
	288	#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
	289	aren't tested here */
	290	/* The most likely code points in this range are below the surrogates.
	291	* Do an extra test to quickly exclude those. */
	292	if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
	293	if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
	294	\|\| UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
	295	{
	296	HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
	297	}
	298	else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
	299	HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
	300	}
	301	}
	302	#endif
	303	return d;
	304	}
	305
	306	/* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
	307	* platforms, and 0x4000 on EBCDIC. There are problematic cases that can
	308	* happen starting with 4-byte characters on ASCII platforms. We unify the
	309	* code for these with EBCDIC, even though some of them require 5-bytes on
	310	* those, because khw believes the code saving is worth the very slight
	311	* performance hit on these high EBCDIC code points. */
	312
	313	if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
	314	if (UNLIKELY( uv > MAX_LEGAL_CP
	315	&& ! (flags & UNICODE_ALLOW_ABOVE_IV_MAX)))
	316	{
	317	Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv));
	318	}
	319	if ( (flags & UNICODE_WARN_SUPER)
	320	\|\| ( (flags & UNICODE_WARN_PERL_EXTENDED)
	321	&& UNICODE_IS_PERL_EXTENDED(uv)))
	322	{
	323	const char * format = super_cp_format;
	324	U32 category = packWARN(WARN_NON_UNICODE);
	325	U32 flag = UNICODE_GOT_SUPER;
	326
	327	/* Choose the more dire applicable warning */
	328	if (UNICODE_IS_PERL_EXTENDED(uv)) {
	329	format = PL_extended_cp_format;
	330	category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
	331	if (flags & (UNICODE_WARN_PERL_EXTENDED
	332	\|UNICODE_DISALLOW_PERL_EXTENDED))
	333	{
	334	flag = UNICODE_GOT_PERL_EXTENDED;
	335	}
	336	}
	337
	338	if (msgs) {
	339	*msgs = new_msg_hv(Perl_form(aTHX_ format, uv),
	340	category, flag);
	341	}
	342	else if ( ckWARN_d(WARN_NON_UNICODE)
	343	\|\| ( (flag & UNICODE_GOT_PERL_EXTENDED)
	344	&& ckWARN(WARN_PORTABLE)))
	345	{
	346	Perl_warner(aTHX_ category, format, uv);
	347	}
	348	}
	349	if ( (flags & UNICODE_DISALLOW_SUPER)
	350	\|\| ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
	351	&& UNICODE_IS_PERL_EXTENDED(uv)))
	352	{
	353	return NULL;
	354	}
	355	}
	356	else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
	357	HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
	358	}
	359
	360	/* Test for and handle 4-byte result. In the test immediately below, the
	361	* 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
	362	* characters). The 3 is for 3 continuation bytes; these each contribute
	363	* SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
	364	* ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
	365	* 0x1_0000-0x1F_FFFF on ASCII */
	366	if (uv < (8 * (1U << (3 * SHIFT)))) {
	367	d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) SHIFT)) \| UTF_START_MARK(4));
	368	d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) SHIFT)) & MASK) \| MARK);
	369	d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) SHIFT)) & MASK) \| MARK);
	370	d++ = I8_TO_NATIVE_UTF8(( uv / (1 - 1) */ & MASK) \| MARK);
	371
	372	#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
	373	characters. The end-plane non-characters for EBCDIC were
	374	handled just above */
	375	if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
	376	HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
	377	}
	378	else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
	379	HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
	380	}
	381	#endif
	382
	383	return d;
	384	}
	385
	386	/* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
	387	* platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
	388	* format. The unrolled version above turns out to not save all that much
	389	* time, and at these high code points (well above the legal Unicode range
	390	* on ASCII platforms, and well above anything in common use in EBCDIC),
	391	* khw believes that less code outweighs slight performance gains. */
	392
	393	{
	394	STRLEN len = OFFUNISKIP(uv);
	395	U8 *p = d+len-1;
	396	while (p > d) {
	397	*p-- = I8_TO_NATIVE_UTF8((uv & MASK) \| MARK);
	398	uv >>= SHIFT;
	399	}
	400	*p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	401	return d+len;
	402	}
	403	}
	404
	405	/*
	406	=for apidoc uvchr_to_utf8
	407
	408	Adds the UTF-8 representation of the native code point C<uv> to the end
	409	of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
	410	C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
	411	the byte after the end of the new character. In other words,
	412
	413	d = uvchr_to_utf8(d, uv);
	414
	415	is the recommended wide native character-aware way of saying
	416
	417	*(d++) = uv;
	418
	419	This function accepts any code point from 0..C<IV_MAX> as input.
	420	C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
	421
	422	It is possible to forbid or warn on non-Unicode code points, or those that may
	423	be problematic by using L</uvchr_to_utf8_flags>.
	424
	425	=cut
	426	*/
	427
	428	/* This is also a macro */
	429	PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
	430
	431	U8 *
	432	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	433	{
	434	return uvchr_to_utf8(d, uv);
	435	}
	436
	437	/*
	438	=for apidoc uvchr_to_utf8_flags
	439
	440	Adds the UTF-8 representation of the native code point C<uv> to the end
	441	of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
	442	C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
	443	the byte after the end of the new character. In other words,
	444
	445	d = uvchr_to_utf8_flags(d, uv, flags);
	446
	447	or, in most cases,
	448
	449	d = uvchr_to_utf8_flags(d, uv, 0);
	450
	451	This is the Unicode-aware way of saying
	452
	453	*(d++) = uv;
	454
	455	If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
	456	input. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
	457
	458	Specifying C<flags> can further restrict what is allowed and not warned on, as
	459	follows:
	460
	461	If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
	462	the function will raise a warning, provided UTF8 warnings are enabled. If
	463	instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
	464	NULL. If both flags are set, the function will both warn and return NULL.
	465
	466	Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
	467	affect how the function handles a Unicode non-character.
	468
	469	And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
	470	affect the handling of code points that are above the Unicode maximum of
	471	0x10FFFF. Languages other than Perl may not be able to accept files that
	472	contain these.
	473
	474	The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
	475	the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
	476	three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
	477	allowed inputs to the strict UTF-8 traditionally defined by Unicode.
	478	Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
	479	C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
	480	above-Unicode and surrogate flags, but not the non-character ones, as
	481	defined in
	482	L<Unicode Corrigendum #9\|https://www.unicode.org/versions/corrigendum9.html>.
	483	See L<perlunicode/Noncharacter code points>.
	484
	485	Extremely high code points were never specified in any standard, and require an
	486	extension to UTF-8 to express, which Perl does. It is likely that programs
	487	written in something other than Perl would not be able to read files that
	488	contain these; nor would Perl understand files written by something that uses a
	489	different extension. For these reasons, there is a separate set of flags that
	490	can warn and/or disallow these extremely high code points, even if other
	491	above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
	492	and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
	493	C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
	494	treat all above-Unicode code points, including these, as malformations. (Note
	495	that the Unicode standard considers anything above 0x10FFFF to be illegal, but
	496	there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
	497
	498	A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
	499	retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly,
	500	C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
	501	C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because on EBCDIC
	502	platforms, these flags can apply to code points that actually do fit in 31 bits.
	503	The new names accurately describe the situation in all cases.
	504
	505	=cut
	506	*/
	507
	508	/* This is also a macro */
	509	PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
	510
	511	U8 *
	512	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	513	{
	514	return uvchr_to_utf8_flags(d, uv, flags);
	515	}
	516
	517	#ifndef UV_IS_QUAD
	518
	519	STATIC int
	520	S_is_utf8_cp_above_31_bits(const U8 * const s,
	521	const U8 * const e,
	522	const bool consider_overlongs)
	523	{
	524	/* Returns TRUE if the first code point represented by the Perl-extended-
	525	* UTF-8-encoded string starting at 's', and looking no further than 'e -
	526	* 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
	527	*
	528	* The function handles the case where the input bytes do not include all
	529	* the ones necessary to represent a full character. That is, they may be
	530	* the intial bytes of the representation of a code point, but possibly
	531	* the final ones necessary for the complete representation may be beyond
	532	* 'e - 1'.
	533	*
	534	* The function also can handle the case where the input is an overlong
	535	* sequence. If 'consider_overlongs' is 0, the function assumes the
	536	* input is not overlong, without checking, and will return based on that
	537	* assumption. If this parameter is 1, the function will go to the trouble
	538	* of figuring out if it actually evaluates to above or below 31 bits.
	539	*
	540	* The sequence is otherwise assumed to be well-formed, without checking.
	541	*/
	542
	543	const STRLEN len = e - s;
	544	int is_overlong;
	545
	546	PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
	547
	548	assert(! UTF8_IS_INVARIANT(*s) && e > s);
	549
	550	#ifdef EBCDIC
	551
	552	PERL_UNUSED_ARG(consider_overlongs);
	553
	554	/* On the EBCDIC code pages we handle, only the native start byte 0xFE can
	555	* mean a 32-bit or larger code point (0xFF is an invariant). 0xFE can
	556	* also be the start byte for a 31-bit code point; we need at least 2
	557	* bytes, and maybe up through 8 bytes, to determine that. (It can also be
	558	* the start byte for an overlong sequence, but for 30-bit or smaller code
	559	* points, so we don't have to worry about overlongs on EBCDIC.) */
	560	if (*s != 0xFE) {
	561	return 0;
	562	}
	563
	564	if (len == 1) {
	565	return -1;
	566	}
	567
	568	#else
	569
	570	/* On ASCII, FE and FF are the only start bytes that can evaluate to
	571	* needing more than 31 bits. */
	572	if (LIKELY(*s < 0xFE)) {
	573	return 0;
	574	}
	575
	576	/* What we have left are FE and FF. Both of these require more than 31
	577	* bits unless they are for overlongs. */
	578	if (! consider_overlongs) {
	579	return 1;
	580	}
	581
	582	/* Here, we have FE or FF. If the input isn't overlong, it evaluates to
	583	* above 31 bits. But we need more than one byte to discern this, so if
	584	* passed just the start byte, it could be an overlong evaluating to
	585	* smaller */
	586	if (len == 1) {
	587	return -1;
	588	}
	589
	590	/* Having excluded len==1, and knowing that FE and FF are both valid start
	591	* bytes, we can call the function below to see if the sequence is
	592	* overlong. (We don't need the full generality of the called function,
	593	* but for these huge code points, speed shouldn't be a consideration, and
	594	* the compiler does have enough information, since it's static to this
	595	* file, to optimize to just the needed parts.) */
	596	is_overlong = is_utf8_overlong_given_start_byte_ok(s, len);
	597
	598	/* If it isn't overlong, more than 31 bits are required. */
	599	if (is_overlong == 0) {
	600	return 1;
	601	}
	602
	603	/* If it is indeterminate if it is overlong, return that */
	604	if (is_overlong < 0) {
	605	return -1;
	606	}
	607
	608	/* Here is overlong. Such a sequence starting with FE is below 31 bits, as
	609	* the max it can be is 2*31 - 1 /
	610	if (*s == 0xFE) {
	611	return 0;
	612	}
	613
	614	#endif
	615
	616	/* Here, ASCII and EBCDIC rejoin:
	617	* On ASCII: We have an overlong sequence starting with FF
	618	* On EBCDIC: We have a sequence starting with FE. */
	619
	620	{ /* For C89, use a block so the declaration can be close to its use */
	621
	622	#ifdef EBCDIC
	623
	624	/* U+7FFFFFFF (2 ** 31 - 1)
	625	* [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
	626	* IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
	627	* IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
	628	* POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
	629	* I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
	630	* U+80000000 (2 ** 31):
	631	* IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
	632	* IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
	633	* POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
	634	* I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
	635	*
	636	* and since we know that *s = \xfe, any continuation sequcence
	637	* following it that is gt the below is above 31 bits
	638	[0] [1] [2] [3] [4] [5] [6] */
	639	const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
	640
	641	#else
	642
	643	/* FF overlong for U+7FFFFFFF (2 ** 31 - 1)
	644	* ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF
	645	* FF overlong for U+80000000 (2 ** 31):
	646	* ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80
	647	* and since we know that *s = \xff, any continuation sequcence
	648	* following it that is gt the below is above 30 bits
	649	[0] [1] [2] [3] [4] [5] [6] */
	650	const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
	651
	652
	653	#endif
	654	const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
	655	const STRLEN cmp_len = MIN(conts_len, len - 1);
	656
	657	/* Now compare the continuation bytes in s with the ones we have
	658	* compiled in that are for the largest 30 bit code point. If we have
	659	* enough bytes available to determine the answer, or the bytes we do
	660	* have differ from them, we can compare the two to get a definitive
	661	* answer (Note that in UTF-EBCDIC, the two lowest possible
	662	* continuation bytes are \x41 and \x42.) */
	663	if (cmp_len >= conts_len \|\| memNE(s + 1,
	664	conts_for_highest_30_bit,
	665	cmp_len))
	666	{
	667	return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len));
	668	}
	669
	670	/* Here, all the bytes we have are the same as the highest 30-bit code
	671	* point, but we are missing so many bytes that we can't make the
	672	* determination */
	673	return -1;
	674	}
	675	}
	676
	677	#endif
	678
	679	PERL_STATIC_INLINE int
	680	S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
	681	{
	682	/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
	683	* 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
	684	* it isn't, and -1 if there isn't enough information to tell. This last
	685	* return value can happen if the sequence is incomplete, missing some
	686	* trailing bytes that would form a complete character. If there are
	687	* enough bytes to make a definitive decision, this function does so.
	688	* Usually 2 bytes sufficient.
	689	*
	690	* Overlongs can occur whenever the number of continuation bytes changes.
	691	* That means whenever the number of leading 1 bits in a start byte
	692	* increases from the next lower start byte. That happens for start bytes
	693	* C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following illegal
	694	* start bytes have already been excluded, so don't need to be tested here;
	695	* ASCII platforms: C0, C1
	696	* EBCDIC platforms C0, C1, C2, C3, C4, E0
	697	*/
	698
	699	const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
	700	const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
	701
	702	PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
	703	assert(len > 1 && UTF8_IS_START(*s));
	704
	705	/* Each platform has overlongs after the start bytes given above (expressed
	706	* in I8 for EBCDIC). What constitutes an overlong varies by platform, but
	707	* the logic is the same, except the E0 overlong has already been excluded
	708	* on EBCDIC platforms. The values below were found by manually
	709	* inspecting the UTF-8 patterns. See the tables in utf8.h and
	710	* utfebcdic.h. */
	711
	712	# ifdef EBCDIC
	713	# define F0_ABOVE_OVERLONG 0xB0
	714	# define F8_ABOVE_OVERLONG 0xA8
	715	# define FC_ABOVE_OVERLONG 0xA4
	716	# define FE_ABOVE_OVERLONG 0xA2
	717	# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
	718	/* I8(0xfe) is FF */
	719	# else
	720
	721	if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
	722	return 1;
	723	}
	724
	725	# define F0_ABOVE_OVERLONG 0x90
	726	# define F8_ABOVE_OVERLONG 0x88
	727	# define FC_ABOVE_OVERLONG 0x84
	728	# define FE_ABOVE_OVERLONG 0x82
	729	# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
	730	# endif
	731
	732
	733	if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
	734	\|\| (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
	735	\|\| (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
	736	\|\| (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
	737	{
	738	return 1;
	739	}
	740
	741	/* Check for the FF overlong */
	742	return isFF_OVERLONG(s, len);
	743	}
	744
	745	PERL_STATIC_INLINE int
	746	S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
	747	{
	748	/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
	749	* 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if
	750	* it isn't, and -1 if there isn't enough information to tell. This last
	751	* return value can happen if the sequence is incomplete, missing some
	752	* trailing bytes that would form a complete character. If there are
	753	* enough bytes to make a definitive decision, this function does so. */
	754
	755	PERL_ARGS_ASSERT_ISFF_OVERLONG;
	756
	757	/* To be an FF overlong, all the available bytes must match */
	758	if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
	759	MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
	760	{
	761	return 0;
	762	}
	763
	764	/* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
	765	* be there; what comes after them doesn't matter. See tables in utf8.h,
	766	* utfebcdic.h. */
	767	if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
	768	return 1;
	769	}
	770
	771	/* The missing bytes could cause the result to go one way or the other, so
	772	* the result is indeterminate */
	773	return -1;
	774	}
	775
	776	#if defined(UV_IS_QUAD) /* These assume IV_MAX is 2*63-1 /
	777	# ifdef EBCDIC /* Actually is I8 */
	778	# define HIGHEST_REPRESENTABLE_UTF8 \
	779	"\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	780	# else
	781	# define HIGHEST_REPRESENTABLE_UTF8 \
	782	"\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	783	# endif
	784	#endif
	785
	786	PERL_STATIC_INLINE int
	787	S_does_utf8_overflow(const U8 * const s,
	788	const U8 * e,
	789	const bool consider_overlongs)
	790	{
	791	/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
	792	* 'e' - 1 would overflow an IV on this platform; that is if it represents
	793	* a code point larger than the highest representable code point. It
	794	* returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
	795	* enough information to tell. This last return value can happen if the
	796	* sequence is incomplete, missing some trailing bytes that would form a
	797	* complete character. If there are enough bytes to make a definitive
	798	* decision, this function does so.
	799	*
	800	* If 'consider_overlongs' is TRUE, the function checks for the possibility
	801	* that the sequence is an overlong that doesn't overflow. Otherwise, it
	802	* assumes the sequence is not an overlong. This can give different
	803	* results only on ASCII 32-bit platforms.
	804	*
	805	* (For ASCII platforms, we could use memcmp() because we don't have to
	806	* convert each byte to I8, but it's very rare input indeed that would
	807	* approach overflow, so the loop below will likely only get executed once.)
	808	*
	809	* 'e' - 1 must not be beyond a full character. */
	810
	811
	812	PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
	813	assert(s <= e && s + UTF8SKIP(s) >= e);
	814
	815	#if ! defined(UV_IS_QUAD)
	816
	817	return is_utf8_cp_above_31_bits(s, e, consider_overlongs);
	818
	819	#else
	820
	821	PERL_UNUSED_ARG(consider_overlongs);
	822
	823	{
	824	const STRLEN len = e - s;
	825	const U8 *x;
	826	const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
	827
	828	for (x = s; x < e; x++, y++) {
	829
	830	if (UNLIKELY(NATIVE_UTF8_TO_I8(x) == y)) {
	831	continue;
	832	}
	833
	834	/* If this byte is larger than the corresponding highest UTF-8
	835	* byte, the sequence overflow; otherwise the byte is less than,
	836	* and so the sequence doesn't overflow */
	837	return NATIVE_UTF8_TO_I8(x) > y;
	838
	839	}
	840
	841	/* Got to the end and all bytes are the same. If the input is a whole
	842	* character, it doesn't overflow. And if it is a partial character,
	843	* there's not enough information to tell */
	844	if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
	845	return -1;
	846	}
	847
	848	return 0;
	849	}
	850
	851	#endif
	852
	853	}
	854
	855	#if 0
	856
	857	/* These are the portions of the above function that deal with UV_MAX instead
	858	* of IV_MAX. They are left here in case we want to combine them so that
	859	* internal uses can have larger code points. The only logic difference is
	860	* that the 32-bit EBCDIC platform is treated like the 64-bit, and the 32-bit
	861	* ASCII has different logic.
	862	*/
	863
	864	/* Anything larger than this will overflow the word if it were converted into a UV */
	865	#if defined(UV_IS_QUAD)
	866	# ifdef EBCDIC /* Actually is I8 */
	867	# define HIGHEST_REPRESENTABLE_UTF8 \
	868	"\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	869	# else
	870	# define HIGHEST_REPRESENTABLE_UTF8 \
	871	"\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	872	# endif
	873	#else /* 32-bit */
	874	# ifdef EBCDIC
	875	# define HIGHEST_REPRESENTABLE_UTF8 \
	876	"\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
	877	# else
	878	# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
	879	# endif
	880	#endif
	881
	882	#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
	883
	884	/* On 32 bit ASCII machines, many overlongs that start with FF don't
	885	* overflow */
	886	if (consider_overlongs && isFF_OVERLONG(s, len) > 0) {
	887
	888	/* To be such an overlong, the first bytes of 's' must match
	889	* FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80". If we
	890	* don't have any additional bytes available, the sequence, when
	891	* completed might or might not fit in 32 bits. But if we have that
	892	* next byte, we can tell for sure. If it is <= 0x83, then it does
	893	* fit. */
	894	if (len <= sizeof(FF_OVERLONG_PREFIX) - 1) {
	895	return -1;
	896	}
	897
	898	return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83;
	899	}
	900
	901	/* Starting with the #else, the rest of the function is identical except
	902	* 1. we need to move the 'len' declaration to be global to the function
	903	* 2. the endif move to just after the UNUSED_ARG.
	904	* An empty endif is given just below to satisfy the preprocessor
	905	*/
	906	#endif
	907
	908	#endif
	909
	910	#undef F0_ABOVE_OVERLONG
	911	#undef F8_ABOVE_OVERLONG
	912	#undef FC_ABOVE_OVERLONG
	913	#undef FE_ABOVE_OVERLONG
	914	#undef FF_OVERLONG_PREFIX
	915
	916	STRLEN
	917	Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
	918	{
	919	STRLEN len;
	920	const U8 *x;
	921
	922	/* A helper function that should not be called directly.
	923	*
	924	* This function returns non-zero if the string beginning at 's' and
	925	* looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
	926	* code point; otherwise it returns 0. The examination stops after the
	927	* first code point in 's' is validated, not looking at the rest of the
	928	* input. If 'e' is such that there are not enough bytes to represent a
	929	* complete code point, this function will return non-zero anyway, if the
	930	* bytes it does have are well-formed UTF-8 as far as they go, and aren't
	931	* excluded by 'flags'.
	932	*
	933	* A non-zero return gives the number of bytes required to represent the
	934	* code point. Be aware that if the input is for a partial character, the
	935	* return will be larger than 'e - s'.
	936	*
	937	* This function assumes that the code point represented is UTF-8 variant.
	938	* The caller should have excluded the possibility of it being invariant
	939	* before calling this function.
	940	*
	941	* 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
	942	* accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
	943	* 0 if the code point represented is well-formed Perl-extended-UTF-8, but
	944	* disallowed by the flags. If the input is only for a partial character,
	945	* the function will return non-zero if there is any sequence of
	946	* well-formed UTF-8 that, when appended to the input sequence, could
	947	* result in an allowed code point; otherwise it returns 0. Non characters
	948	* cannot be determined based on partial character input. But many of the
	949	* other excluded types can be determined with just the first one or two
	950	* bytes.
	951	*
	952	*/
	953
	954	PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER;
	955
	956	assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
	957	\|UTF8_DISALLOW_PERL_EXTENDED)));
	958	assert(! UTF8_IS_INVARIANT(*s));
	959
	960	/* A variant char must begin with a start byte */
	961	if (UNLIKELY(! UTF8_IS_START(*s))) {
	962	return 0;
	963	}
	964
	965	/* Examine a maximum of a single whole code point */
	966	if (e - s > UTF8SKIP(s)) {
	967	e = s + UTF8SKIP(s);
	968	}
	969
	970	len = e - s;
	971
	972	if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
	973	const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
	974
	975	/* Here, we are disallowing some set of largish code points, and the
	976	* first byte indicates the sequence is for a code point that could be
	977	* in the excluded set. We generally don't have to look beyond this or
	978	* the second byte to see if the sequence is actually for one of the
	979	* excluded classes. The code below is derived from this table:
	980	*
	981	* UTF-8 UTF-EBCDIC I8
	982	* U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
	983	* U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
	984	* U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
	985	*
	986	* Keep in mind that legal continuation bytes range between \x80..\xBF
	987	* for UTF-8, and \xA0..\xBF for I8. Anything above those aren't
	988	* continuation bytes. Hence, we don't have to test the upper edge
	989	* because if any of those is encountered, the sequence is malformed,
	990	* and would fail elsewhere in this function.
	991	*
	992	* The code here likewise assumes that there aren't other
	993	* malformations; again the function should fail elsewhere because of
	994	* these. For example, an overlong beginning with FC doesn't actually
	995	* have to be a super; it could actually represent a small code point,
	996	* even U+0000. But, since overlongs (and other malformations) are
	997	* illegal, the function should return FALSE in either case.
	998	*/
	999
	1000	#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
	1001	# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
	1002	# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
	1003
	1004	# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
	1005	/* B6 and B7 */ \
	1006	&& ((s1) & 0xFE ) == 0xB6)
	1007	# define isUTF8_PERL_EXTENDED(s) (*s == I8_TO_NATIVE_UTF8(0xFF))
	1008	#else
	1009	# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
	1010	# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
	1011	# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
	1012	# define isUTF8_PERL_EXTENDED(s) (*s >= 0xFE)
	1013	#endif
	1014
	1015	if ( (flags & UTF8_DISALLOW_SUPER)
	1016	&& UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
	1017	{
	1018	return 0; /* Above Unicode */
	1019	}
	1020
	1021	if ( (flags & UTF8_DISALLOW_PERL_EXTENDED)
	1022	&& UNLIKELY(isUTF8_PERL_EXTENDED(s)))
	1023	{
	1024	return 0;
	1025	}
	1026
	1027	if (len > 1) {
	1028	const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
	1029
	1030	if ( (flags & UTF8_DISALLOW_SUPER)
	1031	&& UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
	1032	{
	1033	return 0; /* Above Unicode */
	1034	}
	1035
	1036	if ( (flags & UTF8_DISALLOW_SURROGATE)
	1037	&& UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
	1038	{
	1039	return 0; /* Surrogate */
	1040	}
	1041
	1042	if ( (flags & UTF8_DISALLOW_NONCHAR)
	1043	&& UNLIKELY(UTF8_IS_NONCHAR(s, e)))
	1044	{
	1045	return 0; /* Noncharacter code point */
	1046	}
	1047	}
	1048	}
	1049
	1050	/* Make sure that all that follows are continuation bytes */
	1051	for (x = s + 1; x < e; x++) {
	1052	if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
	1053	return 0;
	1054	}
	1055	}
	1056
	1057	/* Here is syntactically valid. Next, make sure this isn't the start of an
	1058	* overlong. */
	1059	if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
	1060	return 0;
	1061	}
	1062
	1063	/* And finally, that the code point represented fits in a word on this
	1064	* platform */
	1065	if (0 < does_utf8_overflow(s, e,
	1066	0 /* Don't consider overlongs */
	1067	))
	1068	{
	1069	return 0;
	1070	}
	1071
	1072	return UTF8SKIP(s);
	1073	}
	1074
	1075	char *
	1076	Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
	1077	{
	1078	/* Returns a mortalized C string that is a displayable copy of the 'len'
	1079	* bytes starting at 'start'. 'format' gives how to display each byte.
	1080	* Currently, there are only two formats, so it is currently a bool:
	1081	* 0 \xab
	1082	* 1 ab (that is a space between two hex digit bytes)
	1083	*/
	1084
	1085	const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
	1086	trailing NUL */
	1087	const U8 * s = start;
	1088	const U8 * const e = start + len;
	1089	char * output;
	1090	char * d;
	1091
	1092	PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
	1093
	1094	Newx(output, output_len, char);
	1095	SAVEFREEPV(output);
	1096
	1097	d = output;
	1098	for (s = start; s < e; s++) {
	1099	const unsigned high_nibble = (*s & 0xF0) >> 4;
	1100	const unsigned low_nibble = (*s & 0x0F);
	1101
	1102	if (format) {
	1103	if (s > start) {
	1104	*d++ = ' ';
	1105	}
	1106	}
	1107	else {
	1108	*d++ = '\\';
	1109	*d++ = 'x';
	1110	}
	1111
	1112	if (high_nibble < 10) {
	1113	*d++ = high_nibble + '0';
	1114	}
	1115	else {
	1116	*d++ = high_nibble - 10 + 'a';
	1117	}
	1118
	1119	if (low_nibble < 10) {
	1120	*d++ = low_nibble + '0';
	1121	}
	1122	else {
	1123	*d++ = low_nibble - 10 + 'a';
	1124	}
	1125	}
	1126
	1127	*d = '\0';
	1128	return output;
	1129	}
	1130
	1131	PERL_STATIC_INLINE char *
	1132	S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
	1133
	1134	/* Max number of bytes to print */
	1135	STRLEN print_len,
	1136
	1137	/* Which one is the non-continuation */
	1138	const STRLEN non_cont_byte_pos,
	1139
	1140	/* How many bytes should there be? */
	1141	const STRLEN expect_len)
	1142	{
	1143	/* Return the malformation warning text for an unexpected continuation
	1144	* byte. */
	1145
	1146	const char * const where = (non_cont_byte_pos == 1)
	1147	? "immediately"
	1148	: Perl_form(aTHX_ "%d bytes",
	1149	(int) non_cont_byte_pos);
	1150	const U8 * x = s + non_cont_byte_pos;
	1151	const U8 * e = s + print_len;
	1152
	1153	PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
	1154
	1155	/* We don't need to pass this parameter, but since it has already been
	1156	* calculated, it's likely faster to pass it; verify under DEBUGGING */
	1157	assert(expect_len == UTF8SKIP(s));
	1158
	1159	/* As a defensive coding measure, don't output anything past a NUL. Such
	1160	* bytes shouldn't be in the middle of a malformation, and could mark the
	1161	* end of the allocated string, and what comes after is undefined */
	1162	for (; x < e; x++) {
	1163	if (*x == '\0') {
	1164	x++; /* Output this particular NUL */
	1165	break;
	1166	}
	1167	}
	1168
	1169	return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
	1170	" %s after start byte 0x%02x; need %d bytes, got %d)",
	1171	malformed_text,
	1172	_byte_dump_string(s, x - s, 0),
	1173	*(s + non_cont_byte_pos),
	1174	where,
	1175	*s,
	1176	(int) expect_len,
	1177	(int) non_cont_byte_pos);
	1178	}
	1179
	1180	/*
	1181
	1182	=for apidoc utf8n_to_uvchr
	1183
	1184	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	1185	Most code should use L</utf8_to_uvchr_buf>() rather than call this
	1186	directly.
	1187
	1188	Bottom level UTF-8 decode routine.
	1189	Returns the native code point value of the first character in the string C<s>,
	1190	which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
	1191	C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
	1192	the length, in bytes, of that character.
	1193
	1194	The value of C<flags> determines the behavior when C<s> does not point to a
	1195	well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
	1196	causes zero to be returned and C<retlen> is set so that (S<C<s> + C<retlen>>)
	1197	is the next possible position in C<s> that could begin a non-malformed
	1198	character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
	1199	is raised. Some UTF-8 input sequences may contain multiple malformations.
	1200	This function tries to find every possible one in each call, so multiple
	1201	warnings can be raised for the same sequence.
	1202
	1203	Various ALLOW flags can be set in C<flags> to allow (and not warn on)
	1204	individual types of malformations, such as the sequence being overlong (that
	1205	is, when there is a shorter sequence that can express the same code point;
	1206	overlong sequences are expressly forbidden in the UTF-8 standard due to
	1207	potential security issues). Another malformation example is the first byte of
	1208	a character not being a legal first byte. See F<utf8.h> for the list of such
	1209	flags. Even if allowed, this function generally returns the Unicode
	1210	REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
	1211	F<utf8.h> to override this behavior for the overlong malformations, but don't
	1212	do that except for very specialized purposes.
	1213
	1214	The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
	1215	flags) malformation is found. If this flag is set, the routine assumes that
	1216	the caller will raise a warning, and this function will silently just set
	1217	C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
	1218
	1219	Note that this API requires disambiguation between successful decoding a C<NUL>
	1220	character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
	1221	in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
	1222	be set to 1. To disambiguate, upon a zero return, see if the first byte of
	1223	C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
	1224	error. Or you can use C<L</utf8n_to_uvchr_error>>.
	1225
	1226	Certain code points are considered problematic. These are Unicode surrogates,
	1227	Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
	1228	By default these are considered regular code points, but certain situations
	1229	warrant special handling for them, which can be specified using the C<flags>
	1230	parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
	1231	three classes are treated as malformations and handled as such. The flags
	1232	C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
	1233	C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
	1234	disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
	1235	restricts the allowed inputs to the strict UTF-8 traditionally defined by
	1236	Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
	1237	definition given by
	1238	L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
	1239	The difference between traditional strictness and C9 strictness is that the
	1240	latter does not forbid non-character code points. (They are still discouraged,
	1241	however.) For more discussion see L<perlunicode/Noncharacter code points>.
	1242
	1243	The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
	1244	C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
	1245	C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
	1246	raised for their respective categories, but otherwise the code points are
	1247	considered valid (not malformations). To get a category to both be treated as
	1248	a malformation and raise a warning, specify both the WARN and DISALLOW flags.
	1249	(But note that warnings are not raised if lexically disabled nor if
	1250	C<UTF8_CHECK_ONLY> is also specified.)
	1251
	1252	Extremely high code points were never specified in any standard, and require an
	1253	extension to UTF-8 to express, which Perl does. It is likely that programs
	1254	written in something other than Perl would not be able to read files that
	1255	contain these; nor would Perl understand files written by something that uses a
	1256	different extension. For these reasons, there is a separate set of flags that
	1257	can warn and/or disallow these extremely high code points, even if other
	1258	above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
	1259	C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
	1260	C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
	1261	above-Unicode code points, including these, as malformations.
	1262	(Note that the Unicode standard considers anything above 0x10FFFF to be
	1263	illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
	1264	(2**31 -1))
	1265
	1266	A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
	1267	retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly,
	1268	C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
	1269	C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags
	1270	can apply to code points that actually do fit in 31 bits. This happens on
	1271	EBCDIC platforms, and sometimes when the L<overlong
	1272	malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
	1273	describe the situation in all cases.
	1274
	1275
	1276	All other code points corresponding to Unicode characters, including private
	1277	use and those yet to be assigned, are never considered malformed and never
	1278	warn.
	1279
	1280	=for apidoc Amnh||UTF8_CHECK_ONLY
	1281	=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE
	1282	=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
	1283	=for apidoc Amnh||UTF8_DISALLOW_SURROGATE
	1284	=for apidoc Amnh||UTF8_DISALLOW_NONCHAR
	1285	=for apidoc Amnh||UTF8_DISALLOW_SUPER
	1286	=for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE
	1287	=for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE
	1288	=for apidoc Amnh||UTF8_WARN_SURROGATE
	1289	=for apidoc Amnh||UTF8_WARN_NONCHAR
	1290	=for apidoc Amnh||UTF8_WARN_SUPER
	1291	=for apidoc Amnh||UTF8_WARN_PERL_EXTENDED
	1292	=for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED
	1293
	1294	=cut
	1295
	1296	Also implemented as a macro in utf8.h
	1297	*/
	1298
	1299	UV
	1300	Perl_utf8n_to_uvchr(const U8 *s,
	1301	STRLEN curlen,
	1302	STRLEN *retlen,
	1303	const U32 flags)
	1304	{
	1305	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
	1306
	1307	return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
	1308	}
	1309
	1310	/*
	1311
	1312	=for apidoc utf8n_to_uvchr_error
	1313
	1314	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	1315	Most code should use L</utf8_to_uvchr_buf>() rather than call this
	1316	directly.
	1317
	1318	This function is for code that needs to know what the precise malformation(s)
	1319	are when an error is found. If you also need to know the generated warning
	1320	messages, use L</utf8n_to_uvchr_msgs>() instead.
	1321
	1322	It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
	1323	all the others, C<errors>. If this parameter is 0, this function behaves
	1324	identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
	1325	to a C<U32> variable, which this function sets to indicate any errors found.
	1326	Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
	1327	C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
	1328	of these bits will be set if a malformation is found, even if the input
	1329	C<flags> parameter indicates that the given malformation is allowed; those
	1330	exceptions are noted:
	1331
	1332	=over 4
	1333
	1334	=item C<UTF8_GOT_PERL_EXTENDED>
	1335
	1336	The input sequence is not standard UTF-8, but a Perl extension. This bit is
	1337	set only if the input C<flags> parameter contains either the
	1338	C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
	1339
	1340	Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
	1341	and so some extension must be used to express them. Perl uses a natural
	1342	extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
	1343	extension to represent even higher ones, so that any code point that fits in a
	1344	64-bit word can be represented. Text using these extensions is not likely to
	1345	be portable to non-Perl code. We lump both of these extensions together and
	1346	refer to them as Perl extended UTF-8. There exist other extensions that people
	1347	have invented, incompatible with Perl's.
	1348
	1349	On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
	1350	extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
	1351	than on ASCII. Prior to that, code points 2**31 and higher were simply
	1352	unrepresentable, and a different, incompatible method was used to represent
	1353	code points between 2**30 and 2**31 - 1.
	1354
	1355	On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
	1356	Perl extended UTF-8 is used.
	1357
	1358	In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
	1359	may use for backward compatibility. That name is misleading, as this flag may
	1360	be set when the code point actually does fit in 31 bits. This happens on
	1361	EBCDIC platforms, and sometimes when the L<overlong
	1362	malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately
	1363	describes the situation in all cases.
	1364
	1365	=item C<UTF8_GOT_CONTINUATION>
	1366
	1367	The input sequence was malformed in that the first byte was a UTF-8
	1368	continuation byte.
	1369
	1370	=item C<UTF8_GOT_EMPTY>
	1371
	1372	The input C<curlen> parameter was 0.
	1373
	1374	=item C<UTF8_GOT_LONG>
	1375
	1376	The input sequence was malformed in that there is some other sequence that
	1377	evaluates to the same code point, but that sequence is shorter than this one.
	1378
	1379	Until Unicode 3.1, it was legal for programs to accept this malformation, but
	1380	it was discovered that this created security issues.
	1381
	1382	=item C<UTF8_GOT_NONCHAR>
	1383
	1384	The code point represented by the input UTF-8 sequence is for a Unicode
	1385	non-character code point.
	1386	This bit is set only if the input C<flags> parameter contains either the
	1387	C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
	1388
	1389	=item C<UTF8_GOT_NON_CONTINUATION>
	1390
	1391	The input sequence was malformed in that a non-continuation type byte was found
	1392	in a position where only a continuation type one should be. See also
	1393	C<L</UTF8_GOT_SHORT>>.
	1394
	1395	=item C<UTF8_GOT_OVERFLOW>
	1396
	1397	The input sequence was malformed in that it is for a code point that is not
	1398	representable in the number of bits available in an IV on the current platform.
	1399
	1400	=item C<UTF8_GOT_SHORT>
	1401
	1402	The input sequence was malformed in that C<curlen> is smaller than required for
	1403	a complete sequence. In other words, the input is for a partial character
	1404	sequence.
	1405
	1406
	1407	C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
	1408	sequence. The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always
	1409	that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
	1410	sequence was looked at. If no other flags are present, it means that the
	1411	sequence was valid as far as it went. Depending on the application, this could
	1412	mean one of three things:
	1413
	1414	=over
	1415
	1416	=item *
	1417
	1418	The C<curlen> length parameter passed in was too small, and the function was
	1419	prevented from examining all the necessary bytes.
	1420
	1421	=item *
	1422
	1423	The buffer being looked at is based on reading data, and the data received so
	1424	far stopped in the middle of a character, so that the next read will
	1425	read the remainder of this character. (It is up to the caller to deal with the
	1426	split bytes somehow.)
	1427
	1428	=item *
	1429
	1430	This is a real error, and the partial sequence is all we're going to get.
	1431
	1432	=back
	1433
	1434	=item C<UTF8_GOT_SUPER>
	1435
	1436	The input sequence was malformed in that it is for a non-Unicode code point;
	1437	that is, one above the legal Unicode maximum.
	1438	This bit is set only if the input C<flags> parameter contains either the
	1439	C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
	1440
	1441	=item C<UTF8_GOT_SURROGATE>
	1442
	1443	The input sequence was malformed in that it is for a Unicode UTF-16 surrogate
	1444	code point.
	1445	This bit is set only if the input C<flags> parameter contains either the
	1446	C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
	1447
	1448	=back
	1449
	1450	To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
	1451	flag to suppress any warnings, and then examine the C<*errors> return.
	1452
	1453	=cut
	1454
	1455	Also implemented as a macro in utf8.h
	1456	*/
	1457
	1458	UV
	1459	Perl_utf8n_to_uvchr_error(const U8 *s,
	1460	STRLEN curlen,
	1461	STRLEN *retlen,
	1462	const U32 flags,
	1463	U32 * errors)
	1464	{
	1465	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
	1466
	1467	return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
	1468	}
	1469
	1470	/*
	1471
	1472	=for apidoc utf8n_to_uvchr_msgs
	1473
	1474	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	1475	Most code should use L</utf8_to_uvchr_buf>() rather than call this
	1476	directly.
	1477
	1478	This function is for code that needs to know what the precise malformation(s)
	1479	are when an error is found, and wants the corresponding warning and/or error
	1480	messages to be returned to the caller rather than be displayed. All messages
	1481	that would have been displayed if all lexical warnings are enabled will be
	1482	returned.
	1483
	1484	It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
	1485	placed after all the others, C<msgs>. If this parameter is 0, this function
	1486	behaves identically to C<L</utf8n_to_uvchr_error>>. Otherwise, C<msgs> should
	1487	be a pointer to an C<AV *> variable, in which this function creates a new AV to
	1488	contain any appropriate messages. The elements of the array are ordered so
	1489	that the first message that would have been displayed is in the 0th element,
	1490	and so on. Each element is a hash with three key-value pairs, as follows:
	1491
	1492	=over 4
	1493
	1494	=item C<text>
	1495
	1496	The text of the message as a C<SVpv>.
	1497
	1498	=item C<warn_categories>
	1499
	1500	The warning category (or categories) packed into a C<SVuv>.
	1501
	1502	=item C<flag>
	1503
	1504	A single flag bit associated with this message, in a C<SVuv>.
	1505	The bit corresponds to some bit in the C<*errors> return value,
	1506	such as C<UTF8_GOT_LONG>.
	1507
	1508	=back
	1509
	1510	It's important to note that specifying this parameter as non-null will cause
	1511	any warnings this function would otherwise generate to be suppressed, and
	1512	instead be placed in C<*msgs>. The caller can check the lexical warnings state
	1513	(or not) when choosing what to do with the returned messages.
	1514
	1515	If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
	1516	no AV is created.
	1517
	1518	The caller, of course, is responsible for freeing any returned AV.
	1519
	1520	=cut
	1521	*/
	1522
	1523	UV
	1524	Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
	1525	STRLEN curlen,
	1526	STRLEN *retlen,
	1527	const U32 flags,
	1528	U32 * errors,
	1529	AV ** msgs)
	1530	{
	1531	const U8 * const s0 = s;
	1532	const U8 * send = s0 + curlen;
	1533	U32 possible_problems; /* A bit is set here for each potential problem
	1534	found as we go along */
	1535	UV uv;
	1536	STRLEN expectlen; /* How long should this sequence be? */
	1537	STRLEN avail_len; /* When input is too short, gives what that is */
	1538	U32 discard_errors; /* Used to save branches when 'errors' is NULL; this
	1539	gets set and discarded */
	1540
	1541	/* The below are used only if there is both an overlong malformation and a
	1542	* too short one. Otherwise the first two are set to 's0' and 'send', and
	1543	* the third not used at all */
	1544	U8 * adjusted_s0;
	1545	U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
	1546	routine; see [perl #130921] */
	1547	UV uv_so_far;
	1548	dTHX;
	1549
	1550	PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
	1551
	1552	/* Here, is one of: a) malformed; b) a problematic code point (surrogate,
	1553	* non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
	1554	* syllables that the dfa doesn't properly handle. Quickly dispose of the
	1555	* final case. */
	1556
	1557	#ifndef EBCDIC
	1558
	1559	/* Each of the affected Hanguls starts with \xED */
	1560
	1561	if (is_HANGUL_ED_utf8_safe(s0, send)) {
	1562	if (retlen) {
	1563	*retlen = 3;
	1564	}
	1565	if (errors) {
	1566	*errors = 0;
	1567	}
	1568	if (msgs) {
	1569	*msgs = NULL;
	1570	}
	1571
	1572	return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
	1573	\| ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
	1574	\| (s0[2] & UTF_CONTINUATION_MASK);
	1575	}
	1576
	1577	#endif
	1578
	1579	/* In conjunction with the exhaustive tests that can be enabled in
	1580	* APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
	1581	* what it is intended to do, and that no flaws in it are masked by
	1582	* dropping down and executing the code below
	1583	assert(! isUTF8_CHAR(s0, send)
	1584	\|\| UTF8_IS_SURROGATE(s0, send)
	1585	\|\| UTF8_IS_SUPER(s0, send)
	1586	\|\| UTF8_IS_NONCHAR(s0,send));
	1587	*/
	1588
	1589	s = s0;
	1590	uv = *s0;
	1591	possible_problems = 0;
	1592	expectlen = 0;
	1593	avail_len = 0;
	1594	discard_errors = 0;
	1595	adjusted_s0 = (U8 *) s0;
	1596	uv_so_far = 0;
	1597
	1598	if (errors) {
	1599	*errors = 0;
	1600	}
	1601	else {
	1602	errors = &discard_errors;
	1603	}
	1604
	1605	/* The order of malformation tests here is important. We should consume as
	1606	* few bytes as possible in order to not skip any valid character. This is
	1607	* required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
	1608	* https://unicode.org/reports/tr36 for more discussion as to why. For
	1609	* example, once we've done a UTF8SKIP, we can tell the expected number of
	1610	* bytes, and could fail right off the bat if the input parameters indicate
	1611	* that there are too few available. But it could be that just that first
	1612	* byte is garbled, and the intended character occupies fewer bytes. If we
	1613	* blindly assumed that the first byte is correct, and skipped based on
	1614	* that number, we could skip over a valid input character. So instead, we
	1615	* always examine the sequence byte-by-byte.
	1616	*
	1617	* We also should not consume too few bytes, otherwise someone could inject
	1618	* things. For example, an input could be deliberately designed to
	1619	* overflow, and if this code bailed out immediately upon discovering that,
	1620	* returning to the caller C<*retlen> pointing to the very next byte (one
	1621	* which is actually part of the overflowing sequence), that could look
	1622	* legitimate to the caller, which could discard the initial partial
	1623	* sequence and process the rest, inappropriately.
	1624	*
	1625	* Some possible input sequences are malformed in more than one way. This
	1626	* function goes to lengths to try to find all of them. This is necessary
	1627	* for correctness, as the inputs may allow one malformation but not
	1628	* another, and if we abandon searching for others after finding the
	1629	* allowed one, we could allow in something that shouldn't have been.
	1630	*/
	1631
	1632	if (UNLIKELY(curlen == 0)) {
	1633	possible_problems \|= UTF8_GOT_EMPTY;
	1634	curlen = 0;
	1635	uv = UNICODE_REPLACEMENT;
	1636	goto ready_to_handle_errors;
	1637	}
	1638
	1639	expectlen = UTF8SKIP(s);
	1640
	1641	/* A well-formed UTF-8 character, as the vast majority of calls to this
	1642	* function will be for, has this expected length. For efficiency, set
	1643	* things up here to return it. It will be overridden only in those rare
	1644	* cases where a malformation is found */
	1645	if (retlen) {
	1646	*retlen = expectlen;
	1647	}
	1648
	1649	/* A continuation character can't start a valid sequence */
	1650	if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
	1651	possible_problems \|= UTF8_GOT_CONTINUATION;
	1652	curlen = 1;
	1653	uv = UNICODE_REPLACEMENT;
	1654	goto ready_to_handle_errors;
	1655	}
	1656
	1657	/* Here is not a continuation byte, nor an invariant. The only thing left
	1658	* is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
	1659	* because it excludes start bytes like \xC0 that always lead to
	1660	* overlongs.) */
	1661
	1662	/* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
	1663	* that indicate the number of bytes in the character's whole UTF-8
	1664	* sequence, leaving just the bits that are part of the value. */
	1665	uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
	1666
	1667	/* Setup the loop end point, making sure to not look past the end of the
	1668	* input string, and flag it as too short if the size isn't big enough. */
	1669	if (UNLIKELY(curlen < expectlen)) {
	1670	possible_problems \|= UTF8_GOT_SHORT;
	1671	avail_len = curlen;
	1672	}
	1673	else {
	1674	send = (U8*) s0 + expectlen;
	1675	}
	1676
	1677	/* Now, loop through the remaining bytes in the character's sequence,
	1678	* accumulating each into the working value as we go. */
	1679	for (s = s0 + 1; s < send; s++) {
	1680	if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
	1681	uv = UTF8_ACCUMULATE(uv, *s);
	1682	continue;
	1683	}
	1684
	1685	/* Here, found a non-continuation before processing all expected bytes.
	1686	* This byte indicates the beginning of a new character, so quit, even
	1687	* if allowing this malformation. */
	1688	possible_problems \|= UTF8_GOT_NON_CONTINUATION;
	1689	break;
	1690	} /* End of loop through the character's bytes */
	1691
	1692	/* Save how many bytes were actually in the character */
	1693	curlen = s - s0;
	1694
	1695	/* Note that there are two types of too-short malformation. One is when
	1696	* there is actual wrong data before the normal termination of the
	1697	* sequence. The other is that the sequence wasn't complete before the end
	1698	* of the data we are allowed to look at, based on the input 'curlen'.
	1699	* This means that we were passed data for a partial character, but it is
	1700	* valid as far as we saw. The first type is definitely invalid. This
	1701	* distinction could be important to a caller, so the two types are kept
	1702	* separate.
	1703	*
	1704	* A convenience macro that matches either of the too-short conditions. */
	1705	# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT\|UTF8_GOT_NON_CONTINUATION)
	1706
	1707	if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
	1708	uv_so_far = uv;
	1709	uv = UNICODE_REPLACEMENT;
	1710	}
	1711
	1712	/* Check for overflow. The algorithm requires us to not look past the end
	1713	* of the current character, even if partial, so the upper limit is 's' */
	1714	if (UNLIKELY(0 < does_utf8_overflow(s0, s,
	1715	1 /* Do consider overlongs */
	1716	)))
	1717	{
	1718	possible_problems \|= UTF8_GOT_OVERFLOW;
	1719	uv = UNICODE_REPLACEMENT;
	1720	}
	1721
	1722	/* Check for overlong. If no problems so far, 'uv' is the correct code
	1723	* point value. Simply see if it is expressible in fewer bytes. Otherwise
	1724	* we must look at the UTF-8 byte sequence itself to see if it is for an
	1725	* overlong */
	1726	if ( ( LIKELY(! possible_problems)
	1727	&& UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
	1728	\|\| ( UNLIKELY(possible_problems)
	1729	&& ( UNLIKELY(! UTF8_IS_START(*s0))
	1730	\|\| ( curlen > 1
	1731	&& UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0,
	1732	s - s0))))))
	1733	{
	1734	possible_problems \|= UTF8_GOT_LONG;
	1735
	1736	if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
	1737
	1738	/* The calculation in the 'true' branch of this 'if'
	1739	* below won't work if overflows, and isn't needed
	1740	* anyway. Further below we handle all overflow
	1741	* cases */
	1742	&& LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
	1743	{
	1744	UV min_uv = uv_so_far;
	1745	STRLEN i;
	1746
	1747	/* Here, the input is both overlong and is missing some trailing
	1748	* bytes. There is no single code point it could be for, but there
	1749	* may be enough information present to determine if what we have
	1750	* so far is for an unallowed code point, such as for a surrogate.
	1751	* The code further below has the intelligence to determine this,
	1752	* but just for non-overlong UTF-8 sequences. What we do here is
	1753	* calculate the smallest code point the input could represent if
	1754	* there were no too short malformation. Then we compute and save
	1755	* the UTF-8 for that, which is what the code below looks at
	1756	* instead of the raw input. It turns out that the smallest such
	1757	* code point is all we need. */
	1758	for (i = curlen; i < expectlen; i++) {
	1759	min_uv = UTF8_ACCUMULATE(min_uv,
	1760	I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
	1761	}
	1762
	1763	adjusted_s0 = temp_char_buf;
	1764	(void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
	1765	}
	1766	}
	1767
	1768	/* Here, we have found all the possible problems, except for when the input
	1769	* is for a problematic code point not allowed by the input parameters. */
	1770
	1771	/* uv is valid for overlongs */
	1772	if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
	1773
	1774	/* isn't problematic if < this */
	1775	&& uv >= UNICODE_SURROGATE_FIRST)
	1776	\|\| ( UNLIKELY(possible_problems)
	1777
	1778	/* if overflow, we know without looking further
	1779	* precisely which of the problematic types it is,
	1780	* and we deal with those in the overflow handling
	1781	* code */
	1782	&& LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
	1783	&& ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
	1784	\|\| UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
	1785	&& ((flags & ( UTF8_DISALLOW_NONCHAR
	1786	\|UTF8_DISALLOW_SURROGATE
	1787	\|UTF8_DISALLOW_SUPER
	1788	\|UTF8_DISALLOW_PERL_EXTENDED
	1789	\|UTF8_WARN_NONCHAR
	1790	\|UTF8_WARN_SURROGATE
	1791	\|UTF8_WARN_SUPER
	1792	\|UTF8_WARN_PERL_EXTENDED))))
	1793	{
	1794	/* If there were no malformations, or the only malformation is an
	1795	* overlong, 'uv' is valid */
	1796	if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
	1797	if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
	1798	possible_problems \|= UTF8_GOT_SURROGATE;
	1799	}
	1800	else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
	1801	possible_problems \|= UTF8_GOT_SUPER;
	1802	}
	1803	else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
	1804	possible_problems \|= UTF8_GOT_NONCHAR;
	1805	}
	1806	}
	1807	else { /* Otherwise, need to look at the source UTF-8, possibly
	1808	adjusted to be non-overlong */
	1809
	1810	if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
	1811	>= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
	1812	{
	1813	possible_problems \|= UTF8_GOT_SUPER;
	1814	}
	1815	else if (curlen > 1) {
	1816	if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
	1817	NATIVE_UTF8_TO_I8(*adjusted_s0),
	1818	NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
	1819	{
	1820	possible_problems \|= UTF8_GOT_SUPER;
	1821	}
	1822	else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
	1823	NATIVE_UTF8_TO_I8(*adjusted_s0),
	1824	NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
	1825	{
	1826	possible_problems \|= UTF8_GOT_SURROGATE;
	1827	}
	1828	}
	1829
	1830	/* We need a complete well-formed UTF-8 character to discern
	1831	* non-characters, so can't look for them here */
	1832	}
	1833	}
	1834
	1835	ready_to_handle_errors:
	1836
	1837	/* At this point:
	1838	* curlen contains the number of bytes in the sequence that
	1839	* this call should advance the input by.
	1840	* avail_len gives the available number of bytes passed in, but
	1841	* only if this is less than the expected number of
	1842	* bytes, based on the code point's start byte.
	1843	* possible_problems is 0 if there weren't any problems; otherwise a bit
	1844	* is set in it for each potential problem found.
	1845	* uv contains the code point the input sequence
	1846	* represents; or if there is a problem that prevents
	1847	* a well-defined value from being computed, it is
	1848	* some substitute value, typically the REPLACEMENT
	1849	* CHARACTER.
	1850	* s0 points to the first byte of the character
	1851	* s points to just after where we left off processing
	1852	* the character
	1853	* send points to just after where that character should
	1854	* end, based on how many bytes the start byte tells
	1855	* us should be in it, but no further than s0 +
	1856	* avail_len
	1857	*/
	1858
	1859	if (UNLIKELY(possible_problems)) {
	1860	bool disallowed = FALSE;
	1861	const U32 orig_problems = possible_problems;
	1862
	1863	if (msgs) {
	1864	*msgs = NULL;
	1865	}
	1866
	1867	while (possible_problems) { /* Handle each possible problem */
	1868	U32 pack_warn = 0;
	1869	char * message = NULL;
	1870	U32 this_flag_bit = 0;
	1871
	1872	/* Each 'if' clause handles one problem. They are ordered so that
	1873	* the first ones' messages will be displayed before the later
	1874	* ones; this is kinda in decreasing severity order. But the
	1875	* overlong must come last, as it changes 'uv' looked at by the
	1876	* others */
	1877	if (possible_problems & UTF8_GOT_OVERFLOW) {
	1878
	1879	/* Overflow means also got a super and are using Perl's
	1880	* extended UTF-8, but we handle all three cases here */
	1881	possible_problems
	1882	&= ~(UTF8_GOT_OVERFLOW\|UTF8_GOT_SUPER\|UTF8_GOT_PERL_EXTENDED);
	1883	*errors \|= UTF8_GOT_OVERFLOW;
	1884
	1885	/* But the API says we flag all errors found */
	1886	if (flags & (UTF8_WARN_SUPER\|UTF8_DISALLOW_SUPER)) {
	1887	*errors \|= UTF8_GOT_SUPER;
	1888	}
	1889	if (flags
	1890	& (UTF8_WARN_PERL_EXTENDED\|UTF8_DISALLOW_PERL_EXTENDED))
	1891	{
	1892	*errors \|= UTF8_GOT_PERL_EXTENDED;
	1893	}
	1894
	1895	/* Disallow if any of the three categories say to */
	1896	if ( ! (flags & UTF8_ALLOW_OVERFLOW)
	1897	\|\| (flags & ( UTF8_DISALLOW_SUPER
	1898	\|UTF8_DISALLOW_PERL_EXTENDED)))
	1899	{
	1900	disallowed = TRUE;
	1901	}
	1902
	1903	/* Likewise, warn if any say to */
	1904	if ( ! (flags & UTF8_ALLOW_OVERFLOW)
	1905	\|\| (flags & (UTF8_WARN_SUPER\|UTF8_WARN_PERL_EXTENDED)))
	1906	{
	1907
	1908	/* The warnings code explicitly says it doesn't handle the
	1909	* case of packWARN2 and two categories which have
	1910	* parent-child relationship. Even if it works now to
	1911	* raise the warning if either is enabled, it wouldn't
	1912	* necessarily do so in the future. We output (only) the
	1913	* most dire warning */
	1914	if (! (flags & UTF8_CHECK_ONLY)) {
	1915	if (msgs \|\| ckWARN_d(WARN_UTF8)) {
	1916	pack_warn = packWARN(WARN_UTF8);
	1917	}
	1918	else if (msgs \|\| ckWARN_d(WARN_NON_UNICODE)) {
	1919	pack_warn = packWARN(WARN_NON_UNICODE);
	1920	}
	1921	if (pack_warn) {
	1922	message = Perl_form(aTHX_ "%s: %s (overflows)",
	1923	malformed_text,
	1924	_byte_dump_string(s0, curlen, 0));
	1925	this_flag_bit = UTF8_GOT_OVERFLOW;
	1926	}
	1927	}
	1928	}
	1929	}
	1930	else if (possible_problems & UTF8_GOT_EMPTY) {
	1931	possible_problems &= ~UTF8_GOT_EMPTY;
	1932	*errors \|= UTF8_GOT_EMPTY;
	1933
	1934	if (! (flags & UTF8_ALLOW_EMPTY)) {
	1935
	1936	/* This so-called malformation is now treated as a bug in
	1937	* the caller. If you have nothing to decode, skip calling
	1938	* this function */
	1939	assert(0);
	1940
	1941	disallowed = TRUE;
	1942	if ( (msgs
	1943	\|\| ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
	1944	{
	1945	pack_warn = packWARN(WARN_UTF8);
	1946	message = Perl_form(aTHX_ "%s (empty string)",
	1947	malformed_text);
	1948	this_flag_bit = UTF8_GOT_EMPTY;
	1949	}
	1950	}
	1951	}
	1952	else if (possible_problems & UTF8_GOT_CONTINUATION) {
	1953	possible_problems &= ~UTF8_GOT_CONTINUATION;
	1954	*errors \|= UTF8_GOT_CONTINUATION;
	1955
	1956	if (! (flags & UTF8_ALLOW_CONTINUATION)) {
	1957	disallowed = TRUE;
	1958	if (( msgs
	1959	\|\| ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
	1960	{
	1961	pack_warn = packWARN(WARN_UTF8);
	1962	message = Perl_form(aTHX_
	1963	"%s: %s (unexpected continuation byte 0x%02x,"
	1964	" with no preceding start byte)",
	1965	malformed_text,
	1966	_byte_dump_string(s0, 1, 0), *s0);
	1967	this_flag_bit = UTF8_GOT_CONTINUATION;
	1968	}
	1969	}
	1970	}
	1971	else if (possible_problems & UTF8_GOT_SHORT) {
	1972	possible_problems &= ~UTF8_GOT_SHORT;
	1973	*errors \|= UTF8_GOT_SHORT;
	1974
	1975	if (! (flags & UTF8_ALLOW_SHORT)) {
	1976	disallowed = TRUE;
	1977	if (( msgs
	1978	\|\| ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
	1979	{
	1980	pack_warn = packWARN(WARN_UTF8);
	1981	message = Perl_form(aTHX_
	1982	"%s: %s (too short; %d byte%s available, need %d)",
	1983	malformed_text,
	1984	_byte_dump_string(s0, send - s0, 0),
	1985	(int)avail_len,
	1986	avail_len == 1 ? "" : "s",
	1987	(int)expectlen);
	1988	this_flag_bit = UTF8_GOT_SHORT;
	1989	}
	1990	}
	1991
	1992	}
	1993	else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
	1994	possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
	1995	*errors \|= UTF8_GOT_NON_CONTINUATION;
	1996
	1997	if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
	1998	disallowed = TRUE;
	1999	if (( msgs
	2000	\|\| ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
	2001	{
	2002
	2003	/* If we don't know for sure that the input length is
	2004	* valid, avoid as much as possible reading past the
	2005	* end of the buffer */
	2006	int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
	2007	? (int) (s - s0)
	2008	: (int) (send - s0);
	2009	pack_warn = packWARN(WARN_UTF8);
	2010	message = Perl_form(aTHX_ "%s",
	2011	unexpected_non_continuation_text(s0,
	2012	printlen,
	2013	s - s0,
	2014	(int) expectlen));
	2015	this_flag_bit = UTF8_GOT_NON_CONTINUATION;
	2016	}
	2017	}
	2018	}
	2019	else if (possible_problems & UTF8_GOT_SURROGATE) {
	2020	possible_problems &= ~UTF8_GOT_SURROGATE;
	2021
	2022	if (flags & UTF8_WARN_SURROGATE) {
	2023	*errors \|= UTF8_GOT_SURROGATE;
	2024
	2025	if ( ! (flags & UTF8_CHECK_ONLY)
	2026	&& (msgs \|\| ckWARN_d(WARN_SURROGATE)))
	2027	{
	2028	pack_warn = packWARN(WARN_SURROGATE);
	2029
	2030	/* These are the only errors that can occur with a
	2031	* surrogate when the 'uv' isn't valid */
	2032	if (orig_problems & UTF8_GOT_TOO_SHORT) {
	2033	message = Perl_form(aTHX_
	2034	"UTF-16 surrogate (any UTF-8 sequence that"
	2035	" starts with \"%s\" is for a surrogate)",
	2036	_byte_dump_string(s0, curlen, 0));
	2037	}
	2038	else {
	2039	message = Perl_form(aTHX_ surrogate_cp_format, uv);
	2040	}
	2041	this_flag_bit = UTF8_GOT_SURROGATE;
	2042	}
	2043	}
	2044
	2045	if (flags & UTF8_DISALLOW_SURROGATE) {
	2046	disallowed = TRUE;
	2047	*errors \|= UTF8_GOT_SURROGATE;
	2048	}
	2049	}
	2050	else if (possible_problems & UTF8_GOT_SUPER) {
	2051	possible_problems &= ~UTF8_GOT_SUPER;
	2052
	2053	if (flags & UTF8_WARN_SUPER) {
	2054	*errors \|= UTF8_GOT_SUPER;
	2055
	2056	if ( ! (flags & UTF8_CHECK_ONLY)
	2057	&& (msgs \|\| ckWARN_d(WARN_NON_UNICODE)))
	2058	{
	2059	pack_warn = packWARN(WARN_NON_UNICODE);
	2060
	2061	if (orig_problems & UTF8_GOT_TOO_SHORT) {
	2062	message = Perl_form(aTHX_
	2063	"Any UTF-8 sequence that starts with"
	2064	" \"%s\" is for a non-Unicode code point,"
	2065	" may not be portable",
	2066	_byte_dump_string(s0, curlen, 0));
	2067	}
	2068	else {
	2069	message = Perl_form(aTHX_ super_cp_format, uv);
	2070	}
	2071	this_flag_bit = UTF8_GOT_SUPER;
	2072	}
	2073	}
	2074
	2075	/* Test for Perl's extended UTF-8 after the regular SUPER ones,
	2076	* and before possibly bailing out, so that the more dire
	2077	* warning will override the regular one. */
	2078	if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
	2079	if ( ! (flags & UTF8_CHECK_ONLY)
	2080	&& (flags & (UTF8_WARN_PERL_EXTENDED\|UTF8_WARN_SUPER))
	2081	&& (msgs \|\| ( ckWARN_d(WARN_NON_UNICODE)
	2082	\|\| ckWARN(WARN_PORTABLE))))
	2083	{
	2084	pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
	2085
	2086	/* If it is an overlong that evaluates to a code point
	2087	* that doesn't have to use the Perl extended UTF-8, it
	2088	* still used it, and so we output a message that
	2089	* doesn't refer to the code point. The same is true
	2090	* if there was a SHORT malformation where the code
	2091	* point is not valid. In that case, 'uv' will have
	2092	* been set to the REPLACEMENT CHAR, and the message
	2093	* below without the code point in it will be selected
	2094	* */
	2095	if (UNICODE_IS_PERL_EXTENDED(uv)) {
	2096	message = Perl_form(aTHX_
	2097	PL_extended_cp_format, uv);
	2098	}
	2099	else {
	2100	message = Perl_form(aTHX_
	2101	"Any UTF-8 sequence that starts with"
	2102	" \"%s\" is a Perl extension, and"
	2103	" so is not portable",
	2104	_byte_dump_string(s0, curlen, 0));
	2105	}
	2106	this_flag_bit = UTF8_GOT_PERL_EXTENDED;
	2107	}
	2108
	2109	if (flags & ( UTF8_WARN_PERL_EXTENDED
	2110	\|UTF8_DISALLOW_PERL_EXTENDED))
	2111	{
	2112	*errors \|= UTF8_GOT_PERL_EXTENDED;
	2113
	2114	if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
	2115	disallowed = TRUE;
	2116	}
	2117	}
	2118	}
	2119
	2120	if (flags & UTF8_DISALLOW_SUPER) {
	2121	*errors \|= UTF8_GOT_SUPER;
	2122	disallowed = TRUE;
	2123	}
	2124	}
	2125	else if (possible_problems & UTF8_GOT_NONCHAR) {
	2126	possible_problems &= ~UTF8_GOT_NONCHAR;
	2127
	2128	if (flags & UTF8_WARN_NONCHAR) {
	2129	*errors \|= UTF8_GOT_NONCHAR;
	2130
	2131	if ( ! (flags & UTF8_CHECK_ONLY)
	2132	&& (msgs \|\| ckWARN_d(WARN_NONCHAR)))
	2133	{
	2134	/* The code above should have guaranteed that we don't
	2135	* get here with errors other than overlong */
	2136	assert (! (orig_problems
	2137	& ~(UTF8_GOT_LONG\|UTF8_GOT_NONCHAR)));
	2138
	2139	pack_warn = packWARN(WARN_NONCHAR);
	2140	message = Perl_form(aTHX_ nonchar_cp_format, uv);
	2141	this_flag_bit = UTF8_GOT_NONCHAR;
	2142	}
	2143	}
	2144
	2145	if (flags & UTF8_DISALLOW_NONCHAR) {
	2146	disallowed = TRUE;
	2147	*errors \|= UTF8_GOT_NONCHAR;
	2148	}
	2149	}
	2150	else if (possible_problems & UTF8_GOT_LONG) {
	2151	possible_problems &= ~UTF8_GOT_LONG;
	2152	*errors \|= UTF8_GOT_LONG;
	2153
	2154	if (flags & UTF8_ALLOW_LONG) {
	2155
	2156	/* We don't allow the actual overlong value, unless the
	2157	* special extra bit is also set */
	2158	if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
	2159	& ~UTF8_ALLOW_LONG)))
	2160	{
	2161	uv = UNICODE_REPLACEMENT;
	2162	}
	2163	}
	2164	else {
	2165	disallowed = TRUE;
	2166
	2167	if (( msgs
	2168	\|\| ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
	2169	{
	2170	pack_warn = packWARN(WARN_UTF8);
	2171
	2172	/* These error types cause 'uv' to be something that
	2173	* isn't what was intended, so can't use it in the
	2174	* message. The other error types either can't
	2175	* generate an overlong, or else the 'uv' is valid */
	2176	if (orig_problems &
	2177	(UTF8_GOT_TOO_SHORT\|UTF8_GOT_OVERFLOW))
	2178	{
	2179	message = Perl_form(aTHX_
	2180	"%s: %s (any UTF-8 sequence that starts"
	2181	" with \"%s\" is overlong which can and"
	2182	" should be represented with a"
	2183	" different, shorter sequence)",
	2184	malformed_text,
	2185	_byte_dump_string(s0, send - s0, 0),
	2186	_byte_dump_string(s0, curlen, 0));
	2187	}
	2188	else {
	2189	U8 tmpbuf[UTF8_MAXBYTES+1];
	2190	const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
	2191	uv, 0);
	2192	/* Don't use U+ for non-Unicode code points, which
	2193	* includes those in the Latin1 range */
	2194	const char * preface = ( uv > PERL_UNICODE_MAX
	2195	#ifdef EBCDIC
	2196	\|\| uv <= 0xFF
	2197	#endif
	2198	)
	2199	? "0x"
	2200	: "U+";
	2201	message = Perl_form(aTHX_
	2202	"%s: %s (overlong; instead use %s to represent"
	2203	" %s%0*" UVXf ")",
	2204	malformed_text,
	2205	_byte_dump_string(s0, send - s0, 0),
	2206	_byte_dump_string(tmpbuf, e - tmpbuf, 0),
	2207	preface,
	2208	((uv < 256) ? 2 : 4), /* Field width of 2 for
	2209	small code points */
	2210	UNI_TO_NATIVE(uv));
	2211	}
	2212	this_flag_bit = UTF8_GOT_LONG;
	2213	}
	2214	}
	2215	} /* End of looking through the possible flags */
	2216
	2217	/* Display the message (if any) for the problem being handled in
	2218	* this iteration of the loop */
	2219	if (message) {
	2220	if (msgs) {
	2221	assert(this_flag_bit);
	2222
	2223	if (*msgs == NULL) {
	2224	*msgs = newAV();
	2225	}
	2226
	2227	av_push(msgs, newRV_noinc((SV) new_msg_hv(message,
	2228	pack_warn,
	2229	this_flag_bit)));
	2230	}
	2231	else if (PL_op)
	2232	Perl_warner(aTHX_ pack_warn, "%s in %s", message,
	2233	OP_DESC(PL_op));
	2234	else
	2235	Perl_warner(aTHX_ pack_warn, "%s", message);
	2236	}
	2237	} /* End of 'while (possible_problems)' */
	2238
	2239	/* Since there was a possible problem, the returned length may need to
	2240	* be changed from the one stored at the beginning of this function.
	2241	* Instead of trying to figure out if that's needed, just do it. */
	2242	if (retlen) {
	2243	*retlen = curlen;
	2244	}
	2245
	2246	if (disallowed) {
	2247	if (flags & UTF8_CHECK_ONLY && retlen) {
	2248	*retlen = ((STRLEN) -1);
	2249	}
	2250	return 0;
	2251	}
	2252	}
	2253
	2254	return UNI_TO_NATIVE(uv);
	2255	}
	2256
	2257	/*
	2258	=for apidoc utf8_to_uvchr_buf
	2259
	2260	Returns the native code point of the first character in the string C<s> which
	2261	is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
	2262	C<*retlen> will be set to the length, in bytes, of that character.
	2263
	2264	If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
	2265	enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
	2266	C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
	2267	(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
	2268	C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
	2269	the next possible position in C<s> that could begin a non-malformed character.
	2270	See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
	2271	returned.
	2272
	2273	=cut
	2274
	2275	Also implemented as a macro in utf8.h
	2276
	2277	*/
	2278
	2279
	2280	UV
	2281	Perl_utf8_to_uvchr_buf(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
	2282	{
	2283	PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
	2284
	2285	return utf8_to_uvchr_buf_helper(s, send, retlen);
	2286	}
	2287
	2288	/* This is marked as deprecated
	2289	*
	2290	=for apidoc utf8_to_uvuni_buf
	2291
	2292	Only in very rare circumstances should code need to be dealing in Unicode
	2293	(as opposed to native) code points. In those few cases, use
	2294	C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))\|perlapi/utf8_to_uvchr_buf>> instead.
	2295	If you are not absolutely sure this is one of those cases, then assume it isn't
	2296	and use plain C<utf8_to_uvchr_buf> instead.
	2297
	2298	Returns the Unicode (not-native) code point of the first character in the
	2299	string C<s> which
	2300	is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
	2301	C<*retlen> will be set to the length, in bytes, of that character.
	2302
	2303	If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
	2304	enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
	2305	NULL) to -1. If those warnings are off, the computed value if well-defined (or
	2306	the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
	2307	is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
	2308	next possible position in C<s> that could begin a non-malformed character.
	2309	See L<perlapi/utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
	2310	returned.
	2311
	2312	=cut
	2313	*/
	2314
	2315	UV
	2316	Perl_utf8_to_uvuni_buf(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
	2317	{
	2318	PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
	2319
	2320	assert(send > s);
	2321
	2322	return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
	2323	}
	2324
	2325	/*
	2326	=for apidoc utf8_length
	2327
	2328	Returns the number of characters in the sequence of UTF-8-encoded bytes starting
	2329	at C<s> and ending at the byte just before C<e>. If C<s> and C<e> point to the
	2330	same place, it returns 0 with no warning raised.
	2331
	2332	If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
	2333	and returns the number of valid characters.
	2334
	2335	=cut
	2336	*/
	2337
	2338	STRLEN
	2339	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	2340	{
	2341	STRLEN len = 0;
	2342
	2343	PERL_ARGS_ASSERT_UTF8_LENGTH;
	2344
	2345	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	2346	* the bitops (especially ~) can create illegal UTF-8.
	2347	* In other words: in Perl UTF-8 is not just for Unicode. */
	2348
	2349	if (UNLIKELY(e < s))
	2350	goto warn_and_return;
	2351	while (s < e) {
	2352	s += UTF8SKIP(s);
	2353	len++;
	2354	}
	2355
	2356	if (UNLIKELY(e != s)) {
	2357	len--;
	2358	warn_and_return:
	2359	if (PL_op)
	2360	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	2361	"%s in %s", unees, OP_DESC(PL_op));
	2362	else
	2363	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	2364	}
	2365
	2366	return len;
	2367	}
	2368
	2369	/*
	2370	=for apidoc bytes_cmp_utf8
	2371
	2372	Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
	2373	sequence of characters (stored as UTF-8)
	2374	in C<u>, C<ulen>. Returns 0 if they are
	2375	equal, -1 or -2 if the first string is less than the second string, +1 or +2
	2376	if the first string is greater than the second string.
	2377
	2378	-1 or +1 is returned if the shorter string was identical to the start of the
	2379	longer string. -2 or +2 is returned if
	2380	there was a difference between characters
	2381	within the strings.
	2382
	2383	=cut
	2384	*/
	2385
	2386	int
	2387	Perl_bytes_cmp_utf8(pTHX_ const U8 b, STRLEN blen, const U8 u, STRLEN ulen)
	2388	{
	2389	const U8 *const bend = b + blen;
	2390	const U8 *const uend = u + ulen;
	2391
	2392	PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
	2393
	2394	while (b < bend && u < uend) {
	2395	U8 c = *u++;
	2396	if (!UTF8_IS_INVARIANT(c)) {
	2397	if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	2398	if (u < uend) {
	2399	U8 c1 = *u++;
	2400	if (UTF8_IS_CONTINUATION(c1)) {
	2401	c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
	2402	} else {
	2403	/* diag_listed_as: Malformed UTF-8 character%s */
	2404	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	2405	"%s %s%s",
	2406	unexpected_non_continuation_text(u - 2, 2, 1, 2),
	2407	PL_op ? " in " : "",
	2408	PL_op ? OP_DESC(PL_op) : "");
	2409	return -2;
	2410	}
	2411	} else {
	2412	if (PL_op)
	2413	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	2414	"%s in %s", unees, OP_DESC(PL_op));
	2415	else
	2416	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	2417	return -2; /* Really want to return undef :-) */
	2418	}
	2419	} else {
	2420	return -2;
	2421	}
	2422	}
	2423	if (*b != c) {
	2424	return *b < c ? -2 : +2;
	2425	}
	2426	++b;
	2427	}
	2428
	2429	if (b == bend && u == uend)
	2430	return 0;
	2431
	2432	return b < bend ? +1 : -1;
	2433	}
	2434
	2435	/*
	2436	=for apidoc utf8_to_bytes
	2437
	2438	Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
	2439	Unlike L</bytes_to_utf8>, this over-writes the original string, and
	2440	updates C<*lenp> to contain the new length.
	2441	Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
	2442
	2443	Upon successful return, the number of variants in the string can be computed by
	2444	having saved the value of C<*lenp> before the call, and subtracting the
	2445	after-call value of C<*lenp> from it.
	2446
	2447	If you need a copy of the string, see L</bytes_from_utf8>.
	2448
	2449	=cut
	2450	*/
	2451
	2452	U8 *
	2453	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN lenp)
	2454	{
	2455	U8 * first_variant;
	2456
	2457	PERL_ARGS_ASSERT_UTF8_TO_BYTES;
	2458	PERL_UNUSED_CONTEXT;
	2459
	2460	/* This is a no-op if no variants at all in the input */
	2461	if (is_utf8_invariant_string_loc(s, lenp, (const U8 *) &first_variant)) {
	2462	return s;
	2463	}
	2464
	2465	{
	2466	U8 * const save = s;
	2467	U8 * const send = s + *lenp;
	2468	U8 * d;
	2469
	2470	/* Nothing before the first variant needs to be changed, so start the real
	2471	* work there */
	2472	s = first_variant;
	2473	while (s < send) {
	2474	if (! UTF8_IS_INVARIANT(*s)) {
	2475	if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
	2476	*lenp = ((STRLEN) -1);
	2477	return 0;
	2478	}
	2479	s++;
	2480	}
	2481	s++;
	2482	}
	2483
	2484	/* Is downgradable, so do it */
	2485	d = s = first_variant;
	2486	while (s < send) {
	2487	U8 c = *s++;
	2488	if (! UVCHR_IS_INVARIANT(c)) {
	2489	/* Then it is two-byte encoded */
	2490	c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
	2491	s++;
	2492	}
	2493	*d++ = c;
	2494	}
	2495	*d = '\0';
	2496	*lenp = d - save;
	2497
	2498	return save;
	2499	}
	2500	}
	2501
	2502	/*
	2503	=for apidoc bytes_from_utf8
	2504
	2505	Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
	2506	byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
	2507	actually encoded in UTF-8.
	2508
	2509	Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
	2510	the input string.
	2511
	2512	Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
	2513	not expressible in native byte encoding. In these cases, C<*is_utf8p> and
	2514	C<*lenp> are unchanged, and the return value is the original C<s>.
	2515
	2516	Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
	2517	newly created string containing a downgraded copy of C<s>, and whose length is
	2518	returned in C<*lenp>, updated. The new string is C<NUL>-terminated. The
	2519	caller is responsible for arranging for the memory used by this string to get
	2520	freed.
	2521
	2522	Upon successful return, the number of variants in the string can be computed by
	2523	having saved the value of C<*lenp> before the call, and subtracting the
	2524	after-call value of C<*lenp> from it.
	2525
	2526	=cut
	2527
	2528	There is a macro that avoids this function call, but this is retained for
	2529	anyone who calls it with the Perl_ prefix */
	2530
	2531	U8 *
	2532	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN lenp, bool *is_utf8p)
	2533	{
	2534	PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
	2535	PERL_UNUSED_CONTEXT;
	2536
	2537	return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
	2538	}
	2539
	2540	/*
	2541	=for apidoc bytes_from_utf8_loc
	2542
	2543	Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer
	2544	to where to store the location of the first character in C<"s"> that cannot be
	2545	converted to non-UTF8.
	2546
	2547	If that parameter is C<NULL>, this function behaves identically to
	2548	C<bytes_from_utf8>.
	2549
	2550	Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
	2551	C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
	2552
	2553	Otherwise, the function returns a newly created C<NUL>-terminated string
	2554	containing the non-UTF8 equivalent of the convertible first portion of
	2555	C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>.
	2556	If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
	2557	and C<*first_non_downgradable> is set to C<NULL>.
	2558
	2559	Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
	2560	first character in the original string that wasn't converted. C<*is_utf8p> is
	2561	unchanged. Note that the new string may have length 0.
	2562
	2563	Another way to look at it is, if C<first_non_downgradable> is non-C<NULL> and
	2564	C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
	2565	converts as many characters in it as possible stopping at the first one it
	2566	finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is
	2567	set to point to that. The function returns the portion that could be converted
	2568	in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
	2569	not including the terminating C<NUL>. If the very first character in the
	2570	original could not be converted, C<*lenp> will be 0, and the new string will
	2571	contain just a single C<NUL>. If the entire input string was converted,
	2572	C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
	2573
	2574	Upon successful return, the number of variants in the converted portion of the
	2575	string can be computed by having saved the value of C<*lenp> before the call,
	2576	and subtracting the after-call value of C<*lenp> from it.
	2577
	2578	=cut
	2579
	2580
	2581	*/
	2582
	2583	U8 *
	2584	Perl_bytes_from_utf8_loc(const U8 s, STRLEN lenp, bool is_utf8p, const U8* first_unconverted)
	2585	{
	2586	U8 *d;
	2587	const U8 *original = s;
	2588	U8 *converted_start;
	2589	const U8 send = s + lenp;
	2590
	2591	PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
	2592
	2593	if (! *is_utf8p) {
	2594	if (first_unconverted) {
	2595	*first_unconverted = NULL;
	2596	}
	2597
	2598	return (U8 *) original;
	2599	}
	2600
	2601	Newx(d, (*lenp) + 1, U8);
	2602
	2603	converted_start = d;
	2604	while (s < send) {
	2605	U8 c = *s++;
	2606	if (! UTF8_IS_INVARIANT(c)) {
	2607
	2608	/* Then it is multi-byte encoded. If the code point is above 0xFF,
	2609	* have to stop now */
	2610	if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
	2611	if (first_unconverted) {
	2612	*first_unconverted = s - 1;
	2613	goto finish_and_return;
	2614	}
	2615	else {
	2616	Safefree(converted_start);
	2617	return (U8 *) original;
	2618	}
	2619	}
	2620
	2621	c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
	2622	s++;
	2623	}
	2624	*d++ = c;
	2625	}
	2626
	2627	/* Here, converted the whole of the input */
	2628	*is_utf8p = FALSE;
	2629	if (first_unconverted) {
	2630	*first_unconverted = NULL;
	2631	}
	2632
	2633	finish_and_return:
	2634	*d = '\0';
	2635	*lenp = d - converted_start;
	2636
	2637	/* Trim unused space */
	2638	Renew(converted_start, *lenp + 1, U8);
	2639
	2640	return converted_start;
	2641	}
	2642
	2643	/*
	2644	=for apidoc bytes_to_utf8
	2645
	2646	Converts a string C<s> of length C<*lenp> bytes from the native encoding into
	2647	UTF-8.
	2648	Returns a pointer to the newly-created string, and sets C<*lenp> to
	2649	reflect the new length in bytes. The caller is responsible for arranging for
	2650	the memory used by this string to get freed.
	2651
	2652	Upon successful return, the number of variants in the string can be computed by
	2653	having saved the value of C<*lenp> before the call, and subtracting it from the
	2654	after-call value of C<*lenp>.
	2655
	2656	A C<NUL> character will be written after the end of the string.
	2657
	2658	If you want to convert to UTF-8 from encodings other than
	2659	the native (Latin1 or EBCDIC),
	2660	see L</sv_recode_to_utf8>().
	2661
	2662	=cut
	2663	*/
	2664
	2665	U8*
	2666	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN lenp)
	2667	{
	2668	const U8 * const send = s + (*lenp);
	2669	U8 *d;
	2670	U8 *dst;
	2671
	2672	PERL_ARGS_ASSERT_BYTES_TO_UTF8;
	2673	PERL_UNUSED_CONTEXT;
	2674
	2675	/* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
	2676	Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
	2677	dst = d;
	2678
	2679	while (s < send) {
	2680	append_utf8_from_native_byte(*s, &d);
	2681	s++;
	2682	}
	2683
	2684	*d = '\0';
	2685	*lenp = d-dst;
	2686
	2687	return dst;
	2688	}
	2689
	2690	/*
	2691	* Convert native (big-endian) UTF-16 to UTF-8. For reversed (little-endian),
	2692	* use utf16_to_utf8_reversed().
	2693	*
	2694	* UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes.
	2695	* UTF-8 requires 1-3 bytes for every code point below 0x10000; otherwise 4 bytes.
	2696	* UTF-EBCDIC requires 1-4 bytes for every code point below 0x10000; otherwise 4-5 bytes.
	2697	*
	2698	* These functions don't check for overflow. The worst case is every code
	2699	* point in the input is 2 bytes, and requires 4 bytes on output. (If the code
	2700	* is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.) Therefore the
	2701	* destination must be pre-extended to 2 times the source length.
	2702	*
	2703	* Do not use in-place. We optimize for native, for obvious reasons. */
	2704
	2705	U8*
	2706	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
	2707	{
	2708	U8* pend;
	2709	U8* dstart = d;
	2710
	2711	PERL_ARGS_ASSERT_UTF16_TO_UTF8;
	2712
	2713	if (bytelen & 1)
	2714	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf,
	2715	(UV)bytelen);
	2716
	2717	pend = p + bytelen;
	2718
	2719	while (p < pend) {
	2720	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	2721	p += 2;
	2722	if (OFFUNI_IS_INVARIANT(uv)) {
	2723	*d++ = LATIN1_TO_NATIVE((U8) uv);
	2724	continue;
	2725	}
	2726	if (uv <= MAX_UTF8_TWO_BYTE) {
	2727	*d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
	2728	*d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
	2729	continue;
	2730	}
	2731
	2732	#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
	2733	#define LAST_HIGH_SURROGATE 0xDBFF
	2734	#define FIRST_LOW_SURROGATE 0xDC00
	2735	#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
	2736	#define FIRST_IN_PLANE1 0x10000
	2737
	2738	/* This assumes that most uses will be in the first Unicode plane, not
	2739	* needing surrogates */
	2740	if (UNLIKELY(inRANGE(uv, UNICODE_SURROGATE_FIRST,
	2741	UNICODE_SURROGATE_LAST)))
	2742	{
	2743	if (UNLIKELY(p >= pend) \|\| UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
	2744	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	2745	}
	2746	else {
	2747	UV low = (p[0] << 8) + p[1];
	2748	if (UNLIKELY(! inRANGE(low, FIRST_LOW_SURROGATE,
	2749	LAST_LOW_SURROGATE)))
	2750	{
	2751	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	2752	}
	2753	p += 2;
	2754	uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
	2755	+ (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1;
	2756	}
	2757	}
	2758	#ifdef EBCDIC
	2759	d = uvoffuni_to_utf8_flags(d, uv, 0);
	2760	#else
	2761	if (uv < FIRST_IN_PLANE1) {
	2762	*d++ = (U8)(( uv >> 12) \| 0xe0);
	2763	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	2764	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	2765	continue;
	2766	}
	2767	else {
	2768	*d++ = (U8)(( uv >> 18) \| 0xf0);
	2769	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	2770	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	2771	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	2772	continue;
	2773	}
	2774	#endif
	2775	}
	2776	*newlen = d - dstart;
	2777	return d;
	2778	}
	2779
	2780	/* Note: this one is slightly destructive of the source. */
	2781
	2782	U8*
	2783	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
	2784	{
	2785	U8* s = (U8*)p;
	2786	U8* const send = s + bytelen;
	2787
	2788	PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
	2789
	2790	if (bytelen & 1)
	2791	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
	2792	(UV)bytelen);
	2793
	2794	while (s < send) {
	2795	const U8 tmp = s[0];
	2796	s[0] = s[1];
	2797	s[1] = tmp;
	2798	s += 2;
	2799	}
	2800	return utf16_to_utf8(p, d, bytelen, newlen);
	2801	}
	2802
	2803	bool
	2804	Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
	2805	{
	2806	return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
	2807	}
	2808
	2809	bool
	2810	Perl__is_uni_perl_idcont(pTHX_ UV c)
	2811	{
	2812	return _invlist_contains_cp(PL_utf8_perl_idcont, c);
	2813	}
	2814
	2815	bool
	2816	Perl__is_uni_perl_idstart(pTHX_ UV c)
	2817	{
	2818	return _invlist_contains_cp(PL_utf8_perl_idstart, c);
	2819	}
	2820
	2821	UV
	2822	Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
	2823	const char S_or_s)
	2824	{
	2825	/* We have the latin1-range values compiled into the core, so just use
	2826	* those, converting the result to UTF-8. The only difference between upper
	2827	* and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
	2828	* either "SS" or "Ss". Which one to use is passed into the routine in
	2829	* 'S_or_s' to avoid a test */
	2830
	2831	UV converted = toUPPER_LATIN1_MOD(c);
	2832
	2833	PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
	2834
	2835	assert(S_or_s == 'S' \|\| S_or_s == 's');
	2836
	2837	if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
	2838	characters in this range */
	2839	*p = (U8) converted;
	2840	*lenp = 1;
	2841	return converted;
	2842	}
	2843
	2844	/* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
	2845	* which it maps to one of them, so as to only have to have one check for
	2846	* it in the main case */
	2847	if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
	2848	switch (c) {
	2849	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	2850	converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
	2851	break;
	2852	case MICRO_SIGN:
	2853	converted = GREEK_CAPITAL_LETTER_MU;
	2854	break;
	2855	#if UNICODE_MAJOR_VERSION > 2 \
	2856	\|\| (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
	2857	&& UNICODE_DOT_DOT_VERSION >= 8)
	2858	case LATIN_SMALL_LETTER_SHARP_S:
	2859	*(p)++ = 'S';
	2860	*p = S_or_s;
	2861	*lenp = 2;
	2862	return 'S';
	2863	#endif
	2864	default:
	2865	Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
	2866	" '%c' to map to '%c'",
	2867	c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
	2868	NOT_REACHED; /* NOTREACHED */
	2869	}
	2870	}
	2871
	2872	*(p)++ = UTF8_TWO_BYTE_HI(converted);
	2873	*p = UTF8_TWO_BYTE_LO(converted);
	2874	*lenp = 2;
	2875
	2876	return converted;
	2877	}
	2878
	/* If compiled on an early Unicode version, there may not be auxiliary tables
	 * for some (or any) of the case changes.  For each kind that is missing,
	 * define the names of its pointer and length tables as NULL so that the
	 * CALL_xxx_CASE macros that mention them still compile. */
	#ifndef HAS_UC_AUX_TABLES
	#  define UC_AUX_TABLE_ptrs     NULL
	#  define UC_AUX_TABLE_lengths  NULL
	#endif
	#ifndef HAS_TC_AUX_TABLES
	#  define TC_AUX_TABLE_ptrs     NULL
	#  define TC_AUX_TABLE_lengths  NULL
	#endif
	#ifndef HAS_LC_AUX_TABLES
	#  define LC_AUX_TABLE_ptrs     NULL
	#  define LC_AUX_TABLE_lengths  NULL
	#endif
	#ifndef HAS_CF_AUX_TABLES
	#  define CF_AUX_TABLE_ptrs     NULL
	#  define CF_AUX_TABLE_lengths  NULL
	#endif
	2901
	/* Call the function to convert a character to the specified case.
	 * Note that there may be more than one character in the result.
	 *  'uv'    is the code point of the input character
	 *  's'     is a pointer to the first byte of the UTF-8 encoded input
	 *          character, or NULL (as the to_uni_xxx() callers pass)
	 *  'd'     will be set to the first byte of the string of changed characters.
	 *          It needs to have space for UTF8_MAXBYTES_CASE+1 bytes
	 *  'lenp'  will be set to the length in bytes of the string of changed
	 *          characters
	 *
	 * The functions return the ordinal of the first character in the string of
	 * 'd' */
	#define CALL_UPPER_CASE(uv, s, d, lenp)                                     \
	    _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper,                          \
	                                  Uppercase_Mapping_invmap,                 \
	                                  UC_AUX_TABLE_ptrs,                        \
	                                  UC_AUX_TABLE_lengths,                     \
	                                  "uppercase")
	#define CALL_TITLE_CASE(uv, s, d, lenp)                                     \
	    _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle,                          \
	                                  Titlecase_Mapping_invmap,                 \
	                                  TC_AUX_TABLE_ptrs,                        \
	                                  TC_AUX_TABLE_lengths,                     \
	                                  "titlecase")
	#define CALL_LOWER_CASE(uv, s, d, lenp)                                     \
	    _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower,                          \
	                                  Lowercase_Mapping_invmap,                 \
	                                  LC_AUX_TABLE_ptrs,                        \
	                                  LC_AUX_TABLE_lengths,                     \
	                                  "lowercase")


	/* This additionally has the input parameter 'specials', which if non-zero
	 * selects the full case folding tables (whose results may be more than one
	 * character); otherwise, when zero, the simple case folding tables are used.
	 * The whole expansion is parenthesized so that the conditional can't be
	 * split apart by operators in the surrounding expression. */
	#define CALL_FOLD_CASE(uv, s, d, lenp, specials)                            \
	    ((specials)                                                             \
	     ? _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold,                        \
	                                     Case_Folding_invmap,                   \
	                                     CF_AUX_TABLE_ptrs,                     \
	                                     CF_AUX_TABLE_lengths,                  \
	                                     "foldcase")                            \
	     : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold,                  \
	                                     Simple_Case_Folding_invmap,            \
	                                     NULL, NULL,                            \
	                                     "foldcase"))
	2945
	2946	UV
	2947	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	2948	{
	2949	/* Convert the Unicode character whose ordinal is <c> to its uppercase
	2950	* version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
	2951	* Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	2952	* the changed version may be longer than the original character.
	2953	*
	2954	* The ordinal of the first character of the changed version is returned
	2955	* (but note, as explained above, that there may be more.) */
	2956
	2957	PERL_ARGS_ASSERT_TO_UNI_UPPER;
	2958
	2959	if (c < 256) {
	2960	return _to_upper_title_latin1((U8) c, p, lenp, 'S');
	2961	}
	2962
	2963	return CALL_UPPER_CASE(c, NULL, p, lenp);
	2964	}
	2965
	2966	UV
	2967	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	2968	{
	2969	PERL_ARGS_ASSERT_TO_UNI_TITLE;
	2970
	2971	if (c < 256) {
	2972	return _to_upper_title_latin1((U8) c, p, lenp, 's');
	2973	}
	2974
	2975	return CALL_TITLE_CASE(c, NULL, p, lenp);
	2976	}
	2977
	2978	STATIC U8
	2979	S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
	2980	{
	2981	/* We have the latin1-range values compiled into the core, so just use
	2982	* those, converting the result to UTF-8. Since the result is always just
	2983	* one character, we allow <p> to be NULL */
	2984
	2985	U8 converted = toLOWER_LATIN1(c);
	2986
	2987	PERL_UNUSED_ARG(dummy);
	2988
	2989	if (p != NULL) {
	2990	if (NATIVE_BYTE_IS_INVARIANT(converted)) {
	2991	*p = converted;
	2992	*lenp = 1;
	2993	}
	2994	else {
	2995	/* Result is known to always be < 256, so can use the EIGHT_BIT
	2996	* macros */
	2997	*p = UTF8_EIGHT_BIT_HI(converted);
	2998	*(p+1) = UTF8_EIGHT_BIT_LO(converted);
	2999	*lenp = 2;
	3000	}
	3001	}
	3002	return converted;
	3003	}
	3004
	3005	UV
	3006	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	3007	{
	3008	PERL_ARGS_ASSERT_TO_UNI_LOWER;
	3009
	3010	if (c < 256) {
	3011	return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
	3012	}
	3013
	3014	return CALL_LOWER_CASE(c, NULL, p, lenp);
	3015	}
	3016
	3017	UV
	3018	Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
	3019	{
	3020	/* Corresponds to to_lower_latin1(); <flags> bits meanings:
	3021	* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
	3022	* FOLD_FLAGS_FULL iff full folding is to be used;
	3023	*
	3024	* Not to be used for locale folds
	3025	*/
	3026
	3027	UV converted;
	3028
	3029	PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
	3030
	3031	assert (! (flags & FOLD_FLAGS_LOCALE));
	3032
	3033	if (UNLIKELY(c == MICRO_SIGN)) {
	3034	converted = GREEK_SMALL_LETTER_MU;
	3035	}
	3036	#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
	3037	\|\| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
	3038	\|\| UNICODE_DOT_DOT_VERSION > 0)
	3039	else if ( (flags & FOLD_FLAGS_FULL)
	3040	&& UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
	3041	{
	3042	/* If can't cross 127/128 boundary, can't return "ss"; instead return
	3043	* two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
	3044	* under those circumstances. */
	3045	if (flags & FOLD_FLAGS_NOMIX_ASCII) {
	3046	lenp = 2 sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
	3047	Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
	3048	p, *lenp, U8);
	3049	return LATIN_SMALL_LETTER_LONG_S;
	3050	}
	3051	else {
	3052	*(p)++ = 's';
	3053	*p = 's';
	3054	*lenp = 2;
	3055	return 's';
	3056	}
	3057	}
	3058	#endif
	3059	else { /* In this range the fold of all other characters is their lower
	3060	case */
	3061	converted = toLOWER_LATIN1(c);
	3062	}
	3063
	3064	if (UVCHR_IS_INVARIANT(converted)) {
	3065	*p = (U8) converted;
	3066	*lenp = 1;
	3067	}
	3068	else {
	3069	*(p)++ = UTF8_TWO_BYTE_HI(converted);
	3070	*p = UTF8_TWO_BYTE_LO(converted);
	3071	*lenp = 2;
	3072	}
	3073
	3074	return converted;
	3075	}
	3076
	3077	UV
	3078	Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
	3079	{
	3080
	3081	/* Not currently externally documented, and subject to change
	3082	* <flags> bits meanings:
	3083	* FOLD_FLAGS_FULL iff full folding is to be used;
	3084	* FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
	3085	* locale are to be used.
	3086	* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
	3087	*/
	3088
	3089	PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
	3090
	3091	if (flags & FOLD_FLAGS_LOCALE) {
	3092	/* Treat a non-Turkic UTF-8 locale as not being in locale at all,
	3093	* except for potentially warning */
	3094	_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
	3095	if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) {
	3096	flags &= ~FOLD_FLAGS_LOCALE;
	3097	}
	3098	else {
	3099	goto needs_full_generality;
	3100	}
	3101	}
	3102
	3103	if (c < 256) {
	3104	return _to_fold_latin1((U8) c, p, lenp,
	3105	flags & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII));
	3106	}
	3107
	3108	/* Here, above 255. If no special needs, just use the macro */
	3109	if ( ! (flags & (FOLD_FLAGS_LOCALE\|FOLD_FLAGS_NOMIX_ASCII))) {
	3110	return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
	3111	}
	3112	else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
	3113	the special flags. */
	3114	U8 utf8_c[UTF8_MAXBYTES + 1];
	3115
	3116	needs_full_generality:
	3117	uvchr_to_utf8(utf8_c, c);
	3118	return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c),
	3119	p, lenp, flags);
	3120	}
	3121	}
	3122
	3123	PERL_STATIC_INLINE bool
	3124	S_is_utf8_common(pTHX_ const U8 const p, const U8 const e,
	3125	SV* const invlist)
	3126	{
	3127	/* returns a boolean giving whether or not the UTF8-encoded character that
	3128	* starts at <p>, and extending no further than <e - 1> is in the inversion
	3129	* list <invlist>. */
	3130
	3131	UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
	3132
	3133	PERL_ARGS_ASSERT_IS_UTF8_COMMON;
	3134
	3135	if (cp == 0 && (p >= e \|\| *p != '\0')) {
	3136	_force_out_malformed_utf8_message(p, e, 0, 1);
	3137	NOT_REACHED; /* NOTREACHED */
	3138	}
	3139
	3140	assert(invlist);
	3141	return _invlist_contains_cp(invlist, cp);
	3142	}
	3143
	#if 0   /* Not currently used, but may be needed in the future */
	PERLVAR(I, seen_deprecated_macro, HV *)

	/* Raise a deprecation warning for the use of 'name' at 'file':'line', unless
	 * the same (name, use_locale, file, line) combination has already been
	 * recorded in PL_seen_deprecated_macro.  'alternative' is the replacement
	 * named in the message. */
	STATIC void
	S_warn_on_first_deprecated_use(pTHX_ const char * const name,
	                                     const char * const alternative,
	                                     const bool use_locale,
	                                     const char * const file,
	                                     const unsigned line)
	{
	    const char * key;

	    PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;

	    if (! ckWARN_d(WARN_DEPRECATED)) {
	        return;
	    }

	    key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
	    if (hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
	        return;     /* This combination has been seen before */
	    }

	    /* Record it so it is reported only this once */
	    if (! PL_seen_deprecated_macro) {
	        PL_seen_deprecated_macro = newHV();
	    }
	    if (! hv_store(PL_seen_deprecated_macro, key,
	                   strlen(key), &PL_sv_undef, 0))
	    {
	        Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	    }

	    if (! instr(file, "mathoms.c")) {
	        Perl_warner(aTHX_ WARN_DEPRECATED,
	                    "In %s, line %d, starting in Perl v5.32, %s() will"
	                    " require an additional parameter. Avoid this"
	                    " message by converting to use %s().\n",
	                    file, line, name, alternative);
	    }
	    else {
	        Perl_warner(aTHX_ WARN_DEPRECATED,
	                    "In %s, line %d, starting in Perl v5.32, %s()"
	                    " will be removed. Avoid this message by"
	                    " converting to use %s().\n",
	                    file, line, name, alternative);
	    }
	}
	#endif
	3189
	3190	bool
	3191	Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 p, const U8 const e)
	3192	{
	3193	PERL_ARGS_ASSERT__IS_UTF8_FOO;
	3194
	3195	return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]);
	3196	}
	3197
	3198	bool
	3199	Perl__is_utf8_perl_idstart(pTHX_ const U8 p, const U8 const e)
	3200	{
	3201	PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
	3202
	3203	return is_utf8_common(p, e, PL_utf8_perl_idstart);
	3204	}
	3205
	3206	bool
	3207	Perl__is_utf8_perl_idcont(pTHX_ const U8 p, const U8 const e)
	3208	{
	3209	PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
	3210
	3211	return is_utf8_common(p, e, PL_utf8_perl_idcont);
	3212	}
	3213
	3214	STATIC UV
	3215	S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
	3216	U8* ustrp, STRLEN *lenp,
	3217	SV invlist, const I32 const invmap,
	3218	const U32 * const * const aux_tables,
	3219	const U8 * const aux_table_lengths,
	3220	const char * const normal)
	3221	{
	3222	STRLEN len = 0;
	3223
	3224	/* Change the case of code point 'uv1' whose UTF-8 representation (assumed
	3225	* by this routine to be valid) begins at 'p'. 'normal' is a string to use
	3226	* to name the new case in any generated messages, as a fallback if the
	3227	* operation being used is not available. The new case is given by the
	3228	* data structures in the remaining arguments.
	3229	*
	3230	* On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
	3231	* entire changed case string, and the return value is the first code point
	3232	* in that string */
	3233
	3234	PERL_ARGS_ASSERT__TO_UTF8_CASE;
	3235
	3236	/* For code points that don't change case, we already know that the output
	3237	* of this function is the unchanged input, so we can skip doing look-ups
	3238	* for them. Unfortunately the case-changing code points are scattered
	3239	* around. But there are some long consecutive ranges where there are no
	3240	* case changing code points. By adding tests, we can eliminate the lookup
	3241	* for all the ones in such ranges. This is currently done here only for
	3242	* just a few cases where the scripts are in common use in modern commerce
	3243	* (and scripts adjacent to those which can be included without additional
	3244	* tests). */
	3245
	3246	if (uv1 >= 0x0590) {
	3247	/* This keeps from needing further processing the code points most
	3248	* likely to be used in the following non-cased scripts: Hebrew,
	3249	* Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
	3250	* Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
	3251	* Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
	3252	if (uv1 < 0x10A0) {
	3253	goto cases_to_self;
	3254	}
	3255
	3256	/* The following largish code point ranges also don't have case
	3257	* changes, but khw didn't think they warranted extra tests to speed
	3258	* them up (which would slightly slow down everything else above them):
	3259	* 1100..139F Hangul Jamo, Ethiopic
	3260	* 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
	3261	* Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
	3262	* Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
	3263	* Combining Diacritical Marks Extended, Balinese,
	3264	* Sundanese, Batak, Lepcha, Ol Chiki
	3265	* 2000..206F General Punctuation
	3266	*/
	3267
	3268	if (uv1 >= 0x2D30) {
	3269
	3270	/* This keeps the from needing further processing the code points
	3271	* most likely to be used in the following non-cased major scripts:
	3272	* CJK, Katakana, Hiragana, plus some less-likely scripts.
	3273	*
	3274	* (0x2D30 above might have to be changed to 2F00 in the unlikely
	3275	* event that Unicode eventually allocates the unused block as of
	3276	* v8.0 2FE0..2FEF to code points that are cased. khw has verified
	3277	* that the test suite will start having failures to alert you
	3278	* should that happen) */
	3279	if (uv1 < 0xA640) {
	3280	goto cases_to_self;
	3281	}
	3282
	3283	if (uv1 >= 0xAC00) {
	3284	if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
	3285	if (ckWARN_d(WARN_SURROGATE)) {
	3286	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	3287	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	3288	"Operation \"%s\" returns its argument for"
	3289	" UTF-16 surrogate U+%04" UVXf, desc, uv1);
	3290	}
	3291	goto cases_to_self;
	3292	}
	3293
	3294	/* AC00..FAFF Catches Hangul syllables and private use, plus
	3295	* some others */
	3296	if (uv1 < 0xFB00) {
	3297	goto cases_to_self;
	3298	}
	3299
	3300	if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
	3301	if (UNLIKELY(uv1 > MAX_LEGAL_CP)) {
	3302	Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv1));
	3303	}
	3304	if (ckWARN_d(WARN_NON_UNICODE)) {
	3305	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	3306	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	3307	"Operation \"%s\" returns its argument for"
	3308	" non-Unicode code point 0x%04" UVXf, desc, uv1);
	3309	}
	3310	goto cases_to_self;
	3311	}
	3312	#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
	3313	if (UNLIKELY(uv1
	3314	> HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
	3315	{
	3316
	3317	goto cases_to_self;
	3318	}
	3319	#endif
	3320	}
	3321	}
	3322
	3323	/* Note that non-characters are perfectly legal, so no warning should
	3324	* be given. */
	3325	}
	3326
	3327	{
	3328	unsigned int i;
	3329	const U32 * cp_list;
	3330	U8 * d;
	3331
	3332	/* 'index' is guaranteed to be non-negative, as this is an inversion
	3333	* map that covers all possible inputs. See [perl #133365] */
	3334	SSize_t index = _invlist_search(invlist, uv1);
	3335	I32 base = invmap[index];
	3336
	3337	/* The data structures are set up so that if 'base' is non-negative,
	3338	* the case change is 1-to-1; and if 0, the change is to itself */
	3339	if (base >= 0) {
	3340	IV lc;
	3341
	3342	if (base == 0) {
	3343	goto cases_to_self;
	3344	}
	3345
	3346	/* This computes, e.g. lc(H) as 'H - A + a', using the lc table */
	3347	lc = base + uv1 - invlist_array(invlist)[index];
	3348	*lenp = uvchr_to_utf8(ustrp, lc) - ustrp;
	3349	return lc;
	3350	}
	3351
	3352	/* Here 'base' is negative. That means the mapping is 1-to-many, and
	3353	* requires an auxiliary table look up. abs(base) gives the index into
	3354	* a list of such tables which points to the proper aux table. And a
	3355	* parallel list gives the length of each corresponding aux table. */
	3356	cp_list = aux_tables[-base];
	3357
	3358	/* Create the string of UTF-8 from the mapped-to code points */
	3359	d = ustrp;
	3360	for (i = 0; i < aux_table_lengths[-base]; i++) {
	3361	d = uvchr_to_utf8(d, cp_list[i]);
	3362	}
	3363	*d = '\0';
	3364	*lenp = d - ustrp;
	3365
	3366	return cp_list[0];
	3367	}
	3368
	3369	/* Here, there was no mapping defined, which means that the code point maps
	3370	* to itself. Return the inputs */
	3371	cases_to_self:
	3372	if (p) {
	3373	len = UTF8SKIP(p);
	3374	if (p != ustrp) { /* Don't copy onto itself */
	3375	Copy(p, ustrp, len, U8);
	3376	}
	3377	*lenp = len;
	3378	}
	3379	else {
	3380	*lenp = uvchr_to_utf8(ustrp, uv1) - ustrp;
	3381	}
	3382
	3383	return uv1;
	3384
	3385	}
	3386
	3387	Size_t
	3388	Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to,
	3389	const U32 ** remaining_folds_to)
	3390	{
	3391	/* Returns the count of the number of code points that fold to the input
	3392	* 'cp' (besides itself).
	3393	*
	3394	* If the return is 0, there is nothing else that folds to it, and
	3395	* 'first_folds_to' is set to 0, and 'remaining_folds_to' is set to NULL.
	3396	*
	3397	* If the return is 1, '*first_folds_to' is set to the single code point,
	3398	* and '*remaining_folds_to' is set to NULL.
	3399	*
	3400	* Otherwise, '*first_folds_to' is set to a code point, and
	3401	* '*remaining_fold_to' is set to an array that contains the others. The
	3402	* length of this array is the returned count minus 1.
	3403	*
	3404	* The reason for this convolution is to avoid having to deal with
	3405	* allocating and freeing memory. The lists are already constructed, so
	3406	* the return can point to them, but single code points aren't, so would
	3407	* need to be constructed if we didn't employ something like this API
	3408	*
	3409	* The code points returned by this function are all legal Unicode, which
	3410	* occupy at most 21 bits, and so a U32 is sufficient, and the lists are
	3411	* constructed with this size (to save space and memory), and we return
	3412	* pointers, so they must be this size */
	3413
	3414	/* 'index' is guaranteed to be non-negative, as this is an inversion map
	3415	* that covers all possible inputs. See [perl #133365] */
	3416	SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
	3417	I32 base = _Perl_IVCF_invmap[index];
	3418
	3419	PERL_ARGS_ASSERT__INVERSE_FOLDS;
	3420
	3421	if (base == 0) { /* No fold */
	3422	*first_folds_to = 0;
	3423	*remaining_folds_to = NULL;
	3424	return 0;
	3425	}
	3426
	3427	#ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */
	3428
	3429	assert(base > 0);
	3430
	3431	#else
	3432
	3433	if (UNLIKELY(base < 0)) { /* Folds to more than one character */
	3434
	3435	/* The data structure is set up so that the absolute value of 'base' is
	3436	* an index into a table of pointers to arrays, with the array
	3437	* corresponding to the index being the list of code points that fold
	3438	* to 'cp', and the parallel array containing the length of the list
	3439	* array */
	3440	*first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
	3441	*remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1;
	3442	/* +1 excludes first_folds_to */
	3443	return IVCF_AUX_TABLE_lengths[-base];
	3444	}
	3445
	3446	#endif
	3447
	3448	/* Only the single code point. This works like 'fc(G) = G - A + a' */
	3449	*first_folds_to = (U32) (base + cp
	3450	- invlist_array(PL_utf8_foldclosures)[index]);
	3451	*remaining_folds_to = NULL;
	3452	return 1;
	3453	}
	3454
	3455	STATIC UV
	3456	S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
	3457	U8* const ustrp, STRLEN *lenp)
	3458	{
	3459	/* This is called when changing the case of a UTF-8-encoded character above
	3460	* the Latin1 range, and the operation is in a non-UTF-8 locale. If the
	3461	* result contains a character that crosses the 255/256 boundary, disallow
	3462	* the change, and return the original code point. See L<perlfunc/lc> for
	3463	* why;
	3464	*
	3465	* p points to the original string whose case was changed; assumed
	3466	* by this routine to be well-formed
	3467	* result the code point of the first character in the changed-case string
	3468	* ustrp points to the changed-case string (<result> represents its
	3469	* first char)
	3470	* lenp points to the length of <ustrp> */
	3471
	3472	UV original; /* To store the first code point of <p> */
	3473
	3474	PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
	3475
	3476	assert(UTF8_IS_ABOVE_LATIN1(*p));
	3477
	3478	/* We know immediately if the first character in the string crosses the
	3479	* boundary, so can skip testing */
	3480	if (result > 255) {
	3481
	3482	/* Look at every character in the result; if any cross the
	3483	* boundary, the whole thing is disallowed */
	3484	U8* s = ustrp + UTF8SKIP(ustrp);
	3485	U8* e = ustrp + *lenp;
	3486	while (s < e) {
	3487	if (! UTF8_IS_ABOVE_LATIN1(*s)) {
	3488	goto bad_crossing;
	3489	}
	3490	s += UTF8SKIP(s);
	3491	}
	3492
	3493	/* Here, no characters crossed, result is ok as-is, but we warn. */
	3494	_CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
	3495	return result;
	3496	}
	3497
	3498	bad_crossing:
	3499
	3500	/* Failed, have to return the original */
	3501	original = valid_utf8_to_uvchr(p, lenp);
	3502
	3503	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3504	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3505	"Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
	3506	" locale; resolved to \"\\x{%" UVXf "}\".",
	3507	OP_DESC(PL_op),
	3508	original,
	3509	original);
	3510	Copy(p, ustrp, *lenp, char);
	3511	return original;
	3512	}
	3513
	3514	STATIC UV
	3515	S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
	3516	U8 * ustrp, STRLEN *lenp)
	3517	{
	3518	/* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
	3519	* p0..e-1 according to Turkic rules is the same as for non-Turkic.
	3520	* Otherwise, it returns the first code point of the Turkic foldcased
	3521	* sequence, and the entire sequence will be stored in *ustrp. ustrp will
	3522	* contain *lenp bytes
	3523	*
	3524	* Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
	3525	* I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
	3526	* DOTLESS I */
	3527
	3528	PERL_ARGS_ASSERT_TURKIC_FC;
	3529	assert(e > p);
	3530
	3531	if (UNLIKELY(*p == 'I')) {
	3532	*lenp = 2;
	3533	ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
	3534	ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
	3535	return LATIN_SMALL_LETTER_DOTLESS_I;
	3536	}
	3537
	3538	if (UNLIKELY(memBEGINs(p, e - p,
	3539	LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
	3540	{
	3541	*lenp = 1;
	3542	*ustrp = 'i';
	3543	return 'i';
	3544	}
	3545
	3546	return 0;
	3547	}
	3548
	3549	STATIC UV
	3550	S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
	3551	U8 * ustrp, STRLEN *lenp)
	3552	{
	3553	/* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
	3554	* p0..e-1 according to Turkic rules is the same as for non-Turkic.
	3555	* Otherwise, it returns the first code point of the Turkic lowercased
	3556	* sequence, and the entire sequence will be stored in *ustrp. ustrp will
	3557	* contain lenp bytes /
	3558
	3559	PERL_ARGS_ASSERT_TURKIC_LC;
	3560	assert(e > p0);
	3561
	3562	/* A 'I' requires context as to what to do */
	3563	if (UNLIKELY(*p0 == 'I')) {
	3564	const U8 * p = p0 + 1;
	3565
	3566	/* According to the Unicode SpecialCasing.txt file, a capital 'I'
	3567	* modified by a dot above lowercases to 'i' even in turkic locales. */
	3568	while (p < e) {
	3569	UV cp;
	3570
	3571	if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
	3572	ustrp[0] = 'i';
	3573	*lenp = 1;
	3574	return 'i';
	3575	}
	3576
	3577	/* For the dot above to modify the 'I', it must be part of a
	3578	* combining sequence immediately following the 'I', and no other
	3579	* modifier with a ccc of 230 may intervene */
	3580	cp = utf8_to_uvchr_buf(p, e, NULL);
	3581	if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
	3582	break;
	3583	}
	3584
	3585	/* Here the combining sequence continues */
	3586	p += UTF8SKIP(p);
	3587	}
	3588	}
	3589
	3590	/* In all other cases the lc is the same as the fold */
	3591	return turkic_fc(p0, e, ustrp, lenp);
	3592	}
	3593
	3594	STATIC UV
	3595	S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
	3596	U8 * ustrp, STRLEN *lenp)
	3597	{
	3598	/* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
	3599	* from p0..e-1 according to Turkic rules is the same as for non-Turkic.
	3600	* Otherwise, it returns the first code point of the Turkic upper or
	3601	* title-cased sequence, and the entire sequence will be stored in *ustrp.
	3602	* ustrp will contain *lenp bytes
	3603	*
	3604	* Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
	3605	* I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
	3606	* DOTLESS I */
	3607
	3608	PERL_ARGS_ASSERT_TURKIC_UC;
	3609	assert(e > p);
	3610
	3611	if (*p == 'i') {
	3612	*lenp = 2;
	3613	ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
	3614	ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
	3615	return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
	3616	}
	3617
	3618	if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
	3619	*lenp = 1;
	3620	*ustrp = 'I';
	3621	return 'I';
	3622	}
	3623
	3624	return 0;
	3625	}
	3626
	3627	/* The process for changing the case is essentially the same for the four case
	3628	* change types, except there are complications for folding. Otherwise the
	3629	* difference is only which case to change to. To make sure that they all do
	3630	* the same thing, the bodies of the functions are extracted out into the
	3631	* following two macros. The functions are written with the same variable
	3632	* names, and these are known and used inside these macros. It would be
	3633	* better, of course, to have inline functions to do it, but since different
	3634	* macros are called, depending on which case is being changed to, this is not
	3635	* feasible in C (to khw's knowledge). Two macros are created so that the fold
	3636	* function can start with the common start macro, then finish with its special
	3637	* handling; while the other three cases can just use the common end macro.
	3638	*
	3639	* The algorithm is to use the proper (passed in) macro or function to change
	3640	* the case for code points that are below 256. The macro is used if using
	3641	* locale rules for the case change; the function if not. If the code point is
	3642	* above 255, it is computed from the input UTF-8, and another macro is called
	3643	* to do the conversion. If necessary, the output is converted to UTF-8. If
	3644	* using a locale, we have to check that the change did not cross the 255/256
	3645	* boundary, see check_locale_boundary_crossing() for further details.
	3646	*
	3647	* The macros are split with the correct case change for the below-256 case
	3648	* stored into 'result', and in the middle of an else clause for the above-255
	3649	* case. At that point in the 'else', 'result' is not the final result, but is
	3650	* the input code point calculated from the UTF-8. The fold code needs to
	3651	* realize all this and take it from there.
	3652	*
	3653	* To deal with Turkic locales, the function specified by the parameter
	3654	* 'turkic' is called when appropriate.
	3655	*
	3656	* If you read the two macros as sequential, it's easier to understand what's
	3657	* going on. */
	/* First half of the shared case-change body.  It expects the enclosing
	 * function to have 'p', 'e', 'ustrp', 'lenp', 'flags' and 'result' in scope.
	 * On exit the expansion is in the middle of an open 'else' block (input is
	 * above Latin1), with 'result' holding the INPUT code point; the caller
	 * must supply the rest of that block and its closing brace (e.g. through
	 * CASE_CHANGE_BODY_END). */
	#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func,   \
	                               L1_func_extra_param, turkic)                 \
	                                                                            \
	    if (flags & (locale_flags)) {                                           \
	        _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                 \
	        if (IN_UTF8_CTYPE_LOCALE) {                                         \
	            if (UNLIKELY(PL_in_utf8_turkic_locale)) {                       \
	                UV ret = turkic(p, e, ustrp, lenp);                         \
	                if (ret) return ret;                                        \
	            }                                                               \
	                                                                            \
	            /* Otherwise, treat a UTF-8 locale as not being in locale at    \
	             * all */                                                       \
	            flags &= ~(locale_flags);                                       \
	        }                                                                   \
	    }                                                                       \
	                                                                            \
	    if (UTF8_IS_INVARIANT(*p)) {                                            \
	        if (flags & (locale_flags)) {                                       \
	            result = LC_L1_change_macro(*p);                                \
	        }                                                                   \
	        else {                                                              \
	            return L1_func(*p, ustrp, lenp, L1_func_extra_param);           \
	        }                                                                   \
	    }                                                                       \
	    else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {                       \
	        /* Two-byte sequence for a Latin1 code point: combine the two       \
	         * BYTES (not the pointers) into the native character */            \
	        U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));                        \
	        if (flags & (locale_flags)) {                                       \
	            result = LC_L1_change_macro(c);                                 \
	        }                                                                   \
	        else {                                                              \
	            return L1_func(c, ustrp, lenp, L1_func_extra_param);            \
	        }                                                                   \
	    }                                                                       \
	    else {  /* malformed UTF-8 or ord above 255 */                          \
	        STRLEN len_result;                                                  \
	        result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY);    \
	        if (len_result == (STRLEN) -1) {                                    \
	            _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ );       \
	        }
	3698
	/* Second half of the shared case-change body.  It first finishes the 'else'
	 * block that CASE_CHANGE_BODY_START leaves open (input above Latin1): the
	 * case is changed by 'change_macro', and under locale rules the outcome is
	 * vetted by check_locale_boundary_crossing().  What follows the closing
	 * brace is reached only when START stored a locale-rules result for a
	 * below-256 input in 'result'; that value is written out as UTF-8. */
	#define CASE_CHANGE_BODY_END(locale_flags, change_macro)                     \
	        result = change_macro(result, p, ustrp, lenp);                       \
	                                                                             \
	        return (flags & (locale_flags))                                      \
	               ? check_locale_boundary_crossing(p, result, ustrp, lenp)      \
	               : result;                                                     \
	    }                                                                        \
	                                                                             \
	    /* Here, used locale rules.  Convert back to UTF-8 */                    \
	    if (! UTF8_IS_INVARIANT(result)) {                                       \
	        ustrp[0] = UTF8_EIGHT_BIT_HI((U8) result);                           \
	        ustrp[1] = UTF8_EIGHT_BIT_LO((U8) result);                           \
	        *lenp = 2;                                                           \
	    }                                                                        \
	    else {                                                                   \
	        ustrp[0] = (U8) result;                                              \
	        *lenp = 1;                                                           \
	    }                                                                        \
	                                                                             \
	    return result;
	3720
	3721	/* Not currently externally documented, and subject to change:
	3722	* <flags> is set iff the rules from the current underlying locale are to
	3723	* be used. */
	3724
	3725	UV
	3726	Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
	3727	const U8 *e,
	3728	U8* ustrp,
	3729	STRLEN *lenp,
	3730	bool flags)
	3731	{
	3732	UV result;
	3733
	3734	PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
	3735
	3736	/* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
	3737	/* 2nd char of uc(U+DF) is 'S' */
	3738	CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S',
	3739	turkic_uc);
	3740	CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
	3741	}
	3742
	3743	/* Not currently externally documented, and subject to change:
	3744	* <flags> is set iff the rules from the current underlying locale are to be
	3745	* used. Since titlecase is not defined in POSIX, for other than a
	3746	* UTF-8 locale, uppercase is used instead for code points < 256.
	3747	*/
	3748
	3749	UV
	3750	Perl__to_utf8_title_flags(pTHX_ const U8 *p,
	3751	const U8 *e,
	3752	U8* ustrp,
	3753	STRLEN *lenp,
	3754	bool flags)
	3755	{
	3756	UV result;
	3757
	3758	PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
	3759
	3760	/* 2nd char of ucfirst(U+DF) is 's' */
	3761	CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's',
	3762	turkic_uc);
	3763	CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
	3764	}
	3765
	3766	/* Not currently externally documented, and subject to change:
	3767	* <flags> is set iff the rules from the current underlying locale are to
	3768	* be used.
	3769	*/
	3770
	3771	UV
	3772	Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
	3773	const U8 *e,
	3774	U8* ustrp,
	3775	STRLEN *lenp,
	3776	bool flags)
	3777	{
	3778	UV result;
	3779
	3780	PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
	3781
	3782	CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */,
	3783	turkic_lc);
	3784	CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
	3785	}
	3786
	3787	/* Not currently externally documented, and subject to change,
	3788	* in <flags>
	3789	* bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
	3790	* locale are to be used.
	3791	* bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
	3792	* otherwise simple folds
	3793	* bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
	3794	* prohibited
	3795	*/
	3796
	3797	UV
	3798	Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
	3799	const U8 *e,
	3800	U8* ustrp,
	3801	STRLEN *lenp,
	3802	U8 flags)
	3803	{
	3804	UV result;
	3805
	3806	PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
	3807
	3808	/* These are mutually exclusive */
	3809	assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
	3810
	3811	assert(p != ustrp); /* Otherwise overwrites */
	3812
	3813	CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
	3814	((flags) & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII)),
	3815	turkic_fc);
	3816
	3817	result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
	3818
	3819	if (flags & FOLD_FLAGS_LOCALE) {
	3820
	3821	# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
	3822	# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
	3823	# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
	3824
	3825	/* Special case these two characters, as what normally gets
	3826	* returned under locale doesn't work */
	3827	if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
	3828	{
	3829	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3830	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3831	"Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
	3832	"resolved to \"\\x{17F}\\x{17F}\".");
	3833	goto return_long_s;
	3834	}
	3835	else
	3836	#endif
	3837	if (memBEGINs((char *) p, e - p, LONG_S_T))
	3838	{
	3839	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3840	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3841	"Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
	3842	"resolved to \"\\x{FB06}\".");
	3843	goto return_ligature_st;
	3844	}
	3845
	3846	#if UNICODE_MAJOR_VERSION == 3 \
	3847	&& UNICODE_DOT_VERSION == 0 \
	3848	&& UNICODE_DOT_DOT_VERSION == 1
	3849	# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
	3850
	3851	/* And special case this on this Unicode version only, for the same
	3852	* reaons the other two are special cased. They would cross the
	3853	* 255/256 boundary which is forbidden under /l, and so the code
	3854	* wouldn't catch that they are equivalent (which they are only in
	3855	* this release) */
	3856	else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
	3857	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3858	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3859	"Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
	3860	"resolved to \"\\x{0131}\".");
	3861	goto return_dotless_i;
	3862	}
	3863	#endif
	3864
	3865	return check_locale_boundary_crossing(p, result, ustrp, lenp);
	3866	}
	3867	else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
	3868	return result;
	3869	}
	3870	else {
	3871	/* This is called when changing the case of a UTF-8-encoded
	3872	* character above the ASCII range, and the result should not
	3873	* contain an ASCII character. */
	3874
	3875	UV original; /* To store the first code point of <p> */
	3876
	3877	/* Look at every character in the result; if any cross the
	3878	* boundary, the whole thing is disallowed */
	3879	U8* s = ustrp;
	3880	U8* send = ustrp + *lenp;
	3881	while (s < send) {
	3882	if (isASCII(*s)) {
	3883	/* Crossed, have to return the original */
	3884	original = valid_utf8_to_uvchr(p, lenp);
	3885
	3886	/* But in these instances, there is an alternative we can
	3887	* return that is valid */
	3888	if (original == LATIN_SMALL_LETTER_SHARP_S
	3889	#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
	3890	\|\| original == LATIN_CAPITAL_LETTER_SHARP_S
	3891	#endif
	3892	) {
	3893	goto return_long_s;
	3894	}
	3895	else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
	3896	goto return_ligature_st;
	3897	}
	3898	#if UNICODE_MAJOR_VERSION == 3 \
	3899	&& UNICODE_DOT_VERSION == 0 \
	3900	&& UNICODE_DOT_DOT_VERSION == 1
	3901
	3902	else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
	3903	goto return_dotless_i;
	3904	}
	3905	#endif
	3906	Copy(p, ustrp, *lenp, char);
	3907	return original;
	3908	}
	3909	s += UTF8SKIP(s);
	3910	}
	3911
	3912	/* Here, no characters crossed, result is ok as-is */
	3913	return result;
	3914	}
	3915	}
	3916
	3917	/* Here, used locale rules. Convert back to UTF-8 */
	3918	if (UTF8_IS_INVARIANT(result)) {
	3919	*ustrp = (U8) result;
	3920	*lenp = 1;
	3921	}
	3922	else {
	3923	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
	3924	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
	3925	*lenp = 2;
	3926	}
	3927
	3928	return result;
	3929
	3930	return_long_s:
	3931	/* Certain folds to 'ss' are prohibited by the options, but they do allow
	3932	* folds to a string of two of these characters. By returning this
	3933	* instead, then, e.g.,
	3934	* fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
	3935	* works. */
	3936
	3937	lenp = 2 sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
	3938	Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
	3939	ustrp, *lenp, U8);
	3940	return LATIN_SMALL_LETTER_LONG_S;
	3941
	3942	return_ligature_st:
	3943	/* Two folds to 'st' are prohibited by the options; instead we pick one and
	3944	* have the other one fold to it */
	3945
	3946	*lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
	3947	Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
	3948	return LATIN_SMALL_LIGATURE_ST;
	3949
	3950	#if UNICODE_MAJOR_VERSION == 3 \
	3951	&& UNICODE_DOT_VERSION == 0 \
	3952	&& UNICODE_DOT_DOT_VERSION == 1
	3953
	3954	return_dotless_i:
	3955	*lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
	3956	Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
	3957	return LATIN_SMALL_LETTER_DOTLESS_I;
	3958
	3959	#endif
	3960
	3961	}
	3962
	3963	bool
	3964	Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
	3965	{
	3966	/* May change: warns if surrogates, non-character code points, or
	3967	* non-Unicode code points are in 's' which has length 'len' bytes.
	3968	* Returns TRUE if none found; FALSE otherwise. The only other validity
	3969	* check is to make sure that this won't exceed the string's length nor
	3970	* overflow */
	3971
	3972	const U8* const e = s + len;
	3973	bool ok = TRUE;
	3974
	3975	PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
	3976
	3977	while (s < e) {
	3978	if (UTF8SKIP(s) > len) {
	3979	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	3980	"%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
	3981	return FALSE;
	3982	}
	3983	if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
	3984	if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
	3985	if ( ckWARN_d(WARN_NON_UNICODE)
	3986	\|\| UNLIKELY(0 < does_utf8_overflow(s, s + len,
	3987	0 /* Don't consider overlongs */
	3988	)))
	3989	{
	3990	/* A side effect of this function will be to warn */
	3991	(void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
	3992	ok = FALSE;
	3993	}
	3994	}
	3995	else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
	3996	if (ckWARN_d(WARN_SURROGATE)) {
	3997	/* This has a different warning than the one the called
	3998	* function would output, so can't just call it, unlike we
	3999	* do for the non-chars and above-unicodes */
	4000	UV uv = utf8_to_uvchr_buf(s, e, NULL);
	4001	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	4002	"Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
	4003	uv);
	4004	ok = FALSE;
	4005	}
	4006	}
	4007	else if ( UNLIKELY(UTF8_IS_NONCHAR(s, e))
	4008	&& (ckWARN_d(WARN_NONCHAR)))
	4009	{
	4010	/* A side effect of this function will be to warn */
	4011	(void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
	4012	ok = FALSE;
	4013	}
	4014	}
	4015	s += UTF8SKIP(s);
	4016	}
	4017
	4018	return ok;
	4019	}
	4020
	4021	/*
	4022	=for apidoc pv_uni_display
	4023
	4024	Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string
	4025	C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes
	4026	long (if longer, the rest is truncated and C<"..."> will be appended).
	4027
	4028	The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
	4029	C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
	4030	to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
	4031	(C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
	4032	C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
	4033	C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
	4034
	4035	Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a
	4036	backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set.
	4037
	4038	The pointer to the PV of the C<dsv> is returned.
	4039
	4040	See also L</sv_uni_display>.
	4041
	4042	=cut */
	4043	char *
	4044	Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim,
	4045	UV flags)
	4046	{
	4047	int truncated = 0;
	4048	const char s, e;
	4049
	4050	PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
	4051
	4052	SvPVCLEAR(dsv);
	4053	SvUTF8_off(dsv);
	4054	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	4055	UV u;
	4056	bool ok = 0;
	4057
	4058	if (pvlim && SvCUR(dsv) >= pvlim) {
	4059	truncated++;
	4060	break;
	4061	}
	4062	u = utf8_to_uvchr_buf((U8)s, (U8)e, 0);
	4063	if (u < 256) {
	4064	const unsigned char c = (unsigned char)u & 0xFF;
	4065	if (flags & UNI_DISPLAY_BACKSLASH) {
	4066	if ( isMNEMONIC_CNTRL(c)
	4067	&& ( c != '\b'
	4068	\|\| (flags & UNI_DISPLAY_BACKSPACE)))
	4069	{
	4070	const char * mnemonic = cntrl_to_mnemonic(c);
	4071	sv_catpvn(dsv, mnemonic, strlen(mnemonic));
	4072	ok = 1;
	4073	}
	4074	else if (c == '\\') {
	4075	sv_catpvs(dsv, "\\\\");
	4076	ok = 1;
	4077	}
	4078	}
	4079	/* isPRINT() is the locale-blind version. */
	4080	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	4081	const char string = c;
	4082	sv_catpvn(dsv, &string, 1);
	4083	ok = 1;
	4084	}
	4085	}
	4086	if (!ok)
	4087	Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u);
	4088	}
	4089	if (truncated)
	4090	sv_catpvs(dsv, "...");
	4091
	4092	return SvPVX(dsv);
	4093	}
	4094
	4095	/*
	4096	=for apidoc sv_uni_display
	4097
	4098	Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
	4099	the displayable version being at most C<pvlim> bytes long
	4100	(if longer, the rest is truncated and "..." will be appended).
	4101
	4102	The C<flags> argument is as in L</pv_uni_display>().
	4103
	4104	The pointer to the PV of the C<dsv> is returned.
	4105
	4106	=cut
	4107	*/
	4108	char *
	4109	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	4110	{
	4111	const char * const ptr =
	4112	isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
	4113
	4114	PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
	4115
	4116	return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
	4117	SvCUR(ssv), pvlim, flags);
	4118	}
	4119
	4120	/*
	4121	=for apidoc foldEQ_utf8
	4122
	4123	Returns true if the leading portions of the strings C<s1> and C<s2> (either or
	4124	both of which may be in UTF-8) are the same case-insensitively; false
	4125	otherwise. How far into the strings to compare is determined by other input
	4126	parameters.
	4127
	4128	If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
	4129	otherwise it is assumed to be in native 8-bit encoding. Correspondingly for
	4130	C<u2> with respect to C<s2>.
	4131
	4132	If the byte length C<l1> is non-zero, it says how far into C<s1> to check for
	4133	fold equality. In other words, C<s1>+C<l1> will be used as a goal to reach.
	4134	The scan will not be considered to be a match unless the goal is reached, and
	4135	scanning won't continue past that goal. Correspondingly for C<l2> with respect
	4136	to C<s2>.
	4137
	4138	If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that
	4139	pointer is considered an end pointer to the position 1 byte past the maximum
	4140	point in C<s1> beyond which scanning will not continue under any circumstances.
	4141	(This routine assumes that UTF-8 encoded input strings are not malformed;
	4142	malformed input can cause it to read past C<pe1>). This means that if both
	4143	C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match
	4144	will never be successful because it can never
	4145	get as far as its goal (and in fact is asserted against). Correspondingly for
	4146	C<pe2> with respect to C<s2>.
	4147
	4148	At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
	4149	C<l2> must be non-zero), and if both do, both have to be
	4150	reached for a successful match. Also, if the fold of a character is multiple
	4151	characters, all of them must be matched (see tr21 reference below for
	4152	'folding').
	4153
	4154	Upon a successful match, if C<pe1> is non-C<NULL>,
	4155	it will be set to point to the beginning of the I<next> character of C<s1>
	4156	beyond what was matched. Correspondingly for C<pe2> and C<s2>.
	4157
	4158	For case-insensitiveness, the "casefolding" of Unicode is used
	4159	instead of upper/lowercasing both the characters, see
	4160	L<https://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
	4161
	4162	=cut */
	4163
	4164	/* A flags parameter has been added which may change, and hence isn't
	4165	* externally documented. Currently it is:
	4166	* 0 for as-documented above
	4167	* FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
	4168	ASCII one, to not match
	4169	* FOLDEQ_LOCALE is set iff the rules from the current underlying
	4170	* locale are to be used.
	4171	* FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
	4172	* routine. This allows that step to be skipped.
	4173	* Currently, this requires s1 to be encoded as UTF-8
	4174	* (u1 must be true), which is asserted for.
	4175	* FOLDEQ_S1_FOLDS_SANE With either NOMIX_ASCII or LOCALE, no folds may
	4176	* cross certain boundaries. Hence, the caller should
	4177	* let this function do the folding instead of
	4178	* pre-folding. This code contains an assertion to
	4179	* that effect. However, if the caller knows what
	4180	* it's doing, it can pass this flag to indicate that,
	4181	* and the assertion is skipped.
	4182	* FOLDEQ_S2_ALREADY_FOLDED Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
	4183	* to s2, and s2 doesn't have to be UTF-8 encoded.
	4184	* This introduces an asymmetry to save a few branches
	4185	* in a loop. Currently, this is not a problem, as
	4186	* never are both inputs pre-folded. Simply call this
	4187	* function with the pre-folded one as the second
	4188	* string.
	4189	* FOLDEQ_S2_FOLDS_SANE
	4190	*/
	4191
	4192	I32
	4193	Perl_foldEQ_utf8_flags(pTHX_ const char s1, char *pe1, UV l1, bool u1,
	4194	const char s2, char *pe2, UV l2, bool u2,
	4195	U32 flags)
	4196	{
	4197	const U8 p1 = (const U8)s1; /* Point to current char */
	4198	const U8 p2 = (const U8)s2;
	4199	const U8 g1 = NULL; / goal for s1 */
	4200	const U8 *g2 = NULL;
	4201	const U8 e1 = NULL; / Don't scan s1 past this */
	4202	U8 f1 = NULL; / Point to current folded */
	4203	const U8 *e2 = NULL;
	4204	U8 *f2 = NULL;
	4205	STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
	4206	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	4207	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	4208	U8 flags_for_folder = FOLD_FLAGS_FULL;
	4209
	4210	PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
	4211
	4212	assert( ! ( (flags & (FOLDEQ_UTF8_NOMIX_ASCII \| FOLDEQ_LOCALE))
	4213	&& (( (flags & FOLDEQ_S1_ALREADY_FOLDED)
	4214	&& !(flags & FOLDEQ_S1_FOLDS_SANE))
	4215	\|\| ( (flags & FOLDEQ_S2_ALREADY_FOLDED)
	4216	&& !(flags & FOLDEQ_S2_FOLDS_SANE)))));
	4217	/* The algorithm is to trial the folds without regard to the flags on
	4218	* the first line of the above assert(), and then see if the result
	4219	* violates them. This means that the inputs can't be pre-folded to a
	4220	* violating result, hence the assert. This could be changed, with the
	4221	* addition of extra tests here for the already-folded case, which would
	4222	* slow it down. That cost is more than any possible gain for when these
	4223	* flags are specified, as the flags indicate /il or /iaa matching which
	4224	* is less common than /iu, and I (khw) also believe that real-world /il
	4225	* and /iaa matches are most likely to involve code points 0-255, and this
	4226	* function only under rare conditions gets called for 0-255. */
	4227
	4228	if (flags & FOLDEQ_LOCALE) {
	4229	if (IN_UTF8_CTYPE_LOCALE) {
	4230	if (UNLIKELY(PL_in_utf8_turkic_locale)) {
	4231	flags_for_folder \|= FOLD_FLAGS_LOCALE;
	4232	}
	4233	else {
	4234	flags &= ~FOLDEQ_LOCALE;
	4235	}
	4236	}
	4237	else {
	4238	flags_for_folder \|= FOLD_FLAGS_LOCALE;
	4239	}
	4240	}
	4241	if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
	4242	flags_for_folder \|= FOLD_FLAGS_NOMIX_ASCII;
	4243	}
	4244
	4245	if (pe1) {
	4246	e1 = (U8*)pe1;
	4247	}
	4248
	4249	if (l1) {
	4250	g1 = (const U8*)s1 + l1;
	4251	}
	4252
	4253	if (pe2) {
	4254	e2 = (U8*)pe2;
	4255	}
	4256
	4257	if (l2) {
	4258	g2 = (const U8*)s2 + l2;
	4259	}
	4260
	4261	/* Must have at least one goal */
	4262	assert(g1 \|\| g2);
	4263
	4264	if (g1) {
	4265
	4266	/* Will never match if goal is out-of-bounds */
	4267	assert(! e1 \|\| e1 >= g1);
	4268
	4269	/* Here, there isn't an end pointer, or it is beyond the goal. We
	4270	* only go as far as the goal */
	4271	e1 = g1;
	4272	}
	4273	else {
	4274	assert(e1); /* Must have an end for looking at s1 */
	4275	}
	4276
	4277	/* Same for goal for s2 */
	4278	if (g2) {
	4279	assert(! e2 \|\| e2 >= g2);
	4280	e2 = g2;
	4281	}
	4282	else {
	4283	assert(e2);
	4284	}
	4285
	4286	/* If both operands are already folded, we could just do a memEQ on the
	4287	* whole strings at once, but it would be better if the caller realized
	4288	* this and didn't even call us */
	4289
	4290	/* Look through both strings, a character at a time */
	4291	while (p1 < e1 && p2 < e2) {
	4292
	4293	/* If at the beginning of a new character in s1, get its fold to use
	4294	* and the length of the fold. */
	4295	if (n1 == 0) {
	4296	if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
	4297	f1 = (U8 *) p1;
	4298	assert(u1);
	4299	n1 = UTF8SKIP(f1);
	4300	}
	4301	else {
	4302	if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
	4303
	4304	/* We have to forbid mixing ASCII with non-ASCII if the
	4305	* flags so indicate. And, we can short circuit having to
	4306	* call the general functions for this common ASCII case,
	4307	* all of whose non-locale folds are also ASCII, and hence
	4308	* UTF-8 invariants, so the UTF8ness of the strings is not
	4309	* relevant. */
	4310	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
	4311	return 0;
	4312	}
	4313	n1 = 1;
	4314	foldbuf1 = toFOLD(p1);
	4315	}
	4316	else if (u1) {
	4317	_toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
	4318	}
	4319	else { /* Not UTF-8, get UTF-8 fold */
	4320	_to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
	4321	}
	4322	f1 = foldbuf1;
	4323	}
	4324	}
	4325
	4326	if (n2 == 0) { /* Same for s2 */
	4327	if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
	4328
	4329	/* Point to the already-folded character. But for non-UTF-8
	4330	* variants, convert to UTF-8 for the algorithm below */
	4331	if (UTF8_IS_INVARIANT(*p2)) {
	4332	f2 = (U8 *) p2;
	4333	n2 = 1;
	4334	}
	4335	else if (u2) {
	4336	f2 = (U8 *) p2;
	4337	n2 = UTF8SKIP(f2);
	4338	}
	4339	else {
	4340	foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
	4341	foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
	4342	f2 = foldbuf2;
	4343	n2 = 2;
	4344	}
	4345	}
	4346	else {
	4347	if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
	4348	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
	4349	return 0;
	4350	}
	4351	n2 = 1;
	4352	foldbuf2 = toFOLD(p2);
	4353	}
	4354	else if (u2) {
	4355	_toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
	4356	}
	4357	else {
	4358	_to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
	4359	}
	4360	f2 = foldbuf2;
	4361	}
	4362	}
	4363
	4364	/* Here f1 and f2 point to the beginning of the strings to compare.
	4365	* These strings are the folds of the next character from each input
	4366	* string, stored in UTF-8. */
	4367
	4368	/* While there is more to look for in both folds, see if they
	4369	* continue to match */
	4370	while (n1 && n2) {
	4371	U8 fold_length = UTF8SKIP(f1);
	4372	if (fold_length != UTF8SKIP(f2)
	4373	\|\| (fold_length == 1 && f1 != f2) /* Short circuit memNE
	4374	function call for single
	4375	byte */
	4376	\|\| memNE((char)f1, (char)f2, fold_length))
	4377	{
	4378	return 0; /* mismatch */
	4379	}
	4380
	4381	/* Here, they matched, advance past them */
	4382	n1 -= fold_length;
	4383	f1 += fold_length;
	4384	n2 -= fold_length;
	4385	f2 += fold_length;
	4386	}
	4387
	4388	/* When reach the end of any fold, advance the input past it */
	4389	if (n1 == 0) {
	4390	p1 += u1 ? UTF8SKIP(p1) : 1;
	4391	}
	4392	if (n2 == 0) {
	4393	p2 += u2 ? UTF8SKIP(p2) : 1;
	4394	}
	4395	} /* End of loop through both strings */
	4396
	4397	/* A match is defined by each scan that specified an explicit length
	4398	* reaching its final goal, and the other not having matched a partial
	4399	* character (which can happen when the fold of a character is more than one
	4400	* character). */
	4401	if (! ((g1 == 0 \|\| p1 == g1) && (g2 == 0 \|\| p2 == g2)) \|\| n1 \|\| n2) {
	4402	return 0;
	4403	}
	4404
	4405	/* Successful match. Set output pointers */
	4406	if (pe1) {
	4407	pe1 = (char)p1;
	4408	}
	4409	if (pe2) {
	4410	pe2 = (char)p2;
	4411	}
	4412	return 1;
	4413	}
	4414
	4415	/*
	4416	* ex: set ts=8 sts=4 sw=4 et:
	4417	*/