perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
	17	*
	18	* 'Well do I understand your speech,' he answered in the same language;
	19	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	20	* as is the custom in the West, if you wish to be answered?'
	21	* --Gandalf, addressing Théoden's door wardens
	22	*
	23	* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	24	*
	25	* ...the travellers perceived that the floor was paved with stones of many
	26	* hues; branching runes and strange devices intertwined beneath their feet.
	27	*
	28	* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	29	*/
	30
	31	#include "EXTERN.h"
	32	#define PERL_IN_UTF8_C
	33	#include "perl.h"
	34	#include "invlist_inline.h"
	35
	36	static const char malformed_text[] = "Malformed UTF-8 character";
	37	static const char unees[] =
	38	"Malformed UTF-8 character (unexpected end of string)";
	39
	40	/* Be sure to synchronize this message with the similar one in regcomp.c */
	41	static const char cp_above_legal_max[] =
	42	"Use of code point 0x%" UVXf " is not allowed; the"
	43	" permissible max is 0x%" UVXf;
	44
	45	/*
	46	=head1 Unicode Support
	47	These are various utility functions for manipulating UTF8-encoded
	48	strings. For the uninitiated, this is a method of representing arbitrary
	49	Unicode characters as a variable number of bytes, in such a way that
	50	characters in the ASCII range are unmodified, and a zero byte never appears
	51	within non-zero characters.
	52
	53	=cut
	54	*/
	55
	56	void
	57	Perl__force_out_malformed_utf8_message(pTHX_
	58	const U8 * const p, /* First byte in UTF-8 sequence */
	59	const U8 * const e, /* Final byte in sequence (may include
	60	multiple chars */
	61	const U32 flags, /* Flags to pass to utf8n_to_uvchr(),
	62	usually 0, or some DISALLOW flags */
	63	const bool die_here) /* If TRUE, this function does not return */
	64	{
	65	/* This core-only function is to be called when a malformed UTF-8 character
	66	* is found, in order to output the detailed information about the
	67	* malformation before dying. The reason it exists is for the occasions
	68	* when such a malformation is fatal, but warnings might be turned off, so
	69	* that normally they would not be actually output. This ensures that they
	70	* do get output. Because a sequence may be malformed in more than one
	71	* way, multiple messages may be generated, so we can't make them fatal, as
	72	* that would cause the first one to die.
	73	*
	74	* Instead we pretend -W was passed to perl, then die afterwards. The
	75	* flexibility is here to return to the caller so they can finish up and
	76	* die themselves. NOTE(review): the call below passes 'e - p' as the byte count, so 'e' is treated as one past the last byte examined -- confirm against the parameter comment above */
	77	U32 errors;
	78
	79	PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
	80
	81	ENTER; /* ENTER ... LEAVE brackets the temporary warning override below */
	82	SAVEI8(PL_dowarn); /* presumably restored at LEAVE -- see perlguts */
	83	SAVESPTR(PL_curcop);
	84
	85	PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
	86	if (PL_curcop) {
	87	PL_curcop->cop_warnings = pWARN_ALL;
	88	}
	89
	90	(void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors); /* UTF8_CHECK_ONLY is cleared from the flags passed here; only 'errors' is used from this call */
	91
	92	LEAVE;
	93
	94	if (! errors) { /* calling this function for a sequence with no errors is a caller bug */
	95	Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
	96	" be called only when there are errors found");
	97	}
	98
	99	if (die_here) {
	100	Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
	101	}
	102	}
	103
	104	STATIC HV *
	105	S_new_msg_hv(pTHX_ const char * const message, /* The message text */
	106	U32 categories, /* Packed warning categories */
	107	U32 flag) /* Flag associated with this message */
	108	{
	109	/* Creates, populates, and returns an HV* that describes an error message
	110	* for the translators between UTF8 and code point. The new hash gets exactly three entries, "text", "warn_categories" and "flag_bit", each holding a freshly created SV built from the corresponding parameter. */
	111
	112	SV* msg_sv = newSVpv(message, 0);
	113	SV* category_sv = newSVuv(categories);
	114	SV* flag_bit_sv = newSVuv(flag);
	115
	116	HV* msg_hv = newHV();
	117
	118	PERL_ARGS_ASSERT_NEW_MSG_HV;
	119
	120	(void) hv_stores(msg_hv, "text", msg_sv); /* the message text */
	121	(void) hv_stores(msg_hv, "warn_categories", category_sv); /* the packed categories, as a UV */
	122	(void) hv_stores(msg_hv, "flag_bit", flag_bit_sv); /* the flag, as a UV */
	123
	124	return msg_hv;
	125	}
	126
	127	/*
	128	=for apidoc uvoffuni_to_utf8_flags
	129
	130	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	131	Instead, B<Almost all code should use L</uvchr_to_utf8> or
	132	L</uvchr_to_utf8_flags>>.
	133
	134	This function is like them, but the input is a strict Unicode
	135	(as opposed to native) code point. Only in very rare circumstances should code
	136	not be using the native code point.
	137
	138	For details, see the description for L</uvchr_to_utf8_flags>.
	139
	140	=cut
	141	*/
	142
	143	U8 *
	144	Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
	145	{
	146	PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
	147
	148	return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL); /* NULL for the final 'msgs' argument: no message hash is requested, so the _msgs variant warns instead */
	149	}
	150
	151	/* All these formats take a single UV code point argument; they are the message templates used when warning about, or reporting, problematic code points */
	152	const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf; /* used by HANDLE_UNICODE_SURROGATE */
	153	const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
	154	" is not recommended for open interchange"; /* used by HANDLE_UNICODE_NONCHAR */
	155	const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
	156	" may not be portable"; /* above-Unicode code points */
	157	const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \
	158	" Unicode, requires a Perl extension," \
	159	" and so is not portable"; /* chosen in preference to super_cp_format when UNICODE_IS_PERL_EXTENDED(uv) */
	160
	161	#define HANDLE_UNICODE_SURROGATE(uv, flags, msgs) \
	162	STMT_START { /* Surrogate policy for 'uv': optionally warn, optionally fail, as selected by 'flags' */ \
	163	if (flags & UNICODE_WARN_SURROGATE) { \
	164	U32 category = packWARN(WARN_SURROGATE); \
	165	const char * format = surrogate_cp_format; \
	166	if (msgs) { /* caller asked for the message to be returned in *msgs instead of being raised as a warning */ \
	167	*msgs = new_msg_hv(Perl_form(aTHX_ format, uv), \
	168	category, \
	169	UNICODE_GOT_SURROGATE); \
	170	} \
	171	else { \
	172	Perl_ck_warner_d(aTHX_ category, format, uv); \
	173	} \
	174	} \
	175	if (flags & UNICODE_DISALLOW_SURROGATE) { \
	176	return NULL; /* returns from the function in which the macro is expanded */ \
	177	} \
	178	} STMT_END; /* NOTE(review): the expansion already ends in ';', so an invocation followed by its own ';' produces an extra empty statement */
	179
	180	#define HANDLE_UNICODE_NONCHAR(uv, flags, msgs) \
	181	STMT_START { /* Non-character policy for 'uv': optionally warn, optionally fail, as selected by 'flags' */ \
	182	if (flags & UNICODE_WARN_NONCHAR) { \
	183	U32 category = packWARN(WARN_NONCHAR); \
	184	const char * format = nonchar_cp_format; \
	185	if (msgs) { /* caller asked for the message to be returned in *msgs instead of being raised as a warning */ \
	186	*msgs = new_msg_hv(Perl_form(aTHX_ format, uv), \
	187	category, \
	188	UNICODE_GOT_NONCHAR); \
	189	} \
	190	else { \
	191	Perl_ck_warner_d(aTHX_ category, format, uv); \
	192	} \
	193	} \
	194	if (flags & UNICODE_DISALLOW_NONCHAR) { \
	195	return NULL; /* returns from the function in which the macro is expanded */ \
	196	} \
	197	} STMT_END; /* NOTE(review): the expansion already ends in ';', so an invocation followed by its own ';' produces an extra empty statement */
	198
	199	/* Use shorter names internally in this file: SHIFT, MARK and MASK stand for the UTF_ACCUMULATION_SHIFT, UTF_CONTINUATION_MARK and UTF_CONTINUATION_MASK macros */
	200	#define SHIFT UTF_ACCUMULATION_SHIFT
	201	#undef MARK /* discard any definition of MARK already in effect before redefining it on the next line */
	202	#define MARK UTF_CONTINUATION_MARK
	203	#define MASK UTF_CONTINUATION_MASK
	204
	205	/*
	206	=for apidoc uvchr_to_utf8_flags_msgs
	207
	208	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	209
	210	Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
	211
	212	This function is for code that wants any warning and/or error messages to be
	213	returned to the caller rather than be displayed. All messages that would have
	214	been displayed if all lexical warnings are enabled will be returned.
	215
	216	It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
	217	placed after all the others, C<msgs>. If this parameter is 0, this function
	218	behaves identically to C<L</uvchr_to_utf8_flags>>. Otherwise, C<msgs> should
	219	be a pointer to an C<HV *> variable, in which this function creates a new HV to
	220	contain any appropriate messages. The hash has three key-value pairs, as
	221	follows:
	222
	223	=over 4
	224
	225	=item C<text>
	226
	227	The text of the message as a C<SVpv>.
	228
	229	=item C<warn_categories>
	230
	231	The warning category (or categories) packed into a C<SVuv>.
	232
	233	=item C<flag_bit>
	234
	235	A single flag bit associated with this message, in a C<SVuv>.
	236	The bit corresponds to some bit in the C<*errors> return value,
	237	such as C<UNICODE_GOT_SURROGATE>.
	238
	239	=back
	240
	241	It's important to note that specifying this parameter as non-null will cause
	242	any warnings this function would otherwise generate to be suppressed, and
	243	instead be placed in C<*msgs>. The caller can check the lexical warnings state
	244	(or not) when choosing what to do with the returned messages.
	245
	246	The caller, of course, is responsible for freeing any returned HV.
	247
	248	=cut
	249	*/
	250
	251	/* Undocumented; we don't want people using this. Instead they should use
	252	* uvchr_to_utf8_flags_msgs(). Writes the UTF-8 encoding of the Unicode (not native) code point 'uv' starting at 'd' and returns the position just past the bytes written, or NULL when 'flags' disallows 'uv'. If 'msgs' is non-NULL, *msgs is first set to NULL and any message is returned there as a new HV instead of being raised as a warning. Croaks if 'uv' is above MAX_LEGAL_CP. */
	253	U8 *
	254	Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs)
	255	{
	256	PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
	257
	258	if (msgs) {
	259	*msgs = NULL;
	260	}
	261
	262	if (OFFUNI_IS_INVARIANT(uv)) {
	263	*d++ = LATIN1_TO_NATIVE(uv);
	264	return d;
	265	}
	266
	267	if (uv <= MAX_UTF8_TWO_BYTE) {
	268	*d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
	269	*d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
	270	return d;
	271	}
	272
	273	/* Not 2-byte; test for and handle 3-byte result. In the test immediately
	274	* below, the 16 is for start bytes E0-EF (which are all the possible ones
	275	* for 3 byte characters). The 2 is for 2 continuation bytes; these each
	276	* contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
	277	* on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
	278	* 0x800-0xFFFF on ASCII */
	279	if (uv < (16 * (1U << (2 * SHIFT)))) {
	280	*d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
	281	*d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
	282	*d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
	283
	284	#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
	285	aren't tested here */
	286	/* The most likely code points in this range are below the surrogates.
	287	* Do an extra test to quickly exclude those. */
	288	if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
	289	if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
	290	|| UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
	291	{
	292	HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
	293	}
	294	else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
	295	HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
	296	}
	297	}
	298	#endif
	299	return d;
	300	}
	301
	302	/* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
	303	* platforms, and 0x4000 on EBCDIC. There are problematic cases that can
	304	* happen starting with 4-byte characters on ASCII platforms. We unify the
	305	* code for these with EBCDIC, even though some of them require 5-bytes on
	306	* those, because khw believes the code saving is worth the very slight
	307	* performance hit on these high EBCDIC code points. */
	308
	309	if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
	310	if (UNLIKELY(uv > MAX_LEGAL_CP)) {
	311	Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_LEGAL_CP);
	312	}
	313	if ( (flags & UNICODE_WARN_SUPER)
	314	|| ( (flags & UNICODE_WARN_PERL_EXTENDED)
	315	&& UNICODE_IS_PERL_EXTENDED(uv)))
	316	{
	317	const char * format = super_cp_format;
	318	U32 category = packWARN(WARN_NON_UNICODE);
	319	U32 flag = UNICODE_GOT_SUPER;
	320
	321	/* Choose the more dire applicable warning */
	322	if (UNICODE_IS_PERL_EXTENDED(uv)) {
	323	format = perl_extended_cp_format;
	324	if (flags & (UNICODE_WARN_PERL_EXTENDED
	325	|UNICODE_DISALLOW_PERL_EXTENDED))
	326	{
	327	flag = UNICODE_GOT_PERL_EXTENDED;
	328	}
	329	}
	330
	331	if (msgs) {
	332	*msgs = new_msg_hv(Perl_form(aTHX_ format, uv),
	333	category, flag);
	334	}
	335	else {
	336	Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), format, uv);
	337	}
	338	}
	339	if ( (flags & UNICODE_DISALLOW_SUPER)
	340	|| ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
	341	&& UNICODE_IS_PERL_EXTENDED(uv)))
	342	{
	343	return NULL;
	344	}
	345	}
	346	else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
	347	HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
	348	}
	349
	350	/* Test for and handle 4-byte result. In the test immediately below, the
	351	* 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
	352	* characters). The 3 is for 3 continuation bytes; these each contribute
	353	* SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
	354	* ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
	355	* 0x1_0000-0x1F_FFFF on ASCII */
	356	if (uv < (8 * (1U << (3 * SHIFT)))) {
	357	*d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
	358	*d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
	359	*d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
	360	*d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
	361
	362	#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
	363	characters. The end-plane non-characters for EBCDIC were
	364	handled just above */
	365	if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
	366	HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
	367	}
	368	else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
	369	HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
	370	}
	371	#endif
	372
	373	return d;
	374	}
	375
	376	/* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
	377	* platforms, and 0x4_0000 on EBCDIC. At this point we switch to a loop
	378	* format. The unrolled version above turns out to not save all that much
	379	* time, and at these high code points (well above the legal Unicode range
	380	* on ASCII platforms, and well above anything in common use in EBCDIC),
	381	* khw believes that less code outweighs slight performance gains. */
	382
	383	{
	384	STRLEN len = OFFUNISKIP(uv);
	385	U8 *p = d+len-1; /* start at the last byte and fill in continuation bytes backwards */
	386	while (p > d) {
	387	*p-- = I8_TO_NATIVE_UTF8((uv & MASK) | MARK);
	388	uv >>= SHIFT;
	389	}
	390	*p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len)); /* here p == d: the start byte */
	391	return d+len;
	392	}
	393	}
	394
	395	/*
	396	=for apidoc uvchr_to_utf8
	397
	398	Adds the UTF-8 representation of the native code point C<uv> to the end
	399	of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
	400	C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
	401	the byte after the end of the new character. In other words,
	402
	403	d = uvchr_to_utf8(d, uv);
	404
	405	is the recommended wide native character-aware way of saying
	406
	407	*(d++) = uv;
	408
	409	This function accepts any code point from 0..C<IV_MAX> as input.
	410	C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
	411
	412	It is possible to forbid or warn on non-Unicode code points, or those that may
	413	be problematic by using L</uvchr_to_utf8_flags>.
	414
	415	=cut
	416	*/
	417
	418	/* This is also a macro; the function defined here simply forwards its two arguments to uvchr_to_utf8() and returns the result */
	419	PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
	420
	421	U8 *
	422	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	423	{
	424	return uvchr_to_utf8(d, uv);
	425	}
	426
	427	/*
	428	=for apidoc uvchr_to_utf8_flags
	429
	430	Adds the UTF-8 representation of the native code point C<uv> to the end
	431	of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
	432	C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
	433	the byte after the end of the new character. In other words,
	434
	435	d = uvchr_to_utf8_flags(d, uv, flags);
	436
	437	or, in most cases,
	438
	439	d = uvchr_to_utf8_flags(d, uv, 0);
	440
	441	This is the Unicode-aware way of saying
	442
	443	*(d++) = uv;
	444
	445	If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
	446	input. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
	447
	448	Specifying C<flags> can further restrict what is allowed and not warned on, as
	449	follows:
	450
	451	If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
	452	the function will raise a warning, provided UTF8 warnings are enabled. If
	453	instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
	454	NULL. If both flags are set, the function will both warn and return NULL.
	455
	456	Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
	457	affect how the function handles a Unicode non-character.
	458
	459	And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
	460	affect the handling of code points that are above the Unicode maximum of
	461	0x10FFFF. Languages other than Perl may not be able to accept files that
	462	contain these.
	463
	464	The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
	465	the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
	466	three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
	467	allowed inputs to the strict UTF-8 traditionally defined by Unicode.
	468	Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
	469	C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
	470	above-Unicode and surrogate flags, but not the non-character ones, as
	471	defined in
	472	L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
	473	See L<perlunicode/Noncharacter code points>.
	474
	475	Extremely high code points were never specified in any standard, and require an
	476	extension to UTF-8 to express, which Perl does. It is likely that programs
	477	written in something other than Perl would not be able to read files that
	478	contain these; nor would Perl understand files written by something that uses a
	479	different extension. For these reasons, there is a separate set of flags that
	480	can warn and/or disallow these extremely high code points, even if other
	481	above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
	482	and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
	483	L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
	484	treat all above-Unicode code points, including these, as malformations. (Note
	485	that the Unicode standard considers anything above 0x10FFFF to be illegal, but
	486	there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
	487
	488	A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
	489	retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly,
	490	C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
	491	C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because on EBCDIC
	492	platforms, these flags can apply to code points that actually do fit in 31 bits.
	493	The new names accurately describe the situation in all cases.
	494
	495	=cut
	496	*/
	497
	498	/* This is also a macro; the function defined here simply forwards its three arguments to uvchr_to_utf8_flags() and returns the result */
	499	PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
	500
	501	U8 *
	502	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	503	{
	504	return uvchr_to_utf8_flags(d, uv, flags);
	505	}
	506
	507	#ifndef UV_IS_QUAD
	508
	509	STATIC int
	510	S_is_utf8_cp_above_31_bits(const U8 * const s,
	511	const U8 * const e,
	512	const bool consider_overlongs)
	513	{
	514	/* Returns TRUE if the first code point represented by the Perl-extended-
	515	* UTF-8-encoded string starting at 's', and looking no further than 'e -
	516	* 1' doesn't fit into 31 bits. That is, that if it is >= 2**31. It returns 0 if the code point does fit, and -1 if there aren't enough bytes available to tell.
	517	*
	518	* The function handles the case where the input bytes do not include all
	519	* the ones necessary to represent a full character. That is, they may be
	520	* the initial bytes of the representation of a code point, but possibly
	521	* the final ones necessary for the complete representation may be beyond
	522	* 'e - 1'.
	523	*
	524	* The function also can handle the case where the input is an overlong
	525	* sequence. If 'consider_overlongs' is 0, the function assumes the
	526	* input is not overlong, without checking, and will return based on that
	527	* assumption. If this parameter is 1, the function will go to the trouble
	528	* of figuring out if it actually evaluates to above or below 31 bits.
	529	*
	530	* The sequence is otherwise assumed to be well-formed, without checking.
	531	*/
	532
	533	const STRLEN len = e - s;
	534	int is_overlong;
	535
	536	PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
	537
	538	assert(! UTF8_IS_INVARIANT(*s) && e > s);
	539
	540	#ifdef EBCDIC
	541
	542	PERL_UNUSED_ARG(consider_overlongs);
	543
	544	/* On the EBCDIC code pages we handle, only the native start byte 0xFE can
	545	* mean a 32-bit or larger code point (0xFF is an invariant). 0xFE can
	546	* also be the start byte for a 31-bit code point; we need at least 2
	547	* bytes, and maybe up through 8 bytes, to determine that. (It can also be
	548	* the start byte for an overlong sequence, but for 30-bit or smaller code
	549	* points, so we don't have to worry about overlongs on EBCDIC.) */
	550	if (*s != 0xFE) {
	551	return 0;
	552	}
	553
	554	if (len == 1) {
	555	return -1;
	556	}
	557
	558	#else
	559
	560	/* On ASCII, FE and FF are the only start bytes that can evaluate to
	561	* needing more than 31 bits. */
	562	if (LIKELY(*s < 0xFE)) {
	563	return 0;
	564	}
	565
	566	/* What we have left are FE and FF. Both of these require more than 31
	567	* bits unless they are for overlongs. */
	568	if (! consider_overlongs) {
	569	return 1;
	570	}
	571
	572	/* Here, we have FE or FF. If the input isn't overlong, it evaluates to
	573	* above 31 bits. But we need more than one byte to discern this, so if
	574	* passed just the start byte, it could be an overlong evaluating to
	575	* smaller */
	576	if (len == 1) {
	577	return -1;
	578	}
	579
	580	/* Having excluded len==1, and knowing that FE and FF are both valid start
	581	* bytes, we can call the function below to see if the sequence is
	582	* overlong. (We don't need the full generality of the called function,
	583	* but for these huge code points, speed shouldn't be a consideration, and
	584	* the compiler does have enough information, since it's static to this
	585	* file, to optimize to just the needed parts.) */
	586	is_overlong = is_utf8_overlong_given_start_byte_ok(s, len);
	587
	588	/* If it isn't overlong, more than 31 bits are required. */
	589	if (is_overlong == 0) {
	590	return 1;
	591	}
	592
	593	/* If it is indeterminate if it is overlong, return that */
	594	if (is_overlong < 0) {
	595	return -1;
	596	}
	597
	598	/* Here is overlong. Such a sequence starting with FE is below 31 bits, as
	599	* the max it can be is 2**31 - 1 */
	600	if (*s == 0xFE) {
	601	return 0;
	602	}
	603
	604	#endif
	605
	606	/* Here, ASCII and EBCDIC rejoin:
	607	* On ASCII: We have an overlong sequence starting with FF
	608	* On EBCDIC: We have a sequence starting with FE. */
	609
	610	{ /* For C89, use a block so the declaration can be close to its use */
	611
	612	#ifdef EBCDIC
	613
	614	/* U+7FFFFFFF (2 ** 31 - 1)
	615	* [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
	616	* IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
	617	* IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
	618	* POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
	619	* I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
	620	* U+80000000 (2 ** 31):
	621	* IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
	622	* IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
	623	* POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
	624	* I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
	625	*
	626	* and since we know that *s = \xfe, any continuation sequence
	627	* following it that is gt the below is above 31 bits
	628	[0] [1] [2] [3] [4] [5] [6] */
	629	const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
	630
	631	#else
	632
	633	/* FF overlong for U+7FFFFFFF (2 ** 31 - 1)
	634	* ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF
	635	* FF overlong for U+80000000 (2 ** 31):
	636	* ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80
	637	* and since we know that *s = \xff, any continuation sequence
	638	* following it that is gt the below is above 31 bits
	639	[0] [1] [2] [3] [4] [5] [6] */
	640	const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
	641
	642
	643	#endif
	644	const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
	645	const STRLEN cmp_len = MIN(conts_len, len - 1);
	646
	647	/* Now compare the continuation bytes in s with the ones we have
	648	* compiled in that are for the largest 30 bit code point. If we have
	649	* enough bytes available to determine the answer, or the bytes we do
	650	* have differ from them, we can compare the two to get a definitive
	651	* answer (Note that in UTF-EBCDIC, the two lowest possible
	652	* continuation bytes are \x41 and \x42.) */
	653	if (cmp_len >= conts_len || memNE(s + 1,
	654	conts_for_highest_30_bit,
	655	cmp_len))
	656	{
	657	return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len));
	658	}
	659
	660	/* Here, all the bytes we have are the same as the highest 30-bit code
	661	* point, but we are missing so many bytes that we can't make the
	662	* determination */
	663	return -1;
	664	}
	665	}
	666
	667	#endif
	668
	669	PERL_STATIC_INLINE int
	670	S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
	671	{
	672	/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
	673	* 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
	674	* it isn't, and -1 if there isn't enough information to tell. This last
	675	* return value can happen if the sequence is incomplete, missing some
	676	* trailing bytes that would form a complete character. If there are
	677	* enough bytes to make a definitive decision, this function does so.
	678	* Usually 2 bytes are sufficient. The caller must pass at least 2 bytes, the first being a start byte (see the assert below).
	679	*
	680	* Overlongs can occur whenever the number of continuation bytes changes.
	681	* That means whenever the number of leading 1 bits in a start byte
	682	* increases from the next lower start byte. That happens for start bytes
	683	* C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following illegal
	684	* start bytes have already been excluded, so don't need to be tested here;
	685	* ASCII platforms: C0, C1
	686	* EBCDIC platforms C0, C1, C2, C3, C4, E0
	687	*/
	688
	689	const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
	690	const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
	691
	692	PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
	693	assert(len > 1 && UTF8_IS_START(*s));
	694
	695	/* Each platform has overlongs after the start bytes given above (expressed
	696	* in I8 for EBCDIC). What constitutes an overlong varies by platform, but
	697	* the logic is the same, except the E0 overlong has already been excluded
	698	* on EBCDIC platforms. The values below were found by manually
	699	* inspecting the UTF-8 patterns. See the tables in utf8.h and
	700	* utfebcdic.h. */
	701
	702	# ifdef EBCDIC
	703	# define F0_ABOVE_OVERLONG 0xB0
	704	# define F8_ABOVE_OVERLONG 0xA8
	705	# define FC_ABOVE_OVERLONG 0xA4
	706	# define FE_ABOVE_OVERLONG 0xA2
	707	# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
	708	/* I8(0xfe) is FF */
	709	# else
	710
	711	if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
	712	return 1;
	713	}
	714
	715	# define F0_ABOVE_OVERLONG 0x90
	716	# define F8_ABOVE_OVERLONG 0x88
	717	# define FC_ABOVE_OVERLONG 0x84
	718	# define FE_ABOVE_OVERLONG 0x82
	719	# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
	720	# endif
	721
	722	/* For each of these start bytes, a second byte below the corresponding *_ABOVE_OVERLONG value means the sequence is overlong */
	723	if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
	724	|| (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
	725	|| (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
	726	|| (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
	727	{
	728	return 1;
	729	}
	730
	731	/* Check for the FF overlong */
	732	return isFF_OVERLONG(s, len);
	733	}
	734
	735	PERL_STATIC_INLINE int
	736	S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
	737	{
	738	/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
	739	* 's' + 'len' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if
	740	* it isn't, and -1 if there isn't enough information to tell. This last
	741	* return value can happen if the sequence is incomplete, missing some
	742	* trailing bytes that would form a complete character. If there are
	743	* enough bytes to make a definitive decision, this function does so. FF_OVERLONG_PREFIX is #defined earlier in this file, inside S_is_utf8_overlong_given_start_byte_ok(). */
	744
	745	PERL_ARGS_ASSERT_ISFF_OVERLONG;
	746
	747	/* To be an FF overlong, all the available bytes must match; only the first MIN(len, prefix length) bytes are compared, the "- 1" excluding the literal's trailing NUL */
	748	if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
	749	MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
	750	{
	751	return 0;
	752	}
	753
	754	/* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
	755	* be there; what comes after them doesn't matter. See tables in utf8.h,
	756	* utfebcdic.h. */
	757	if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
	758	return 1;
	759	}
	760
	761	/* The missing bytes could cause the result to go one way or the other, so
	762	* the result is indeterminate */
	763	return -1;
	764	}
	765
	766	#if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1; each string is the byte sequence that S_does_utf8_overflow() compares its input against */
	767	# ifdef EBCDIC /* Actually is I8 */
	768	# define HIGHEST_REPRESENTABLE_UTF8 \
	769	"\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	770	# else
	771	# define HIGHEST_REPRESENTABLE_UTF8 \
	772	"\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	773	# endif
	774	#endif
	775
	776	PERL_STATIC_INLINE int
	777	S_does_utf8_overflow(const U8 * const s,
	778	const U8 * e,
	779	const bool consider_overlongs)
	780	{
	781	/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
	782	* 'e' - 1 would overflow an IV on this platform; that is if it represents
	783	* a code point larger than the highest representable code point. It
	784	* returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
	785	* enough information to tell. This last return value can happen if the
	786	* sequence is incomplete, missing some trailing bytes that would form a
	787	* complete character. If there are enough bytes to make a definitive
	788	* decision, this function does so.
	789	*
	790	* If 'consider_overlongs' is TRUE, the function checks for the possibility
	791	* that the sequence is an overlong that doesn't overflow. Otherwise, it
	792	* assumes the sequence is not an overlong. This can give different
	793	* results only on ASCII 32-bit platforms.
	794	*
	795	* (For ASCII platforms, we could use memcmp() because we don't have to
	796	* convert each byte to I8, but it's very rare input indeed that would
	797	* approach overflow, so the loop below will likely only get executed once.)
	798	*
	799	* 'e' - 1 must not be beyond a full character. */
	800
	801
	802	PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
	803	assert(s <= e && s + UTF8SKIP(s) >= e);
	804
	805	#if ! defined(UV_IS_QUAD)
	806
	807	return is_utf8_cp_above_31_bits(s, e, consider_overlongs);
	808
	809	#else
	810
	811	PERL_UNUSED_ARG(consider_overlongs);
	812
	813	{
	814	const STRLEN len = e - s;
	815	const U8 *x;
	816	const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
	817
	818	for (x = s; x < e; x++, y++) { /* walk the input and the highest representable sequence in step, byte by byte */
	819
	820	if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
	821	continue;
	822	}
	823
	824	/* If this byte is larger than the corresponding highest UTF-8
	825	* byte, the sequence overflows; otherwise the byte is less than,
	826	* and so the sequence doesn't overflow */
	827	return NATIVE_UTF8_TO_I8(*x) > *y;
	828
	829	}
	830
	831	/* Got to the end and all bytes are the same. If the input is a whole
	832	* character, it doesn't overflow. And if it is a partial character,
	833	* there's not enough information to tell */
	834	if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
	835	return -1;
	836	}
	837
	838	return 0;
	839	}
	840
	841	#endif
	842
	843	}
	844
	845	#if 0
	846
	847	/* These are the portions of the above function that deal with UV_MAX instead of
	848	* IV_MAX. They are left here in case we want to combine them so that internal
	849	* uses can have larger code points. The only logic difference is that the
	850	* 32-bit EBCDIC platform is treated like the 64-bit, and the 32-bit ASCII has
	851	* different logic.
	852	*/
	853
	854	/* Anything larger than this will overflow the word if it were converted into a UV */
	855	#if defined(UV_IS_QUAD)
	856	# ifdef EBCDIC /* Actually is I8 */
	857	# define HIGHEST_REPRESENTABLE_UTF8 \
	858	"\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	859	# else
	860	# define HIGHEST_REPRESENTABLE_UTF8 \
	861	"\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
	862	# endif
	863	#else /* 32-bit */
	864	# ifdef EBCDIC
	865	# define HIGHEST_REPRESENTABLE_UTF8 \
	866	"\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
	867	# else
	868	# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
	869	# endif
	870	#endif
	871
	872	#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
	873
	874	/* On 32 bit ASCII machines, many overlongs that start with FF don't
	875	* overflow */
	876	if (consider_overlongs && isFF_OVERLONG(s, len) > 0) {
	877
	878	/* To be such an overlong, the first bytes of 's' must match
	879	* FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80". If we
	880	* don't have any additional bytes available, the sequence, when
	881	* completed might or might not fit in 32 bits. But if we have that
	882	* next byte, we can tell for sure. If it is <= 0x83, then it does
	883	* fit. */
	884	if (len <= sizeof(FF_OVERLONG_PREFIX) - 1) {
	885	return -1;
	886	}
	887
	888	return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83;
	889	}
	890
	891	/* Starting with the #else, the rest of the function is identical except
	892	* 1. we need to move the 'len' declaration to be global to the function
	893	* 2. the endif moves to just after the UNUSED_ARG.
	894	* An empty endif is given just below to satisfy the preprocessor
	895	*/
	896	#endif
	897
	898	#endif
	899
	900	#undef F0_ABOVE_OVERLONG
	901	#undef F8_ABOVE_OVERLONG
	902	#undef FC_ABOVE_OVERLONG
	903	#undef FE_ABOVE_OVERLONG
	904	#undef FF_OVERLONG_PREFIX
	905
	906	STRLEN
	907	Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
	908	{
	909	STRLEN len;
	910	const U8 *x;
	911
	912	/* A helper function that should not be called directly.
	913	*
	914	* This function returns non-zero if the string beginning at 's' and
	915	* looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
	916	* code point; otherwise it returns 0. The examination stops after the
	917	* first code point in 's' is validated, not looking at the rest of the
	918	* input. If 'e' is such that there are not enough bytes to represent a
	919	* complete code point, this function will return non-zero anyway, if the
	920	* bytes it does have are well-formed UTF-8 as far as they go, and aren't
	921	* excluded by 'flags'.
	922	*
	923	* A non-zero return gives the number of bytes required to represent the
	924	* code point. Be aware that if the input is for a partial character, the
	925	* return will be larger than 'e - s'.
	926	*
	927	* This function assumes that the code point represented is UTF-8 variant.
	928	* The caller should have excluded the possibility of it being invariant
	929	* before calling this function.
	930	*
	931	* 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
	932	* accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
	933	* 0 if the code point represented is well-formed Perl-extended-UTF-8, but
	934	* disallowed by the flags. If the input is only for a partial character,
	935	* the function will return non-zero if there is any sequence of
	936	* well-formed UTF-8 that, when appended to the input sequence, could
	937	* result in an allowed code point; otherwise it returns 0. Non characters
	938	* cannot be determined based on partial character input. But many of the
	939	* other excluded types can be determined with just the first one or two
	940	* bytes.
	941	*
	942	*/
	943
	944	PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
	945
	946	assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
	947	\|UTF8_DISALLOW_PERL_EXTENDED)));
	948	assert(! UTF8_IS_INVARIANT(*s));
	949
	950	/* A variant char must begin with a start byte */
	951	if (UNLIKELY(! UTF8_IS_START(*s))) {
	952	return 0;
	953	}
	954
	955	/* Examine a maximum of a single whole code point */
	956	if (e - s > UTF8SKIP(s)) {
	957	e = s + UTF8SKIP(s);
	958	}
	959
	960	len = e - s;
	961
	962	if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
	963	const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
	964
	965	/* Here, we are disallowing some set of largish code points, and the
	966	* first byte indicates the sequence is for a code point that could be
	967	* in the excluded set. We generally don't have to look beyond this or
	968	* the second byte to see if the sequence is actually for one of the
	969	* excluded classes. The code below is derived from this table:
	970	*
	971	* UTF-8 UTF-EBCDIC I8
	972	* U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
	973	* U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
	974	* U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
	975	*
	976	* Keep in mind that legal continuation bytes range between \x80..\xBF
	977	* for UTF-8, and \xA0..\xBF for I8. Anything above those aren't
	978	* continuation bytes. Hence, we don't have to test the upper edge
	979	* because if any of those is encountered, the sequence is malformed,
	980	* and would fail elsewhere in this function.
	981	*
	982	* The code here likewise assumes that there aren't other
	983	* malformations; again the function should fail elsewhere because of
	984	* these. For example, an overlong beginning with FC doesn't actually
	985	* have to be a super; it could actually represent a small code point,
	986	* even U+0000. But, since overlongs (and other malformations) are
	987	* illegal, the function should return FALSE in either case.
	988	*/
	989
	990	#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
	991	# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
	992	# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
	993
	994	# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
	995	/* B6 and B7 */ \
	996	&& ((s1) & 0xFE ) == 0xB6)
	997	# define isUTF8_PERL_EXTENDED(s) (*s == I8_TO_NATIVE_UTF8(0xFF))
	998	#else
	999	# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
	1000	# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
	1001	# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
	1002	# define isUTF8_PERL_EXTENDED(s) (*s >= 0xFE)
	1003	#endif
	1004
	1005	if ( (flags & UTF8_DISALLOW_SUPER)
	1006	&& UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
	1007	{
	1008	return 0; /* Above Unicode */
	1009	}
	1010
	1011	if ( (flags & UTF8_DISALLOW_PERL_EXTENDED)
	1012	&& UNLIKELY(isUTF8_PERL_EXTENDED(s)))
	1013	{
	1014	return 0;
	1015	}
	1016
	1017	if (len > 1) {
	1018	const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
	1019
	1020	if ( (flags & UTF8_DISALLOW_SUPER)
	1021	&& UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
	1022	{
	1023	return 0; /* Above Unicode */
	1024	}
	1025
	1026	if ( (flags & UTF8_DISALLOW_SURROGATE)
	1027	&& UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
	1028	{
	1029	return 0; /* Surrogate */
	1030	}
	1031
	1032	if ( (flags & UTF8_DISALLOW_NONCHAR)
	1033	&& UNLIKELY(UTF8_IS_NONCHAR(s, e)))
	1034	{
	1035	return 0; /* Noncharacter code point */
	1036	}
	1037	}
	1038	}
	1039
	1040	/* Make sure that all that follows are continuation bytes */
	1041	for (x = s + 1; x < e; x++) {
	1042	if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
	1043	return 0;
	1044	}
	1045	}
	1046
	1047	/* Here is syntactically valid. Next, make sure this isn't the start of an
	1048	* overlong. */
	1049	if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
	1050	return 0;
	1051	}
	1052
	1053	/* And finally, that the code point represented fits in a word on this
	1054	* platform */
	1055	if (0 < does_utf8_overflow(s, e,
	1056	0 /* Don't consider overlongs */
	1057	))
	1058	{
	1059	return 0;
	1060	}
	1061
	1062	return UTF8SKIP(s);
	1063	}
	1064
	1065	char *
	1066	Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
	1067	{
	1068	/* Returns a mortalized C string that is a displayable copy of the 'len'
	1069	* bytes starting at 'start'. 'format' gives how to display each byte.
	1070	* Currently, there are only two formats, so it is currently a bool:
	1071	* 0 \xab
	1072	* 1 ab (that is a space between two hex digit bytes)
	1073	*/
	1074
	1075	const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
	1076	trailing NUL */
	1077	const U8 * s = start;
	1078	const U8 * const e = start + len;
	1079	char * output;
	1080	char * d;
	1081
	1082	PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
	1083
	1084	Newx(output, output_len, char);
	1085	SAVEFREEPV(output);
	1086
	1087	d = output;
	1088	for (s = start; s < e; s++) {
	1089	const unsigned high_nibble = (*s & 0xF0) >> 4;
	1090	const unsigned low_nibble = (*s & 0x0F);
	1091
	1092	if (format) {
	1093	if (s > start) {
	1094	*d++ = ' ';
	1095	}
	1096	}
	1097	else {
	1098	*d++ = '\\';
	1099	*d++ = 'x';
	1100	}
	1101
	1102	if (high_nibble < 10) {
	1103	*d++ = high_nibble + '0';
	1104	}
	1105	else {
	1106	*d++ = high_nibble - 10 + 'a';
	1107	}
	1108
	1109	if (low_nibble < 10) {
	1110	*d++ = low_nibble + '0';
	1111	}
	1112	else {
	1113	*d++ = low_nibble - 10 + 'a';
	1114	}
	1115	}
	1116
	1117	*d = '\0';
	1118	return output;
	1119	}
	1120
	1121	PERL_STATIC_INLINE char *
	1122	S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
	1123
	1124	/* Max number of bytes to print */
	1125	STRLEN print_len,
	1126
	1127	/* Which one is the non-continuation */
	1128	const STRLEN non_cont_byte_pos,
	1129
	1130	/* How many bytes should there be? */
	1131	const STRLEN expect_len)
	1132	{
	1133	/* Return the malformation warning text for an unexpected continuation
	1134	* byte. */
	1135
	1136	const char * const where = (non_cont_byte_pos == 1)
	1137	? "immediately"
	1138	: Perl_form(aTHX_ "%d bytes",
	1139	(int) non_cont_byte_pos);
	1140	const U8 * x = s + non_cont_byte_pos;
	1141	const U8 * e = s + print_len;
	1142
	1143	PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
	1144
	1145	/* We don't need to pass this parameter, but since it has already been
	1146	* calculated, it's likely faster to pass it; verify under DEBUGGING */
	1147	assert(expect_len == UTF8SKIP(s));
	1148
	1149	/* As a defensive coding measure, don't output anything past a NUL. Such
	1150	* bytes shouldn't be in the middle of a malformation, and could mark the
	1151	* end of the allocated string, and what comes after is undefined */
	1152	for (; x < e; x++) {
	1153	if (*x == '\0') {
	1154	x++; /* Output this particular NUL */
	1155	break;
	1156	}
	1157	}
	1158
	1159	return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
	1160	" %s after start byte 0x%02x; need %d bytes, got %d)",
	1161	malformed_text,
	1162	_byte_dump_string(s, x - s, 0),
	1163	*(s + non_cont_byte_pos),
	1164	where,
	1165	*s,
	1166	(int) expect_len,
	1167	(int) non_cont_byte_pos);
	1168	}
	1169
	1170	/*
	1171
	1172	=for apidoc utf8n_to_uvchr
	1173
	1174	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	1175	Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
	1176
	1177	Bottom level UTF-8 decode routine.
	1178	Returns the native code point value of the first character in the string C<s>,
	1179	which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
	1180	C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
	1181	the length, in bytes, of that character.
	1182
	1183	The value of C<flags> determines the behavior when C<s> does not point to a
	1184	well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
	1185	causes zero to be returned and C<retlen> is set so that (S<C<s> + C<retlen>>)
	1186	is the next possible position in C<s> that could begin a non-malformed
	1187	character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
	1188	is raised. Some UTF-8 input sequences may contain multiple malformations.
	1189	This function tries to find every possible one in each call, so multiple
	1190	warnings can be raised for the same sequence.
	1191
	1192	Various ALLOW flags can be set in C<flags> to allow (and not warn on)
	1193	individual types of malformations, such as the sequence being overlong (that
	1194	is, when there is a shorter sequence that can express the same code point;
	1195	overlong sequences are expressly forbidden in the UTF-8 standard due to
	1196	potential security issues). Another malformation example is the first byte of
	1197	a character not being a legal first byte. See F<utf8.h> for the list of such
	1198	flags. Even if allowed, this function generally returns the Unicode
	1199	REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
	1200	F<utf8.h> to override this behavior for the overlong malformations, but don't
	1201	do that except for very specialized purposes.
	1202
	1203	The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
	1204	flags) malformation is found. If this flag is set, the routine assumes that
	1205	the caller will raise a warning, and this function will silently just set
	1206	C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
	1207
	1208	Note that this API requires disambiguation between successfully decoding a C<NUL>
	1209	character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
	1210	in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
	1211	be set to 1. To disambiguate, upon a zero return, see if the first byte of
	1212	C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
	1213	error. Or you can use C<L</utf8n_to_uvchr_error>>.
	1214
	1215	Certain code points are considered problematic. These are Unicode surrogates,
	1216	Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
	1217	By default these are considered regular code points, but certain situations
	1218	warrant special handling for them, which can be specified using the C<flags>
	1219	parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
	1220	three classes are treated as malformations and handled as such. The flags
	1221	C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
	1222	C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
	1223	disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
	1224	restricts the allowed inputs to the strict UTF-8 traditionally defined by
	1225	Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
	1226	definition given by
	1227	L<Unicode Corrigendum #9\|http://www.unicode.org/versions/corrigendum9.html>.
	1228	The difference between traditional strictness and C9 strictness is that the
	1229	latter does not forbid non-character code points. (They are still discouraged,
	1230	however.) For more discussion see L<perlunicode/Noncharacter code points>.
	1231
	1232	The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
	1233	C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
	1234	C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
	1235	raised for their respective categories, but otherwise the code points are
	1236	considered valid (not malformations). To get a category to both be treated as
	1237	a malformation and raise a warning, specify both the WARN and DISALLOW flags.
	1238	(But note that warnings are not raised if lexically disabled nor if
	1239	C<UTF8_CHECK_ONLY> is also specified.)
	1240
	1241	Extremely high code points were never specified in any standard, and require an
	1242	extension to UTF-8 to express, which Perl does. It is likely that programs
	1243	written in something other than Perl would not be able to read files that
	1244	contain these; nor would Perl understand files written by something that uses a
	1245	different extension. For these reasons, there is a separate set of flags that
	1246	can warn and/or disallow these extremely high code points, even if other
	1247	above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
	1248	C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
	1249	L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
	1250	above-Unicode code points, including these, as malformations.
	1251	(Note that the Unicode standard considers anything above 0x10FFFF to be
	1252	illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
	1253	(2**31 -1))
	1254
	1255	A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
	1256	retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly,
	1257	C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
	1258	C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags
	1259	can apply to code points that actually do fit in 31 bits. This happens on
	1260	EBCDIC platforms, and sometimes when the L<overlong
	1261	malformation\|/C<UTF8_GOT_LONG>> is also present. The new names accurately
	1262	describe the situation in all cases.
	1263
	1264
	1265	All other code points corresponding to Unicode characters, including private
	1266	use and those yet to be assigned, are never considered malformed and never
	1267	warn.
	1268
	1269	=cut
	1270
	1271	Also implemented as a macro in utf8.h
	1272	*/
	1273
	1274	UV
	1275	Perl_utf8n_to_uvchr(const U8 *s,
	1276	STRLEN curlen,
	1277	STRLEN *retlen,
	1278	const U32 flags)
	1279	{
	1280	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
	1281
	1282	return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
	1283	}
	1284
	1285	/*
	1286
	1287	=for apidoc utf8n_to_uvchr_error
	1288
	1289	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	1290	Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
	1291
	1292	This function is for code that needs to know what the precise malformation(s)
	1293	are when an error is found. If you also need to know the generated warning
	1294	messages, use L</utf8n_to_uvchr_msgs>() instead.
	1295
	1296	It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
	1297	all the others, C<errors>. If this parameter is 0, this function behaves
	1298	identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
	1299	to a C<U32> variable, which this function sets to indicate any errors found.
	1300	Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
	1301	C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
	1302	of these bits will be set if a malformation is found, even if the input
	1303	C<flags> parameter indicates that the given malformation is allowed; those
	1304	exceptions are noted:
	1305
	1306	=over 4
	1307
	1308	=item C<UTF8_GOT_PERL_EXTENDED>
	1309
	1310	The input sequence is not standard UTF-8, but a Perl extension. This bit is
	1311	set only if the input C<flags> parameter contains either the
	1312	C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
	1313
	1314	Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
	1315	and so some extension must be used to express them. Perl uses a natural
	1316	extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
	1317	extension to represent even higher ones, so that any code point that fits in a
	1318	64-bit word can be represented. Text using these extensions is not likely to
	1319	be portable to non-Perl code. We lump both of these extensions together and
	1320	refer to them as Perl extended UTF-8. There exist other extensions that people
	1321	have invented, incompatible with Perl's.
	1322
	1323	On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
	1324	extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
	1325	than on ASCII. Prior to that, code points 2**31 and higher were simply
	1326	unrepresentable, and a different, incompatible method was used to represent
	1327	code points between 2**30 and 2**31 - 1.
	1328
	1329	On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
	1330	Perl extended UTF-8 is used.
	1331
	1332	In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
	1333	may use for backward compatibility. That name is misleading, as this flag may
	1334	be set when the code point actually does fit in 31 bits. This happens on
	1335	EBCDIC platforms, and sometimes when the L<overlong
	1336	malformation\|/C<UTF8_GOT_LONG>> is also present. The new name accurately
	1337	describes the situation in all cases.
	1338
	1339	=item C<UTF8_GOT_CONTINUATION>
	1340
	1341	The input sequence was malformed in that the first byte was a UTF-8
	1342	continuation byte.
	1343
	1344	=item C<UTF8_GOT_EMPTY>
	1345
	1346	The input C<curlen> parameter was 0.
	1347
	1348	=item C<UTF8_GOT_LONG>
	1349
	1350	The input sequence was malformed in that there is some other sequence that
	1351	evaluates to the same code point, but that sequence is shorter than this one.
	1352
	1353	Until Unicode 3.1, it was legal for programs to accept this malformation, but
	1354	it was discovered that this created security issues.
	1355
	1356	=item C<UTF8_GOT_NONCHAR>
	1357
	1358	The code point represented by the input UTF-8 sequence is for a Unicode
	1359	non-character code point.
	1360	This bit is set only if the input C<flags> parameter contains either the
	1361	C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
	1362
	1363	=item C<UTF8_GOT_NON_CONTINUATION>
	1364
	1365	The input sequence was malformed in that a non-continuation type byte was found
	1366	in a position where only a continuation type one should be. See also
	1367	L</C<UTF8_GOT_SHORT>>.
	1368
	1369	=item C<UTF8_GOT_OVERFLOW>
	1370
	1371	The input sequence was malformed in that it is for a code point that is not
	1372	representable in the number of bits available in an IV on the current platform.
	1373
	1374	=item C<UTF8_GOT_SHORT>
	1375
	1376	The input sequence was malformed in that C<curlen> is smaller than required for
	1377	a complete sequence. In other words, the input is for a partial character
	1378	sequence.
	1379
	1380
	1381	C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
	1382	sequence. The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always
	1383	that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
	1384	sequence was looked at. If no other flags are present, it means that the
	1385	sequence was valid as far as it went. Depending on the application, this could
	1386	mean one of three things:
	1387
	1388	=over
	1389
	1390	=item *
	1391
	1392	The C<curlen> length parameter passed in was too small, and the function was
	1393	prevented from examining all the necessary bytes.
	1394
	1395	=item *
	1396
	1397	The buffer being looked at is based on reading data, and the data received so
	1398	far stopped in the middle of a character, so that the next read will
	1399	read the remainder of this character. (It is up to the caller to deal with the
	1400	split bytes somehow.)
	1401
	1402	=item *
	1403
	1404	This is a real error, and the partial sequence is all we're going to get.
	1405
	1406	=back
	1407
	1408	=item C<UTF8_GOT_SUPER>
	1409
	1410	The input sequence was malformed in that it is for a non-Unicode code point;
	1411	that is, one above the legal Unicode maximum.
	1412	This bit is set only if the input C<flags> parameter contains either the
	1413	C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
	1414
	1415	=item C<UTF8_GOT_SURROGATE>
	1416
	1417	The input sequence was malformed in that it is for a Unicode UTF-16 surrogate
	1418	code point.
	1419	This bit is set only if the input C<flags> parameter contains either the
	1420	C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
	1421
	1422	=back
	1423
	1424	To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
	1425	flag to suppress any warnings, and then examine the C<*errors> return.
	1426
	1427	=cut
	1428
	1429	Also implemented as a macro in utf8.h
	1430	*/
	1431
	1432	UV
	1433	Perl_utf8n_to_uvchr_error(const U8 *s,
	1434	STRLEN curlen,
	1435	STRLEN *retlen,
	1436	const U32 flags,
	1437	U32 * errors)
	1438	{
	1439	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
	1440
	1441	return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
	1442	}
	1443
	1444	/*
	1445
	1446	=for apidoc utf8n_to_uvchr_msgs
	1447
	1448	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	1449	Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
	1450
	1451	This function is for code that needs to know what the precise malformation(s)
	1452	are when an error is found, and wants the corresponding warning and/or error
	1453	messages to be returned to the caller rather than be displayed. All messages
	1454	that would have been displayed if all lexical warnings are enabled will be
	1455	returned.
	1456
	1457	It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
	1458	placed after all the others, C<msgs>. If this parameter is 0, this function
	1459	behaves identically to C<L</utf8n_to_uvchr_error>>. Otherwise, C<msgs> should
	1460	be a pointer to an C<AV *> variable, in which this function creates a new AV to
	1461	contain any appropriate messages. The elements of the array are ordered so
	1462	that the first message that would have been displayed is in the 0th element,
	1463	and so on. Each element is a hash with three key-value pairs, as follows:
	1464
	1465	=over 4
	1466
	1467	=item C<text>
	1468
	1469	The text of the message as a C<SVpv>.
	1470
	1471	=item C<warn_categories>
	1472
	1473	The warning category (or categories) packed into a C<SVuv>.
	1474
	1475	=item C<flag>
	1476
	1477	A single flag bit associated with this message, in a C<SVuv>.
	1478	The bit corresponds to some bit in the C<*errors> return value,
	1479	such as C<UTF8_GOT_LONG>.
	1480
	1481	=back
	1482
	1483	It's important to note that specifying this parameter as non-null will cause
	1484	any warnings this function would otherwise generate to be suppressed, and
	1485	instead be placed in C<*msgs>. The caller can check the lexical warnings state
	1486	(or not) when choosing what to do with the returned messages.
	1487
	1488	If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
	1489	no AV is created.
	1490
	1491	The caller, of course, is responsible for freeing any returned AV.
	1492
	1493	=cut
	1494	*/
	1495
	1496	UV
	1497	Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
	1498	STRLEN curlen,
	1499	STRLEN *retlen,
	1500	const U32 flags,
	1501	U32 * errors,
	1502	AV ** msgs)
	1503	{
	1504	const U8 * const s0 = s;
	1505	const U8 * send = s0 + curlen;
	1506	U32 possible_problems; /* A bit is set here for each potential problem
	1507	found as we go along */
	1508	UV uv;
	1509	STRLEN expectlen; /* How long should this sequence be? */
	1510	STRLEN avail_len; /* When input is too short, gives what that is */
	1511	U32 discard_errors; /* Used to save branches when 'errors' is NULL; this
	1512	gets set and discarded */
	1513
	1514	/* The below are used only if there is both an overlong malformation and a
	1515	* too short one. Otherwise the first two are set to 's0' and 'send', and
	1516	* the third not used at all */
	1517	U8 * adjusted_s0;
	1518	U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
	1519	routine; see [perl #130921] */
	1520	UV uv_so_far;
	1521	dTHX;
	1522
	1523	PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
	1524
	1525	/* Here, is one of: a) malformed; b) a problematic code point (surrogate,
	1526	* non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
	1527	* syllables that the dfa doesn't properly handle. Quickly dispose of the
	1528	* final case. */
	1529
	1530	#ifndef EBCDIC
	1531
	1532	/* Each of the affected Hanguls starts with \xED */
	1533
	1534	if (is_HANGUL_ED_utf8_safe(s0, send)) {
	1535	if (retlen) {
	1536	*retlen = 3;
	1537	}
	1538	if (errors) {
	1539	*errors = 0;
	1540	}
	1541	if (msgs) {
	1542	*msgs = NULL;
	1543	}
	1544
	1545	return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
	1546	\| ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
	1547	\| (s0[2] & UTF_CONTINUATION_MASK);
	1548	}
	1549
	1550	#endif
	1551
	1552	/* In conjunction with the exhaustive tests that can be enabled in
	1553	* APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
	1554	* what it is intended to do, and that no flaws in it are masked by
	1555	* dropping down and executing the code below
	1556	assert(! isUTF8_CHAR(s0, send)
	1557	\|\| UTF8_IS_SURROGATE(s0, send)
	1558	\|\| UTF8_IS_SUPER(s0, send)
	1559	\|\| UTF8_IS_NONCHAR(s0,send));
	1560	*/
	1561
	1562	s = s0;
	1563	uv = *s0;
	1564	possible_problems = 0;
	1565	expectlen = 0;
	1566	avail_len = 0;
	1567	discard_errors = 0;
	1568	adjusted_s0 = (U8 *) s0;
	1569	uv_so_far = 0;
	1570
	1571	if (errors) {
	1572	*errors = 0;
	1573	}
	1574	else {
	1575	errors = &discard_errors;
	1576	}
	1577
	1578	/* The order of malformation tests here is important. We should consume as
	1579	* few bytes as possible in order to not skip any valid character. This is
	1580	* required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
	1581	* http://unicode.org/reports/tr36 for more discussion as to why. For
	1582	* example, once we've done a UTF8SKIP, we can tell the expected number of
	1583	* bytes, and could fail right off the bat if the input parameters indicate
	1584	* that there are too few available. But it could be that just that first
	1585	* byte is garbled, and the intended character occupies fewer bytes. If we
	1586	* blindly assumed that the first byte is correct, and skipped based on
	1587	* that number, we could skip over a valid input character. So instead, we
	1588	* always examine the sequence byte-by-byte.
	1589	*
	1590	* We also should not consume too few bytes, otherwise someone could inject
	1591	* things. For example, an input could be deliberately designed to
	1592	* overflow, and if this code bailed out immediately upon discovering that,
	1593	* returning to the caller C<*retlen> pointing to the very next byte (one
	1594	* which is actually part of the overflowing sequence), that could look
	1595	* legitimate to the caller, which could discard the initial partial
	1596	* sequence and process the rest, inappropriately.
	1597	*
	1598	* Some possible input sequences are malformed in more than one way. This
	1599	* function goes to lengths to try to find all of them. This is necessary
	1600	* for correctness, as the inputs may allow one malformation but not
	1601	* another, and if we abandon searching for others after finding the
	1602	* allowed one, we could allow in something that shouldn't have been.
	1603	*/
	1604
	1605	if (UNLIKELY(curlen == 0)) {
	1606	possible_problems \|= UTF8_GOT_EMPTY;
	1607	curlen = 0;
	1608	uv = UNICODE_REPLACEMENT;
	1609	goto ready_to_handle_errors;
	1610	}
	1611
	1612	expectlen = UTF8SKIP(s);
	1613
	1614	/* A well-formed UTF-8 character, as the vast majority of calls to this
	1615	* function will be for, has this expected length. For efficiency, set
	1616	* things up here to return it. It will be overridden only in those rare
	1617	* cases where a malformation is found */
	1618	if (retlen) {
	1619	*retlen = expectlen;
	1620	}
	1621
	1622	/* A continuation character can't start a valid sequence */
	1623	if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
	1624	possible_problems \|= UTF8_GOT_CONTINUATION;
	1625	curlen = 1;
	1626	uv = UNICODE_REPLACEMENT;
	1627	goto ready_to_handle_errors;
	1628	}
	1629
	1630	/* Here is not a continuation byte, nor an invariant. The only thing left
	1631	* is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
	1632	* because it excludes start bytes like \xC0 that always lead to
	1633	* overlongs.) */
	1634
	1635	/* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
	1636	* that indicate the number of bytes in the character's whole UTF-8
	1637	* sequence, leaving just the bits that are part of the value. */
	1638	uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
	1639
	1640	/* Setup the loop end point, making sure to not look past the end of the
	1641	* input string, and flag it as too short if the size isn't big enough. */
	1642	if (UNLIKELY(curlen < expectlen)) {
	1643	possible_problems \|= UTF8_GOT_SHORT;
	1644	avail_len = curlen;
	1645	}
	1646	else {
	1647	send = (U8*) s0 + expectlen;
	1648	}
	1649
	1650	/* Now, loop through the remaining bytes in the character's sequence,
	1651	* accumulating each into the working value as we go. */
	1652	for (s = s0 + 1; s < send; s++) {
	1653	if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
	1654	uv = UTF8_ACCUMULATE(uv, *s);
	1655	continue;
	1656	}
	1657
	1658	/* Here, found a non-continuation before processing all expected bytes.
	1659	* This byte indicates the beginning of a new character, so quit, even
	1660	* if allowing this malformation. */
	1661	possible_problems \|= UTF8_GOT_NON_CONTINUATION;
	1662	break;
	1663	} /* End of loop through the character's bytes */
	1664
	1665	/* Save how many bytes were actually in the character */
	1666	curlen = s - s0;
	1667
	1668	/* Note that there are two types of too-short malformation. One is when
	1669	* there is actual wrong data before the normal termination of the
	1670	* sequence. The other is that the sequence wasn't complete before the end
	1671	* of the data we are allowed to look at, based on the input 'curlen'.
	1672	* This means that we were passed data for a partial character, but it is
	1673	* valid as far as we saw. The other is definitely invalid. This
	1674	* distinction could be important to a caller, so the two types are kept
	1675	* separate.
	1676	*
	1677	* A convenience macro that matches either of the too-short conditions. */
	1678	# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT\|UTF8_GOT_NON_CONTINUATION)
	1679
	1680	if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
	1681	uv_so_far = uv;
	1682	uv = UNICODE_REPLACEMENT;
	1683	}
	1684
	1685	/* Check for overflow. The algorithm requires us to not look past the end
	1686	* of the current character, even if partial, so the upper limit is 's' */
	1687	if (UNLIKELY(0 < does_utf8_overflow(s0, s,
	1688	1 /* Do consider overlongs */
	1689	)))
	1690	{
	1691	possible_problems \|= UTF8_GOT_OVERFLOW;
	1692	uv = UNICODE_REPLACEMENT;
	1693	}
	1694
	1695	/* Check for overlong. If no problems so far, 'uv' is the correct code
	1696	* point value. Simply see if it is expressible in fewer bytes. Otherwise
	1697	* we must look at the UTF-8 byte sequence itself to see if it is for an
	1698	* overlong */
	1699	if ( ( LIKELY(! possible_problems)
	1700	&& UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
	1701	\|\| ( UNLIKELY(possible_problems)
	1702	&& ( UNLIKELY(! UTF8_IS_START(*s0))
	1703	\|\| ( curlen > 1
	1704	&& UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0,
	1705	s - s0))))))
	1706	{
	1707	possible_problems \|= UTF8_GOT_LONG;
	1708
	1709	if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
	1710
	1711	/* The calculation in the 'true' branch of this 'if'
	1712	* below won't work if overflows, and isn't needed
	1713	* anyway. Further below we handle all overflow
	1714	* cases */
	1715	&& LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
	1716	{
	1717	UV min_uv = uv_so_far;
	1718	STRLEN i;
	1719
	1720	/* Here, the input is both overlong and is missing some trailing
	1721	* bytes. There is no single code point it could be for, but there
	1722	* may be enough information present to determine if what we have
	1723	* so far is for an unallowed code point, such as for a surrogate.
	1724	* The code further below has the intelligence to determine this,
	1725	* but just for non-overlong UTF-8 sequences. What we do here is
	1726	* calculate the smallest code point the input could represent if
	1727	* there were no too short malformation. Then we compute and save
	1728	* the UTF-8 for that, which is what the code below looks at
	1729	* instead of the raw input. It turns out that the smallest such
	1730	* code point is all we need. */
	1731	for (i = curlen; i < expectlen; i++) {
	1732	min_uv = UTF8_ACCUMULATE(min_uv,
	1733	I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
	1734	}
	1735
	1736	adjusted_s0 = temp_char_buf;
	1737	(void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
	1738	}
	1739	}
	1740
	1741	/* Here, we have found all the possible problems, except for when the input
	1742	* is for a problematic code point not allowed by the input parameters. */
	1743
	1744	/* uv is valid for overlongs */
	1745	if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
	1746
	1747	/* isn't problematic if < this */
	1748	&& uv >= UNICODE_SURROGATE_FIRST)
	1749	\|\| ( UNLIKELY(possible_problems)
	1750
	1751	/* if overflow, we know without looking further
	1752	* precisely which of the problematic types it is,
	1753	* and we deal with those in the overflow handling
	1754	* code */
	1755	&& LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
	1756	&& ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
	1757	\|\| UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
	1758	&& ((flags & ( UTF8_DISALLOW_NONCHAR
	1759	\|UTF8_DISALLOW_SURROGATE
	1760	\|UTF8_DISALLOW_SUPER
	1761	\|UTF8_DISALLOW_PERL_EXTENDED
	1762	\|UTF8_WARN_NONCHAR
	1763	\|UTF8_WARN_SURROGATE
	1764	\|UTF8_WARN_SUPER
	1765	\|UTF8_WARN_PERL_EXTENDED))))
	1766	{
	1767	/* If there were no malformations, or the only malformation is an
	1768	* overlong, 'uv' is valid */
	1769	if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
	1770	if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
	1771	possible_problems \|= UTF8_GOT_SURROGATE;
	1772	}
	1773	else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
	1774	possible_problems \|= UTF8_GOT_SUPER;
	1775	}
	1776	else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
	1777	possible_problems \|= UTF8_GOT_NONCHAR;
	1778	}
	1779	}
	1780	else { /* Otherwise, need to look at the source UTF-8, possibly
	1781	adjusted to be non-overlong */
	1782
	1783	if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
	1784	>= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
	1785	{
	1786	possible_problems \|= UTF8_GOT_SUPER;
	1787	}
	1788	else if (curlen > 1) {
	1789	if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
	1790	NATIVE_UTF8_TO_I8(*adjusted_s0),
	1791	NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
	1792	{
	1793	possible_problems \|= UTF8_GOT_SUPER;
	1794	}
	1795	else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
	1796	NATIVE_UTF8_TO_I8(*adjusted_s0),
	1797	NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
	1798	{
	1799	possible_problems \|= UTF8_GOT_SURROGATE;
	1800	}
	1801	}
	1802
	1803	/* We need a complete well-formed UTF-8 character to discern
	1804	* non-characters, so can't look for them here */
	1805	}
	1806	}
	1807
	1808	ready_to_handle_errors:
	1809
	1810	/* At this point:
	1811	* curlen contains the number of bytes in the sequence that
	1812	* this call should advance the input by.
	1813	* avail_len gives the available number of bytes passed in, but
	1814	* only if this is less than the expected number of
	1815	* bytes, based on the code point's start byte.
	1816	* possible_problems' is 0 if there weren't any problems; otherwise a bit
	1817	* is set in it for each potential problem found.
	1818	* uv contains the code point the input sequence
	1819	* represents; or if there is a problem that prevents
	1820	* a well-defined value from being computed, it is
	1821	* some substitute value, typically the REPLACEMENT
	1822	* CHARACTER.
	1823	* s0 points to the first byte of the character
	1824	* s points to just after where we left off processing
	1825	* the character
	1826	* send points to just after where that character should
	1827	* end, based on how many bytes the start byte tells
	1828	* us should be in it, but no further than s0 +
	1829	* avail_len
	1830	*/
	1831
	1832	if (UNLIKELY(possible_problems)) {
	1833	bool disallowed = FALSE;
	1834	const U32 orig_problems = possible_problems;
	1835
	1836	if (msgs) {
	1837	*msgs = NULL;
	1838	}
	1839
	1840	while (possible_problems) { /* Handle each possible problem */
	1841	UV pack_warn = 0;
	1842	char * message = NULL;
	1843	U32 this_flag_bit = 0;
	1844
	1845	/* Each 'if' clause handles one problem. They are ordered so that
	1846	* the first ones' messages will be displayed before the later
	1847	* ones; this is kinda in decreasing severity order. But the
	1848	* overlong must come last, as it changes 'uv' looked at by the
	1849	* others */
	1850	if (possible_problems & UTF8_GOT_OVERFLOW) {
	1851
	1852	/* Overflow means also got a super and are using Perl's
	1853	* extended UTF-8, but we handle all three cases here */
	1854	possible_problems
	1855	&= ~(UTF8_GOT_OVERFLOW\|UTF8_GOT_SUPER\|UTF8_GOT_PERL_EXTENDED);
	1856	*errors \|= UTF8_GOT_OVERFLOW;
	1857
	1858	/* But the API says we flag all errors found */
	1859	if (flags & (UTF8_WARN_SUPER\|UTF8_DISALLOW_SUPER)) {
	1860	*errors \|= UTF8_GOT_SUPER;
	1861	}
	1862	if (flags
	1863	& (UTF8_WARN_PERL_EXTENDED\|UTF8_DISALLOW_PERL_EXTENDED))
	1864	{
	1865	*errors \|= UTF8_GOT_PERL_EXTENDED;
	1866	}
	1867
	1868	/* Disallow if any of the three categories say to */
	1869	if ( ! (flags & UTF8_ALLOW_OVERFLOW)
	1870	\|\| (flags & ( UTF8_DISALLOW_SUPER
	1871	\|UTF8_DISALLOW_PERL_EXTENDED)))
	1872	{
	1873	disallowed = TRUE;
	1874	}
	1875
	1876	/* Likewise, warn if any say to */
	1877	if ( ! (flags & UTF8_ALLOW_OVERFLOW)
	1878	\|\| (flags & (UTF8_WARN_SUPER\|UTF8_WARN_PERL_EXTENDED)))
	1879	{
	1880
	1881	/* The warnings code explicitly says it doesn't handle the
	1882	* case of packWARN2 and two categories which have
	1883	* parent-child relationship. Even if it works now to
	1884	* raise the warning if either is enabled, it wouldn't
	1885	* necessarily do so in the future. We output (only) the
	1886	* most dire warning */
	1887	if (! (flags & UTF8_CHECK_ONLY)) {
	1888	if (msgs \|\| ckWARN_d(WARN_UTF8)) {
	1889	pack_warn = packWARN(WARN_UTF8);
	1890	}
	1891	else if (msgs \|\| ckWARN_d(WARN_NON_UNICODE)) {
	1892	pack_warn = packWARN(WARN_NON_UNICODE);
	1893	}
	1894	if (pack_warn) {
	1895	message = Perl_form(aTHX_ "%s: %s (overflows)",
	1896	malformed_text,
	1897	_byte_dump_string(s0, curlen, 0));
	1898	this_flag_bit = UTF8_GOT_OVERFLOW;
	1899	}
	1900	}
	1901	}
	1902	}
	1903	else if (possible_problems & UTF8_GOT_EMPTY) {
	1904	possible_problems &= ~UTF8_GOT_EMPTY;
	1905	*errors \|= UTF8_GOT_EMPTY;
	1906
	1907	if (! (flags & UTF8_ALLOW_EMPTY)) {
	1908
	1909	/* This so-called malformation is now treated as a bug in
	1910	* the caller. If you have nothing to decode, skip calling
	1911	* this function */
	1912	assert(0);
	1913
	1914	disallowed = TRUE;
	1915	if ( (msgs
	1916	\|\| ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
	1917	{
	1918	pack_warn = packWARN(WARN_UTF8);
	1919	message = Perl_form(aTHX_ "%s (empty string)",
	1920	malformed_text);
	1921	this_flag_bit = UTF8_GOT_EMPTY;
	1922	}
	1923	}
	1924	}
	1925	else if (possible_problems & UTF8_GOT_CONTINUATION) {
	1926	possible_problems &= ~UTF8_GOT_CONTINUATION;
	1927	*errors \|= UTF8_GOT_CONTINUATION;
	1928
	1929	if (! (flags & UTF8_ALLOW_CONTINUATION)) {
	1930	disallowed = TRUE;
	1931	if (( msgs
	1932	\|\| ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
	1933	{
	1934	pack_warn = packWARN(WARN_UTF8);
	1935	message = Perl_form(aTHX_
	1936	"%s: %s (unexpected continuation byte 0x%02x,"
	1937	" with no preceding start byte)",
	1938	malformed_text,
	1939	_byte_dump_string(s0, 1, 0), *s0);
	1940	this_flag_bit = UTF8_GOT_CONTINUATION;
	1941	}
	1942	}
	1943	}
	1944	else if (possible_problems & UTF8_GOT_SHORT) {
	1945	possible_problems &= ~UTF8_GOT_SHORT;
	1946	*errors \|= UTF8_GOT_SHORT;
	1947
	1948	if (! (flags & UTF8_ALLOW_SHORT)) {
	1949	disallowed = TRUE;
	1950	if (( msgs
	1951	\|\| ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
	1952	{
	1953	pack_warn = packWARN(WARN_UTF8);
	1954	message = Perl_form(aTHX_
	1955	"%s: %s (too short; %d byte%s available, need %d)",
	1956	malformed_text,
	1957	_byte_dump_string(s0, send - s0, 0),
	1958	(int)avail_len,
	1959	avail_len == 1 ? "" : "s",
	1960	(int)expectlen);
	1961	this_flag_bit = UTF8_GOT_SHORT;
	1962	}
	1963	}
	1964
	1965	}
	1966	else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
	1967	possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
	1968	*errors \|= UTF8_GOT_NON_CONTINUATION;
	1969
	1970	if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
	1971	disallowed = TRUE;
	1972	if (( msgs
	1973	\|\| ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
	1974	{
	1975
	1976	/* If we don't know for sure that the input length is
	1977	* valid, avoid as much as possible reading past the
	1978	* end of the buffer */
	1979	int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
	1980	? s - s0
	1981	: send - s0;
	1982	pack_warn = packWARN(WARN_UTF8);
	1983	message = Perl_form(aTHX_ "%s",
	1984	unexpected_non_continuation_text(s0,
	1985	printlen,
	1986	s - s0,
	1987	(int) expectlen));
	1988	this_flag_bit = UTF8_GOT_NON_CONTINUATION;
	1989	}
	1990	}
	1991	}
	1992	else if (possible_problems & UTF8_GOT_SURROGATE) {
	1993	possible_problems &= ~UTF8_GOT_SURROGATE;
	1994
	1995	if (flags & UTF8_WARN_SURROGATE) {
	1996	*errors \|= UTF8_GOT_SURROGATE;
	1997
	1998	if ( ! (flags & UTF8_CHECK_ONLY)
	1999	&& (msgs \|\| ckWARN_d(WARN_SURROGATE)))
	2000	{
	2001	pack_warn = packWARN(WARN_SURROGATE);
	2002
	2003	/* These are the only errors that can occur with a
	2004	* surrogate when the 'uv' isn't valid */
	2005	if (orig_problems & UTF8_GOT_TOO_SHORT) {
	2006	message = Perl_form(aTHX_
	2007	"UTF-16 surrogate (any UTF-8 sequence that"
	2008	" starts with \"%s\" is for a surrogate)",
	2009	_byte_dump_string(s0, curlen, 0));
	2010	}
	2011	else {
	2012	message = Perl_form(aTHX_ surrogate_cp_format, uv);
	2013	}
	2014	this_flag_bit = UTF8_GOT_SURROGATE;
	2015	}
	2016	}
	2017
	2018	if (flags & UTF8_DISALLOW_SURROGATE) {
	2019	disallowed = TRUE;
	2020	*errors \|= UTF8_GOT_SURROGATE;
	2021	}
	2022	}
	2023	else if (possible_problems & UTF8_GOT_SUPER) {
	2024	possible_problems &= ~UTF8_GOT_SUPER;
	2025
	2026	if (flags & UTF8_WARN_SUPER) {
	2027	*errors \|= UTF8_GOT_SUPER;
	2028
	2029	if ( ! (flags & UTF8_CHECK_ONLY)
	2030	&& (msgs \|\| ckWARN_d(WARN_NON_UNICODE)))
	2031	{
	2032	pack_warn = packWARN(WARN_NON_UNICODE);
	2033
	2034	if (orig_problems & UTF8_GOT_TOO_SHORT) {
	2035	message = Perl_form(aTHX_
	2036	"Any UTF-8 sequence that starts with"
	2037	" \"%s\" is for a non-Unicode code point,"
	2038	" may not be portable",
	2039	_byte_dump_string(s0, curlen, 0));
	2040	}
	2041	else {
	2042	message = Perl_form(aTHX_ super_cp_format, uv);
	2043	}
	2044	this_flag_bit = UTF8_GOT_SUPER;
	2045	}
	2046	}
	2047
	2048	/* Test for Perl's extended UTF-8 after the regular SUPER ones,
	2049	* and before possibly bailing out, so that the more dire
	2050	* warning will override the regular one. */
	2051	if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
	2052	if ( ! (flags & UTF8_CHECK_ONLY)
	2053	&& (flags & (UTF8_WARN_PERL_EXTENDED\|UTF8_WARN_SUPER))
	2054	&& (msgs \|\| ckWARN_d(WARN_NON_UNICODE)))
	2055	{
	2056	pack_warn = packWARN(WARN_NON_UNICODE);
	2057
	2058	/* If it is an overlong that evaluates to a code point
	2059	* that doesn't have to use the Perl extended UTF-8, it
	2060	* still used it, and so we output a message that
	2061	* doesn't refer to the code point. The same is true
	2062	* if there was a SHORT malformation where the code
	2063	* point is not valid. In that case, 'uv' will have
	2064	* been set to the REPLACEMENT CHAR, and the message
	2065	* below without the code point in it will be selected
	2066	* */
	2067	if (UNICODE_IS_PERL_EXTENDED(uv)) {
	2068	message = Perl_form(aTHX_
	2069	perl_extended_cp_format, uv);
	2070	}
	2071	else {
	2072	message = Perl_form(aTHX_
	2073	"Any UTF-8 sequence that starts with"
	2074	" \"%s\" is a Perl extension, and"
	2075	" so is not portable",
	2076	_byte_dump_string(s0, curlen, 0));
	2077	}
	2078	this_flag_bit = UTF8_GOT_PERL_EXTENDED;
	2079	}
	2080
	2081	if (flags & ( UTF8_WARN_PERL_EXTENDED
	2082	\|UTF8_DISALLOW_PERL_EXTENDED))
	2083	{
	2084	*errors \|= UTF8_GOT_PERL_EXTENDED;
	2085
	2086	if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
	2087	disallowed = TRUE;
	2088	}
	2089	}
	2090	}
	2091
	2092	if (flags & UTF8_DISALLOW_SUPER) {
	2093	*errors \|= UTF8_GOT_SUPER;
	2094	disallowed = TRUE;
	2095	}
	2096	}
	2097	else if (possible_problems & UTF8_GOT_NONCHAR) {
	2098	possible_problems &= ~UTF8_GOT_NONCHAR;
	2099
	2100	if (flags & UTF8_WARN_NONCHAR) {
	2101	*errors \|= UTF8_GOT_NONCHAR;
	2102
	2103	if ( ! (flags & UTF8_CHECK_ONLY)
	2104	&& (msgs \|\| ckWARN_d(WARN_NONCHAR)))
	2105	{
	2106	/* The code above should have guaranteed that we don't
	2107	* get here with errors other than overlong */
	2108	assert (! (orig_problems
	2109	& ~(UTF8_GOT_LONG\|UTF8_GOT_NONCHAR)));
	2110
	2111	pack_warn = packWARN(WARN_NONCHAR);
	2112	message = Perl_form(aTHX_ nonchar_cp_format, uv);
	2113	this_flag_bit = UTF8_GOT_NONCHAR;
	2114	}
	2115	}
	2116
	2117	if (flags & UTF8_DISALLOW_NONCHAR) {
	2118	disallowed = TRUE;
	2119	*errors \|= UTF8_GOT_NONCHAR;
	2120	}
	2121	}
	2122	else if (possible_problems & UTF8_GOT_LONG) {
	2123	possible_problems &= ~UTF8_GOT_LONG;
	2124	*errors \|= UTF8_GOT_LONG;
	2125
	2126	if (flags & UTF8_ALLOW_LONG) {
	2127
	2128	/* We don't allow the actual overlong value, unless the
	2129	* special extra bit is also set */
	2130	if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
	2131	& ~UTF8_ALLOW_LONG)))
	2132	{
	2133	uv = UNICODE_REPLACEMENT;
	2134	}
	2135	}
	2136	else {
	2137	disallowed = TRUE;
	2138
	2139	if (( msgs
	2140	\|\| ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
	2141	{
	2142	pack_warn = packWARN(WARN_UTF8);
	2143
	2144	/* These error types cause 'uv' to be something that
	2145	* isn't what was intended, so can't use it in the
	2146	* message. The other error types either can't
	2147	* generate an overlong, or else the 'uv' is valid */
	2148	if (orig_problems &
	2149	(UTF8_GOT_TOO_SHORT\|UTF8_GOT_OVERFLOW))
	2150	{
	2151	message = Perl_form(aTHX_
	2152	"%s: %s (any UTF-8 sequence that starts"
	2153	" with \"%s\" is overlong which can and"
	2154	" should be represented with a"
	2155	" different, shorter sequence)",
	2156	malformed_text,
	2157	_byte_dump_string(s0, send - s0, 0),
	2158	_byte_dump_string(s0, curlen, 0));
	2159	}
	2160	else {
	2161	U8 tmpbuf[UTF8_MAXBYTES+1];
	2162	const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
	2163	uv, 0);
	2164	/* Don't use U+ for non-Unicode code points, which
	2165	* includes those in the Latin1 range */
	2166	const char * preface = ( uv > PERL_UNICODE_MAX
	2167	#ifdef EBCDIC
	2168	\|\| uv <= 0xFF
	2169	#endif
	2170	)
	2171	? "0x"
	2172	: "U+";
	2173	message = Perl_form(aTHX_
	2174	"%s: %s (overlong; instead use %s to represent"
	2175	" %s%0*" UVXf ")",
	2176	malformed_text,
	2177	_byte_dump_string(s0, send - s0, 0),
	2178	_byte_dump_string(tmpbuf, e - tmpbuf, 0),
	2179	preface,
	2180	((uv < 256) ? 2 : 4), /* Field width of 2 for
	2181	small code points */
	2182	UNI_TO_NATIVE(uv));
	2183	}
	2184	this_flag_bit = UTF8_GOT_LONG;
	2185	}
	2186	}
	2187	} /* End of looking through the possible flags */
	2188
	2189	/* Display the message (if any) for the problem being handled in
	2190	* this iteration of the loop */
	2191	if (message) {
	2192	if (msgs) {
	2193	assert(this_flag_bit);
	2194
	2195	if (*msgs == NULL) {
	2196	*msgs = newAV();
	2197	}
	2198
	2199	av_push(msgs, newRV_noinc((SV) new_msg_hv(message,
	2200	pack_warn,
	2201	this_flag_bit)));
	2202	}
	2203	else if (PL_op)
	2204	Perl_warner(aTHX_ pack_warn, "%s in %s", message,
	2205	OP_DESC(PL_op));
	2206	else
	2207	Perl_warner(aTHX_ pack_warn, "%s", message);
	2208	}
	2209	} /* End of 'while (possible_problems)' */
	2210
	2211	/* Since there was a possible problem, the returned length may need to
	2212	* be changed from the one stored at the beginning of this function.
	2213	* Instead of trying to figure out if that's needed, just do it. */
	2214	if (retlen) {
	2215	*retlen = curlen;
	2216	}
	2217
	2218	if (disallowed) {
	2219	if (flags & UTF8_CHECK_ONLY && retlen) {
	2220	*retlen = ((STRLEN) -1);
	2221	}
	2222	return 0;
	2223	}
	2224	}
	2225
	2226	return UNI_TO_NATIVE(uv);
	2227	}
	2228
	2229	/*
	2230	=for apidoc utf8_to_uvchr_buf
	2231
	2232	Returns the native code point of the first character in the string C<s> which
	2233	is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
	2234	C<*retlen> will be set to the length, in bytes, of that character.
	2235
	2236	If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
	2237	enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
	2238	C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
	2239	(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
	2240	C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
	2241	the next possible position in C<s> that could begin a non-malformed character.
	2242	See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
	2243	returned.
	2244
	2245	=cut
	2246
	2247	Also implemented as a macro in utf8.h
	2248
	2249	*/
	2250
	2251
	2252	UV
	2253	Perl_utf8_to_uvchr_buf(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
	2254	{
	2255	PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
	2256
	2257	assert(s < send);
	2258
	2259	return utf8n_to_uvchr(s, send - s, retlen,
	2260	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	2261	}
	2262
	2263	/* This is marked as deprecated
	2264	*
	2265	=for apidoc utf8_to_uvuni_buf
	2266
	2267	Only in very rare circumstances should code need to be dealing in Unicode
	2268	(as opposed to native) code points. In those few cases, use
	2269	C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead. If you
	2270	are not absolutely sure this is one of those cases, then assume it isn't and
	2271	use plain C<utf8_to_uvchr_buf> instead.
	2272
	2273	Returns the Unicode (not-native) code point of the first character in the
	2274	string C<s> which
	2275	is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
	2276	C<*retlen> will be set to the length, in bytes, of that character.
	2277
	2278	If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
	2279	enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
	2280	NULL) to -1. If those warnings are off, the computed value if well-defined (or
	2281	the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
	2282	is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
	2283	next possible position in C<s> that could begin a non-malformed character.
	2284	See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
	2285
	2286	=cut
	2287	*/
	2288
	2289	UV
	2290	Perl_utf8_to_uvuni_buf(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
	2291	{
	2292	PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
	2293
	2294	assert(send > s);
	2295
	2296	return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
	2297	}
	2298
	2299	/*
	2300	=for apidoc utf8_length
	2301
	2302	Returns the number of characters in the sequence of UTF-8-encoded bytes starting
	2303	at C<s> and ending at the byte just before C<e>. If C<s> and C<e> point to the
	2304	same place, it returns 0 with no warning raised.
	2305
	2306	If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
	2307	and returns the number of valid characters.
	2308
	2309	=cut
	2310	*/
	2311
	2312	STRLEN
	2313	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	2314	{
	2315	STRLEN len = 0;
	2316
	2317	PERL_ARGS_ASSERT_UTF8_LENGTH;
	2318
	2319	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	2320	* the bitops (especially ~) can create illegal UTF-8.
	2321	* In other words: in Perl UTF-8 is not just for Unicode. */
	2322
	2323	if (e < s)
	2324	goto warn_and_return;
	2325	while (s < e) {
	2326	s += UTF8SKIP(s);
	2327	len++;
	2328	}
	2329
	2330	if (e != s) {
	2331	len--;
	2332	warn_and_return:
	2333	if (PL_op)
	2334	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	2335	"%s in %s", unees, OP_DESC(PL_op));
	2336	else
	2337	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	2338	}
	2339
	2340	return len;
	2341	}
	2342
	2343	/*
	2344	=for apidoc bytes_cmp_utf8
	2345
	2346	Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
	2347	sequence of characters (stored as UTF-8)
	2348	in C<u>, C<ulen>. Returns 0 if they are
	2349	equal, -1 or -2 if the first string is less than the second string, +1 or +2
	2350	if the first string is greater than the second string.
	2351
	2352	-1 or +1 is returned if the shorter string was identical to the start of the
	2353	longer string. -2 or +2 is returned if
	2354	there was a difference between characters
	2355	within the strings.
	2356
	2357	=cut
	2358	*/
	2359
	2360	int
	2361	Perl_bytes_cmp_utf8(pTHX_ const U8 b, STRLEN blen, const U8 u, STRLEN ulen)
	2362	{
	2363	const U8 *const bend = b + blen;
	2364	const U8 *const uend = u + ulen;
	2365
	2366	PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
	2367
	2368	while (b < bend && u < uend) {
	2369	U8 c = *u++;
	2370	if (!UTF8_IS_INVARIANT(c)) {
	2371	if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	2372	if (u < uend) {
	2373	U8 c1 = *u++;
	2374	if (UTF8_IS_CONTINUATION(c1)) {
	2375	c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
	2376	} else {
	2377	/* diag_listed_as: Malformed UTF-8 character%s */
	2378	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	2379	"%s %s%s",
	2380	unexpected_non_continuation_text(u - 2, 2, 1, 2),
	2381	PL_op ? " in " : "",
	2382	PL_op ? OP_DESC(PL_op) : "");
	2383	return -2;
	2384	}
	2385	} else {
	2386	if (PL_op)
	2387	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	2388	"%s in %s", unees, OP_DESC(PL_op));
	2389	else
	2390	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	2391	return -2; /* Really want to return undef :-) */
	2392	}
	2393	} else {
	2394	return -2;
	2395	}
	2396	}
	2397	if (*b != c) {
	2398	return *b < c ? -2 : +2;
	2399	}
	2400	++b;
	2401	}
	2402
	2403	if (b == bend && u == uend)
	2404	return 0;
	2405
	2406	return b < bend ? +1 : -1;
	2407	}
	2408
	2409	/*
	2410	=for apidoc utf8_to_bytes
	2411
	2412	Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
	2413	Unlike L</bytes_to_utf8>, this over-writes the original string, and
	2414	updates C<*lenp> to contain the new length.
	2415	Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
	2416
	2417	Upon successful return, the number of variants in the string can be computed by
	2418	having saved the value of C<*lenp> before the call, and subtracting the
	2419	after-call value of C<*lenp> from it.
	2420
	2421	If you need a copy of the string, see L</bytes_from_utf8>.
	2422
	2423	=cut
	2424	*/
	2425
	2426	U8 *
	2427	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN lenp)
	2428	{
	2429	U8 * first_variant;
	2430
	2431	PERL_ARGS_ASSERT_UTF8_TO_BYTES;
	2432	PERL_UNUSED_CONTEXT;
	2433
	2434	/* This is a no-op if no variants at all in the input */
	2435	if (is_utf8_invariant_string_loc(s, lenp, (const U8 *) &first_variant)) {
	2436	return s;
	2437	}
	2438
	2439	{
	2440	U8 * const save = s;
	2441	U8 * const send = s + *lenp;
	2442	U8 * d;
	2443
	2444	/* Nothing before the first variant needs to be changed, so start the real
	2445	* work there */
	2446	s = first_variant;
	2447	while (s < send) {
	2448	if (! UTF8_IS_INVARIANT(*s)) {
	2449	if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
	2450	*lenp = ((STRLEN) -1);
	2451	return 0;
	2452	}
	2453	s++;
	2454	}
	2455	s++;
	2456	}
	2457
	2458	/* Is downgradable, so do it */
	2459	d = s = first_variant;
	2460	while (s < send) {
	2461	U8 c = *s++;
	2462	if (! UVCHR_IS_INVARIANT(c)) {
	2463	/* Then it is two-byte encoded */
	2464	c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
	2465	s++;
	2466	}
	2467	*d++ = c;
	2468	}
	2469	*d = '\0';
	2470	*lenp = d - save;
	2471
	2472	return save;
	2473	}
	2474	}
	2475
	2476	/*
	2477	=for apidoc bytes_from_utf8
	2478
	2479	Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
	2480	byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
	2481	actually encoded in UTF-8.
	2482
	2483	Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
	2484	the input string.
	2485
	2486	Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
	2487	not expressible in native byte encoding. In these cases, C<*is_utf8p> and
	2488	C<*lenp> are unchanged, and the return value is the original C<s>.
	2489
	2490	Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
	2491	newly created string containing a downgraded copy of C<s>, and whose length is
	2492	returned in C<*lenp>, updated. The new string is C<NUL>-terminated. The
	2493	caller is responsible for arranging for the memory used by this string to get
	2494	freed.
	2495
	2496	Upon successful return, the number of variants in the string can be computed by
	2497	having saved the value of C<*lenp> before the call, and subtracting the
	2498	after-call value of C<*lenp> from it.
	2499
	2500	=cut
	2501
	2502	There is a macro that avoids this function call, but this is retained for
	2503	anyone who calls it with the Perl_ prefix */
	2504
	2505	U8 *
	2506	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN lenp, bool *is_utf8p)
	2507	{
	2508	PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
	2509	PERL_UNUSED_CONTEXT;
	2510
	2511	return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
	2512	}
	2513
	2514	/*
	2515	No = here because currently externally undocumented
	2516	for apidoc bytes_from_utf8_loc
	2517
	2518	Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where
	2519	to store the location of the first character in C<"s"> that cannot be
	2520	converted to non-UTF8.
	2521
	2522	If that parameter is C<NULL>, this function behaves identically to
	2523	C<bytes_from_utf8>.
	2524
	2525	Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
	2526	C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
	2527
	2528	Otherwise, the function returns a newly created C<NUL>-terminated string
	2529	containing the non-UTF8 equivalent of the convertible first portion of
	2530	C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>.
	2531	If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
	2532	and C<*first_non_downgradable> is set to C<NULL>.
	2533
	2534	Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
	2535	first character in the original string that wasn't converted. C<*is_utf8p> is
	2536	unchanged. Note that the new string may have length 0.
	2537
	2538	Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
	2539	C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
	2540	converts as many characters in it as possible stopping at the first one it
	2541	finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is
	2542	set to point to that. The function returns the portion that could be converted
	2543	in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
	2544	not including the terminating C<NUL>. If the very first character in the
	2545	original could not be converted, C<*lenp> will be 0, and the new string will
	2546	contain just a single C<NUL>. If the entire input string was converted,
	2547	C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
	2548
	2549	Upon successful return, the number of variants in the converted portion of the
	2550	string can be computed by having saved the value of C<*lenp> before the call,
	2551	and subtracting the after-call value of C<*lenp> from it.
	2552
	2553	=cut
	2554
	2555
	2556	*/
	2557
	2558	U8 *
	2559	Perl_bytes_from_utf8_loc(const U8 s, STRLEN lenp, bool is_utf8p, const U8* first_unconverted)
	2560	{
	2561	U8 *d;
	2562	const U8 *original = s;
	2563	U8 *converted_start;
	2564	const U8 send = s + lenp;
	2565
	2566	PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
	2567
	2568	if (! *is_utf8p) {
	2569	if (first_unconverted) {
	2570	*first_unconverted = NULL;
	2571	}
	2572
	2573	return (U8 *) original;
	2574	}
	2575
	2576	Newx(d, (*lenp) + 1, U8);
	2577
	2578	converted_start = d;
	2579	while (s < send) {
	2580	U8 c = *s++;
	2581	if (! UTF8_IS_INVARIANT(c)) {
	2582
	2583	/* Then it is multi-byte encoded. If the code point is above 0xFF,
	2584	* have to stop now */
	2585	if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
	2586	if (first_unconverted) {
	2587	*first_unconverted = s - 1;
	2588	goto finish_and_return;
	2589	}
	2590	else {
	2591	Safefree(converted_start);
	2592	return (U8 *) original;
	2593	}
	2594	}
	2595
	2596	c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
	2597	s++;
	2598	}
	2599	*d++ = c;
	2600	}
	2601
	2602	/* Here, converted the whole of the input */
	2603	*is_utf8p = FALSE;
	2604	if (first_unconverted) {
	2605	*first_unconverted = NULL;
	2606	}
	2607
	2608	finish_and_return:
	2609	*d = '\0';
	2610	*lenp = d - converted_start;
	2611
	2612	/* Trim unused space */
	2613	Renew(converted_start, *lenp + 1, U8);
	2614
	2615	return converted_start;
	2616	}
	2617
	2618	/*
	2619	=for apidoc bytes_to_utf8
	2620
	2621	Converts a string C<s> of length C<*lenp> bytes from the native encoding into
	2622	UTF-8.
	2623	Returns a pointer to the newly-created string, and sets C<*lenp> to
	2624	reflect the new length in bytes. The caller is responsible for arranging for
	2625	the memory used by this string to get freed.
	2626
	2627	Upon successful return, the number of variants in the string can be computed by
	2628	having saved the value of C<*lenp> before the call, and subtracting it from the
	2629	after-call value of C<*lenp>.
	2630
	2631	A C<NUL> character will be written after the end of the string.
	2632
	2633	If you want to convert to UTF-8 from encodings other than
	2634	the native (Latin1 or EBCDIC),
	2635	see L</sv_recode_to_utf8>().
	2636
	2637	=cut
	2638	*/
	2639
	2640	U8*
	2641	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN lenp)
	2642	{
	2643	const U8 * const send = s + (*lenp);
	2644	U8 *d;
	2645	U8 *dst;
	2646
	2647	PERL_ARGS_ASSERT_BYTES_TO_UTF8;
	2648	PERL_UNUSED_CONTEXT;
	2649
	2650	/* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
	2651	Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
	2652	dst = d;
	2653
	2654	while (s < send) {
	2655	append_utf8_from_native_byte(*s, &d);
	2656	s++;
	2657	}
	2658
	2659	*d = '\0';
	2660	*lenp = d-dst;
	2661
	2662	return dst;
	2663	}
	2664
	2665	/*
	2666	* Convert native (big-endian) UTF-16 to UTF-8. For reversed (little-endian),
	2667	* use utf16_to_utf8_reversed().
	2668	*
	2669	* UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes.
	2670	* UTF-8 requires 1-3 bytes for every code point below 0x10000; otherwise 4 bytes.
	2671	* UTF-EBCDIC requires 1-4 bytes for every code point below 0x10000; otherwise 4-5 bytes.
	2672	*
	2673	* These functions don't check for overflow. The worst case is every code
	2674	* point in the input is 2 bytes, and requires 4 bytes on output. (If the code
	2675	* is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.) Therefore the
	2676	* destination must be pre-extended to 2 times the source length.
	2677	*
	2678	* Do not use in-place. We optimize for native, for obvious reasons. */
	2679
	2680	U8*
	2681	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	2682	{
	2683	U8* pend;
	2684	U8* dstart = d;
	2685
	2686	PERL_ARGS_ASSERT_UTF16_TO_UTF8;
	2687
	2688	if (bytelen & 1)
	2689	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf,
	2690	(UV)bytelen);
	2691
	2692	pend = p + bytelen;
	2693
	2694	while (p < pend) {
	2695	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	2696	p += 2;
	2697	if (OFFUNI_IS_INVARIANT(uv)) {
	2698	*d++ = LATIN1_TO_NATIVE((U8) uv);
	2699	continue;
	2700	}
	2701	if (uv <= MAX_UTF8_TWO_BYTE) {
	2702	*d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
	2703	*d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
	2704	continue;
	2705	}
	2706
	2707	#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
	2708	#define LAST_HIGH_SURROGATE 0xDBFF
	2709	#define FIRST_LOW_SURROGATE 0xDC00
	2710	#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
	2711	#define FIRST_IN_PLANE1 0x10000
	2712
	2713	/* This assumes that most uses will be in the first Unicode plane, not
	2714	* needing surrogates */
	2715	if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
	2716	&& uv <= UNICODE_SURROGATE_LAST))
	2717	{
	2718	if (UNLIKELY(p >= pend) \|\| UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
	2719	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	2720	}
	2721	else {
	2722	UV low = (p[0] << 8) + p[1];
	2723	if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
	2724	\|\| UNLIKELY(low > LAST_LOW_SURROGATE))
	2725	{
	2726	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	2727	}
	2728	p += 2;
	2729	uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
	2730	+ (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1;
	2731	}
	2732	}
	2733	#ifdef EBCDIC
	2734	d = uvoffuni_to_utf8_flags(d, uv, 0);
	2735	#else
	2736	if (uv < FIRST_IN_PLANE1) {
	2737	*d++ = (U8)(( uv >> 12) \| 0xe0);
	2738	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	2739	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	2740	continue;
	2741	}
	2742	else {
	2743	*d++ = (U8)(( uv >> 18) \| 0xf0);
	2744	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	2745	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	2746	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	2747	continue;
	2748	}
	2749	#endif
	2750	}
	2751	*newlen = d - dstart;
	2752	return d;
	2753	}
	2754
	2755	/* Note: this one is slightly destructive of the source. */
	2756
	2757	U8*
	2758	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	2759	{
	2760	U8* s = (U8*)p;
	2761	U8* const send = s + bytelen;
	2762
	2763	PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
	2764
	2765	if (bytelen & 1)
	2766	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
	2767	(UV)bytelen);
	2768
	2769	while (s < send) {
	2770	const U8 tmp = s[0];
	2771	s[0] = s[1];
	2772	s[1] = tmp;
	2773	s += 2;
	2774	}
	2775	return utf16_to_utf8(p, d, bytelen, newlen);
	2776	}
	2777
	2778	bool
	2779	Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
	2780	{
	2781	return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
	2782	}
	2783
	2784	/* Internal function so we can deprecate the external one, and call
	2785	this one from other deprecated functions in this file */
	2786
	2787	bool
	2788	Perl__is_utf8_idstart(pTHX_ const U8 *p)
	2789	{
	2790	PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
	2791
	2792	if (*p == '_')
	2793	return TRUE;
	2794	return is_utf8_common(p, PL_utf8_idstart);
	2795	}
	2796
	2797	bool
	2798	Perl__is_uni_perl_idcont(pTHX_ UV c)
	2799	{
	2800	return _invlist_contains_cp(PL_utf8_perl_idcont, c);
	2801	}
	2802
	2803	bool
	2804	Perl__is_uni_perl_idstart(pTHX_ UV c)
	2805	{
	2806	return _invlist_contains_cp(PL_utf8_perl_idstart, c);
	2807	}
	2808
	2809	UV
	2810	Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
	2811	const char S_or_s)
	2812	{
	2813	/* We have the latin1-range values compiled into the core, so just use
	2814	* those, converting the result to UTF-8. The only difference between upper
	2815	* and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
	2816	* either "SS" or "Ss". Which one to use is passed into the routine in
	2817	* 'S_or_s' to avoid a test */
	2818
	2819	UV converted = toUPPER_LATIN1_MOD(c);
	2820
	2821	PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
	2822
	2823	assert(S_or_s == 'S' \|\| S_or_s == 's');
	2824
	2825	if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
	2826	characters in this range */
	2827	*p = (U8) converted;
	2828	*lenp = 1;
	2829	return converted;
	2830	}
	2831
	2832	/* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
	2833	* which it maps to one of them, so as to only have to have one check for
	2834	* it in the main case */
	2835	if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
	2836	switch (c) {
	2837	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	2838	converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
	2839	break;
	2840	case MICRO_SIGN:
	2841	converted = GREEK_CAPITAL_LETTER_MU;
	2842	break;
	2843	#if UNICODE_MAJOR_VERSION > 2 \
	2844	\|\| (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
	2845	&& UNICODE_DOT_DOT_VERSION >= 8)
	2846	case LATIN_SMALL_LETTER_SHARP_S:
	2847	*(p)++ = 'S';
	2848	*p = S_or_s;
	2849	*lenp = 2;
	2850	return 'S';
	2851	#endif
	2852	default:
	2853	Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
	2854	" '%c' to map to '%c'",
	2855	c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
	2856	NOT_REACHED; /* NOTREACHED */
	2857	}
	2858	}
	2859
	2860	*(p)++ = UTF8_TWO_BYTE_HI(converted);
	2861	*p = UTF8_TWO_BYTE_LO(converted);
	2862	*lenp = 2;
	2863
	2864	return converted;
	2865	}
	2866
	/* If compiled on an early Unicode version, there may not be auxiliary
	 * tables.  Supply NULL placeholders so that the CALL_*_CASE macros below,
	 * which name these symbols, still expand.  (The fallback for the uppercase
	 * tables used to be repeated a second time at the end of this list; once is
	 * enough.) */
	#ifndef HAS_UC_AUX_TABLES
	#  define UC_AUX_TABLE_ptrs    NULL
	#  define UC_AUX_TABLE_lengths NULL
	#endif
	#ifndef HAS_TC_AUX_TABLES
	#  define TC_AUX_TABLE_ptrs    NULL
	#  define TC_AUX_TABLE_lengths NULL
	#endif
	#ifndef HAS_LC_AUX_TABLES
	#  define LC_AUX_TABLE_ptrs    NULL
	#  define LC_AUX_TABLE_lengths NULL
	#endif
	#ifndef HAS_CF_AUX_TABLES
	#  define CF_AUX_TABLE_ptrs    NULL
	#  define CF_AUX_TABLE_lengths NULL
	#endif
	2889
	/* Call the function to convert a UTF-8 encoded character to the specified
	 * case.  Note that there may be more than one character in the result.
	 *  'uv'   is the code point of the input character
	 *  's'    is a pointer to the first byte of the input character (the callers
	 *         in this file that start from a code point pass NULL)
	 *  'd'    will be set to the first byte of the string of changed characters.
	 *         It needs to have space for UTF8_MAXBYTES_CASE+1 bytes
	 *  'lenp' will be set to the length in bytes of the string of changed
	 *         characters
	 *
	 * The functions return the ordinal of the first character in the string of
	 * 'd' */
	#define CALL_UPPER_CASE(uv, s, d, lenp)                                     \
	                _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper,              \
	                              Uppercase_Mapping_invmap,                     \
	                              UC_AUX_TABLE_ptrs,                            \
	                              UC_AUX_TABLE_lengths,                         \
	                              "uppercase")
	#define CALL_TITLE_CASE(uv, s, d, lenp)                                     \
	                _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle,              \
	                              Titlecase_Mapping_invmap,                     \
	                              TC_AUX_TABLE_ptrs,                            \
	                              TC_AUX_TABLE_lengths,                         \
	                              "titlecase")
	#define CALL_LOWER_CASE(uv, s, d, lenp)                                     \
	                _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower,              \
	                              Lowercase_Mapping_invmap,                     \
	                              LC_AUX_TABLE_ptrs,                            \
	                              LC_AUX_TABLE_lengths,                         \
	                              "lowercase")


	/* This additionally has the input parameter 'specials'.  Non-zero selects
	 * the full case folding tables (which include the auxiliary tables for
	 * multi-character folds); zero selects the simple case folding tables.
	 * The whole expansion is parenthesized so that the conditional expression
	 * cannot be split apart by operators surrounding the macro call. */
	#define CALL_FOLD_CASE(uv, s, d, lenp, specials)                            \
	        ((specials)                                                         \
	          ? _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold,                   \
	                          Case_Folding_invmap,                              \
	                          CF_AUX_TABLE_ptrs,                                \
	                          CF_AUX_TABLE_lengths,                             \
	                          "foldcase")                                       \
	          : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold,             \
	                          Simple_Case_Folding_invmap,                       \
	                          NULL, NULL,                                       \
	                          "foldcase"))
	2933
	2934	UV
	2935	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	2936	{
	2937	/* Convert the Unicode character whose ordinal is <c> to its uppercase
	2938	* version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
	2939	* Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	2940	* the changed version may be longer than the original character.
	2941	*
	2942	* The ordinal of the first character of the changed version is returned
	2943	* (but note, as explained above, that there may be more.) */
	2944
	2945	PERL_ARGS_ASSERT_TO_UNI_UPPER;
	2946
	2947	if (c < 256) {
	2948	return _to_upper_title_latin1((U8) c, p, lenp, 'S');
	2949	}
	2950
	2951	return CALL_UPPER_CASE(c, NULL, p, lenp);
	2952	}
	2953
	2954	UV
	2955	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	2956	{
	2957	PERL_ARGS_ASSERT_TO_UNI_TITLE;
	2958
	2959	if (c < 256) {
	2960	return _to_upper_title_latin1((U8) c, p, lenp, 's');
	2961	}
	2962
	2963	return CALL_TITLE_CASE(c, NULL, p, lenp);
	2964	}
	2965
	2966	STATIC U8
	2967	S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
	2968	{
	2969	/* We have the latin1-range values compiled into the core, so just use
	2970	* those, converting the result to UTF-8. Since the result is always just
	2971	* one character, we allow <p> to be NULL */
	2972
	2973	U8 converted = toLOWER_LATIN1(c);
	2974
	2975	PERL_UNUSED_ARG(dummy);
	2976
	2977	if (p != NULL) {
	2978	if (NATIVE_BYTE_IS_INVARIANT(converted)) {
	2979	*p = converted;
	2980	*lenp = 1;
	2981	}
	2982	else {
	2983	/* Result is known to always be < 256, so can use the EIGHT_BIT
	2984	* macros */
	2985	*p = UTF8_EIGHT_BIT_HI(converted);
	2986	*(p+1) = UTF8_EIGHT_BIT_LO(converted);
	2987	*lenp = 2;
	2988	}
	2989	}
	2990	return converted;
	2991	}
	2992
	2993	UV
	2994	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	2995	{
	2996	PERL_ARGS_ASSERT_TO_UNI_LOWER;
	2997
	2998	if (c < 256) {
	2999	return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
	3000	}
	3001
	3002	return CALL_LOWER_CASE(c, NULL, p, lenp);
	3003	}
	3004
	3005	UV
	3006	Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
	3007	{
	3008	/* Corresponds to to_lower_latin1(); <flags> bits meanings:
	3009	* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
	3010	* FOLD_FLAGS_FULL iff full folding is to be used;
	3011	*
	3012	* Not to be used for locale folds
	3013	*/
	3014
	3015	UV converted;
	3016
	3017	PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
	3018
	3019	assert (! (flags & FOLD_FLAGS_LOCALE));
	3020
	3021	if (UNLIKELY(c == MICRO_SIGN)) {
	3022	converted = GREEK_SMALL_LETTER_MU;
	3023	}
	3024	#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
	3025	\|\| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
	3026	\|\| UNICODE_DOT_DOT_VERSION > 0)
	3027	else if ( (flags & FOLD_FLAGS_FULL)
	3028	&& UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
	3029	{
	3030	/* If can't cross 127/128 boundary, can't return "ss"; instead return
	3031	* two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
	3032	* under those circumstances. */
	3033	if (flags & FOLD_FLAGS_NOMIX_ASCII) {
	3034	lenp = 2 sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
	3035	Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
	3036	p, *lenp, U8);
	3037	return LATIN_SMALL_LETTER_LONG_S;
	3038	}
	3039	else {
	3040	*(p)++ = 's';
	3041	*p = 's';
	3042	*lenp = 2;
	3043	return 's';
	3044	}
	3045	}
	3046	#endif
	3047	else { /* In this range the fold of all other characters is their lower
	3048	case */
	3049	converted = toLOWER_LATIN1(c);
	3050	}
	3051
	3052	if (UVCHR_IS_INVARIANT(converted)) {
	3053	*p = (U8) converted;
	3054	*lenp = 1;
	3055	}
	3056	else {
	3057	*(p)++ = UTF8_TWO_BYTE_HI(converted);
	3058	*p = UTF8_TWO_BYTE_LO(converted);
	3059	*lenp = 2;
	3060	}
	3061
	3062	return converted;
	3063	}
	3064
	3065	UV
	3066	Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
	3067	{
	3068
	3069	/* Not currently externally documented, and subject to change
	3070	* <flags> bits meanings:
	3071	* FOLD_FLAGS_FULL iff full folding is to be used;
	3072	* FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
	3073	* locale are to be used.
	3074	* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
	3075	*/
	3076
	3077	PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
	3078
	3079	if (flags & FOLD_FLAGS_LOCALE) {
	3080	/* Treat a UTF-8 locale as not being in locale at all, except for
	3081	* potentially warning */
	3082	_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
	3083	if (IN_UTF8_CTYPE_LOCALE) {
	3084	flags &= ~FOLD_FLAGS_LOCALE;
	3085	}
	3086	else {
	3087	goto needs_full_generality;
	3088	}
	3089	}
	3090
	3091	if (c < 256) {
	3092	return _to_fold_latin1((U8) c, p, lenp,
	3093	flags & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII));
	3094	}
	3095
	3096	/* Here, above 255. If no special needs, just use the macro */
	3097	if ( ! (flags & (FOLD_FLAGS_LOCALE\|FOLD_FLAGS_NOMIX_ASCII))) {
	3098	return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
	3099	}
	3100	else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
	3101	the special flags. */
	3102	U8 utf8_c[UTF8_MAXBYTES + 1];
	3103
	3104	needs_full_generality:
	3105	uvchr_to_utf8(utf8_c, c);
	3106	return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c),
	3107	p, lenp, flags);
	3108	}
	3109	}
	3110
	3111	PERL_STATIC_INLINE bool
	3112	S_is_utf8_common(pTHX_ const U8 const p, SV const invlist)
	3113	{
	3114	/* returns a boolean giving whether or not the UTF8-encoded character that
	3115	* starts at <p> is in the inversion list indicated by <invlist>.
	3116	*
	3117	* Note that it is assumed that the buffer length of <p> is enough to
	3118	* contain all the bytes that comprise the character. Thus, <*p> should
	3119	* have been checked before this call for mal-formedness enough to assure
	3120	* that. This function, does make sure to not look past any NUL, so it is
	3121	* safe to use on C, NUL-terminated, strings */
	3122	STRLEN len = my_strnlen((char *) p, UTF8SKIP(p));
	3123
	3124	PERL_ARGS_ASSERT_IS_UTF8_COMMON;
	3125
	3126	/* The API should have included a length for the UTF-8 character in <p>,
	3127	* but it doesn't. We therefore assume that p has been validated at least
	3128	* as far as there being enough bytes available in it to accommodate the
	3129	* character without reading beyond the end, and pass that number on to the
	3130	* validating routine */
	3131	if (! isUTF8_CHAR(p, p + len)) {
	3132	_force_out_malformed_utf8_message(p, p + len, _UTF8_NO_CONFIDENCE_IN_CURLEN,
	3133	1 /* Die */ );
	3134	NOT_REACHED; /* NOTREACHED */
	3135	}
	3136
	3137	return is_utf8_common_with_len(p, p + len, invlist);
	3138	}
	3139
	3140	PERL_STATIC_INLINE bool
	3141	S_is_utf8_common_with_len(pTHX_ const U8 const p, const U8 const e,
	3142	SV* const invlist)
	3143	{
	3144	/* returns a boolean giving whether or not the UTF8-encoded character that
	3145	* starts at <p>, and extending no further than <e - 1> is in the inversion
	3146	* list <invlist>. */
	3147
	3148	UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
	3149
	3150	PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
	3151
	3152	if (cp == 0 && (p >= e \|\| *p != '\0')) {
	3153	_force_out_malformed_utf8_message(p, e, 0, 1);
	3154	NOT_REACHED; /* NOTREACHED */
	3155	}
	3156
	3157	assert(invlist);
	3158	return _invlist_contains_cp(invlist, cp);
	3159	}
	3160
	3161	STATIC void
	3162	S_warn_on_first_deprecated_use(pTHX_ const char * const name,
	3163	const char * const alternative,
	3164	const bool use_locale,
	3165	const char * const file,
	3166	const unsigned line)
	3167	{
	3168	const char * key;
	3169
	3170	PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
	3171
	3172	if (ckWARN_d(WARN_DEPRECATED)) {
	3173
	3174	key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
	3175	if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
	3176	if (! PL_seen_deprecated_macro) {
	3177	PL_seen_deprecated_macro = newHV();
	3178	}
	3179	if (! hv_store(PL_seen_deprecated_macro, key,
	3180	strlen(key), &PL_sv_undef, 0))
	3181	{
	3182	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	3183	}
	3184
	3185	if (instr(file, "mathoms.c")) {
	3186	Perl_warner(aTHX_ WARN_DEPRECATED,
	3187	"In %s, line %d, starting in Perl v5.32, %s()"
	3188	" will be removed. Avoid this message by"
	3189	" converting to use %s().\n",
	3190	file, line, name, alternative);
	3191	}
	3192	else {
	3193	Perl_warner(aTHX_ WARN_DEPRECATED,
	3194	"In %s, line %d, starting in Perl v5.32, %s() will"
	3195	" require an additional parameter. Avoid this"
	3196	" message by converting to use %s().\n",
	3197	file, line, name, alternative);
	3198	}
	3199	}
	3200	}
	3201	}
	3202
	3203	bool
	3204	Perl__is_utf8_FOO(pTHX_ U8 classnum,
	3205	const U8 * const p,
	3206	const char * const name,
	3207	const char * const alternative,
	3208	const bool use_utf8,
	3209	const bool use_locale,
	3210	const char * const file,
	3211	const unsigned line)
	3212	{
	3213	PERL_ARGS_ASSERT__IS_UTF8_FOO;
	3214
	3215	warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
	3216
	3217	if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
	3218
	3219	switch (classnum) {
	3220	case _CC_WORDCHAR:
	3221	case _CC_DIGIT:
	3222	case _CC_ALPHA:
	3223	case _CC_LOWER:
	3224	case _CC_UPPER:
	3225	case _CC_PUNCT:
	3226	case _CC_PRINT:
	3227	case _CC_ALPHANUMERIC:
	3228	case _CC_GRAPH:
	3229	case _CC_CASED:
	3230
	3231	return is_utf8_common(p, PL_XPosix_ptrs[classnum]);
	3232
	3233	case _CC_SPACE:
	3234	return is_XPERLSPACE_high(p);
	3235	case _CC_BLANK:
	3236	return is_HORIZWS_high(p);
	3237	case _CC_XDIGIT:
	3238	return is_XDIGIT_high(p);
	3239	case _CC_CNTRL:
	3240	return 0;
	3241	case _CC_ASCII:
	3242	return 0;
	3243	case _CC_VERTSPACE:
	3244	return is_VERTWS_high(p);
	3245	case _CC_IDFIRST:
	3246	return is_utf8_common(p, PL_utf8_perl_idstart);
	3247	case _CC_IDCONT:
	3248	return is_utf8_common(p, PL_utf8_perl_idcont);
	3249	}
	3250	}
	3251
	3252	/* idcont is the same as wordchar below 256 */
	3253	if (classnum == _CC_IDCONT) {
	3254	classnum = _CC_WORDCHAR;
	3255	}
	3256	else if (classnum == _CC_IDFIRST) {
	3257	if (*p == '_') {
	3258	return TRUE;
	3259	}
	3260	classnum = _CC_ALPHA;
	3261	}
	3262
	3263	if (! use_locale) {
	3264	if (! use_utf8 \|\| UTF8_IS_INVARIANT(*p)) {
	3265	return _generic_isCC(*p, classnum);
	3266	}
	3267
	3268	return _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(p, (p + 1 )), classnum);
	3269	}
	3270	else {
	3271	if (! use_utf8 \|\| UTF8_IS_INVARIANT(*p)) {
	3272	return isFOO_lc(classnum, *p);
	3273	}
	3274
	3275	return isFOO_lc(classnum, EIGHT_BIT_UTF8_TO_NATIVE(p, (p + 1 )));
	3276	}
	3277
	3278	NOT_REACHED; /* NOTREACHED */
	3279	}
	3280
	3281	bool
	3282	Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
	3283	const U8 * const e)
	3284	{
	3285	PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
	3286
	3287	return is_utf8_common_with_len(p, e, PL_XPosix_ptrs[classnum]);
	3288	}
	3289
	3290	bool
	3291	Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 p, const U8 const e)
	3292	{
	3293	PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
	3294
	3295	return is_utf8_common_with_len(p, e, PL_utf8_perl_idstart);
	3296	}
	3297
	3298	bool
	3299	Perl__is_utf8_xidstart(pTHX_ const U8 *p)
	3300	{
	3301	PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
	3302
	3303	if (*p == '_')
	3304	return TRUE;
	3305	return is_utf8_common(p, PL_utf8_xidstart);
	3306	}
	3307
	3308	bool
	3309	Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 p, const U8 const e)
	3310	{
	3311	PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
	3312
	3313	return is_utf8_common_with_len(p, e, PL_utf8_perl_idcont);
	3314	}
	3315
	3316	bool
	3317	Perl__is_utf8_idcont(pTHX_ const U8 *p)
	3318	{
	3319	PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
	3320
	3321	return is_utf8_common(p, PL_utf8_idcont);
	3322	}
	3323
	3324	bool
	3325	Perl__is_utf8_xidcont(pTHX_ const U8 *p)
	3326	{
	3327	PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
	3328
	3329	return is_utf8_common(p, PL_utf8_xidcont);
	3330	}
	3331
	3332	bool
	3333	Perl__is_utf8_mark(pTHX_ const U8 *p)
	3334	{
	3335	PERL_ARGS_ASSERT__IS_UTF8_MARK;
	3336
	3337	return is_utf8_common(p, PL_utf8_mark);
	3338	}
	3339
	3340	STATIC UV
	3341	S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
	3342	U8* ustrp, STRLEN *lenp,
	3343	SV invlist, const int const invmap,
	3344	const unsigned int * const * const aux_tables,
	3345	const U8 * const aux_table_lengths,
	3346	const char * const normal)
	3347	{
	3348	STRLEN len = 0;
	3349
	3350	/* Change the case of code point 'uv1' whose UTF-8 representation (assumed
	3351	* by this routine to be valid) begins at 'p'. 'normal' is a string to use
	3352	* to name the new case in any generated messages, as a fallback if the
	3353	* operation being used is not available. The new case is given by the
	3354	* data structures in the remaining arguments.
	3355	*
	3356	* On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
	3357	* entire changed case string, and the return value is the first code point
	3358	* in that string */
	3359
	3360	PERL_ARGS_ASSERT__TO_UTF8_CASE;
	3361
	3362	/* For code points that don't change case, we already know that the output
	3363	* of this function is the unchanged input, so we can skip doing look-ups
	3364	* for them. Unfortunately the case-changing code points are scattered
	3365	* around. But there are some long consecutive ranges where there are no
	3366	* case changing code points. By adding tests, we can eliminate the lookup
	3367	* for all the ones in such ranges. This is currently done here only for
	3368	* just a few cases where the scripts are in common use in modern commerce
	3369	* (and scripts adjacent to those which can be included without additional
	3370	* tests). */
	3371
	3372	if (uv1 >= 0x0590) {
	3373	/* This keeps from needing further processing the code points most
	3374	* likely to be used in the following non-cased scripts: Hebrew,
	3375	* Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
	3376	* Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
	3377	* Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
	3378	if (uv1 < 0x10A0) {
	3379	goto cases_to_self;
	3380	}
	3381
	3382	/* The following largish code point ranges also don't have case
	3383	* changes, but khw didn't think they warranted extra tests to speed
	3384	* them up (which would slightly slow down everything else above them):
	3385	* 1100..139F Hangul Jamo, Ethiopic
	3386	* 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
	3387	* Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
	3388	* Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
	3389	* Combining Diacritical Marks Extended, Balinese,
	3390	* Sundanese, Batak, Lepcha, Ol Chiki
	3391	* 2000..206F General Punctuation
	3392	*/
	3393
	3394	if (uv1 >= 0x2D30) {
	3395
	3396	/* This keeps the from needing further processing the code points
	3397	* most likely to be used in the following non-cased major scripts:
	3398	* CJK, Katakana, Hiragana, plus some less-likely scripts.
	3399	*
	3400	* (0x2D30 above might have to be changed to 2F00 in the unlikely
	3401	* event that Unicode eventually allocates the unused block as of
	3402	* v8.0 2FE0..2FEF to code points that are cased. khw has verified
	3403	* that the test suite will start having failures to alert you
	3404	* should that happen) */
	3405	if (uv1 < 0xA640) {
	3406	goto cases_to_self;
	3407	}
	3408
	3409	if (uv1 >= 0xAC00) {
	3410	if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
	3411	if (ckWARN_d(WARN_SURROGATE)) {
	3412	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	3413	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	3414	"Operation \"%s\" returns its argument for"
	3415	" UTF-16 surrogate U+%04" UVXf, desc, uv1);
	3416	}
	3417	goto cases_to_self;
	3418	}
	3419
	3420	/* AC00..FAFF Catches Hangul syllables and private use, plus
	3421	* some others */
	3422	if (uv1 < 0xFB00) {
	3423	goto cases_to_self;
	3424	}
	3425
	3426	if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
	3427	if (UNLIKELY(uv1 > MAX_LEGAL_CP)) {
	3428	Perl_croak(aTHX_ cp_above_legal_max, uv1,
	3429	MAX_LEGAL_CP);
	3430	}
	3431	if (ckWARN_d(WARN_NON_UNICODE)) {
	3432	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	3433	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	3434	"Operation \"%s\" returns its argument for"
	3435	" non-Unicode code point 0x%04" UVXf, desc, uv1);
	3436	}
	3437	goto cases_to_self;
	3438	}
	3439	#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
	3440	if (UNLIKELY(uv1
	3441	> HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
	3442	{
	3443
	3444	/* As of Unicode 10.0, this means we avoid swash creation
	3445	* for anything beyond high Plane 1 (below emojis) */
	3446	goto cases_to_self;
	3447	}
	3448	#endif
	3449	}
	3450	}
	3451
	3452	/* Note that non-characters are perfectly legal, so no warning should
	3453	* be given. */
	3454	}
	3455
	3456	{
	3457	unsigned int i;
	3458	const unsigned int * cp_list;
	3459	U8 * d;
	3460
	3461	/* 'index' is guaranteed to be non-negative, as this is an inversion
	3462	* map that covers all possible inputs. See [perl #133365] */
	3463	SSize_t index = _invlist_search(invlist, uv1);
	3464	IV base = invmap[index];
	3465
	3466	/* The data structures are set up so that if 'base' is non-negative,
	3467	* the case change is 1-to-1; and if 0, the change is to itself */
	3468	if (base >= 0) {
	3469	IV lc;
	3470
	3471	if (base == 0) {
	3472	goto cases_to_self;
	3473	}
	3474
	3475	/* This computes, e.g. lc(H) as 'H - A + a', using the lc table */
	3476	lc = base + uv1 - invlist_array(invlist)[index];
	3477	*lenp = uvchr_to_utf8(ustrp, lc) - ustrp;
	3478	return lc;
	3479	}
	3480
	3481	/* Here 'base' is negative. That means the mapping is 1-to-many, and
	3482	* requires an auxiliary table look up. abs(base) gives the index into
	3483	* a list of such tables which points to the proper aux table. And a
	3484	* parallel list gives the length of each corresponding aux table. */
	3485	cp_list = aux_tables[-base];
	3486
	3487	/* Create the string of UTF-8 from the mapped-to code points */
	3488	d = ustrp;
	3489	for (i = 0; i < aux_table_lengths[-base]; i++) {
	3490	d = uvchr_to_utf8(d, cp_list[i]);
	3491	}
	3492	*d = '\0';
	3493	*lenp = d - ustrp;
	3494
	3495	return cp_list[0];
	3496	}
	3497
	3498	/* Here, there was no mapping defined, which means that the code point maps
	3499	* to itself. Return the inputs */
	3500	cases_to_self:
	3501	if (p) {
	3502	len = UTF8SKIP(p);
	3503	if (p != ustrp) { /* Don't copy onto itself */
	3504	Copy(p, ustrp, len, U8);
	3505	}
	3506	*lenp = len;
	3507	}
	3508	else {
	3509	*lenp = uvchr_to_utf8(ustrp, uv1) - ustrp;
	3510	}
	3511
	3512	return uv1;
	3513
	3514	}
	3515
	3516	Size_t
	3517	Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to,
	3518	const unsigned int ** remaining_folds_to)
	3519	{
	3520	/* Returns the count of the number of code points that fold to the input
	3521	* 'cp' (besides itself).
	3522	*
	3523	* If the return is 0, there is nothing else that folds to it, and
	3524	* 'first_folds_to' is set to 0, and 'remaining_folds_to' is set to NULL.
	3525	*
	3526	* If the return is 1, '*first_folds_to' is set to the single code point,
	3527	* and '*remaining_folds_to' is set to NULL.
	3528	*
	3529	* Otherwise, '*first_folds_to' is set to a code point, and
	3530	* '*remaining_fold_to' is set to an array that contains the others. The
	3531	* length of this array is the returned count minus 1.
	3532	*
	3533	* The reason for this convolution is to avoid having to deal with
	3534	* allocating and freeing memory. The lists are already constructed, so
	3535	* the return can point to them, but single code points aren't, so would
	3536	* need to be constructed if we didn't employ something like this API */
	3537
	3538	/* 'index' is guaranteed to be non-negative, as this is an inversion map
	3539	* that covers all possible inputs. See [perl #133365] */
	3540	SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
	3541	int base = _Perl_IVCF_invmap[index];
	3542
	3543	PERL_ARGS_ASSERT__INVERSE_FOLDS;
	3544
	3545	if (base == 0) { /* No fold */
	3546	*first_folds_to = 0;
	3547	*remaining_folds_to = NULL;
	3548	return 0;
	3549	}
	3550
	3551	#ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */
	3552
	3553	assert(base > 0);
	3554
	3555	#else
	3556
	3557	if (UNLIKELY(base < 0)) { /* Folds to more than one character */
	3558
	3559	/* The data structure is set up so that the absolute value of 'base' is
	3560	* an index into a table of pointers to arrays, with the array
	3561	* corresponding to the index being the list of code points that fold
	3562	* to 'cp', and the parallel array containing the length of the list
	3563	* array */
	3564	*first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
	3565	remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; / +1 excludes
	3566	*first_folds_to
	3567	*/
	3568	return IVCF_AUX_TABLE_lengths[-base];
	3569	}
	3570
	3571	#endif
	3572
	3573	/* Only the single code point. This works like 'fc(G) = G - A + a' */
	3574	*first_folds_to = base + cp - invlist_array(PL_utf8_foldclosures)[index];
	3575	*remaining_folds_to = NULL;
	3576	return 1;
	3577	}
	3578
	3579	STATIC UV
	3580	S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
	3581	U8* const ustrp, STRLEN *lenp)
	3582	{
	3583	/* This is called when changing the case of a UTF-8-encoded character above
	3584	* the Latin1 range, and the operation is in a non-UTF-8 locale. If the
	3585	* result contains a character that crosses the 255/256 boundary, disallow
	3586	* the change, and return the original code point. See L<perlfunc/lc> for
	3587	* why;
	3588	*
	3589	* p points to the original string whose case was changed; assumed
	3590	* by this routine to be well-formed
	3591	* result the code point of the first character in the changed-case string
	3592	* ustrp points to the changed-case string (<result> represents its
	3593	* first char)
	3594	* lenp points to the length of <ustrp> */
	3595
	3596	UV original; /* To store the first code point of <p> */
	3597
	3598	PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
	3599
	3600	assert(UTF8_IS_ABOVE_LATIN1(*p));
	3601
	3602	/* We know immediately if the first character in the string crosses the
	3603	* boundary, so can skip testing */
	3604	if (result > 255) {
	3605
	3606	/* Look at every character in the result; if any cross the
	3607	* boundary, the whole thing is disallowed */
	3608	U8* s = ustrp + UTF8SKIP(ustrp);
	3609	U8* e = ustrp + *lenp;
	3610	while (s < e) {
	3611	if (! UTF8_IS_ABOVE_LATIN1(*s)) {
	3612	goto bad_crossing;
	3613	}
	3614	s += UTF8SKIP(s);
	3615	}
	3616
	3617	/* Here, no characters crossed, result is ok as-is, but we warn. */
	3618	_CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
	3619	return result;
	3620	}
	3621
	3622	bad_crossing:
	3623
	3624	/* Failed, have to return the original */
	3625	original = valid_utf8_to_uvchr(p, lenp);
	3626
	3627	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3628	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3629	"Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
	3630	" locale; resolved to \"\\x{%" UVXf "}\".",
	3631	OP_DESC(PL_op),
	3632	original,
	3633	original);
	3634	Copy(p, ustrp, *lenp, char);
	3635	return original;
	3636	}
	3637
	3638	STATIC U32
	3639	S_check_and_deprecate(pTHX_ const U8 *p,
	3640	const U8 **e,
	3641	const unsigned int type, /* See below */
	3642	const bool use_locale, /* Is this a 'LC_'
	3643	macro call? */
	3644	const char * const file,
	3645	const unsigned line)
	3646	{
	3647	/* This is a temporary function to deprecate the unsafe calls to the case
	3648	* changing macros and functions. It keeps all the special stuff in just
	3649	* one place.
	3650	*
	3651	* It updates *e with the pointer to the end of the input string. If using
	3652	* the old-style macros, *e is NULL on input, and so this function assumes
	3653	* the input string is long enough to hold the entire UTF-8 sequence, and
	3654	* sets *e accordingly, but it then returns a flag to pass the
	3655	* utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
	3656	* using the full length if possible.
	3657	*
	3658	* It also does the assert that e > p when e is not NULL. This should be
	3659	* migrated to the callers when this function gets deleted.
	3660	*
	3661	* The 'type' parameter is used for the caller to specify which case
	3662	* changing function this is called from: */
	3663
	3664	# define DEPRECATE_TO_UPPER 0
	3665	# define DEPRECATE_TO_TITLE 1
	3666	# define DEPRECATE_TO_LOWER 2
	3667	# define DEPRECATE_TO_FOLD 3
	3668
	3669	U32 utf8n_flags = 0;
	3670	const char * name;
	3671	const char * alternative;
	3672
	3673	PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
	3674
	3675	if (*e == NULL) {
	3676	utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
	3677
	3678	/* strnlen() makes this function safe for the common case of
	3679	* NUL-terminated strings */
	3680	e = p + my_strnlen((char ) p, UTF8SKIP(p));
	3681
	3682	/* For mathoms.c calls, we use the function name we know is stored
	3683	* there. It could be part of a larger path */
	3684	if (type == DEPRECATE_TO_UPPER) {
	3685	name = instr(file, "mathoms.c")
	3686	? "to_utf8_upper"
	3687	: "toUPPER_utf8";
	3688	alternative = "toUPPER_utf8_safe";
	3689	}
	3690	else if (type == DEPRECATE_TO_TITLE) {
	3691	name = instr(file, "mathoms.c")
	3692	? "to_utf8_title"
	3693	: "toTITLE_utf8";
	3694	alternative = "toTITLE_utf8_safe";
	3695	}
	3696	else if (type == DEPRECATE_TO_LOWER) {
	3697	name = instr(file, "mathoms.c")
	3698	? "to_utf8_lower"
	3699	: "toLOWER_utf8";
	3700	alternative = "toLOWER_utf8_safe";
	3701	}
	3702	else if (type == DEPRECATE_TO_FOLD) {
	3703	name = instr(file, "mathoms.c")
	3704	? "to_utf8_fold"
	3705	: "toFOLD_utf8";
	3706	alternative = "toFOLD_utf8_safe";
	3707	}
	3708	else Perl_croak(aTHX_ "panic: Unexpected case change type");
	3709
	3710	warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
	3711	}
	3712	else {
	3713	assert (p < *e);
	3714	}
	3715
	3716	return utf8n_flags;
	3717	}
	3718
	3719	/* The process for changing the case is essentially the same for the four case
	3720	* change types, except there are complications for folding. Otherwise the
	3721	* difference is only which case to change to. To make sure that they all do
	3722	* the same thing, the bodies of the functions are extracted out into the
	3723	* following two macros. The functions are written with the same variable
	3724	* names, and these are known and used inside these macros. It would be
	3725	* better, of course, to have inline functions to do it, but since different
	3726	* macros are called, depending on which case is being changed to, this is not
	3727	* feasible in C (to khw's knowledge). Two macros are created so that the fold
	3728	* function can start with the common start macro, then finish with its special
	3729	* handling; while the other three cases can just use the common end macro.
	3730	*
	3731	* The algorithm is to use the proper (passed in) macro or function to change
	3732	* the case for code points that are below 256. The macro is used if using
	3733	* locale rules for the case change; the function if not. If the code point is
	3734	* above 255, it is computed from the input UTF-8, and another macro is called
	3735	* to do the conversion. If necessary, the output is converted to UTF-8. If
	3736	* using a locale, we have to check that the change did not cross the 255/256
	3737	* boundary, see check_locale_boundary_crossing() for further details.
	3738	*
	3739	* The macros are split with the correct case change for the below-256 case
	3740	* stored into 'result', and in the middle of an else clause for the above-255
	3741	* case. At that point in the 'else', 'result' is not the final result, but is
	3742	* the input code point calculated from the UTF-8. The fold code needs to
	3743	* realize all this and take it from there.
	3744	*
	3745	* If you read the two macros as sequential, it's easier to understand what's
	3746	* going on. */
	/* First half of the shared case-change body.  It expects the enclosing
	 * function to have 'p', 'e', 'ustrp', 'lenp', 'flags', 'result' and
	 * 'utf8n_flags' in scope.  It ends in the middle of an 'else' block, with
	 * 'result' holding the input code point computed from the UTF-8; the block is
	 * closed either by CASE_CHANGE_BODY_END or by the caller's own code. */
	#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func,   \
	                               L1_func_extra_param)                         \
	                                                                            \
	    if (flags & (locale_flags)) {                                           \
	        _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                 \
	        /* Treat a UTF-8 locale as not being in locale at all */            \
	        if (IN_UTF8_CTYPE_LOCALE) {                                         \
	            flags &= ~(locale_flags);                                       \
	        }                                                                   \
	    }                                                                       \
	                                                                            \
	    if (UTF8_IS_INVARIANT(*p)) {                                            \
	        if (flags & (locale_flags)) {                                       \
	            result = LC_L1_change_macro(*p);                                \
	        }                                                                   \
	        else {                                                              \
	            return L1_func(*p, ustrp, lenp, L1_func_extra_param);           \
	        }                                                                   \
	    }                                                                       \
	    else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {                       \
	        /* The two bytes themselves (not their addresses) are combined */   \
	        U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));                        \
	        if (flags & (locale_flags)) {                                       \
	            result = LC_L1_change_macro(c);                                 \
	        }                                                                   \
	        else {                                                              \
	            return L1_func(c, ustrp, lenp, L1_func_extra_param);            \
	        }                                                                   \
	    }                                                                       \
	    else {  /* malformed UTF-8 or ord above 255 */                          \
	        STRLEN len_result;                                                  \
	        result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY);    \
	        if (len_result == (STRLEN) -1) {                                    \
	            _force_out_malformed_utf8_message(p, e, utf8n_flags,            \
	                                                            1 /* Die */ );  \
	        }

	/* Second half of the shared case-change body, for everything but folding.
	 * It finishes the 'else' block left open by CASE_CHANGE_BODY_START, then
	 * handles the fall-through from the below-256 locale branches by converting
	 * 'result' back to UTF-8. */
	#define CASE_CHANGE_BODY_END(locale_flags, change_macro)                    \
	        result = change_macro(result, p, ustrp, lenp);                      \
	                                                                            \
	        if (flags & (locale_flags)) {                                       \
	            result = check_locale_boundary_crossing(p, result, ustrp, lenp);\
	        }                                                                   \
	        return result;                                                      \
	    }                                                                       \
	                                                                            \
	    /* Here, used locale rules.  Convert back to UTF-8 */                   \
	    if (UTF8_IS_INVARIANT(result)) {                                        \
	        *ustrp = (U8) result;                                               \
	        *lenp = 1;                                                          \
	    }                                                                       \
	    else {                                                                  \
	        *ustrp = UTF8_EIGHT_BIT_HI((U8) result);                            \
	        *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);                      \
	        *lenp = 2;                                                          \
	    }                                                                       \
	                                                                            \
	    return result;
	3804
	3805	/*
	3806	=for apidoc to_utf8_upper
	3807
	3808	Instead use L</toUPPER_utf8_safe>.
	3809
	3810	=cut */
	3811
	3812	/* Not currently externally documented, and subject to change:
	3813	* <flags> is set iff iff the rules from the current underlying locale are to
	3814	* be used. */
	3815
	3816	UV
	3817	Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
	3818	const U8 *e,
	3819	U8* ustrp,
	3820	STRLEN *lenp,
	3821	bool flags,
	3822	const char * const file,
	3823	const int line)
	3824	{
	3825	UV result;
	3826	const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
	3827	cBOOL(flags), file, line);
	3828
	3829	PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
	3830
	3831	/* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
	3832	/* 2nd char of uc(U+DF) is 'S' */
	3833	CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S');
	3834	CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
	3835	}
	3836
	3837	/*
	3838	=for apidoc to_utf8_title
	3839
	3840	Instead use L</toTITLE_utf8_safe>.
	3841
	3842	=cut */
	3843
	3844	/* Not currently externally documented, and subject to change:
	3845	* <flags> is set iff the rules from the current underlying locale are to be
	3846	* used. Since titlecase is not defined in POSIX, for other than a
	3847	* UTF-8 locale, uppercase is used instead for code points < 256.
	3848	*/
	3849
	3850	UV
	3851	Perl__to_utf8_title_flags(pTHX_ const U8 *p,
	3852	const U8 *e,
	3853	U8* ustrp,
	3854	STRLEN *lenp,
	3855	bool flags,
	3856	const char * const file,
	3857	const int line)
	3858	{
	3859	UV result;
	3860	const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
	3861	cBOOL(flags), file, line);
	3862
	3863	PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
	3864
	3865	/* 2nd char of ucfirst(U+DF) is 's' */
	3866	CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's');
	3867	CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
	3868	}
	3869
	3870	/*
	3871	=for apidoc to_utf8_lower
	3872
	3873	Instead use L</toLOWER_utf8_safe>.
	3874
	3875	=cut */
	3876
	3877	/* Not currently externally documented, and subject to change:
	3878	* <flags> is set iff iff the rules from the current underlying locale are to
	3879	* be used.
	3880	*/
	3881
	3882	UV
	3883	Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
	3884	const U8 *e,
	3885	U8* ustrp,
	3886	STRLEN *lenp,
	3887	bool flags,
	3888	const char * const file,
	3889	const int line)
	3890	{
	3891	UV result;
	3892	const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
	3893	cBOOL(flags), file, line);
	3894
	3895	PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
	3896
	3897	CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */)
	3898	CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
	3899	}
	3900
	3901	/*
	3902	=for apidoc to_utf8_fold
	3903
	3904	Instead use L</toFOLD_utf8_safe>.
	3905
	3906	=cut */
	3907
	3908	/* Not currently externally documented, and subject to change,
	3909	* in <flags>
	3910	* bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
	3911	* locale are to be used.
	3912	* bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
	3913	* otherwise simple folds
	3914	* bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
	3915	* prohibited
	3916	*/
	3917
	3918	UV
	3919	Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
	3920	const U8 *e,
	3921	U8* ustrp,
	3922	STRLEN *lenp,
	3923	U8 flags,
	3924	const char * const file,
	3925	const int line)
	3926	{
	3927	UV result;
	3928	const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
	3929	cBOOL(flags), file, line);
	3930
	3931	PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
	3932
	3933	/* These are mutually exclusive */
	3934	assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
	3935
	3936	assert(p != ustrp); /* Otherwise overwrites */
	3937
	3938	CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
	3939	((flags) & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII)));
	3940
	3941	result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
	3942
	3943	if (flags & FOLD_FLAGS_LOCALE) {
	3944
	3945	# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
	3946	# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
	3947	# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
	3948
	3949	/* Special case these two characters, as what normally gets
	3950	* returned under locale doesn't work */
	3951	if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
	3952	{
	3953	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3954	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3955	"Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
	3956	"resolved to \"\\x{17F}\\x{17F}\".");
	3957	goto return_long_s;
	3958	}
	3959	else
	3960	#endif
	3961	if (memBEGINs((char *) p, e - p, LONG_S_T))
	3962	{
	3963	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3964	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3965	"Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
	3966	"resolved to \"\\x{FB06}\".");
	3967	goto return_ligature_st;
	3968	}
	3969
	3970	#if UNICODE_MAJOR_VERSION == 3 \
	3971	&& UNICODE_DOT_VERSION == 0 \
	3972	&& UNICODE_DOT_DOT_VERSION == 1
	3973	# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
	3974
	3975	/* And special case this on this Unicode version only, for the same
	3976	* reaons the other two are special cased. They would cross the
	3977	* 255/256 boundary which is forbidden under /l, and so the code
	3978	* wouldn't catch that they are equivalent (which they are only in
	3979	* this release) */
	3980	else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
	3981	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	3982	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	3983	"Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
	3984	"resolved to \"\\x{0131}\".");
	3985	goto return_dotless_i;
	3986	}
	3987	#endif
	3988
	3989	return check_locale_boundary_crossing(p, result, ustrp, lenp);
	3990	}
	3991	else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
	3992	return result;
	3993	}
	3994	else {
	3995	/* This is called when changing the case of a UTF-8-encoded
	3996	* character above the ASCII range, and the result should not
	3997	* contain an ASCII character. */
	3998
	3999	UV original; /* To store the first code point of <p> */
	4000
	4001	/* Look at every character in the result; if any cross the
	4002	* boundary, the whole thing is disallowed */
	4003	U8* s = ustrp;
	4004	U8* e = ustrp + *lenp;
	4005	while (s < e) {
	4006	if (isASCII(*s)) {
	4007	/* Crossed, have to return the original */
	4008	original = valid_utf8_to_uvchr(p, lenp);
	4009
	4010	/* But in these instances, there is an alternative we can
	4011	* return that is valid */
	4012	if (original == LATIN_SMALL_LETTER_SHARP_S
	4013	#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
	4014	\|\| original == LATIN_CAPITAL_LETTER_SHARP_S
	4015	#endif
	4016	) {
	4017	goto return_long_s;
	4018	}
	4019	else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
	4020	goto return_ligature_st;
	4021	}
	4022	#if UNICODE_MAJOR_VERSION == 3 \
	4023	&& UNICODE_DOT_VERSION == 0 \
	4024	&& UNICODE_DOT_DOT_VERSION == 1
	4025
	4026	else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
	4027	goto return_dotless_i;
	4028	}
	4029	#endif
	4030	Copy(p, ustrp, *lenp, char);
	4031	return original;
	4032	}
	4033	s += UTF8SKIP(s);
	4034	}
	4035
	4036	/* Here, no characters crossed, result is ok as-is */
	4037	return result;
	4038	}
	4039	}
	4040
	4041	/* Here, used locale rules. Convert back to UTF-8 */
	4042	if (UTF8_IS_INVARIANT(result)) {
	4043	*ustrp = (U8) result;
	4044	*lenp = 1;
	4045	}
	4046	else {
	4047	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
	4048	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
	4049	*lenp = 2;
	4050	}
	4051
	4052	return result;
	4053
	4054	return_long_s:
	4055	/* Certain folds to 'ss' are prohibited by the options, but they do allow
	4056	* folds to a string of two of these characters. By returning this
	4057	* instead, then, e.g.,
	4058	* fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
	4059	* works. */
	4060
	4061	lenp = 2 sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
	4062	Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
	4063	ustrp, *lenp, U8);
	4064	return LATIN_SMALL_LETTER_LONG_S;
	4065
	4066	return_ligature_st:
	4067	/* Two folds to 'st' are prohibited by the options; instead we pick one and
	4068	* have the other one fold to it */
	4069
	4070	*lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
	4071	Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
	4072	return LATIN_SMALL_LIGATURE_ST;
	4073
	4074	#if UNICODE_MAJOR_VERSION == 3 \
	4075	&& UNICODE_DOT_VERSION == 0 \
	4076	&& UNICODE_DOT_DOT_VERSION == 1
	4077
	4078	return_dotless_i:
	4079	*lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
	4080	Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
	4081	return LATIN_SMALL_LETTER_DOTLESS_I;
	4082
	4083	#endif
	4084
	4085	}
	4086
	4087	/* Note:
	4088	* Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
	4089	* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
	4090	* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
	4091	*/
	4092
	4093	SV*
	4094	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
	4095	I32 minbits, I32 none)
	4096	{
	4097	PERL_ARGS_ASSERT_SWASH_INIT;
	4098
	4099	/* Returns a copy of a swash initiated by the called function. This is the
	4100	* public interface, and returning a copy prevents others from doing
	4101	* mischief on the original */
	4102
	4103	return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none,
	4104	NULL, NULL));
	4105	}
	4106
	4107	SV*
	4108	Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
	4109	I32 minbits, I32 none, SV* invlist,
	4110	U8* const flags_p)
	4111	{
	4112
	4113	/*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
	4114	* use the following define */
	4115
	4116	#define CORE_SWASH_INIT_RETURN(x) \
	4117	PL_curpm= old_PL_curpm; \
	4118	return x
	4119
	4120	/* Initialize and return a swash, creating it if necessary. It does this
	4121	* by calling utf8_heavy.pl in the general case. The returned value may be
	4122	* the swash's inversion list instead if the input parameters allow it.
	4123	* Which is returned should be immaterial to callers, as the only
	4124	* operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
	4125	* and swash_to_invlist() handle both these transparently.
	4126	*
	4127	* This interface should only be used by functions that won't destroy or
	4128	* adversely change the swash, as doing so affects all other uses of the
	4129	* swash in the program; the general public should use 'Perl_swash_init'
	4130	* instead.
	4131	*
	4132	* pkg is the name of the package that <name> should be in.
	4133	* name is the name of the swash to find. Typically it is a Unicode
	4134	* property name, including user-defined ones
	4135	* listsv is a string to initialize the swash with. It must be of the form
	4136	* documented as the subroutine return value in
	4137	* L<perlunicode/User-Defined Character Properties>
	4138	* minbits is the number of bits required to represent each data element.
	4139	* It is '1' for binary properties.
	4140	* none I (khw) do not understand this one, but it is used only in tr///.
	4141	* invlist is an inversion list to initialize the swash with (or NULL)
	4142	* flags_p if non-NULL is the address of various input and output flag bits
	4143	* to the routine, as follows: ('I' means is input to the routine;
	4144	* 'O' means output from the routine. Only flags marked O are
	4145	* meaningful on return.)
	4146	* _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
	4147	* came from a user-defined property. (I O)
	4148	* _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
	4149	* when the swash cannot be located, to simply return NULL. (I)
	4150	* _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
	4151	* return of an inversion list instead of a swash hash if this routine
	4152	* thinks that would result in faster execution of swash_fetch() later
	4153	* on. (I)
	4154	*
	4155	* Thus there are three possible inputs to find the swash: <name>,
	4156	* <listsv>, and <invlist>. At least one must be specified. The result
	4157	* will be the union of the specified ones, although <listsv>'s various
	4158	* actions can intersect, etc. what <name> gives. To avoid going out to
	4159	* disk at all, <invlist> should specify completely what the swash should
	4160	* have, and <listsv> should be &PL_sv_undef and <name> should be "".
	4161	*
	4162	* <invlist> is only valid for binary properties */
	4163
	4164	PMOP old_PL_curpm= PL_curpm; / save away the old PL_curpm */
	4165
	4166	SV* retval = &PL_sv_undef;
	4167	HV* swash_hv = NULL;
	4168	const bool use_invlist= (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST);
	4169
	4170	assert(listsv != &PL_sv_undef \|\| strNE(name, "") \|\| invlist);
	4171	assert(! invlist \|\| minbits == 1);
	4172
	4173	PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the
	4174	regex that triggered the swash init and the swash init
	4175	perl logic itself. See perl #122747 */
	4176
	4177	/* If data was passed in to go out to utf8_heavy to find the swash of, do
	4178	* so */
	4179	if (listsv != &PL_sv_undef \|\| strNE(name, "")) {
	4180	dSP;
	4181	const size_t pkg_len = strlen(pkg);
	4182	const size_t name_len = strlen(name);
	4183	HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
	4184	SV* errsv_save;
	4185	GV *method;
	4186
	4187	PERL_ARGS_ASSERT__CORE_SWASH_INIT;
	4188
	4189	PUSHSTACKi(PERLSI_MAGIC);
	4190	ENTER;
	4191	SAVEHINTS();
	4192	save_re_context();
	4193	/* We might get here via a subroutine signature which uses a utf8
	4194	* parameter name, at which point PL_subname will have been set
	4195	* but not yet used. */
	4196	save_item(PL_subname);
	4197	if (PL_parser && PL_parser->error_count)
	4198	SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
	4199	method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
	4200	if (!method) { /* demand load UTF-8 */
	4201	ENTER;
	4202	if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
	4203	GvSV(PL_errgv) = NULL;
	4204	#ifndef NO_TAINT_SUPPORT
	4205	/* It is assumed that callers of this routine are not passing in
	4206	* any user derived data. */
	4207	/* Need to do this after save_re_context() as it will set
	4208	* PL_tainted to 1 while saving $1 etc (see the code after getrx:
	4209	* in Perl_magic_get). Even line to create errsv_save can turn on
	4210	* PL_tainted. */
	4211	SAVEBOOL(TAINT_get);
	4212	TAINT_NOT;
	4213	#endif
	4214	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	4215	NULL);
	4216	{
	4217	/* Not ERRSV, as there is no need to vivify a scalar we are
	4218	about to discard. */
	4219	SV * const errsv = GvSV(PL_errgv);
	4220	if (!SvTRUE(errsv)) {
	4221	GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
	4222	SvREFCNT_dec(errsv);
	4223	}
	4224	}
	4225	LEAVE;
	4226	}
	4227	SPAGAIN;
	4228	PUSHMARK(SP);
	4229	EXTEND(SP,5);
	4230	mPUSHp(pkg, pkg_len);
	4231	mPUSHp(name, name_len);
	4232	PUSHs(listsv);
	4233	mPUSHi(minbits);
	4234	mPUSHi(none);
	4235	PUTBACK;
	4236	if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
	4237	GvSV(PL_errgv) = NULL;
	4238	/* If we already have a pointer to the method, no need to use
	4239	* call_method() to repeat the lookup. */
	4240	if (method
	4241	? call_sv(MUTABLE_SV(method), G_SCALAR)
	4242	: call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR \| G_METHOD))
	4243	{
	4244	retval = *PL_stack_sp--;
	4245	SvREFCNT_inc(retval);
	4246	}
	4247	{
	4248	/* Not ERRSV. See above. */
	4249	SV * const errsv = GvSV(PL_errgv);
	4250	if (!SvTRUE(errsv)) {
	4251	GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
	4252	SvREFCNT_dec(errsv);
	4253	}
	4254	}
	4255	LEAVE;
	4256	POPSTACK;
	4257	if (IN_PERL_COMPILETIME) {
	4258	CopHINTS_set(PL_curcop, PL_hints);
	4259	}
	4260	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	4261	if (SvPOK(retval)) {
	4262
	4263	/* If caller wants to handle missing properties, let them */
	4264	if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
	4265	CORE_SWASH_INIT_RETURN(NULL);
	4266	}
	4267	Perl_croak(aTHX_
	4268	"Can't find Unicode property definition \"%" SVf "\"",
	4269	SVfARG(retval));
	4270	NOT_REACHED; /* NOTREACHED */
	4271	}
	4272	}
	4273	} /* End of calling the module to find the swash */
	4274
	4275	/* If this operation fetched a swash, and we will need it later, get it */
	4276	if (retval != &PL_sv_undef
	4277	&& (minbits == 1 \|\| (flags_p
	4278	&& ! (*flags_p
	4279	& _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
	4280	{
	4281	swash_hv = MUTABLE_HV(SvRV(retval));
	4282
	4283	/* If we don't already know that there is a user-defined component to
	4284	* this swash, and the user has indicated they wish to know if there is
	4285	* one (by passing <flags_p>), find out */
	4286	if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
	4287	SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
	4288	if (user_defined && SvUV(*user_defined)) {
	4289	*flags_p \|= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
	4290	}
	4291	}
	4292	}
	4293
	4294	/* Make sure there is an inversion list for binary properties */
	4295	if (minbits == 1) {
	4296	SV** swash_invlistsvp = NULL;
	4297	SV* swash_invlist = NULL;
	4298	bool invlist_in_swash_is_valid = FALSE;
	4299	bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
	4300	an unclaimed reference count */
	4301
	4302	/* If this operation fetched a swash, get its already existing
	4303	* inversion list, or create one for it */
	4304
	4305	if (swash_hv) {
	4306	swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
	4307	if (swash_invlistsvp) {
	4308	swash_invlist = *swash_invlistsvp;
	4309	invlist_in_swash_is_valid = TRUE;
	4310	}
	4311	else {
	4312	swash_invlist = _swash_to_invlist(retval);
	4313	swash_invlist_unclaimed = TRUE;
	4314	}
	4315	}
	4316
	4317	/* If an inversion list was passed in, have to include it */
	4318	if (invlist) {
	4319
	4320	/* Any fetched swash will by now have an inversion list in it;
	4321	* otherwise <swash_invlist> will be NULL, indicating that we
	4322	* didn't fetch a swash */
	4323	if (swash_invlist) {
	4324
	4325	/* Add the passed-in inversion list, which invalidates the one
	4326	* already stored in the swash */
	4327	invlist_in_swash_is_valid = FALSE;
	4328	SvREADONLY_off(swash_invlist); /* Turned on again below */
	4329	_invlist_union(invlist, swash_invlist, &swash_invlist);
	4330	}
	4331	else {
	4332
	4333	/* Here, there is no swash already. Set up a minimal one, if
	4334	* we are going to return a swash */
	4335	if (! use_invlist) {
	4336	swash_hv = newHV();
	4337	retval = newRV_noinc(MUTABLE_SV(swash_hv));
	4338	}
	4339	swash_invlist = invlist;
	4340	}
	4341	}
	4342
	4343	/* Here, we have computed the union of all the passed-in data. It may
	4344	* be that there was an inversion list in the swash which didn't get
	4345	* touched; otherwise save the computed one */
	4346	if (! invlist_in_swash_is_valid && ! use_invlist) {
	4347	if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
	4348	{
	4349	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	4350	}
	4351	/* We just stole a reference count. */
	4352	if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
	4353	else SvREFCNT_inc_simple_void_NN(swash_invlist);
	4354	}
	4355
	4356	/* The result is immutable. Forbid attempts to change it. */
	4357	SvREADONLY_on(swash_invlist);
	4358
	4359	if (use_invlist) {
	4360	SvREFCNT_dec(retval);
	4361	if (!swash_invlist_unclaimed)
	4362	SvREFCNT_inc_simple_void_NN(swash_invlist);
	4363	retval = newRV_noinc(swash_invlist);
	4364	}
	4365	}
	4366
	4367	CORE_SWASH_INIT_RETURN(retval);
	4368	#undef CORE_SWASH_INIT_RETURN
	4369	}
	4370
	4371
	4372	/* This API is wrong for special case conversions since we may need to
	4373	* return several Unicode characters for a single Unicode character
	4374	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	4375	* the lower-level routine, and it is similarly broken for returning
	4376	* multiple values. --jhi
	4377	* For those, you should use S__to_utf8_case() instead */
	4378	/* Now SWASHGET is recast into S_swatch_get in this file. */
	4379
	4380	/* Note:
	4381	* Returns the value of property/mapping C<swash> for the first character
	4382	* of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
	4383	* assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
	4384	* is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
	4385	*
	4386	* A "swash" is a hash which contains initially the keys/values set up by
	4387	* SWASHNEW. The purpose is to be able to completely represent a Unicode
	4388	* property for all possible code points. Things are stored in a compact form
	4389	* (see utf8_heavy.pl) so that calculation is required to find the actual
	4390	* property value for a given code point. As code points are looked up, new
	4391	* key/value pairs are added to the hash, so that the calculation doesn't have
	4392	* to ever be re-done. Further, each calculation is done, not just for the
	4393	* desired one, but for a whole block of code points adjacent to that one.
	4394	* For binary properties on ASCII machines, the block is usually for 64 code
	4395	* points, starting with a code point evenly divisible by 64. Thus if the
	4396	* property value for code point 257 is requested, the code goes out and
	4397	* calculates the property values for all 64 code points between 256 and 319,
	4398	* and stores these as a single 64-bit long bit vector, called a "swatch",
	4399	* under the key for code point 256. The key is the UTF-8 encoding for code
	4400	* point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
	4401	* for a code point is 13 bytes, the key will be 12 bytes long. If the value
	4402	* for code point 258 is then requested, this code realizes that it would be
	4403	* stored under the key for 256, and would find that value and extract the
	4404	* relevant bit, offset from 256.
	4405	*
	4406	* Non-binary properties are stored in as many bits as necessary to represent
	4407	* their values (32 currently, though the code is more general than that), not
	4408	* as single bits, but the principle is the same: the value for each key is a
	4409	* vector that encompasses the property values for all code points whose UTF-8
	4410	* representations are represented by the key. That is, for all code points
	4411	* whose UTF-8 representations are length N bytes, and the key is the first N-1
	4412	* bytes of that.
	4413	*/
	4414	UV
	4415	Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
	4416	{
	4417	HV *const hv = MUTABLE_HV(SvRV(swash));
	4418	U32 klen;
	4419	U32 off;
	4420	STRLEN slen = 0;
	4421	STRLEN needents;
	4422	const U8 *tmps = NULL;
	4423	SV *swatch;
	4424	const U8 c = *ptr;
	4425
	4426	PERL_ARGS_ASSERT_SWASH_FETCH;
	4427
	4428	/* If it really isn't a hash, it isn't really swash; must be an inversion
	4429	* list */
	4430	if (SvTYPE(hv) != SVt_PVHV) {
	4431	return _invlist_contains_cp((SV*)hv,
	4432	(do_utf8)
	4433	? valid_utf8_to_uvchr(ptr, NULL)
	4434	: c);
	4435	}
	4436
	4437	/* We store the values in a "swatch" which is a vec() value in a swash
	4438	* hash. Code points 0-255 are a single vec() stored with key length
	4439	* (klen) 0. All other code points have a UTF-8 representation
	4440	* 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
	4441	* share 0xAA..0xYY, which is the key in the hash to that vec. So the key
	4442	* length for them is the length of the encoded char - 1. ptr[klen] is the
	4443	* final byte in the sequence representing the character */
	4444	if (!do_utf8 \|\| UTF8_IS_INVARIANT(c)) {
	4445	klen = 0;
	4446	needents = 256;
	4447	off = c;
	4448	}
	4449	else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	4450	klen = 0;
	4451	needents = 256;
	4452	off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
	4453	}
	4454	else {
	4455	klen = UTF8SKIP(ptr) - 1;
	4456
	4457	/* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
	4458	* the vec is the final byte in the sequence. (In EBCDIC this is
	4459	* converted to I8 to get consecutive values.) To help you visualize
	4460	* all this:
	4461	* Straight 1047 After final byte
	4462	* UTF-8 UTF-EBCDIC I8 transform
	4463	* U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
	4464	* U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
	4465	* ...
	4466	* U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
	4467	* U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
	4468	* ...
	4469	* U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
	4470	* U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
	4471	* ...
	4472	* U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
	4473	* U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
	4474	* ...
	4475	* U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
	4476	* U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
	4477	*
	4478	* (There are no discontinuities in the elided (...) entries.)
	4479	* The UTF-8 key for these 33 code points is '\xD0' (which also is the
	4480	* key for the next 31, up through U+043F, whose UTF-8 final byte is
	4481	* \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
	4482	* The final UTF-8 byte, which ranges between \x80 and \xBF, is an
	4483	* index into the vec() swatch (after subtracting 0x80, which we
	4484	* actually do with an '&').
	4485	* In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
	4486	* code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
	4487	* dicontinuities which go away by transforming it into I8, and we
	4488	* effectively subtract 0xA0 to get the index. */
	4489	needents = (1 << UTF_ACCUMULATION_SHIFT);
	4490	off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
	4491	}
	4492
	4493	/*
	4494	* This single-entry cache saves about 1/3 of the UTF-8 overhead in test
	4495	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	4496	* it's nothing to sniff at.) Pity we usually come through at least
	4497	* two function calls to get here...
	4498	*
	4499	* NB: this code assumes that swatches are never modified, once generated!
	4500	*/
	4501
	4502	if (hv == PL_last_swash_hv &&
	4503	klen == PL_last_swash_klen &&
	4504	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	4505	{
	4506	tmps = PL_last_swash_tmps;
	4507	slen = PL_last_swash_slen;
	4508	}
	4509	else {
	4510	/* Try our second-level swatch cache, kept in a hash. */
	4511	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	4512
	4513	/* If not cached, generate it via swatch_get */
	4514	if (!svp \|\| !SvPOK(*svp)
	4515	\|\| !(tmps = (const U8)SvPV_const(svp, slen)))
	4516	{
	4517	if (klen) {
	4518	const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
	4519	swatch = swatch_get(swash,
	4520	code_point & ~((UV)needents - 1),
	4521	needents);
	4522	}
	4523	else { /* For the first 256 code points, the swatch has a key of
	4524	length 0 */
	4525	swatch = swatch_get(swash, 0, needents);
	4526	}
	4527
	4528	if (IN_PERL_COMPILETIME)
	4529	CopHINTS_set(PL_curcop, PL_hints);
	4530
	4531	svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
	4532
	4533	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
	4534	\|\| (slen << 3) < needents)
	4535	Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
	4536	"svp=%p, tmps=%p, slen=%" UVuf ", needents=%" UVuf,
	4537	svp, tmps, (UV)slen, (UV)needents);
	4538	}
	4539
	4540	PL_last_swash_hv = hv;
	4541	assert(klen <= sizeof(PL_last_swash_key));
	4542	PL_last_swash_klen = (U8)klen;
	4543	/* FIXME change interpvar.h? */
	4544	PL_last_swash_tmps = (U8 *) tmps;
	4545	PL_last_swash_slen = slen;
	4546	if (klen)
	4547	Copy(ptr, PL_last_swash_key, klen, U8);
	4548	}
	4549
	4550	switch ((int)((slen << 3) / needents)) {
	4551	case 1:
	4552	return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
	4553	case 8:
	4554	return ((UV) tmps[off]);
	4555	case 16:
	4556	off <<= 1;
	4557	return
	4558	((UV) tmps[off ] << 8) +
	4559	((UV) tmps[off + 1]);
	4560	case 32:
	4561	off <<= 2;
	4562	return
	4563	((UV) tmps[off ] << 24) +
	4564	((UV) tmps[off + 1] << 16) +
	4565	((UV) tmps[off + 2] << 8) +
	4566	((UV) tmps[off + 3]);
	4567	}
	4568	Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
	4569	"slen=%" UVuf ", needents=%" UVuf, (UV)slen, (UV)needents);
	4570	NORETURN_FUNCTION_END;
	4571	}
	4572
	4573	/* Read a single line of the main body of the swash input text. These are of
	4574	* the form:
	4575	* 0053 0056 0073
	4576	* where each number is hex. The first two numbers form the minimum and
	4577	* maximum of a range, and the third is the value associated with the range.
	4578	* Not all swashes should have a third number
	4579	*
	4580	* On input: l points to the beginning of the line to be examined; it points
	4581	* to somewhere in the string of the whole input text, and is
	4582	* terminated by a \n or the null string terminator.
	4583	* lend points to the null terminator of that string
	4584	* wants_value is non-zero if the swash expects a third number
	4585	* typestr is the name of the swash's mapping, like 'ToLower'
	4586	* On output: min, max, and *val are set to the values read from the line.
	4587	* returns a pointer just beyond the line examined. If there was no
	4588	* valid min number on the line, returns lend+1
	4589	*/
	4590
	4591	STATIC U8*
	4592	S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
	4593	const bool wants_value, const U8* const typestr)
	4594	{
	4595	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
	4596	STRLEN numlen; /* Length of the number */
	4597	I32 flags = PERL_SCAN_SILENT_ILLDIGIT
	4598	\| PERL_SCAN_DISALLOW_PREFIX
	4599	\| PERL_SCAN_SILENT_NON_PORTABLE;
	4600
	4601	/* nl points to the next \n in the scan */
	4602	U8* const nl = (U8*)memchr(l, '\n', lend - l);
	4603
	4604	PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
	4605
	4606	/* Get the first number on the line: the range minimum */
	4607	numlen = lend - l;
	4608	min = grok_hex((char )l, &numlen, &flags, NULL);
	4609	max = min; /* So can never return without setting max */
	4610	if (numlen) /* If found a hex number, position past it */
	4611	l += numlen;
	4612	else if (nl) { /* Else, go handle next line, if any */
	4613	return nl + 1; /* 1 is length of "\n" */
	4614	}
	4615	else { /* Else, no next line */
	4616	return lend + 1; /* to LIST's end at which \n is not found */
	4617	}
	4618
	4619	/* The max range value follows, separated by a BLANK */
	4620	if (isBLANK(*l)) {
	4621	++l;
	4622	flags = PERL_SCAN_SILENT_ILLDIGIT
	4623	\| PERL_SCAN_DISALLOW_PREFIX
	4624	\| PERL_SCAN_SILENT_NON_PORTABLE;
	4625	numlen = lend - l;
	4626	max = grok_hex((char )l, &numlen, &flags, NULL);
	4627	if (numlen)
	4628	l += numlen;
	4629	else /* If no value here, it is a single element range */
	4630	max = min;
	4631
	4632	/* Non-binary tables have a third entry: what the first element of the
	4633	* range maps to. The map for those currently read here is in hex */
	4634	if (wants_value) {
	4635	if (isBLANK(*l)) {
	4636	++l;
	4637	flags = PERL_SCAN_SILENT_ILLDIGIT
	4638	\| PERL_SCAN_DISALLOW_PREFIX
	4639	\| PERL_SCAN_SILENT_NON_PORTABLE;
	4640	numlen = lend - l;
	4641	val = grok_hex((char )l, &numlen, &flags, NULL);
	4642	if (numlen)
	4643	l += numlen;
	4644	else
	4645	*val = 0;
	4646	}
	4647	else {
	4648	*val = 0;
	4649	if (typeto) {
	4650	/* diag_listed_as: To%s: illegal mapping '%s' */
	4651	Perl_croak(aTHX_ "%s: illegal mapping '%s'",
	4652	typestr, l);
	4653	}
	4654	}
	4655	}
	4656	else
	4657	val = 0; / bits == 1, then any val should be ignored */
	4658	}
	4659	else { /* Nothing following range min, should be single element with no
	4660	mapping expected */
	4661	if (wants_value) {
	4662	*val = 0;
	4663	if (typeto) {
	4664	/* diag_listed_as: To%s: illegal mapping '%s' */
	4665	Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
	4666	}
	4667	}
	4668	else
	4669	val = 0; / bits == 1, then val should be ignored */
	4670	}
	4671
	4672	/* Position to next line if any, or EOF */
	4673	if (nl)
	4674	l = nl + 1;
	4675	else
	4676	l = lend;
	4677
	4678	return l;
	4679	}
	4680
	4681	/* Note:
	4682	* Returns a swatch (a bit vector string) for a code point sequence
	4683	* that starts from the value C<start> and comprises the number C<span>.
	4684	* A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
	4685	* Should be used via swash_fetch, which will cache the swatch in C<swash>.
	4686	*/
	4687	STATIC SV*
	4688	S_swatch_get(pTHX_ SV* swash, UV start, UV span)
	4689	{
	4690	SV *swatch;
	4691	U8 l, lend, x, xend, s, send;
	4692	STRLEN lcur, xcur, scur;
	4693	HV *const hv = MUTABLE_HV(SvRV(swash));
	4694	SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
	4695
	4696	SV** listsvp = NULL; /* The string containing the main body of the table */
	4697	SV** extssvp = NULL;
	4698	SV** invert_it_svp = NULL;
	4699	U8* typestr = NULL;
	4700	STRLEN bits;
	4701	STRLEN octets; /* if bits == 1, then octets == 0 */
	4702	UV none;
	4703	UV end = start + span;
	4704
	4705	if (invlistsvp == NULL) {
	4706	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	4707	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	4708	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	4709	extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	4710	listsvp = hv_fetchs(hv, "LIST", FALSE);
	4711	invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	4712
	4713	bits = SvUV(*bitssvp);
	4714	none = SvUV(*nonesvp);
	4715	typestr = (U8)SvPV_nolen(typesvp);
	4716	}
	4717	else {
	4718	bits = 1;
	4719	none = 0;
	4720	}
	4721	octets = bits >> 3; /* if bits == 1, then octets == 0 */
	4722
	4723	PERL_ARGS_ASSERT_SWATCH_GET;
	4724
	4725	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	4726	Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %" UVuf,
	4727	(UV)bits);
	4728	}
	4729
	4730	/* If overflowed, use the max possible */
	4731	if (end < start) {
	4732	end = UV_MAX;
	4733	span = end - start;
	4734	}
	4735
	4736	/* create and initialize $swatch */
	4737	scur = octets ? (span * octets) : (span + 7) / 8;
	4738	swatch = newSV(scur);
	4739	SvPOK_on(swatch);
	4740	s = (U8*)SvPVX(swatch);
	4741	if (octets && none) {
	4742	const U8* const e = s + scur;
	4743	while (s < e) {
	4744	if (bits == 8)
	4745	*s++ = (U8)(none & 0xff);
	4746	else if (bits == 16) {
	4747	*s++ = (U8)((none >> 8) & 0xff);
	4748	*s++ = (U8)( none & 0xff);
	4749	}
	4750	else if (bits == 32) {
	4751	*s++ = (U8)((none >> 24) & 0xff);
	4752	*s++ = (U8)((none >> 16) & 0xff);
	4753	*s++ = (U8)((none >> 8) & 0xff);
	4754	*s++ = (U8)( none & 0xff);
	4755	}
	4756	}
	4757	*s = '\0';
	4758	}
	4759	else {
	4760	(void)memzero((U8*)s, scur + 1);
	4761	}
	4762	SvCUR_set(swatch, scur);
	4763	s = (U8*)SvPVX(swatch);
	4764
	4765	if (invlistsvp) { /* If has an inversion list set up use that */
	4766	_invlist_populate_swatch(*invlistsvp, start, end, s);
	4767	return swatch;
	4768	}
	4769
	4770	/* read $swash->{LIST} */
	4771	l = (U8)SvPV(listsvp, lcur);
	4772	lend = l + lcur;
	4773	while (l < lend) {
	4774	UV min, max, val, upper;
	4775	l = swash_scan_list_line(l, lend, &min, &max, &val,
	4776	cBOOL(octets), typestr);
	4777	if (l > lend) {
	4778	break;
	4779	}
	4780
	4781	/* If looking for something beyond this range, go try the next one */
	4782	if (max < start)
	4783	continue;
	4784
	4785	/* <end> is generally 1 beyond where we want to set things, but at the
	4786	* platform's infinity, where we can't go any higher, we want to
	4787	* include the code point at <end> */
	4788	upper = (max < end)
	4789	? max
	4790	: (max != UV_MAX \|\| end != UV_MAX)
	4791	? end - 1
	4792	: end;
	4793
	4794	if (octets) {
	4795	UV key;
	4796	if (min < start) {
	4797	if (!none \|\| val < none) {
	4798	val += start - min;
	4799	}
	4800	min = start;
	4801	}
	4802	for (key = min; key <= upper; key++) {
	4803	STRLEN offset;
	4804	/* offset must be non-negative (start <= min <= key < end) */
	4805	offset = octets * (key - start);
	4806	if (bits == 8)
	4807	s[offset] = (U8)(val & 0xff);
	4808	else if (bits == 16) {
	4809	s[offset ] = (U8)((val >> 8) & 0xff);
	4810	s[offset + 1] = (U8)( val & 0xff);
	4811	}
	4812	else if (bits == 32) {
	4813	s[offset ] = (U8)((val >> 24) & 0xff);
	4814	s[offset + 1] = (U8)((val >> 16) & 0xff);
	4815	s[offset + 2] = (U8)((val >> 8) & 0xff);
	4816	s[offset + 3] = (U8)( val & 0xff);
	4817	}
	4818
	4819	if (!none \|\| val < none)
	4820	++val;
	4821	}
	4822	}
	4823	else { /* bits == 1, then val should be ignored */
	4824	UV key;
	4825	if (min < start)
	4826	min = start;
	4827
	4828	for (key = min; key <= upper; key++) {
	4829	const STRLEN offset = (STRLEN)(key - start);
	4830	s[offset >> 3] \|= 1 << (offset & 7);
	4831	}
	4832	}
	4833	} /* while */
	4834
	4835	/* Invert if the data says it should be. Assumes that bits == 1 */
	4836	if (invert_it_svp && SvUV(*invert_it_svp)) {
	4837
	4838	/* Unicode properties should come with all bits above PERL_UNICODE_MAX
	4839	* be 0, and their inversion should also be 0, as we don't succeed any
	4840	* Unicode property matches for non-Unicode code points */
	4841	if (start <= PERL_UNICODE_MAX) {
	4842
	4843	/* The code below assumes that we never cross the
	4844	* Unicode/above-Unicode boundary in a range, as otherwise we would
	4845	* have to figure out where to stop flipping the bits. Since this
	4846	* boundary is divisible by a large power of 2, and swatches comes
	4847	* in small powers of 2, this should be a valid assumption */
	4848	assert(start + span - 1 <= PERL_UNICODE_MAX);
	4849
	4850	send = s + scur;
	4851	while (s < send) {
	4852	s = ~(s);
	4853	s++;
	4854	}
	4855	}
	4856	}
	4857
	4858	/* read $swash->{EXTRAS}
	4859	* This code also copied to swash_to_invlist() below */
	4860	x = (U8)SvPV(extssvp, xcur);
	4861	xend = x + xcur;
	4862	while (x < xend) {
	4863	STRLEN namelen;
	4864	U8 *namestr;
	4865	SV** othersvp;
	4866	HV* otherhv;
	4867	STRLEN otherbits;
	4868	SV *otherbitssvp, other;
	4869	U8 s, o, *nl;
	4870	STRLEN slen, olen;
	4871
	4872	const U8 opc = *x++;
	4873	if (opc == '\n')
	4874	continue;
	4875
	4876	nl = (U8*)memchr(x, '\n', xend - x);
	4877
	4878	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	4879	if (nl) {
	4880	x = nl + 1; /* 1 is length of "\n" */
	4881	continue;
	4882	}
	4883	else {
	4884	x = xend; /* to EXTRAS' end at which \n is not found */
	4885	break;
	4886	}
	4887	}
	4888
	4889	namestr = x;
	4890	if (nl) {
	4891	namelen = nl - namestr;
	4892	x = nl + 1;
	4893	}
	4894	else {
	4895	namelen = xend - namestr;
	4896	x = xend;
	4897	}
	4898
	4899	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	4900	otherhv = MUTABLE_HV(SvRV(*othersvp));
	4901	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	4902	otherbits = (STRLEN)SvUV(*otherbitssvp);
	4903	if (bits < otherbits)
	4904	Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
	4905	"bits=%" UVuf ", otherbits=%" UVuf, (UV)bits, (UV)otherbits);
	4906
	4907	/* The "other" swatch must be destroyed after. */
	4908	other = swatch_get(*othersvp, start, span);
	4909	o = (U8*)SvPV(other, olen);
	4910
	4911	if (!olen)
	4912	Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
	4913
	4914	s = (U8*)SvPV(swatch, slen);
	4915	if (bits == 1 && otherbits == 1) {
	4916	if (slen != olen)
	4917	Perl_croak(aTHX_ "panic: swatch_get found swatch length "
	4918	"mismatch, slen=%" UVuf ", olen=%" UVuf,
	4919	(UV)slen, (UV)olen);
	4920
	4921	switch (opc) {
	4922	case '+':
	4923	while (slen--)
	4924	s++ \|= o++;
	4925	break;
	4926	case '!':
	4927	while (slen--)
	4928	s++ \|= ~o++;
	4929	break;
	4930	case '-':
	4931	while (slen--)
	4932	s++ &= ~o++;
	4933	break;
	4934	case '&':
	4935	while (slen--)
	4936	s++ &= o++;
	4937	break;
	4938	default:
	4939	break;
	4940	}
	4941	}
	4942	else {
	4943	STRLEN otheroctets = otherbits >> 3;
	4944	STRLEN offset = 0;
	4945	U8* const send = s + slen;
	4946
	4947	while (s < send) {
	4948	UV otherval = 0;
	4949
	4950	if (otherbits == 1) {
	4951	otherval = (o[offset >> 3] >> (offset & 7)) & 1;
	4952	++offset;
	4953	}
	4954	else {
	4955	STRLEN vlen = otheroctets;
	4956	otherval = *o++;
	4957	while (--vlen) {
	4958	otherval <<= 8;
	4959	otherval \|= *o++;
	4960	}
	4961	}
	4962
	4963	if (opc == '+' && otherval)
	4964	NOOP; /* replace with otherval */
	4965	else if (opc == '!' && !otherval)
	4966	otherval = 1;
	4967	else if (opc == '-' && otherval)
	4968	otherval = 0;
	4969	else if (opc == '&' && !otherval)
	4970	otherval = 0;
	4971	else {
	4972	s += octets; /* no replacement */
	4973	continue;
	4974	}
	4975
	4976	if (bits == 8)
	4977	*s++ = (U8)( otherval & 0xff);
	4978	else if (bits == 16) {
	4979	*s++ = (U8)((otherval >> 8) & 0xff);
	4980	*s++ = (U8)( otherval & 0xff);
	4981	}
	4982	else if (bits == 32) {
	4983	*s++ = (U8)((otherval >> 24) & 0xff);
	4984	*s++ = (U8)((otherval >> 16) & 0xff);
	4985	*s++ = (U8)((otherval >> 8) & 0xff);
	4986	*s++ = (U8)( otherval & 0xff);
	4987	}
	4988	}
	4989	}
	4990	sv_free(other); /* through with it! */
	4991	} /* while */
	4992	return swatch;
	4993	}
	4994
	4995	SV*
	4996	Perl__swash_to_invlist(pTHX_ SV* const swash)
	4997	{
	4998
	4999	/* Subject to change or removal. For use only in one place in regcomp.c.
	5000	* Ownership is given to one reference count in the returned SV* */
	5001
	5002	U8 l, lend;
	5003	char *loc;
	5004	STRLEN lcur;
	5005	HV *const hv = MUTABLE_HV(SvRV(swash));
	5006	UV elements = 0; /* Number of elements in the inversion list */
	5007	U8 empty[] = "";
	5008	SV** listsvp;
	5009	SV** typesvp;
	5010	SV** bitssvp;
	5011	SV** extssvp;
	5012	SV** invert_it_svp;
	5013
	5014	U8* typestr;
	5015	STRLEN bits;
	5016	STRLEN octets; /* if bits == 1, then octets == 0 */
	5017	U8 x, xend;
	5018	STRLEN xcur;
	5019
	5020	SV* invlist;
	5021
	5022	PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
	5023
	5024	/* If not a hash, it must be the swash's inversion list instead */
	5025	if (SvTYPE(hv) != SVt_PVHV) {
	5026	return SvREFCNT_inc_simple_NN((SV*) hv);
	5027	}
	5028
	5029	/* The string containing the main body of the table */
	5030	listsvp = hv_fetchs(hv, "LIST", FALSE);
	5031	typesvp = hv_fetchs(hv, "TYPE", FALSE);
	5032	bitssvp = hv_fetchs(hv, "BITS", FALSE);
	5033	extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	5034	invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	5035
	5036	typestr = (U8)SvPV_nolen(typesvp);
	5037	bits = SvUV(*bitssvp);
	5038	octets = bits >> 3; /* if bits == 1, then octets == 0 */
	5039
	5040	/* read $swash->{LIST} */
	5041	if (SvPOK(*listsvp)) {
	5042	l = (U8)SvPV(listsvp, lcur);
	5043	}
	5044	else {
	5045	/* LIST legitimately doesn't contain a string during compilation phases
	5046	* of Perl itself, before the Unicode tables are generated. In this
	5047	* case, just fake things up by creating an empty list */
	5048	l = empty;
	5049	lcur = 0;
	5050	}
	5051	loc = (char *) l;
	5052	lend = l + lcur;
	5053
	5054	if (l == 'V') { / Inversion list format */
	5055	const char after_atou = (char ) lend;
	5056	UV element0;
	5057	UV* other_elements_ptr;
	5058
	5059	/* The first number is a count of the rest */
	5060	l++;
	5061	if (!grok_atoUV((const char *)l, &elements, &after_atou)) {
	5062	Perl_croak(aTHX_ "panic: Expecting a valid count of elements"
	5063	" at start of inversion list");
	5064	}
	5065	if (elements == 0) {
	5066	invlist = _new_invlist(0);
	5067	}
	5068	else {
	5069	l = (U8 *) after_atou;
	5070
	5071	/* Get the 0th element, which is needed to setup the inversion list
	5072	* */
	5073	while (isSPACE(*l)) l++;
	5074	after_atou = (char *) lend;
	5075	if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
	5076	Perl_croak(aTHX_ "panic: Expecting a valid 0th element for"
	5077	" inversion list");
	5078	}
	5079	l = (U8 *) after_atou;
	5080	invlist = _setup_canned_invlist(elements, element0,
	5081	&other_elements_ptr);
	5082	elements--;
	5083
	5084	/* Then just populate the rest of the input */
	5085	while (elements-- > 0) {
	5086	if (l > lend) {
	5087	Perl_croak(aTHX_ "panic: Expecting %" UVuf " more"
	5088	" elements than available", elements);
	5089	}
	5090	while (isSPACE(*l)) l++;
	5091	after_atou = (char *) lend;
	5092	if (!grok_atoUV((const char *)l, other_elements_ptr++,
	5093	&after_atou))
	5094	{
	5095	Perl_croak(aTHX_ "panic: Expecting a valid element"
	5096	" in inversion list");
	5097	}
	5098	l = (U8 *) after_atou;
	5099	}
	5100	}
	5101	}
	5102	else {
	5103
	5104	/* Scan the input to count the number of lines to preallocate array
	5105	* size based on worst possible case, which is each line in the input
	5106	* creates 2 elements in the inversion list: 1) the beginning of a
	5107	* range in the list; 2) the beginning of a range not in the list. */
	5108	while ((loc = (char ) memchr(loc, '\n', lend - (U8 ) loc)) != NULL) {
	5109	elements += 2;
	5110	loc++;
	5111	}
	5112
	5113	/* If the ending is somehow corrupt and isn't a new line, add another
	5114	* element for the final range that isn't in the inversion list */
	5115	if (! (*lend == '\n'
	5116	\|\| (lend == '\0' && (lcur == 0 \|\| (lend - 1) == '\n'))))
	5117	{
	5118	elements++;
	5119	}
	5120
	5121	invlist = _new_invlist(elements);
	5122
	5123	/* Now go through the input again, adding each range to the list */
	5124	while (l < lend) {
	5125	UV start, end;
	5126	UV val; /* Not used by this function */
	5127
	5128	l = swash_scan_list_line(l, lend, &start, &end, &val,
	5129	cBOOL(octets), typestr);
	5130
	5131	if (l > lend) {
	5132	break;
	5133	}
	5134
	5135	invlist = _add_range_to_invlist(invlist, start, end);
	5136	}
	5137	}
	5138
	5139	/* Invert if the data says it should be */
	5140	if (invert_it_svp && SvUV(*invert_it_svp)) {
	5141	_invlist_invert(invlist);
	5142	}
	5143
	5144	/* This code is copied from swatch_get()
	5145	* read $swash->{EXTRAS} */
	5146	x = (U8)SvPV(extssvp, xcur);
	5147	xend = x + xcur;
	5148	while (x < xend) {
	5149	STRLEN namelen;
	5150	U8 *namestr;
	5151	SV** othersvp;
	5152	HV* otherhv;
	5153	STRLEN otherbits;
	5154	SV *otherbitssvp, other;
	5155	U8 *nl;
	5156
	5157	const U8 opc = *x++;
	5158	if (opc == '\n')
	5159	continue;
	5160
	5161	nl = (U8*)memchr(x, '\n', xend - x);
	5162
	5163	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	5164	if (nl) {
	5165	x = nl + 1; /* 1 is length of "\n" */
	5166	continue;
	5167	}
	5168	else {
	5169	x = xend; /* to EXTRAS' end at which \n is not found */
	5170	break;
	5171	}
	5172	}
	5173
	5174	namestr = x;
	5175	if (nl) {
	5176	namelen = nl - namestr;
	5177	x = nl + 1;
	5178	}
	5179	else {
	5180	namelen = xend - namestr;
	5181	x = xend;
	5182	}
	5183
	5184	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	5185	otherhv = MUTABLE_HV(SvRV(*othersvp));
	5186	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	5187	otherbits = (STRLEN)SvUV(*otherbitssvp);
	5188
	5189	if (bits != otherbits \|\| bits != 1) {
	5190	Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
	5191	"properties, bits=%" UVuf ", otherbits=%" UVuf,
	5192	(UV)bits, (UV)otherbits);
	5193	}
	5194
	5195	/* The "other" swatch must be destroyed after. */
	5196	other = _swash_to_invlist((SV )othersvp);
	5197
	5198	/* End of code copied from swatch_get() */
	5199	switch (opc) {
	5200	case '+':
	5201	_invlist_union(invlist, other, &invlist);
	5202	break;
	5203	case '!':
	5204	_invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
	5205	break;
	5206	case '-':
	5207	_invlist_subtract(invlist, other, &invlist);
	5208	break;
	5209	case '&':
	5210	_invlist_intersection(invlist, other, &invlist);
	5211	break;
	5212	default:
	5213	break;
	5214	}
	5215	sv_free(other); /* through with it! */
	5216	}
	5217
	5218	SvREADONLY_on(invlist);
	5219	return invlist;
	5220	}
	5221
	5222	SV*
	5223	Perl__get_swash_invlist(pTHX_ SV* const swash)
	5224	{
	5225	SV** ptr;
	5226
	5227	PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
	5228
	5229	if (! SvROK(swash)) {
	5230	return NULL;
	5231	}
	5232
	5233	/* If it really isn't a hash, it isn't really swash; must be an inversion
	5234	* list */
	5235	if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
	5236	return SvRV(swash);
	5237	}
	5238
	5239	ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
	5240	if (! ptr) {
	5241	return NULL;
	5242	}
	5243
	5244	return *ptr;
	5245	}
	5246
	5247	bool
	5248	Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
	5249	{
	5250	/* May change: warns if surrogates, non-character code points, or
	5251	* non-Unicode code points are in 's' which has length 'len' bytes.
	5252	* Returns TRUE if none found; FALSE otherwise. The only other validity
	5253	* check is to make sure that this won't exceed the string's length nor
	5254	* overflow */
	5255
	5256	const U8* const e = s + len;
	5257	bool ok = TRUE;
	5258
	5259	PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
	5260
	5261	while (s < e) {
	5262	if (UTF8SKIP(s) > len) {
	5263	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	5264	"%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
	5265	return FALSE;
	5266	}
	5267	if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
	5268	if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
	5269	if ( ckWARN_d(WARN_NON_UNICODE)
	5270	\|\| UNLIKELY(0 < does_utf8_overflow(s, s + len,
	5271	0 /* Don't consider overlongs */
	5272	)))
	5273	{
	5274	/* A side effect of this function will be to warn */
	5275	(void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
	5276	ok = FALSE;
	5277	}
	5278	}
	5279	else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
	5280	if (ckWARN_d(WARN_SURROGATE)) {
	5281	/* This has a different warning than the one the called
	5282	* function would output, so can't just call it, unlike we
	5283	* do for the non-chars and above-unicodes */
	5284	UV uv = utf8_to_uvchr_buf(s, e, NULL);
	5285	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	5286	"Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
	5287	uv);
	5288	ok = FALSE;
	5289	}
	5290	}
	5291	else if ( UNLIKELY(UTF8_IS_NONCHAR(s, e))
	5292	&& (ckWARN_d(WARN_NONCHAR)))
	5293	{
	5294	/* A side effect of this function will be to warn */
	5295	(void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
	5296	ok = FALSE;
	5297	}
	5298	}
	5299	s += UTF8SKIP(s);
	5300	}
	5301
	5302	return ok;
	5303	}
	5304
	5305	/*
	5306	=for apidoc pv_uni_display
	5307
	5308	Build to the scalar C<dsv> a displayable version of the string C<spv>,
	5309	length C<len>, the displayable version being at most C<pvlim> bytes long
	5310	(if longer, the rest is truncated and C<"..."> will be appended).
	5311
	5312	The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
	5313	C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
	5314	to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
	5315	(C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
	5316	C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
	5317	C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
	5318
	5319	The pointer to the PV of the C<dsv> is returned.
	5320
	5321	See also L</sv_uni_display>.
	5322
	5323	=cut */
	5324	char *
	5325	Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim,
	5326	UV flags)
	5327	{
	5328	int truncated = 0;
	5329	const char s, e;
	5330
	5331	PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
	5332
	5333	SvPVCLEAR(dsv);
	5334	SvUTF8_off(dsv);
	5335	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	5336	UV u;
	5337	/* This serves double duty as a flag and a character to print after
	5338	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	5339	*/
	5340	char ok = 0;
	5341
	5342	if (pvlim && SvCUR(dsv) >= pvlim) {
	5343	truncated++;
	5344	break;
	5345	}
	5346	u = utf8_to_uvchr_buf((U8)s, (U8)e, 0);
	5347	if (u < 256) {
	5348	const unsigned char c = (unsigned char)u & 0xFF;
	5349	if (flags & UNI_DISPLAY_BACKSLASH) {
	5350	switch (c) {
	5351	case '\n':
	5352	ok = 'n'; break;
	5353	case '\r':
	5354	ok = 'r'; break;
	5355	case '\t':
	5356	ok = 't'; break;
	5357	case '\f':
	5358	ok = 'f'; break;
	5359	case '\a':
	5360	ok = 'a'; break;
	5361	case '\\':
	5362	ok = '\\'; break;
	5363	default: break;
	5364	}
	5365	if (ok) {
	5366	const char string = ok;
	5367	sv_catpvs(dsv, "\\");
	5368	sv_catpvn(dsv, &string, 1);
	5369	}
	5370	}
	5371	/* isPRINT() is the locale-blind version. */
	5372	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	5373	const char string = c;
	5374	sv_catpvn(dsv, &string, 1);
	5375	ok = 1;
	5376	}
	5377	}
	5378	if (!ok)
	5379	Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u);
	5380	}
	5381	if (truncated)
	5382	sv_catpvs(dsv, "...");
	5383
	5384	return SvPVX(dsv);
	5385	}
	5386
	5387	/*
	5388	=for apidoc sv_uni_display
	5389
	5390	Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
	5391	the displayable version being at most C<pvlim> bytes long
	5392	(if longer, the rest is truncated and "..." will be appended).
	5393
	5394	The C<flags> argument is as in L</pv_uni_display>().
	5395
	5396	The pointer to the PV of the C<dsv> is returned.
	5397
	5398	=cut
	5399	*/
	5400	char *
	5401	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	5402	{
	5403	const char * const ptr =
	5404	isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
	5405
	5406	PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
	5407
	5408	return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
	5409	SvCUR(ssv), pvlim, flags);
	5410	}
	5411
	5412	/*
	5413	=for apidoc foldEQ_utf8
	5414
	5415	Returns true if the leading portions of the strings C<s1> and C<s2> (either or
	5416	both of which may be in UTF-8) are the same case-insensitively; false
	5417	otherwise. How far into the strings to compare is determined by other input
	5418	parameters.
	5419
	5420	If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
	5421	otherwise it is assumed to be in native 8-bit encoding. Correspondingly for
	5422	C<u2> with respect to C<s2>.
	5423
	5424	If the byte length C<l1> is non-zero, it says how far into C<s1> to check for
	5425	fold equality. In other words, C<s1>+C<l1> will be used as a goal to reach.
	5426	The scan will not be considered to be a match unless the goal is reached, and
	5427	scanning won't continue past that goal. Correspondingly for C<l2> with respect
	5428	to C<s2>.
	5429
	5430	If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that
	5431	pointer is considered an end pointer to the position 1 byte past the maximum
	5432	point in C<s1> beyond which scanning will not continue under any circumstances.
	5433	(This routine assumes that UTF-8 encoded input strings are not malformed;
	5434	malformed input can cause it to read past C<pe1>). This means that if both
	5435	C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match
	5436	will never be successful because it can never
	5437	get as far as its goal (and in fact is asserted against). Correspondingly for
	5438	C<pe2> with respect to C<s2>.
	5439
	5440	At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
	5441	C<l2> must be non-zero), and if both do, both have to be
	5442	reached for a successful match. Also, if the fold of a character is multiple
	5443	characters, all of them must be matched (see tr21 reference below for
	5444	'folding').
	5445
	5446	Upon a successful match, if C<pe1> is non-C<NULL>,
	5447	it will be set to point to the beginning of the I<next> character of C<s1>
	5448	beyond what was matched. Correspondingly for C<pe2> and C<s2>.
	5449
	5450	For case-insensitiveness, the "casefolding" of Unicode is used
	5451	instead of upper/lowercasing both the characters, see
	5452	L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
	5453
	5454	=cut */
	5455
	5456	/* A flags parameter has been added which may change, and hence isn't
	5457	* externally documented. Currently it is:
	5458	* 0 for as-documented above
	5459	* FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
	5460	ASCII one, to not match
	5461	* FOLDEQ_LOCALE is set iff the rules from the current underlying
	5462	* locale are to be used.
	5463	* FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
	5464	* routine. This allows that step to be skipped.
	5465	* Currently, this requires s1 to be encoded as UTF-8
	5466	* (u1 must be true), which is asserted for.
	5467	* FOLDEQ_S1_FOLDS_SANE With either NOMIX_ASCII or LOCALE, no folds may
	5468	* cross certain boundaries. Hence, the caller should
	5469	* let this function do the folding instead of
	5470	* pre-folding. This code contains an assertion to
	5471	* that effect. However, if the caller knows what
	5472	* it's doing, it can pass this flag to indicate that,
	5473	* and the assertion is skipped.
	5474	* FOLDEQ_S2_ALREADY_FOLDED Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
	5475	* to s2, and s2 doesn't have to be UTF-8 encoded.
	5476	* This introduces an asymmetry to save a few branches
	5477	* in a loop. Currently, this is not a problem, as
	5478	* never are both inputs pre-folded. Simply call this
	5479	* function with the pre-folded one as the second
	5480	* string.
	5481	* FOLDEQ_S2_FOLDS_SANE
	5482	*/
	5483	I32
	5484	Perl_foldEQ_utf8_flags(pTHX_ const char s1, char *pe1, UV l1, bool u1,
	5485	const char s2, char *pe2, UV l2, bool u2,
	5486	U32 flags)
	5487	{
	5488	const U8 p1 = (const U8)s1; /* Point to current char */
	5489	const U8 p2 = (const U8)s2;
	5490	const U8 g1 = NULL; / goal for s1 */
	5491	const U8 *g2 = NULL;
	5492	const U8 e1 = NULL; / Don't scan s1 past this */
	5493	U8 f1 = NULL; / Point to current folded */
	5494	const U8 *e2 = NULL;
	5495	U8 *f2 = NULL;
	5496	STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
	5497	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	5498	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	5499	U8 flags_for_folder = FOLD_FLAGS_FULL;
	5500
	5501	PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
	5502
	5503	assert( ! ( (flags & (FOLDEQ_UTF8_NOMIX_ASCII \| FOLDEQ_LOCALE))
	5504	&& (( (flags & FOLDEQ_S1_ALREADY_FOLDED)
	5505	&& !(flags & FOLDEQ_S1_FOLDS_SANE))
	5506	\|\| ( (flags & FOLDEQ_S2_ALREADY_FOLDED)
	5507	&& !(flags & FOLDEQ_S2_FOLDS_SANE)))));
	5508	/* The algorithm is to trial the folds without regard to the flags on
	5509	* the first line of the above assert(), and then see if the result
	5510	* violates them. This means that the inputs can't be pre-folded to a
	5511	* violating result, hence the assert. This could be changed, with the
	5512	* addition of extra tests here for the already-folded case, which would
	5513	* slow it down. That cost is more than any possible gain for when these
	5514	* flags are specified, as the flags indicate /il or /iaa matching which
	5515	* is less common than /iu, and I (khw) also believe that real-world /il
	5516	* and /iaa matches are most likely to involve code points 0-255, and this
	5517	* function only under rare conditions gets called for 0-255. */
	5518
	5519	if (flags & FOLDEQ_LOCALE) {
	5520	if (IN_UTF8_CTYPE_LOCALE) {
	5521	flags &= ~FOLDEQ_LOCALE;
	5522	}
	5523	else {
	5524	flags_for_folder \|= FOLD_FLAGS_LOCALE;
	5525	}
	5526	}
	5527	if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
	5528	flags_for_folder \|= FOLD_FLAGS_NOMIX_ASCII;
	5529	}
	5530
	5531	if (pe1) {
	5532	e1 = (U8*)pe1;
	5533	}
	5534
	5535	if (l1) {
	5536	g1 = (const U8*)s1 + l1;
	5537	}
	5538
	5539	if (pe2) {
	5540	e2 = (U8*)pe2;
	5541	}
	5542
	5543	if (l2) {
	5544	g2 = (const U8*)s2 + l2;
	5545	}
	5546
	5547	/* Must have at least one goal */
	5548	assert(g1 \|\| g2);
	5549
	5550	if (g1) {
	5551
	5552	/* Will never match if goal is out-of-bounds */
	5553	assert(! e1 \|\| e1 >= g1);
	5554
	5555	/* Here, there isn't an end pointer, or it is beyond the goal. We
	5556	* only go as far as the goal */
	5557	e1 = g1;
	5558	}
	5559	else {
	5560	assert(e1); /* Must have an end for looking at s1 */
	5561	}
	5562
	5563	/* Same for goal for s2 */
	5564	if (g2) {
	5565	assert(! e2 \|\| e2 >= g2);
	5566	e2 = g2;
	5567	}
	5568	else {
	5569	assert(e2);
	5570	}
	5571
	5572	/* If both operands are already folded, we could just do a memEQ on the
	5573	* whole strings at once, but it would be better if the caller realized
	5574	* this and didn't even call us */
	5575
	5576	/* Look through both strings, a character at a time */
	5577	while (p1 < e1 && p2 < e2) {
	5578
	5579	/* If at the beginning of a new character in s1, get its fold to use
	5580	* and the length of the fold. */
	5581	if (n1 == 0) {
	5582	if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
	5583	f1 = (U8 *) p1;
	5584	assert(u1);
	5585	n1 = UTF8SKIP(f1);
	5586	}
	5587	else {
	5588	if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
	5589
	5590	/* We have to forbid mixing ASCII with non-ASCII if the
	5591	* flags so indicate. And, we can short circuit having to
	5592	* call the general functions for this common ASCII case,
	5593	* all of whose non-locale folds are also ASCII, and hence
	5594	* UTF-8 invariants, so the UTF8ness of the strings is not
	5595	* relevant. */
	5596	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
	5597	return 0;
	5598	}
	5599	n1 = 1;
	5600	foldbuf1 = toFOLD(p1);
	5601	}
	5602	else if (u1) {
	5603	_toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
	5604	}
	5605	else { /* Not UTF-8, get UTF-8 fold */
	5606	_to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
	5607	}
	5608	f1 = foldbuf1;
	5609	}
	5610	}
	5611
	5612	if (n2 == 0) { /* Same for s2 */
	5613	if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
	5614
	5615	/* Point to the already-folded character. But for non-UTF-8
	5616	* variants, convert to UTF-8 for the algorithm below */
	5617	if (UTF8_IS_INVARIANT(*p2)) {
	5618	f2 = (U8 *) p2;
	5619	n2 = 1;
	5620	}
	5621	else if (u2) {
	5622	f2 = (U8 *) p2;
	5623	n2 = UTF8SKIP(f2);
	5624	}
	5625	else {
	5626	foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
	5627	foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
	5628	f2 = foldbuf2;
	5629	n2 = 2;
	5630	}
	5631	}
	5632	else {
	5633	if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
	5634	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
	5635	return 0;
	5636	}
	5637	n2 = 1;
	5638	foldbuf2 = toFOLD(p2);
	5639	}
	5640	else if (u2) {
	5641	_toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
	5642	}
	5643	else {
	5644	_to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
	5645	}
	5646	f2 = foldbuf2;
	5647	}
	5648	}
	5649
	5650	/* Here f1 and f2 point to the beginning of the strings to compare.
	5651	* These strings are the folds of the next character from each input
	5652	* string, stored in UTF-8. */
	5653
	5654	/* While there is more to look for in both folds, see if they
	5655	* continue to match */
	5656	while (n1 && n2) {
	5657	U8 fold_length = UTF8SKIP(f1);
	5658	if (fold_length != UTF8SKIP(f2)
	5659	\|\| (fold_length == 1 && f1 != f2) /* Short circuit memNE
	5660	function call for single
	5661	byte */
	5662	\|\| memNE((char)f1, (char)f2, fold_length))
	5663	{
	5664	return 0; /* mismatch */
	5665	}
	5666
	5667	/* Here, they matched, advance past them */
	5668	n1 -= fold_length;
	5669	f1 += fold_length;
	5670	n2 -= fold_length;
	5671	f2 += fold_length;
	5672	}
	5673
	5674	/* When reach the end of any fold, advance the input past it */
	5675	if (n1 == 0) {
	5676	p1 += u1 ? UTF8SKIP(p1) : 1;
	5677	}
	5678	if (n2 == 0) {
	5679	p2 += u2 ? UTF8SKIP(p2) : 1;
	5680	}
	5681	} /* End of loop through both strings */
	5682
	5683	/* A match is defined by each scan that specified an explicit length
	5684	* reaching its final goal, and the other not having matched a partial
	5685	* character (which can happen when the fold of a character is more than one
	5686	* character). */
	5687	if (! ((g1 == 0 \|\| p1 == g1) && (g2 == 0 \|\| p2 == g2)) \|\| n1 \|\| n2) {
	5688	return 0;
	5689	}
	5690
	5691	/* Successful match. Set output pointers */
	5692	if (pe1) {
	5693	pe1 = (char)p1;
	5694	}
	5695	if (pe2) {
	5696	pe2 = (char)p2;
	5697	}
	5698	return 1;
	5699	}
	5700
	5701	/* XXX The next two functions should likely be moved to mathoms.c once all
	5702	* occurrences of them are removed from the core; some cpan-upstream modules
	5703	* still use them */
	5704
	5705	U8 *
	5706	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
	5707	{
	5708	PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
	5709
	5710	return uvoffuni_to_utf8_flags(d, uv, 0);
	5711	}
	5712
	5713	/*
	5714	=for apidoc utf8n_to_uvuni
	5715
	5716	Instead use L</utf8_to_uvchr_buf>, or rarely, L</utf8n_to_uvchr>.
	5717
	5718	This function was useful for code that wanted to handle both EBCDIC and
	5719	ASCII platforms with Unicode properties, but starting in Perl v5.20, the
	5720	distinctions between the platforms have mostly been made invisible to most
	5721	code, so this function is quite unlikely to be what you want. If you do need
	5722	this precise functionality, use instead
	5723	C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))\|/utf8_to_uvchr_buf>>
	5724	or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))\|/utf8n_to_uvchr>>.
	5725
	5726	=cut
	5727	*/
	5728
	5729	UV
	5730	Perl_utf8n_to_uvuni(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	5731	{
	5732	PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
	5733
	5734	return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
	5735	}
	5736
	5737	/*
	5738	=for apidoc uvuni_to_utf8_flags
	5739
	5740	Instead you almost certainly want to use L</uvchr_to_utf8> or
	5741	L</uvchr_to_utf8_flags>.
	5742
	5743	This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
	5744	which itself, while not deprecated, should be used only in isolated
	5745	circumstances. These functions were useful for code that wanted to handle
	5746	both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
	5747	v5.20, the distinctions between the platforms have mostly been made invisible
	5748	to most code, so this function is quite unlikely to be what you want.
	5749
	5750	=cut
	5751	*/
	5752
	5753	U8 *
	5754	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	5755	{
	5756	PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
	5757
	5758	return uvoffuni_to_utf8_flags(d, uv, flags);
	5759	}
	5760
	5761	/*
	5762	=for apidoc utf8_to_uvchr
	5763
	5764	Returns the native code point of the first character in the string C<s>
	5765	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	5766	length, in bytes, of that character.
	5767
	5768	Some, but not all, UTF-8 malformations are detected, and in fact, some
	5769	malformed input could cause reading beyond the end of the input buffer, which
	5770	is why this function is deprecated. Use L</utf8_to_uvchr_buf> instead.
	5771
	5772	If C<s> points to one of the detected malformations, and UTF8 warnings are
	5773	enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
	5774	C<NULL>) to -1. If those warnings are off, the computed value if well-defined (or
	5775	the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
	5776	is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
	5777	next possible position in C<s> that could begin a non-malformed character.
	5778	See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
	5779
	5780	=cut
	5781	*/
	5782
	5783	UV
	5784	Perl_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
	5785	{
	5786	PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
	5787
	5788	/* This function is unsafe if malformed UTF-8 input is given it, which is
	5789	* why the function is deprecated. If the first byte of the input
	5790	* indicates that there are more bytes remaining in the sequence that forms
	5791	* the character than there are in the input buffer, it can read past the
	5792	* end. But we can make it safe if the input string happens to be
	5793	* NUL-terminated, as many strings in Perl are, by refusing to read past a
	5794	* NUL. A NUL indicates the start of the next character anyway. If the
	5795	* input isn't NUL-terminated, the function remains unsafe, as it always
	5796	* has been.
	5797	*
	5798	* An initial NUL has to be handled separately, but all ASCIIs can be
	5799	* handled the same way, speeding up this common case */
	5800
	5801	if (UTF8_IS_INVARIANT(s)) { / Assumes 's' contains at least 1 byte */
	5802	return (UV) *s;
	5803	}
	5804
	5805	return utf8_to_uvchr_buf(s,
	5806	s + my_strnlen((char *) s, UTF8SKIP(s)),
	5807	retlen);
	5808	}
	5809
	5810	/*
	5811	* ex: set ts=8 sts=4 sw=4 et:
	5812	*/