perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
	17	*
	18	* 'Well do I understand your speech,' he answered in the same language;
	19	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	20	* as is the custom in the West, if you wish to be answered?'
	21	* --Gandalf, addressing Théoden's door wardens
	22	*
	23	* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	24	*
	25	* ...the travellers perceived that the floor was paved with stones of many
	26	* hues; branching runes and strange devices intertwined beneath their feet.
	27	*
	28	* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	29	*/
	30
	31	#include "EXTERN.h"
	32	#define PERL_IN_UTF8_C
	33	#include "perl.h"
	34	#include "inline_invlist.c"
	35	#include "charclass_invlists.h"
	36
	/* Message text for a UTF-8 sequence cut short by the end of the string. */
	static const char unees[] = "Malformed UTF-8 character"
	                            " (unexpected end of string)";
	39
	40	/*
	41	=head1 Unicode Support
	42	These are various utility functions for manipulating UTF8-encoded
	43	strings. For the uninitiated, this is a method of representing arbitrary
	44	Unicode characters as a variable number of bytes, in such a way that
	45	characters in the ASCII range are unmodified, and a zero byte never appears
	46	within non-zero characters.
	47
	48	=cut
	49	*/
	50
	51	/*
	52	=for apidoc is_ascii_string
	53
	54	Returns true if the first C<len> bytes of the string C<s> are the same whether
	55	or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
	56	is, if they are invariant. On ASCII-ish machines, only ASCII characters
	57	fit this definition, hence the function's name.
	58
	59	If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
	60	use this option, that C<s> can't have embedded C<NUL> characters and has to
	61	have a terminating C<NUL> byte).
	62
	63	See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
	64
	65	=cut
	66	*/
	67
	68	bool
	69	Perl_is_ascii_string(const U8 *s, STRLEN len)
	70	{
	71	const U8* const send = s + (len ? len : strlen((const char *)s));
	72	const U8* x = s;
	73
	74	PERL_ARGS_ASSERT_IS_ASCII_STRING;
	75
	76	for (; x < send; ++x) {
	77	if (!UTF8_IS_INVARIANT(*x))
	78	break;
	79	}
	80
	81	return x == send;
	82	}
	83
	84	/*
	85	=for apidoc uvoffuni_to_utf8_flags
	86
	87	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	88	Instead, B<Almost all code should use L</uvchr_to_utf8> or
	89	L</uvchr_to_utf8_flags>>.
	90
	91	This function is like them, but the input is a strict Unicode
	92	(as opposed to native) code point. Only in very rare circumstances should code
	93	not be using the native code point.
	94
	95	For details, see the description for L</uvchr_to_utf8_flags>.
	96
	97	=cut
	98	*/
	99
	100	U8 *
	101	Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	102	{
	103	PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
	104
	105	if (UNI_IS_INVARIANT(uv)) {
	106	*d++ = (U8) LATIN1_TO_NATIVE(uv);
	107	return d;
	108	}
	109
	110	#ifdef EBCDIC
	111	/* Not representable in UTF-EBCDIC */
	112	flags \|= UNICODE_DISALLOW_FE_FF;
	113	#endif
	114
	115	/* The first problematic code point is the first surrogate */
	116	if (uv >= UNICODE_SURROGATE_FIRST
	117	&& ckWARN3_d(WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
	118	{
	119	if (UNICODE_IS_SURROGATE(uv)) {
	120	if (flags & UNICODE_WARN_SURROGATE) {
	121	Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
	122	"UTF-16 surrogate U+%04"UVXf, uv);
	123	}
	124	if (flags & UNICODE_DISALLOW_SURROGATE) {
	125	return NULL;
	126	}
	127	}
	128	else if (UNICODE_IS_SUPER(uv)) {
	129	if (flags & UNICODE_WARN_SUPER
	130	\|\| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
	131	{
	132	Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
	133	"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
	134	}
	135	if (flags & UNICODE_DISALLOW_SUPER
	136	\|\| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
	137	{
	138	#ifdef EBCDIC
	139	Perl_die(aTHX_ "Can't represent character for Ox%"UVXf" on this platform", uv);
	140	assert(0);
	141	#endif
	142	return NULL;
	143	}
	144	}
	145	else if (UNICODE_IS_NONCHAR(uv)) {
	146	if (flags & UNICODE_WARN_NONCHAR) {
	147	Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
	148	"Unicode non-character U+%04"UVXf" is illegal for open interchange",
	149	uv);
	150	}
	151	if (flags & UNICODE_DISALLOW_NONCHAR) {
	152	return NULL;
	153	}
	154	}
	155	}
	156
	157	#if defined(EBCDIC)
	158	{
	159	STRLEN len = OFFUNISKIP(uv);
	160	U8 *p = d+len-1;
	161	while (p > d) {
	162	*p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	163	uv >>= UTF_ACCUMULATION_SHIFT;
	164	}
	165	*p = (U8) I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	166	return d+len;
	167	}
	168	#else /* Non loop style */
	169	if (uv < 0x800) {
	170	*d++ = (U8)(( uv >> 6) \| 0xc0);
	171	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	172	return d;
	173	}
	174	if (uv < 0x10000) {
	175	*d++ = (U8)(( uv >> 12) \| 0xe0);
	176	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	177	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	178	return d;
	179	}
	180	if (uv < 0x200000) {
	181	*d++ = (U8)(( uv >> 18) \| 0xf0);
	182	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	183	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	184	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	185	return d;
	186	}
	187	if (uv < 0x4000000) {
	188	*d++ = (U8)(( uv >> 24) \| 0xf8);
	189	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	190	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	191	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	192	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	193	return d;
	194	}
	195	if (uv < 0x80000000) {
	196	*d++ = (U8)(( uv >> 30) \| 0xfc);
	197	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	198	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	199	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	200	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	201	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	202	return d;
	203	}
	204	#ifdef UTF8_QUAD_MAX
	205	if (uv < UTF8_QUAD_MAX)
	206	#endif
	207	{
	208	d++ = 0xfe; / Can't match U+FEFF! */
	209	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	210	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	211	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	212	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	213	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	214	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	215	return d;
	216	}
	217	#ifdef UTF8_QUAD_MAX
	218	{
	219	d++ = 0xff; / Can't match U+FFFE! */
	220	d++ = 0x80; / 6 Reserved bits */
	221	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	222	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
	223	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
	224	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
	225	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
	226	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	227	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	228	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	229	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	230	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	231	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	232	return d;
	233	}
	234	#endif
	235	#endif /* Non loop style */
	236	}
	237	/*
	238	=for apidoc uvchr_to_utf8
	239
	240	Adds the UTF-8 representation of the native code point C<uv> to the end
	241	of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
	242	C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
	243	the byte after the end of the new character. In other words,
	244
	245	d = uvchr_to_utf8(d, uv);
	246
	247	is the recommended wide native character-aware way of saying
	248
	249	*(d++) = uv;
	250
	251	This function accepts any UV as input. To forbid or warn on non-Unicode code
	252	points, or those that may be problematic, see L</uvchr_to_utf8_flags>.
	253
	254	=cut
	255	*/
	256
	257	/* This is also a macro */
	258	PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
	259
	260	U8 *
	261	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	262	{
	263	return uvchr_to_utf8(d, uv);
	264	}
	265
	266	/*
	267	=for apidoc uvchr_to_utf8_flags
	268
	269	Adds the UTF-8 representation of the native code point C<uv> to the end
	270	of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
	271	C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
	272	the byte after the end of the new character. In other words,
	273
	274	d = uvchr_to_utf8_flags(d, uv, flags);
	275
	276	or, in most cases,
	277
	278	d = uvchr_to_utf8_flags(d, uv, 0);
	279
	280	This is the Unicode-aware way of saying
	281
	282	*(d++) = uv;
	283
	284	This function will convert to UTF-8 (and not warn) even code points that aren't
	285	legal Unicode or are problematic, unless C<flags> contains one or more of the
	286	following flags:
	287
	288	If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
	289	the function will raise a warning, provided UTF8 warnings are enabled. If instead
	290	UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
	291	If both flags are set, the function will both warn and return NULL.
	292
	293	The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags
	294	affect how the function handles a Unicode non-character. And likewise, the
	295	UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags affect the handling of
	296	code points that are
	297	above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
	298	even less portable) can be warned and/or disallowed even if other above-Unicode
	299	code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
	300	flags.
	301
	302	And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
	303	above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
	304	DISALLOW flags.
	305
	306	=cut
	307	*/
	308
	309	/* This is also a macro */
	310	PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
	311
	312	U8 *
	313	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	314	{
	315	return uvchr_to_utf8_flags(d, uv, flags);
	316	}
	317
	318	/*
	319	=for apidoc is_utf8_char_buf
	320
	321	This is identical to the macro L</isUTF8_CHAR>.
	322
	323	=cut */
	324
	325	STRLEN
	326	Perl_is_utf8_char_buf(const U8 buf, const U8 buf_end)
	327	{
	328
	329	PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
	330
	331	return isUTF8_CHAR(buf, buf_end);
	332	}
	333
	334	/*
	335	=for apidoc is_utf8_string
	336
	337	Returns true if the first C<len> bytes of string C<s> form a valid
	338	UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
	339	using C<strlen(s)> (which means if you use this option, that C<s> can't have
	340	embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
	341	that all characters being ASCII constitute 'a valid UTF-8 string'.
	342
	343	See also L</is_ascii_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
	344
	345	=cut
	346	*/
	347
	348	bool
	349	Perl_is_utf8_string(const U8 *s, STRLEN len)
	350	{
	351	const U8* const send = s + (len ? len : strlen((const char *)s));
	352	const U8* x = s;
	353
	354	PERL_ARGS_ASSERT_IS_UTF8_STRING;
	355
	356	while (x < send) {
	357	STRLEN len = isUTF8_CHAR(x, send);
	358	if (UNLIKELY(! len)) {
	359	return FALSE;
	360	}
	361	x += len;
	362	}
	363
	364	return TRUE;
	365	}
	366
	367	/*
	368	Implemented as a macro in utf8.h
	369
	370	=for apidoc is_utf8_string_loc
	371
	372	Like L</is_utf8_string> but stores the location of the failure (in the
	373	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	374	"utf8ness success") in the C<ep>.
	375
	376	See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
	377
	378	=for apidoc is_utf8_string_loclen
	379
	380	Like L</is_utf8_string>() but stores the location of the failure (in the
	381	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	382	"utf8ness success") in the C<ep>, and the number of UTF-8
	383	encoded characters in the C<el>.
	384
	385	See also L</is_utf8_string_loc>() and L</is_utf8_string>().
	386
	387	=cut
	388	*/
	389
	390	bool
	391	Perl_is_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	392	{
	393	const U8* const send = s + (len ? len : strlen((const char *)s));
	394	const U8* x = s;
	395	STRLEN outlen = 0;
	396
	397	PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
	398
	399	while (x < send) {
	400	STRLEN len = isUTF8_CHAR(x, send);
	401	if (UNLIKELY(! len)) {
	402	goto out;
	403	}
	404	x += len;
	405	outlen++;
	406	}
	407
	408	out:
	409	if (el)
	410	*el = outlen;
	411
	412	if (ep)
	413	*ep = x;
	414	return (x == send);
	415	}
	416
	417	/*
	418
	419	=for apidoc utf8n_to_uvchr
	420
	421	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	422	Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
	423
	424	Bottom level UTF-8 decode routine.
	425	Returns the native code point value of the first character in the string C<s>,
	426	which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
	427	C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
	428	the length, in bytes, of that character.
	429
	430	The value of C<flags> determines the behavior when C<s> does not point to a
	431	well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
	432	zero is returned and C<retlen> is set so that (S<C<s> + C<retlen>>) is the
	433	next possible position in C<s> that could begin a non-malformed character.
	434	Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
	435
	436	Various ALLOW flags can be set in C<flags> to allow (and not warn on)
	437	individual types of malformations, such as the sequence being overlong (that
	438	is, when there is a shorter sequence that can express the same code point;
	439	overlong sequences are expressly forbidden in the UTF-8 standard due to
	440	potential security issues). Another malformation example is the first byte of
	441	a character not being a legal first byte. See F<utf8.h> for the list of such
	442	flags. For allowed 0 length strings, this function returns 0; for allowed
	443	overlong sequences, the computed code point is returned; for all other allowed
	444	malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
	445	determinable reasonable value.
	446
	447	The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
	448	flags) malformation is found. If this flag is set, the routine assumes that
	449	the caller will raise a warning, and this function will silently just set
	450	C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
	451
	452	Note that this API requires disambiguation between successful decoding a C<NUL>
	453	character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as
	454	in both cases, 0 is returned. To disambiguate, upon a zero return, see if the
	455	first byte of C<s> is 0 as well. If so, the input was a C<NUL>; if not, the
	456	input had an error.
	457
	458	Certain code points are considered problematic. These are Unicode surrogates,
	459	Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
	460	By default these are considered regular code points, but certain situations
	461	warrant special handling for them. If C<flags> contains
	462	UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
	463	malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE,
	464	UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
	465	maximum) can be set to disallow these categories individually.
	466
	467	The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
	468	UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
	469	for their respective categories, but otherwise the code points are considered
	470	valid (not malformations). To get a category to both be treated as a
	471	malformation and raise a warning, specify both the WARN and DISALLOW flags.
	472	(But note that warnings are not raised if lexically disabled nor if
	473	UTF8_CHECK_ONLY is also specified.)
	474
	475	Very large code points (above 0x7FFF_FFFF) are considered more problematic than
	476	the others that are above the Unicode legal maximum. There are several
	477	reasons: they require at least 32 bits to represent them on ASCII platforms, are
	478	not representable at all on EBCDIC platforms, and the original UTF-8
	479	specification never went above this number (the current 0x10FFFF limit was
	480	imposed later). (The smaller ones, those that fit into 32 bits, are
	481	representable by a UV on ASCII platforms, but not by an IV, which means that
	482	the number of operations that can be performed on them is quite restricted.)
	483	The UTF-8 encoding on ASCII platforms for these large code points begins with a
	484	byte containing 0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to
	485	be treated as malformations, while allowing smaller above-Unicode code points.
	486	(Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
	487	including these, as malformations.)
	488	Similarly, UTF8_WARN_FE_FF acts just like
	489	the other WARN flags, but applies just to these code points.
	490
	491	All other code points corresponding to Unicode characters, including private
	492	use and those yet to be assigned, are never considered malformed and never
	493	warn.
	494
	495	=cut
	496	*/
	497
	498	UV
	499	Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	500	{
	501	const U8 * const s0 = s;
	502	U8 overflow_byte = '\0'; /* Save byte in case of overflow */
	503	U8 * send;
	504	UV uv = *s;
	505	STRLEN expectlen;
	506	SV* sv = NULL;
	507	UV outlier_ret = 0; /* return value when input is in error or problematic
	508	*/
	509	UV pack_warn = 0; /* Save result of packWARN() for later */
	510	bool unexpected_non_continuation = FALSE;
	511	bool overflowed = FALSE;
	512	bool do_overlong_test = TRUE; /* May have to skip this test */
	513
	514	const char* const malformed_text = "Malformed UTF-8 character";
	515
	516	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
	517
	518	/* The order of malformation tests here is important. We should consume as
	519	* few bytes as possible in order to not skip any valid character. This is
	520	* required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
	521	* http://unicode.org/reports/tr36 for more discussion as to why. For
	522	* example, once we've done a UTF8SKIP, we can tell the expected number of
	523	* bytes, and could fail right off the bat if the input parameters indicate
	524	* that there are too few available. But it could be that just that first
	525	* byte is garbled, and the intended character occupies fewer bytes. If we
	526	* blindly assumed that the first byte is correct, and skipped based on
	527	* that number, we could skip over a valid input character. So instead, we
	528	* always examine the sequence byte-by-byte.
	529	*
	530	* We also should not consume too few bytes, otherwise someone could inject
	531	* things. For example, an input could be deliberately designed to
	532	* overflow, and if this code bailed out immediately upon discovering that,
	533	* returning to the caller C<*retlen> pointing to the very next byte (one
	534	* which is actually part of of the overflowing sequence), that could look
	535	* legitimate to the caller, which could discard the initial partial
	536	* sequence and process the rest, inappropriately */
	537
	538	/* Zero length strings, if allowed, of necessity are zero */
	539	if (UNLIKELY(curlen == 0)) {
	540	if (retlen) {
	541	*retlen = 0;
	542	}
	543
	544	if (flags & UTF8_ALLOW_EMPTY) {
	545	return 0;
	546	}
	547	if (! (flags & UTF8_CHECK_ONLY)) {
	548	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
	549	}
	550	goto malformed;
	551	}
	552
	553	expectlen = UTF8SKIP(s);
	554
	555	/* A well-formed UTF-8 character, as the vast majority of calls to this
	556	* function will be for, has this expected length. For efficiency, set
	557	* things up here to return it. It will be overriden only in those rare
	558	* cases where a malformation is found */
	559	if (retlen) {
	560	*retlen = expectlen;
	561	}
	562
	563	/* An invariant is trivially well-formed */
	564	if (UTF8_IS_INVARIANT(uv)) {
	565	return uv;
	566	}
	567
	568	/* A continuation character can't start a valid sequence */
	569	if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
	570	if (flags & UTF8_ALLOW_CONTINUATION) {
	571	if (retlen) {
	572	*retlen = 1;
	573	}
	574	return UNICODE_REPLACEMENT;
	575	}
	576
	577	if (! (flags & UTF8_CHECK_ONLY)) {
	578	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
	579	}
	580	curlen = 1;
	581	goto malformed;
	582	}
	583
	584	/* Here is not a continuation byte, nor an invariant. The only thing left
	585	* is a start byte (possibly for an overlong) */
	586
	587	#ifdef EBCDIC
	588	uv = NATIVE_UTF8_TO_I8(uv);
	589	#endif
	590
	591	/* Remove the leading bits that indicate the number of bytes in the
	592	* character's whole UTF-8 sequence, leaving just the bits that are part of
	593	* the value */
	594	uv &= UTF_START_MASK(expectlen);
	595
	596	/* Now, loop through the remaining bytes in the character's sequence,
	597	* accumulating each into the working value as we go. Be sure to not look
	598	* past the end of the input string */
	599	send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
	600
	601	for (s = s0 + 1; s < send; s++) {
	602	if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
	603	#ifndef EBCDIC /* Can't overflow in EBCDIC */
	604	if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
	605
	606	/* The original implementors viewed this malformation as more
	607	* serious than the others (though I, khw, don't understand
	608	* why, since other malformations also give very very wrong
	609	* results), so there is no way to turn off checking for it.
	610	* Set a flag, but keep going in the loop, so that we absorb
	611	* the rest of the bytes that comprise the character. */
	612	overflowed = TRUE;
	613	overflow_byte = s; / Save for warning message's use */
	614	}
	615	#endif
	616	uv = UTF8_ACCUMULATE(uv, *s);
	617	}
	618	else {
	619	/* Here, found a non-continuation before processing all expected
	620	* bytes. This byte begins a new character, so quit, even if
	621	* allowing this malformation. */
	622	unexpected_non_continuation = TRUE;
	623	break;
	624	}
	625	} /* End of loop through the character's bytes */
	626
	627	/* Save how many bytes were actually in the character */
	628	curlen = s - s0;
	629
	630	/* The loop above finds two types of malformations: non-continuation and/or
	631	* overflow. The non-continuation malformation is really a too-short
	632	* malformation, as it means that the current character ended before it was
	633	* expected to (being terminated prematurely by the beginning of the next
	634	* character, whereas in the too-short malformation there just are too few
	635	* bytes available to hold the character. In both cases, the check below
	636	* that we have found the expected number of bytes would fail if executed.)
	637	* Thus the non-continuation malformation is really unnecessary, being a
	638	* subset of the too-short malformation. But there may be existing
	639	* applications that are expecting the non-continuation type, so we retain
	640	* it, and return it in preference to the too-short malformation. (If this
	641	* code were being written from scratch, the two types might be collapsed
	642	* into one.) I, khw, am also giving priority to returning the
	643	* non-continuation and too-short malformations over overflow when multiple
	644	* ones are present. I don't know of any real reason to prefer one over
	645	* the other, except that it seems to me that multiple-byte errors trumps
	646	* errors from a single byte */
	647	if (UNLIKELY(unexpected_non_continuation)) {
	648	if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	649	if (! (flags & UTF8_CHECK_ONLY)) {
	650	if (curlen == 1) {
	651	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, s, s0));
	652	}
	653	else {
	654	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, s, (int) curlen, s0, (int)expectlen));
	655	}
	656	}
	657	goto malformed;
	658	}
	659	uv = UNICODE_REPLACEMENT;
	660
	661	/* Skip testing for overlongs, as the REPLACEMENT may not be the same
	662	* as what the original expectations were. */
	663	do_overlong_test = FALSE;
	664	if (retlen) {
	665	*retlen = curlen;
	666	}
	667	}
	668	else if (UNLIKELY(curlen < expectlen)) {
	669	if (! (flags & UTF8_ALLOW_SHORT)) {
	670	if (! (flags & UTF8_CHECK_ONLY)) {
	671	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
	672	}
	673	goto malformed;
	674	}
	675	uv = UNICODE_REPLACEMENT;
	676	do_overlong_test = FALSE;
	677	if (retlen) {
	678	*retlen = curlen;
	679	}
	680	}
	681
	682	#ifndef EBCDIC /* EBCDIC can't overflow */
	683	if (UNLIKELY(overflowed)) {
	684	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
	685	goto malformed;
	686	}
	687	#endif
	688
	689	if (do_overlong_test
	690	&& expectlen > (STRLEN) OFFUNISKIP(uv)
	691	&& ! (flags & UTF8_ALLOW_LONG))
	692	{
	693	/* The overlong malformation has lower precedence than the others.
	694	* Note that if this malformation is allowed, we return the actual
	695	* value, instead of the replacement character. This is because this
	696	* value is actually well-defined. */
	697	if (! (flags & UTF8_CHECK_ONLY)) {
	698	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
	699	}
	700	goto malformed;
	701	}
	702
	703	/* Here, the input is considered to be well-formed, but it still could be a
	704	* problematic code point that is not allowed by the input parameters. */
	705	if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
	706	&& (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
	707	\|UTF8_WARN_ILLEGAL_INTERCHANGE)))
	708	{
	709	if (UNICODE_IS_SURROGATE(uv)) {
	710
	711	/* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
	712	* generation of the sv, since no warnings are raised under CHECK */
	713	if ((flags & (UTF8_WARN_SURROGATE\|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
	714	&& ckWARN_d(WARN_SURROGATE))
	715	{
	716	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
	717	pack_warn = packWARN(WARN_SURROGATE);
	718	}
	719	if (flags & UTF8_DISALLOW_SURROGATE) {
	720	goto disallowed;
	721	}
	722	}
	723	else if ((uv > PERL_UNICODE_MAX)) {
	724	if ((flags & (UTF8_WARN_SUPER\|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
	725	&& ckWARN_d(WARN_NON_UNICODE))
	726	{
	727	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
	728	pack_warn = packWARN(WARN_NON_UNICODE);
	729	}
	730	#ifndef EBCDIC /* EBCDIC always allows FE, FF */
	731
	732	/* The first byte being 0xFE or 0xFF is a subset of the SUPER code
	733	* points. We test for these after the regular SUPER ones, and
	734	* before possibly bailing out, so that the more dire warning
	735	* overrides the regular one, if applicable */
	736	if ((s0 & 0xFE) == 0xFE / matches both FE, FF */
	737	&& (flags & (UTF8_WARN_FE_FF\|UTF8_DISALLOW_FE_FF)))
	738	{
	739	if ((flags & (UTF8_WARN_FE_FF\|UTF8_CHECK_ONLY))
	740	== UTF8_WARN_FE_FF
	741	&& ckWARN_d(WARN_UTF8))
	742	{
	743	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv));
	744	pack_warn = packWARN(WARN_UTF8);
	745	}
	746	if (flags & UTF8_DISALLOW_FE_FF) {
	747	goto disallowed;
	748	}
	749	}
	750	#endif
	751	if (flags & UTF8_DISALLOW_SUPER) {
	752	goto disallowed;
	753	}
	754	}
	755	else if (UNICODE_IS_NONCHAR(uv)) {
	756	if ((flags & (UTF8_WARN_NONCHAR\|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
	757	&& ckWARN_d(WARN_NONCHAR))
	758	{
	759	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
	760	pack_warn = packWARN(WARN_NONCHAR);
	761	}
	762	if (flags & UTF8_DISALLOW_NONCHAR) {
	763	goto disallowed;
	764	}
	765	}
	766
	767	if (sv) {
	768	outlier_ret = uv; /* Note we don't bother to convert to native,
	769	as all the outlier code points are the same
	770	in both ASCII and EBCDIC */
	771	goto do_warn;
	772	}
	773
	774	/* Here, this is not considered a malformed character, so drop through
	775	* to return it */
	776	}
	777
	778	return UNI_TO_NATIVE(uv);
	779
	780	/* There are three cases which get to beyond this point. In all 3 cases:
	781	* <sv> if not null points to a string to print as a warning.
	782	* <curlen> is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
	783	* set.
	784	* <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
	785	* This is done by initializing it to 0, and changing it only
	786	* for case 1).
	787	* The 3 cases are:
	788	* 1) The input is valid but problematic, and to be warned about. The
	789	* return value is the resultant code point; <*retlen> is set to
	790	* <curlen>, the number of bytes that comprise the code point.
	791	* <pack_warn> contains the result of packWARN() for the warning
	792	* types. The entry point for this case is the label <do_warn>;
	793	* 2) The input is a valid code point but disallowed by the parameters to
	794	* this function. The return value is 0. If UTF8_CHECK_ONLY is set,
	795	* <*relen> is -1; otherwise it is <curlen>, the number of bytes that
	796	* comprise the code point. <pack_warn> contains the result of
	797	* packWARN() for the warning types. The entry point for this case is
	798	* the label <disallowed>.
	799	* 3) The input is malformed. The return value is 0. If UTF8_CHECK_ONLY
	800	* is set, <*relen> is -1; otherwise it is <curlen>, the number of
	801	* bytes that comprise the malformation. All such malformations are
	802	* assumed to be warning type <utf8>. The entry point for this case
	803	* is the label <malformed>.
	804	*/
	805
	806	malformed:
	807
	808	if (sv && ckWARN_d(WARN_UTF8)) {
	809	pack_warn = packWARN(WARN_UTF8);
	810	}
	811
	812	disallowed:
	813
	814	if (flags & UTF8_CHECK_ONLY) {
	815	if (retlen)
	816	*retlen = ((STRLEN) -1);
	817	return 0;
	818	}
	819
	820	do_warn:
	821
	822	if (pack_warn) { /* <pack_warn> was initialized to 0, and changed only
	823	if warnings are to be raised. */
	824	const char * const string = SvPVX_const(sv);
	825
	826	if (PL_op)
	827	Perl_warner(aTHX_ pack_warn, "%s in %s", string, OP_DESC(PL_op));
	828	else
	829	Perl_warner(aTHX_ pack_warn, "%s", string);
	830	}
	831
	832	if (retlen) {
	833	*retlen = curlen;
	834	}
	835
	836	return outlier_ret;
	837	}
	838
	839	/*
	840	=for apidoc utf8_to_uvchr_buf
	841
	842	Returns the native code point of the first character in the string C<s> which
	843	is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
	844	C<*retlen> will be set to the length, in bytes, of that character.
	845
	846	If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
	847	enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
	848	NULL) to -1. If those warnings are off, the computed value, if well-defined
	849	(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
	850	C<*retlen> is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is
	851	the next possible position in C<s> that could begin a non-malformed character.
	852	See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
	853	returned.
	854
	855	=cut
	856	*/
	857
	858
	859	UV
	860	Perl_utf8_to_uvchr_buf(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
	861	{
	862	assert(s < send);
	863
	864	return utf8n_to_uvchr(s, send - s, retlen,
	865	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	866	}
	867
	868	/* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
	869	* there are no malformations in the input UTF-8 string C<s>. surrogates,
	870	* non-character code points, and non-Unicode code points are allowed. */
	871
	872	UV
	873	Perl_valid_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
	874	{
	875	UV expectlen = UTF8SKIP(s);
	876	const U8* send = s + expectlen;
	877	UV uv = *s;
	878
	879	PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
	880	PERL_UNUSED_CONTEXT;
	881
	882	if (retlen) {
	883	*retlen = expectlen;
	884	}
	885
	886	/* An invariant is trivially returned */
	887	if (expectlen == 1) {
	888	return uv;
	889	}
	890
	891	#ifdef EBCDIC
	892	uv = NATIVE_UTF8_TO_I8(uv);
	893	#endif
	894
	895	/* Remove the leading bits that indicate the number of bytes, leaving just
	896	* the bits that are part of the value */
	897	uv &= UTF_START_MASK(expectlen);
	898
	899	/* Now, loop through the remaining bytes, accumulating each into the
	900	* working total as we go. (I khw tried unrolling the loop for up to 4
	901	* bytes, but there was no performance improvement) */
	902	for (++s; s < send; s++) {
	903	uv = UTF8_ACCUMULATE(uv, *s);
	904	}
	905
	906	return UNI_TO_NATIVE(uv);
	907
	908	}
	909
	910	/*
	911	=for apidoc utf8_to_uvuni_buf
	912
	913	Only in very rare circumstances should code need to be dealing in Unicode
	914	(as opposed to native) code points. In those few cases, use
	915	C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
	916
	917	Returns the Unicode (not-native) code point of the first character in the
	918	string C<s> which
	919	is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
	920	C<retlen> will be set to the length, in bytes, of that character.
	921
	922	If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
	923	enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
	924	NULL) to -1. If those warnings are off, the computed value if well-defined (or
	925	the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
	926	is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
	927	next possible position in C<s> that could begin a non-malformed character.
	928	See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
	929
	930	=cut
	931	*/
	932
	933	UV
	934	Perl_utf8_to_uvuni_buf(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
	935	{
	936	PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
	937
	938	assert(send > s);
	939
	940	/* Call the low level routine asking for checks */
	941	return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
	942	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
	943	}
	944
	945	/*
	946	=for apidoc utf8_length
	947
	948	Return the length of the UTF-8 char encoded string C<s> in characters.
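	949	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	950	up past C<e>, raises a "Malformed UTF-8 character (unexpected end of
	950	string)" warning in the C<utf8> category and returns the count so far.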
	951
	952	=cut
	953	*/
	954
	955	STRLEN
	956	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	957	{
	958	STRLEN len = 0;
	959
	960	PERL_ARGS_ASSERT_UTF8_LENGTH;
	961
	962	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	963	* the bitops (especially ~) can create illegal UTF-8.
	964	* In other words: in Perl UTF-8 is not just for Unicode. */
	965
	966	if (e < s)
	967	goto warn_and_return;
	968	while (s < e) {
	969	s += UTF8SKIP(s);
	970	len++;
	971	}
	972
	973	if (e != s) {
	974	len--;
	975	warn_and_return:
	976	if (PL_op)
	977	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	978	"%s in %s", unees, OP_DESC(PL_op));
	979	else
	980	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	981	}
	982
	983	return len;
	984	}
	985
	986	/*
	987	=for apidoc utf8_distance
	988
	989	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	990	and C<b>.
	991
	992	WARNING: use only if you know that the pointers point inside the
	993	same UTF-8 buffer.
	994
	995	=cut
	996	*/
	997
	998	IV
	999	Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
	1000	{
	1001	PERL_ARGS_ASSERT_UTF8_DISTANCE;
	1002
	1003	return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
	1004	}
	1005
	1006	/*
	1007	=for apidoc utf8_hop
	1008
	1009	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	1010	forward or backward.
	1011
	1012	WARNING: do not use the following unless you know C<off> is within
	1013	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	1014	on the first byte of character or just after the last byte of a character.
	1015
	1016	=cut
	1017	*/
	1018
	1019	U8 *
	1020	Perl_utf8_hop(const U8 *s, I32 off)
	1021	{
	1022	PERL_ARGS_ASSERT_UTF8_HOP;
	1023
	1024	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	1025	* the bitops (especially ~) can create illegal UTF-8.
	1026	* In other words: in Perl UTF-8 is not just for Unicode. */
	1027
	1028	if (off >= 0) {
	1029	while (off--)
	1030	s += UTF8SKIP(s);
	1031	}
	1032	else {
	1033	while (off++) {
	1034	s--;
	1035	while (UTF8_IS_CONTINUATION(*s))
	1036	s--;
	1037	}
	1038	}
	1039	return (U8 *)s;
	1040	}
	1041
	1042	/*
	1043	=for apidoc bytes_cmp_utf8
	1044
	1045	Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
	1046	sequence of characters (stored as UTF-8)
	1047	in C<u>, C<ulen>. Returns 0 if they are
	1048	equal, -1 or -2 if the first string is less than the second string, +1 or +2
	1049	if the first string is greater than the second string.
	1050
	1051	-1 or +1 is returned if the shorter string was identical to the start of the
	1052	longer string. -2 or +2 is returned if
	1053	there was a difference between characters
	1054	within the strings.
	1055
	1056	=cut
	1057	*/
	1058
	1059	int
	1060	Perl_bytes_cmp_utf8(pTHX_ const U8 b, STRLEN blen, const U8 u, STRLEN ulen)
	1061	{
	1062	const U8 *const bend = b + blen;
	1063	const U8 *const uend = u + ulen;
	1064
	1065	PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
	1066
	1067	while (b < bend && u < uend) {
	1068	U8 c = *u++;
	1069	if (!UTF8_IS_INVARIANT(c)) {
	1070	if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	1071	if (u < uend) {
	1072	U8 c1 = *u++;
	1073	if (UTF8_IS_CONTINUATION(c1)) {
	1074	c = TWO_BYTE_UTF8_TO_NATIVE(c, c1);
	1075	} else {
	1076	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	1077	"Malformed UTF-8 character "
	1078	"(unexpected non-continuation byte 0x%02x"
	1079	", immediately after start byte 0x%02x)"
	1080	/* Dear diag.t, it's in the pod. */
	1081	"%s%s", c1, c,
	1082	PL_op ? " in " : "",
	1083	PL_op ? OP_DESC(PL_op) : "");
	1084	return -2;
	1085	}
	1086	} else {
	1087	if (PL_op)
	1088	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	1089	"%s in %s", unees, OP_DESC(PL_op));
	1090	else
	1091	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	1092	return -2; /* Really want to return undef :-) */
	1093	}
	1094	} else {
	1095	return -2;
	1096	}
	1097	}
	1098	if (*b != c) {
	1099	return *b < c ? -2 : +2;
	1100	}
	1101	++b;
	1102	}
	1103
	1104	if (b == bend && u == uend)
	1105	return 0;
	1106
	1107	return b < bend ? +1 : -1;
	1108	}
	1109
	1110	/*
	1111	=for apidoc utf8_to_bytes
	1112
	1113	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	1114	Unlike L</bytes_to_utf8>, this over-writes the original string, and
	1115	updates C<len> to contain the new length.
	1116	Returns zero on failure, setting C<len> to -1.
	1117
	1118	If you need a copy of the string, see L</bytes_from_utf8>.
	1119
	1120	=cut
	1121	*/
	1122
	1123	U8 *
	1124	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	1125	{
	1126	U8 * const save = s;
	1127	U8 * const send = s + *len;
	1128	U8 *d;
	1129
	1130	PERL_ARGS_ASSERT_UTF8_TO_BYTES;
	1131	PERL_UNUSED_CONTEXT;
	1132
	1133	/* ensure valid UTF-8 and chars < 256 before updating string */
	1134	while (s < send) {
	1135	if (! UTF8_IS_INVARIANT(*s)) {
	1136	if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
	1137	*len = ((STRLEN) -1);
	1138	return 0;
	1139	}
	1140	s++;
	1141	}
	1142	s++;
	1143	}
	1144
	1145	d = s = save;
	1146	while (s < send) {
	1147	U8 c = *s++;
	1148	if (! UTF8_IS_INVARIANT(c)) {
	1149	/* Then it is two-byte encoded */
	1150	c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
	1151	s++;
	1152	}
	1153	*d++ = c;
	1154	}
	1155	*d = '\0';
	1156	*len = d - save;
	1157	return save;
	1158	}
	1159
	1160	/*
	1161	=for apidoc bytes_from_utf8
	1162
	1163	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	1164	Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
	1165	the newly-created string, and updates C<len> to contain the new
	1166	length. Returns the original string if no conversion occurs, C<len>
	1167	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	1168	0 if C<s> is converted or consisted entirely of characters that are invariant
	1169	in utf8 (i.e., US-ASCII on non-EBCDIC machines).
	1170
	1171	=cut
	1172	*/
	1173
	1174	U8 *
	1175	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN len, bool *is_utf8)
	1176	{
	1177	U8 *d;
	1178	const U8 *start = s;
	1179	const U8 *send;
	1180	I32 count = 0;
	1181
	1182	PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
	1183	PERL_UNUSED_CONTEXT;
	1184	if (!*is_utf8)
	1185	return (U8 *)start;
	1186
	1187	/* ensure valid UTF-8 and chars < 256 before converting string */
	1188	for (send = s + *len; s < send;) {
	1189	if (! UTF8_IS_INVARIANT(*s)) {
	1190	if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
	1191	return (U8 *)start;
	1192	}
	1193	count++;
	1194	s++;
	1195	}
	1196	s++;
	1197	}
	1198
	1199	*is_utf8 = FALSE;
	1200
	1201	Newx(d, (*len) - count + 1, U8);
	1202	s = start; start = d;
	1203	while (s < send) {
	1204	U8 c = *s++;
	1205	if (! UTF8_IS_INVARIANT(c)) {
	1206	/* Then it is two-byte encoded */
	1207	c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
	1208	s++;
	1209	}
	1210	*d++ = c;
	1211	}
	1212	*d = '\0';
	1213	*len = d - start;
	1214	return (U8 *)start;
	1215	}
	1216
	1217	/*
	1218	=for apidoc bytes_to_utf8
	1219
	1220	Converts a string C<s> of length C<len> bytes from the native encoding into
	1221	UTF-8.
	1222	Returns a pointer to the newly-created string, and sets C<len> to
	1223	reflect the new length in bytes.
	1224
	1225	A C<NUL> character will be written after the end of the string.
	1226
	1227	If you want to convert to UTF-8 from encodings other than
	1228	the native (Latin1 or EBCDIC),
	1229	see L</sv_recode_to_utf8>().
	1230
	1231	=cut
	1232	*/
	1233
	1234	/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
	1235	likewise need duplication. */
	1236
	1237	U8*
	1238	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN len)
	1239	{
	1240	const U8 * const send = s + (*len);
	1241	U8 *d;
	1242	U8 *dst;
	1243
	1244	PERL_ARGS_ASSERT_BYTES_TO_UTF8;
	1245	PERL_UNUSED_CONTEXT;
	1246
	1247	Newx(d, (len) 2 + 1, U8);
	1248	dst = d;
	1249
	1250	while (s < send) {
	1251	append_utf8_from_native_byte(*s, &d);
	1252	s++;
	1253	}
	1254	*d = '\0';
	1255	*len = d-dst;
	1256	return dst;
	1257	}
	1258
	1259	/*
	1260	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	1261	*
	1262	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	1263	* We optimize for native, for obvious reasons. */
	1264
	1265	U8*
	1266	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1267	{
	1268	U8* pend;
	1269	U8* dstart = d;
	1270
	1271	PERL_ARGS_ASSERT_UTF16_TO_UTF8;
	1272
	1273	if (bytelen & 1)
	1274	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
	1275
	1276	pend = p + bytelen;
	1277
	1278	while (p < pend) {
	1279	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	1280	p += 2;
	1281	if (UNI_IS_INVARIANT(uv)) {
	1282	*d++ = LATIN1_TO_NATIVE((U8) uv);
	1283	continue;
	1284	}
	1285	if (uv <= MAX_UTF8_TWO_BYTE) {
	1286	*d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
	1287	*d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
	1288	continue;
	1289	}
	1290	#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
	1291	#define LAST_HIGH_SURROGATE 0xDBFF
	1292	#define FIRST_LOW_SURROGATE 0xDC00
	1293	#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
	1294	if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) {
	1295	if (p >= pend) {
	1296	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1297	} else {
	1298	UV low = (p[0] << 8) + p[1];
	1299	p += 2;
	1300	if (low < FIRST_LOW_SURROGATE \|\| low > LAST_LOW_SURROGATE)
	1301	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1302	uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
	1303	+ (low - FIRST_LOW_SURROGATE) + 0x10000;
	1304	}
	1305	} else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) {
	1306	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1307	}
	1308	#ifdef EBCDIC
	1309	d = uvoffuni_to_utf8_flags(d, uv, 0);
	1310	#else
	1311	if (uv < 0x10000) {
	1312	*d++ = (U8)(( uv >> 12) \| 0xe0);
	1313	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1314	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1315	continue;
	1316	}
	1317	else {
	1318	*d++ = (U8)(( uv >> 18) \| 0xf0);
	1319	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	1320	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1321	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1322	continue;
	1323	}
	1324	#endif
	1325	}
	1326	*newlen = d - dstart;
	1327	return d;
	1328	}
	1329
	1330	/* Note: this one is slightly destructive of the source. */
	1331
	1332	U8*
	1333	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1334	{
	1335	U8* s = (U8*)p;
	1336	U8* const send = s + bytelen;
	1337
	1338	PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
	1339
	1340	if (bytelen & 1)
	1341	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
	1342	(UV)bytelen);
	1343
	1344	while (s < send) {
	1345	const U8 tmp = s[0];
	1346	s[0] = s[1];
	1347	s[1] = tmp;
	1348	s += 2;
	1349	}
	1350	return utf16_to_utf8(p, d, bytelen, newlen);
	1351	}
	1352
	1353	bool
	1354	Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
	1355	{
	1356	U8 tmpbuf[UTF8_MAXBYTES+1];
	1357	uvchr_to_utf8(tmpbuf, c);
	1358	return _is_utf8_FOO(classnum, tmpbuf);
	1359	}
	1360
	1361	/* Internal function so we can deprecate the external one, and call
	1362	this one from other deprecated functions in this file */
	1363
	1364	bool
	1365	Perl__is_utf8_idstart(pTHX_ const U8 *p)
	1366	{
	1367	PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
	1368
	1369	if (*p == '_')
	1370	return TRUE;
	1371	return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
	1372	}
	1373
	1374	bool
	1375	Perl__is_uni_perl_idcont(pTHX_ UV c)
	1376	{
	1377	U8 tmpbuf[UTF8_MAXBYTES+1];
	1378	uvchr_to_utf8(tmpbuf, c);
	1379	return _is_utf8_perl_idcont(tmpbuf);
	1380	}
	1381
	1382	bool
	1383	Perl__is_uni_perl_idstart(pTHX_ UV c)
	1384	{
	1385	U8 tmpbuf[UTF8_MAXBYTES+1];
	1386	uvchr_to_utf8(tmpbuf, c);
	1387	return _is_utf8_perl_idstart(tmpbuf);
	1388	}
	1389
	1390	UV
	1391	Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
	1392	{
	1393	/* We have the latin1-range values compiled into the core, so just use
	1394	* those, converting the result to utf8. The only difference between upper
	1395	* and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
	1396	* either "SS" or "Ss". Which one to use is passed into the routine in
	1397	* 'S_or_s' to avoid a test */
	1398
	1399	UV converted = toUPPER_LATIN1_MOD(c);
	1400
	1401	PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
	1402
	1403	assert(S_or_s == 'S' \|\| S_or_s == 's');
	1404
	1405	if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
	1406	characters in this range */
	1407	*p = (U8) converted;
	1408	*lenp = 1;
	1409	return converted;
	1410	}
	1411
	1412	/* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
	1413	* which it maps to one of them, so as to only have to have one check for
	1414	* it in the main case */
	1415	if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
	1416	switch (c) {
	1417	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	1418	converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
	1419	break;
	1420	case MICRO_SIGN:
	1421	converted = GREEK_CAPITAL_LETTER_MU;
	1422	break;
	1423	case LATIN_SMALL_LETTER_SHARP_S:
	1424	*(p)++ = 'S';
	1425	*p = S_or_s;
	1426	*lenp = 2;
	1427	return 'S';
	1428	default:
	1429	Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
	1430	assert(0); /* NOTREACHED */
	1431	}
	1432	}
	1433
	1434	*(p)++ = UTF8_TWO_BYTE_HI(converted);
	1435	*p = UTF8_TWO_BYTE_LO(converted);
	1436	*lenp = 2;
	1437
	1438	return converted;
	1439	}
	1440
	1441	/* Call the function to convert a UTF-8 encoded character to the specified case.
	1442	* Note that there may be more than one character in the result.
	1443	* INP is a pointer to the first byte of the input character
	1444	* OUTP will be set to the first byte of the string of changed characters. It
	1445	* needs to have space for UTF8_MAXBYTES_CASE+1 bytes
	1446	* LENP will be set to the length in bytes of the string of changed characters
	1447	*
	1448	* The functions return the ordinal of the first character in the string of OUTP */
	1449	#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "")
	1450	#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "")
	1451	#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "")
	1452
	1453	/* This additionally has the input parameter SPECIALS, which if non-zero will
	1454	* cause this to use the SPECIALS hash for folding (meaning get full case
	1455	* folding); otherwise, when zero, this implies a simple case fold */
	1456	#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "" : NULL)
	1457
	1458	UV
	1459	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	1460	{
	1461	/* Convert the Unicode character whose ordinal is <c> to its uppercase
	1462	* version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
	1463	* Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1464	* the changed version may be longer than the original character.
	1465	*
	1466	* The ordinal of the first character of the changed version is returned
	1467	* (but note, as explained above, that there may be more.) */
	1468
	1469	PERL_ARGS_ASSERT_TO_UNI_UPPER;
	1470
	1471	if (c < 256) {
	1472	return _to_upper_title_latin1((U8) c, p, lenp, 'S');
	1473	}
	1474
	1475	uvchr_to_utf8(p, c);
	1476	return CALL_UPPER_CASE(p, p, lenp);
	1477	}
	1478
	1479	UV
	1480	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	1481	{
	1482	PERL_ARGS_ASSERT_TO_UNI_TITLE;
	1483
	1484	if (c < 256) {
	1485	return _to_upper_title_latin1((U8) c, p, lenp, 's');
	1486	}
	1487
	1488	uvchr_to_utf8(p, c);
	1489	return CALL_TITLE_CASE(p, p, lenp);
	1490	}
	1491
	1492	STATIC U8
	1493	S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp)
	1494	{
	1495	/* We have the latin1-range values compiled into the core, so just use
	1496	* those, converting the result to utf8. Since the result is always just
	1497	* one character, we allow <p> to be NULL */
	1498
	1499	U8 converted = toLOWER_LATIN1(c);
	1500
	1501	if (p != NULL) {
	1502	if (NATIVE_BYTE_IS_INVARIANT(converted)) {
	1503	*p = converted;
	1504	*lenp = 1;
	1505	}
	1506	else {
	1507	/* Result is known to always be < 256, so can use the EIGHT_BIT
	1508	* macros */
	1509	*p = UTF8_EIGHT_BIT_HI(converted);
	1510	*(p+1) = UTF8_EIGHT_BIT_LO(converted);
	1511	*lenp = 2;
	1512	}
	1513	}
	1514	return converted;
	1515	}
	1516
	1517	UV
	1518	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1519	{
	1520	PERL_ARGS_ASSERT_TO_UNI_LOWER;
	1521
	1522	if (c < 256) {
	1523	return to_lower_latin1((U8) c, p, lenp);
	1524	}
	1525
	1526	uvchr_to_utf8(p, c);
	1527	return CALL_LOWER_CASE(p, p, lenp);
	1528	}
	1529
	1530	UV
	1531	Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
	1532	{
	1533	/* Corresponds to to_lower_latin1(); <flags> bits meanings:
	1534	* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
	1535	* FOLD_FLAGS_FULL iff full folding is to be used;
	1536	*
	1537	* Not to be used for locale folds
	1538	*/
	1539
	1540	UV converted;
	1541
	1542	PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
	1543	PERL_UNUSED_CONTEXT;
	1544
	1545	assert (! (flags & FOLD_FLAGS_LOCALE));
	1546
	1547	if (c == MICRO_SIGN) {
	1548	converted = GREEK_SMALL_LETTER_MU;
	1549	}
	1550	else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
	1551
	1552	/* If can't cross 127/128 boundary, can't return "ss"; instead return
	1553	* two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
	1554	* under those circumstances. */
	1555	if (flags & FOLD_FLAGS_NOMIX_ASCII) {
	1556	lenp = 2 sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
	1557	Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
	1558	p, *lenp, U8);
	1559	return LATIN_SMALL_LETTER_LONG_S;
	1560	}
	1561	else {
	1562	*(p)++ = 's';
	1563	*p = 's';
	1564	*lenp = 2;
	1565	return 's';
	1566	}
	1567	}
	1568	else { /* In this range the fold of all other characters is their lower
	1569	case */
	1570	converted = toLOWER_LATIN1(c);
	1571	}
	1572
	1573	if (UVCHR_IS_INVARIANT(converted)) {
	1574	*p = (U8) converted;
	1575	*lenp = 1;
	1576	}
	1577	else {
	1578	*(p)++ = UTF8_TWO_BYTE_HI(converted);
	1579	*p = UTF8_TWO_BYTE_LO(converted);
	1580	*lenp = 2;
	1581	}
	1582
	1583	return converted;
	1584	}
	1585
	1586	UV
	1587	Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
	1588	{
	1589
	1590	/* Not currently externally documented, and subject to change
	1591	* <flags> bits meanings:
	1592	* FOLD_FLAGS_FULL iff full folding is to be used;
	1593	* FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
	1594	* locale are to be used.
	1595	* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
	1596	*/
	1597
	1598	PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
	1599
	1600	/* Tread a UTF-8 locale as not being in locale at all */
	1601	if (IN_UTF8_CTYPE_LOCALE) {
	1602	flags &= ~FOLD_FLAGS_LOCALE;
	1603	}
	1604
	1605	if (c < 256) {
	1606	UV result = _to_fold_latin1((U8) c, p, lenp,
	1607	flags & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII));
	1608	/* It is illegal for the fold to cross the 255/256 boundary under
	1609	* locale; in this case return the original */
	1610	return (result > 256 && flags & FOLD_FLAGS_LOCALE)
	1611	? c
	1612	: result;
	1613	}
	1614
	1615	/* If no special needs, just use the macro */
	1616	if ( ! (flags & (FOLD_FLAGS_LOCALE\|FOLD_FLAGS_NOMIX_ASCII))) {
	1617	uvchr_to_utf8(p, c);
	1618	return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
	1619	}
	1620	else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
	1621	the special flags. */
	1622	U8 utf8_c[UTF8_MAXBYTES + 1];
	1623	uvchr_to_utf8(utf8_c, c);
	1624	return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
	1625	}
	1626	}
	1627
	1628	PERL_STATIC_INLINE bool
	1629	S_is_utf8_common(pTHX_ const U8 const p, SV *swash,
	1630	const char const swashname, SV const invlist)
	1631	{
	1632	/* returns a boolean giving whether or not the UTF8-encoded character that
	1633	* starts at <p> is in the swash indicated by <swashname>. <swash>
	1634	* contains a pointer to where the swash indicated by <swashname>
	1635	* is to be stored; which this routine will do, so that future calls will
	1636	* look at <*swash> and only generate a swash if it is not null. <invlist>
	1637	* is NULL or an inversion list that defines the swash. If not null, it
	1638	* saves time during initialization of the swash.
	1639	*
	1640	* Note that it is assumed that the buffer length of <p> is enough to
	1641	* contain all the bytes that comprise the character. Thus, <*p> should
	1642	* have been checked before this call for mal-formedness enough to assure
	1643	* that. */
	1644
	1645	PERL_ARGS_ASSERT_IS_UTF8_COMMON;
	1646
	1647	/* The API should have included a length for the UTF-8 character in <p>,
	1648	* but it doesn't. We therefore assume that p has been validated at least
	1649	* as far as there being enough bytes available in it to accommodate the
	1650	* character without reading beyond the end, and pass that number on to the
	1651	* validating routine */
	1652	if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
	1653	if (ckWARN_d(WARN_UTF8)) {
	1654	Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
	1655	"Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
	1656	if (ckWARN(WARN_UTF8)) { /* This will output details as to the
	1657	what the malformation is */
	1658	utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
	1659	}
	1660	}
	1661	return FALSE;
	1662	}
	1663	if (!*swash) {
	1664	U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
	1665	*swash = _core_swash_init("utf8",
	1666
	1667	/* Only use the name if there is no inversion
	1668	* list; otherwise will go out to disk */
	1669	(invlist) ? "" : swashname,
	1670
	1671	&PL_sv_undef, 1, 0, invlist, &flags);
	1672	}
	1673
	1674	return swash_fetch(*swash, p, TRUE) != 0;
	1675	}
	1676
	1677	bool
	1678	Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
	1679	{
	1680	PERL_ARGS_ASSERT__IS_UTF8_FOO;
	1681
	1682	assert(classnum < _FIRST_NON_SWASH_CC);
	1683
	1684	return is_utf8_common(p,
	1685	&PL_utf8_swash_ptrs[classnum],
	1686	swash_property_names[classnum],
	1687	PL_XPosix_ptrs[classnum]);
	1688	}
	1689
	1690	bool
	1691	Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
	1692	{
	1693	SV* invlist = NULL;
	1694
	1695	PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
	1696
	1697	if (! PL_utf8_perl_idstart) {
	1698	invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
	1699	}
	1700	return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
	1701	}
	1702
	1703	bool
	1704	Perl__is_utf8_xidstart(pTHX_ const U8 *p)
	1705	{
	1706	PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
	1707
	1708	if (*p == '_')
	1709	return TRUE;
	1710	return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
	1711	}
	1712
	1713	bool
	1714	Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
	1715	{
	1716	SV* invlist = NULL;
	1717
	1718	PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
	1719
	1720	if (! PL_utf8_perl_idcont) {
	1721	invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
	1722	}
	1723	return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
	1724	}
	1725
	1726	bool
	1727	Perl__is_utf8_idcont(pTHX_ const U8 *p)
	1728	{
	1729	PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
	1730
	1731	return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
	1732	}
	1733
	1734	bool
	1735	Perl__is_utf8_xidcont(pTHX_ const U8 *p)
	1736	{
	1737	PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
	1738
	1739	return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
	1740	}
	1741
	1742	bool
	1743	Perl__is_utf8_mark(pTHX_ const U8 *p)
	1744	{
	1745	PERL_ARGS_ASSERT__IS_UTF8_MARK;
	1746
	1747	return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
	1748	}
	1749
	1750	/*
	1751	=for apidoc to_utf8_case
	1752
	1753	C<p> contains the pointer to the UTF-8 string encoding
	1754	the character that is being converted. This routine assumes that the character
	1755	at C<p> is well-formed.
	1756
	1757	C<ustrp> is a pointer to the character buffer to put the
	1758	conversion result to. C<lenp> is a pointer to the length
	1759	of the result.
	1760
	1761	C<swashp> is a pointer to the swash to use.
	1762
	1763	Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
	1764	and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>. C<special> (usually,
	1765	but not always, a multicharacter mapping), is tried first.
	1766
	1767	C<special> is a string, normally C<NULL> or C<"">. C<NULL> means to not use
	1768	any special mappings; C<""> means to use the special mappings. Values other
	1769	than these two are treated as the name of the hash containing the special
	1770	mappings, like C<"utf8::ToSpecLower">.
	1771
	1772	C<normal> is a string like "ToLower" which means the swash
	1773	%utf8::ToLower.
	1774
	1775	=cut */
	1776
	1777	UV
	1778	Perl_to_utf8_case(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp,
	1779	SV *swashp, const char normal, const char *special)
	1780	{
	1781	STRLEN len = 0;
	1782	const UV uv1 = valid_utf8_to_uvchr(p, NULL);
	1783
	1784	PERL_ARGS_ASSERT_TO_UTF8_CASE;
	1785
	1786	/* Note that swash_fetch() doesn't output warnings for these because it
	1787	* assumes we will */
	1788	if (uv1 >= UNICODE_SURROGATE_FIRST) {
	1789	if (uv1 <= UNICODE_SURROGATE_LAST) {
	1790	if (ckWARN_d(WARN_SURROGATE)) {
	1791	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	1792	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	1793	"Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
	1794	}
	1795	}
	1796	else if (UNICODE_IS_SUPER(uv1)) {
	1797	if (ckWARN_d(WARN_NON_UNICODE)) {
	1798	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	1799	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	1800	"Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
	1801	}
	1802	}
	1803
	1804	/* Note that non-characters are perfectly legal, so no warning should
	1805	* be given */
	1806	}
	1807
	1808	if (!swashp) / load on-demand */
	1809	*swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
	1810
	1811	if (special) {
	1812	/* It might be "special" (sometimes, but not always,
	1813	* a multicharacter mapping) */
	1814	HV *hv = NULL;
	1815	SV **svp;
	1816
	1817	/* If passed in the specials name, use that; otherwise use any
	1818	* given in the swash */
	1819	if (*special != '\0') {
	1820	hv = get_hv(special, 0);
	1821	}
	1822	else {
	1823	svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
	1824	if (svp) {
	1825	hv = MUTABLE_HV(SvRV(*svp));
	1826	}
	1827	}
	1828
	1829	if (hv
	1830	&& (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE))
	1831	&& (*svp))
	1832	{
	1833	const char *s;
	1834
	1835	s = SvPV_const(*svp, len);
	1836	if (len == 1)
	1837	/* EIGHTBIT */
	1838	len = uvchr_to_utf8(ustrp, (U8)s) - ustrp;
	1839	else {
	1840	Copy(s, ustrp, len, U8);
	1841	}
	1842	}
	1843	}
	1844
	1845	if (!len && *swashp) {
	1846	const UV uv2 = swash_fetch(swashp, p, TRUE / => is utf8 */);
	1847
	1848	if (uv2) {
	1849	/* It was "normal" (a single character mapping). */
	1850	len = uvchr_to_utf8(ustrp, uv2) - ustrp;
	1851	}
	1852	}
	1853
	1854	if (len) {
	1855	if (lenp) {
	1856	*lenp = len;
	1857	}
	1858	return valid_utf8_to_uvchr(ustrp, 0);
	1859	}
	1860
	1861	/* Here, there was no mapping defined, which means that the code point maps
	1862	* to itself. Return the inputs */
	1863	len = UTF8SKIP(p);
	1864	if (p != ustrp) { /* Don't copy onto itself */
	1865	Copy(p, ustrp, len, U8);
	1866	}
	1867
	1868	if (lenp)
	1869	*lenp = len;
	1870
	1871	return uv1;
	1872
	1873	}
	1874
	1875	STATIC UV
	1876	S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
	1877	{
	1878	/* This is called when changing the case of a utf8-encoded character above
	1879	* the Latin1 range, and the operation is in a non-UTF-8 locale. If the
	1880	* result contains a character that crosses the 255/256 boundary, disallow
	1881	* the change, and return the original code point. See L<perlfunc/lc> for
	1882	* why;
	1883	*
	1884	* p points to the original string whose case was changed; assumed
	1885	* by this routine to be well-formed
	1886	* result the code point of the first character in the changed-case string
	1887	* ustrp points to the changed-case string (<result> represents its first char)
	1888	* lenp points to the length of <ustrp> */
	1889
	1890	UV original; /* To store the first code point of <p> */
	1891
	1892	PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
	1893
	1894	assert(UTF8_IS_ABOVE_LATIN1(*p));
	1895
	1896	/* We know immediately if the first character in the string crosses the
	1897	* boundary, so can skip */
	1898	if (result > 255) {
	1899
	1900	/* Look at every character in the result; if any cross the
	1901	* boundary, the whole thing is disallowed */
	1902	U8* s = ustrp + UTF8SKIP(ustrp);
	1903	U8* e = ustrp + *lenp;
	1904	while (s < e) {
	1905	if (! UTF8_IS_ABOVE_LATIN1(*s)) {
	1906	goto bad_crossing;
	1907	}
	1908	s += UTF8SKIP(s);
	1909	}
	1910
	1911	/* Here, no characters crossed, result is ok as-is */
	1912	return result;
	1913	}
	1914
	1915	bad_crossing:
	1916
	1917	/* Failed, have to return the original */
	1918	original = valid_utf8_to_uvchr(p, lenp);
	1919	Copy(p, ustrp, *lenp, char);
	1920	return original;
	1921	}
	1922
	1923	/*
	1924	=for apidoc to_utf8_upper
	1925
	1926	Instead use L</toUPPER_utf8>.
	1927
	1928	=cut */
	1929
	1930	/* Not currently externally documented, and subject to change:
	1931	* <flags> is set iff the rules from the current underlying locale are to
	1932	* be used. */
	1933
	1934	UV
	1935	Perl__to_utf8_upper_flags(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp, bool flags)
	1936	{
	1937	UV result;
	1938
	1939	PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
	1940
	1941	if (flags && IN_UTF8_CTYPE_LOCALE) {
	1942	flags = FALSE;
	1943	}
	1944
	1945	if (UTF8_IS_INVARIANT(*p)) {
	1946	if (flags) {
	1947	result = toUPPER_LC(*p);
	1948	}
	1949	else {
	1950	return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
	1951	}
	1952	}
	1953	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
	1954	if (flags) {
	1955	U8 c = TWO_BYTE_UTF8_TO_NATIVE(p, (p+1));
	1956	result = toUPPER_LC(c);
	1957	}
	1958	else {
	1959	return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(p, (p+1)),
	1960	ustrp, lenp, 'S');
	1961	}
	1962	}
	1963	else { /* utf8, ord above 255 */
	1964	result = CALL_UPPER_CASE(p, ustrp, lenp);
	1965
	1966	if (flags) {
	1967	result = check_locale_boundary_crossing(p, result, ustrp, lenp);
	1968	}
	1969	return result;
	1970	}
	1971
	1972	/* Here, used locale rules. Convert back to utf8 */
	1973	if (UTF8_IS_INVARIANT(result)) {
	1974	*ustrp = (U8) result;
	1975	*lenp = 1;
	1976	}
	1977	else {
	1978	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
	1979	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
	1980	*lenp = 2;
	1981	}
	1982
	1983	return result;
	1984	}
	1985
	1986	/*
	1987	=for apidoc to_utf8_title
	1988
	1989	Instead use L</toTITLE_utf8>.
	1990
	1991	=cut */
	1992
	1993	/* Not currently externally documented, and subject to change:
	1994	* <flags> is set iff the rules from the current underlying locale are to be
	1995	* used. Since titlecase is not defined in POSIX, for other than a
	1996	* UTF-8 locale, uppercase is used instead for code points < 256.
	1997	*/
	1998
	1999	UV
	2000	Perl__to_utf8_title_flags(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp, bool flags)
	2001	{
	2002	UV result;
	2003
	2004	PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
	2005
	2006	if (flags && IN_UTF8_CTYPE_LOCALE) {
	2007	flags = FALSE;
	2008	}
	2009
	2010	if (UTF8_IS_INVARIANT(*p)) {
	2011	if (flags) {
	2012	result = toUPPER_LC(*p);
	2013	}
	2014	else {
	2015	return _to_upper_title_latin1(*p, ustrp, lenp, 's');
	2016	}
	2017	}
	2018	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
	2019	if (flags) {
	2020	U8 c = TWO_BYTE_UTF8_TO_NATIVE(p, (p+1));
	2021	result = toUPPER_LC(c);
	2022	}
	2023	else {
	2024	return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(p, (p+1)),
	2025	ustrp, lenp, 's');
	2026	}
	2027	}
	2028	else { /* utf8, ord above 255 */
	2029	result = CALL_TITLE_CASE(p, ustrp, lenp);
	2030
	2031	if (flags) {
	2032	result = check_locale_boundary_crossing(p, result, ustrp, lenp);
	2033	}
	2034	return result;
	2035	}
	2036
	2037	/* Here, used locale rules. Convert back to utf8 */
	2038	if (UTF8_IS_INVARIANT(result)) {
	2039	*ustrp = (U8) result;
	2040	*lenp = 1;
	2041	}
	2042	else {
	2043	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
	2044	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
	2045	*lenp = 2;
	2046	}
	2047
	2048	return result;
	2049	}
	2050
	2051	/*
	2052	=for apidoc to_utf8_lower
	2053
	2054	Instead use L</toLOWER_utf8>.
	2055
	2056	=cut */
	2057
	2058	/* Not currently externally documented, and subject to change:
	2059	* <flags> is set iff the rules from the current underlying locale are to
	2060	* be used.
	2061	*/
	2062
	2063	UV
	2064	Perl__to_utf8_lower_flags(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp, bool flags)
	2065	{
	2066	UV result;
	2067
	2068	PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
	2069
	2070	if (flags && IN_UTF8_CTYPE_LOCALE) {
	2071	flags = FALSE;
	2072	}
	2073
	2074	if (UTF8_IS_INVARIANT(*p)) {
	2075	if (flags) {
	2076	result = toLOWER_LC(*p);
	2077	}
	2078	else {
	2079	return to_lower_latin1(*p, ustrp, lenp);
	2080	}
	2081	}
	2082	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
	2083	if (flags) {
	2084	U8 c = TWO_BYTE_UTF8_TO_NATIVE(p, (p+1));
	2085	result = toLOWER_LC(c);
	2086	}
	2087	else {
	2088	return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(p, (p+1)),
	2089	ustrp, lenp);
	2090	}
	2091	}
	2092	else { /* utf8, ord above 255 */
	2093	result = CALL_LOWER_CASE(p, ustrp, lenp);
	2094
	2095	if (flags) {
	2096	result = check_locale_boundary_crossing(p, result, ustrp, lenp);
	2097	}
	2098
	2099	return result;
	2100	}
	2101
	2102	/* Here, used locale rules. Convert back to utf8 */
	2103	if (UTF8_IS_INVARIANT(result)) {
	2104	*ustrp = (U8) result;
	2105	*lenp = 1;
	2106	}
	2107	else {
	2108	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
	2109	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
	2110	*lenp = 2;
	2111	}
	2112
	2113	return result;
	2114	}
	2115
	2116	/*
	2117	=for apidoc to_utf8_fold
	2118
	2119	Instead use L</toFOLD_utf8>.
	2120
	2121	=cut */
	2122
	2123	/* Not currently externally documented, and subject to change,
	2124	* in <flags>
	2125	* bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
	2126	* locale are to be used.
	2127	* bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
	2128	* otherwise simple folds
	2129	* bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
	2130	* prohibited
	2131	*/
	2132
	2133	UV
	2134	Perl__to_utf8_fold_flags(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp, U8 flags)
	2135	{
	2136	UV result;
	2137
	2138	PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
	2139
	2140	/* These are mutually exclusive */
	2141	assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
	2142
	2143	assert(p != ustrp); /* Otherwise overwrites */
	2144
	2145	if (flags & FOLD_FLAGS_LOCALE && IN_UTF8_CTYPE_LOCALE) {
	2146	flags &= ~FOLD_FLAGS_LOCALE;
	2147	}
	2148
	2149	if (UTF8_IS_INVARIANT(*p)) {
	2150	if (flags & FOLD_FLAGS_LOCALE) {
	2151	result = toFOLD_LC(*p);
	2152	}
	2153	else {
	2154	return _to_fold_latin1(*p, ustrp, lenp,
	2155	flags & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII));
	2156	}
	2157	}
	2158	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
	2159	if (flags & FOLD_FLAGS_LOCALE) {
	2160	U8 c = TWO_BYTE_UTF8_TO_NATIVE(p, (p+1));
	2161	result = toFOLD_LC(c);
	2162	}
	2163	else {
	2164	return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(p, (p+1)),
	2165	ustrp, lenp,
	2166	flags & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII));
	2167	}
	2168	}
	2169	else { /* utf8, ord above 255 */
	2170	result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
	2171
	2172	if (flags & FOLD_FLAGS_LOCALE) {
	2173
	2174	/* Special case these two characters, as what normally gets
	2175	* returned under locale doesn't work */
	2176	if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1
	2177	&& memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8,
	2178	sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1))
	2179	{
	2180	goto return_long_s;
	2181	}
	2182	else if (UTF8SKIP(p) == sizeof(LATIN_SMALL_LIGATURE_LONG_S_T) - 1
	2183	&& memEQ((char *) p, LATIN_SMALL_LIGATURE_LONG_S_T_UTF8,
	2184	sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1))
	2185	{
	2186	goto return_ligature_st;
	2187	}
	2188	return check_locale_boundary_crossing(p, result, ustrp, lenp);
	2189	}
	2190	else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
	2191	return result;
	2192	}
	2193	else {
	2194	/* This is called when changing the case of a utf8-encoded
	2195	* character above the ASCII range, and the result should not
	2196	* contain an ASCII character. */
	2197
	2198	UV original; /* To store the first code point of <p> */
	2199
	2200	/* Look at every character in the result; if any cross the
	2201	* boundary, the whole thing is disallowed */
	2202	U8* s = ustrp;
	2203	U8* e = ustrp + *lenp;
	2204	while (s < e) {
	2205	if (isASCII(*s)) {
	2206	/* Crossed, have to return the original */
	2207	original = valid_utf8_to_uvchr(p, lenp);
	2208
	2209	/* But in these instances, there is an alternative we can
	2210	* return that is valid */
	2211	if (original == LATIN_CAPITAL_LETTER_SHARP_S
	2212	\|\| original == LATIN_SMALL_LETTER_SHARP_S)
	2213	{
	2214	goto return_long_s;
	2215	}
	2216	else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
	2217	goto return_ligature_st;
	2218	}
	2219	Copy(p, ustrp, *lenp, char);
	2220	return original;
	2221	}
	2222	s += UTF8SKIP(s);
	2223	}
	2224
	2225	/* Here, no characters crossed, result is ok as-is */
	2226	return result;
	2227	}
	2228	}
	2229
	2230	/* Here, used locale rules. Convert back to utf8 */
	2231	if (UTF8_IS_INVARIANT(result)) {
	2232	*ustrp = (U8) result;
	2233	*lenp = 1;
	2234	}
	2235	else {
	2236	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
	2237	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
	2238	*lenp = 2;
	2239	}
	2240
	2241	return result;
	2242
	2243	return_long_s:
	2244	/* Certain folds to 'ss' are prohibited by the options, but they do allow
	2245	* folds to a string of two of these characters. By returning this
	2246	* instead, then, e.g.,
	2247	* fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
	2248	* works. */
	2249
	2250	lenp = 2 sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
	2251	Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
	2252	ustrp, *lenp, U8);
	2253	return LATIN_SMALL_LETTER_LONG_S;
	2254
	2255	return_ligature_st:
	2256	/* Two folds to 'st' are prohibited by the options; instead we pick one and
	2257	* have the other one fold to it */
	2258
	2259	*lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
	2260	Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
	2261	return LATIN_SMALL_LIGATURE_ST;
	2262	}
	2263
	2264	/* Note:
	2265	* Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
	2266	* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
	2267	* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
	2268	*/
	2269
	2270	SV*
	2271	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
	2272	{
	2273	PERL_ARGS_ASSERT_SWASH_INIT;
	2274
	2275	/* Returns a copy of a swash initiated by the called function. This is the
	2276	* public interface, and returning a copy prevents others from doing
	2277	* mischief on the original */
	2278
	2279	return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
	2280	}
	2281
	2282	SV*
	2283	Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV listsv, I32 minbits, I32 none, SV invlist, U8* const flags_p)
	2284	{
	2285
	2286	/*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
	2287	* use the following define */
	2288
	2289	#define CORE_SWASH_INIT_RETURN(x) \
	2290	PL_curpm= old_PL_curpm; \
	2291	return x
	2292
	2293	/* Initialize and return a swash, creating it if necessary. It does this
	2294	* by calling utf8_heavy.pl in the general case. The returned value may be
	2295	* the swash's inversion list instead if the input parameters allow it.
	2296	* Which is returned should be immaterial to callers, as the only
	2297	* operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
	2298	* and swash_to_invlist() handle both these transparently.
	2299	*
	2300	* This interface should only be used by functions that won't destroy or
	2301	* adversely change the swash, as doing so affects all other uses of the
	2302	* swash in the program; the general public should use 'Perl_swash_init'
	2303	* instead.
	2304	*
	2305	* pkg is the name of the package that <name> should be in.
	2306	* name is the name of the swash to find. Typically it is a Unicode
	2307	* property name, including user-defined ones
	2308	* listsv is a string to initialize the swash with. It must be of the form
	2309	* documented as the subroutine return value in
	2310	* L<perlunicode/User-Defined Character Properties>
	2311	* minbits is the number of bits required to represent each data element.
	2312	* It is '1' for binary properties.
	2313	* none I (khw) do not understand this one, but it is used only in tr///.
	2314	* invlist is an inversion list to initialize the swash with (or NULL)
	2315	* flags_p if non-NULL is the address of various input and output flag bits
	2316	* to the routine, as follows: ('I' means is input to the routine;
	2317	* 'O' means output from the routine. Only flags marked O are
	2318	* meaningful on return.)
	2319	* _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
	2320	* came from a user-defined property. (I O)
	2321	* _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
	2322	* when the swash cannot be located, to simply return NULL. (I)
	2323	* _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
	2324	* return of an inversion list instead of a swash hash if this routine
	2325	* thinks that would result in faster execution of swash_fetch() later
	2326	* on. (I)
	2327	*
	2328	* Thus there are three possible inputs to find the swash: <name>,
	2329	* <listsv>, and <invlist>. At least one must be specified. The result
	2330	* will be the union of the specified ones, although <listsv>'s various
	2331	* actions can intersect, etc. what <name> gives. To avoid going out to
	2332	* disk at all, <invlist> should specify completely what the swash should
	2333	* have, and <listsv> should be &PL_sv_undef and <name> should be "".
	2334	*
	2335	* <invlist> is only valid for binary properties */
	2336
	2337	PMOP old_PL_curpm= PL_curpm; / save away the old PL_curpm */
	2338
	2339	SV* retval = &PL_sv_undef;
	2340	HV* swash_hv = NULL;
	2341	const int invlist_swash_boundary =
	2342	(flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
	2343	? 512 /* Based on some benchmarking, but not extensive, see commit
	2344	message */
	2345	: -1; /* Never return just an inversion list */
	2346
	2347	assert(listsv != &PL_sv_undef \|\| strNE(name, "") \|\| invlist);
	2348	assert(! invlist \|\| minbits == 1);
	2349
	2350	PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
	2351	that triggered the swash init and the swash init perl logic itself.
	2352	See perl #122747 */
	2353
	2354	/* If data was passed in to go out to utf8_heavy to find the swash of, do
	2355	* so */
	2356	if (listsv != &PL_sv_undef \|\| strNE(name, "")) {
	2357	dSP;
	2358	const size_t pkg_len = strlen(pkg);
	2359	const size_t name_len = strlen(name);
	2360	HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
	2361	SV* errsv_save;
	2362	GV *method;
	2363
	2364	PERL_ARGS_ASSERT__CORE_SWASH_INIT;
	2365
	2366	PUSHSTACKi(PERLSI_MAGIC);
	2367	ENTER;
	2368	SAVEHINTS();
	2369	/* We might get here via a subroutine signature which uses a utf8
	2370	* parameter name, at which point PL_subname will have been set
	2371	* but not yet used. */
	2372	save_item(PL_subname);
	2373	if (PL_parser && PL_parser->error_count)
	2374	SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
	2375	method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
	2376	if (!method) { /* demand load utf8 */
	2377	ENTER;
	2378	if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
	2379	GvSV(PL_errgv) = NULL;
	2380	#ifndef NO_TAINT_SUPPORT
	2381	/* It is assumed that callers of this routine are not passing in
	2382	* any user derived data. */
	2383	SAVEBOOL(TAINT_get);
	2384	TAINT_NOT;
	2385	#endif
	2386	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	2387	NULL);
	2388	{
	2389	/* Not ERRSV, as there is no need to vivify a scalar we are
	2390	about to discard. */
	2391	SV * const errsv = GvSV(PL_errgv);
	2392	if (!SvTRUE(errsv)) {
	2393	GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
	2394	SvREFCNT_dec(errsv);
	2395	}
	2396	}
	2397	LEAVE;
	2398	}
	2399	SPAGAIN;
	2400	PUSHMARK(SP);
	2401	EXTEND(SP,5);
	2402	mPUSHp(pkg, pkg_len);
	2403	mPUSHp(name, name_len);
	2404	PUSHs(listsv);
	2405	mPUSHi(minbits);
	2406	mPUSHi(none);
	2407	PUTBACK;
	2408	if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
	2409	GvSV(PL_errgv) = NULL;
	2410	/* If we already have a pointer to the method, no need to use
	2411	* call_method() to repeat the lookup. */
	2412	if (method
	2413	? call_sv(MUTABLE_SV(method), G_SCALAR)
	2414	: call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR \| G_METHOD))
	2415	{
	2416	retval = *PL_stack_sp--;
	2417	SvREFCNT_inc(retval);
	2418	}
	2419	{
	2420	/* Not ERRSV. See above. */
	2421	SV * const errsv = GvSV(PL_errgv);
	2422	if (!SvTRUE(errsv)) {
	2423	GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
	2424	SvREFCNT_dec(errsv);
	2425	}
	2426	}
	2427	LEAVE;
	2428	POPSTACK;
	2429	if (IN_PERL_COMPILETIME) {
	2430	CopHINTS_set(PL_curcop, PL_hints);
	2431	}
	2432	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	2433	if (SvPOK(retval))
	2434
	2435	/* If caller wants to handle missing properties, let them */
	2436	if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
	2437	CORE_SWASH_INIT_RETURN(NULL);
	2438	}
	2439	Perl_croak(aTHX_
	2440	"Can't find Unicode property definition \"%"SVf"\"",
	2441	SVfARG(retval));
	2442	NOT_REACHED; /* NOTREACHED */
	2443	}
	2444	} /* End of calling the module to find the swash */
	2445
	2446	/* If this operation fetched a swash, and we will need it later, get it */
	2447	if (retval != &PL_sv_undef
	2448	&& (minbits == 1 \|\| (flags_p
	2449	&& ! (*flags_p
	2450	& _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
	2451	{
	2452	swash_hv = MUTABLE_HV(SvRV(retval));
	2453
	2454	/* If we don't already know that there is a user-defined component to
	2455	* this swash, and the user has indicated they wish to know if there is
	2456	* one (by passing <flags_p>), find out */
	2457	if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
	2458	SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
	2459	if (user_defined && SvUV(*user_defined)) {
	2460	*flags_p \|= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
	2461	}
	2462	}
	2463	}
	2464
	2465	/* Make sure there is an inversion list for binary properties */
	2466	if (minbits == 1) {
	2467	SV** swash_invlistsvp = NULL;
	2468	SV* swash_invlist = NULL;
	2469	bool invlist_in_swash_is_valid = FALSE;
	2470	bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
	2471	an unclaimed reference count */
	2472
	2473	/* If this operation fetched a swash, get its already existing
	2474	* inversion list, or create one for it */
	2475
	2476	if (swash_hv) {
	2477	swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
	2478	if (swash_invlistsvp) {
	2479	swash_invlist = *swash_invlistsvp;
	2480	invlist_in_swash_is_valid = TRUE;
	2481	}
	2482	else {
	2483	swash_invlist = _swash_to_invlist(retval);
	2484	swash_invlist_unclaimed = TRUE;
	2485	}
	2486	}
	2487
	2488	/* If an inversion list was passed in, have to include it */
	2489	if (invlist) {
	2490
	2491	/* Any fetched swash will by now have an inversion list in it;
	2492	* otherwise <swash_invlist> will be NULL, indicating that we
	2493	* didn't fetch a swash */
	2494	if (swash_invlist) {
	2495
	2496	/* Add the passed-in inversion list, which invalidates the one
	2497	* already stored in the swash */
	2498	invlist_in_swash_is_valid = FALSE;
	2499	_invlist_union(invlist, swash_invlist, &swash_invlist);
	2500	}
	2501	else {
	2502
	2503	/* Here, there is no swash already. Set up a minimal one, if
	2504	* we are going to return a swash */
	2505	if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
	2506	swash_hv = newHV();
	2507	retval = newRV_noinc(MUTABLE_SV(swash_hv));
	2508	}
	2509	swash_invlist = invlist;
	2510	}
	2511	}
	2512
	2513	/* Here, we have computed the union of all the passed-in data. It may
	2514	* be that there was an inversion list in the swash which didn't get
	2515	* touched; otherwise save the computed one */
	2516	if (! invlist_in_swash_is_valid
	2517	&& (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
	2518	{
	2519	if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
	2520	{
	2521	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	2522	}
	2523	/* We just stole a reference count. */
	2524	if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
	2525	else SvREFCNT_inc_simple_void_NN(swash_invlist);
	2526	}
	2527
	2528	SvREADONLY_on(swash_invlist);
	2529
	2530	/* Use the inversion list stand-alone if small enough */
	2531	if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
	2532	SvREFCNT_dec(retval);
	2533	if (!swash_invlist_unclaimed)
	2534	SvREFCNT_inc_simple_void_NN(swash_invlist);
	2535	retval = newRV_noinc(swash_invlist);
	2536	}
	2537	}
	2538
	2539	CORE_SWASH_INIT_RETURN(retval);
	2540	#undef CORE_SWASH_INIT_RETURN
	2541	}
	2542
	2543
	2544	/* This API is wrong for special case conversions since we may need to
	2545	* return several Unicode characters for a single Unicode character
	2546	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	2547	* the lower-level routine, and it is similarly broken for returning
	2548	* multiple values. --jhi
	2549	* For those, you should use to_utf8_case() instead */
	2550	/* Now SWASHGET is recast into S_swatch_get in this file. */
	2551
	2552	/* Note:
	2553	* Returns the value of property/mapping C<swash> for the first character
	2554	* of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
	2555	* assumed to be in well-formed utf8. If C<do_utf8> is false, the string C<ptr>
	2556	* is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
	2557	*
	2558	* A "swash" is a hash which contains initially the keys/values set up by
	2559	* SWASHNEW. The purpose is to be able to completely represent a Unicode
	2560	* property for all possible code points. Things are stored in a compact form
	2561	* (see utf8_heavy.pl) so that calculation is required to find the actual
	2562	* property value for a given code point. As code points are looked up, new
	2563	* key/value pairs are added to the hash, so that the calculation doesn't have
	2564	* to ever be re-done. Further, each calculation is done, not just for the
	2565	* desired one, but for a whole block of code points adjacent to that one.
	2566	* For binary properties on ASCII machines, the block is usually for 64 code
	2567	* points, starting with a code point evenly divisible by 64. Thus if the
	2568	* property value for code point 257 is requested, the code goes out and
	2569	* calculates the property values for all 64 code points between 256 and 319,
	2570	* and stores these as a single 64-bit long bit vector, called a "swatch",
	2571	* under the key for code point 256. The key is the UTF-8 encoding for code
	2572	* point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
	2573	* for a code point is 13 bytes, the key will be 12 bytes long. If the value
	2574	* for code point 258 is then requested, this code realizes that it would be
	2575	* stored under the key for 256, and would find that value and extract the
	2576	* relevant bit, offset from 256.
	2577	*
	2578	* Non-binary properties are stored in as many bits as necessary to represent
	2579	* their values (32 currently, though the code is more general than that), not
	2580	* as single bits, but the principle is the same: the value for each key is a
	2581	* vector that encompasses the property values for all code points whose UTF-8
	2582	* representations are represented by the key. That is, for all code points
	2583	* whose UTF-8 representations are length N bytes, and the key is the first N-1
	2584	* bytes of that.
	2585	*/
	2586	UV
	2587	Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
	2588	{
	2589	HV *const hv = MUTABLE_HV(SvRV(swash));
	2590	U32 klen;
	2591	U32 off;
	2592	STRLEN slen = 0;
	2593	STRLEN needents;
	2594	const U8 *tmps = NULL;
	2595	SV *swatch;
	2596	const U8 c = *ptr;
	2597
	2598	PERL_ARGS_ASSERT_SWASH_FETCH;
	2599
	2600	/* If it really isn't a hash, it isn't really swash; must be an inversion
	2601	* list */
	2602	if (SvTYPE(hv) != SVt_PVHV) {
	2603	return _invlist_contains_cp((SV*)hv,
	2604	(do_utf8)
	2605	? valid_utf8_to_uvchr(ptr, NULL)
	2606	: c);
	2607	}
	2608
	2609	/* We store the values in a "swatch" which is a vec() value in a swash
	2610	* hash. Code points 0-255 are a single vec() stored with key length
	2611	* (klen) 0. All other code points have a UTF-8 representation
	2612	* 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
	2613	* share 0xAA..0xYY, which is the key in the hash to that vec. So the key
	2614	* length for them is the length of the encoded char - 1. ptr[klen] is the
	2615	* final byte in the sequence representing the character */
	2616	if (!do_utf8 \|\| UTF8_IS_INVARIANT(c)) {
	2617	klen = 0;
	2618	needents = 256;
	2619	off = c;
	2620	}
	2621	else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	2622	klen = 0;
	2623	needents = 256;
	2624	off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1));
	2625	}
	2626	else {
	2627	klen = UTF8SKIP(ptr) - 1;
	2628
	2629	/* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
	2630	* the vec is the final byte in the sequence. (In EBCDIC this is
	2631	* converted to I8 to get consecutive values.) To help you visualize
	2632	* all this:
	2633	* Straight 1047 After final byte
	2634	* UTF-8 UTF-EBCDIC I8 transform
	2635	* U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
	2636	* U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
	2637	* ...
	2638	* U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
	2639	* U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
	2640	* ...
	2641	* U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
	2642	* U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
	2643	* ...
	2644	* U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
	2645	* U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
	2646	* ...
	2647	* U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
	2648	* U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
	2649	*
	2650	* (There are no discontinuities in the elided (...) entries.)
	2651	* The UTF-8 key for these 33 code points is '\xD0' (which also is the
	2652	* key for the next 31, up through U+043F, whose UTF-8 final byte is
	2653	* \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
	2654	* The final UTF-8 byte, which ranges between \x80 and \xBF, is an
	2655	* index into the vec() swatch (after subtracting 0x80, which we
	2656	* actually do with an '&').
	2657	* In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
	2658	* code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
	2659	* dicontinuities which go away by transforming it into I8, and we
	2660	* effectively subtract 0xA0 to get the index. */
	2661	needents = (1 << UTF_ACCUMULATION_SHIFT);
	2662	off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
	2663	}
	2664
	2665	/*
	2666	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	2667	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	2668	* it's nothing to sniff at.) Pity we usually come through at least
	2669	* two function calls to get here...
	2670	*
	2671	* NB: this code assumes that swatches are never modified, once generated!
	2672	*/
	2673
	2674	if (hv == PL_last_swash_hv &&
	2675	klen == PL_last_swash_klen &&
	2676	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	2677	{
	2678	tmps = PL_last_swash_tmps;
	2679	slen = PL_last_swash_slen;
	2680	}
	2681	else {
	2682	/* Try our second-level swatch cache, kept in a hash. */
	2683	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	2684
	2685	/* If not cached, generate it via swatch_get */
	2686	if (!svp \|\| !SvPOK(*svp)
	2687	\|\| !(tmps = (const U8)SvPV_const(svp, slen)))
	2688	{
	2689	if (klen) {
	2690	const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
	2691	swatch = swatch_get(swash,
	2692	code_point & ~((UV)needents - 1),
	2693	needents);
	2694	}
	2695	else { /* For the first 256 code points, the swatch has a key of
	2696	length 0 */
	2697	swatch = swatch_get(swash, 0, needents);
	2698	}
	2699
	2700	if (IN_PERL_COMPILETIME)
	2701	CopHINTS_set(PL_curcop, PL_hints);
	2702
	2703	svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
	2704
	2705	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
	2706	\|\| (slen << 3) < needents)
	2707	Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
	2708	"svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
	2709	svp, tmps, (UV)slen, (UV)needents);
	2710	}
	2711
	2712	PL_last_swash_hv = hv;
	2713	assert(klen <= sizeof(PL_last_swash_key));
	2714	PL_last_swash_klen = (U8)klen;
	2715	/* FIXME change interpvar.h? */
	2716	PL_last_swash_tmps = (U8 *) tmps;
	2717	PL_last_swash_slen = slen;
	2718	if (klen)
	2719	Copy(ptr, PL_last_swash_key, klen, U8);
	2720	}
	2721
	2722	switch ((int)((slen << 3) / needents)) {
	2723	case 1:
	2724	return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
	2725	case 8:
	2726	return ((UV) tmps[off]);
	2727	case 16:
	2728	off <<= 1;
	2729	return
	2730	((UV) tmps[off ] << 8) +
	2731	((UV) tmps[off + 1]);
	2732	case 32:
	2733	off <<= 2;
	2734	return
	2735	((UV) tmps[off ] << 24) +
	2736	((UV) tmps[off + 1] << 16) +
	2737	((UV) tmps[off + 2] << 8) +
	2738	((UV) tmps[off + 3]);
	2739	}
	2740	Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
	2741	"slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
	2742	NORETURN_FUNCTION_END;
	2743	}
	2744
	2745	/* Read a single line of the main body of the swash input text. These are of
	2746	* the form:
	2747	* 0053 0056 0073
	2748	* where each number is hex. The first two numbers form the minimum and
	2749	* maximum of a range, and the third is the value associated with the range.
	2750	* Not all swashes should have a third number
	2751	*
	2752	* On input: l points to the beginning of the line to be examined; it points
	2753	* to somewhere in the string of the whole input text, and is
	2754	* terminated by a \n or the null string terminator.
	2755	* lend points to the null terminator of that string
	2756	* wants_value is non-zero if the swash expects a third number
	2757	* typestr is the name of the swash's mapping, like 'ToLower'
	2758	* On output: *min, *max, and *val are set to the values read from the line.
	2759	* returns a pointer just beyond the line examined. If there was no
	2760	* valid min number on the line, returns lend+1
	2761	*/
	2762
	2763	STATIC U8*
	2764	S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
	2765	const bool wants_value, const U8* const typestr)
	2766	{
	2767	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
	2768	STRLEN numlen; /* Length of the number */
	2769	I32 flags = PERL_SCAN_SILENT_ILLDIGIT
	2770	\| PERL_SCAN_DISALLOW_PREFIX
	2771	\| PERL_SCAN_SILENT_NON_PORTABLE;
	2772
	2773	/* nl points to the next \n in the scan */
	2774	U8* const nl = (U8*)memchr(l, '\n', lend - l);
	2775
	2776	PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
	2777
	2778	/* Get the first number on the line: the range minimum */
	2779	numlen = lend - l;
	2780	min = grok_hex((char )l, &numlen, &flags, NULL);
	2781	max = min; /* So can never return without setting max */
	2782	if (numlen) /* If found a hex number, position past it */
	2783	l += numlen;
	2784	else if (nl) { /* Else, go handle next line, if any */
	2785	return nl + 1; /* 1 is length of "\n" */
	2786	}
	2787	else { /* Else, no next line */
	2788	return lend + 1; /* to LIST's end at which \n is not found */
	2789	}
	2790
	2791	/* The max range value follows, separated by a BLANK */
	2792	if (isBLANK(*l)) {
	2793	++l;
	2794	flags = PERL_SCAN_SILENT_ILLDIGIT
	2795	\| PERL_SCAN_DISALLOW_PREFIX
	2796	\| PERL_SCAN_SILENT_NON_PORTABLE;
	2797	numlen = lend - l;
	2798	max = grok_hex((char )l, &numlen, &flags, NULL);
	2799	if (numlen)
	2800	l += numlen;
	2801	else /* If no value here, it is a single element range */
	2802	max = min;
	2803
	2804	/* Non-binary tables have a third entry: what the first element of the
	2805	* range maps to. The map for those currently read here is in hex */
	2806	if (wants_value) {
	2807	if (isBLANK(*l)) {
	2808	++l;
	2809	flags = PERL_SCAN_SILENT_ILLDIGIT
	2810	\| PERL_SCAN_DISALLOW_PREFIX
	2811	\| PERL_SCAN_SILENT_NON_PORTABLE;
	2812	numlen = lend - l;
	2813	val = grok_hex((char )l, &numlen, &flags, NULL);
	2814	if (numlen)
	2815	l += numlen;
	2816	else
	2817	*val = 0;
	2818	}
	2819	else {
	2820	*val = 0;
	2821	if (typeto) {
	2822	/* diag_listed_as: To%s: illegal mapping '%s' */
	2823	Perl_croak(aTHX_ "%s: illegal mapping '%s'",
	2824	typestr, l);
	2825	}
	2826	}
	2827	}
	2828	else
	2829	val = 0; / bits == 1, then any val should be ignored */
	2830	}
	2831	else { /* Nothing following range min, should be single element with no
	2832	mapping expected */
	2833	if (wants_value) {
	2834	*val = 0;
	2835	if (typeto) {
	2836	/* diag_listed_as: To%s: illegal mapping '%s' */
	2837	Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
	2838	}
	2839	}
	2840	else
	2841	val = 0; / bits == 1, then val should be ignored */
	2842	}
	2843
	2844	/* Position to next line if any, or EOF */
	2845	if (nl)
	2846	l = nl + 1;
	2847	else
	2848	l = lend;
	2849
	2850	return l;
	2851	}
	2852
	2853	/* Note:
	2854	* Returns a swatch (a bit vector string) for a code point sequence
	2855	* that starts from the value C<start> and comprises the number C<span>.
	2856	* A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
	2857	* Should be used via swash_fetch, which will cache the swatch in C<swash>.
	2858	*/
	2859	STATIC SV*
	2860	S_swatch_get(pTHX_ SV* swash, UV start, UV span)
	2861	{
	2862	SV *swatch;
	2863	U8 l, lend, x, xend, s, send;
	2864	STRLEN lcur, xcur, scur;
	2865	HV *const hv = MUTABLE_HV(SvRV(swash));
	2866	SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
	2867
	2868	SV** listsvp = NULL; /* The string containing the main body of the table */
	2869	SV** extssvp = NULL;
	2870	SV** invert_it_svp = NULL;
	2871	U8* typestr = NULL;
	2872	STRLEN bits;
	2873	STRLEN octets; /* if bits == 1, then octets == 0 */
	2874	UV none;
	2875	UV end = start + span;
	2876
	2877	if (invlistsvp == NULL) {
	2878	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2879	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	2880	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2881	extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	2882	listsvp = hv_fetchs(hv, "LIST", FALSE);
	2883	invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	2884
	2885	bits = SvUV(*bitssvp);
	2886	none = SvUV(*nonesvp);
	2887	typestr = (U8)SvPV_nolen(typesvp);
	2888	}
	2889	else {
	2890	bits = 1;
	2891	none = 0;
	2892	}
	2893	octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2894
	2895	PERL_ARGS_ASSERT_SWATCH_GET;
	2896
	2897	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	2898	Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
	2899	(UV)bits);
	2900	}
	2901
	2902	/* If overflowed, use the max possible */
	2903	if (end < start) {
	2904	end = UV_MAX;
	2905	span = end - start;
	2906	}
	2907
	2908	/* create and initialize $swatch */
	2909	scur = octets ? (span * octets) : (span + 7) / 8;
	2910	swatch = newSV(scur);
	2911	SvPOK_on(swatch);
	2912	s = (U8*)SvPVX(swatch);
	2913	if (octets && none) {
	2914	const U8* const e = s + scur;
	2915	while (s < e) {
	2916	if (bits == 8)
	2917	*s++ = (U8)(none & 0xff);
	2918	else if (bits == 16) {
	2919	*s++ = (U8)((none >> 8) & 0xff);
	2920	*s++ = (U8)( none & 0xff);
	2921	}
	2922	else if (bits == 32) {
	2923	*s++ = (U8)((none >> 24) & 0xff);
	2924	*s++ = (U8)((none >> 16) & 0xff);
	2925	*s++ = (U8)((none >> 8) & 0xff);
	2926	*s++ = (U8)( none & 0xff);
	2927	}
	2928	}
	2929	*s = '\0';
	2930	}
	2931	else {
	2932	(void)memzero((U8*)s, scur + 1);
	2933	}
	2934	SvCUR_set(swatch, scur);
	2935	s = (U8*)SvPVX(swatch);
	2936
	2937	if (invlistsvp) { /* If has an inversion list set up use that */
	2938	_invlist_populate_swatch(*invlistsvp, start, end, s);
	2939	return swatch;
	2940	}
	2941
	2942	/* read $swash->{LIST} */
	2943	l = (U8)SvPV(listsvp, lcur);
	2944	lend = l + lcur;
	2945	while (l < lend) {
	2946	UV min, max, val, upper;
	2947	l = swash_scan_list_line(l, lend, &min, &max, &val,
	2948	cBOOL(octets), typestr);
	2949	if (l > lend) {
	2950	break;
	2951	}
	2952
	2953	/* If looking for something beyond this range, go try the next one */
	2954	if (max < start)
	2955	continue;
	2956
	2957	/* <end> is generally 1 beyond where we want to set things, but at the
	2958	* platform's infinity, where we can't go any higher, we want to
	2959	* include the code point at <end> */
	2960	upper = (max < end)
	2961	? max
	2962	: (max != UV_MAX \|\| end != UV_MAX)
	2963	? end - 1
	2964	: end;
	2965
	2966	if (octets) {
	2967	UV key;
	2968	if (min < start) {
	2969	if (!none \|\| val < none) {
	2970	val += start - min;
	2971	}
	2972	min = start;
	2973	}
	2974	for (key = min; key <= upper; key++) {
	2975	STRLEN offset;
	2976	/* offset must be non-negative (start <= min <= key < end) */
	2977	offset = octets * (key - start);
	2978	if (bits == 8)
	2979	s[offset] = (U8)(val & 0xff);
	2980	else if (bits == 16) {
	2981	s[offset ] = (U8)((val >> 8) & 0xff);
	2982	s[offset + 1] = (U8)( val & 0xff);
	2983	}
	2984	else if (bits == 32) {
	2985	s[offset ] = (U8)((val >> 24) & 0xff);
	2986	s[offset + 1] = (U8)((val >> 16) & 0xff);
	2987	s[offset + 2] = (U8)((val >> 8) & 0xff);
	2988	s[offset + 3] = (U8)( val & 0xff);
	2989	}
	2990
	2991	if (!none \|\| val < none)
	2992	++val;
	2993	}
	2994	}
	2995	else { /* bits == 1, then val should be ignored */
	2996	UV key;
	2997	if (min < start)
	2998	min = start;
	2999
	3000	for (key = min; key <= upper; key++) {
	3001	const STRLEN offset = (STRLEN)(key - start);
	3002	s[offset >> 3] \|= 1 << (offset & 7);
	3003	}
	3004	}
	3005	} /* while */
	3006
	3007	/* Invert if the data says it should be. Assumes that bits == 1 */
	3008	if (invert_it_svp && SvUV(*invert_it_svp)) {
	3009
	3010	/* Unicode properties should come with all bits above PERL_UNICODE_MAX
	3011	* be 0, and their inversion should also be 0, as we don't succeed any
	3012	* Unicode property matches for non-Unicode code points */
	3013	if (start <= PERL_UNICODE_MAX) {
	3014
	3015	/* The code below assumes that we never cross the
	3016	* Unicode/above-Unicode boundary in a range, as otherwise we would
	3017	* have to figure out where to stop flipping the bits. Since this
	3018	* boundary is divisible by a large power of 2, and swatches comes
	3019	* in small powers of 2, this should be a valid assumption */
	3020	assert(start + span - 1 <= PERL_UNICODE_MAX);
	3021
	3022	send = s + scur;
	3023	while (s < send) {
	3024	s = ~(s);
	3025	s++;
	3026	}
	3027	}
	3028	}
	3029
	3030	/* read $swash->{EXTRAS}
	3031	* This code also copied to swash_to_invlist() below */
	3032	x = (U8)SvPV(extssvp, xcur);
	3033	xend = x + xcur;
	3034	while (x < xend) {
	3035	STRLEN namelen;
	3036	U8 *namestr;
	3037	SV** othersvp;
	3038	HV* otherhv;
	3039	STRLEN otherbits;
	3040	SV *otherbitssvp, other;
	3041	U8 s, o, *nl;
	3042	STRLEN slen, olen;
	3043
	3044	const U8 opc = *x++;
	3045	if (opc == '\n')
	3046	continue;
	3047
	3048	nl = (U8*)memchr(x, '\n', xend - x);
	3049
	3050	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	3051	if (nl) {
	3052	x = nl + 1; /* 1 is length of "\n" */
	3053	continue;
	3054	}
	3055	else {
	3056	x = xend; /* to EXTRAS' end at which \n is not found */
	3057	break;
	3058	}
	3059	}
	3060
	3061	namestr = x;
	3062	if (nl) {
	3063	namelen = nl - namestr;
	3064	x = nl + 1;
	3065	}
	3066	else {
	3067	namelen = xend - namestr;
	3068	x = xend;
	3069	}
	3070
	3071	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	3072	otherhv = MUTABLE_HV(SvRV(*othersvp));
	3073	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	3074	otherbits = (STRLEN)SvUV(*otherbitssvp);
	3075	if (bits < otherbits)
	3076	Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
	3077	"bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
	3078
	3079	/* The "other" swatch must be destroyed after. */
	3080	other = swatch_get(*othersvp, start, span);
	3081	o = (U8*)SvPV(other, olen);
	3082
	3083	if (!olen)
	3084	Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
	3085
	3086	s = (U8*)SvPV(swatch, slen);
	3087	if (bits == 1 && otherbits == 1) {
	3088	if (slen != olen)
	3089	Perl_croak(aTHX_ "panic: swatch_get found swatch length "
	3090	"mismatch, slen=%"UVuf", olen=%"UVuf,
	3091	(UV)slen, (UV)olen);
	3092
	3093	switch (opc) {
	3094	case '+':
	3095	while (slen--)
	3096	s++ \|= o++;
	3097	break;
	3098	case '!':
	3099	while (slen--)
	3100	s++ \|= ~o++;
	3101	break;
	3102	case '-':
	3103	while (slen--)
	3104	s++ &= ~o++;
	3105	break;
	3106	case '&':
	3107	while (slen--)
	3108	s++ &= o++;
	3109	break;
	3110	default:
	3111	break;
	3112	}
	3113	}
	3114	else {
	3115	STRLEN otheroctets = otherbits >> 3;
	3116	STRLEN offset = 0;
	3117	U8* const send = s + slen;
	3118
	3119	while (s < send) {
	3120	UV otherval = 0;
	3121
	3122	if (otherbits == 1) {
	3123	otherval = (o[offset >> 3] >> (offset & 7)) & 1;
	3124	++offset;
	3125	}
	3126	else {
	3127	STRLEN vlen = otheroctets;
	3128	otherval = *o++;
	3129	while (--vlen) {
	3130	otherval <<= 8;
	3131	otherval \|= *o++;
	3132	}
	3133	}
	3134
	3135	if (opc == '+' && otherval)
	3136	NOOP; /* replace with otherval */
	3137	else if (opc == '!' && !otherval)
	3138	otherval = 1;
	3139	else if (opc == '-' && otherval)
	3140	otherval = 0;
	3141	else if (opc == '&' && !otherval)
	3142	otherval = 0;
	3143	else {
	3144	s += octets; /* no replacement */
	3145	continue;
	3146	}
	3147
	3148	if (bits == 8)
	3149	*s++ = (U8)( otherval & 0xff);
	3150	else if (bits == 16) {
	3151	*s++ = (U8)((otherval >> 8) & 0xff);
	3152	*s++ = (U8)( otherval & 0xff);
	3153	}
	3154	else if (bits == 32) {
	3155	*s++ = (U8)((otherval >> 24) & 0xff);
	3156	*s++ = (U8)((otherval >> 16) & 0xff);
	3157	*s++ = (U8)((otherval >> 8) & 0xff);
	3158	*s++ = (U8)( otherval & 0xff);
	3159	}
	3160	}
	3161	}
	3162	sv_free(other); /* through with it! */
	3163	} /* while */
	3164	return swatch;
	3165	}
	3166
	3167	HV*
	3168	Perl__swash_inversion_hash(pTHX_ SV* const swash)
	3169	{
	3170
	3171	/* Subject to change or removal. For use only in regcomp.c and regexec.c
	3172	* Can't be used on a property that is subject to user override, as it
	3173	* relies on the value of SPECIALS in the swash which would be set by
	3174	* utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
	3175	* for overridden properties
	3176	*
	3177	* Returns a hash which is the inversion and closure of a swash mapping.
	3178	* For example, consider the input lines:
	3179	* 004B 006B
	3180	* 004C 006C
	3181	* 212A 006B
	3182	*
	3183	* The returned hash would have two keys, the utf8 for 006B and the utf8 for
	3184	* 006C. The value for each key is an array. For 006C, the array would
	3185	* have two elements, the utf8 for itself, and for 004C. For 006B, there
	3186	* would be three elements in its array, the utf8 for 006B, 004B and 212A.
	3187	*
	3188	* Note that there are no elements in the hash for 004B, 004C, 212A. The
	3189	* keys are only code points that are folded-to, so it isn't a full closure.
	3190	*
	3191	* Essentially, for any code point, it gives all the code points that map to
	3192	* it, or the list of 'froms' for that point.
	3193	*
	3194	* Currently it ignores any additions or deletions from other swashes,
	3195	* looking at just the main body of the swash, and if there are SPECIALS
	3196	* in the swash, at that hash
	3197	*
	3198	* The specials hash can be extra code points, and most likely consists of
	3199	* maps from single code points to multiple ones (each expressed as a string
	3200	* of utf8 characters). This function currently returns only 1-1 mappings.
	3201	* However consider this possible input in the specials hash:
	3202	* "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074
	3203	* "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074
	3204	*
	3205	* Both FB05 and FB06 map to the same multi-char sequence, which we don't
	3206	* currently handle. But it also means that FB05 and FB06 are equivalent in
	3207	* a 1-1 mapping which we should handle, and this relationship may not be in
	3208	* the main table. Therefore this function examines all the multi-char
	3209	* sequences and adds the 1-1 mappings that come out of that. */
	3210
	3211	U8 l, lend;
	3212	STRLEN lcur;
	3213	HV *const hv = MUTABLE_HV(SvRV(swash));
	3214
	3215	/* The string containing the main body of the table. This will have its
	3216	* assertion fail if the swash has been converted to its inversion list */
	3217	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	3218
	3219	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	3220	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	3221	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	3222	/SV* const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
	3223	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	3224	const STRLEN bits = SvUV(*bitssvp);
	3225	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	3226	const UV none = SvUV(*nonesvp);
	3227	SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
	3228
	3229	HV* ret = newHV();
	3230
	3231	PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
	3232
	3233	/* Must have at least 8 bits to get the mappings */
	3234	if (bits != 8 && bits != 16 && bits != 32) {
	3235	Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
	3236	(UV)bits);
	3237	}
	3238
	3239	if (specials_p) { /* It might be "special" (sometimes, but not always, a
	3240	mapping to more than one character */
	3241
	3242	/* Construct an inverse mapping hash for the specials */
	3243	HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
	3244	HV * specials_inverse = newHV();
	3245	char char_from; / the lhs of the map */
	3246	I32 from_len; /* its byte length */
	3247	char char_to; / the rhs of the map */
	3248	I32 to_len; /* its byte length */
	3249	SV sv_to; / and in a sv */
	3250	AV* from_list; /* list of things that map to each 'to' */
	3251
	3252	hv_iterinit(specials_hv);
	3253
	3254	/* The keys are the characters (in utf8) that map to the corresponding
	3255	* utf8 string value. Iterate through the list creating the inverse
	3256	* list. */
	3257	while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
	3258	SV** listp;
	3259	if (! SvPOK(sv_to)) {
	3260	Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
	3261	"unexpectedly is not a string, flags=%lu",
	3262	(unsigned long)SvFLAGS(sv_to));
	3263	}
	3264	/DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8) char_from, 0), valid_utf8_to_uvchr((U8) SvPVX(sv_to), 0)));/
	3265
	3266	/* Each key in the inverse list is a mapped-to value, and the key's
	3267	* hash value is a list of the strings (each in utf8) that map to
	3268	* it. Those strings are all one character long */
	3269	if ((listp = hv_fetch(specials_inverse,
	3270	SvPVX(sv_to),
	3271	SvCUR(sv_to), 0)))
	3272	{
	3273	from_list = (AV) listp;
	3274	}
	3275	else { /* No entry yet for it: create one */
	3276	from_list = newAV();
	3277	if (! hv_store(specials_inverse,
	3278	SvPVX(sv_to),
	3279	SvCUR(sv_to),
	3280	(SV*) from_list, 0))
	3281	{
	3282	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	3283	}
	3284	}
	3285
	3286	/* Here have the list associated with this 'to' (perhaps newly
	3287	* created and empty). Just add to it. Note that we ASSUME that
	3288	* the input is guaranteed to not have duplications, so we don't
	3289	* check for that. Duplications just slow down execution time. */
	3290	av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
	3291	}
	3292
	3293	/* Here, 'specials_inverse' contains the inverse mapping. Go through
	3294	* it looking for cases like the FB05/FB06 examples above. There would
	3295	* be an entry in the hash like
	3296	* 'st' => [ FB05, FB06 ]
	3297	* In this example we will create two lists that get stored in the
	3298	* returned hash, 'ret':
	3299	* FB05 => [ FB05, FB06 ]
	3300	* FB06 => [ FB05, FB06 ]
	3301	*
	3302	* Note that there is nothing to do if the array only has one element.
	3303	* (In the normal 1-1 case handled below, we don't have to worry about
	3304	* two lists, as everything gets tied to the single list that is
	3305	* generated for the single character 'to'. But here, we are omitting
	3306	* that list, ('st' in the example), so must have multiple lists.) */
	3307	while ((from_list = (AV *) hv_iternextsv(specials_inverse,
	3308	&char_to, &to_len)))
	3309	{
	3310	if (av_tindex(from_list) > 0) {
	3311	SSize_t i;
	3312
	3313	/* We iterate over all combinations of i,j to place each code
	3314	* point on each list */
	3315	for (i = 0; i <= av_tindex(from_list); i++) {
	3316	SSize_t j;
	3317	AV* i_list = newAV();
	3318	SV** entryp = av_fetch(from_list, i, FALSE);
	3319	if (entryp == NULL) {
	3320	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	3321	}
	3322	if (hv_fetch(ret, SvPVX(entryp), SvCUR(entryp), FALSE)) {
	3323	Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
	3324	}
	3325	if (! hv_store(ret, SvPVX(entryp), SvCUR(entryp),
	3326	(SV*) i_list, FALSE))
	3327	{
	3328	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	3329	}
	3330
	3331	/* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8) SvPVX(entryp), 0);*/
	3332	for (j = 0; j <= av_tindex(from_list); j++) {
	3333	entryp = av_fetch(from_list, j, FALSE);
	3334	if (entryp == NULL) {
	3335	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	3336	}
	3337
	3338	/* When i==j this adds itself to the list */
	3339	av_push(i_list, newSVuv(utf8_to_uvchr_buf(
	3340	(U8) SvPVX(entryp),
	3341	(U8) SvPVX(entryp) + SvCUR(*entryp),
	3342	0)));
	3343	/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8) SvPVX(entryp), 0), u));/
	3344	}
	3345	}
	3346	}
	3347	}
	3348	SvREFCNT_dec(specials_inverse); /* done with it */
	3349	} /* End of specials */
	3350
	3351	/* read $swash->{LIST} */
	3352	l = (U8)SvPV(listsvp, lcur);
	3353	lend = l + lcur;
	3354
	3355	/* Go through each input line */
	3356	while (l < lend) {
	3357	UV min, max, val;
	3358	UV inverse;
	3359	l = swash_scan_list_line(l, lend, &min, &max, &val,
	3360	cBOOL(octets), typestr);
	3361	if (l > lend) {
	3362	break;
	3363	}
	3364
	3365	/* Each element in the range is to be inverted */
	3366	for (inverse = min; inverse <= max; inverse++) {
	3367	AV* list;
	3368	SV** listp;
	3369	IV i;
	3370	bool found_key = FALSE;
	3371	bool found_inverse = FALSE;
	3372
	3373	/* The key is the inverse mapping */
	3374	char key[UTF8_MAXBYTES+1];
	3375	char* key_end = (char ) uvchr_to_utf8((U8) key, val);
	3376	STRLEN key_len = key_end - key;
	3377
	3378	/* Get the list for the map */
	3379	if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
	3380	list = (AV) listp;
	3381	}
	3382	else { /* No entry yet for it: create one */
	3383	list = newAV();
	3384	if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
	3385	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	3386	}
	3387	}
	3388
	3389	/* Look through list to see if this inverse mapping already is
	3390	* listed, or if there is a mapping to itself already */
	3391	for (i = 0; i <= av_tindex(list); i++) {
	3392	SV** entryp = av_fetch(list, i, FALSE);
	3393	SV* entry;
	3394	UV uv;
	3395	if (entryp == NULL) {
	3396	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	3397	}
	3398	entry = *entryp;
	3399	uv = SvUV(entry);
	3400	/DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, uv));/
	3401	if (uv == val) {
	3402	found_key = TRUE;
	3403	}
	3404	if (uv == inverse) {
	3405	found_inverse = TRUE;
	3406	}
	3407
	3408	/* No need to continue searching if found everything we are
	3409	* looking for */
	3410	if (found_key && found_inverse) {
	3411	break;
	3412	}
	3413	}
	3414
	3415	/* Make sure there is a mapping to itself on the list */
	3416	if (! found_key) {
	3417	av_push(list, newSVuv(val));
	3418	/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, val, val));/
	3419	}
	3420
	3421
	3422	/* Simply add the value to the list */
	3423	if (! found_inverse) {
	3424	av_push(list, newSVuv(inverse));
	3425	/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, inverse, val));/
	3426	}
	3427
	3428	/* swatch_get() increments the value of val for each element in the
	3429	* range. That makes more compact tables possible. You can
	3430	* express the capitalization, for example, of all consecutive
	3431	* letters with a single line: 0061\t007A\t0041 This maps 0061 to
	3432	* 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
	3433	* and it's not documented; it appears to be used only in
	3434	* implementing tr//; I copied the semantics from swatch_get(), just
	3435	* in case */
	3436	if (!none \|\| val < none) {
	3437	++val;
	3438	}
	3439	}
	3440	}
	3441
	3442	return ret;
	3443	}
	3444
	3445	SV*
	3446	Perl__swash_to_invlist(pTHX_ SV* const swash)
	3447	{
	3448
	3449	/* Subject to change or removal. For use only in one place in regcomp.c.
	3450	* Ownership is given to one reference count in the returned SV* */
	3451
	3452	U8 l, lend;
	3453	char *loc;
	3454	STRLEN lcur;
	3455	HV *const hv = MUTABLE_HV(SvRV(swash));
	3456	UV elements = 0; /* Number of elements in the inversion list */
	3457	U8 empty[] = "";
	3458	SV** listsvp;
	3459	SV** typesvp;
	3460	SV** bitssvp;
	3461	SV** extssvp;
	3462	SV** invert_it_svp;
	3463
	3464	U8* typestr;
	3465	STRLEN bits;
	3466	STRLEN octets; /* if bits == 1, then octets == 0 */
	3467	U8 x, xend;
	3468	STRLEN xcur;
	3469
	3470	SV* invlist;
	3471
	3472	PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
	3473
	3474	/* If not a hash, it must be the swash's inversion list instead */
	3475	if (SvTYPE(hv) != SVt_PVHV) {
	3476	return SvREFCNT_inc_simple_NN((SV*) hv);
	3477	}
	3478
	3479	/* The string containing the main body of the table */
	3480	listsvp = hv_fetchs(hv, "LIST", FALSE);
	3481	typesvp = hv_fetchs(hv, "TYPE", FALSE);
	3482	bitssvp = hv_fetchs(hv, "BITS", FALSE);
	3483	extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	3484	invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	3485
	3486	typestr = (U8)SvPV_nolen(typesvp);
	3487	bits = SvUV(*bitssvp);
	3488	octets = bits >> 3; /* if bits == 1, then octets == 0 */
	3489
	3490	/* read $swash->{LIST} */
	3491	if (SvPOK(*listsvp)) {
	3492	l = (U8)SvPV(listsvp, lcur);
	3493	}
	3494	else {
	3495	/* LIST legitimately doesn't contain a string during compilation phases
	3496	* of Perl itself, before the Unicode tables are generated. In this
	3497	* case, just fake things up by creating an empty list */
	3498	l = empty;
	3499	lcur = 0;
	3500	}
	3501	loc = (char *) l;
	3502	lend = l + lcur;
	3503
	3504	if (l == 'V') { / Inversion list format */
	3505	const char after_atou = (char ) lend;
	3506	UV element0;
	3507	UV* other_elements_ptr;
	3508
	3509	/* The first number is a count of the rest */
	3510	l++;
	3511	elements = grok_atou((const char *)l, &after_atou);
	3512	if (elements == 0) {
	3513	invlist = _new_invlist(0);
	3514	}
	3515	else {
	3516	while (isSPACE(*l)) l++;
	3517	l = (U8 *) after_atou;
	3518
	3519	/* Get the 0th element, which is needed to setup the inversion list */
	3520	while (isSPACE(*l)) l++;
	3521	element0 = (UV) grok_atou((const char *)l, &after_atou);
	3522	l = (U8 *) after_atou;
	3523	invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
	3524	elements--;
	3525
	3526	/* Then just populate the rest of the input */
	3527	while (elements-- > 0) {
	3528	if (l > lend) {
	3529	Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements);
	3530	}
	3531	while (isSPACE(*l)) l++;
	3532	other_elements_ptr++ = (UV) grok_atou((const char )l, &after_atou);
	3533	l = (U8 *) after_atou;
	3534	}
	3535	}
	3536	}
	3537	else {
	3538
	3539	/* Scan the input to count the number of lines to preallocate array
	3540	* size based on worst possible case, which is each line in the input
	3541	* creates 2 elements in the inversion list: 1) the beginning of a
	3542	* range in the list; 2) the beginning of a range not in the list. */
	3543	while ((loc = (strchr(loc, '\n'))) != NULL) {
	3544	elements += 2;
	3545	loc++;
	3546	}
	3547
	3548	/* If the ending is somehow corrupt and isn't a new line, add another
	3549	* element for the final range that isn't in the inversion list */
	3550	if (! (*lend == '\n'
	3551	\|\| (lend == '\0' && (lcur == 0 \|\| (lend - 1) == '\n'))))
	3552	{
	3553	elements++;
	3554	}
	3555
	3556	invlist = _new_invlist(elements);
	3557
	3558	/* Now go through the input again, adding each range to the list */
	3559	while (l < lend) {
	3560	UV start, end;
	3561	UV val; /* Not used by this function */
	3562
	3563	l = swash_scan_list_line(l, lend, &start, &end, &val,
	3564	cBOOL(octets), typestr);
	3565
	3566	if (l > lend) {
	3567	break;
	3568	}
	3569
	3570	invlist = _add_range_to_invlist(invlist, start, end);
	3571	}
	3572	}
	3573
	3574	/* Invert if the data says it should be */
	3575	if (invert_it_svp && SvUV(*invert_it_svp)) {
	3576	_invlist_invert(invlist);
	3577	}
	3578
	3579	/* This code is copied from swatch_get()
	3580	* read $swash->{EXTRAS} */
	3581	x = (U8)SvPV(extssvp, xcur);
	3582	xend = x + xcur;
	3583	while (x < xend) {
	3584	STRLEN namelen;
	3585	U8 *namestr;
	3586	SV** othersvp;
	3587	HV* otherhv;
	3588	STRLEN otherbits;
	3589	SV *otherbitssvp, other;
	3590	U8 *nl;
	3591
	3592	const U8 opc = *x++;
	3593	if (opc == '\n')
	3594	continue;
	3595
	3596	nl = (U8*)memchr(x, '\n', xend - x);
	3597
	3598	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	3599	if (nl) {
	3600	x = nl + 1; /* 1 is length of "\n" */
	3601	continue;
	3602	}
	3603	else {
	3604	x = xend; /* to EXTRAS' end at which \n is not found */
	3605	break;
	3606	}
	3607	}
	3608
	3609	namestr = x;
	3610	if (nl) {
	3611	namelen = nl - namestr;
	3612	x = nl + 1;
	3613	}
	3614	else {
	3615	namelen = xend - namestr;
	3616	x = xend;
	3617	}
	3618
	3619	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	3620	otherhv = MUTABLE_HV(SvRV(*othersvp));
	3621	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	3622	otherbits = (STRLEN)SvUV(*otherbitssvp);
	3623
	3624	if (bits != otherbits \|\| bits != 1) {
	3625	Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
	3626	"properties, bits=%"UVuf", otherbits=%"UVuf,
	3627	(UV)bits, (UV)otherbits);
	3628	}
	3629
	3630	/* The "other" swatch must be destroyed after. */
	3631	other = _swash_to_invlist((SV )othersvp);
	3632
	3633	/* End of code copied from swatch_get() */
	3634	switch (opc) {
	3635	case '+':
	3636	_invlist_union(invlist, other, &invlist);
	3637	break;
	3638	case '!':
	3639	_invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
	3640	break;
	3641	case '-':
	3642	_invlist_subtract(invlist, other, &invlist);
	3643	break;
	3644	case '&':
	3645	_invlist_intersection(invlist, other, &invlist);
	3646	break;
	3647	default:
	3648	break;
	3649	}
	3650	sv_free(other); /* through with it! */
	3651	}
	3652
	3653	SvREADONLY_on(invlist);
	3654	return invlist;
	3655	}
	3656
	3657	SV*
	3658	Perl__get_swash_invlist(pTHX_ SV* const swash)
	3659	{
	3660	SV** ptr;
	3661
	3662	PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
	3663
	3664	if (! SvROK(swash)) {
	3665	return NULL;
	3666	}
	3667
	3668	/* If it really isn't a hash, it isn't really swash; must be an inversion
	3669	* list */
	3670	if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
	3671	return SvRV(swash);
	3672	}
	3673
	3674	ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
	3675	if (! ptr) {
	3676	return NULL;
	3677	}
	3678
	3679	return *ptr;
	3680	}
	3681
	3682	bool
	3683	Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
	3684	{
	3685	/* May change: warns if surrogates, non-character code points, or
	3686	* non-Unicode code points are in s which has length len bytes. Returns
	3687	* TRUE if none found; FALSE otherwise. The only other validity check is
	3688	* to make sure that this won't exceed the string's length */
	3689
	3690	const U8* const e = s + len;
	3691	bool ok = TRUE;
	3692
	3693	PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
	3694
	3695	while (s < e) {
	3696	if (UTF8SKIP(s) > len) {
	3697	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	3698	"%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
	3699	return FALSE;
	3700	}
	3701	if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) {
	3702	STRLEN char_len;
	3703	if (UTF8_IS_SUPER(s)) {
	3704	if (ckWARN_d(WARN_NON_UNICODE)) {
	3705	UV uv = utf8_to_uvchr_buf(s, e, &char_len);
	3706	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	3707	"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
	3708	ok = FALSE;
	3709	}
	3710	}
	3711	else if (UTF8_IS_SURROGATE(s)) {
	3712	if (ckWARN_d(WARN_SURROGATE)) {
	3713	UV uv = utf8_to_uvchr_buf(s, e, &char_len);
	3714	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	3715	"Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
	3716	ok = FALSE;
	3717	}
	3718	}
	3719	else if
	3720	((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
	3721	&& (ckWARN_d(WARN_NONCHAR)))
	3722	{
	3723	UV uv = utf8_to_uvchr_buf(s, e, &char_len);
	3724	Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
	3725	"Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
	3726	ok = FALSE;
	3727	}
	3728	}
	3729	s += UTF8SKIP(s);
	3730	}
	3731
	3732	return ok;
	3733	}
	3734
	3735	/*
	3736	=for apidoc pv_uni_display
	3737
	3738	Build to the scalar C<dsv> a displayable version of the string C<spv>,
	3739	length C<len>, the displayable version being at most C<pvlim> bytes long
	3740	(if longer, the rest is truncated and "..." will be appended).
	3741
	3742	The C<flags> argument can have UNI_DISPLAY_ISPRINT set to display
	3743	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	3744	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	3745	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	3746	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	3747	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	3748
	3749	The pointer to the PV of the C<dsv> is returned.
	3750
	3751	=cut */
	3752	char *
	3753	Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	3754	{
	3755	int truncated = 0;
	3756	const char s, e;
	3757
	3758	PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
	3759
	3760	sv_setpvs(dsv, "");
	3761	SvUTF8_off(dsv);
	3762	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	3763	UV u;
	3764	/* This serves double duty as a flag and a character to print after
	3765	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	3766	*/
	3767	char ok = 0;
	3768
	3769	if (pvlim && SvCUR(dsv) >= pvlim) {
	3770	truncated++;
	3771	break;
	3772	}
	3773	u = utf8_to_uvchr_buf((U8)s, (U8)e, 0);
	3774	if (u < 256) {
	3775	const unsigned char c = (unsigned char)u & 0xFF;
	3776	if (flags & UNI_DISPLAY_BACKSLASH) {
	3777	switch (c) {
	3778	case '\n':
	3779	ok = 'n'; break;
	3780	case '\r':
	3781	ok = 'r'; break;
	3782	case '\t':
	3783	ok = 't'; break;
	3784	case '\f':
	3785	ok = 'f'; break;
	3786	case '\a':
	3787	ok = 'a'; break;
	3788	case '\\':
	3789	ok = '\\'; break;
	3790	default: break;
	3791	}
	3792	if (ok) {
	3793	const char string = ok;
	3794	sv_catpvs(dsv, "\\");
	3795	sv_catpvn(dsv, &string, 1);
	3796	}
	3797	}
	3798	/* isPRINT() is the locale-blind version. */
	3799	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	3800	const char string = c;
	3801	sv_catpvn(dsv, &string, 1);
	3802	ok = 1;
	3803	}
	3804	}
	3805	if (!ok)
	3806	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	3807	}
	3808	if (truncated)
	3809	sv_catpvs(dsv, "...");
	3810
	3811	return SvPVX(dsv);
	3812	}
	3813
	3814	/*
	3815	=for apidoc sv_uni_display
	3816
	3817	Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
	3818	the displayable version being at most C<pvlim> bytes long
	3819	(if longer, the rest is truncated and "..." will be appended).
	3820
	3821	The C<flags> argument is as in L</pv_uni_display>().
	3822
	3823	The pointer to the PV of the C<dsv> is returned.
	3824
	3825	=cut
	3826	*/
	3827	char *
	3828	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	3829	{
	3830	const char * const ptr =
	3831	isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
	3832
	3833	PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
	3834
	3835	return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
	3836	SvCUR(ssv), pvlim, flags);
	3837	}
	3838
	3839	/*
	3840	=for apidoc foldEQ_utf8
	3841
	3842	Returns true if the leading portions of the strings C<s1> and C<s2> (either or both
	3843	of which may be in UTF-8) are the same case-insensitively; false otherwise.
	3844	How far into the strings to compare is determined by other input parameters.
	3845
	3846	If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
	3847	otherwise it is assumed to be in native 8-bit encoding. Correspondingly for C<u2>
	3848	with respect to C<s2>.
	3849
	3850	If the byte length C<l1> is non-zero, it says how far into C<s1> to check for fold
	3851	equality. In other words, C<s1>+C<l1> will be used as a goal to reach. The
	3852	scan will not be considered to be a match unless the goal is reached, and
	3853	scanning won't continue past that goal. Correspondingly for C<l2> with respect to
	3854	C<s2>.
	3855
	3856	If C<pe1> is non-NULL and the pointer it points to is not NULL, that pointer is
	3857	considered an end pointer to the position 1 byte past the maximum point
	3858	in C<s1> beyond which scanning will not continue under any circumstances.
	3859	(This routine assumes that UTF-8 encoded input strings are not malformed;
	3860	malformed input can cause it to read past C<pe1>).
	3861	This means that if both C<l1> and C<pe1> are specified, and C<pe1>
	3862	is less than C<s1>+C<l1>, the match will never be successful because it can
	3863	never
	3864	get as far as its goal (and in fact is asserted against). Correspondingly for
	3865	C<pe2> with respect to C<s2>.
	3866
	3867	At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
	3868	C<l2> must be non-zero), and if both do, both have to be
	3869	reached for a successful match. Also, if the fold of a character is multiple
	3870	characters, all of them must be matched (see tr21 reference below for
	3871	'folding').
	3872
	3873	Upon a successful match, if C<pe1> is non-NULL,
	3874	it will be set to point to the beginning of the I<next> character of C<s1>
	3875	beyond what was matched. Correspondingly for C<pe2> and C<s2>.
	3876
	3877	For case-insensitiveness, the "casefolding" of Unicode is used
	3878	instead of upper/lowercasing both the characters, see
	3879	L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
	3880
	3881	=cut */
	3882
	3883	/* A flags parameter has been added which may change, and hence isn't
	3884	* externally documented. Currently it is:
	3885	* 0 for as-documented above
	3886	* FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
	3887	ASCII one, to not match
	3888	* FOLDEQ_LOCALE is set iff the rules from the current underlying
	3889	* locale are to be used.
	3890	* FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
	3891	* routine. This allows that step to be skipped.
	3892	* FOLDEQ_S2_ALREADY_FOLDED Similarly.
	3893	*/
	3894	I32
	3895	Perl_foldEQ_utf8_flags(pTHX_ const char s1, char pe1, UV l1, bool u1, const char s2, char **pe2, UV l2, bool u2, U32 flags)
	3896	{
	3897	const U8 p1 = (const U8)s1; /* Point to current char */
	3898	const U8 p2 = (const U8)s2;
	3899	const U8 g1 = NULL; / goal for s1 */
	3900	const U8 *g2 = NULL;
	3901	const U8 e1 = NULL; / Don't scan s1 past this */
	3902	U8 f1 = NULL; / Point to current folded */
	3903	const U8 *e2 = NULL;
	3904	U8 *f2 = NULL;
	3905	STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
	3906	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	3907	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	3908
	3909	PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
	3910
	3911	assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII \| FOLDEQ_LOCALE))
	3912	&& (flags & (FOLDEQ_S1_ALREADY_FOLDED \| FOLDEQ_S2_ALREADY_FOLDED))));
	3913	/* The algorithm is to trial the folds without regard to the flags on
	3914	* the first line of the above assert(), and then see if the result
	3915	* violates them. This means that the inputs can't be pre-folded to a
	3916	* violating result, hence the assert. This could be changed, with the
	3917	* addition of extra tests here for the already-folded case, which would
	3918	* slow it down. That cost is more than any possible gain for when these
	3919	* flags are specified, as the flags indicate /il or /iaa matching which
	3920	* is less common than /iu, and I (khw) also believe that real-world /il
	3921	* and /iaa matches are most likely to involve code points 0-255, and this
	3922	* function only under rare conditions gets called for 0-255. */
	3923
	3924	if (IN_UTF8_CTYPE_LOCALE) {
	3925	flags &= ~FOLDEQ_LOCALE;
	3926	}
	3927
	3928	if (pe1) {
	3929	e1 = (U8*)pe1;
	3930	}
	3931
	3932	if (l1) {
	3933	g1 = (const U8*)s1 + l1;
	3934	}
	3935
	3936	if (pe2) {
	3937	e2 = (U8*)pe2;
	3938	}
	3939
	3940	if (l2) {
	3941	g2 = (const U8*)s2 + l2;
	3942	}
	3943
	3944	/* Must have at least one goal */
	3945	assert(g1 \|\| g2);
	3946
	3947	if (g1) {
	3948
	3949	/* Will never match if goal is out-of-bounds */
	3950	assert(! e1 \|\| e1 >= g1);
	3951
	3952	/* Here, there isn't an end pointer, or it is beyond the goal. We
	3953	* only go as far as the goal */
	3954	e1 = g1;
	3955	}
	3956	else {
	3957	assert(e1); /* Must have an end for looking at s1 */
	3958	}
	3959
	3960	/* Same for goal for s2 */
	3961	if (g2) {
	3962	assert(! e2 \|\| e2 >= g2);
	3963	e2 = g2;
	3964	}
	3965	else {
	3966	assert(e2);
	3967	}
	3968
	3969	/* If both operands are already folded, we could just do a memEQ on the
	3970	* whole strings at once, but it would be better if the caller realized
	3971	* this and didn't even call us */
	3972
	3973	/* Look through both strings, a character at a time */
	3974	while (p1 < e1 && p2 < e2) {
	3975
	3976	/* If at the beginning of a new character in s1, get its fold to use
	3977	* and the length of the fold. (exception: locale rules just get the
	3978	* character to a single byte) */
	3979	if (n1 == 0) {
	3980	if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
	3981	f1 = (U8 *) p1;
	3982	n1 = UTF8SKIP(f1);
	3983	}
	3984	else {
	3985	/* If in locale matching, we use two sets of rules, depending
	3986	* on if the code point is above or below 255. Here, we test
	3987	* for and handle locale rules */
	3988	if ((flags & FOLDEQ_LOCALE)
	3989	&& (! u1 \|\| ! UTF8_IS_ABOVE_LATIN1(*p1)))
	3990	{
	3991	/* There is no mixing of code points above and below 255. */
	3992	if (u2 && UTF8_IS_ABOVE_LATIN1(*p2)) {
	3993	return 0;
	3994	}
	3995
	3996	/* We handle locale rules by converting, if necessary, the
	3997	* code point to a single byte. */
	3998	if (! u1 \|\| UTF8_IS_INVARIANT(*p1)) {
	3999	foldbuf1 = p1;
	4000	}
	4001	else {
	4002	foldbuf1 = TWO_BYTE_UTF8_TO_NATIVE(p1, *(p1 + 1));
	4003	}
	4004	n1 = 1;
	4005	}
	4006	else if (isASCII(p1)) { / Note, that here won't be both
	4007	ASCII and using locale rules */
	4008
	4009	/* If trying to mix non- with ASCII, and not supposed to,
	4010	* fail */
	4011	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
	4012	return 0;
	4013	}
	4014	n1 = 1;
	4015	foldbuf1 = toFOLD(p1);
	4016	}
	4017	else if (u1) {
	4018	to_utf8_fold(p1, foldbuf1, &n1);
	4019	}
	4020	else { /* Not utf8, get utf8 fold */
	4021	to_uni_fold(*p1, foldbuf1, &n1);
	4022	}
	4023	f1 = foldbuf1;
	4024	}
	4025	}
	4026
	4027	if (n2 == 0) { /* Same for s2 */
	4028	if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
	4029	f2 = (U8 *) p2;
	4030	n2 = UTF8SKIP(f2);
	4031	}
	4032	else {
	4033	if ((flags & FOLDEQ_LOCALE)
	4034	&& (! u2 \|\| ! UTF8_IS_ABOVE_LATIN1(*p2)))
	4035	{
	4036	/* Here, the next char in s2 is < 256. We've already
	4037	* worked on s1, and if it isn't also < 256, can't match */
	4038	if (u1 && UTF8_IS_ABOVE_LATIN1(*p1)) {
	4039	return 0;
	4040	}
	4041	if (! u2 \|\| UTF8_IS_INVARIANT(*p2)) {
	4042	foldbuf2 = p2;
	4043	}
	4044	else {
	4045	foldbuf2 = TWO_BYTE_UTF8_TO_NATIVE(p2, *(p2 + 1));
	4046	}
	4047
	4048	/* Use another function to handle locale rules. We've made
	4049	* sure that both characters to compare are single bytes */
	4050	if (! foldEQ_locale((char ) f1, (char ) foldbuf2, 1)) {
	4051	return 0;
	4052	}
	4053	n1 = n2 = 0;
	4054	}
	4055	else if (isASCII(*p2)) {
	4056	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
	4057	return 0;
	4058	}
	4059	n2 = 1;
	4060	foldbuf2 = toFOLD(p2);
	4061	}
	4062	else if (u2) {
	4063	to_utf8_fold(p2, foldbuf2, &n2);
	4064	}
	4065	else {
	4066	to_uni_fold(*p2, foldbuf2, &n2);
	4067	}
	4068	f2 = foldbuf2;
	4069	}
	4070	}
	4071
	4072	/* Here f1 and f2 point to the beginning of the strings to compare.
	4073	* These strings are the folds of the next character from each input
	4074	* string, stored in utf8. */
	4075
	4076	/* While there is more to look for in both folds, see if they
	4077	* continue to match */
	4078	while (n1 && n2) {
	4079	U8 fold_length = UTF8SKIP(f1);
	4080	if (fold_length != UTF8SKIP(f2)
	4081	\|\| (fold_length == 1 && f1 != f2) /* Short circuit memNE
	4082	function call for single
	4083	byte */
	4084	\|\| memNE((char)f1, (char)f2, fold_length))
	4085	{
	4086	return 0; /* mismatch */
	4087	}
	4088
	4089	/* Here, they matched, advance past them */
	4090	n1 -= fold_length;
	4091	f1 += fold_length;
	4092	n2 -= fold_length;
	4093	f2 += fold_length;
	4094	}
	4095
	4096	/* When reach the end of any fold, advance the input past it */
	4097	if (n1 == 0) {
	4098	p1 += u1 ? UTF8SKIP(p1) : 1;
	4099	}
	4100	if (n2 == 0) {
	4101	p2 += u2 ? UTF8SKIP(p2) : 1;
	4102	}
	4103	} /* End of loop through both strings */
	4104
	4105	/* A match is defined by each scan that specified an explicit length
	4106	* reaching its final goal, and the other not having matched a partial
	4107	* character (which can happen when the fold of a character is more than one
	4108	* character). */
	4109	if (! ((g1 == 0 \|\| p1 == g1) && (g2 == 0 \|\| p2 == g2)) \|\| n1 \|\| n2) {
	4110	return 0;
	4111	}
	4112
	4113	/* Successful match. Set output pointers */
	4114	if (pe1) {
	4115	pe1 = (char)p1;
	4116	}
	4117	if (pe2) {
	4118	pe2 = (char)p2;
	4119	}
	4120	return 1;
	4121	}
	4122
	4123	/* XXX The next two functions should likely be moved to mathoms.c once all
	4124	* occurrences of them are removed from the core; some cpan-upstream modules
	4125	* still use them */
	4126
	4127	U8 *
	4128	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
	4129	{
	4130	PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
	4131
	4132	return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0);
	4133	}
	4134
	4135	/*
	4136	=for apidoc utf8n_to_uvuni
	4137
	4138	Instead use L</utf8_to_uvchr_buf>, or rarely, L</utf8n_to_uvchr>.
	4139
	4140	This function was useful for code that wanted to handle both EBCDIC and
	4141	ASCII platforms with Unicode properties, but starting in Perl v5.20, the
	4142	distinctions between the platforms have mostly been made invisible to most
	4143	code, so this function is quite unlikely to be what you want. If you do need
	4144	this precise functionality, use instead
	4145	C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>>
	4146	or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|/utf8n_to_uvchr>>.
	4147
	4148	=cut
	4149	*/
	4150
	4151	UV
	4152	Perl_utf8n_to_uvuni(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	4153	{
	4154	PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
	4155
	4156	return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
	4157	}
	4158
	4159	/*
	4160	=for apidoc uvuni_to_utf8_flags
	4161
	4162	Instead you almost certainly want to use L</uvchr_to_utf8> or
	4163	L</uvchr_to_utf8_flags>.
	4164
	4165	This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
	4166	which itself, while not deprecated, should be used only in isolated
	4167	circumstances. These functions were useful for code that wanted to handle
	4168	both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
	4169	v5.20, the distinctions between the platforms have mostly been made invisible
	4170	to most code, so this function is quite unlikely to be what you want.
	4171
	4172	=cut
	4173	*/
	4174
	4175	U8 *
	4176	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	4177	{
	4178	PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
	4179
	4180	return uvoffuni_to_utf8_flags(d, uv, flags);
	4181	}
	4182
	4183	/*
	4184	* Local variables:
	4185	* c-indentation-style: bsd
	4186	* c-basic-offset: 4
	4187	* indent-tabs-mode: nil
	4188	* End:
	4189	*
	4190	* ex: set ts=8 sts=4 sw=4 et:
	4191	*/