perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
	17	*
	18	* 'Well do I understand your speech,' he answered in the same language;
	19	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	20	* as is the custom in the West, if you wish to be answered?'
	21	* --Gandalf, addressing Théoden's door wardens
	22	*
	23	* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	24	*
	25	* ...the travellers perceived that the floor was paved with stones of many
	26	* hues; branching runes and strange devices intertwined beneath their feet.
	27	*
	28	* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	29	*/
	30
	31	#include "EXTERN.h"
	32	#define PERL_IN_UTF8_C
	33	#include "perl.h"
	34	#include "invlist_inline.h"
	35
	36	static const char unees[] =
	37	"Malformed UTF-8 character (unexpected end of string)";
	38	static const char cp_above_legal_max[] =
	39	"It is deprecated to use code point 0x%"UVXf"; the permissible max is 0x%"UVXf"";
	40
	41	#define MAX_NON_DEPRECATED_CP (IV_MAX)
	42
	43	/*
	44	=head1 Unicode Support
	45	These are various utility functions for manipulating UTF8-encoded
	46	strings. For the uninitiated, this is a method of representing arbitrary
	47	Unicode characters as a variable number of bytes, in such a way that
	48	characters in the ASCII range are unmodified, and a zero byte never appears
	49	within non-zero characters.
	50
	51	=cut
	52	*/
	53
	54	/*
	55	=for apidoc is_invariant_string
	56
	57	Returns true iff the first C<len> bytes of the string C<s> are the same
	58	regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
	59	EBCDIC machines). That is, if they are UTF-8 invariant. On ASCII-ish
	60	machines, all the ASCII characters and only the ASCII characters fit this
	61	definition. On EBCDIC machines, the ASCII-range characters are invariant, but
	62	so also are the C1 controls and C<\c?> (which isn't in the ASCII range on
	63	EBCDIC).
	64
	65	If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
	66	use this option, that C<s> can't have embedded C<NUL> characters and has to
	67	have a terminating C<NUL> byte).
	68
	69	See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
	70
	71	=cut
	72	*/
	73
	74	bool
	75	Perl_is_invariant_string(const U8 *s, STRLEN len)
	76	{
	77	const U8* const send = s + (len ? len : strlen((const char *)s));
	78	const U8* x = s;
	79
	80	PERL_ARGS_ASSERT_IS_INVARIANT_STRING;
	81
	82	for (; x < send; ++x) {
	83	if (!UTF8_IS_INVARIANT(*x))
	84	break;
	85	}
	86
	87	return x == send;
	88	}
	89
	90	/*
	91	=for apidoc uvoffuni_to_utf8_flags
	92
	93	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	94	Instead, B<Almost all code should use L</uvchr_to_utf8> or
	95	L</uvchr_to_utf8_flags>>.
	96
	97	This function is like them, but the input is a strict Unicode
	98	(as opposed to native) code point. Only in very rare circumstances should code
	99	not be using the native code point.
	100
	101	For details, see the description for L</uvchr_to_utf8_flags>.
	102
	103	=cut
	104	*/
	105
	106	#define HANDLE_UNICODE_SURROGATE(uv, flags) \
	107	STMT_START { \
	108	if (flags & UNICODE_WARN_SURROGATE) { \
	109	Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
	110	"UTF-16 surrogate U+%04"UVXf, uv); \
	111	} \
	112	if (flags & UNICODE_DISALLOW_SURROGATE) { \
	113	return NULL; \
	114	} \
	115	} STMT_END;
	116
	117	#define HANDLE_UNICODE_NONCHAR(uv, flags) \
	118	STMT_START { \
	119	if (flags & UNICODE_WARN_NONCHAR) { \
	120	Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
	121	"Unicode non-character U+%04"UVXf" is not " \
	122	"recommended for open interchange", uv); \
	123	} \
	124	if (flags & UNICODE_DISALLOW_NONCHAR) { \
	125	return NULL; \
	126	} \
	127	} STMT_END;
	128
	129	/* Use shorter names internally in this file */
	130	#define SHIFT UTF_ACCUMULATION_SHIFT
	131	#undef MARK
	132	#define MARK UTF_CONTINUATION_MARK
	133	#define MASK UTF_CONTINUATION_MASK
	134
	135	U8 *
	136	Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	137	{
	138	PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
	139
	140	if (OFFUNI_IS_INVARIANT(uv)) {
	141	*d++ = LATIN1_TO_NATIVE(uv);
	142	return d;
	143	}
	144	if (uv <= MAX_UTF8_TWO_BYTE) {
	145	*d++ = UTF8_TWO_BYTE_HI(uv);
	146	*d++ = UTF8_TWO_BYTE_LO(uv);
	147	return d;
	148	}
	149
	150	/* Not 2-byte; test for and handle 3-byte result. In the test immediately
	151	* below, the 16 is for start bytes E0-EF (which are all the possible ones
	152	* for 3 byte characters). The 2 is for 2 continuation bytes; these each
	153	* contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
	154	* on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
	155	* 0x800-0xFFFF on ASCII */
	156	if (uv < (16 * (1U << (2 * SHIFT)))) {
	157	d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) SHIFT)) \| UTF_START_MARK(3));
	158	d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) SHIFT)) & MASK) \| MARK);
	159	d++ = I8_TO_NATIVE_UTF8(( uv / (1 - 1) */ & MASK) \| MARK);
	160
	161	#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
	162	aren't tested here */
	163	/* The most likely code points in this range are below the surrogates.
	164	* Do an extra test to quickly exclude those. */
	165	if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
	166	if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
	167	\|\| UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
	168	{
	169	HANDLE_UNICODE_NONCHAR(uv, flags);
	170	}
	171	else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
	172	HANDLE_UNICODE_SURROGATE(uv, flags);
	173	}
	174	}
	175	#endif
	176	return d;
	177	}
	178
	179	/* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
	180	* platforms, and 0x4000 on EBCDIC. There are problematic cases that can
	181	* happen starting with 4-byte characters on ASCII platforms. We unify the
	182	* code for these with EBCDIC, even though some of them require 5-bytes on
	183	* those, because khw believes the code saving is worth the very slight
	184	* performance hit on these high EBCDIC code points. */
	185
	186	if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
	187	if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
	188	&& ckWARN_d(WARN_DEPRECATED))
	189	{
	190	Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
	191	cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
	192	}
	193	if ( (flags & UNICODE_WARN_SUPER)
	194	\|\| ( UNICODE_IS_ABOVE_31_BIT(uv)
	195	&& (flags & UNICODE_WARN_ABOVE_31_BIT)))
	196	{
	197	Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
	198
	199	/* Choose the more dire applicable warning */
	200	(UNICODE_IS_ABOVE_31_BIT(uv))
	201	? "Code point 0x%"UVXf" is not Unicode, and not portable"
	202	: "Code point 0x%"UVXf" is not Unicode, may not be portable",
	203	uv);
	204	}
	205	if (flags & UNICODE_DISALLOW_SUPER
	206	\|\| ( UNICODE_IS_ABOVE_31_BIT(uv)
	207	&& (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
	208	{
	209	return NULL;
	210	}
	211	}
	212	else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
	213	HANDLE_UNICODE_NONCHAR(uv, flags);
	214	}
	215
	216	/* Test for and handle 4-byte result. In the test immediately below, the
	217	* 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
	218	* characters). The 3 is for 3 continuation bytes; these each contribute
	219	* SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
	220	* ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
	221	* 0x1_0000-0x1F_FFFF on ASCII */
	222	if (uv < (8 * (1U << (3 * SHIFT)))) {
	223	d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) SHIFT)) \| UTF_START_MARK(4));
	224	d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) SHIFT)) & MASK) \| MARK);
	225	d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) SHIFT)) & MASK) \| MARK);
	226	d++ = I8_TO_NATIVE_UTF8(( uv / (1 - 1) */ & MASK) \| MARK);
	227
	228	#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
	229	characters. The end-plane non-characters for EBCDIC were
	230	handled just above */
	231	if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
	232	HANDLE_UNICODE_NONCHAR(uv, flags);
	233	}
	234	else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
	235	HANDLE_UNICODE_SURROGATE(uv, flags);
	236	}
	237	#endif
	238
	239	return d;
	240	}
	241
	242	/* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
	243	* platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
	244	* format. The unrolled version above turns out to not save all that much
	245	* time, and at these high code points (well above the legal Unicode range
	246	* on ASCII platforms, and well above anything in common use in EBCDIC),
	247	* khw believes that less code outweighs slight performance gains. */
	248
	249	{
	250	STRLEN len = OFFUNISKIP(uv);
	251	U8 *p = d+len-1;
	252	while (p > d) {
	253	*p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	254	uv >>= UTF_ACCUMULATION_SHIFT;
	255	}
	256	*p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	257	return d+len;
	258	}
	259	}
	260
	261	/*
	262	=for apidoc uvchr_to_utf8
	263
	264	Adds the UTF-8 representation of the native code point C<uv> to the end
	265	of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
	266	C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
	267	the byte after the end of the new character. In other words,
	268
	269	d = uvchr_to_utf8(d, uv);
	270
	271	is the recommended wide native character-aware way of saying
	272
	273	*(d++) = uv;
	274
	275	This function accepts any UV as input, but very high code points (above
	276	C<IV_MAX> on the platform) will raise a deprecation warning. This is
	277	typically 0x7FFF_FFFF in a 32-bit word.
	278
	279	It is possible to forbid or warn on non-Unicode code points, or those that may
	280	be problematic by using L</uvchr_to_utf8_flags>.
	281
	282	=cut
	283	*/
	284
	285	/* This is also a macro */
	286	PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
	287
	288	U8 *
	289	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	290	{
	291	return uvchr_to_utf8(d, uv);
	292	}
	293
	294	/*
	295	=for apidoc uvchr_to_utf8_flags
	296
	297	Adds the UTF-8 representation of the native code point C<uv> to the end
	298	of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
	299	C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
	300	the byte after the end of the new character. In other words,
	301
	302	d = uvchr_to_utf8_flags(d, uv, flags);
	303
	304	or, in most cases,
	305
	306	d = uvchr_to_utf8_flags(d, uv, 0);
	307
	308	This is the Unicode-aware way of saying
	309
	310	*(d++) = uv;
	311
	312	If C<flags> is 0, this function accepts any UV as input, but very high code
	313	points (above C<IV_MAX> for the platform) will raise a deprecation warning.
	314	This is typically 0x7FFF_FFFF in a 32-bit word.
	315
	316	Specifying C<flags> can further restrict what is allowed and not warned on, as
	317	follows:
	318
	319	If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
	320	the function will raise a warning, provided UTF8 warnings are enabled. If
	321	instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
	322	NULL. If both flags are set, the function will both warn and return NULL.
	323
	324	Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
	325	affect how the function handles a Unicode non-character.
	326
	327	And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
	328	affect the handling of code points that are above the Unicode maximum of
	329	0x10FFFF. Languages other than Perl may not be able to accept files that
	330	contain these.
	331
	332	The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
	333	the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
	334	three DISALLOW flags.
	335
	336	Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
	337	so using them is more problematic than other above-Unicode code points. Perl
	338	invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
	339	likely that non-Perl languages will not be able to read files that contain
	340	these that were written by the perl interpreter; nor would Perl understand files
	341	written by something that uses a different extension. For these reasons, there
	342	is a separate set of flags that can warn and/or disallow these extremely high
	343	code points, even if other above-Unicode ones are accepted. These are the
	344	C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags. These
	345	are entirely independent from the deprecation warning for code points above
	346	C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
	347	code point that needs more than 31 bits to represent. When that happens,
	348	effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
	349	32-bit machines. (Of course C<UNICODE_DISALLOW_SUPER> will treat all
	350	above-Unicode code points, including these, as malformations; and
	351	C<UNICODE_WARN_SUPER> warns on these.)
	352
	353	On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
	354	extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
	355	than on ASCII. Prior to that, code points 2**31 and higher were simply
	356	unrepresentable, and a different, incompatible method was used to represent
	357	code points between 2**30 and 2**31 - 1. The flags C<UNICODE_WARN_ABOVE_31_BIT>
	358	and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
	359	platforms, warning and disallowing 2**31 and higher.
	360
	361	=cut
	362	*/
	363
	364	/* This is also a macro */
	365	PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
	366
	367	U8 *
	368	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	369	{
	370	return uvchr_to_utf8_flags(d, uv, flags);
	371	}
	372
	373	/*
	374	=for apidoc is_utf8_string
	375
	376	Returns true if the first C<len> bytes of string C<s> form a valid
	377	UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
	378	using C<strlen(s)> (which means if you use this option, that C<s> can't have
	379	embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
	380	that all characters being ASCII constitute 'a valid UTF-8 string'.
	381
	382	See also L</is_invariant_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
	383
	384	=cut
	385	*/
	386
	387	bool
	388	Perl_is_utf8_string(const U8 *s, STRLEN len)
	389	{
	390	const U8* const send = s + (len ? len : strlen((const char *)s));
	391	const U8* x = s;
	392
	393	PERL_ARGS_ASSERT_IS_UTF8_STRING;
	394
	395	while (x < send) {
	396	STRLEN len = isUTF8_CHAR(x, send);
	397	if (UNLIKELY(! len)) {
	398	return FALSE;
	399	}
	400	x += len;
	401	}
	402
	403	return TRUE;
	404	}
	405
	406	/*
	407	Implemented as a macro in utf8.h
	408
	409	=for apidoc is_utf8_string_loc
	410
	411	Like L</is_utf8_string> but stores the location of the failure (in the
	412	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	413	"utf8ness success") in the C<ep>.
	414
	415	See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
	416
	417	=for apidoc is_utf8_string_loclen
	418
	419	Like L</is_utf8_string>() but stores the location of the failure (in the
	420	case of "utf8ness failure") or the location C<s>+C<len> (in the case of
	421	"utf8ness success") in the C<ep>, and the number of UTF-8
	422	encoded characters in the C<el>.
	423
	424	See also L</is_utf8_string_loc>() and L</is_utf8_string>().
	425
	426	=cut
	427	*/
	428
	429	bool
	430	Perl_is_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	431	{
	432	const U8* const send = s + (len ? len : strlen((const char *)s));
	433	const U8* x = s;
	434	STRLEN outlen = 0;
	435
	436	PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
	437
	438	while (x < send) {
	439	STRLEN len = isUTF8_CHAR(x, send);
	440	if (UNLIKELY(! len)) {
	441	goto out;
	442	}
	443	x += len;
	444	outlen++;
	445	}
	446
	447	out:
	448	if (el)
	449	*el = outlen;
	450
	451	if (ep)
	452	*ep = x;
	453	return (x == send);
	454	}
	455
	456	/*
	457
	458	=for apidoc utf8n_to_uvchr
	459
	460	THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
	461	Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
	462
	463	Bottom level UTF-8 decode routine.
	464	Returns the native code point value of the first character in the string C<s>,
	465	which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
	466	C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
	467	the length, in bytes, of that character.
	468
	469	The value of C<flags> determines the behavior when C<s> does not point to a
	470	well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
	471	zero is returned and C<retlen> is set so that (S<C<s> + C<retlen>>) is the
	472	next possible position in C<s> that could begin a non-malformed character.
	473	Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
	474
	475	Various ALLOW flags can be set in C<flags> to allow (and not warn on)
	476	individual types of malformations, such as the sequence being overlong (that
	477	is, when there is a shorter sequence that can express the same code point;
	478	overlong sequences are expressly forbidden in the UTF-8 standard due to
	479	potential security issues). Another malformation example is the first byte of
	480	a character not being a legal first byte. See F<utf8.h> for the list of such
	481	flags. For allowed 0 length strings, this function returns 0; for allowed
	482	overlong sequences, the computed code point is returned; for all other allowed
	483	malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
	484	determinable reasonable value.
	485
	486	The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
	487	flags) malformation is found. If this flag is set, the routine assumes that
	488	the caller will raise a warning, and this function will silently just set
	489	C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
	490
	491	Note that this API requires disambiguation between successful decoding a C<NUL>
	492	character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
	493	in both cases, 0 is returned. To disambiguate, upon a zero return, see if the
	494	first byte of C<s> is 0 as well. If so, the input was a C<NUL>; if not, the
	495	input had an error.
	496
	497	Certain code points are considered problematic. These are Unicode surrogates,
	498	Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
	499	By default these are considered regular code points, but certain situations
	500	warrant special handling for them. If C<flags> contains
	501	C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all three classes are treated as
	502	malformations and handled as such. The flags C<UTF8_DISALLOW_SURROGATE>,
	503	C<UTF8_DISALLOW_NONCHAR>, and C<UTF8_DISALLOW_SUPER> (meaning above the legal
	504	Unicode maximum) can be set to disallow these categories individually.
	505
	506	The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
	507	C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
	508	raised for their respective categories, but otherwise the code points are
	509	considered valid (not malformations). To get a category to both be treated as
	510	a malformation and raise a warning, specify both the WARN and DISALLOW flags.
	511	(But note that warnings are not raised if lexically disabled nor if
	512	C<UTF8_CHECK_ONLY> is also specified.)
	513
	514	It is now deprecated to have very high code points (above C<IV_MAX> on the
	515	platforms) and this function will raise a deprecation warning for these (unless
	516	such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1)
	517	in a 32-bit word.
	518
	519	Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
	520	so using them is more problematic than other above-Unicode code points. Perl
	521	invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
	522	likely that non-Perl languages will not be able to read files that contain
	523	these that were written by the perl interpreter; nor would Perl understand files
	524	written by something that uses a different extension. For these reasons, there
	525	is a separate set of flags that can warn and/or disallow these extremely high
	526	code points, even if other above-Unicode ones are accepted. These are the
	527	C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags. These
	528	are entirely independent from the deprecation warning for code points above
	529	C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
	530	code point that needs more than 31 bits to represent. When that happens,
	531	effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
	532	32-bit machines. (Of course C<UTF8_DISALLOW_SUPER> will treat all
	533	above-Unicode code points, including these, as malformations; and
	534	C<UTF8_WARN_SUPER> warns on these.)
	535
	536	On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
	537	extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
	538	than on ASCII. Prior to that, code points 2**31 and higher were simply
	539	unrepresentable, and a different, incompatible method was used to represent
	540	code points between 2**30 and 2**31 - 1. The flags C<UTF8_WARN_ABOVE_31_BIT>
	541	and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
	542	platforms, warning and disallowing 2**31 and higher.
	543
	544	All other code points corresponding to Unicode characters, including private
	545	use and those yet to be assigned, are never considered malformed and never
	546	warn.
	547
	548	=cut
	549	*/
	550
	551	UV
	552	Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	553	{
	554	const U8 * const s0 = s;
	555	U8 overflow_byte = '\0'; /* Save byte in case of overflow */
	556	U8 * send;
	557	UV uv = *s;
	558	STRLEN expectlen;
	559	SV* sv = NULL;
	560	UV outlier_ret = 0; /* return value when input is in error or problematic
	561	*/
	562	UV pack_warn = 0; /* Save result of packWARN() for later */
	563	bool unexpected_non_continuation = FALSE;
	564	bool overflowed = FALSE;
	565	bool do_overlong_test = TRUE; /* May have to skip this test */
	566
	567	const char* const malformed_text = "Malformed UTF-8 character";
	568
	569	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
	570
	571	/* The order of malformation tests here is important. We should consume as
	572	* few bytes as possible in order to not skip any valid character. This is
	573	* required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
	574	* http://unicode.org/reports/tr36 for more discussion as to why. For
	575	* example, once we've done a UTF8SKIP, we can tell the expected number of
	576	* bytes, and could fail right off the bat if the input parameters indicate
	577	* that there are too few available. But it could be that just that first
	578	* byte is garbled, and the intended character occupies fewer bytes. If we
	579	* blindly assumed that the first byte is correct, and skipped based on
	580	* that number, we could skip over a valid input character. So instead, we
	581	* always examine the sequence byte-by-byte.
	582	*
	583	* We also should not consume too few bytes, otherwise someone could inject
	584	* things. For example, an input could be deliberately designed to
	585	* overflow, and if this code bailed out immediately upon discovering that,
	586	* returning to the caller C<*retlen> pointing to the very next byte (one
	587	* which is actually part of of the overflowing sequence), that could look
	588	* legitimate to the caller, which could discard the initial partial
	589	* sequence and process the rest, inappropriately */
	590
	591	/* Zero length strings, if allowed, of necessity are zero */
	592	if (UNLIKELY(curlen == 0)) {
	593	if (retlen) {
	594	*retlen = 0;
	595	}
	596
	597	if (flags & UTF8_ALLOW_EMPTY) {
	598	return 0;
	599	}
	600	if (! (flags & UTF8_CHECK_ONLY)) {
	601	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
	602	}
	603	goto malformed;
	604	}
	605
	606	expectlen = UTF8SKIP(s);
	607
	608	/* A well-formed UTF-8 character, as the vast majority of calls to this
	609	* function will be for, has this expected length. For efficiency, set
	610	* things up here to return it. It will be overriden only in those rare
	611	* cases where a malformation is found */
	612	if (retlen) {
	613	*retlen = expectlen;
	614	}
	615
	616	/* An invariant is trivially well-formed */
	617	if (UTF8_IS_INVARIANT(uv)) {
	618	return uv;
	619	}
	620
	621	/* A continuation character can't start a valid sequence */
	622	if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
	623	if (flags & UTF8_ALLOW_CONTINUATION) {
	624	if (retlen) {
	625	*retlen = 1;
	626	}
	627	return UNICODE_REPLACEMENT;
	628	}
	629
	630	if (! (flags & UTF8_CHECK_ONLY)) {
	631	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
	632	}
	633	curlen = 1;
	634	goto malformed;
	635	}
	636
	637	/* Here is not a continuation byte, nor an invariant. The only thing left
	638	* is a start byte (possibly for an overlong) */
	639
	640	#ifdef EBCDIC
	641	uv = NATIVE_UTF8_TO_I8(uv);
	642	#endif
	643
	644	/* Remove the leading bits that indicate the number of bytes in the
	645	* character's whole UTF-8 sequence, leaving just the bits that are part of
	646	* the value */
	647	uv &= UTF_START_MASK(expectlen);
	648
	649	/* Now, loop through the remaining bytes in the character's sequence,
	650	* accumulating each into the working value as we go. Be sure to not look
	651	* past the end of the input string */
	652	send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
	653
	654	for (s = s0 + 1; s < send; s++) {
	655	if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
	656	if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
	657
	658	/* The original implementors viewed this malformation as more
	659	* serious than the others (though I, khw, don't understand
	660	* why, since other malformations also give very very wrong
	661	* results), so there is no way to turn off checking for it.
	662	* Set a flag, but keep going in the loop, so that we absorb
	663	* the rest of the bytes that comprise the character. */
	664	overflowed = TRUE;
	665	overflow_byte = s; / Save for warning message's use */
	666	}
	667	uv = UTF8_ACCUMULATE(uv, *s);
	668	}
	669	else {
	670	/* Here, found a non-continuation before processing all expected
	671	* bytes. This byte begins a new character, so quit, even if
	672	* allowing this malformation. */
	673	unexpected_non_continuation = TRUE;
	674	break;
	675	}
	676	} /* End of loop through the character's bytes */
	677
	678	/* Save how many bytes were actually in the character */
	679	curlen = s - s0;
	680
	681	/* The loop above finds two types of malformations: non-continuation and/or
	682	* overflow. The non-continuation malformation is really a too-short
	683	* malformation, as it means that the current character ended before it was
	684	* expected to (being terminated prematurely by the beginning of the next
	685	* character, whereas in the too-short malformation there just are too few
	686	* bytes available to hold the character. In both cases, the check below
	687	* that we have found the expected number of bytes would fail if executed.)
	688	* Thus the non-continuation malformation is really unnecessary, being a
	689	* subset of the too-short malformation. But there may be existing
	690	* applications that are expecting the non-continuation type, so we retain
	691	* it, and return it in preference to the too-short malformation. (If this
	692	* code were being written from scratch, the two types might be collapsed
	693	* into one.) I, khw, am also giving priority to returning the
	694	* non-continuation and too-short malformations over overflow when multiple
	695	* ones are present. I don't know of any real reason to prefer one over
	696	* the other, except that it seems to me that multiple-byte errors trumps
	697	* errors from a single byte */
	698	if (UNLIKELY(unexpected_non_continuation)) {
	699	if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	700	if (! (flags & UTF8_CHECK_ONLY)) {
	701	if (curlen == 1) {
	702	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, s, s0));
	703	}
	704	else {
	705	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, s, (int) curlen, s0, (int)expectlen));
	706	}
	707	}
	708	goto malformed;
	709	}
	710	uv = UNICODE_REPLACEMENT;
	711
	712	/* Skip testing for overlongs, as the REPLACEMENT may not be the same
	713	* as what the original expectations were. */
	714	do_overlong_test = FALSE;
	715	if (retlen) {
	716	*retlen = curlen;
	717	}
	718	}
	719	else if (UNLIKELY(curlen < expectlen)) {
	720	if (! (flags & UTF8_ALLOW_SHORT)) {
	721	if (! (flags & UTF8_CHECK_ONLY)) {
	722	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
	723	}
	724	goto malformed;
	725	}
	726	uv = UNICODE_REPLACEMENT;
	727	do_overlong_test = FALSE;
	728	if (retlen) {
	729	*retlen = curlen;
	730	}
	731	}
	732
	733	if (UNLIKELY(overflowed)) {
	734	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
	735	goto malformed;
	736	}
	737
	738	if (do_overlong_test
	739	&& expectlen > (STRLEN) OFFUNISKIP(uv)
	740	&& ! (flags & UTF8_ALLOW_LONG))
	741	{
	742	/* The overlong malformation has lower precedence than the others.
	743	* Note that if this malformation is allowed, we return the actual
	744	* value, instead of the replacement character. This is because this
	745	* value is actually well-defined. */
	746	if (! (flags & UTF8_CHECK_ONLY)) {
	747	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
	748	}
	749	goto malformed;
	750	}
	751
	752	/* Here, the input is considered to be well-formed, but it still could be a
	753	* problematic code point that is not allowed by the input parameters. */
	754	if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
	755	&& ((flags & ( UTF8_DISALLOW_NONCHAR
	756	\|UTF8_DISALLOW_SURROGATE
	757	\|UTF8_DISALLOW_SUPER
	758	\|UTF8_DISALLOW_ABOVE_31_BIT
	759	\|UTF8_WARN_NONCHAR
	760	\|UTF8_WARN_SURROGATE
	761	\|UTF8_WARN_SUPER
	762	\|UTF8_WARN_ABOVE_31_BIT))
	763	\|\| ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
	764	&& ckWARN_d(WARN_DEPRECATED))))
	765	{
	766	if (UNICODE_IS_SURROGATE(uv)) {
	767
	768	/* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
	769	* generation of the sv, since no warnings are raised under CHECK */
	770	if ((flags & (UTF8_WARN_SURROGATE\|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
	771	&& ckWARN_d(WARN_SURROGATE))
	772	{
	773	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
	774	pack_warn = packWARN(WARN_SURROGATE);
	775	}
	776	if (flags & UTF8_DISALLOW_SURROGATE) {
	777	goto disallowed;
	778	}
	779	}
	780	else if ((uv > PERL_UNICODE_MAX)) {
	781	if ((flags & (UTF8_WARN_SUPER\|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
	782	&& ckWARN_d(WARN_NON_UNICODE))
	783	{
	784	sv = sv_2mortal(Perl_newSVpvf(aTHX_
	785	"Code point 0x%04"UVXf" is not Unicode, may not be portable",
	786	uv));
	787	pack_warn = packWARN(WARN_NON_UNICODE);
	788	}
	789
	790	/* The maximum code point ever specified by a standard was
	791	* 2**31 - 1. Anything larger than that is a Perl extension that
	792	* very well may not be understood by other applications (including
	793	* earlier perl versions on EBCDIC platforms). On ASCII platforms,
	794	* these code points are indicated by the first UTF-8 byte being
	795	* 0xFE or 0xFF. We test for these after the regular SUPER ones,
	796	* and before possibly bailing out, so that the slightly more dire
	797	* warning will override the regular one. */
	798	if (
	799	#ifndef EBCDIC
	800	(s0 & 0xFE) == 0xFE / matches both FE, FF */
	801	#else
	802	/* The I8 for 2**31 (U+80000000) is
	803	* \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
	804	* and it turns out that on all EBCDIC pages recognized that
	805	* the UTF-EBCDIC for that code point is
	806	* \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
	807	* For the next lower code point, the 1047 UTF-EBCDIC is
	808	* \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
	809	* The other code pages differ only in the bytes following
	810	* \x42. Thus the following works (the minimum continuation
	811	* byte is \x41). */
	812	*s0 == 0xFE && send - s0 > 7 && ( s0[1] > 0x41
	813	\|\| s0[2] > 0x41
	814	\|\| s0[3] > 0x41
	815	\|\| s0[4] > 0x41
	816	\|\| s0[5] > 0x41
	817	\|\| s0[6] > 0x41
	818	\|\| s0[7] > 0x42)
	819	#endif
	820	&& (flags & (UTF8_WARN_ABOVE_31_BIT\|UTF8_WARN_SUPER
	821	\|UTF8_DISALLOW_ABOVE_31_BIT)))
	822	{
	823	if ( ! (flags & UTF8_CHECK_ONLY)
	824	&& (flags & (UTF8_WARN_ABOVE_31_BIT\|UTF8_WARN_SUPER))
	825	&& ckWARN_d(WARN_UTF8))
	826	{
	827	sv = sv_2mortal(Perl_newSVpvf(aTHX_
	828	"Code point 0x%"UVXf" is not Unicode, and not portable",
	829	uv));
	830	pack_warn = packWARN(WARN_UTF8);
	831	}
	832	if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
	833	goto disallowed;
	834	}
	835	}
	836
	837	if (flags & UTF8_DISALLOW_SUPER) {
	838	goto disallowed;
	839	}
	840
	841	/* The deprecated warning overrides any non-deprecated one */
	842	if (UNLIKELY(uv > MAX_NON_DEPRECATED_CP) && ckWARN_d(WARN_DEPRECATED))
	843	{
	844	sv = sv_2mortal(Perl_newSVpvf(aTHX_ cp_above_legal_max,
	845	uv, MAX_NON_DEPRECATED_CP));
	846	pack_warn = packWARN(WARN_DEPRECATED);
	847	}
	848	}
	849	else if (UNICODE_IS_NONCHAR(uv)) {
	850	if ((flags & (UTF8_WARN_NONCHAR\|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
	851	&& ckWARN_d(WARN_NONCHAR))
	852	{
	853	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is not recommended for open interchange", uv));
	854	pack_warn = packWARN(WARN_NONCHAR);
	855	}
	856	if (flags & UTF8_DISALLOW_NONCHAR) {
	857	goto disallowed;
	858	}
	859	}
	860
	861	if (sv) {
	862	outlier_ret = uv; /* Note we don't bother to convert to native,
	863	as all the outlier code points are the same
	864	in both ASCII and EBCDIC */
	865	goto do_warn;
	866	}
	867
	868	/* Here, this is not considered a malformed character, so drop through
	869	* to return it */
	870	}
	871
	872	return UNI_TO_NATIVE(uv);
	873
	874	/* There are three cases which get to beyond this point. In all 3 cases:
	875	* <sv> if not null points to a string to print as a warning.
	876	* <curlen> is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
	877	* set.
	878	* <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
	879	* This is done by initializing it to 0, and changing it only
	880	* for case 1).
	881	* The 3 cases are:
	882	* 1) The input is valid but problematic, and to be warned about. The
	883	* return value is the resultant code point; <*retlen> is set to
	884	* <curlen>, the number of bytes that comprise the code point.
	885	* <pack_warn> contains the result of packWARN() for the warning
	886	* types. The entry point for this case is the label <do_warn>;
	887	* 2) The input is a valid code point but disallowed by the parameters to
	888	* this function. The return value is 0. If UTF8_CHECK_ONLY is set,
	889	* <*retlen> is -1; otherwise it is <curlen>, the number of bytes that
	890	* comprise the code point. <pack_warn> contains the result of
	891	* packWARN() for the warning types. The entry point for this case is
	892	* the label <disallowed>.
	893	* 3) The input is malformed. The return value is 0. If UTF8_CHECK_ONLY
	894	* is set, <*retlen> is -1; otherwise it is <curlen>, the number of
	895	* bytes that comprise the malformation. All such malformations are
	896	* assumed to be warning type <utf8>. The entry point for this case
	897	* is the label <malformed>.
	898	*/
	899
	900	malformed:
	901
	902	if (sv && ckWARN_d(WARN_UTF8)) {
	903	pack_warn = packWARN(WARN_UTF8);
	904	}
	905
	906	disallowed:
	907
	908	if (flags & UTF8_CHECK_ONLY) {
	909	if (retlen)
	910	*retlen = ((STRLEN) -1);
	911	return 0;
	912	}
	913
	914	do_warn:
	915
	916	if (pack_warn) { /* <pack_warn> was initialized to 0, and changed only
	917	if warnings are to be raised. */
	918	const char * const string = SvPVX_const(sv);
	919
	920	if (PL_op)
	921	Perl_warner(aTHX_ pack_warn, "%s in %s", string, OP_DESC(PL_op));
	922	else
	923	Perl_warner(aTHX_ pack_warn, "%s", string);
	924	}
	925
	926	if (retlen) {
	927	*retlen = curlen;
	928	}
	929
	930	return outlier_ret;
	931	}
	932
	933	/*
	934	=for apidoc utf8_to_uvchr_buf
	935
	936	Returns the native code point of the first character in the string C<s> which
	937	is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
	938	C<*retlen> will be set to the length, in bytes, of that character.
	939
	940	If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
	941	enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
	942	C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
	943	(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
	944	C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
	945	the next possible position in C<s> that could begin a non-malformed character.
	946	See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
	947	returned.
	948
	949	Code points above the platform's C<IV_MAX> will raise a deprecation warning,
	950	unless those are turned off.
	951
	952	=cut
	953	*/
	954
	955
	956	UV
	957	Perl_utf8_to_uvchr_buf(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
	958	{
	959	assert(s < send);
	960
	961	return utf8n_to_uvchr(s, send - s, retlen,
	962	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	963	}
	964
	965	/* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
	966	* there are no malformations in the input UTF-8 string C<s>. Surrogates,
	967	* non-character code points, and non-Unicode code points are allowed. */
	968
	969	UV
	970	Perl_valid_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
	971	{
	972	UV expectlen = UTF8SKIP(s);
	973	const U8* send = s + expectlen;
	974	UV uv = *s;
	975
	976	PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
	977	PERL_UNUSED_CONTEXT;
	978
	979	if (retlen) {
	980	*retlen = expectlen;
	981	}
	982
	983	/* An invariant is trivially returned */
	984	if (expectlen == 1) {
	985	return uv;
	986	}
	987
	988	#ifdef EBCDIC
	989	uv = NATIVE_UTF8_TO_I8(uv);
	990	#endif
	991
	992	/* Remove the leading bits that indicate the number of bytes, leaving just
	993	* the bits that are part of the value */
	994	uv &= UTF_START_MASK(expectlen);
	995
	996	/* Now, loop through the remaining bytes, accumulating each into the
	997	* working total as we go. (I khw tried unrolling the loop for up to 4
	998	* bytes, but there was no performance improvement) */
	999	for (++s; s < send; s++) {
	1000	uv = UTF8_ACCUMULATE(uv, *s);
	1001	}
	1002
	1003	return UNI_TO_NATIVE(uv);
	1004
	1005	}
	1006
	1007	/*
	1008	=for apidoc utf8_to_uvuni_buf
	1009
	1010	Only in very rare circumstances should code need to be dealing in Unicode
	1011	(as opposed to native) code points. In those few cases, use
	1012	C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))\|/utf8_to_uvchr_buf>> instead.
	1013
	1014	Returns the Unicode (not-native) code point of the first character in the
	1015	string C<s> which
	1016	is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
	1017	C<*retlen> will be set to the length, in bytes, of that character.
	1018
	1019	If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
	1020	enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
	1021	NULL) to -1. If those warnings are off, the computed value if well-defined (or
	1022	the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
	1023	is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
	1024	next possible position in C<s> that could begin a non-malformed character.
	1025	See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
	1026
	1027	Code points above the platform's C<IV_MAX> will raise a deprecation warning,
	1028	unless those are turned off.
	1029
	1030	=cut
	1031	*/
	1032
	1033	UV
	1034	Perl_utf8_to_uvuni_buf(pTHX_ const U8 s, const U8 send, STRLEN *retlen)
	1035	{
	1036	PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
	1037
	1038	assert(send > s);
	1039
	1040	/* Call the low level routine asking for checks */
	1041	return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
	1042	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
	1043	}
	1044
	1045	/*
	1046	=for apidoc utf8_length
	1047
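	1048	Return the length of the UTF-8 char encoded string C<s> in characters.
	1049	Stops at C<e> (exclusive). If C<e E<lt> s> or if the scan would end
	1050	up past C<e>, a "Malformed UTF-8 character (unexpected end of string)"
	1050	warning is raised (unless UTF-8 warnings are disabled) and the number of
	1050	characters that lie completely before C<e> is returned.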
	1051
	1052	=cut
	1053	*/
	1054
	1055	STRLEN
	1056	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	1057	{
	1058	STRLEN len = 0;
	1059
	1060	PERL_ARGS_ASSERT_UTF8_LENGTH;
	1061
	1062	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	1063	* the bitops (especially ~) can create illegal UTF-8.
	1064	* In other words: in Perl UTF-8 is not just for Unicode. */
	1065
	1066	if (e < s)
	1067	goto warn_and_return;
	1068	while (s < e) {
	1069	s += UTF8SKIP(s);
	1070	len++;
	1071	}
	1072
	1073	if (e != s) {
	1074	len--;
	1075	warn_and_return:
	1076	if (PL_op)
	1077	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	1078	"%s in %s", unees, OP_DESC(PL_op));
	1079	else
	1080	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	1081	}
	1082
	1083	return len;
	1084	}
	1085
	1086	/*
	1087	=for apidoc utf8_distance
	1088
	1089	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	1090	and C<b>.
	1091
	1092	WARNING: use only if you know that the pointers point inside the
	1093	same UTF-8 buffer.
	1094
	1095	=cut
	1096	*/
	1097
	1098	IV
	1099	Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
	1100	{
	1101	PERL_ARGS_ASSERT_UTF8_DISTANCE;
	1102
	1103	return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
	1104	}
	1105
	1106	/*
	1107	=for apidoc utf8_hop
	1108
	1109	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	1110	forward or backward.
	1111
	1112	WARNING: do not use the following unless you know C<off> is within
	1113	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	1114	on the first byte of character or just after the last byte of a character.
	1115
	1116	=cut
	1117	*/
	1118
	1119	U8 *
	1120	Perl_utf8_hop(const U8 *s, I32 off)
	1121	{
	1122	PERL_ARGS_ASSERT_UTF8_HOP;
	1123
	1124	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	1125	* the bitops (especially ~) can create illegal UTF-8.
	1126	* In other words: in Perl UTF-8 is not just for Unicode. */
	1127
	1128	if (off >= 0) {
	1129	while (off--)
	1130	s += UTF8SKIP(s);
	1131	}
	1132	else {
	1133	while (off++) {
	1134	s--;
	1135	while (UTF8_IS_CONTINUATION(*s))
	1136	s--;
	1137	}
	1138	}
	1139	return (U8 *)s;
	1140	}
	1141
	1142	/*
	1143	=for apidoc bytes_cmp_utf8
	1144
	1145	Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
	1146	sequence of characters (stored as UTF-8)
	1147	in C<u>, C<ulen>. Returns 0 if they are
	1148	equal, -1 or -2 if the first string is less than the second string, +1 or +2
	1149	if the first string is greater than the second string.
	1150
	1151	-1 or +1 is returned if the shorter string was identical to the start of the
	1152	longer string. -2 or +2 is returned if
	1153	there was a difference between characters
	1154	within the strings.
	1155
	1156	=cut
	1157	*/
	1158
	1159	int
	1160	Perl_bytes_cmp_utf8(pTHX_ const U8 b, STRLEN blen, const U8 u, STRLEN ulen)
	1161	{
	1162	const U8 *const bend = b + blen;
	1163	const U8 *const uend = u + ulen;
	1164
	1165	PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
	1166
	1167	while (b < bend && u < uend) {
	1168	U8 c = *u++;
	1169	if (!UTF8_IS_INVARIANT(c)) {
	1170	if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	1171	if (u < uend) {
	1172	U8 c1 = *u++;
	1173	if (UTF8_IS_CONTINUATION(c1)) {
	1174	c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
	1175	} else {
	1176	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	1177	"Malformed UTF-8 character "
	1178	"(unexpected non-continuation byte 0x%02x"
	1179	", immediately after start byte 0x%02x)"
	1180	/* Dear diag.t, it's in the pod. */
	1181	"%s%s", c1, c,
	1182	PL_op ? " in " : "",
	1183	PL_op ? OP_DESC(PL_op) : "");
	1184	return -2;
	1185	}
	1186	} else {
	1187	if (PL_op)
	1188	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	1189	"%s in %s", unees, OP_DESC(PL_op));
	1190	else
	1191	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	1192	return -2; /* Really want to return undef :-) */
	1193	}
	1194	} else {
	1195	return -2;
	1196	}
	1197	}
	1198	if (*b != c) {
	1199	return *b < c ? -2 : +2;
	1200	}
	1201	++b;
	1202	}
	1203
	1204	if (b == bend && u == uend)
	1205	return 0;
	1206
	1207	return b < bend ? +1 : -1;
	1208	}
	1209
	1210	/*
	1211	=for apidoc utf8_to_bytes
	1212
	1213	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	1214	Unlike L</bytes_to_utf8>, this over-writes the original string, and
	1215	updates C<len> to contain the new length.
	1216	Returns zero on failure, setting C<len> to -1.
	1217
	1218	If you need a copy of the string, see L</bytes_from_utf8>.
	1219
	1220	=cut
	1221	*/
	1222
	1223	U8 *
	1224	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	1225	{
	1226	U8 * const save = s;
	1227	U8 * const send = s + *len;
	1228	U8 *d;
	1229
	1230	PERL_ARGS_ASSERT_UTF8_TO_BYTES;
	1231	PERL_UNUSED_CONTEXT;
	1232
	1233	/* ensure valid UTF-8 and chars < 256 before updating string */
	1234	while (s < send) {
	1235	if (! UTF8_IS_INVARIANT(*s)) {
	1236	if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
	1237	*len = ((STRLEN) -1);
	1238	return 0;
	1239	}
	1240	s++;
	1241	}
	1242	s++;
	1243	}
	1244
	1245	d = s = save;
	1246	while (s < send) {
	1247	U8 c = *s++;
	1248	if (! UTF8_IS_INVARIANT(c)) {
	1249	/* Then it is two-byte encoded */
	1250	c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
	1251	s++;
	1252	}
	1253	*d++ = c;
	1254	}
	1255	*d = '\0';
	1256	*len = d - save;
	1257	return save;
	1258	}
	1259
	1260	/*
	1261	=for apidoc bytes_from_utf8
	1262
	1263	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	1264	Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
	1265	the newly-created string, and updates C<len> to contain the new
	1266	length. Returns the original string if no conversion occurs, C<len>
	1267	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	1268	0 if C<s> is converted or consisted entirely of characters that are invariant
	1269	in UTF-8 (i.e., US-ASCII on non-EBCDIC machines).
	1270
	1271	=cut
	1272	*/
	1273
	1274	U8 *
	1275	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN len, bool *is_utf8)
	1276	{
	1277	U8 *d;
	1278	const U8 *start = s;
	1279	const U8 *send;
	1280	I32 count = 0;
	1281
	1282	PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
	1283	PERL_UNUSED_CONTEXT;
	1284	if (!*is_utf8)
	1285	return (U8 *)start;
	1286
	1287	/* ensure valid UTF-8 and chars < 256 before converting string */
	1288	for (send = s + *len; s < send;) {
	1289	if (! UTF8_IS_INVARIANT(*s)) {
	1290	if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
	1291	return (U8 *)start;
	1292	}
	1293	count++;
	1294	s++;
	1295	}
	1296	s++;
	1297	}
	1298
	1299	*is_utf8 = FALSE;
	1300
	1301	Newx(d, (*len) - count + 1, U8);
	1302	s = start; start = d;
	1303	while (s < send) {
	1304	U8 c = *s++;
	1305	if (! UTF8_IS_INVARIANT(c)) {
	1306	/* Then it is two-byte encoded */
	1307	c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
	1308	s++;
	1309	}
	1310	*d++ = c;
	1311	}
	1312	*d = '\0';
	1313	*len = d - start;
	1314	return (U8 *)start;
	1315	}
	1316
	1317	/*
	1318	=for apidoc bytes_to_utf8
	1319
	1320	Converts a string C<s> of length C<len> bytes from the native encoding into
	1321	UTF-8.
	1322	Returns a pointer to the newly-created string, and sets C<len> to
	1323	reflect the new length in bytes.
	1324
	1325	A C<NUL> character will be written after the end of the string.
	1326
	1327	If you want to convert to UTF-8 from encodings other than
	1328	the native (Latin1 or EBCDIC),
	1329	see L</sv_recode_to_utf8>().
	1330
	1331	=cut
	1332	*/
	1333
	1334	/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
	1335	likewise need duplication. */
	1336
	1337	U8*
	1338	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN len)
	1339	{
	1340	const U8 * const send = s + (*len);
	1341	U8 *d;
	1342	U8 *dst;
	1343
	1344	PERL_ARGS_ASSERT_BYTES_TO_UTF8;
	1345	PERL_UNUSED_CONTEXT;
	1346
	1347	Newx(d, (len) 2 + 1, U8);
	1348	dst = d;
	1349
	1350	while (s < send) {
	1351	append_utf8_from_native_byte(*s, &d);
	1352	s++;
	1353	}
	1354	*d = '\0';
	1355	*len = d-dst;
	1356	return dst;
	1357	}
	1358
	1359	/*
	1360	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	1361	*
	1362	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	1363	* We optimize for native, for obvious reasons. */
	1364
	1365	U8*
	1366	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1367	{
	1368	U8* pend;
	1369	U8* dstart = d;
	1370
	1371	PERL_ARGS_ASSERT_UTF16_TO_UTF8;
	1372
	1373	if (bytelen & 1)
	1374	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
	1375
	1376	pend = p + bytelen;
	1377
	1378	while (p < pend) {
	1379	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	1380	p += 2;
	1381	if (OFFUNI_IS_INVARIANT(uv)) {
	1382	*d++ = LATIN1_TO_NATIVE((U8) uv);
	1383	continue;
	1384	}
	1385	if (uv <= MAX_UTF8_TWO_BYTE) {
	1386	*d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
	1387	*d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
	1388	continue;
	1389	}
	1390	#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
	1391	#define LAST_HIGH_SURROGATE 0xDBFF
	1392	#define FIRST_LOW_SURROGATE 0xDC00
	1393	#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
	1394
	1395	/* This assumes that most uses will be in the first Unicode plane, not
	1396	* needing surrogates */
	1397	if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
	1398	&& uv <= UNICODE_SURROGATE_LAST))
	1399	{
	1400	if (UNLIKELY(p >= pend) \|\| UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
	1401	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1402	}
	1403	else {
	1404	UV low = (p[0] << 8) + p[1];
	1405	if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
	1406	\|\| UNLIKELY(low > LAST_LOW_SURROGATE))
	1407	{
	1408	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1409	}
	1410	p += 2;
	1411	uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
	1412	+ (low - FIRST_LOW_SURROGATE) + 0x10000;
	1413	}
	1414	}
	1415	#ifdef EBCDIC
	1416	d = uvoffuni_to_utf8_flags(d, uv, 0);
	1417	#else
	1418	if (uv < 0x10000) {
	1419	*d++ = (U8)(( uv >> 12) \| 0xe0);
	1420	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1421	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1422	continue;
	1423	}
	1424	else {
	1425	*d++ = (U8)(( uv >> 18) \| 0xf0);
	1426	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	1427	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1428	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1429	continue;
	1430	}
	1431	#endif
	1432	}
	1433	*newlen = d - dstart;
	1434	return d;
	1435	}
	1436
	1437	/* Note: this one is slightly destructive of the source. */
	1438
	1439	U8*
	1440	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1441	{
	1442	U8* s = (U8*)p;
	1443	U8* const send = s + bytelen;
	1444
	1445	PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
	1446
	1447	if (bytelen & 1)
	1448	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
	1449	(UV)bytelen);
	1450
	1451	while (s < send) {
	1452	const U8 tmp = s[0];
	1453	s[0] = s[1];
	1454	s[1] = tmp;
	1455	s += 2;
	1456	}
	1457	return utf16_to_utf8(p, d, bytelen, newlen);
	1458	}
	1459
	1460	bool
	1461	Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
	1462	{
	1463	U8 tmpbuf[UTF8_MAXBYTES+1];
	1464	uvchr_to_utf8(tmpbuf, c);
	1465	return _is_utf8_FOO(classnum, tmpbuf);
	1466	}
	1467
	1468	/* Internal function so we can deprecate the external one, and call
	1469	this one from other deprecated functions in this file */
	1470
	1471	bool
	1472	Perl__is_utf8_idstart(pTHX_ const U8 *p)
	1473	{
	1474	PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
	1475
	1476	if (*p == '_')
	1477	return TRUE;
	1478	return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
	1479	}
	1480
	1481	bool
	1482	Perl__is_uni_perl_idcont(pTHX_ UV c)
	1483	{
	1484	U8 tmpbuf[UTF8_MAXBYTES+1];
	1485	uvchr_to_utf8(tmpbuf, c);
	1486	return _is_utf8_perl_idcont(tmpbuf);
	1487	}
	1488
	1489	bool
	1490	Perl__is_uni_perl_idstart(pTHX_ UV c)
	1491	{
	1492	U8 tmpbuf[UTF8_MAXBYTES+1];
	1493	uvchr_to_utf8(tmpbuf, c);
	1494	return _is_utf8_perl_idstart(tmpbuf);
	1495	}
	1496
	1497	UV
	1498	Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
	1499	{
	1500	/* We have the latin1-range values compiled into the core, so just use
	1501	* those, converting the result to UTF-8. The only difference between upper
	1502	* and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
	1503	* either "SS" or "Ss". Which one to use is passed into the routine in
	1504	* 'S_or_s' to avoid a test */
	1505
	1506	UV converted = toUPPER_LATIN1_MOD(c);
	1507
	1508	PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
	1509
	1510	assert(S_or_s == 'S' \|\| S_or_s == 's');
	1511
	1512	if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
	1513	characters in this range */
	1514	*p = (U8) converted;
	1515	*lenp = 1;
	1516	return converted;
	1517	}
	1518
	1519	/* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
	1520	* which it maps to one of them, so as to only have to have one check for
	1521	* it in the main case */
	1522	if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
	1523	switch (c) {
	1524	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	1525	converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
	1526	break;
	1527	case MICRO_SIGN:
	1528	converted = GREEK_CAPITAL_LETTER_MU;
	1529	break;
	1530	#if UNICODE_MAJOR_VERSION > 2 \
	1531	\|\| (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
	1532	&& UNICODE_DOT_DOT_VERSION >= 8)
	1533	case LATIN_SMALL_LETTER_SHARP_S:
	1534	*(p)++ = 'S';
	1535	*p = S_or_s;
	1536	*lenp = 2;
	1537	return 'S';
	1538	#endif
	1539	default:
	1540	Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
	1541	NOT_REACHED; /* NOTREACHED */
	1542	}
	1543	}
	1544
	1545	*(p)++ = UTF8_TWO_BYTE_HI(converted);
	1546	*p = UTF8_TWO_BYTE_LO(converted);
	1547	*lenp = 2;
	1548
	1549	return converted;
	1550	}
	1551
	/* Wrappers around _to_utf8_case() that select the mapping table for each
	 * kind of case change.  The result may consist of more than one character.
	 *      UV    is passed through as the first argument of _to_utf8_case()
	 *      S     points to the first byte of the input character
	 *      D     is where the changed string is written; it needs room for
	 *            UTF8_MAXBYTES_CASE+1 bytes
	 *      LENP  receives the length in bytes of the changed string
	 *
	 * Each evaluates to the ordinal of the first character written to D */
	#define CALL_UPPER_CASE(uv, s, d, lenp)                                     \
	                _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
	#define CALL_TITLE_CASE(uv, s, d, lenp)                                     \
	                _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
	#define CALL_LOWER_CASE(uv, s, d, lenp)                                     \
	                _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")

	/* Folding takes one more input, 'specials'.  Non-zero asks for the specials
	 * hash to be consulted (i.e., full case folding); zero requests a simple
	 * case fold */
	#define CALL_FOLD_CASE(uv, s, d, lenp, specials)                            \
	                _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf",      \
	                              (specials) ? "" : NULL)
	1568
	1569	UV
	1570	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	1571	{
	1572	/* Convert the Unicode character whose ordinal is <c> to its uppercase
	1573	* version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
	1574	* Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1575	* the changed version may be longer than the original character.
	1576	*
	1577	* The ordinal of the first character of the changed version is returned
	1578	* (but note, as explained above, that there may be more.) */
	1579
	1580	PERL_ARGS_ASSERT_TO_UNI_UPPER;
	1581
	1582	if (c < 256) {
	1583	return _to_upper_title_latin1((U8) c, p, lenp, 'S');
	1584	}
	1585
	1586	uvchr_to_utf8(p, c);
	1587	return CALL_UPPER_CASE(c, p, p, lenp);
	1588	}
	1589
	1590	UV
	1591	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	1592	{
	1593	PERL_ARGS_ASSERT_TO_UNI_TITLE;
	1594
	1595	if (c < 256) {
	1596	return _to_upper_title_latin1((U8) c, p, lenp, 's');
	1597	}
	1598
	1599	uvchr_to_utf8(p, c);
	1600	return CALL_TITLE_CASE(c, p, p, lenp);
	1601	}
	1602
	1603	STATIC U8
	1604	S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp)
	1605	{
	1606	/* We have the latin1-range values compiled into the core, so just use
	1607	* those, converting the result to UTF-8. Since the result is always just
	1608	* one character, we allow <p> to be NULL */
	1609
	1610	U8 converted = toLOWER_LATIN1(c);
	1611
	1612	if (p != NULL) {
	1613	if (NATIVE_BYTE_IS_INVARIANT(converted)) {
	1614	*p = converted;
	1615	*lenp = 1;
	1616	}
	1617	else {
	1618	/* Result is known to always be < 256, so can use the EIGHT_BIT
	1619	* macros */
	1620	*p = UTF8_EIGHT_BIT_HI(converted);
	1621	*(p+1) = UTF8_EIGHT_BIT_LO(converted);
	1622	*lenp = 2;
	1623	}
	1624	}
	1625	return converted;
	1626	}
	1627
	1628	UV
	1629	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1630	{
	1631	PERL_ARGS_ASSERT_TO_UNI_LOWER;
	1632
	1633	if (c < 256) {
	1634	return to_lower_latin1((U8) c, p, lenp);
	1635	}
	1636
	1637	uvchr_to_utf8(p, c);
	1638	return CALL_LOWER_CASE(c, p, p, lenp);
	1639	}
	1640
	1641	UV
	1642	Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
	1643	{
	1644	/* Corresponds to to_lower_latin1(); <flags> bits meanings:
	1645	* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
	1646	* FOLD_FLAGS_FULL iff full folding is to be used;
	1647	*
	1648	* Not to be used for locale folds
	1649	*/
	1650
	1651	UV converted;
	1652
	1653	PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
	1654	PERL_UNUSED_CONTEXT;
	1655
	1656	assert (! (flags & FOLD_FLAGS_LOCALE));
	1657
	1658	if (UNLIKELY(c == MICRO_SIGN)) {
	1659	converted = GREEK_SMALL_LETTER_MU;
	1660	}
	1661	#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
	1662	\|\| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
	1663	\|\| UNICODE_DOT_DOT_VERSION > 0)
	1664	else if ( (flags & FOLD_FLAGS_FULL)
	1665	&& UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
	1666	{
	1667	/* If can't cross 127/128 boundary, can't return "ss"; instead return
	1668	* two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
	1669	* under those circumstances. */
	1670	if (flags & FOLD_FLAGS_NOMIX_ASCII) {
	1671	lenp = 2 sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
	1672	Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
	1673	p, *lenp, U8);
	1674	return LATIN_SMALL_LETTER_LONG_S;
	1675	}
	1676	else {
	1677	*(p)++ = 's';
	1678	*p = 's';
	1679	*lenp = 2;
	1680	return 's';
	1681	}
	1682	}
	1683	#endif
	1684	else { /* In this range the fold of all other characters is their lower
	1685	case */
	1686	converted = toLOWER_LATIN1(c);
	1687	}
	1688
	1689	if (UVCHR_IS_INVARIANT(converted)) {
	1690	*p = (U8) converted;
	1691	*lenp = 1;
	1692	}
	1693	else {
	1694	*(p)++ = UTF8_TWO_BYTE_HI(converted);
	1695	*p = UTF8_TWO_BYTE_LO(converted);
	1696	*lenp = 2;
	1697	}
	1698
	1699	return converted;
	1700	}
	1701
	1702	UV
	1703	Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
	1704	{
	1705
	1706	/* Not currently externally documented, and subject to change
	1707	* <flags> bits meanings:
	1708	* FOLD_FLAGS_FULL iff full folding is to be used;
	1709	* FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
	1710	* locale are to be used.
	1711	* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
	1712	*/
	1713
	1714	PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
	1715
	1716	if (flags & FOLD_FLAGS_LOCALE) {
	1717	/* Treat a UTF-8 locale as not being in locale at all */
	1718	if (IN_UTF8_CTYPE_LOCALE) {
	1719	flags &= ~FOLD_FLAGS_LOCALE;
	1720	}
	1721	else {
	1722	_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
	1723	goto needs_full_generality;
	1724	}
	1725	}
	1726
	1727	if (c < 256) {
	1728	return _to_fold_latin1((U8) c, p, lenp,
	1729	flags & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII));
	1730	}
	1731
	1732	/* Here, above 255. If no special needs, just use the macro */
	1733	if ( ! (flags & (FOLD_FLAGS_LOCALE\|FOLD_FLAGS_NOMIX_ASCII))) {
	1734	uvchr_to_utf8(p, c);
	1735	return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
	1736	}
	1737	else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
	1738	the special flags. */
	1739	U8 utf8_c[UTF8_MAXBYTES + 1];
	1740
	1741	needs_full_generality:
	1742	uvchr_to_utf8(utf8_c, c);
	1743	return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
	1744	}
	1745	}
	1746
	1747	PERL_STATIC_INLINE bool
	1748	S_is_utf8_common(pTHX_ const U8 const p, SV *swash,
	1749	const char const swashname, SV const invlist)
	1750	{
	1751	/* returns a boolean giving whether or not the UTF8-encoded character that
	1752	* starts at <p> is in the swash indicated by <swashname>. <swash>
	1753	* contains a pointer to where the swash indicated by <swashname>
	1754	* is to be stored; which this routine will do, so that future calls will
	1755	* look at <*swash> and only generate a swash if it is not null. <invlist>
	1756	* is NULL or an inversion list that defines the swash. If not null, it
	1757	* saves time during initialization of the swash.
	1758	*
	1759	* Note that it is assumed that the buffer length of <p> is enough to
	1760	* contain all the bytes that comprise the character. Thus, <*p> should
	1761	* have been checked before this call for mal-formedness enough to assure
	1762	* that. */
	1763
	1764	PERL_ARGS_ASSERT_IS_UTF8_COMMON;
	1765
	1766	/* The API should have included a length for the UTF-8 character in <p>,
	1767	* but it doesn't. We therefore assume that p has been validated at least
	1768	* as far as there being enough bytes available in it to accommodate the
	1769	* character without reading beyond the end, and pass that number on to the
	1770	* validating routine */
	1771	if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
	1772	if (ckWARN_d(WARN_UTF8)) {
	1773	Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
	1774	"Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
	1775	if (ckWARN(WARN_UTF8)) { /* This will output details as to the
	1776	what the malformation is */
	1777	utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
	1778	}
	1779	}
	1780	return FALSE;
	1781	}
	1782	if (!*swash) {
	1783	U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
	1784	*swash = _core_swash_init("utf8",
	1785
	1786	/* Only use the name if there is no inversion
	1787	* list; otherwise will go out to disk */
	1788	(invlist) ? "" : swashname,
	1789
	1790	&PL_sv_undef, 1, 0, invlist, &flags);
	1791	}
	1792
	1793	return swash_fetch(*swash, p, TRUE) != 0;
	1794	}
	1795
	1796	bool
	1797	Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
	1798	{
	1799	PERL_ARGS_ASSERT__IS_UTF8_FOO;
	1800
	1801	assert(classnum < _FIRST_NON_SWASH_CC);
	1802
	1803	return is_utf8_common(p,
	1804	&PL_utf8_swash_ptrs[classnum],
	1805	swash_property_names[classnum],
	1806	PL_XPosix_ptrs[classnum]);
	1807	}
	1808
	1809	bool
	1810	Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
	1811	{
	1812	SV* invlist = NULL;
	1813
	1814	PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
	1815
	1816	if (! PL_utf8_perl_idstart) {
	1817	invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
	1818	}
	1819	return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
	1820	}
	1821
	1822	bool
	1823	Perl__is_utf8_xidstart(pTHX_ const U8 *p)
	1824	{
	1825	PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
	1826
	1827	if (*p == '_')
	1828	return TRUE;
	1829	return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
	1830	}
	1831
	1832	bool
	1833	Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
	1834	{
	1835	SV* invlist = NULL;
	1836
	1837	PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
	1838
	1839	if (! PL_utf8_perl_idcont) {
	1840	invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
	1841	}
	1842	return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
	1843	}
	1844
	1845	bool
	1846	Perl__is_utf8_idcont(pTHX_ const U8 *p)
	1847	{
	1848	PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
	1849
	1850	return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
	1851	}
	1852
	1853	bool
	1854	Perl__is_utf8_xidcont(pTHX_ const U8 *p)
	1855	{
	1856	PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
	1857
	1858	return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
	1859	}
	1860
	1861	bool
	1862	Perl__is_utf8_mark(pTHX_ const U8 *p)
	1863	{
	1864	PERL_ARGS_ASSERT__IS_UTF8_MARK;
	1865
	1866	return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
	1867	}
	1868
	1869	/*
	1870	=for apidoc to_utf8_case
	1871
	1872	Instead use the appropriate one of L</toUPPER_utf8>,
	1873	L</toTITLE_utf8>,
	1874	L</toLOWER_utf8>,
	1875	or L</toFOLD_utf8>.
	1876
	1877	C<p> contains the pointer to the UTF-8 string encoding
	1878	the character that is being converted. This routine assumes that the character
	1879	at C<p> is well-formed.
	1880
	1881	C<ustrp> is a pointer to the character buffer to put the
	1882	conversion result to. C<lenp> is a pointer to the length
	1883	of the result.
	1884
	1885	C<swashp> is a pointer to the swash to use.
	1886
	1887	Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
	1888	and loaded by C<SWASHNEW>, using F<lib/utf8_heavy.pl>. C<special> (usually,
	1889	but not always, a multicharacter mapping), is tried first.
	1890
	1891	C<special> is a string, normally C<NULL> or C<"">. C<NULL> means to not use
	1892	any special mappings; C<""> means to use the special mappings. Values other
	1893	than these two are treated as the name of the hash containing the special
	1894	mappings, like C<"utf8::ToSpecLower">.
	1895
	1896	C<normal> is a string like C<"ToLower"> which means the swash
	1897	C<%utf8::ToLower>.
	1898
	1899	Code points above the platform's C<IV_MAX> will raise a deprecation warning,
	1900	unless those are turned off.
	1901
	1902	=cut */
	1903
	1904	UV
	1905	Perl_to_utf8_case(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp,
	1906	SV *swashp, const char normal, const char *special)
	1907	{
	1908	PERL_ARGS_ASSERT_TO_UTF8_CASE;
	1909
	1910	return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special);
	1911	}
	1912
	1913	/* change name uv1 to 'from' */
	1914	UV
	1915	S__to_utf8_case(pTHX_ const UV uv1, const U8 p, U8 ustrp, STRLEN *lenp,
	1916	SV *swashp, const char normal, const char *special)
	1917	{
	1918	STRLEN len = 0;
	1919
	1920	PERL_ARGS_ASSERT__TO_UTF8_CASE;
	1921
	1922	/* Note that swash_fetch() doesn't output warnings for these because it
	1923	* assumes we will */
	1924	if (uv1 >= UNICODE_SURROGATE_FIRST) {
	1925	if (UNLIKELY(uv1 <= UNICODE_SURROGATE_LAST)) {
	1926	if (ckWARN_d(WARN_SURROGATE)) {
	1927	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	1928	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	1929	"Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
	1930	}
	1931	goto cases_to_self;
	1932	}
	1933	if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
	1934	if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
	1935	&& ckWARN_d(WARN_DEPRECATED))
	1936	{
	1937	Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
	1938	cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
	1939	}
	1940	if (ckWARN_d(WARN_NON_UNICODE)) {
	1941	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	1942	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	1943	"Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
	1944	}
	1945	goto cases_to_self;
	1946	}
	1947
	1948	/* Note that non-characters are perfectly legal, so no warning should
	1949	* be given */
	1950	}
	1951
	1952	if (!swashp) / load on-demand */
	1953	*swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
	1954
	1955	if (special) {
	1956	/* It might be "special" (sometimes, but not always,
	1957	* a multicharacter mapping) */
	1958	HV *hv = NULL;
	1959	SV **svp;
	1960
	1961	/* If passed in the specials name, use that; otherwise use any
	1962	* given in the swash */
	1963	if (*special != '\0') {
	1964	hv = get_hv(special, 0);
	1965	}
	1966	else {
	1967	svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
	1968	if (svp) {
	1969	hv = MUTABLE_HV(SvRV(*svp));
	1970	}
	1971	}
	1972
	1973	if (hv
	1974	&& (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
	1975	&& (*svp))
	1976	{
	1977	const char *s;
	1978
	1979	s = SvPV_const(*svp, len);
	1980	if (len == 1)
	1981	/* EIGHTBIT */
	1982	len = uvchr_to_utf8(ustrp, (U8)s) - ustrp;
	1983	else {
	1984	Copy(s, ustrp, len, U8);
	1985	}
	1986	}
	1987	}
	1988
	1989	if (!len && *swashp) {
	1990	const UV uv2 = swash_fetch(swashp, p, TRUE / => is UTF-8 */);
	1991
	1992	if (uv2) {
	1993	/* It was "normal" (a single character mapping). */
	1994	len = uvchr_to_utf8(ustrp, uv2) - ustrp;
	1995	}
	1996	}
	1997
	1998	if (len) {
	1999	if (lenp) {
	2000	*lenp = len;
	2001	}
	2002	return valid_utf8_to_uvchr(ustrp, 0);
	2003	}
	2004
	2005	/* Here, there was no mapping defined, which means that the code point maps
	2006	* to itself. Return the inputs */
	2007	cases_to_self:
	2008	len = UTF8SKIP(p);
	2009	if (p != ustrp) { /* Don't copy onto itself */
	2010	Copy(p, ustrp, len, U8);
	2011	}
	2012
	2013	if (lenp)
	2014	*lenp = len;
	2015
	2016	return uv1;
	2017
	2018	}
	2019
	2020	STATIC UV
	2021	S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
	2022	{
	2023	/* This is called when changing the case of a UTF-8-encoded character above
	2024	* the Latin1 range, and the operation is in a non-UTF-8 locale. If the
	2025	* result contains a character that crosses the 255/256 boundary, disallow
	2026	* the change, and return the original code point. See L<perlfunc/lc> for
	2027	* why;
	2028	*
	2029	* p points to the original string whose case was changed; assumed
	2030	* by this routine to be well-formed
	2031	* result the code point of the first character in the changed-case string
	2032	* ustrp points to the changed-case string (<result> represents its first char)
	2033	* lenp points to the length of <ustrp> */
	2034
	2035	UV original; /* To store the first code point of <p> */
	2036
	2037	PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
	2038
	2039	assert(UTF8_IS_ABOVE_LATIN1(*p));
	2040
	2041	/* We know immediately if the first character in the string crosses the
	2042	* boundary, so can skip */
	2043	if (result > 255) {
	2044
	2045	/* Look at every character in the result; if any cross the
	2046	* boundary, the whole thing is disallowed */
	2047	U8* s = ustrp + UTF8SKIP(ustrp);
	2048	U8* e = ustrp + *lenp;
	2049	while (s < e) {
	2050	if (! UTF8_IS_ABOVE_LATIN1(*s)) {
	2051	goto bad_crossing;
	2052	}
	2053	s += UTF8SKIP(s);
	2054	}
	2055
	2056	/* Here, no characters crossed, result is ok as-is, but we warn. */
	2057	_CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
	2058	return result;
	2059	}
	2060
	2061	bad_crossing:
	2062
	2063	/* Failed, have to return the original */
	2064	original = valid_utf8_to_uvchr(p, lenp);
	2065
	2066	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	2067	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	2068	"Can't do %s(\"\\x{%"UVXf"}\") on non-UTF-8 locale; "
	2069	"resolved to \"\\x{%"UVXf"}\".",
	2070	OP_DESC(PL_op),
	2071	original,
	2072	original);
	2073	Copy(p, ustrp, *lenp, char);
	2074	return original;
	2075	}
	2076
	2077	/*
	2078	=for apidoc to_utf8_upper
	2079
	2080	Instead use L</toUPPER_utf8>.
	2081
	2082	=cut */
	2083
	2084	/* Not currently externally documented, and subject to change:
	2085	* <flags> is set iff the rules from the current underlying locale are to
	2086	* be used. */
	2087
	2088	UV
	2089	Perl__to_utf8_upper_flags(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp, bool flags)
	2090	{
	2091	UV result;
	2092
	2093	PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
	2094
	2095	if (flags) {
	2096	/* Treat a UTF-8 locale as not being in locale at all */
	2097	if (IN_UTF8_CTYPE_LOCALE) {
	2098	flags = FALSE;
	2099	}
	2100	else {
	2101	_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
	2102	}
	2103	}
	2104
	2105	if (UTF8_IS_INVARIANT(*p)) {
	2106	if (flags) {
	2107	result = toUPPER_LC(*p);
	2108	}
	2109	else {
	2110	return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
	2111	}
	2112	}
	2113	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
	2114	if (flags) {
	2115	U8 c = EIGHT_BIT_UTF8_TO_NATIVE(p, (p+1));
	2116	result = toUPPER_LC(c);
	2117	}
	2118	else {
	2119	return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(p, (p+1)),
	2120	ustrp, lenp, 'S');
	2121	}
	2122	}
	2123	else { /* UTF-8, ord above 255 */
	2124	result = CALL_UPPER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
	2125
	2126	if (flags) {
	2127	result = check_locale_boundary_crossing(p, result, ustrp, lenp);
	2128	}
	2129	return result;
	2130	}
	2131
	2132	/* Here, used locale rules. Convert back to UTF-8 */
	2133	if (UTF8_IS_INVARIANT(result)) {
	2134	*ustrp = (U8) result;
	2135	*lenp = 1;
	2136	}
	2137	else {
	2138	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
	2139	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
	2140	*lenp = 2;
	2141	}
	2142
	2143	return result;
	2144	}
	2145
	2146	/*
	2147	=for apidoc to_utf8_title
	2148
	2149	Instead use L</toTITLE_utf8>.
	2150
	2151	=cut */
	2152
	2153	/* Not currently externally documented, and subject to change:
	2154	* <flags> is set iff the rules from the current underlying locale are to be
	2155	* used. Since titlecase is not defined in POSIX, for other than a
	2156	* UTF-8 locale, uppercase is used instead for code points < 256.
	2157	*/
	2158
	2159	UV
	2160	Perl__to_utf8_title_flags(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp, bool flags)
	2161	{
	2162	UV result;
	2163
	2164	PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
	2165
	2166	if (flags) {
	2167	/* Treat a UTF-8 locale as not being in locale at all */
	2168	if (IN_UTF8_CTYPE_LOCALE) {
	2169	flags = FALSE;
	2170	}
	2171	else {
	2172	_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
	2173	}
	2174	}
	2175
	2176	if (UTF8_IS_INVARIANT(*p)) {
	2177	if (flags) {
	2178	result = toUPPER_LC(*p);
	2179	}
	2180	else {
	2181	return _to_upper_title_latin1(*p, ustrp, lenp, 's');
	2182	}
	2183	}
	2184	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
	2185	if (flags) {
	2186	U8 c = EIGHT_BIT_UTF8_TO_NATIVE(p, (p+1));
	2187	result = toUPPER_LC(c);
	2188	}
	2189	else {
	2190	return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(p, (p+1)),
	2191	ustrp, lenp, 's');
	2192	}
	2193	}
	2194	else { /* UTF-8, ord above 255 */
	2195	result = CALL_TITLE_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
	2196
	2197	if (flags) {
	2198	result = check_locale_boundary_crossing(p, result, ustrp, lenp);
	2199	}
	2200	return result;
	2201	}
	2202
	2203	/* Here, used locale rules. Convert back to UTF-8 */
	2204	if (UTF8_IS_INVARIANT(result)) {
	2205	*ustrp = (U8) result;
	2206	*lenp = 1;
	2207	}
	2208	else {
	2209	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
	2210	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
	2211	*lenp = 2;
	2212	}
	2213
	2214	return result;
	2215	}
	2216
	2217	/*
	2218	=for apidoc to_utf8_lower
	2219
	2220	Instead use L</toLOWER_utf8>.
	2221
	2222	=cut */
	2223
	2224	/* Not currently externally documented, and subject to change:
	2225	* <flags> is set iff the rules from the current underlying locale are to
	2226	* be used.
	2227	*/
	2228
	2229	UV
	2230	Perl__to_utf8_lower_flags(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp, bool flags)
	2231	{
	2232	UV result;
	2233
	2234	PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
	2235
	2236	if (flags) {
	2237	/* Treat a UTF-8 locale as not being in locale at all */
	2238	if (IN_UTF8_CTYPE_LOCALE) {
	2239	flags = FALSE;
	2240	}
	2241	else {
	2242	_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
	2243	}
	2244	}
	2245
	2246	if (UTF8_IS_INVARIANT(*p)) {
	2247	if (flags) {
	2248	result = toLOWER_LC(*p);
	2249	}
	2250	else {
	2251	return to_lower_latin1(*p, ustrp, lenp);
	2252	}
	2253	}
	2254	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
	2255	if (flags) {
	2256	U8 c = EIGHT_BIT_UTF8_TO_NATIVE(p, (p+1));
	2257	result = toLOWER_LC(c);
	2258	}
	2259	else {
	2260	return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(p, (p+1)),
	2261	ustrp, lenp);
	2262	}
	2263	}
	2264	else { /* UTF-8, ord above 255 */
	2265	result = CALL_LOWER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
	2266
	2267	if (flags) {
	2268	result = check_locale_boundary_crossing(p, result, ustrp, lenp);
	2269	}
	2270
	2271	return result;
	2272	}
	2273
	2274	/* Here, used locale rules. Convert back to UTF-8 */
	2275	if (UTF8_IS_INVARIANT(result)) {
	2276	*ustrp = (U8) result;
	2277	*lenp = 1;
	2278	}
	2279	else {
	2280	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
	2281	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
	2282	*lenp = 2;
	2283	}
	2284
	2285	return result;
	2286	}
	2287
	2288	/*
	2289	=for apidoc to_utf8_fold
	2290
	2291	Instead use L</toFOLD_utf8>.
	2292
	2293	=cut */
	2294
	2295	/* Not currently externally documented, and subject to change,
	2296	* in <flags>
	2297	* bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
	2298	* locale are to be used.
	2299	* bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
	2300	* otherwise simple folds
	2301	* bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
	2302	* prohibited
	2303	*/
	2304
	2305	UV
	2306	Perl__to_utf8_fold_flags(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp, U8 flags)
	2307	{
	2308	UV result;
	2309
	2310	PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
	2311
	2312	/* These are mutually exclusive */
	2313	assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
	2314
	2315	assert(p != ustrp); /* Otherwise overwrites */
	2316
	2317	if (flags & FOLD_FLAGS_LOCALE) {
	2318	/* Treat a UTF-8 locale as not being in locale at all */
	2319	if (IN_UTF8_CTYPE_LOCALE) {
	2320	flags &= ~FOLD_FLAGS_LOCALE;
	2321	}
	2322	else {
	2323	_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
	2324	}
	2325	}
	2326
	2327	if (UTF8_IS_INVARIANT(*p)) {
	2328	if (flags & FOLD_FLAGS_LOCALE) {
	2329	result = toFOLD_LC(*p);
	2330	}
	2331	else {
	2332	return _to_fold_latin1(*p, ustrp, lenp,
	2333	flags & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII));
	2334	}
	2335	}
	2336	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
	2337	if (flags & FOLD_FLAGS_LOCALE) {
	2338	U8 c = EIGHT_BIT_UTF8_TO_NATIVE(p, (p+1));
	2339	result = toFOLD_LC(c);
	2340	}
	2341	else {
	2342	return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(p, (p+1)),
	2343	ustrp, lenp,
	2344	flags & (FOLD_FLAGS_FULL \| FOLD_FLAGS_NOMIX_ASCII));
	2345	}
	2346	}
	2347	else { /* UTF-8, ord above 255 */
	2348	result = CALL_FOLD_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
	2349
	2350	if (flags & FOLD_FLAGS_LOCALE) {
	2351
	2352	# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
	2353	const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
	2354
	2355	# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
	2356	# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
	2357
	2358	const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
	2359
	2360	/* Special case these two characters, as what normally gets
	2361	* returned under locale doesn't work */
	2362	if (UTF8SKIP(p) == cap_sharp_s_len
	2363	&& memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
	2364	{
	2365	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	2366	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	2367	"Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
	2368	"resolved to \"\\x{17F}\\x{17F}\".");
	2369	goto return_long_s;
	2370	}
	2371	else
	2372	#endif
	2373	if (UTF8SKIP(p) == long_s_t_len
	2374	&& memEQ((char *) p, LONG_S_T, long_s_t_len))
	2375	{
	2376	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	2377	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	2378	"Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
	2379	"resolved to \"\\x{FB06}\".");
	2380	goto return_ligature_st;
	2381	}
	2382
	2383	#if UNICODE_MAJOR_VERSION == 3 \
	2384	&& UNICODE_DOT_VERSION == 0 \
	2385	&& UNICODE_DOT_DOT_VERSION == 1
	2386	# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
	2387
	2388	/* And special case this on this Unicode version only, for the same
	2389	* reaons the other two are special cased. They would cross the
	2390	* 255/256 boundary which is forbidden under /l, and so the code
	2391	* wouldn't catch that they are equivalent (which they are only in
	2392	* this release) */
	2393	else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
	2394	&& memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
	2395	{
	2396	/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
	2397	Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
	2398	"Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
	2399	"resolved to \"\\x{0131}\".");
	2400	goto return_dotless_i;
	2401	}
	2402	#endif
	2403
	2404	return check_locale_boundary_crossing(p, result, ustrp, lenp);
	2405	}
	2406	else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
	2407	return result;
	2408	}
	2409	else {
	2410	/* This is called when changing the case of a UTF-8-encoded
	2411	* character above the ASCII range, and the result should not
	2412	* contain an ASCII character. */
	2413
	2414	UV original; /* To store the first code point of <p> */
	2415
	2416	/* Look at every character in the result; if any cross the
	2417	* boundary, the whole thing is disallowed */
	2418	U8* s = ustrp;
	2419	U8* e = ustrp + *lenp;
	2420	while (s < e) {
	2421	if (isASCII(*s)) {
	2422	/* Crossed, have to return the original */
	2423	original = valid_utf8_to_uvchr(p, lenp);
	2424
	2425	/* But in these instances, there is an alternative we can
	2426	* return that is valid */
	2427	if (original == LATIN_SMALL_LETTER_SHARP_S
	2428	#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
	2429	\|\| original == LATIN_CAPITAL_LETTER_SHARP_S
	2430	#endif
	2431	) {
	2432	goto return_long_s;
	2433	}
	2434	else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
	2435	goto return_ligature_st;
	2436	}
	2437	#if UNICODE_MAJOR_VERSION == 3 \
	2438	&& UNICODE_DOT_VERSION == 0 \
	2439	&& UNICODE_DOT_DOT_VERSION == 1
	2440
	2441	else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
	2442	goto return_dotless_i;
	2443	}
	2444	#endif
	2445	Copy(p, ustrp, *lenp, char);
	2446	return original;
	2447	}
	2448	s += UTF8SKIP(s);
	2449	}
	2450
	2451	/* Here, no characters crossed, result is ok as-is */
	2452	return result;
	2453	}
	2454	}
	2455
	2456	/* Here, used locale rules. Convert back to UTF-8 */
	2457	if (UTF8_IS_INVARIANT(result)) {
	2458	*ustrp = (U8) result;
	2459	*lenp = 1;
	2460	}
	2461	else {
	2462	*ustrp = UTF8_EIGHT_BIT_HI((U8) result);
	2463	*(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
	2464	*lenp = 2;
	2465	}
	2466
	2467	return result;
	2468
	2469	return_long_s:
	2470	/* Certain folds to 'ss' are prohibited by the options, but they do allow
	2471	* folds to a string of two of these characters. By returning this
	2472	* instead, then, e.g.,
	2473	* fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
	2474	* works. */
	2475
	2476	lenp = 2 sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
	2477	Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
	2478	ustrp, *lenp, U8);
	2479	return LATIN_SMALL_LETTER_LONG_S;
	2480
	2481	return_ligature_st:
	2482	/* Two folds to 'st' are prohibited by the options; instead we pick one and
	2483	* have the other one fold to it */
	2484
	2485	*lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
	2486	Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
	2487	return LATIN_SMALL_LIGATURE_ST;
	2488
	2489	#if UNICODE_MAJOR_VERSION == 3 \
	2490	&& UNICODE_DOT_VERSION == 0 \
	2491	&& UNICODE_DOT_DOT_VERSION == 1
	2492
	2493	return_dotless_i:
	2494	*lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
	2495	Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
	2496	return LATIN_SMALL_LETTER_DOTLESS_I;
	2497
	2498	#endif
	2499
	2500	}
	2501
	2502	/* Note:
	2503	* Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
	2504	* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
	2505	* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
	2506	*/
	2507
	2508	SV*
	2509	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
	2510	{
	2511	PERL_ARGS_ASSERT_SWASH_INIT;
	2512
	2513	/* Returns a copy of a swash initiated by the called function. This is the
	2514	* public interface, and returning a copy prevents others from doing
	2515	* mischief on the original */
	2516
	2517	return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
	2518	}
	2519
	2520	SV*
	2521	Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV listsv, I32 minbits, I32 none, SV invlist, U8* const flags_p)
	2522	{
	2523
	2524	/*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
	2525	* use the following define */
	2526
	2527	#define CORE_SWASH_INIT_RETURN(x) \
	2528	PL_curpm= old_PL_curpm; \
	2529	return x
	2530
	2531	/* Initialize and return a swash, creating it if necessary. It does this
	2532	* by calling utf8_heavy.pl in the general case. The returned value may be
	2533	* the swash's inversion list instead if the input parameters allow it.
	2534	* Which is returned should be immaterial to callers, as the only
	2535	* operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
	2536	* and swash_to_invlist() handle both these transparently.
	2537	*
	2538	* This interface should only be used by functions that won't destroy or
	2539	* adversely change the swash, as doing so affects all other uses of the
	2540	* swash in the program; the general public should use 'Perl_swash_init'
	2541	* instead.
	2542	*
	2543	* pkg is the name of the package that <name> should be in.
	2544	* name is the name of the swash to find. Typically it is a Unicode
	2545	* property name, including user-defined ones
	2546	* listsv is a string to initialize the swash with. It must be of the form
	2547	* documented as the subroutine return value in
	2548	* L<perlunicode/User-Defined Character Properties>
	2549	* minbits is the number of bits required to represent each data element.
	2550	* It is '1' for binary properties.
	2551	* none I (khw) do not understand this one, but it is used only in tr///.
	2552	* invlist is an inversion list to initialize the swash with (or NULL)
	2553	* flags_p if non-NULL is the address of various input and output flag bits
	2554	* to the routine, as follows: ('I' means is input to the routine;
	2555	* 'O' means output from the routine. Only flags marked O are
	2556	* meaningful on return.)
	2557	* _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
	2558	* came from a user-defined property. (I O)
	2559	* _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
	2560	* when the swash cannot be located, to simply return NULL. (I)
	2561	* _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
	2562	* return of an inversion list instead of a swash hash if this routine
	2563	* thinks that would result in faster execution of swash_fetch() later
	2564	* on. (I)
	2565	*
	2566	* Thus there are three possible inputs to find the swash: <name>,
	2567	* <listsv>, and <invlist>. At least one must be specified. The result
	2568	* will be the union of the specified ones, although <listsv>'s various
	2569	* actions can intersect, etc. what <name> gives. To avoid going out to
	2570	* disk at all, <invlist> should specify completely what the swash should
	2571	* have, and <listsv> should be &PL_sv_undef and <name> should be "".
	2572	*
	2573	* <invlist> is only valid for binary properties */
	2574
	2575	PMOP old_PL_curpm= PL_curpm; / save away the old PL_curpm */
	2576
	2577	SV* retval = &PL_sv_undef;
	2578	HV* swash_hv = NULL;
	2579	const int invlist_swash_boundary =
	2580	(flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
	2581	? 512 /* Based on some benchmarking, but not extensive, see commit
	2582	message */
	2583	: -1; /* Never return just an inversion list */
	2584
	2585	assert(listsv != &PL_sv_undef \|\| strNE(name, "") \|\| invlist);
	2586	assert(! invlist \|\| minbits == 1);
	2587
	2588	PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
	2589	that triggered the swash init and the swash init perl logic itself.
	2590	See perl #122747 */
	2591
	2592	/* If data was passed in to go out to utf8_heavy to find the swash of, do
	2593	* so */
	2594	if (listsv != &PL_sv_undef \|\| strNE(name, "")) {
	2595	dSP;
	2596	const size_t pkg_len = strlen(pkg);
	2597	const size_t name_len = strlen(name);
	2598	HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
	2599	SV* errsv_save;
	2600	GV *method;
	2601
	2602	PERL_ARGS_ASSERT__CORE_SWASH_INIT;
	2603
	2604	PUSHSTACKi(PERLSI_MAGIC);
	2605	ENTER;
	2606	SAVEHINTS();
	2607	save_re_context();
	2608	/* We might get here via a subroutine signature which uses a utf8
	2609	* parameter name, at which point PL_subname will have been set
	2610	* but not yet used. */
	2611	save_item(PL_subname);
	2612	if (PL_parser && PL_parser->error_count)
	2613	SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
	2614	method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
	2615	if (!method) { /* demand load UTF-8 */
	2616	ENTER;
	2617	if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
	2618	GvSV(PL_errgv) = NULL;
	2619	#ifndef NO_TAINT_SUPPORT
	2620	/* It is assumed that callers of this routine are not passing in
	2621	* any user derived data. */
	2622	/* Need to do this after save_re_context() as it will set
	2623	* PL_tainted to 1 while saving $1 etc (see the code after getrx:
	2624	* in Perl_magic_get). Even line to create errsv_save can turn on
	2625	* PL_tainted. */
	2626	SAVEBOOL(TAINT_get);
	2627	TAINT_NOT;
	2628	#endif
	2629	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	2630	NULL);
	2631	{
	2632	/* Not ERRSV, as there is no need to vivify a scalar we are
	2633	about to discard. */
	2634	SV * const errsv = GvSV(PL_errgv);
	2635	if (!SvTRUE(errsv)) {
	2636	GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
	2637	SvREFCNT_dec(errsv);
	2638	}
	2639	}
	2640	LEAVE;
	2641	}
	2642	SPAGAIN;
	2643	PUSHMARK(SP);
	2644	EXTEND(SP,5);
	2645	mPUSHp(pkg, pkg_len);
	2646	mPUSHp(name, name_len);
	2647	PUSHs(listsv);
	2648	mPUSHi(minbits);
	2649	mPUSHi(none);
	2650	PUTBACK;
	2651	if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
	2652	GvSV(PL_errgv) = NULL;
	2653	/* If we already have a pointer to the method, no need to use
	2654	* call_method() to repeat the lookup. */
	2655	if (method
	2656	? call_sv(MUTABLE_SV(method), G_SCALAR)
	2657	: call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR \| G_METHOD))
	2658	{
	2659	retval = *PL_stack_sp--;
	2660	SvREFCNT_inc(retval);
	2661	}
	2662	{
	2663	/* Not ERRSV. See above. */
	2664	SV * const errsv = GvSV(PL_errgv);
	2665	if (!SvTRUE(errsv)) {
	2666	GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
	2667	SvREFCNT_dec(errsv);
	2668	}
	2669	}
	2670	LEAVE;
	2671	POPSTACK;
	2672	if (IN_PERL_COMPILETIME) {
	2673	CopHINTS_set(PL_curcop, PL_hints);
	2674	}
	2675	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	2676	if (SvPOK(retval))
	2677
	2678	/* If caller wants to handle missing properties, let them */
	2679	if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
	2680	CORE_SWASH_INIT_RETURN(NULL);
	2681	}
	2682	Perl_croak(aTHX_
	2683	"Can't find Unicode property definition \"%"SVf"\"",
	2684	SVfARG(retval));
	2685	NOT_REACHED; /* NOTREACHED */
	2686	}
	2687	} /* End of calling the module to find the swash */
	2688
	2689	/* If this operation fetched a swash, and we will need it later, get it */
	2690	if (retval != &PL_sv_undef
	2691	&& (minbits == 1 \|\| (flags_p
	2692	&& ! (*flags_p
	2693	& _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
	2694	{
	2695	swash_hv = MUTABLE_HV(SvRV(retval));
	2696
	2697	/* If we don't already know that there is a user-defined component to
	2698	* this swash, and the user has indicated they wish to know if there is
	2699	* one (by passing <flags_p>), find out */
	2700	if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
	2701	SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
	2702	if (user_defined && SvUV(*user_defined)) {
	2703	*flags_p \|= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
	2704	}
	2705	}
	2706	}
	2707
	2708	/* Make sure there is an inversion list for binary properties */
	2709	if (minbits == 1) {
	2710	SV** swash_invlistsvp = NULL;
	2711	SV* swash_invlist = NULL;
	2712	bool invlist_in_swash_is_valid = FALSE;
	2713	bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
	2714	an unclaimed reference count */
	2715
	2716	/* If this operation fetched a swash, get its already existing
	2717	* inversion list, or create one for it */
	2718
	2719	if (swash_hv) {
	2720	swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
	2721	if (swash_invlistsvp) {
	2722	swash_invlist = *swash_invlistsvp;
	2723	invlist_in_swash_is_valid = TRUE;
	2724	}
	2725	else {
	2726	swash_invlist = _swash_to_invlist(retval);
	2727	swash_invlist_unclaimed = TRUE;
	2728	}
	2729	}
	2730
	2731	/* If an inversion list was passed in, have to include it */
	2732	if (invlist) {
	2733
	2734	/* Any fetched swash will by now have an inversion list in it;
	2735	* otherwise <swash_invlist> will be NULL, indicating that we
	2736	* didn't fetch a swash */
	2737	if (swash_invlist) {
	2738
	2739	/* Add the passed-in inversion list, which invalidates the one
	2740	* already stored in the swash */
	2741	invlist_in_swash_is_valid = FALSE;
	2742	_invlist_union(invlist, swash_invlist, &swash_invlist);
	2743	}
	2744	else {
	2745
	2746	/* Here, there is no swash already. Set up a minimal one, if
	2747	* we are going to return a swash */
	2748	if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
	2749	swash_hv = newHV();
	2750	retval = newRV_noinc(MUTABLE_SV(swash_hv));
	2751	}
	2752	swash_invlist = invlist;
	2753	}
	2754	}
	2755
	2756	/* Here, we have computed the union of all the passed-in data. It may
	2757	* be that there was an inversion list in the swash which didn't get
	2758	* touched; otherwise save the computed one */
	2759	if (! invlist_in_swash_is_valid
	2760	&& (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
	2761	{
	2762	if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
	2763	{
	2764	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	2765	}
	2766	/* We just stole a reference count. */
	2767	if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
	2768	else SvREFCNT_inc_simple_void_NN(swash_invlist);
	2769	}
	2770
	2771	SvREADONLY_on(swash_invlist);
	2772
	2773	/* Use the inversion list stand-alone if small enough */
	2774	if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
	2775	SvREFCNT_dec(retval);
	2776	if (!swash_invlist_unclaimed)
	2777	SvREFCNT_inc_simple_void_NN(swash_invlist);
	2778	retval = newRV_noinc(swash_invlist);
	2779	}
	2780	}
	2781
	2782	CORE_SWASH_INIT_RETURN(retval);
	2783	#undef CORE_SWASH_INIT_RETURN
	2784	}
	2785
	2786
	2787	/* This API is wrong for special case conversions since we may need to
	2788	* return several Unicode characters for a single Unicode character
	2789	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	2790	* the lower-level routine, and it is similarly broken for returning
	2791	* multiple values. --jhi
	2792	* For those, you should use S__to_utf8_case() instead */
	2793	/* Now SWASHGET is recast into S_swatch_get in this file. */
	2794
	2795	/* Note:
	2796	* Returns the value of property/mapping C<swash> for the first character
	2797	* of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
	2798	* assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
	2799	* is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
	2800	*
	2801	* A "swash" is a hash which contains initially the keys/values set up by
	2802	* SWASHNEW. The purpose is to be able to completely represent a Unicode
	2803	* property for all possible code points. Things are stored in a compact form
	2804	* (see utf8_heavy.pl) so that calculation is required to find the actual
	2805	* property value for a given code point. As code points are looked up, new
	2806	* key/value pairs are added to the hash, so that the calculation doesn't have
	2807	* to ever be re-done. Further, each calculation is done, not just for the
	2808	* desired one, but for a whole block of code points adjacent to that one.
	2809	* For binary properties on ASCII machines, the block is usually for 64 code
	2810	* points, starting with a code point evenly divisible by 64. Thus if the
	2811	* property value for code point 257 is requested, the code goes out and
	2812	* calculates the property values for all 64 code points between 256 and 319,
	2813	* and stores these as a single 64-bit long bit vector, called a "swatch",
	2814	* under the key for code point 256. The key is the UTF-8 encoding for code
	2815	* point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
	2816	* for a code point is 13 bytes, the key will be 12 bytes long. If the value
	2817	* for code point 258 is then requested, this code realizes that it would be
	2818	* stored under the key for 256, and would find that value and extract the
	2819	* relevant bit, offset from 256.
	2820	*
	2821	* Non-binary properties are stored in as many bits as necessary to represent
	2822	* their values (32 currently, though the code is more general than that), not
	2823	* as single bits, but the principle is the same: the value for each key is a
	2824	* vector that encompasses the property values for all code points whose UTF-8
	2825	* representations are represented by the key. That is, for all code points
	2826	* whose UTF-8 representations are length N bytes, and the key is the first N-1
	2827	* bytes of that.
	2828	*/
	2829	UV
	2830	Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
	2831	{
	2832	HV *const hv = MUTABLE_HV(SvRV(swash));
	2833	U32 klen;
	2834	U32 off;
	2835	STRLEN slen = 0;
	2836	STRLEN needents;
	2837	const U8 *tmps = NULL;
	2838	SV *swatch;
	2839	const U8 c = *ptr;
	2840
	2841	PERL_ARGS_ASSERT_SWASH_FETCH;
	2842
	2843	/* If it really isn't a hash, it isn't really swash; must be an inversion
	2844	* list */
	2845	if (SvTYPE(hv) != SVt_PVHV) {
	2846	return _invlist_contains_cp((SV*)hv,
	2847	(do_utf8)
	2848	? valid_utf8_to_uvchr(ptr, NULL)
	2849	: c);
	2850	}
	2851
	2852	/* We store the values in a "swatch" which is a vec() value in a swash
	2853	* hash. Code points 0-255 are a single vec() stored with key length
	2854	* (klen) 0. All other code points have a UTF-8 representation
	2855	* 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
	2856	* share 0xAA..0xYY, which is the key in the hash to that vec. So the key
	2857	* length for them is the length of the encoded char - 1. ptr[klen] is the
	2858	* final byte in the sequence representing the character */
	2859	if (!do_utf8 \|\| UTF8_IS_INVARIANT(c)) {
	2860	klen = 0;
	2861	needents = 256;
	2862	off = c;
	2863	}
	2864	else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	2865	klen = 0;
	2866	needents = 256;
	2867	off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
	2868	}
	2869	else {
	2870	klen = UTF8SKIP(ptr) - 1;
	2871
	2872	/* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
	2873	* the vec is the final byte in the sequence. (In EBCDIC this is
	2874	* converted to I8 to get consecutive values.) To help you visualize
	2875	* all this:
	2876	* Straight 1047 After final byte
	2877	* UTF-8 UTF-EBCDIC I8 transform
	2878	* U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
	2879	* U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
	2880	* ...
	2881	* U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
	2882	* U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
	2883	* ...
	2884	* U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
	2885	* U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
	2886	* ...
	2887	* U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
	2888	* U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
	2889	* ...
	2890	* U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
	2891	* U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
	2892	*
	2893	* (There are no discontinuities in the elided (...) entries.)
	2894	* The UTF-8 key for these 33 code points is '\xD0' (which also is the
	2895	* key for the next 31, up through U+043F, whose UTF-8 final byte is
	2896	* \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
	2897	* The final UTF-8 byte, which ranges between \x80 and \xBF, is an
	2898	* index into the vec() swatch (after subtracting 0x80, which we
	2899	* actually do with an '&').
	2900	* In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
	2901	* code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
	2902	* dicontinuities which go away by transforming it into I8, and we
	2903	* effectively subtract 0xA0 to get the index. */
	2904	needents = (1 << UTF_ACCUMULATION_SHIFT);
	2905	off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
	2906	}
	2907
	2908	/*
	2909	* This single-entry cache saves about 1/3 of the UTF-8 overhead in test
	2910	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	2911	* it's nothing to sniff at.) Pity we usually come through at least
	2912	* two function calls to get here...
	2913	*
	2914	* NB: this code assumes that swatches are never modified, once generated!
	2915	*/
	2916
	2917	if (hv == PL_last_swash_hv &&
	2918	klen == PL_last_swash_klen &&
	2919	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	2920	{
	2921	tmps = PL_last_swash_tmps;
	2922	slen = PL_last_swash_slen;
	2923	}
	2924	else {
	2925	/* Try our second-level swatch cache, kept in a hash. */
	2926	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	2927
	2928	/* If not cached, generate it via swatch_get */
	2929	if (!svp \|\| !SvPOK(*svp)
	2930	\|\| !(tmps = (const U8)SvPV_const(svp, slen)))
	2931	{
	2932	if (klen) {
	2933	const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
	2934	swatch = swatch_get(swash,
	2935	code_point & ~((UV)needents - 1),
	2936	needents);
	2937	}
	2938	else { /* For the first 256 code points, the swatch has a key of
	2939	length 0 */
	2940	swatch = swatch_get(swash, 0, needents);
	2941	}
	2942
	2943	if (IN_PERL_COMPILETIME)
	2944	CopHINTS_set(PL_curcop, PL_hints);
	2945
	2946	svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
	2947
	2948	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
	2949	\|\| (slen << 3) < needents)
	2950	Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
	2951	"svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
	2952	svp, tmps, (UV)slen, (UV)needents);
	2953	}
	2954
	2955	PL_last_swash_hv = hv;
	2956	assert(klen <= sizeof(PL_last_swash_key));
	2957	PL_last_swash_klen = (U8)klen;
	2958	/* FIXME change interpvar.h? */
	2959	PL_last_swash_tmps = (U8 *) tmps;
	2960	PL_last_swash_slen = slen;
	2961	if (klen)
	2962	Copy(ptr, PL_last_swash_key, klen, U8);
	2963	}
	2964
	2965	switch ((int)((slen << 3) / needents)) {
	2966	case 1:
	2967	return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
	2968	case 8:
	2969	return ((UV) tmps[off]);
	2970	case 16:
	2971	off <<= 1;
	2972	return
	2973	((UV) tmps[off ] << 8) +
	2974	((UV) tmps[off + 1]);
	2975	case 32:
	2976	off <<= 2;
	2977	return
	2978	((UV) tmps[off ] << 24) +
	2979	((UV) tmps[off + 1] << 16) +
	2980	((UV) tmps[off + 2] << 8) +
	2981	((UV) tmps[off + 3]);
	2982	}
	2983	Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
	2984	"slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
	2985	NORETURN_FUNCTION_END;
	2986	}
	2987
	2988	/* Read a single line of the main body of the swash input text. These are of
	2989	* the form:
	2990	* 0053 0056 0073
	2991	* where each number is hex. The first two numbers form the minimum and
	2992	* maximum of a range, and the third is the value associated with the range.
	2993	* Not all swashes should have a third number
	2994	*
	2995	* On input: l points to the beginning of the line to be examined; it points
	2996	* to somewhere in the string of the whole input text, and is
	2997	* terminated by a \n or the null string terminator.
	2998	* lend points to the null terminator of that string
	2999	* wants_value is non-zero if the swash expects a third number
	3000	* typestr is the name of the swash's mapping, like 'ToLower'
	3001	* On output: *min, *max, and *val are set to the values read from the line.
	3002	* returns a pointer just beyond the line examined. If there was no
	3003	* valid min number on the line, returns lend+1
	3004	*/
	3005
	3006	STATIC U8*
	3007	S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
	3008	const bool wants_value, const U8* const typestr)
	3009	{
	3010	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
	3011	STRLEN numlen; /* Length of the number */
	3012	I32 flags = PERL_SCAN_SILENT_ILLDIGIT
	3013	\| PERL_SCAN_DISALLOW_PREFIX
	3014	\| PERL_SCAN_SILENT_NON_PORTABLE;
	3015
	3016	/* nl points to the next \n in the scan */
	3017	U8* const nl = (U8*)memchr(l, '\n', lend - l);
	3018
	3019	PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
	3020
	3021	/* Get the first number on the line: the range minimum */
	3022	numlen = lend - l;
	3023	min = grok_hex((char )l, &numlen, &flags, NULL);
	3024	max = min; /* So can never return without setting max */
	3025	if (numlen) /* If found a hex number, position past it */
	3026	l += numlen;
	3027	else if (nl) { /* Else, go handle next line, if any */
	3028	return nl + 1; /* 1 is length of "\n" */
	3029	}
	3030	else { /* Else, no next line */
	3031	return lend + 1; /* to LIST's end at which \n is not found */
	3032	}
	3033
	3034	/* The max range value follows, separated by a BLANK */
	3035	if (isBLANK(*l)) {
	3036	++l;
	3037	flags = PERL_SCAN_SILENT_ILLDIGIT
	3038	\| PERL_SCAN_DISALLOW_PREFIX
	3039	\| PERL_SCAN_SILENT_NON_PORTABLE;
	3040	numlen = lend - l;
	3041	max = grok_hex((char )l, &numlen, &flags, NULL);
	3042	if (numlen)
	3043	l += numlen;
	3044	else /* If no value here, it is a single element range */
	3045	max = min;
	3046
	3047	/* Non-binary tables have a third entry: what the first element of the
	3048	* range maps to. The map for those currently read here is in hex */
	3049	if (wants_value) {
	3050	if (isBLANK(*l)) {
	3051	++l;
	3052	flags = PERL_SCAN_SILENT_ILLDIGIT
	3053	\| PERL_SCAN_DISALLOW_PREFIX
	3054	\| PERL_SCAN_SILENT_NON_PORTABLE;
	3055	numlen = lend - l;
	3056	val = grok_hex((char )l, &numlen, &flags, NULL);
	3057	if (numlen)
	3058	l += numlen;
	3059	else
	3060	*val = 0;
	3061	}
	3062	else {
	3063	*val = 0;
	3064	if (typeto) {
	3065	/* diag_listed_as: To%s: illegal mapping '%s' */
	3066	Perl_croak(aTHX_ "%s: illegal mapping '%s'",
	3067	typestr, l);
	3068	}
	3069	}
	3070	}
	3071	else
	3072	val = 0; / bits == 1, then any val should be ignored */
	3073	}
	3074	else { /* Nothing following range min, should be single element with no
	3075	mapping expected */
	3076	if (wants_value) {
	3077	*val = 0;
	3078	if (typeto) {
	3079	/* diag_listed_as: To%s: illegal mapping '%s' */
	3080	Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
	3081	}
	3082	}
	3083	else
	3084	val = 0; / bits == 1, then val should be ignored */
	3085	}
	3086
	3087	/* Position to next line if any, or EOF */
	3088	if (nl)
	3089	l = nl + 1;
	3090	else
	3091	l = lend;
	3092
	3093	return l;
	3094	}
	3095
	3096	/* Note:
	3097	* Returns a swatch (a bit vector string) for a code point sequence
	3098	* that starts from the value C<start> and comprises the number C<span>.
	3099	* A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
	3100	* Should be used via swash_fetch, which will cache the swatch in C<swash>.
	3101	*/
	3102	STATIC SV*
	3103	S_swatch_get(pTHX_ SV* swash, UV start, UV span)
	3104	{
	3105	SV *swatch;
	3106	U8 l, lend, x, xend, s, send;
	3107	STRLEN lcur, xcur, scur;
	3108	HV *const hv = MUTABLE_HV(SvRV(swash));
	3109	SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
	3110
	3111	SV** listsvp = NULL; /* The string containing the main body of the table */
	3112	SV** extssvp = NULL;
	3113	SV** invert_it_svp = NULL;
	3114	U8* typestr = NULL;
	3115	STRLEN bits;
	3116	STRLEN octets; /* if bits == 1, then octets == 0 */
	3117	UV none;
	3118	UV end = start + span;
	3119
	3120	if (invlistsvp == NULL) {
	3121	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	3122	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	3123	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	3124	extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	3125	listsvp = hv_fetchs(hv, "LIST", FALSE);
	3126	invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	3127
	3128	bits = SvUV(*bitssvp);
	3129	none = SvUV(*nonesvp);
	3130	typestr = (U8)SvPV_nolen(typesvp);
	3131	}
	3132	else {
	3133	bits = 1;
	3134	none = 0;
	3135	}
	3136	octets = bits >> 3; /* if bits == 1, then octets == 0 */
	3137
	3138	PERL_ARGS_ASSERT_SWATCH_GET;
	3139
	3140	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	3141	Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
	3142	(UV)bits);
	3143	}
	3144
	3145	/* If overflowed, use the max possible */
	3146	if (end < start) {
	3147	end = UV_MAX;
	3148	span = end - start;
	3149	}
	3150
	3151	/* create and initialize $swatch */
	3152	scur = octets ? (span * octets) : (span + 7) / 8;
	3153	swatch = newSV(scur);
	3154	SvPOK_on(swatch);
	3155	s = (U8*)SvPVX(swatch);
	3156	if (octets && none) {
	3157	const U8* const e = s + scur;
	3158	while (s < e) {
	3159	if (bits == 8)
	3160	*s++ = (U8)(none & 0xff);
	3161	else if (bits == 16) {
	3162	*s++ = (U8)((none >> 8) & 0xff);
	3163	*s++ = (U8)( none & 0xff);
	3164	}
	3165	else if (bits == 32) {
	3166	*s++ = (U8)((none >> 24) & 0xff);
	3167	*s++ = (U8)((none >> 16) & 0xff);
	3168	*s++ = (U8)((none >> 8) & 0xff);
	3169	*s++ = (U8)( none & 0xff);
	3170	}
	3171	}
	3172	*s = '\0';
	3173	}
	3174	else {
	3175	(void)memzero((U8*)s, scur + 1);
	3176	}
	3177	SvCUR_set(swatch, scur);
	3178	s = (U8*)SvPVX(swatch);
	3179
	3180	if (invlistsvp) { /* If has an inversion list set up use that */
	3181	_invlist_populate_swatch(*invlistsvp, start, end, s);
	3182	return swatch;
	3183	}
	3184
	3185	/* read $swash->{LIST} */
	3186	l = (U8)SvPV(listsvp, lcur);
	3187	lend = l + lcur;
	3188	while (l < lend) {
	3189	UV min, max, val, upper;
	3190	l = swash_scan_list_line(l, lend, &min, &max, &val,
	3191	cBOOL(octets), typestr);
	3192	if (l > lend) {
	3193	break;
	3194	}
	3195
	3196	/* If looking for something beyond this range, go try the next one */
	3197	if (max < start)
	3198	continue;
	3199
	3200	/* <end> is generally 1 beyond where we want to set things, but at the
	3201	* platform's infinity, where we can't go any higher, we want to
	3202	* include the code point at <end> */
	3203	upper = (max < end)
	3204	? max
	3205	: (max != UV_MAX \|\| end != UV_MAX)
	3206	? end - 1
	3207	: end;
	3208
	3209	if (octets) {
	3210	UV key;
	3211	if (min < start) {
	3212	if (!none \|\| val < none) {
	3213	val += start - min;
	3214	}
	3215	min = start;
	3216	}
	3217	for (key = min; key <= upper; key++) {
	3218	STRLEN offset;
	3219	/* offset must be non-negative (start <= min <= key < end) */
	3220	offset = octets * (key - start);
	3221	if (bits == 8)
	3222	s[offset] = (U8)(val & 0xff);
	3223	else if (bits == 16) {
	3224	s[offset ] = (U8)((val >> 8) & 0xff);
	3225	s[offset + 1] = (U8)( val & 0xff);
	3226	}
	3227	else if (bits == 32) {
	3228	s[offset ] = (U8)((val >> 24) & 0xff);
	3229	s[offset + 1] = (U8)((val >> 16) & 0xff);
	3230	s[offset + 2] = (U8)((val >> 8) & 0xff);
	3231	s[offset + 3] = (U8)( val & 0xff);
	3232	}
	3233
	3234	if (!none \|\| val < none)
	3235	++val;
	3236	}
	3237	}
	3238	else { /* bits == 1, then val should be ignored */
	3239	UV key;
	3240	if (min < start)
	3241	min = start;
	3242
	3243	for (key = min; key <= upper; key++) {
	3244	const STRLEN offset = (STRLEN)(key - start);
	3245	s[offset >> 3] \|= 1 << (offset & 7);
	3246	}
	3247	}
	3248	} /* while */
	3249
	3250	/* Invert if the data says it should be. Assumes that bits == 1 */
	3251	if (invert_it_svp && SvUV(*invert_it_svp)) {
	3252
	3253	/* Unicode properties should come with all bits above PERL_UNICODE_MAX
	3254	* be 0, and their inversion should also be 0, as we don't succeed any
	3255	* Unicode property matches for non-Unicode code points */
	3256	if (start <= PERL_UNICODE_MAX) {
	3257
	3258	/* The code below assumes that we never cross the
	3259	* Unicode/above-Unicode boundary in a range, as otherwise we would
	3260	* have to figure out where to stop flipping the bits. Since this
	3261	* boundary is divisible by a large power of 2, and swatches comes
	3262	* in small powers of 2, this should be a valid assumption */
	3263	assert(start + span - 1 <= PERL_UNICODE_MAX);
	3264
	3265	send = s + scur;
	3266	while (s < send) {
	3267	s = ~(s);
	3268	s++;
	3269	}
	3270	}
	3271	}
	3272
	3273	/* read $swash->{EXTRAS}
	3274	* This code also copied to swash_to_invlist() below */
	3275	x = (U8)SvPV(extssvp, xcur);
	3276	xend = x + xcur;
	3277	while (x < xend) {
	3278	STRLEN namelen;
	3279	U8 *namestr;
	3280	SV** othersvp;
	3281	HV* otherhv;
	3282	STRLEN otherbits;
	3283	SV *otherbitssvp, other;
	3284	U8 s, o, *nl;
	3285	STRLEN slen, olen;
	3286
	3287	const U8 opc = *x++;
	3288	if (opc == '\n')
	3289	continue;
	3290
	3291	nl = (U8*)memchr(x, '\n', xend - x);
	3292
	3293	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	3294	if (nl) {
	3295	x = nl + 1; /* 1 is length of "\n" */
	3296	continue;
	3297	}
	3298	else {
	3299	x = xend; /* to EXTRAS' end at which \n is not found */
	3300	break;
	3301	}
	3302	}
	3303
	3304	namestr = x;
	3305	if (nl) {
	3306	namelen = nl - namestr;
	3307	x = nl + 1;
	3308	}
	3309	else {
	3310	namelen = xend - namestr;
	3311	x = xend;
	3312	}
	3313
	3314	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	3315	otherhv = MUTABLE_HV(SvRV(*othersvp));
	3316	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	3317	otherbits = (STRLEN)SvUV(*otherbitssvp);
	3318	if (bits < otherbits)
	3319	Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
	3320	"bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
	3321
	3322	/* The "other" swatch must be destroyed after. */
	3323	other = swatch_get(*othersvp, start, span);
	3324	o = (U8*)SvPV(other, olen);
	3325
	3326	if (!olen)
	3327	Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
	3328
	3329	s = (U8*)SvPV(swatch, slen);
	3330	if (bits == 1 && otherbits == 1) {
	3331	if (slen != olen)
	3332	Perl_croak(aTHX_ "panic: swatch_get found swatch length "
	3333	"mismatch, slen=%"UVuf", olen=%"UVuf,
	3334	(UV)slen, (UV)olen);
	3335
	3336	switch (opc) {
	3337	case '+':
	3338	while (slen--)
	3339	s++ \|= o++;
	3340	break;
	3341	case '!':
	3342	while (slen--)
	3343	s++ \|= ~o++;
	3344	break;
	3345	case '-':
	3346	while (slen--)
	3347	s++ &= ~o++;
	3348	break;
	3349	case '&':
	3350	while (slen--)
	3351	s++ &= o++;
	3352	break;
	3353	default:
	3354	break;
	3355	}
	3356	}
	3357	else {
	3358	STRLEN otheroctets = otherbits >> 3;
	3359	STRLEN offset = 0;
	3360	U8* const send = s + slen;
	3361
	3362	while (s < send) {
	3363	UV otherval = 0;
	3364
	3365	if (otherbits == 1) {
	3366	otherval = (o[offset >> 3] >> (offset & 7)) & 1;
	3367	++offset;
	3368	}
	3369	else {
	3370	STRLEN vlen = otheroctets;
	3371	otherval = *o++;
	3372	while (--vlen) {
	3373	otherval <<= 8;
	3374	otherval \|= *o++;
	3375	}
	3376	}
	3377
	3378	if (opc == '+' && otherval)
	3379	NOOP; /* replace with otherval */
	3380	else if (opc == '!' && !otherval)
	3381	otherval = 1;
	3382	else if (opc == '-' && otherval)
	3383	otherval = 0;
	3384	else if (opc == '&' && !otherval)
	3385	otherval = 0;
	3386	else {
	3387	s += octets; /* no replacement */
	3388	continue;
	3389	}
	3390
	3391	if (bits == 8)
	3392	*s++ = (U8)( otherval & 0xff);
	3393	else if (bits == 16) {
	3394	*s++ = (U8)((otherval >> 8) & 0xff);
	3395	*s++ = (U8)( otherval & 0xff);
	3396	}
	3397	else if (bits == 32) {
	3398	*s++ = (U8)((otherval >> 24) & 0xff);
	3399	*s++ = (U8)((otherval >> 16) & 0xff);
	3400	*s++ = (U8)((otherval >> 8) & 0xff);
	3401	*s++ = (U8)( otherval & 0xff);
	3402	}
	3403	}
	3404	}
	3405	sv_free(other); /* through with it! */
	3406	} /* while */
	3407	return swatch;
	3408	}
	3409
	3410	HV*
	3411	Perl__swash_inversion_hash(pTHX_ SV* const swash)
	3412	{
	3413
	3414	/* Subject to change or removal. For use only in regcomp.c and regexec.c
	3415	* Can't be used on a property that is subject to user override, as it
	3416	* relies on the value of SPECIALS in the swash which would be set by
	3417	* utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
	3418	* for overridden properties
	3419	*
	3420	* Returns a hash which is the inversion and closure of a swash mapping.
	3421	* For example, consider the input lines:
	3422	* 004B 006B
	3423	* 004C 006C
	3424	* 212A 006B
	3425	*
	3426	* The returned hash would have two keys, the UTF-8 for 006B and the UTF-8 for
	3427	* 006C. The value for each key is an array. For 006C, the array would
	3428	* have two elements, the UTF-8 for itself, and for 004C. For 006B, there
	3429	* would be three elements in its array, the UTF-8 for 006B, 004B and 212A.
	3430	*
	3431	* Note that there are no elements in the hash for 004B, 004C, 212A. The
	3432	* keys are only code points that are folded-to, so it isn't a full closure.
	3433	*
	3434	* Essentially, for any code point, it gives all the code points that map to
	3435	* it, or the list of 'froms' for that point.
	3436	*
	3437	* Currently it ignores any additions or deletions from other swashes,
	3438	* looking at just the main body of the swash, and if there are SPECIALS
	3439	* in the swash, at that hash
	3440	*
	3441	* The specials hash can be extra code points, and most likely consists of
	3442	* maps from single code points to multiple ones (each expressed as a string
	3443	* of UTF-8 characters). This function currently returns only 1-1 mappings.
	3444	* However consider this possible input in the specials hash:
	3445	* "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074
	3446	* "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074
	3447	*
	3448	* Both FB05 and FB06 map to the same multi-char sequence, which we don't
	3449	* currently handle. But it also means that FB05 and FB06 are equivalent in
	3450	* a 1-1 mapping which we should handle, and this relationship may not be in
	3451	* the main table. Therefore this function examines all the multi-char
	3452	* sequences and adds the 1-1 mappings that come out of that.
	3453	*
	3454	* XXX This function was originally intended to be multipurpose, but its
	3455	* only use is quite likely to remain for constructing the inversion of
	3456	* the CaseFolding (//i) property. If it were more general purpose for
	3457	* regex patterns, it would have to do the FB05/FB06 game for simple folds,
	3458	* because certain folds are prohibited under /iaa and /il. As an example,
	3459	* in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both
	3460	* equivalent under /i. But under /iaa and /il, the folds to 'i' are
	3461	* prohibited, so we would not figure out that they fold to each other.
	3462	* Code could be written to automatically figure this out, similar to the
	3463	* code that does this for multi-character folds, but this is the only case
	3464	* where something like this is ever likely to happen, as all the single
	3465	* char folds to the 0-255 range are now quite settled. Instead there is a
	3466	* little special code that is compiled only for this Unicode version. This
	3467	* is smaller and didn't require much coding time to do. But this makes
	3468	* this routine strongly tied to being used just for CaseFolding. If ever
	3469	* it should be generalized, this would have to be fixed */
	3470
	3471	U8 l, lend;
	3472	STRLEN lcur;
	3473	HV *const hv = MUTABLE_HV(SvRV(swash));
	3474
	3475	/* The string containing the main body of the table. This will have its
	3476	* assertion fail if the swash has been converted to its inversion list */
	3477	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	3478
	3479	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	3480	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	3481	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	3482	/SV* const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
	3483	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	3484	const STRLEN bits = SvUV(*bitssvp);
	3485	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	3486	const UV none = SvUV(*nonesvp);
	3487	SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
	3488
	3489	HV* ret = newHV();
	3490
	3491	PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
	3492
	3493	/* Must have at least 8 bits to get the mappings */
	3494	if (bits != 8 && bits != 16 && bits != 32) {
	3495	Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
	3496	(UV)bits);
	3497	}
	3498
	3499	if (specials_p) { /* It might be "special" (sometimes, but not always, a
	3500	mapping to more than one character */
	3501
	3502	/* Construct an inverse mapping hash for the specials */
	3503	HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
	3504	HV * specials_inverse = newHV();
	3505	char char_from; / the lhs of the map */
	3506	I32 from_len; /* its byte length */
	3507	char char_to; / the rhs of the map */
	3508	I32 to_len; /* its byte length */
	3509	SV sv_to; / and in a sv */
	3510	AV* from_list; /* list of things that map to each 'to' */
	3511
	3512	hv_iterinit(specials_hv);
	3513
	3514	/* The keys are the characters (in UTF-8) that map to the corresponding
	3515	* UTF-8 string value. Iterate through the list creating the inverse
	3516	* list. */
	3517	while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
	3518	SV** listp;
	3519	if (! SvPOK(sv_to)) {
	3520	Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
	3521	"unexpectedly is not a string, flags=%lu",
	3522	(unsigned long)SvFLAGS(sv_to));
	3523	}
	3524	/DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8) char_from, 0), valid_utf8_to_uvchr((U8) SvPVX(sv_to), 0)));/
	3525
	3526	/* Each key in the inverse list is a mapped-to value, and the key's
	3527	* hash value is a list of the strings (each in UTF-8) that map to
	3528	* it. Those strings are all one character long */
	3529	if ((listp = hv_fetch(specials_inverse,
	3530	SvPVX(sv_to),
	3531	SvCUR(sv_to), 0)))
	3532	{
	3533	from_list = (AV) listp;
	3534	}
	3535	else { /* No entry yet for it: create one */
	3536	from_list = newAV();
	3537	if (! hv_store(specials_inverse,
	3538	SvPVX(sv_to),
	3539	SvCUR(sv_to),
	3540	(SV*) from_list, 0))
	3541	{
	3542	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	3543	}
	3544	}
	3545
	3546	/* Here have the list associated with this 'to' (perhaps newly
	3547	* created and empty). Just add to it. Note that we ASSUME that
	3548	* the input is guaranteed to not have duplications, so we don't
	3549	* check for that. Duplications just slow down execution time. */
	3550	av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
	3551	}
	3552
	3553	/* Here, 'specials_inverse' contains the inverse mapping. Go through
	3554	* it looking for cases like the FB05/FB06 examples above. There would
	3555	* be an entry in the hash like
	3556	* 'st' => [ FB05, FB06 ]
	3557	* In this example we will create two lists that get stored in the
	3558	* returned hash, 'ret':
	3559	* FB05 => [ FB05, FB06 ]
	3560	* FB06 => [ FB05, FB06 ]
	3561	*
	3562	* Note that there is nothing to do if the array only has one element.
	3563	* (In the normal 1-1 case handled below, we don't have to worry about
	3564	* two lists, as everything gets tied to the single list that is
	3565	* generated for the single character 'to'. But here, we are omitting
	3566	* that list, ('st' in the example), so must have multiple lists.) */
	3567	while ((from_list = (AV *) hv_iternextsv(specials_inverse,
	3568	&char_to, &to_len)))
	3569	{
	3570	if (av_tindex(from_list) > 0) {
	3571	SSize_t i;
	3572
	3573	/* We iterate over all combinations of i,j to place each code
	3574	* point on each list */
	3575	for (i = 0; i <= av_tindex(from_list); i++) {
	3576	SSize_t j;
	3577	AV* i_list = newAV();
	3578	SV** entryp = av_fetch(from_list, i, FALSE);
	3579	if (entryp == NULL) {
	3580	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	3581	}
	3582	if (hv_fetch(ret, SvPVX(entryp), SvCUR(entryp), FALSE)) {
	3583	Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
	3584	}
	3585	if (! hv_store(ret, SvPVX(entryp), SvCUR(entryp),
	3586	(SV*) i_list, FALSE))
	3587	{
	3588	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	3589	}
	3590
	3591	/* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8) SvPVX(entryp), 0);*/
	3592	for (j = 0; j <= av_tindex(from_list); j++) {
	3593	entryp = av_fetch(from_list, j, FALSE);
	3594	if (entryp == NULL) {
	3595	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	3596	}
	3597
	3598	/* When i==j this adds itself to the list */
	3599	av_push(i_list, newSVuv(utf8_to_uvchr_buf(
	3600	(U8) SvPVX(entryp),
	3601	(U8) SvPVX(entryp) + SvCUR(*entryp),
	3602	0)));
	3603	/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8) SvPVX(entryp), 0), u));/
	3604	}
	3605	}
	3606	}
	3607	}
	3608	SvREFCNT_dec(specials_inverse); /* done with it */
	3609	} /* End of specials */
	3610
	3611	/* read $swash->{LIST} */
	3612
	3613	#if UNICODE_MAJOR_VERSION == 3 \
	3614	&& UNICODE_DOT_VERSION == 0 \
	3615	&& UNICODE_DOT_DOT_VERSION == 1
	3616
	3617	/* For this version only U+130 and U+131 are equivalent under qr//i. Add a
	3618	* rule so that things work under /iaa and /il */
	3619
	3620	SV * mod_listsv = sv_mortalcopy(*listsvp);
	3621	sv_catpv(mod_listsv, "130\t130\t131\n");
	3622	l = (U8*)SvPV(mod_listsv, lcur);
	3623
	3624	#else
	3625
	3626	l = (U8)SvPV(listsvp, lcur);
	3627
	3628	#endif
	3629
	3630	lend = l + lcur;
	3631
	3632	/* Go through each input line */
	3633	while (l < lend) {
	3634	UV min, max, val;
	3635	UV inverse;
	3636	l = swash_scan_list_line(l, lend, &min, &max, &val,
	3637	cBOOL(octets), typestr);
	3638	if (l > lend) {
	3639	break;
	3640	}
	3641
	3642	/* Each element in the range is to be inverted */
	3643	for (inverse = min; inverse <= max; inverse++) {
	3644	AV* list;
	3645	SV** listp;
	3646	IV i;
	3647	bool found_key = FALSE;
	3648	bool found_inverse = FALSE;
	3649
	3650	/* The key is the inverse mapping */
	3651	char key[UTF8_MAXBYTES+1];
	3652	char* key_end = (char ) uvchr_to_utf8((U8) key, val);
	3653	STRLEN key_len = key_end - key;
	3654
	3655	/* Get the list for the map */
	3656	if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
	3657	list = (AV) listp;
	3658	}
	3659	else { /* No entry yet for it: create one */
	3660	list = newAV();
	3661	if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
	3662	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	3663	}
	3664	}
	3665
	3666	/* Look through list to see if this inverse mapping already is
	3667	* listed, or if there is a mapping to itself already */
	3668	for (i = 0; i <= av_tindex(list); i++) {
	3669	SV** entryp = av_fetch(list, i, FALSE);
	3670	SV* entry;
	3671	UV uv;
	3672	if (entryp == NULL) {
	3673	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	3674	}
	3675	entry = *entryp;
	3676	uv = SvUV(entry);
	3677	/DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, uv));/
	3678	if (uv == val) {
	3679	found_key = TRUE;
	3680	}
	3681	if (uv == inverse) {
	3682	found_inverse = TRUE;
	3683	}
	3684
	3685	/* No need to continue searching if found everything we are
	3686	* looking for */
	3687	if (found_key && found_inverse) {
	3688	break;
	3689	}
	3690	}
	3691
	3692	/* Make sure there is a mapping to itself on the list */
	3693	if (! found_key) {
	3694	av_push(list, newSVuv(val));
	3695	/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, val, val));/
	3696	}
	3697
	3698
	3699	/* Simply add the value to the list */
	3700	if (! found_inverse) {
	3701	av_push(list, newSVuv(inverse));
	3702	/DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, inverse, val));/
	3703	}
	3704
	3705	/* swatch_get() increments the value of val for each element in the
	3706	* range. That makes more compact tables possible. You can
	3707	* express the capitalization, for example, of all consecutive
	3708	* letters with a single line: 0061\t007A\t0041 This maps 0061 to
	3709	* 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
	3710	* and it's not documented; it appears to be used only in
	3711	* implementing tr//; I copied the semantics from swatch_get(), just
	3712	* in case */
	3713	if (!none \|\| val < none) {
	3714	++val;
	3715	}
	3716	}
	3717	}
	3718
	3719	return ret;
	3720	}
	3721
	3722	SV*
	3723	Perl__swash_to_invlist(pTHX_ SV* const swash)
	3724	{
	3725	    /* Builds an inversion list from the swash that 'swash' refers to: parses
	3726	     * LIST, inverts if INVERT_IT is set, then applies each EXTRAS entry.
	3727	     * Subject to change or removal. For use only in one place in regcomp.c.
	3728	     * Ownership is given to one reference count in the returned SV* */
	3729	    U8 *l, *lend;
	3730	    char *loc;
	3731	    STRLEN lcur;
	3732	    HV *const hv = MUTABLE_HV(SvRV(swash));
	3733	    UV elements = 0; /* Number of elements in the inversion list */
	3734	    U8 empty[] = "";
	3735	    SV** listsvp;
	3736	    SV** typesvp;
	3737	    SV** bitssvp;
	3738	    SV** extssvp;
	3739	    SV** invert_it_svp;
	3740
	3741	    U8* typestr;
	3742	    STRLEN bits;
	3743	    STRLEN octets; /* if bits == 1, then octets == 0 */
	3744	    U8 *x, *xend;
	3745	    STRLEN xcur;
	3746
	3747	    SV* invlist;
	3748
	3749	    PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
	3750
	3751	    /* If not a hash, it must be the swash's inversion list instead */
	3752	    if (SvTYPE(hv) != SVt_PVHV) {
	3753	        return SvREFCNT_inc_simple_NN((SV*) hv);
	3754	    }
	3755
	3756	    /* The string containing the main body of the table */
	3757	    listsvp = hv_fetchs(hv, "LIST", FALSE);
	3758	    typesvp = hv_fetchs(hv, "TYPE", FALSE);
	3759	    bitssvp = hv_fetchs(hv, "BITS", FALSE);
	3760	    extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	3761	    invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	3762
	3763	    typestr = (U8*)SvPV_nolen(*typesvp);
	3764	    bits = SvUV(*bitssvp);
	3765	    octets = bits >> 3; /* if bits == 1, then octets == 0 */
	3766
	3767	    /* read $swash->{LIST} */
	3768	    if (SvPOK(*listsvp)) {
	3769	        l = (U8*)SvPV(*listsvp, lcur);
	3770	    }
	3771	    else {
	3772	        /* LIST legitimately doesn't contain a string during compilation phases
	3773	         * of Perl itself, before the Unicode tables are generated. In this
	3774	         * case, just fake things up by creating an empty list */
	3775	        l = empty;
	3776	        lcur = 0;
	3777	    }
	3778	    loc = (char *) l;
	3779	    lend = l + lcur;
	3780
	3781	    if (*l == 'V') { /* Inversion list format */
	3782	        const char *after_atou = (char *) lend;
	3783	        UV element0;
	3784	        UV* other_elements_ptr;
	3785
	3786	        /* The first number is a count of the rest */
	3787	        l++;
	3788	        if (!grok_atoUV((const char *)l, &elements, &after_atou)) {
	3789	            Perl_croak(aTHX_ "panic: Expecting a valid count of elements at start of inversion list");
	3790	        }
	3791	        if (elements == 0) {
	3792	            invlist = _new_invlist(0);
	3793	        }
	3794	        else {
	3795	            while (isSPACE(*l)) l++;
	3796	            l = (U8 *) after_atou;
	3797
	3798	            /* Get the 0th element, which is needed to setup the inversion list */
	3799	            while (isSPACE(*l)) l++;
	3800	            if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
	3801	                Perl_croak(aTHX_ "panic: Expecting a valid 0th element for inversion list");
	3802	            }
	3803	            l = (U8 *) after_atou;
	3804	            invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
	3805	            elements--;
	3806
	3807	            /* Then just populate the rest of the input */
	3808	            while (elements-- > 0) {
	3809	                if (l > lend) {
	3810	                    Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements);
	3811	                }
	3812	                while (isSPACE(*l)) l++;
	3813	                if (!grok_atoUV((const char *)l, other_elements_ptr++, &after_atou)) {
	3814	                    Perl_croak(aTHX_ "panic: Expecting a valid element in inversion list");
	3815	                }
	3816	                l = (U8 *) after_atou;
	3817	            }
	3818	        }
	3819	    }
	3820	    else {
	3821
	3822	        /* Scan the input to count the number of lines to preallocate array
	3823	         * size based on worst possible case, which is each line in the input
	3824	         * creates 2 elements in the inversion list: 1) the beginning of a
	3825	         * range in the list; 2) the beginning of a range not in the list. */
	3826	        while ((loc = (strchr(loc, '\n'))) != NULL) {
	3827	            elements += 2;
	3828	            loc++;
	3829	        }
	3830
	3831	        /* If the ending is somehow corrupt and isn't a new line, add another
	3832	         * element for the final range that isn't in the inversion list */
	3833	        if (! (*lend == '\n'
	3834	               || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
	3835	        {
	3836	            elements++;
	3837	        }
	3838
	3839	        invlist = _new_invlist(elements);
	3840
	3841	        /* Now go through the input again, adding each range to the list */
	3842	        while (l < lend) {
	3843	            UV start, end;
	3844	            UV val; /* Not used by this function */
	3845
	3846	            l = swash_scan_list_line(l, lend, &start, &end, &val,
	3847	                                     cBOOL(octets), typestr);
	3848
	3849	            if (l > lend) {
	3850	                break;
	3851	            }
	3852
	3853	            invlist = _add_range_to_invlist(invlist, start, end);
	3854	        }
	3855	    }
	3856
	3857	    /* Invert if the data says it should be */
	3858	    if (invert_it_svp && SvUV(*invert_it_svp)) {
	3859	        _invlist_invert(invlist);
	3860	    }
	3861
	3862	    /* This code is copied from swatch_get()
	3863	     * read $swash->{EXTRAS} */
	3864	    x = (U8*)SvPV(*extssvp, xcur);
	3865	    xend = x + xcur;
	3866	    while (x < xend) {
	3867	        STRLEN namelen;
	3868	        U8 *namestr;
	3869	        SV** othersvp;
	3870	        HV* otherhv;
	3871	        STRLEN otherbits;
	3872	        SV **otherbitssvp, *other;
	3873	        U8 *nl;
	3874
	3875	        const U8 opc = *x++;
	3876	        if (opc == '\n')
	3877	            continue;
	3878
	3879	        nl = (U8*)memchr(x, '\n', xend - x);
	3880
	3881	        if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	3882	            if (nl) {
	3883	                x = nl + 1; /* 1 is length of "\n" */
	3884	                continue;
	3885	            }
	3886	            else {
	3887	                x = xend; /* to EXTRAS' end at which \n is not found */
	3888	                break;
	3889	            }
	3890	        }
	3891
	3892	        namestr = x;
	3893	        if (nl) {
	3894	            namelen = nl - namestr;
	3895	            x = nl + 1;
	3896	        }
	3897	        else {
	3898	            namelen = xend - namestr;
	3899	            x = xend;
	3900	        }
	3901
	3902	        othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	3903	        otherhv = MUTABLE_HV(SvRV(*othersvp));
	3904	        otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	3905	        otherbits = (STRLEN)SvUV(*otherbitssvp);
	3906
	3907	        if (bits != otherbits || bits != 1) {
	3908	            Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
	3909	                       "properties, bits=%"UVuf", otherbits=%"UVuf,
	3910	                       (UV)bits, (UV)otherbits);
	3911	        }
	3912
	3913	        /* The "other" swatch must be destroyed after. */
	3914	        other = _swash_to_invlist((SV *)*othersvp);
	3915
	3916	        /* End of code copied from swatch_get() */
	3917	        switch (opc) {
	3918	        case '+':
	3919	            _invlist_union(invlist, other, &invlist);
	3920	            break;
	3921	        case '!':
	3922	            _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
	3923	            break;
	3924	        case '-':
	3925	            _invlist_subtract(invlist, other, &invlist);
	3926	            break;
	3927	        case '&':
	3928	            _invlist_intersection(invlist, other, &invlist);
	3929	            break;
	3930	        default:
	3931	            break;
	3932	        }
	3933	        sv_free(other); /* through with it! */
	3934	    }
	3935
	3936	    SvREADONLY_on(invlist);
	3937	    return invlist;
	3938	}
	3939
	3940	SV*
	3941	Perl__get_swash_invlist(pTHX_ SV* const swash)
	3942	{
	3943	    SV** invlist_svp;
	3944
	3945	    PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
	3946
	3947	    /* Only a reference can lead to an inversion list */
	3948	    if (SvROK(swash)) {
	3949	        SV* const referent = SvRV(swash);
	3950	        /* A referent that isn't a hash isn't really a swash; it is taken to
	3951	         * be the inversion list itself */
	3952	        if (SvTYPE(referent) != SVt_PVHV) {
	3953	            return referent;
	3954	        }
	3955
	3956	        /* Otherwise return what is stored under the hash's "V" key, if any */
	3957	        invlist_svp = hv_fetchs(MUTABLE_HV(referent), "V", FALSE);
	3958	        if (invlist_svp) {
	3959	            return *invlist_svp;
	3960	        }
	3961	    }
	3962	    return NULL;
	3963	}
	3964
	3965	bool
	3966	Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
	3967	{
	3968	    /* May change: warns if surrogates, non-character code points, or
	3969	     * non-Unicode code points are in s which has length len bytes. Returns
	3970	     * TRUE if none found; FALSE otherwise. The only other validity check is
	3971	     * to make sure that this won't exceed the string's length.
	3972	     *
	3973	     * Code points above the platform's C<IV_MAX> will raise a deprecation
	3974	     * warning, unless those are turned off. */
	3975
	3976	    const U8* const e = s + len;
	3977	    bool ok = TRUE;
	3978
	3979	    PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
	3980	    /* Each character must fit in what is left of the string, not merely in 'len' */
	3981	    while (s < e) {
	3982	        if (UTF8SKIP(s) > (STRLEN) (e - s)) {
	3983	            Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	3984	                             "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
	3985	            return FALSE;
	3986	        }
	3987	        if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
	3988	            STRLEN char_len;
	3989	            if (UTF8_IS_SUPER(s, e)) {
	3990	                if ( ckWARN_d(WARN_NON_UNICODE)
	3991	                    || ( ckWARN_d(WARN_DEPRECATED)
	3992	#if defined(UV_IS_QUAD)
	3993	                        /* 2**63 and up meet these conditions provided we have
	3994	                         * a 64-bit word. */
	3995	# ifdef EBCDIC
	3996	                        && *s == 0xFE && e - s >= UTF8_MAXBYTES
	3997	                        && s[1] >= 0x49
	3998	# else
	3999	                        && *s == 0xFF && e -s >= UTF8_MAXBYTES
	4000	                        && s[2] >= 0x88
	4001	# endif
	4002	#else /* Below is 32-bit words */
	4003	                        /* 2**31 and above meet these conditions on all EBCDIC
	4004	                         * pages recognized for 32-bit platforms */
	4005	# ifdef EBCDIC
	4006	                        && *s == 0xFE && e - s >= UTF8_MAXBYTES
	4007	                        && s[6] >= 0x43
	4008	# else
	4009	                        && *s >= 0xFE
	4010	# endif
	4011	#endif
	4012	                )) {
	4013	                    /* A side effect of this function will be to warn */
	4014	                    (void) utf8n_to_uvchr(s, e - s, &char_len, UTF8_WARN_SUPER);
	4015	                    ok = FALSE;
	4016	                }
	4017	            }
	4018	            else if (UTF8_IS_SURROGATE(s, e)) {
	4019	                if (ckWARN_d(WARN_SURROGATE)) {
	4020	                    /* This has a different warning than the one the called
	4021	                     * function would output, so can't just call it, unlike we
	4022	                     * do for the non-chars and above-unicodes */
	4023	                    UV uv = utf8_to_uvchr_buf(s, e, &char_len);
	4024	                    Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	4025	                                "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
	4026	                    ok = FALSE;
	4027	                }
	4028	            }
	4029	            else if ((UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) {
	4030	                /* A side effect of this function will be to warn */
	4031	                (void) utf8n_to_uvchr(s, e - s, &char_len, UTF8_WARN_NONCHAR);
	4032	                ok = FALSE;
	4033	            }
	4034	        }
	4035	        s += UTF8SKIP(s);
	4036	    }
	4037
	4038	    return ok;
	4039	}
	4040
	4041	/*
	4042	=for apidoc pv_uni_display
	4043
	4044	Build to the scalar C<dsv> a displayable version of the string C<spv>,
	4045	length C<len>, the displayable version being at most C<pvlim> bytes long
	4046	(if longer, the rest is truncated and C<"..."> will be appended).
	4047
	4048	The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
	4049	C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
	4050	to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
	4051	(C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
	4052	C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
	4053	C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
	4054
	4055	The pointer to the PV of the C<dsv> is returned.
	4056
	4057	See also L</sv_uni_display>.
	4058
	4059	=cut */
	4060	char *
	4061	Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
	4062	{
	4063	    int truncated = 0;
	4064	    const char *s, *e;
	4065
	4066	    PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
	4067	    /* Start from an empty dsv with its UTF-8 flag off; output is appended below */
	4068	    sv_setpvs(dsv, "");
	4069	    SvUTF8_off(dsv);
	4070	    for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	4071	        UV u;
	4072	        /* This serves double duty as a flag and a character to print after
	4073	           a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	4074	        */
	4075	        char ok = 0;
	4076
	4077	        if (pvlim && SvCUR(dsv) >= pvlim) {
	4078	            truncated++;
	4079	            break;
	4080	        }
	4081	        u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
	4082	        if (u < 256) {
	4083	            const unsigned char c = (unsigned char)u & 0xFF;
	4084	            if (flags & UNI_DISPLAY_BACKSLASH) {
	4085	                switch (c) {
	4086	                case '\n':
	4087	                    ok = 'n'; break;
	4088	                case '\r':
	4089	                    ok = 'r'; break;
	4090	                case '\t':
	4091	                    ok = 't'; break;
	4092	                case '\f':
	4093	                    ok = 'f'; break;
	4094	                case '\a':
	4095	                    ok = 'a'; break;
	4096	                case '\\':
	4097	                    ok = '\\'; break;
	4098	                default: break;
	4099	                }
	4100	                if (ok) {
	4101	                    const char string = ok;
	4102	                    sv_catpvs(dsv, "\\");
	4103	                    sv_catpvn(dsv, &string, 1);
	4104	                }
	4105	            }
	4106	            /* isPRINT() is the locale-blind version. */
	4107	            if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	4108	                const char string = c;
	4109	                sv_catpvn(dsv, &string, 1);
	4110	                ok = 1;
	4111	            }
	4112	        }
	4113	        if (!ok)
	4114	            Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	4115	    }
	4116	    if (truncated)
	4117	        sv_catpvs(dsv, "...");
	4118
	4119	    return SvPVX(dsv);
	4120	}
	4121
	4122	/*
	4123	=for apidoc sv_uni_display
	4124
	4125	Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
	4126	the displayable version being at most C<pvlim> bytes long
	4127	(if longer, the rest is truncated and "..." will be appended).
	4128
	4129	The C<flags> argument is as in L</pv_uni_display>().
	4130
	4131	The pointer to the PV of the C<dsv> is returned.
	4132
	4133	=cut
	4134	*/
	4135	char *
	4136	Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
	4137	{
	4138	    const char * const ptr =
	4139	        isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
	4140	    /* For a REGEXP the bytes come from RX_WRAPPED(); otherwise from the PVX */
	4141	    PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
	4142
	4143	    return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
	4144	                               SvCUR(ssv), pvlim, flags);
	4145	}
	4146
	4147	/*
	4148	=for apidoc foldEQ_utf8
	4149
	4150	Returns true if the leading portions of the strings C<s1> and C<s2> (either or both
	4151	of which may be in UTF-8) are the same case-insensitively; false otherwise.
	4152	How far into the strings to compare is determined by other input parameters.
	4153
	4154	If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
	4155	otherwise it is assumed to be in native 8-bit encoding. Correspondingly for C<u2>
	4156	with respect to C<s2>.
	4157
	4158	If the byte length C<l1> is non-zero, it says how far into C<s1> to check for fold
	4159	equality. In other words, C<s1>+C<l1> will be used as a goal to reach. The
	4160	scan will not be considered to be a match unless the goal is reached, and
	4161	scanning won't continue past that goal. Correspondingly for C<l2> with respect to
	4162	C<s2>.
	4163
	4164	If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that pointer is
	4165	considered an end pointer to the position 1 byte past the maximum point
	4166	in C<s1> beyond which scanning will not continue under any circumstances.
	4167	(This routine assumes that UTF-8 encoded input strings are not malformed;
	4168	malformed input can cause it to read past C<pe1>).
	4169	This means that if both C<l1> and C<pe1> are specified, and C<pe1>
	4170	is less than C<s1>+C<l1>, the match will never be successful because it can
	4171	never
	4172	get as far as its goal (and in fact is asserted against). Correspondingly for
	4173	C<pe2> with respect to C<s2>.
	4174
	4175	At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
	4176	C<l2> must be non-zero), and if both do, both have to be
	4177	reached for a successful match. Also, if the fold of a character is multiple
	4178	characters, all of them must be matched (see tr21 reference below for
	4179	'folding').
	4180
	4181	Upon a successful match, if C<pe1> is non-C<NULL>,
	4182	it will be set to point to the beginning of the I<next> character of C<s1>
	4183	beyond what was matched. Correspondingly for C<pe2> and C<s2>.
	4184
	4185	For case-insensitiveness, the "casefolding" of Unicode is used
	4186	instead of upper/lowercasing both the characters, see
	4187	L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
	4188
	4189	=cut */
	4190
	4191	/* A flags parameter has been added which may change, and hence isn't
	4192	* externally documented. Currently it is:
	4193	* 0 for as-documented above
	4194	* FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
	4195	ASCII one, to not match
	4196	* FOLDEQ_LOCALE is set iff the rules from the current underlying
	4197	* locale are to be used.
	4198	* FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
	4199	* routine. This allows that step to be skipped.
	4200	* Currently, this requires s1 to be encoded as UTF-8
	4201	* (u1 must be true), which is asserted for.
	4202	* FOLDEQ_S1_FOLDS_SANE With either NOMIX_ASCII or LOCALE, no folds may
	4203	* cross certain boundaries. Hence, the caller should
	4204	* let this function do the folding instead of
	4205	* pre-folding. This code contains an assertion to
	4206	* that effect. However, if the caller knows what
	4207	* it's doing, it can pass this flag to indicate that,
	4208	* and the assertion is skipped.
	4209	* FOLDEQ_S2_ALREADY_FOLDED Similarly.
	4210	* FOLDEQ_S2_FOLDS_SANE
	4211	*/
	4212	I32
	4213	Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags)
	4214	{
	4215	    const U8 *p1 = (const U8*)s1; /* Point to current char */
	4216	    const U8 *p2 = (const U8*)s2;
	4217	    const U8 *g1 = NULL; /* goal for s1 */
	4218	    const U8 *g2 = NULL;
	4219	    const U8 *e1 = NULL; /* Don't scan s1 past this */
	4220	    U8 *f1 = NULL; /* Point to current folded */
	4221	    const U8 *e2 = NULL;
	4222	    U8 *f2 = NULL;
	4223	    STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
	4224	    U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	4225	    U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	4226	    U8 flags_for_folder = FOLD_FLAGS_FULL;
	4227	    /* Returns 1 if every goal was reached with all folds equal, else 0 */
	4228	    PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
	4229
	4230	    assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
	4231	               && (((flags & FOLDEQ_S1_ALREADY_FOLDED)
	4232	                    && !(flags & FOLDEQ_S1_FOLDS_SANE))
	4233	                   || ((flags & FOLDEQ_S2_ALREADY_FOLDED)
	4234	                       && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
	4235	    /* The algorithm is to trial the folds without regard to the flags on
	4236	     * the first line of the above assert(), and then see if the result
	4237	     * violates them. This means that the inputs can't be pre-folded to a
	4238	     * violating result, hence the assert. This could be changed, with the
	4239	     * addition of extra tests here for the already-folded case, which would
	4240	     * slow it down. That cost is more than any possible gain for when these
	4241	     * flags are specified, as the flags indicate /il or /iaa matching which
	4242	     * is less common than /iu, and I (khw) also believe that real-world /il
	4243	     * and /iaa matches are most likely to involve code points 0-255, and this
	4244	     * function only under rare conditions gets called for 0-255. */
	4245
	4246	    if (flags & FOLDEQ_LOCALE) {
	4247	        if (IN_UTF8_CTYPE_LOCALE) {
	4248	            flags &= ~FOLDEQ_LOCALE;
	4249	        }
	4250	        else {
	4251	            flags_for_folder |= FOLD_FLAGS_LOCALE;
	4252	        }
	4253	    }
	4254
	4255	    if (pe1) {
	4256	        e1 = *(U8**)pe1;
	4257	    }
	4258
	4259	    if (l1) {
	4260	        g1 = (const U8*)s1 + l1;
	4261	    }
	4262
	4263	    if (pe2) {
	4264	        e2 = *(U8**)pe2;
	4265	    }
	4266
	4267	    if (l2) {
	4268	        g2 = (const U8*)s2 + l2;
	4269	    }
	4270
	4271	    /* Must have at least one goal */
	4272	    assert(g1 || g2);
	4273
	4274	    if (g1) {
	4275
	4276	        /* Will never match if goal is out-of-bounds */
	4277	        assert(! e1 || e1 >= g1);
	4278
	4279	        /* Here, there isn't an end pointer, or it is beyond the goal. We
	4280	         * only go as far as the goal */
	4281	        e1 = g1;
	4282	    }
	4283	    else {
	4284	        assert(e1); /* Must have an end for looking at s1 */
	4285	    }
	4286
	4287	    /* Same for goal for s2 */
	4288	    if (g2) {
	4289	        assert(! e2 || e2 >= g2);
	4290	        e2 = g2;
	4291	    }
	4292	    else {
	4293	        assert(e2);
	4294	    }
	4295
	4296	    /* If both operands are already folded, we could just do a memEQ on the
	4297	     * whole strings at once, but it would be better if the caller realized
	4298	     * this and didn't even call us */
	4299
	4300	    /* Look through both strings, a character at a time */
	4301	    while (p1 < e1 && p2 < e2) {
	4302
	4303	        /* If at the beginning of a new character in s1, get its fold to use
	4304	         * and the length of the fold. */
	4305	        if (n1 == 0) {
	4306	            if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
	4307	                f1 = (U8 *) p1;
	4308	                assert(u1);
	4309	                n1 = UTF8SKIP(f1);
	4310	            }
	4311	            else {
	4312	                if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
	4313
	4314	                    /* We have to forbid mixing ASCII with non-ASCII if the
	4315	                     * flags so indicate. And, we can short circuit having to
	4316	                     * call the general functions for this common ASCII case,
	4317	                     * all of whose non-locale folds are also ASCII, and hence
	4318	                     * UTF-8 invariants, so the UTF8ness of the strings is not
	4319	                     * relevant. */
	4320	                    if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
	4321	                        return 0;
	4322	                    }
	4323	                    n1 = 1;
	4324	                    *foldbuf1 = toFOLD(*p1);
	4325	                }
	4326	                else if (u1) {
	4327	                    _to_utf8_fold_flags(p1, foldbuf1, &n1, flags_for_folder);
	4328	                }
	4329	                else { /* Not UTF-8, get UTF-8 fold */
	4330	                    _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
	4331	                }
	4332	                f1 = foldbuf1;
	4333	            }
	4334	        }
	4335
	4336	        if (n2 == 0) { /* Same for s2 */
	4337	            if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
	4338	                f2 = (U8 *) p2;
	4339	                assert(u2);
	4340	                n2 = UTF8SKIP(f2);
	4341	            }
	4342	            else {
	4343	                if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
	4344	                    if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
	4345	                        return 0;
	4346	                    }
	4347	                    n2 = 1;
	4348	                    *foldbuf2 = toFOLD(*p2);
	4349	                }
	4350	                else if (u2) {
	4351	                    _to_utf8_fold_flags(p2, foldbuf2, &n2, flags_for_folder);
	4352	                }
	4353	                else {
	4354	                    _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
	4355	                }
	4356	                f2 = foldbuf2;
	4357	            }
	4358	        }
	4359
	4360	        /* Here f1 and f2 point to the beginning of the strings to compare.
	4361	         * These strings are the folds of the next character from each input
	4362	         * string, stored in UTF-8. */
	4363
	4364	        /* While there is more to look for in both folds, see if they
	4365	         * continue to match */
	4366	        while (n1 && n2) {
	4367	            U8 fold_length = UTF8SKIP(f1);
	4368	            if (fold_length != UTF8SKIP(f2)
	4369	                || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
	4370	                                                       function call for single
	4371	                                                       byte */
	4372	                || memNE((char*)f1, (char*)f2, fold_length))
	4373	            {
	4374	                return 0; /* mismatch */
	4375	            }
	4376
	4377	            /* Here, they matched, advance past them */
	4378	            n1 -= fold_length;
	4379	            f1 += fold_length;
	4380	            n2 -= fold_length;
	4381	            f2 += fold_length;
	4382	        }
	4383
	4384	        /* When reach the end of any fold, advance the input past it */
	4385	        if (n1 == 0) {
	4386	            p1 += u1 ? UTF8SKIP(p1) : 1;
	4387	        }
	4388	        if (n2 == 0) {
	4389	            p2 += u2 ? UTF8SKIP(p2) : 1;
	4390	        }
	4391	    } /* End of loop through both strings */
	4392
	4393	    /* A match is defined by each scan that specified an explicit length
	4394	     * reaching its final goal, and the other not having matched a partial
	4395	     * character (which can happen when the fold of a character is more than one
	4396	     * character). */
	4397	    if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
	4398	        return 0;
	4399	    }
	4400
	4401	    /* Successful match. Set output pointers */
	4402	    if (pe1) {
	4403	        *pe1 = (char*)p1;
	4404	    }
	4405	    if (pe2) {
	4406	        *pe2 = (char*)p2;
	4407	    }
	4408	    return 1;
	4409	}
	4410
	4411	/* XXX The next two functions should likely be moved to mathoms.c once all
	4412	* occurrences of them are removed from the core; some cpan-upstream modules
	4413	* still use them */
	4414
	4415	U8 *
	4416	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
	4417	{
	4418	    PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
	4419	    /* Flag-less form: hand off to the flags variant with no flags set */
	4420	    return uvoffuni_to_utf8_flags(d, uv, 0);
	4421	}
	4422
	4423	/*
	4424	=for apidoc utf8n_to_uvuni
	4425
	4426	Instead use L</utf8_to_uvchr_buf>, or rarely, L</utf8n_to_uvchr>.
	4427
	4428	This function was useful for code that wanted to handle both EBCDIC and
	4429	ASCII platforms with Unicode properties, but starting in Perl v5.20, the
	4430	distinctions between the platforms have mostly been made invisible to most
	4431	code, so this function is quite unlikely to be what you want. If you do need
	4432	this precise functionality, use instead
	4433	C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>>
	4434	or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|/utf8n_to_uvchr>>.
	4435
	4436	=cut
	4437	*/
	4438
	4439	UV
	4440	Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
	4441	{
	4442	    PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
	4443	    /* Decode with utf8n_to_uvchr(), then map the result through NATIVE_TO_UNI() */
	4444	    return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
	4445	}
	4446
	4447	/*
	4448	=for apidoc uvuni_to_utf8_flags
	4449
	4450	Instead you almost certainly want to use L</uvchr_to_utf8> or
	4451	L</uvchr_to_utf8_flags>.
	4452
	4453	This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
	4454	which itself, while not deprecated, should be used only in isolated
	4455	circumstances. These functions were useful for code that wanted to handle
	4456	both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
	4457	v5.20, the distinctions between the platforms have mostly been made invisible
	4458	to most code, so this function is quite unlikely to be what you want.
	4459
	4460	=cut
	4461	*/
	4462
	4463	U8 *
	4464	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	4465	{
	4466	    PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
	4467	    /* Deprecated synonym: forwards all arguments unchanged */
	4468	    return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, flags);
	4469	}
	4470
	4471	/*
	4472	* ex: set ts=8 sts=4 sw=4 et:
	4473	*/