perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
	17	*
	18	* 'Well do I understand your speech,' he answered in the same language;
	19	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	20	* as is the custom in the West, if you wish to be answered?'
	21	* --Gandalf, addressing Théoden's door wardens
	22	*
	23	* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	24	*
	25	* ...the travellers perceived that the floor was paved with stones of many
	26	* hues; branching runes and strange devices intertwined beneath their feet.
	27	*
	28	* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	29	*/
	30
	31	#include "EXTERN.h"
	32	#define PERL_IN_UTF8_C
	33	#include "perl.h"
	34
	35	#ifndef EBCDIC
	36	/* Separate prototypes needed because in ASCII systems these are
	37	* usually macros but they still are compiled as code, too. */
	38	PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags);
	39	PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
	40	#endif
	41
	/* Warning text used when a UTF-8 string ends unexpectedly. */
	static const char unees[] = "Malformed UTF-8 character (unexpected end of string)";
	44
	45	/*
	46	=head1 Unicode Support
	47
	48	This file contains various utility functions for manipulating UTF8-encoded
	49	strings. For the uninitiated, this is a method of representing arbitrary
	50	Unicode characters as a variable number of bytes, in such a way that
	51	characters in the ASCII range are unmodified, and a zero byte never appears
	52	within non-zero characters.
	53
	54	=cut
	55	*/
	56
	57	/*
	58	=for apidoc is_ascii_string
	59
	60	Returns true if the first C<len> bytes of the given string are the same whether
	61	or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
	62	is, if they are invariant. On ASCII-ish machines, only ASCII characters
	63	fit this definition, hence the function's name.
	64
	65	If C<len> is 0, it will be calculated using C<strlen(s)>.
	66
	67	See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
	68
	69	=cut
	70	*/
	71
	72	bool
	73	Perl_is_ascii_string(const U8 *s, STRLEN len)
	74	{
	75	const U8* const send = s + (len ? len : strlen((const char *)s));
	76	const U8* x = s;
	77
	78	PERL_ARGS_ASSERT_IS_ASCII_STRING;
	79
	80	for (; x < send; ++x) {
	81	if (!UTF8_IS_INVARIANT(*x))
	82	break;
	83	}
	84
	85	return x == send;
	86	}
	87
	88	/*
	89	=for apidoc uvuni_to_utf8_flags
	90
	91	Adds the UTF-8 representation of the code point C<uv> to the end
	92	of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
	93	bytes available. The return value is the pointer to the byte after the
	94	end of the new character. In other words,
	95
	96	d = uvuni_to_utf8_flags(d, uv, flags);
	97
	98	or, in most cases,
	99
	100	d = uvuni_to_utf8(d, uv);
	101
	102	(which is equivalent to)
	103
	104	d = uvuni_to_utf8_flags(d, uv, 0);
	105
	106	This is the recommended Unicode-aware way of saying
	107
	108	*(d++) = uv;
	109
	110	This function will convert to UTF-8 (and not warn) even code points that aren't
	111	legal Unicode or are problematic, unless C<flags> contains one or more of the
	112	following flags.
	113	If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
	114	the function will raise a warning, provided UTF8 warnings are enabled. If instead
	115	UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
	116	If both flags are set, the function will both warn and return NULL.
	117
	118	The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
	119	affect how the function handles a Unicode non-character. And, likewise for the
	120	UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, and code points that are
	121	above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
	122	even less portable) can be warned and/or disallowed even if other above-Unicode
	123	code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
	124	flags.
	125
	126	And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
	127	above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
	128	DISALLOW flags.
	129
	130
	131	=cut
	132	*/
	133
	134	U8 *
	135	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	136	{
	137	PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
	138
	139	if (ckWARN_d(WARN_UTF8)) {
	140	if (UNICODE_IS_SURROGATE(uv)) {
	141	if (flags & UNICODE_WARN_SURROGATE) {
	142	Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
	143	"UTF-16 surrogate U+%04"UVXf, uv);
	144	}
	145	if (flags & UNICODE_DISALLOW_SURROGATE) {
	146	return NULL;
	147	}
	148	}
	149	else if (UNICODE_IS_SUPER(uv)) {
	150	if (flags & UNICODE_WARN_SUPER
	151	\|\| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
	152	{
	153	Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
	154	"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
	155	}
	156	if (flags & UNICODE_DISALLOW_SUPER
	157	\|\| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
	158	{
	159	return NULL;
	160	}
	161	}
	162	else if (UNICODE_IS_NONCHAR(uv)) {
	163	if (flags & UNICODE_WARN_NONCHAR) {
	164	Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
	165	"Unicode non-character U+%04"UVXf" is illegal for open interchange",
	166	uv);
	167	}
	168	if (flags & UNICODE_DISALLOW_NONCHAR) {
	169	return NULL;
	170	}
	171	}
	172	}
	173	if (UNI_IS_INVARIANT(uv)) {
	174	*d++ = (U8)UTF_TO_NATIVE(uv);
	175	return d;
	176	}
	177	#if defined(EBCDIC)
	178	else {
	179	STRLEN len = UNISKIP(uv);
	180	U8 *p = d+len-1;
	181	while (p > d) {
	182	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	183	uv >>= UTF_ACCUMULATION_SHIFT;
	184	}
	185	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	186	return d+len;
	187	}
	188	#else /* Non loop style */
	189	if (uv < 0x800) {
	190	*d++ = (U8)(( uv >> 6) \| 0xc0);
	191	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	192	return d;
	193	}
	194	if (uv < 0x10000) {
	195	*d++ = (U8)(( uv >> 12) \| 0xe0);
	196	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	197	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	198	return d;
	199	}
	200	if (uv < 0x200000) {
	201	*d++ = (U8)(( uv >> 18) \| 0xf0);
	202	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	203	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	204	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	205	return d;
	206	}
	207	if (uv < 0x4000000) {
	208	*d++ = (U8)(( uv >> 24) \| 0xf8);
	209	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	210	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	211	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	212	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	213	return d;
	214	}
	215	if (uv < 0x80000000) {
	216	*d++ = (U8)(( uv >> 30) \| 0xfc);
	217	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	218	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	219	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	220	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	221	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	222	return d;
	223	}
	224	#ifdef HAS_QUAD
	225	if (uv < UTF8_QUAD_MAX)
	226	#endif
	227	{
	228	d++ = 0xfe; / Can't match U+FEFF! */
	229	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	230	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	231	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	232	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	233	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	234	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	235	return d;
	236	}
	237	#ifdef HAS_QUAD
	238	{
	239	d++ = 0xff; / Can't match U+FFFE! */
	240	d++ = 0x80; / 6 Reserved bits */
	241	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	242	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
	243	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
	244	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
	245	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
	246	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	247	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	248	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	249	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	250	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	251	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	252	return d;
	253	}
	254	#endif
	255	#endif /* Loop style */
	256	}
	257
	258	/*
	259
	260	Tests if some arbitrary number of bytes begins in a valid UTF-8
	261	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	262	UTF-8 character. The actual number of bytes in the UTF-8 character
	263	will be returned if it is valid, otherwise 0.
	264
	265	This is the "slow" version as opposed to the "fast" version which is
	266	the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
	267	difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
	268	or less you should use the IS_UTF8_CHAR(), for lengths of five or more
	269	you should use the _slow(). In practice this means that the _slow()
	270	will be used very rarely, since the maximum Unicode code point (as of
	271	Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
	272	the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
	273	five bytes or more.
	274
	275	=cut */
	276	STATIC STRLEN
	277	S_is_utf8_char_slow(const U8 *s, const STRLEN len)
	278	{
	279	U8 u = *s;
	280	STRLEN slen;
	281	UV uv, ouv;
	282
	283	PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
	284
	285	if (UTF8_IS_INVARIANT(u))
	286	return 1;
	287
	288	if (!UTF8_IS_START(u))
	289	return 0;
	290
	291	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	292	return 0;
	293
	294	slen = len - 1;
	295	s++;
	296	#ifdef EBCDIC
	297	u = NATIVE_TO_UTF(u);
	298	#endif
	299	u &= UTF_START_MASK(len);
	300	uv = u;
	301	ouv = uv;
	302	while (slen--) {
	303	if (!UTF8_IS_CONTINUATION(*s))
	304	return 0;
	305	uv = UTF8_ACCUMULATE(uv, *s);
	306	if (uv < ouv)
	307	return 0;
	308	ouv = uv;
	309	s++;
	310	}
	311
	312	if ((STRLEN)UNISKIP(uv) < len)
	313	return 0;
	314
	315	return len;
	316	}
	317
	318	/*
	319	=for apidoc is_utf8_char
	320
	321	Tests if some arbitrary number of bytes begins in a valid UTF-8
	322	character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
	323	character is a valid UTF-8 character. The actual number of bytes in the UTF-8
	324	character will be returned if it is valid, otherwise 0.
	325
	326	=cut */
	327	STRLEN
	328	Perl_is_utf8_char(const U8 *s)
	329	{
	330	const STRLEN len = UTF8SKIP(s);
	331
	332	PERL_ARGS_ASSERT_IS_UTF8_CHAR;
	333	#ifdef IS_UTF8_CHAR
	334	if (IS_UTF8_CHAR_FAST(len))
	335	return IS_UTF8_CHAR(s, len) ? len : 0;
	336	#endif /* #ifdef IS_UTF8_CHAR */
	337	return is_utf8_char_slow(s, len);
	338	}
	339
	340
	341	/*
	342	=for apidoc is_utf8_string
	343
	344	Returns true if first C<len> bytes of the given string form a valid
	345	UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
	346	using C<strlen(s)>. Note that 'a valid UTF-8 string' does not mean 'a
	347	string that contains code points above 0x7F encoded in UTF-8' because a
	348	valid ASCII string is a valid UTF-8 string.
	349
	350	See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
	351
	352	=cut
	353	*/
	354
	355	bool
	356	Perl_is_utf8_string(const U8 *s, STRLEN len)
	357	{
	358	const U8* const send = s + (len ? len : strlen((const char *)s));
	359	const U8* x = s;
	360
	361	PERL_ARGS_ASSERT_IS_UTF8_STRING;
	362
	363	while (x < send) {
	364	STRLEN c;
	365	/* Inline the easy bits of is_utf8_char() here for speed... */
	366	if (UTF8_IS_INVARIANT(*x))
	367	c = 1;
	368	else if (!UTF8_IS_START(*x))
	369	goto out;
	370	else {
	371	/* ... and call is_utf8_char() only if really needed. */
	372	#ifdef IS_UTF8_CHAR
	373	c = UTF8SKIP(x);
	374	if (IS_UTF8_CHAR_FAST(c)) {
	375	if (!IS_UTF8_CHAR(x, c))
	376	c = 0;
	377	}
	378	else
	379	c = is_utf8_char_slow(x, c);
	380	#else
	381	c = is_utf8_char(x);
	382	#endif /* #ifdef IS_UTF8_CHAR */
	383	if (!c)
	384	goto out;
	385	}
	386	x += c;
	387	}
	388
	389	out:
	390	if (x != send)
	391	return FALSE;
	392
	393	return TRUE;
	394	}
	395
	396	/*
	397	Implemented as a macro in utf8.h
	398
	399	=for apidoc is_utf8_string_loc
	400
	401	Like is_utf8_string() but stores the location of the failure (in the
	402	case of "utf8ness failure") or the location s+len (in the case of
	403	"utf8ness success") in the C<ep>.
	404
	405	See also is_utf8_string_loclen() and is_utf8_string().
	406
	407	=for apidoc is_utf8_string_loclen
	408
	409	Like is_utf8_string() but stores the location of the failure (in the
	410	case of "utf8ness failure") or the location s+len (in the case of
	411	"utf8ness success") in the C<ep>, and the number of UTF-8
	412	encoded characters in the C<el>.
	413
	414	See also is_utf8_string_loc() and is_utf8_string().
	415
	416	=cut
	417	*/
	418
	419	bool
	420	Perl_is_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	421	{
	422	const U8* const send = s + (len ? len : strlen((const char *)s));
	423	const U8* x = s;
	424	STRLEN c;
	425	STRLEN outlen = 0;
	426
	427	PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
	428
	429	while (x < send) {
	430	/* Inline the easy bits of is_utf8_char() here for speed... */
	431	if (UTF8_IS_INVARIANT(*x))
	432	c = 1;
	433	else if (!UTF8_IS_START(*x))
	434	goto out;
	435	else {
	436	/* ... and call is_utf8_char() only if really needed. */
	437	#ifdef IS_UTF8_CHAR
	438	c = UTF8SKIP(x);
	439	if (IS_UTF8_CHAR_FAST(c)) {
	440	if (!IS_UTF8_CHAR(x, c))
	441	c = 0;
	442	} else
	443	c = is_utf8_char_slow(x, c);
	444	#else
	445	c = is_utf8_char(x);
	446	#endif /* #ifdef IS_UTF8_CHAR */
	447	if (!c)
	448	goto out;
	449	}
	450	x += c;
	451	outlen++;
	452	}
	453
	454	out:
	455	if (el)
	456	*el = outlen;
	457
	458	if (ep)
	459	*ep = x;
	460	return (x == send);
	461	}
	462
	463	/*
	464
	465	=for apidoc utf8n_to_uvuni
	466
	467	Bottom level UTF-8 decode routine.
	468	Returns the code point value of the first character in the string C<s>
	469	which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding and no longer than
	470	C<curlen> bytes; C<retlen> will be set to the length, in bytes, of that
	471	character.
	472
	473	The value of C<flags> determines the behavior when C<s> does not point to a
	474	well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
	475	C<retlen> is set to the expected length of the UTF-8 character in bytes, zero
	476	is returned, and if UTF-8 warnings haven't been lexically disabled, a warning
	477	is raised.
	478
	479	Various ALLOW flags can be set in C<flags> to allow (and not warn on)
	480	individual types of malformations, such as the sequence being overlong (that
	481	is, when there is a shorter sequence that can express the same code point;
	482	overlong sequences are expressly forbidden in the UTF-8 standard due to
	483	potential security issues). Another malformation example is the first byte of
	484	a character not being a legal first byte. See F<utf8.h> for the list of such
	485	flags. Of course, the value returned by this function under such conditions is
	486	not reliable.
	487
	488	The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
	489	flags) malformation is found. If this flag is set, the routine assumes that
	490	the caller will raise a warning, and this function will silently just set
	491	C<retlen> to C<-1> and return zero.
	492
	493	Certain code points are considered problematic. These are Unicode surrogates,
	494	Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
	495	By default these are considered regular code points, but certain situations
	496	warrant special handling for them. If C<flags> contains
	497	UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
	498	malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE,
	499	UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
	500	maximum) can be set to disallow these categories individually.
	501
	502	The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
	503	UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
	504	for their respective categories, but otherwise the code points are considered
	505	valid (not malformations). To get a category to both be treated as a
	506	malformation and raise a warning, specify both the WARN and DISALLOW flags.
	507	(But note that warnings are not raised if lexically disabled nor if
	508	UTF8_CHECK_ONLY is also specified.)
	509
	510	Very large code points (above 0x7FFF_FFFF) are considered more problematic than
	511	the others that are above the Unicode legal maximum. There are several
	512	reasons, one of which is that the original UTF-8 specification never went above
	513	this number (the current 0x10FFFF limit was imposed later). The UTF-8 encoding
	514	on ASCII platforms for these large code points begins with a byte containing
	515	0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to be treated as
	516	malformations, while allowing smaller above-Unicode code points. (Of course
	517	UTF8_DISALLOW_SUPER will treat all above-Unicode code points, including these,
	518	as malformations.) Similarly, UTF8_WARN_FE_FF acts just like the other WARN
	519	flags, but applies just to these code points.
	520
	521	All other code points corresponding to Unicode characters, including private
	522	use and those yet to be assigned, are never considered malformed and never
	523	warn.
	524
	525	Most code should use utf8_to_uvchr() rather than call this directly.
	526
	527	=cut
	528	*/
	529
	530	UV
	531	Perl_utf8n_to_uvuni(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	532	{
	533	dVAR;
	534	const U8 * const s0 = s;
	535	UV uv = *s, ouv = 0;
	536	STRLEN len = 1;
	537	bool dowarn = ckWARN_d(WARN_UTF8);
	538	const UV startbyte = *s;
	539	STRLEN expectlen = 0;
	540	U32 warning = 0;
	541	SV* sv = NULL;
	542
	543	PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
	544
	545	/* This list is a superset of the UTF8_ALLOW_XXX. */
	546
	547	#define UTF8_WARN_EMPTY 1
	548	#define UTF8_WARN_CONTINUATION 2
	549	#define UTF8_WARN_NON_CONTINUATION 3
	550	#define UTF8_WARN_SHORT 4
	551	#define UTF8_WARN_OVERFLOW 5
	552	#define UTF8_WARN_LONG 6
	553
	554	if (curlen == 0 &&
	555	!(flags & UTF8_ALLOW_EMPTY)) {
	556	warning = UTF8_WARN_EMPTY;
	557	goto malformed;
	558	}
	559
	560	if (UTF8_IS_INVARIANT(uv)) {
	561	if (retlen)
	562	*retlen = 1;
	563	return (UV) (NATIVE_TO_UTF(*s));
	564	}
	565
	566	if (UTF8_IS_CONTINUATION(uv) &&
	567	!(flags & UTF8_ALLOW_CONTINUATION)) {
	568	warning = UTF8_WARN_CONTINUATION;
	569	goto malformed;
	570	}
	571
	572	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	573	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	574	warning = UTF8_WARN_NON_CONTINUATION;
	575	goto malformed;
	576	}
	577
	578	#ifdef EBCDIC
	579	uv = NATIVE_TO_UTF(uv);
	580	#else
	581	if (uv == 0xfe \|\| uv == 0xff) {
	582	if (flags & (UTF8_WARN_SUPER\|UTF8_WARN_FE_FF)) {
	583	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
	584	flags &= ~UTF8_WARN_SUPER; /* Only warn once on this problem */
	585	}
	586	if (flags & (UTF8_DISALLOW_SUPER\|UTF8_DISALLOW_FE_FF)) {
	587	goto malformed;
	588	}
	589	}
	590	#endif
	591
	592	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	593	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	594	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	595	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	596	#ifdef EBCDIC
	597	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	598	else { len = 7; uv &= 0x01; }
	599	#else
	600	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	601	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	602	else { len = 13; uv = 0; } /* whoa! */
	603	#endif
	604
	605	if (retlen)
	606	*retlen = len;
	607
	608	expectlen = len;
	609
	610	if ((curlen < expectlen) &&
	611	!(flags & UTF8_ALLOW_SHORT)) {
	612	warning = UTF8_WARN_SHORT;
	613	goto malformed;
	614	}
	615
	616	len--;
	617	s++;
	618	ouv = uv; /* ouv is the value from the previous iteration */
	619
	620	while (len--) {
	621	if (!UTF8_IS_CONTINUATION(*s) &&
	622	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	623	s--;
	624	warning = UTF8_WARN_NON_CONTINUATION;
	625	goto malformed;
	626	}
	627	else
	628	uv = UTF8_ACCUMULATE(uv, *s);
	629	if (!(uv > ouv)) { /* If the value didn't grow from the previous
	630	iteration, something is horribly wrong */
	631	/* These cannot be allowed. */
	632	if (uv == ouv) {
	633	if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
	634	warning = UTF8_WARN_LONG;
	635	goto malformed;
	636	}
	637	}
	638	else { /* uv < ouv */
	639	/* This cannot be allowed. */
	640	warning = UTF8_WARN_OVERFLOW;
	641	goto malformed;
	642	}
	643	}
	644	s++;
	645	ouv = uv;
	646	}
	647
	648	if ((expectlen > (STRLEN)UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) {
	649	warning = UTF8_WARN_LONG;
	650	goto malformed;
	651	} else if (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE\|UTF8_WARN_ILLEGAL_INTERCHANGE)) {
	652	if (UNICODE_IS_SURROGATE(uv)) {
	653	if ((flags & (UTF8_WARN_SURROGATE\|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE) {
	654	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
	655	}
	656	if (flags & UTF8_DISALLOW_SURROGATE) {
	657	goto disallowed;
	658	}
	659	}
	660	else if (UNICODE_IS_NONCHAR(uv)) {
	661	if ((flags & (UTF8_WARN_NONCHAR\|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR ) {
	662	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
	663	}
	664	if (flags & UTF8_DISALLOW_NONCHAR) {
	665	goto disallowed;
	666	}
	667	}
	668	else if ((uv > PERL_UNICODE_MAX)) {
	669	if ((flags & (UTF8_WARN_SUPER\|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER) {
	670	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
	671	}
	672	if (flags & UTF8_DISALLOW_SUPER) {
	673	goto disallowed;
	674	}
	675	}
	676
	677	/* Here, this is not considered a malformed character, so drop through
	678	* to return it */
	679	}
	680
	681	return uv;
	682
	683	disallowed: /* Is disallowed, but otherwise not malformed. 'sv' will have been
	684	set if there is to be a warning. */
	685	if (!sv) {
	686	dowarn = 0;
	687	}
	688
	689	malformed:
	690
	691	if (flags & UTF8_CHECK_ONLY) {
	692	if (retlen)
	693	*retlen = ((STRLEN) -1);
	694	return 0;
	695	}
	696
	697	if (dowarn) {
	698	if (! sv) {
	699	sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
	700	}
	701
	702	switch (warning) {
	703	case 0: /* Intentionally empty. */ break;
	704	case UTF8_WARN_EMPTY:
	705	sv_catpvs(sv, "(empty string)");
	706	break;
	707	case UTF8_WARN_CONTINUATION:
	708	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	709	break;
	710	case UTF8_WARN_NON_CONTINUATION:
	711	if (s == s0)
	712	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	713	(UV)s[1], startbyte);
	714	else {
	715	const int len = (int)(s-s0);
	716	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	717	(UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
	718	}
	719
	720	break;
	721	case UTF8_WARN_SHORT:
	722	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	723	(int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
	724	expectlen = curlen; /* distance for caller to skip */
	725	break;
	726	case UTF8_WARN_OVERFLOW:
	727	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	728	ouv, *s, startbyte);
	729	break;
	730	case UTF8_WARN_LONG:
	731	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	732	(int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	733	break;
	734	default:
	735	sv_catpvs(sv, "(unknown reason)");
	736	break;
	737	}
	738
	739	if (sv) {
	740	const char * const s = SvPVX_const(sv);
	741
	742	if (PL_op)
	743	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	744	"%s in %s", s, OP_DESC(PL_op));
	745	else
	746	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
	747	}
	748	}
	749
	750	if (retlen)
	751	*retlen = expectlen ? expectlen : len;
	752
	753	return 0;
	754	}
	755
	756	/*
	757	=for apidoc utf8_to_uvchr
	758
	759	Returns the native code point of the first character in the string C<s>
	760	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	761	length, in bytes, of that character.
	762
	763	If C<s> does not point to a well-formed UTF-8 character, zero is
	764	returned and retlen is set, if possible, to -1.
	765
	766	=cut
	767	*/
	768
	769
	770	UV
	771	Perl_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
	772	{
	773	PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
	774
	775	return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
	776	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	777	}
	778
	779	/*
	780	=for apidoc utf8_to_uvuni
	781
	782	Returns the Unicode code point of the first character in the string C<s>
	783	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	784	length, in bytes, of that character.
	785
	786	This function should only be used when the returned UV is considered
	787	an index into the Unicode semantic tables (e.g. swashes).
	788
	789	If C<s> does not point to a well-formed UTF-8 character, zero is
	790	returned and retlen is set, if possible, to -1.
	791
	792	=cut
	793	*/
	794
	795	UV
	796	Perl_utf8_to_uvuni(pTHX_ const U8 s, STRLEN retlen)
	797	{
	798	PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
	799
	800	/* Call the low level routine asking for checks */
	801	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
	802	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	803	}
	804
	805	/*
	806	=for apidoc utf8_length
	807
	808	Return the length of the UTF-8 char encoded string C<s> in characters.
	809	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	810	up past C<e>, croaks.
	811
	812	=cut
	813	*/
	814
	815	STRLEN
	816	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	817	{
	818	dVAR;
	819	STRLEN len = 0;
	820
	821	PERL_ARGS_ASSERT_UTF8_LENGTH;
	822
	823	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	824	* the bitops (especially ~) can create illegal UTF-8.
	825	* In other words: in Perl UTF-8 is not just for Unicode. */
	826
	827	if (e < s)
	828	goto warn_and_return;
	829	while (s < e) {
	830	if (!UTF8_IS_INVARIANT(*s))
	831	s += UTF8SKIP(s);
	832	else
	833	s++;
	834	len++;
	835	}
	836
	837	if (e != s) {
	838	len--;
	839	warn_and_return:
	840	if (PL_op)
	841	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	842	"%s in %s", unees, OP_DESC(PL_op));
	843	else
	844	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	845	}
	846
	847	return len;
	848	}
	849
	850	/*
	851	=for apidoc utf8_distance
	852
	853	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	854	and C<b>.
	855
	856	WARNING: use only if you know that the pointers point inside the
	857	same UTF-8 buffer.
	858
	859	=cut
	860	*/
	861
	862	IV
	863	Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
	864	{
	865	PERL_ARGS_ASSERT_UTF8_DISTANCE;
	866
	867	return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
	868	}
	869
	870	/*
	871	=for apidoc utf8_hop
	872
	873	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	874	forward or backward.
	875
	876	WARNING: do not use the following unless you know C<off> is within
	877	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	878	on the first byte of character or just after the last byte of a character.
	879
	880	=cut
	881	*/
	882
	883	U8 *
	884	Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
	885	{
	886	PERL_ARGS_ASSERT_UTF8_HOP;
	887
	888	PERL_UNUSED_CONTEXT;
	889	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	890	* the bitops (especially ~) can create illegal UTF-8.
	891	* In other words: in Perl UTF-8 is not just for Unicode. */
	892
	893	if (off >= 0) {
	894	while (off--)
	895	s += UTF8SKIP(s);
	896	}
	897	else {
	898	while (off++) {
	899	s--;
	900	while (UTF8_IS_CONTINUATION(*s))
	901	s--;
	902	}
	903	}
	904	return (U8 *)s;
	905	}
	906
	907	/*
	908	=for apidoc bytes_cmp_utf8
	909
	910	Compares the sequence of characters (stored as octets) in b, blen with the
	911	sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
	912	equal, -1 or -2 if the first string is less than the second string, +1 or +2
	913	if the first string is greater than the second string.
	914
	915	-1 or +1 is returned if the shorter string was identical to the start of the
	916	longer string. -2 or +2 is returned if there was a difference between characters
	917	within the strings.
	918
	919	=cut
	920	*/
	921
	922	int
	923	Perl_bytes_cmp_utf8(pTHX_ const U8 b, STRLEN blen, const U8 u, STRLEN ulen)
	924	{
	925	const U8 *const bend = b + blen;
	926	const U8 *const uend = u + ulen;
	927
	928	PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
	929
	930	PERL_UNUSED_CONTEXT;
	931
	932	while (b < bend && u < uend) {
	933	U8 c = *u++;
	934	if (!UTF8_IS_INVARIANT(c)) {
	935	if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	936	if (u < uend) {
	937	U8 c1 = *u++;
	938	if (UTF8_IS_CONTINUATION(c1)) {
	939	c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, c1));
	940	} else {
	941	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	942	"Malformed UTF-8 character "
	943	"(unexpected non-continuation byte 0x%02x"
	944	", immediately after start byte 0x%02x)"
	945	/* Dear diag.t, it's in the pod. */
	946	"%s%s", c1, c,
	947	PL_op ? " in " : "",
	948	PL_op ? OP_DESC(PL_op) : "");
	949	return -2;
	950	}
	951	} else {
	952	if (PL_op)
	953	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	954	"%s in %s", unees, OP_DESC(PL_op));
	955	else
	956	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	957	return -2; /* Really want to return undef :-) */
	958	}
	959	} else {
	960	return -2;
	961	}
	962	}
	963	if (*b != c) {
	964	return *b < c ? -2 : +2;
	965	}
	966	++b;
	967	}
	968
	969	if (b == bend && u == uend)
	970	return 0;
	971
	972	return b < bend ? +1 : -1;
	973	}
	974
	975	/*
	976	=for apidoc utf8_to_bytes
	977
	978	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	979	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	980	updates len to contain the new length.
	981	Returns zero on failure, setting C<len> to -1.
	982
	983	If you need a copy of the string, see C<bytes_from_utf8>.
	984
	985	=cut
	986	*/
	987
	988	U8 *
	989	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	990	{
	991	U8 * const save = s;
	992	U8 * const send = s + *len;
	993	U8 *d;
	994
	995	PERL_ARGS_ASSERT_UTF8_TO_BYTES;
	996
	997	/* ensure valid UTF-8 and chars < 256 before updating string */
	998	while (s < send) {
	999	U8 c = *s++;
	1000
	1001	if (!UTF8_IS_INVARIANT(c) &&
	1002	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	1003	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	1004	*len = ((STRLEN) -1);
	1005	return 0;
	1006	}
	1007	}
	1008
	1009	d = s = save;
	1010	while (s < send) {
	1011	STRLEN ulen;
	1012	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	1013	s += ulen;
	1014	}
	1015	*d = '\0';
	1016	*len = d - save;
	1017	return save;
	1018	}
	1019
	1020	/*
	1021	=for apidoc bytes_from_utf8
	1022
	1023	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	1024	Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	1025	the newly-created string, and updates C<len> to contain the new
	1026	length. Returns the original string if no conversion occurs, C<len>
	1027	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	1028	0 if C<s> is converted or consisted entirely of characters that are invariant
	1029	in utf8 (i.e., US-ASCII on non-EBCDIC machines).
	1030
	1031	=cut
	1032	*/
	1033
	1034	U8 *
	1035	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN len, bool *is_utf8)
	1036	{
	1037	U8 *d;
	1038	const U8 *start = s;
	1039	const U8 *send;
	1040	I32 count = 0;
	1041
	1042	PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
	1043
	1044	PERL_UNUSED_CONTEXT;
	1045	if (!*is_utf8)
	1046	return (U8 *)start;
	1047
	1048	/* ensure valid UTF-8 and chars < 256 before converting string */
	1049	for (send = s + *len; s < send;) {
	1050	U8 c = *s++;
	1051	if (!UTF8_IS_INVARIANT(c)) {
	1052	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	1053	(c = *s++) && UTF8_IS_CONTINUATION(c))
	1054	count++;
	1055	else
	1056	return (U8 *)start;
	1057	}
	1058	}
	1059
	1060	*is_utf8 = FALSE;
	1061
	1062	Newx(d, (*len) - count + 1, U8);
	1063	s = start; start = d;
	1064	while (s < send) {
	1065	U8 c = *s++;
	1066	if (!UTF8_IS_INVARIANT(c)) {
	1067	/* Then it is two-byte encoded */
	1068	c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++));
	1069	}
	1070	*d++ = c;
	1071	}
	1072	*d = '\0';
	1073	*len = d - start;
	1074	return (U8 *)start;
	1075	}
	1076
	1077	/*
	1078	=for apidoc bytes_to_utf8
	1079
	1080	Converts a string C<s> of length C<len> bytes from the native encoding into
	1081	UTF-8.
	1082	Returns a pointer to the newly-created string, and sets C<len> to
	1083	reflect the new length in bytes.
	1084
	1085	A NUL character will be written after the end of the string.
	1086
	1087	If you want to convert to UTF-8 from encodings other than
	1088	the native (Latin1 or EBCDIC),
	1089	see sv_recode_to_utf8().
	1090
	1091	=cut
	1092	*/
	1093
	1094	U8*
	1095	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN len)
	1096	{
	1097	const U8 * const send = s + (*len);
	1098	U8 *d;
	1099	U8 *dst;
	1100
	1101	PERL_ARGS_ASSERT_BYTES_TO_UTF8;
	1102	PERL_UNUSED_CONTEXT;
	1103
	1104	Newx(d, (len) 2 + 1, U8);
	1105	dst = d;
	1106
	1107	while (s < send) {
	1108	const UV uv = NATIVE_TO_ASCII(*s++);
	1109	if (UNI_IS_INVARIANT(uv))
	1110	*d++ = (U8)UTF_TO_NATIVE(uv);
	1111	else {
	1112	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
	1113	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
	1114	}
	1115	}
	1116	*d = '\0';
	1117	*len = d-dst;
	1118	return dst;
	1119	}
	1120
	1121	/*
	1122	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	1123	*
	1124	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	1125	* We optimize for native, for obvious reasons. */
	1126
	1127	U8*
	1128	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1129	{
	1130	U8* pend;
	1131	U8* dstart = d;
	1132
	1133	PERL_ARGS_ASSERT_UTF16_TO_UTF8;
	1134
	1135	if (bytelen & 1)
	1136	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
	1137
	1138	pend = p + bytelen;
	1139
	1140	while (p < pend) {
	1141	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	1142	p += 2;
	1143	if (uv < 0x80) {
	1144	#ifdef EBCDIC
	1145	*d++ = UNI_TO_NATIVE(uv);
	1146	#else
	1147	*d++ = (U8)uv;
	1148	#endif
	1149	continue;
	1150	}
	1151	if (uv < 0x800) {
	1152	*d++ = (U8)(( uv >> 6) \| 0xc0);
	1153	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1154	continue;
	1155	}
	1156	if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */
	1157	if (p >= pend) {
	1158	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1159	} else {
	1160	UV low = (p[0] << 8) + p[1];
	1161	p += 2;
	1162	if (low < 0xdc00 \|\| low > 0xdfff)
	1163	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1164	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	1165	}
	1166	} else if (uv >= 0xdc00 && uv <= 0xdfff) {
	1167	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1168	}
	1169	if (uv < 0x10000) {
	1170	*d++ = (U8)(( uv >> 12) \| 0xe0);
	1171	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1172	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1173	continue;
	1174	}
	1175	else {
	1176	*d++ = (U8)(( uv >> 18) \| 0xf0);
	1177	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	1178	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1179	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1180	continue;
	1181	}
	1182	}
	1183	*newlen = d - dstart;
	1184	return d;
	1185	}
	1186
	1187	/* Note: this one is slightly destructive of the source. */
	1188
	1189	U8*
	1190	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1191	{
	1192	U8* s = (U8*)p;
	1193	U8* const send = s + bytelen;
	1194
	1195	PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
	1196
	1197	if (bytelen & 1)
	1198	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
	1199	(UV)bytelen);
	1200
	1201	while (s < send) {
	1202	const U8 tmp = s[0];
	1203	s[0] = s[1];
	1204	s[1] = tmp;
	1205	s += 2;
	1206	}
	1207	return utf16_to_utf8(p, d, bytelen, newlen);
	1208	}
	1209
	1210	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	1211
	1212	bool
	1213	Perl_is_uni_alnum(pTHX_ UV c)
	1214	{
	1215	U8 tmpbuf[UTF8_MAXBYTES+1];
	1216	uvchr_to_utf8(tmpbuf, c);
	1217	return is_utf8_alnum(tmpbuf);
	1218	}
	1219
	1220	bool
	1221	Perl_is_uni_idfirst(pTHX_ UV c)
	1222	{
	1223	U8 tmpbuf[UTF8_MAXBYTES+1];
	1224	uvchr_to_utf8(tmpbuf, c);
	1225	return is_utf8_idfirst(tmpbuf);
	1226	}
	1227
	1228	bool
	1229	Perl_is_uni_alpha(pTHX_ UV c)
	1230	{
	1231	U8 tmpbuf[UTF8_MAXBYTES+1];
	1232	uvchr_to_utf8(tmpbuf, c);
	1233	return is_utf8_alpha(tmpbuf);
	1234	}
	1235
	1236	bool
	1237	Perl_is_uni_ascii(pTHX_ UV c)
	1238	{
	1239	U8 tmpbuf[UTF8_MAXBYTES+1];
	1240	uvchr_to_utf8(tmpbuf, c);
	1241	return is_utf8_ascii(tmpbuf);
	1242	}
	1243
	1244	bool
	1245	Perl_is_uni_space(pTHX_ UV c)
	1246	{
	1247	U8 tmpbuf[UTF8_MAXBYTES+1];
	1248	uvchr_to_utf8(tmpbuf, c);
	1249	return is_utf8_space(tmpbuf);
	1250	}
	1251
	1252	bool
	1253	Perl_is_uni_digit(pTHX_ UV c)
	1254	{
	1255	U8 tmpbuf[UTF8_MAXBYTES+1];
	1256	uvchr_to_utf8(tmpbuf, c);
	1257	return is_utf8_digit(tmpbuf);
	1258	}
	1259
	1260	bool
	1261	Perl_is_uni_upper(pTHX_ UV c)
	1262	{
	1263	U8 tmpbuf[UTF8_MAXBYTES+1];
	1264	uvchr_to_utf8(tmpbuf, c);
	1265	return is_utf8_upper(tmpbuf);
	1266	}
	1267
	1268	bool
	1269	Perl_is_uni_lower(pTHX_ UV c)
	1270	{
	1271	U8 tmpbuf[UTF8_MAXBYTES+1];
	1272	uvchr_to_utf8(tmpbuf, c);
	1273	return is_utf8_lower(tmpbuf);
	1274	}
	1275
	1276	bool
	1277	Perl_is_uni_cntrl(pTHX_ UV c)
	1278	{
	1279	U8 tmpbuf[UTF8_MAXBYTES+1];
	1280	uvchr_to_utf8(tmpbuf, c);
	1281	return is_utf8_cntrl(tmpbuf);
	1282	}
	1283
	1284	bool
	1285	Perl_is_uni_graph(pTHX_ UV c)
	1286	{
	1287	U8 tmpbuf[UTF8_MAXBYTES+1];
	1288	uvchr_to_utf8(tmpbuf, c);
	1289	return is_utf8_graph(tmpbuf);
	1290	}
	1291
	1292	bool
	1293	Perl_is_uni_print(pTHX_ UV c)
	1294	{
	1295	U8 tmpbuf[UTF8_MAXBYTES+1];
	1296	uvchr_to_utf8(tmpbuf, c);
	1297	return is_utf8_print(tmpbuf);
	1298	}
	1299
	1300	bool
	1301	Perl_is_uni_punct(pTHX_ UV c)
	1302	{
	1303	U8 tmpbuf[UTF8_MAXBYTES+1];
	1304	uvchr_to_utf8(tmpbuf, c);
	1305	return is_utf8_punct(tmpbuf);
	1306	}
	1307
	1308	bool
	1309	Perl_is_uni_xdigit(pTHX_ UV c)
	1310	{
	1311	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1312	uvchr_to_utf8(tmpbuf, c);
	1313	return is_utf8_xdigit(tmpbuf);
	1314	}
	1315
	1316	UV
	1317	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	1318	{
	1319	PERL_ARGS_ASSERT_TO_UNI_UPPER;
	1320
	1321	uvchr_to_utf8(p, c);
	1322	return to_utf8_upper(p, p, lenp);
	1323	}
	1324
	1325	UV
	1326	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	1327	{
	1328	PERL_ARGS_ASSERT_TO_UNI_TITLE;
	1329
	1330	uvchr_to_utf8(p, c);
	1331	return to_utf8_title(p, p, lenp);
	1332	}
	1333
	1334	UV
	1335	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1336	{
	1337	PERL_ARGS_ASSERT_TO_UNI_LOWER;
	1338
	1339	uvchr_to_utf8(p, c);
	1340	return to_utf8_lower(p, p, lenp);
	1341	}
	1342
	1343	UV
	1344	Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
	1345	{
	1346	PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
	1347
	1348	uvchr_to_utf8(p, c);
	1349	return _to_utf8_fold_flags(p, p, lenp, flags);
	1350	}
	1351
	1352	/* for now these all assume no locale info available for Unicode > 255 */
	1353
	1354	bool
	1355	Perl_is_uni_alnum_lc(pTHX_ UV c)
	1356	{
	1357	return is_uni_alnum(c); /* XXX no locale support yet */
	1358	}
	1359
	1360	bool
	1361	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1362	{
	1363	return is_uni_idfirst(c); /* XXX no locale support yet */
	1364	}
	1365
	1366	bool
	1367	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1368	{
	1369	return is_uni_alpha(c); /* XXX no locale support yet */
	1370	}
	1371
	1372	bool
	1373	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1374	{
	1375	return is_uni_ascii(c); /* XXX no locale support yet */
	1376	}
	1377
	1378	bool
	1379	Perl_is_uni_space_lc(pTHX_ UV c)
	1380	{
	1381	return is_uni_space(c); /* XXX no locale support yet */
	1382	}
	1383
	1384	bool
	1385	Perl_is_uni_digit_lc(pTHX_ UV c)
	1386	{
	1387	return is_uni_digit(c); /* XXX no locale support yet */
	1388	}
	1389
	1390	bool
	1391	Perl_is_uni_upper_lc(pTHX_ UV c)
	1392	{
	1393	return is_uni_upper(c); /* XXX no locale support yet */
	1394	}
	1395
	1396	bool
	1397	Perl_is_uni_lower_lc(pTHX_ UV c)
	1398	{
	1399	return is_uni_lower(c); /* XXX no locale support yet */
	1400	}
	1401
	1402	bool
	1403	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1404	{
	1405	return is_uni_cntrl(c); /* XXX no locale support yet */
	1406	}
	1407
	1408	bool
	1409	Perl_is_uni_graph_lc(pTHX_ UV c)
	1410	{
	1411	return is_uni_graph(c); /* XXX no locale support yet */
	1412	}
	1413
	1414	bool
	1415	Perl_is_uni_print_lc(pTHX_ UV c)
	1416	{
	1417	return is_uni_print(c); /* XXX no locale support yet */
	1418	}
	1419
	1420	bool
	1421	Perl_is_uni_punct_lc(pTHX_ UV c)
	1422	{
	1423	return is_uni_punct(c); /* XXX no locale support yet */
	1424	}
	1425
	1426	bool
	1427	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1428	{
	1429	return is_uni_xdigit(c); /* XXX no locale support yet */
	1430	}
	1431
	1432	U32
	1433	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1434	{
	1435	/* XXX returns only the first character -- do not use XXX */
	1436	/* XXX no locale support yet */
	1437	STRLEN len;
	1438	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1439	return (U32)to_uni_upper(c, tmpbuf, &len);
	1440	}
	1441
	1442	U32
	1443	Perl_to_uni_title_lc(pTHX_ U32 c)
	1444	{
	1445	/* XXX returns only the first character XXX -- do not use XXX */
	1446	/* XXX no locale support yet */
	1447	STRLEN len;
	1448	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1449	return (U32)to_uni_title(c, tmpbuf, &len);
	1450	}
	1451
	1452	U32
	1453	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1454	{
	1455	/* XXX returns only the first character -- do not use XXX */
	1456	/* XXX no locale support yet */
	1457	STRLEN len;
	1458	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1459	return (U32)to_uni_lower(c, tmpbuf, &len);
	1460	}
	1461
	1462	static bool
	1463	S_is_utf8_common(pTHX_ const U8 const p, SV *swash,
	1464	const char *const swashname)
	1465	{
	1466	dVAR;
	1467
	1468	PERL_ARGS_ASSERT_IS_UTF8_COMMON;
	1469
	1470	if (!is_utf8_char(p))
	1471	return FALSE;
	1472	if (!*swash)
	1473	*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
	1474	return swash_fetch(*swash, p, TRUE) != 0;
	1475	}
	1476
	1477	bool
	1478	Perl_is_utf8_alnum(pTHX_ const U8 *p)
	1479	{
	1480	dVAR;
	1481
	1482	PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
	1483
	1484	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1485	* descendant of isalnum(3), in other words, it doesn't
	1486	* contain the '_'. --jhi */
	1487	return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
	1488	}
	1489
	1490	bool
	1491	Perl_is_utf8_idfirst(pTHX_ const U8 p) / The naming is historical. */
	1492	{
	1493	dVAR;
	1494
	1495	PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
	1496
	1497	if (*p == '_')
	1498	return TRUE;
	1499	/* is_utf8_idstart would be more logical. */
	1500	return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
	1501	}
	1502
	1503	bool
	1504	Perl_is_utf8_xidfirst(pTHX_ const U8 p) / The naming is historical. */
	1505	{
	1506	dVAR;
	1507
	1508	PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
	1509
	1510	if (*p == '_')
	1511	return TRUE;
	1512	/* is_utf8_idstart would be more logical. */
	1513	return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
	1514	}
	1515
	1516	bool
	1517	Perl_is_utf8_idcont(pTHX_ const U8 *p)
	1518	{
	1519	dVAR;
	1520
	1521	PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
	1522
	1523	if (*p == '_')
	1524	return TRUE;
	1525	return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
	1526	}
	1527
	1528	bool
	1529	Perl_is_utf8_xidcont(pTHX_ const U8 *p)
	1530	{
	1531	dVAR;
	1532
	1533	PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
	1534
	1535	if (*p == '_')
	1536	return TRUE;
	1537	return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
	1538	}
	1539
	1540	bool
	1541	Perl_is_utf8_alpha(pTHX_ const U8 *p)
	1542	{
	1543	dVAR;
	1544
	1545	PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
	1546
	1547	return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
	1548	}
	1549
	1550	bool
	1551	Perl_is_utf8_ascii(pTHX_ const U8 *p)
	1552	{
	1553	dVAR;
	1554
	1555	PERL_ARGS_ASSERT_IS_UTF8_ASCII;
	1556
	1557	return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
	1558	}
	1559
	1560	bool
	1561	Perl_is_utf8_space(pTHX_ const U8 *p)
	1562	{
	1563	dVAR;
	1564
	1565	PERL_ARGS_ASSERT_IS_UTF8_SPACE;
	1566
	1567	return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
	1568	}
	1569
	1570	bool
	1571	Perl_is_utf8_perl_space(pTHX_ const U8 *p)
	1572	{
	1573	dVAR;
	1574
	1575	PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
	1576
	1577	return is_utf8_common(p, &PL_utf8_perl_space, "IsPerlSpace");
	1578	}
	1579
	1580	bool
	1581	Perl_is_utf8_perl_word(pTHX_ const U8 *p)
	1582	{
	1583	dVAR;
	1584
	1585	PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
	1586
	1587	return is_utf8_common(p, &PL_utf8_perl_word, "IsPerlWord");
	1588	}
	1589
	1590	bool
	1591	Perl_is_utf8_digit(pTHX_ const U8 *p)
	1592	{
	1593	dVAR;
	1594
	1595	PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
	1596
	1597	return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
	1598	}
	1599
	1600	bool
	1601	Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
	1602	{
	1603	dVAR;
	1604
	1605	PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
	1606
	1607	return is_utf8_common(p, &PL_utf8_posix_digit, "IsPosixDigit");
	1608	}
	1609
	1610	bool
	1611	Perl_is_utf8_upper(pTHX_ const U8 *p)
	1612	{
	1613	dVAR;
	1614
	1615	PERL_ARGS_ASSERT_IS_UTF8_UPPER;
	1616
	1617	return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
	1618	}
	1619
	1620	bool
	1621	Perl_is_utf8_lower(pTHX_ const U8 *p)
	1622	{
	1623	dVAR;
	1624
	1625	PERL_ARGS_ASSERT_IS_UTF8_LOWER;
	1626
	1627	return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
	1628	}
	1629
	1630	bool
	1631	Perl_is_utf8_cntrl(pTHX_ const U8 *p)
	1632	{
	1633	dVAR;
	1634
	1635	PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
	1636
	1637	return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
	1638	}
	1639
	1640	bool
	1641	Perl_is_utf8_graph(pTHX_ const U8 *p)
	1642	{
	1643	dVAR;
	1644
	1645	PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
	1646
	1647	return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
	1648	}
	1649
	1650	bool
	1651	Perl_is_utf8_print(pTHX_ const U8 *p)
	1652	{
	1653	dVAR;
	1654
	1655	PERL_ARGS_ASSERT_IS_UTF8_PRINT;
	1656
	1657	return is_utf8_common(p, &PL_utf8_print, "IsPrint");
	1658	}
	1659
	1660	bool
	1661	Perl_is_utf8_punct(pTHX_ const U8 *p)
	1662	{
	1663	dVAR;
	1664
	1665	PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
	1666
	1667	return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
	1668	}
	1669
	1670	bool
	1671	Perl_is_utf8_xdigit(pTHX_ const U8 *p)
	1672	{
	1673	dVAR;
	1674
	1675	PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
	1676
	1677	return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
	1678	}
	1679
	1680	bool
	1681	Perl_is_utf8_mark(pTHX_ const U8 *p)
	1682	{
	1683	dVAR;
	1684
	1685	PERL_ARGS_ASSERT_IS_UTF8_MARK;
	1686
	1687	return is_utf8_common(p, &PL_utf8_mark, "IsM");
	1688	}
	1689
	1690	bool
	1691	Perl_is_utf8_X_begin(pTHX_ const U8 *p)
	1692	{
	1693	dVAR;
	1694
	1695	PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
	1696
	1697	return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
	1698	}
	1699
	1700	bool
	1701	Perl_is_utf8_X_extend(pTHX_ const U8 *p)
	1702	{
	1703	dVAR;
	1704
	1705	PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
	1706
	1707	return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
	1708	}
	1709
	1710	bool
	1711	Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
	1712	{
	1713	dVAR;
	1714
	1715	PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
	1716
	1717	return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
	1718	}
	1719
	1720	bool
	1721	Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
	1722	{
	1723	dVAR;
	1724
	1725	PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
	1726
	1727	return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
	1728	}
	1729
	1730	bool
	1731	Perl_is_utf8_X_L(pTHX_ const U8 *p)
	1732	{
	1733	dVAR;
	1734
	1735	PERL_ARGS_ASSERT_IS_UTF8_X_L;
	1736
	1737	return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
	1738	}
	1739
	1740	bool
	1741	Perl_is_utf8_X_LV(pTHX_ const U8 *p)
	1742	{
	1743	dVAR;
	1744
	1745	PERL_ARGS_ASSERT_IS_UTF8_X_LV;
	1746
	1747	return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
	1748	}
	1749
	1750	bool
	1751	Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
	1752	{
	1753	dVAR;
	1754
	1755	PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
	1756
	1757	return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
	1758	}
	1759
	1760	bool
	1761	Perl_is_utf8_X_T(pTHX_ const U8 *p)
	1762	{
	1763	dVAR;
	1764
	1765	PERL_ARGS_ASSERT_IS_UTF8_X_T;
	1766
	1767	return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
	1768	}
	1769
	1770	bool
	1771	Perl_is_utf8_X_V(pTHX_ const U8 *p)
	1772	{
	1773	dVAR;
	1774
	1775	PERL_ARGS_ASSERT_IS_UTF8_X_V;
	1776
	1777	return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
	1778	}
	1779
	1780	bool
	1781	Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
	1782	{
	1783	dVAR;
	1784
	1785	PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
	1786
	1787	return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
	1788	}
	1789
	1790	/*
	1791	=for apidoc to_utf8_case
	1792
	1793	The "p" contains the pointer to the UTF-8 string encoding
	1794	the character that is being converted.
	1795
	1796	The "ustrp" is a pointer to the character buffer to put the
	1797	conversion result to. The "lenp" is a pointer to the length
	1798	of the result.
	1799
	1800	The "swashp" is a pointer to the swash to use.
	1801
	1802	Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
	1803	and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
	1804	but not always, a multicharacter mapping), is tried first.
	1805
	1806	The "special" is a string like "utf8::ToSpecLower", which means the
	1807	hash %utf8::ToSpecLower. The access to the hash is through
	1808	Perl_to_utf8_case().
	1809
	1810	The "normal" is a string like "ToLower" which means the swash
	1811	%utf8::ToLower.
	1812
	1813	=cut */
	1814
	1815	UV
	1816	Perl_to_utf8_case(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp,
	1817	SV *swashp, const char normal, const char *special)
	1818	{
	1819	dVAR;
	1820	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1821	STRLEN len = 0;
	1822	const UV uv0 = utf8_to_uvchr(p, NULL);
	1823	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
	1824	* are necessary in EBCDIC, they are redundant no-ops
	1825	* in ASCII-ish platforms, and hopefully optimized away. */
	1826	const UV uv1 = NATIVE_TO_UNI(uv0);
	1827
	1828	PERL_ARGS_ASSERT_TO_UTF8_CASE;
	1829
	1830	/* Note that swash_fetch() doesn't output warnings for these because it
	1831	* assumes we will */
	1832	if (uv1 >= UNICODE_SURROGATE_FIRST) {
	1833	if (uv1 <= UNICODE_SURROGATE_LAST) {
	1834	if (ckWARN_d(WARN_SURROGATE)) {
	1835	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	1836	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	1837	"Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
	1838	}
	1839	}
	1840	else if (UNICODE_IS_SUPER(uv1)) {
	1841	if (ckWARN_d(WARN_NON_UNICODE)) {
	1842	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	1843	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	1844	"Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
	1845	}
	1846	}
	1847
	1848	/* Note that non-characters are perfectly legal, so no warning should
	1849	* be given */
	1850	}
	1851
	1852	uvuni_to_utf8(tmpbuf, uv1);
	1853
	1854	if (!swashp) / load on-demand */
	1855	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	1856
	1857	if (special) {
	1858	/* It might be "special" (sometimes, but not always,
	1859	* a multicharacter mapping) */
	1860	HV * const hv = get_hv(special, 0);
	1861	SV **svp;
	1862
	1863	if (hv &&
	1864	(svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
	1865	(*svp)) {
	1866	const char *s;
	1867
	1868	s = SvPV_const(*svp, len);
	1869	if (len == 1)
	1870	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
	1871	else {
	1872	#ifdef EBCDIC
	1873	/* If we have EBCDIC we need to remap the characters
	1874	* since any characters in the low 256 are Unicode
	1875	* code points, not EBCDIC. */
	1876	U8 t = (U8)s, tend = t + len, d;
	1877
	1878	d = tmpbuf;
	1879	if (SvUTF8(*svp)) {
	1880	STRLEN tlen = 0;
	1881
	1882	while (t < tend) {
	1883	const UV c = utf8_to_uvchr(t, &tlen);
	1884	if (tlen > 0) {
	1885	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	1886	t += tlen;
	1887	}
	1888	else
	1889	break;
	1890	}
	1891	}
	1892	else {
	1893	while (t < tend) {
	1894	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	1895	t++;
	1896	}
	1897	}
	1898	len = d - tmpbuf;
	1899	Copy(tmpbuf, ustrp, len, U8);
	1900	#else
	1901	Copy(s, ustrp, len, U8);
	1902	#endif
	1903	}
	1904	}
	1905	}
	1906
	1907	if (!len && *swashp) {
	1908	const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
	1909
	1910	if (uv2) {
	1911	/* It was "normal" (a single character mapping). */
	1912	const UV uv3 = UNI_TO_NATIVE(uv2);
	1913	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	1914	}
	1915	}
	1916
	1917	if (!len) /* Neither: just copy. In other words, there was no mapping
	1918	defined, which means that the code point maps to itself */
	1919	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
	1920
	1921	if (lenp)
	1922	*lenp = len;
	1923
	1924	return len ? utf8_to_uvchr(ustrp, 0) : 0;
	1925	}
	1926
	1927	/*
	1928	=for apidoc to_utf8_upper
	1929
	1930	Convert the UTF-8 encoded character at p to its uppercase version and
	1931	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1932	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1933	the uppercase version may be longer than the original character.
	1934
	1935	The first character of the uppercased version is returned
	1936	(but note, as explained above, that there may be more.)
	1937
	1938	=cut */
	1939
	1940	UV
	1941	Perl_to_utf8_upper(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1942	{
	1943	dVAR;
	1944
	1945	PERL_ARGS_ASSERT_TO_UTF8_UPPER;
	1946
	1947	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1948	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
	1949	}
	1950
	1951	/*
	1952	=for apidoc to_utf8_title
	1953
	1954	Convert the UTF-8 encoded character at p to its titlecase version and
	1955	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1956	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1957	titlecase version may be longer than the original character.
	1958
	1959	The first character of the titlecased version is returned
	1960	(but note, as explained above, that there may be more.)
	1961
	1962	=cut */
	1963
	1964	UV
	1965	Perl_to_utf8_title(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1966	{
	1967	dVAR;
	1968
	1969	PERL_ARGS_ASSERT_TO_UTF8_TITLE;
	1970
	1971	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1972	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
	1973	}
	1974
	1975	/*
	1976	=for apidoc to_utf8_lower
	1977
	1978	Convert the UTF-8 encoded character at p to its lowercase version and
	1979	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1980	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1981	lowercase version may be longer than the original character.
	1982
	1983	The first character of the lowercased version is returned
	1984	(but note, as explained above, that there may be more.)
	1985
	1986	=cut */
	1987
	1988	UV
	1989	Perl_to_utf8_lower(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1990	{
	1991	dVAR;
	1992
	1993	PERL_ARGS_ASSERT_TO_UTF8_LOWER;
	1994
	1995	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1996	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
	1997	}
	1998
	1999	/*
	2000	=for apidoc to_utf8_fold
	2001
	2002	Convert the UTF-8 encoded character at p to its foldcase version and
	2003	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	2004	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	2005	foldcase version may be longer than the original character (up to
	2006	three characters).
	2007
	2008	The first character of the foldcased version is returned
	2009	(but note, as explained above, that there may be more.)
	2010
	2011	=cut */
	2012
	2013	/* Not currently externally documented is 'flags', which currently is non-zero
	2014	* if full case folds are to be used; otherwise simple folds */
	2015
	2016	UV
	2017	Perl__to_utf8_fold_flags(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp, U8 flags)
	2018	{
	2019	const char *specials = (flags) ? "utf8::ToSpecFold" : NULL;
	2020
	2021	dVAR;
	2022
	2023	PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
	2024
	2025	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	2026	&PL_utf8_tofold, "ToFold", specials);
	2027	}
	2028
	2029	/* Note:
	2030	* A "swash" is a swatch hash.
	2031	* A "swatch" is a bit vector generated by utf8.c:S_swash_get().
	2032	* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
	2033	* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
	2034	*/
	2035	SV*
	2036	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
	2037	{
	2038	dVAR;
	2039	SV* retval;
	2040	dSP;
	2041	const size_t pkg_len = strlen(pkg);
	2042	const size_t name_len = strlen(name);
	2043	HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
	2044	SV* errsv_save;
	2045	GV *method;
	2046
	2047	PERL_ARGS_ASSERT_SWASH_INIT;
	2048
	2049	PUSHSTACKi(PERLSI_MAGIC);
	2050	ENTER;
	2051	SAVEHINTS();
	2052	save_re_context();
	2053	method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
	2054	if (!method) { /* demand load utf8 */
	2055	ENTER;
	2056	errsv_save = newSVsv(ERRSV);
	2057	/* It is assumed that callers of this routine are not passing in any
	2058	user derived data. */
	2059	/* Need to do this after save_re_context() as it will set PL_tainted to
	2060	1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
	2061	Even line to create errsv_save can turn on PL_tainted. */
	2062	SAVEBOOL(PL_tainted);
	2063	PL_tainted = 0;
	2064	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	2065	NULL);
	2066	if (!SvTRUE(ERRSV))
	2067	sv_setsv(ERRSV, errsv_save);
	2068	SvREFCNT_dec(errsv_save);
	2069	LEAVE;
	2070	}
	2071	SPAGAIN;
	2072	PUSHMARK(SP);
	2073	EXTEND(SP,5);
	2074	mPUSHp(pkg, pkg_len);
	2075	mPUSHp(name, name_len);
	2076	PUSHs(listsv);
	2077	mPUSHi(minbits);
	2078	mPUSHi(none);
	2079	PUTBACK;
	2080	errsv_save = newSVsv(ERRSV);
	2081	/* If we already have a pointer to the method, no need to use call_method()
	2082	to repeat the lookup. */
	2083	if (method ? call_sv(MUTABLE_SV(method), G_SCALAR)
	2084	: call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR \| G_METHOD))
	2085	retval = newSVsv(*PL_stack_sp--);
	2086	else
	2087	retval = &PL_sv_undef;
	2088	if (!SvTRUE(ERRSV))
	2089	sv_setsv(ERRSV, errsv_save);
	2090	SvREFCNT_dec(errsv_save);
	2091	LEAVE;
	2092	POPSTACK;
	2093	if (IN_PERL_COMPILETIME) {
	2094	CopHINTS_set(PL_curcop, PL_hints);
	2095	}
	2096	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	2097	if (SvPOK(retval))
	2098	Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
	2099	SVfARG(retval));
	2100	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	2101	}
	2102	return retval;
	2103	}
	2104
	2105
	2106	/* This API is wrong for special case conversions since we may need to
	2107	* return several Unicode characters for a single Unicode character
	2108	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	2109	* the lower-level routine, and it is similarly broken for returning
	2110	* multiple values. --jhi
	2111	* For those, you should use to_utf8_case() instead */
	2112	/* SWASHGET has now been recast as S_swash_get in this file. */
	2113
	2114	/* Note:
	2115	* Returns the value of property/mapping C<swash> for the first character
	2116	* of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
	2117	* assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
	2118	* assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
	2119	*/
	2120	UV
	2121	Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
	2122	{
	2123	dVAR;
	2124	HV *const hv = MUTABLE_HV(SvRV(swash));
	2125	U32 klen;
	2126	U32 off;
	2127	STRLEN slen;
	2128	STRLEN needents;
	2129	const U8 *tmps = NULL;
	2130	U32 bit;
	2131	SV *swatch;
	2132	U8 tmputf8[2];
	2133	const UV c = NATIVE_TO_ASCII(*ptr);
	2134
	2135	PERL_ARGS_ASSERT_SWASH_FETCH;
	2136
	2137	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	2138	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
	2139	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
	2140	ptr = tmputf8;
	2141	}
	2142	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	2143	* then the "swatch" is a vec() for all the chars which start
	2144	* with 0xAA..0xYY
	2145	* So the key in the hash (klen) is length of encoded char -1
	2146	*/
	2147	klen = UTF8SKIP(ptr) - 1;
	2148	off = ptr[klen];
	2149
	2150	if (klen == 0) {
	2151	/* If char is invariant then swatch is for all the invariant chars
	2152	* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
	2153	*/
	2154	needents = UTF_CONTINUATION_MARK;
	2155	off = NATIVE_TO_UTF(ptr[klen]);
	2156	}
	2157	else {
	2158	/* If char is encoded then swatch is for the prefix */
	2159	needents = (1 << UTF_ACCUMULATION_SHIFT);
	2160	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	2161	if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_NON_UNICODE)) {
	2162	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
	2163
	2164	/* This outputs warnings for binary properties only, assuming that
	2165	* to_utf8_case() will output any. Also, surrogates aren't checked
	2166	* for, as that would warn on things like /\p{Gc=Cs}/ */
	2167	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2168	if (SvUV(*bitssvp) == 1) {
	2169	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	2170	"Code point 0x%04"UVXf" is not Unicode, no properties match it; all inverse properties do", code_point);
	2171	}
	2172	}
	2173	}
	2174
	2175	/*
	2176	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	2177	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	2178	* it's nothing to sniff at.) Pity we usually come through at least
	2179	* two function calls to get here...
	2180	*
	2181	* NB: this code assumes that swatches are never modified, once generated!
	2182	*/
	2183
	2184	if (hv == PL_last_swash_hv &&
	2185	klen == PL_last_swash_klen &&
	2186	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	2187	{
	2188	tmps = PL_last_swash_tmps;
	2189	slen = PL_last_swash_slen;
	2190	}
	2191	else {
	2192	/* Try our second-level swatch cache, kept in a hash. */
	2193	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	2194
	2195	/* If not cached, generate it via swash_get */
	2196	if (!svp \|\| !SvPOK(*svp)
	2197	\|\| !(tmps = (const U8)SvPV_const(svp, slen))) {
	2198	/* We use utf8n_to_uvuni() as we want an index into
	2199	Unicode tables, not a native character number.
	2200	*/
	2201	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
	2202	ckWARN(WARN_UTF8) ?
	2203	0 : UTF8_ALLOW_ANY);
	2204	swatch = swash_get(swash,
	2205	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	2206	(klen) ? (code_point & ~(needents - 1)) : 0,
	2207	needents);
	2208
	2209	if (IN_PERL_COMPILETIME)
	2210	CopHINTS_set(PL_curcop, PL_hints);
	2211
	2212	svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
	2213
	2214	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
	2215	\|\| (slen << 3) < needents)
	2216	Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
	2217	}
	2218
	2219	PL_last_swash_hv = hv;
	2220	assert(klen <= sizeof(PL_last_swash_key));
	2221	PL_last_swash_klen = (U8)klen;
	2222	/* FIXME change interpvar.h? */
	2223	PL_last_swash_tmps = (U8 *) tmps;
	2224	PL_last_swash_slen = slen;
	2225	if (klen)
	2226	Copy(ptr, PL_last_swash_key, klen, U8);
	2227	}
	2228
	2229	switch ((int)((slen << 3) / needents)) {
	2230	case 1:
	2231	bit = 1 << (off & 7);
	2232	off >>= 3;
	2233	return (tmps[off] & bit) != 0;
	2234	case 8:
	2235	return tmps[off];
	2236	case 16:
	2237	off <<= 1;
	2238	return (tmps[off] << 8) + tmps[off + 1] ;
	2239	case 32:
	2240	off <<= 2;
	2241	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	2242	}
	2243	Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
	2244	NORETURN_FUNCTION_END;
	2245	}
	2246
	2247	/* Read a single line of the main body of the swash input text. These are of
	2248	* the form:
	2249	* 0053 0056 0073
	2250	* where each number is hex. The first two numbers form the minimum and
	2251	* maximum of a range, and the third is the value associated with the range.
	2252	* Not all swashes should have a third number
	2253	*
	2254	* On input: l points to the beginning of the line to be examined; it points
	2255	* to somewhere in the string of the whole input text, and is
	2256	* terminated by a \n or the null string terminator.
	2257	* lend points to the null terminator of that string
	2258	* wants_value is non-zero if the swash expects a third number
	2259	* typestr is the name of the swash's mapping, like 'ToLower'
	2260	* On output: min, max, and *val are set to the values read from the line.
	2261	* returns a pointer just beyond the line examined. If there was no
	2262	* valid min number on the line, returns lend+1
	2263	*/
	2264
	2265	STATIC U8*
	2266	S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
	2267	const bool wants_value, const U8* const typestr)
	2268	{
	2269	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
	2270	STRLEN numlen; /* Length of the number */
	2271	I32 flags = PERL_SCAN_SILENT_ILLDIGIT
	2272	\| PERL_SCAN_DISALLOW_PREFIX
	2273	\| PERL_SCAN_SILENT_NON_PORTABLE;
	2274
	2275	/* nl points to the next \n in the scan */
	2276	U8* const nl = (U8*)memchr(l, '\n', lend - l);
	2277
	2278	/* Get the first number on the line: the range minimum */
	2279	numlen = lend - l;
	2280	min = grok_hex((char )l, &numlen, &flags, NULL);
	2281	if (numlen) /* If found a hex number, position past it */
	2282	l += numlen;
	2283	else if (nl) { /* Else, go handle next line, if any */
	2284	return nl + 1; /* 1 is length of "\n" */
	2285	}
	2286	else { /* Else, no next line */
	2287	return lend + 1; /* to LIST's end at which \n is not found */
	2288	}
	2289
	2290	/* The max range value follows, separated by a BLANK */
	2291	if (isBLANK(*l)) {
	2292	++l;
	2293	flags = PERL_SCAN_SILENT_ILLDIGIT
	2294	\| PERL_SCAN_DISALLOW_PREFIX
	2295	\| PERL_SCAN_SILENT_NON_PORTABLE;
	2296	numlen = lend - l;
	2297	max = grok_hex((char )l, &numlen, &flags, NULL);
	2298	if (numlen)
	2299	l += numlen;
	2300	else /* If no value here, it is a single element range */
	2301	max = min;
	2302
	2303	/* Non-binary tables have a third entry: what the first element of the
	2304	* range maps to */
	2305	if (wants_value) {
	2306	if (isBLANK(*l)) {
	2307	++l;
	2308	flags = PERL_SCAN_SILENT_ILLDIGIT
	2309	\| PERL_SCAN_DISALLOW_PREFIX
	2310	\| PERL_SCAN_SILENT_NON_PORTABLE;
	2311	numlen = lend - l;
	2312	val = grok_hex((char )l, &numlen, &flags, NULL);
	2313	if (numlen)
	2314	l += numlen;
	2315	else
	2316	*val = 0;
	2317	}
	2318	else {
	2319	*val = 0;
	2320	if (typeto) {
	2321	Perl_croak(aTHX_ "%s: illegal mapping '%s'",
	2322	typestr, l);
	2323	}
	2324	}
	2325	}
	2326	else
	2327	val = 0; / bits == 1, then any val should be ignored */
	2328	}
	2329	else { /* Nothing following range min, should be single element with no
	2330	mapping expected */
	2331	max = min;
	2332	if (wants_value) {
	2333	*val = 0;
	2334	if (typeto) {
	2335	Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
	2336	}
	2337	}
	2338	else
	2339	val = 0; / bits == 1, then val should be ignored */
	2340	}
	2341
	2342	/* Position to next line if any, or EOF */
	2343	if (nl)
	2344	l = nl + 1;
	2345	else
	2346	l = lend;
	2347
	2348	return l;
	2349	}
	2350
	2351	/* Note:
	2352	* Returns a swatch (a bit vector string) for a code point sequence
	2353	* that starts from the value C<start> and comprises the number C<span>.
	2354	* A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
	2355	* Should be used via swash_fetch, which will cache the swatch in C<swash>.
	2356	*/
	2357	STATIC SV*
	2358	S_swash_get(pTHX_ SV* swash, UV start, UV span)
	2359	{
	2360	SV *swatch;
	2361	U8 l, lend, x, xend, s, send;
	2362	STRLEN lcur, xcur, scur;
	2363	HV *const hv = MUTABLE_HV(SvRV(swash));
	2364
	2365	/* The string containing the main body of the table */
	2366	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	2367
	2368	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2369	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2370	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	2371	SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	2372	SV** const invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	2373	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	2374	const STRLEN bits = SvUV(*bitssvp);
	2375	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2376	const UV none = SvUV(*nonesvp);
	2377	const UV end = start + span;
	2378
	2379	PERL_ARGS_ASSERT_SWASH_GET;
	2380
	2381	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	2382	Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
	2383	(UV)bits);
	2384	}
	2385
	2386	/* create and initialize $swatch */
	2387	scur = octets ? (span * octets) : (span + 7) / 8;
	2388	swatch = newSV(scur);
	2389	SvPOK_on(swatch);
	2390	s = (U8*)SvPVX(swatch);
	2391	if (octets && none) {
	2392	const U8* const e = s + scur;
	2393	while (s < e) {
	2394	if (bits == 8)
	2395	*s++ = (U8)(none & 0xff);
	2396	else if (bits == 16) {
	2397	*s++ = (U8)((none >> 8) & 0xff);
	2398	*s++ = (U8)( none & 0xff);
	2399	}
	2400	else if (bits == 32) {
	2401	*s++ = (U8)((none >> 24) & 0xff);
	2402	*s++ = (U8)((none >> 16) & 0xff);
	2403	*s++ = (U8)((none >> 8) & 0xff);
	2404	*s++ = (U8)( none & 0xff);
	2405	}
	2406	}
	2407	*s = '\0';
	2408	}
	2409	else {
	2410	(void)memzero((U8*)s, scur + 1);
	2411	}
	2412	SvCUR_set(swatch, scur);
	2413	s = (U8*)SvPVX(swatch);
	2414
	2415	/* read $swash->{LIST} */
	2416	l = (U8)SvPV(listsvp, lcur);
	2417	lend = l + lcur;
	2418	while (l < lend) {
	2419	UV min, max, val;
	2420	l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
	2421	cBOOL(octets), typestr);
	2422	if (l > lend) {
	2423	break;
	2424	}
	2425
	2426	/* If looking for something beyond this range, go try the next one */
	2427	if (max < start)
	2428	continue;
	2429
	2430	if (octets) {
	2431	UV key;
	2432	if (min < start) {
	2433	if (!none \|\| val < none) {
	2434	val += start - min;
	2435	}
	2436	min = start;
	2437	}
	2438	for (key = min; key <= max; key++) {
	2439	STRLEN offset;
	2440	if (key >= end)
	2441	goto go_out_list;
	2442	/* offset must be non-negative (start <= min <= key < end) */
	2443	offset = octets * (key - start);
	2444	if (bits == 8)
	2445	s[offset] = (U8)(val & 0xff);
	2446	else if (bits == 16) {
	2447	s[offset ] = (U8)((val >> 8) & 0xff);
	2448	s[offset + 1] = (U8)( val & 0xff);
	2449	}
	2450	else if (bits == 32) {
	2451	s[offset ] = (U8)((val >> 24) & 0xff);
	2452	s[offset + 1] = (U8)((val >> 16) & 0xff);
	2453	s[offset + 2] = (U8)((val >> 8) & 0xff);
	2454	s[offset + 3] = (U8)( val & 0xff);
	2455	}
	2456
	2457	if (!none \|\| val < none)
	2458	++val;
	2459	}
	2460	}
	2461	else { /* bits == 1, then val should be ignored */
	2462	UV key;
	2463	if (min < start)
	2464	min = start;
	2465	for (key = min; key <= max; key++) {
	2466	const STRLEN offset = (STRLEN)(key - start);
	2467	if (key >= end)
	2468	goto go_out_list;
	2469	s[offset >> 3] \|= 1 << (offset & 7);
	2470	}
	2471	}
	2472	} /* while */
	2473	go_out_list:
	2474
	2475	/* Invert if the data says it should be */
	2476	if (invert_it_svp && SvUV(*invert_it_svp)) {
	2477	send = s + scur;
	2478	while (s < send) {
	2479	s = ~(s);
	2480	s++;
	2481	}
	2482	}
	2483
	2484	/* read $swash->{EXTRAS}
	2485	* This code also copied to swash_to_invlist() below */
	2486	x = (U8)SvPV(extssvp, xcur);
	2487	xend = x + xcur;
	2488	while (x < xend) {
	2489	STRLEN namelen;
	2490	U8 *namestr;
	2491	SV** othersvp;
	2492	HV* otherhv;
	2493	STRLEN otherbits;
	2494	SV *otherbitssvp, other;
	2495	U8 s, o, *nl;
	2496	STRLEN slen, olen;
	2497
	2498	const U8 opc = *x++;
	2499	if (opc == '\n')
	2500	continue;
	2501
	2502	nl = (U8*)memchr(x, '\n', xend - x);
	2503
	2504	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	2505	if (nl) {
	2506	x = nl + 1; /* 1 is length of "\n" */
	2507	continue;
	2508	}
	2509	else {
	2510	x = xend; /* to EXTRAS' end at which \n is not found */
	2511	break;
	2512	}
	2513	}
	2514
	2515	namestr = x;
	2516	if (nl) {
	2517	namelen = nl - namestr;
	2518	x = nl + 1;
	2519	}
	2520	else {
	2521	namelen = xend - namestr;
	2522	x = xend;
	2523	}
	2524
	2525	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	2526	otherhv = MUTABLE_HV(SvRV(*othersvp));
	2527	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	2528	otherbits = (STRLEN)SvUV(*otherbitssvp);
	2529	if (bits < otherbits)
	2530	Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
	2531
	2532	/* The "other" swatch must be destroyed after. */
	2533	other = swash_get(*othersvp, start, span);
	2534	o = (U8*)SvPV(other, olen);
	2535
	2536	if (!olen)
	2537	Perl_croak(aTHX_ "panic: swash_get got improper swatch");
	2538
	2539	s = (U8*)SvPV(swatch, slen);
	2540	if (bits == 1 && otherbits == 1) {
	2541	if (slen != olen)
	2542	Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
	2543
	2544	switch (opc) {
	2545	case '+':
	2546	while (slen--)
	2547	s++ \|= o++;
	2548	break;
	2549	case '!':
	2550	while (slen--)
	2551	s++ \|= ~o++;
	2552	break;
	2553	case '-':
	2554	while (slen--)
	2555	s++ &= ~o++;
	2556	break;
	2557	case '&':
	2558	while (slen--)
	2559	s++ &= o++;
	2560	break;
	2561	default:
	2562	break;
	2563	}
	2564	}
	2565	else {
	2566	STRLEN otheroctets = otherbits >> 3;
	2567	STRLEN offset = 0;
	2568	U8* const send = s + slen;
	2569
	2570	while (s < send) {
	2571	UV otherval = 0;
	2572
	2573	if (otherbits == 1) {
	2574	otherval = (o[offset >> 3] >> (offset & 7)) & 1;
	2575	++offset;
	2576	}
	2577	else {
	2578	STRLEN vlen = otheroctets;
	2579	otherval = *o++;
	2580	while (--vlen) {
	2581	otherval <<= 8;
	2582	otherval \|= *o++;
	2583	}
	2584	}
	2585
	2586	if (opc == '+' && otherval)
	2587	NOOP; /* replace with otherval */
	2588	else if (opc == '!' && !otherval)
	2589	otherval = 1;
	2590	else if (opc == '-' && otherval)
	2591	otherval = 0;
	2592	else if (opc == '&' && !otherval)
	2593	otherval = 0;
	2594	else {
	2595	s += octets; /* no replacement */
	2596	continue;
	2597	}
	2598
	2599	if (bits == 8)
	2600	*s++ = (U8)( otherval & 0xff);
	2601	else if (bits == 16) {
	2602	*s++ = (U8)((otherval >> 8) & 0xff);
	2603	*s++ = (U8)( otherval & 0xff);
	2604	}
	2605	else if (bits == 32) {
	2606	*s++ = (U8)((otherval >> 24) & 0xff);
	2607	*s++ = (U8)((otherval >> 16) & 0xff);
	2608	*s++ = (U8)((otherval >> 8) & 0xff);
	2609	*s++ = (U8)( otherval & 0xff);
	2610	}
	2611	}
	2612	}
	2613	sv_free(other); /* through with it! */
	2614	} /* while */
	2615	return swatch;
	2616	}
	2617
	2618	HV*
	2619	Perl__swash_inversion_hash(pTHX_ SV* const swash)
	2620	{
	2621
	2622	/* Subject to change or removal. For use only in one place in regcomp.c.
	2623	* Can't be used on a property that is subject to user override, as it
	2624	* relies on the value of SPECIALS in the swash which would be set by
	2625	* utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
	2626	* for overridden properties
	2627	*
	2628	* Returns a hash which is the inversion and closure of a swash mapping.
	2629	* For example, consider the input lines:
	2630	* 004B 006B
	2631	* 004C 006C
	2632	* 212A 006B
	2633	*
	2634	* The returned hash would have two keys, the utf8 for 006B and the utf8 for
	2635	* 006C. The value for each key is an array. For 006C, the array would
	2636	* have a two elements, the utf8 for itself, and for 004C. For 006B, there
	2637	* would be three elements in its array, the utf8 for 006B, 004B and 212A.
	2638	*
	2639	* Essentially, for any code point, it gives all the code points that map to
	2640	* it, or the list of 'froms' for that point.
	2641	*
	2642	* Currently it ignores any additions or deletions from other swashes,
	2643	* looking at just the main body of the swash, and if there are SPECIALS
	2644	* in the swash, at that hash
	2645	*
	2646	* The specials hash can be extra code points, and most likely consists of
	2647	* maps from single code points to multiple ones (each expressed as a string
	2648	* of utf8 characters). This function currently returns only 1-1 mappings.
	2649	* However consider this possible input in the specials hash:
	2650	* "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074
	2651	* "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074
	2652	*
	2653	* Both FB05 and FB06 map to the same multi-char sequence, which we don't
	2654	* currently handle. But it also means that FB05 and FB06 are equivalent in
	2655	* a 1-1 mapping which we should handle, and this relationship may not be in
	2656	* the main table. Therefore this function examines all the multi-char
	2657	* sequences and adds the 1-1 mappings that come out of that. */
	2658
	2659	U8 l, lend;
	2660	STRLEN lcur;
	2661	HV *const hv = MUTABLE_HV(SvRV(swash));
	2662
	2663	/* The string containing the main body of the table */
	2664	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	2665
	2666	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2667	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2668	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	2669	/SV* const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
	2670	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	2671	const STRLEN bits = SvUV(*bitssvp);
	2672	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2673	const UV none = SvUV(*nonesvp);
	2674	SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
	2675
	2676	HV* ret = newHV();
	2677
	2678	PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
	2679
	2680	/* Must have at least 8 bits to get the mappings */
	2681	if (bits != 8 && bits != 16 && bits != 32) {
	2682	Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
	2683	(UV)bits);
	2684	}
	2685
	2686	if (specials_p) { /* It might be "special" (sometimes, but not always, a
	2687	mapping to more than one character */
	2688
	2689	/* Construct an inverse mapping hash for the specials */
	2690	HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
	2691	HV * specials_inverse = newHV();
	2692	char char_from; / the lhs of the map */
	2693	I32 from_len; /* its byte length */
	2694	char char_to; / the rhs of the map */
	2695	I32 to_len; /* its byte length */
	2696	SV sv_to; / and in a sv */
	2697	AV* from_list; /* list of things that map to each 'to' */
	2698
	2699	hv_iterinit(specials_hv);
	2700
	2701	/* The keys are the characters (in utf8) that map to the corresponding
	2702	* utf8 string value. Iterate through the list creating the inverse
	2703	* list. */
	2704	while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
	2705	SV** listp;
	2706	if (! SvPOK(sv_to)) {
	2707	Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() unexpectedly is not a string");
	2708	}
	2709	/DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", utf8_to_uvchr((U8) char_from, 0), utf8_to_uvchr((U8) SvPVX(sv_to), 0)));/
	2710
	2711	/* Each key in the inverse list is a mapped-to value, and the key's
	2712	* hash value is a list of the strings (each in utf8) that map to
	2713	* it. Those strings are all one character long */
	2714	if ((listp = hv_fetch(specials_inverse,
	2715	SvPVX(sv_to),
	2716	SvCUR(sv_to), 0)))
	2717	{
	2718	from_list = (AV) listp;
	2719	}
	2720	else { /* No entry yet for it: create one */
	2721	from_list = newAV();
	2722	if (! hv_store(specials_inverse,
	2723	SvPVX(sv_to),
	2724	SvCUR(sv_to),
	2725	(SV*) from_list, 0))
	2726	{
	2727	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	2728	}
	2729	}
	2730
	2731	/* Here have the list associated with this 'to' (perhaps newly
	2732	* created and empty). Just add to it. Note that we ASSUME that
	2733	* the input is guaranteed to not have duplications, so we don't
	2734	* check for that. Duplications just slow down execution time. */
	2735	av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
	2736	}
	2737
	2738	/* Here, 'specials_inverse' contains the inverse mapping. Go through
	2739	* it looking for cases like the FB05/FB06 examples above. There would
	2740	* be an entry in the hash like
	2741	* 'st' => [ FB05, FB06 ]
	2742	* In this example we will create two lists that get stored in the
	2743	* returned hash, 'ret':
	2744	* FB05 => [ FB05, FB06 ]
	2745	* FB06 => [ FB05, FB06 ]
	2746	*
	2747	* Note that there is nothing to do if the array only has one element.
	2748	* (In the normal 1-1 case handled below, we don't have to worry about
	2749	* two lists, as everything gets tied to the single list that is
	2750	* generated for the single character 'to'. But here, we are omitting
	2751	* that list, ('st' in the example), so must have multiple lists.) */
	2752	while ((from_list = (AV *) hv_iternextsv(specials_inverse,
	2753	&char_to, &to_len)))
	2754	{
	2755	if (av_len(from_list) > 0) {
	2756	int i;
	2757
	2758	/* We iterate over all combinations of i,j to place each code
	2759	* point on each list */
	2760	for (i = 0; i <= av_len(from_list); i++) {
	2761	int j;
	2762	AV* i_list = newAV();
	2763	SV** entryp = av_fetch(from_list, i, FALSE);
	2764	if (entryp == NULL) {
	2765	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	2766	}
	2767	if (hv_fetch(ret, SvPVX(entryp), SvCUR(entryp), FALSE)) {
	2768	Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
	2769	}
	2770	if (! hv_store(ret, SvPVX(entryp), SvCUR(entryp),
	2771	(SV*) i_list, FALSE))
	2772	{
	2773	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	2774	}
	2775
	2776	/* For debugging: UV u = utf8_to_uvchr((U8) SvPVX(entryp), 0);*/
	2777	for (j = 0; j <= av_len(from_list); j++) {
	2778	entryp = av_fetch(from_list, j, FALSE);
	2779	if (entryp == NULL) {
	2780	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	2781	}
	2782
	2783	/* When i==j this adds itself to the list */
	2784	av_push(i_list, newSVuv(utf8_to_uvchr(
	2785	(U8) SvPVX(entryp), 0)));
	2786	/DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", utf8_to_uvchr((U8) SvPVX(entryp), 0), u));/
	2787	}
	2788	}
	2789	}
	2790	}
	2791	SvREFCNT_dec(specials_inverse); /* done with it */
	2792	} /* End of specials */
	2793
	2794	/* read $swash->{LIST} */
	2795	l = (U8)SvPV(listsvp, lcur);
	2796	lend = l + lcur;
	2797
	2798	/* Go through each input line */
	2799	while (l < lend) {
	2800	UV min, max, val;
	2801	UV inverse;
	2802	l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
	2803	cBOOL(octets), typestr);
	2804	if (l > lend) {
	2805	break;
	2806	}
	2807
	2808	/* Each element in the range is to be inverted */
	2809	for (inverse = min; inverse <= max; inverse++) {
	2810	AV* list;
	2811	SV** listp;
	2812	IV i;
	2813	bool found_key = FALSE;
	2814	bool found_inverse = FALSE;
	2815
	2816	/* The key is the inverse mapping */
	2817	char key[UTF8_MAXBYTES+1];
	2818	char* key_end = (char ) uvuni_to_utf8((U8) key, val);
	2819	STRLEN key_len = key_end - key;
	2820
	2821	/* Get the list for the map */
	2822	if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
	2823	list = (AV) listp;
	2824	}
	2825	else { /* No entry yet for it: create one */
	2826	list = newAV();
	2827	if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
	2828	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	2829	}
	2830	}
	2831
	2832	/* Look through list to see if this inverse mapping already is
	2833	* listed, or if there is a mapping to itself already */
	2834	for (i = 0; i <= av_len(list); i++) {
	2835	SV** entryp = av_fetch(list, i, FALSE);
	2836	SV* entry;
	2837	if (entryp == NULL) {
	2838	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	2839	}
	2840	entry = *entryp;
	2841	/DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, SvUV(entry)));/
	2842	if (SvUV(entry) == val) {
	2843	found_key = TRUE;
	2844	}
	2845	if (SvUV(entry) == inverse) {
	2846	found_inverse = TRUE;
	2847	}
	2848
	2849	/* No need to continue searching if found everything we are
	2850	* looking for */
	2851	if (found_key && found_inverse) {
	2852	break;
	2853	}
	2854	}
	2855
	2856	/* Make sure there is a mapping to itself on the list */
	2857	if (! found_key) {
	2858	av_push(list, newSVuv(val));
	2859	/DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", val, val));/
	2860	}
	2861
	2862
	2863	/* Simply add the value to the list */
	2864	if (! found_inverse) {
	2865	av_push(list, newSVuv(inverse));
	2866	/DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", inverse, val));/
	2867	}
	2868
	2869	/* swash_get() increments the value of val for each element in the
	2870	* range. That makes more compact tables possible. You can
	2871	* express the capitalization, for example, of all consecutive
	2872	* letters with a single line: 0061\t007A\t0041 This maps 0061 to
	2873	* 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
	2874	* and it's not documented; it appears to be used only in
	2875	* implementing tr//; I copied the semantics from swash_get(), just
	2876	* in case */
	2877	if (!none \|\| val < none) {
	2878	++val;
	2879	}
	2880	}
	2881	}
	2882
	2883	return ret;
	2884	}
	2885
	2886	SV*
	2887	Perl__swash_to_invlist(pTHX_ SV* const swash)
	2888	{
	2889
	2890	/* Subject to change or removal. For use only in one place in regcomp.c */
	2891
	2892	U8 l, lend;
	2893	char *loc;
	2894	STRLEN lcur;
	2895	HV *const hv = MUTABLE_HV(SvRV(swash));
	2896	UV elements = 0; /* Number of elements in the inversion list */
	2897	U8 empty[] = "";
	2898
	2899	/* The string containing the main body of the table */
	2900	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	2901	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2902	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2903	SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	2904	SV** const invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	2905
	2906	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	2907	const STRLEN bits = SvUV(*bitssvp);
	2908	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2909	U8 x, xend;
	2910	STRLEN xcur;
	2911
	2912	SV* invlist;
	2913
	2914	PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
	2915
	2916	/* read $swash->{LIST} */
	2917	if (SvPOK(*listsvp)) {
	2918	l = (U8)SvPV(listsvp, lcur);
	2919	}
	2920	else {
	2921	/* LIST legitimately doesn't contain a string during compilation phases
	2922	* of Perl itself, before the Unicode tables are generated. In this
	2923	* case, just fake things up by creating an empty list */
	2924	l = empty;
	2925	lcur = 0;
	2926	}
	2927	loc = (char *) l;
	2928	lend = l + lcur;
	2929
	2930	/* Scan the input to count the number of lines to preallocate array size
	2931	* based on worst possible case, which is each line in the input creates 2
	2932	* elements in the inversion list: 1) the beginning of a range in the list;
	2933	* 2) the beginning of a range not in the list. */
	2934	while ((loc = (strchr(loc, '\n'))) != NULL) {
	2935	elements += 2;
	2936	loc++;
	2937	}
	2938
	2939	/* If the ending is somehow corrupt and isn't a new line, add another
	2940	* element for the final range that isn't in the inversion list */
	2941	if (! (lend == '\n' \|\| (lend == '\0' && *(lend - 1) == '\n'))) {
	2942	elements++;
	2943	}
	2944
	2945	invlist = _new_invlist(elements);
	2946
	2947	/* Now go through the input again, adding each range to the list */
	2948	while (l < lend) {
	2949	UV start, end;
	2950	UV val; /* Not used by this function */
	2951
	2952	l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
	2953	cBOOL(octets), typestr);
	2954
	2955	if (l > lend) {
	2956	break;
	2957	}
	2958
	2959	_append_range_to_invlist(invlist, start, end);
	2960	}
	2961
	2962	/* Invert if the data says it should be */
	2963	if (invert_it_svp && SvUV(*invert_it_svp)) {
	2964	_invlist_invert(invlist);
	2965	}
	2966
	2967	/* This code is copied from swash_get()
	2968	* read $swash->{EXTRAS} */
	2969	x = (U8)SvPV(extssvp, xcur);
	2970	xend = x + xcur;
	2971	while (x < xend) {
	2972	STRLEN namelen;
	2973	U8 *namestr;
	2974	SV** othersvp;
	2975	HV* otherhv;
	2976	STRLEN otherbits;
	2977	SV *otherbitssvp, other;
	2978	U8 *nl;
	2979
	2980	const U8 opc = *x++;
	2981	if (opc == '\n')
	2982	continue;
	2983
	2984	nl = (U8*)memchr(x, '\n', xend - x);
	2985
	2986	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	2987	if (nl) {
	2988	x = nl + 1; /* 1 is length of "\n" */
	2989	continue;
	2990	}
	2991	else {
	2992	x = xend; /* to EXTRAS' end at which \n is not found */
	2993	break;
	2994	}
	2995	}
	2996
	2997	namestr = x;
	2998	if (nl) {
	2999	namelen = nl - namestr;
	3000	x = nl + 1;
	3001	}
	3002	else {
	3003	namelen = xend - namestr;
	3004	x = xend;
	3005	}
	3006
	3007	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	3008	otherhv = MUTABLE_HV(SvRV(*othersvp));
	3009	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	3010	otherbits = (STRLEN)SvUV(*otherbitssvp);
	3011
	3012	if (bits != otherbits \|\| bits != 1) {
	3013	Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean properties");
	3014	}
	3015
	3016	/* The "other" swatch must be destroyed after. */
	3017	other = _swash_to_invlist((SV )othersvp);
	3018
	3019	/* End of code copied from swash_get() */
	3020	switch (opc) {
	3021	case '+':
	3022	_invlist_union(invlist, other, &invlist);
	3023	break;
	3024	case '!':
	3025	_invlist_invert(other);
	3026	_invlist_union(invlist, other, &invlist);
	3027	break;
	3028	case '-':
	3029	_invlist_subtract(invlist, other, &invlist);
	3030	break;
	3031	case '&':
	3032	_invlist_intersection(invlist, other, &invlist);
	3033	break;
	3034	default:
	3035	break;
	3036	}
	3037	sv_free(other); /* through with it! */
	3038	}
	3039
	3040	return invlist;
	3041	}
	3042
	3043	/*
	3044	=for apidoc uvchr_to_utf8
	3045
	3046	Adds the UTF-8 representation of the Native code point C<uv> to the end
	3047	of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
	3048	bytes available. The return value is the pointer to the byte after the
	3049	end of the new character. In other words,
	3050
	3051	d = uvchr_to_utf8(d, uv);
	3052
	3053	is the recommended wide native character-aware way of saying
	3054
	3055	*(d++) = uv;
	3056
	3057	=cut
	3058	*/
	3059
	3060	/* On ASCII machines this is normally a macro but we want a
	3061	real function in case XS code wants it
	3062	*/
	3063	U8 *
	3064	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	3065	{
	3066	PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
	3067
	3068	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
	3069	}
	3070
	3071	U8 *
	3072	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	3073	{
	3074	PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
	3075
	3076	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
	3077	}
	3078
	3079	/*
	3080	=for apidoc utf8n_to_uvchr
	3081
	3082	Returns the native character value of the first character in the string
	3083	C<s>
	3084	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	3085	length, in bytes, of that character.
	3086
	3087	length and flags are the same as utf8n_to_uvuni().
	3088
	3089	=cut
	3090	*/
	3091	/* On ASCII machines this is normally a macro but we want
	3092	a real function in case XS code wants it
	3093	*/
	3094	UV
	3095	Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen,
	3096	U32 flags)
	3097	{
	3098	const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	3099
	3100	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
	3101
	3102	return UNI_TO_NATIVE(uv);
	3103	}
	3104
	3105	bool
	3106	Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
	3107	{
	3108	/* May change: warns if surrogates, non-character code points, or
	3109	* non-Unicode code points are in s which has length len. Returns TRUE if
	3110	* none found; FALSE otherwise. The only other validity check is to make
	3111	* sure that this won't exceed the string's length */
	3112
	3113	const U8* const e = s + len;
	3114	bool ok = TRUE;
	3115
	3116	PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
	3117
	3118	while (s < e) {
	3119	if (UTF8SKIP(s) > len) {
	3120	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	3121	"%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
	3122	return FALSE;
	3123	}
	3124	if (*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE) {
	3125	STRLEN char_len;
	3126	if (UTF8_IS_SUPER(s)) {
	3127	if (ckWARN_d(WARN_NON_UNICODE)) {
	3128	UV uv = utf8_to_uvchr(s, &char_len);
	3129	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	3130	"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
	3131	ok = FALSE;
	3132	}
	3133	}
	3134	else if (UTF8_IS_SURROGATE(s)) {
	3135	if (ckWARN_d(WARN_SURROGATE)) {
	3136	UV uv = utf8_to_uvchr(s, &char_len);
	3137	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	3138	"Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
	3139	ok = FALSE;
	3140	}
	3141	}
	3142	else if
	3143	((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
	3144	&& (ckWARN_d(WARN_NONCHAR)))
	3145	{
	3146	UV uv = utf8_to_uvchr(s, &char_len);
	3147	Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
	3148	"Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
	3149	ok = FALSE;
	3150	}
	3151	}
	3152	s += UTF8SKIP(s);
	3153	}
	3154
	3155	return ok;
	3156	}
	3157
	3158	/*
	3159	=for apidoc pv_uni_display
	3160
	3161	Build to the scalar dsv a displayable version of the string spv,
	3162	length len, the displayable version being at most pvlim bytes long
	3163	(if longer, the rest is truncated and "..." will be appended).
	3164
	3165	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	3166	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	3167	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	3168	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	3169	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	3170	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	3171
	3172	The pointer to the PV of the dsv is returned.
	3173
	3174	=cut */
	3175	char *
	3176	Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	3177	{
	3178	int truncated = 0;
	3179	const char s, e;
	3180
	3181	PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
	3182
	3183	sv_setpvs(dsv, "");
	3184	SvUTF8_off(dsv);
	3185	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	3186	UV u;
	3187	/* This serves double duty as a flag and a character to print after
	3188	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	3189	*/
	3190	char ok = 0;
	3191
	3192	if (pvlim && SvCUR(dsv) >= pvlim) {
	3193	truncated++;
	3194	break;
	3195	}
	3196	u = utf8_to_uvchr((U8*)s, 0);
	3197	if (u < 256) {
	3198	const unsigned char c = (unsigned char)u & 0xFF;
	3199	if (flags & UNI_DISPLAY_BACKSLASH) {
	3200	switch (c) {
	3201	case '\n':
	3202	ok = 'n'; break;
	3203	case '\r':
	3204	ok = 'r'; break;
	3205	case '\t':
	3206	ok = 't'; break;
	3207	case '\f':
	3208	ok = 'f'; break;
	3209	case '\a':
	3210	ok = 'a'; break;
	3211	case '\\':
	3212	ok = '\\'; break;
	3213	default: break;
	3214	}
	3215	if (ok) {
	3216	const char string = ok;
	3217	sv_catpvs(dsv, "\\");
	3218	sv_catpvn(dsv, &string, 1);
	3219	}
	3220	}
	3221	/* isPRINT() is the locale-blind version. */
	3222	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	3223	const char string = c;
	3224	sv_catpvn(dsv, &string, 1);
	3225	ok = 1;
	3226	}
	3227	}
	3228	if (!ok)
	3229	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	3230	}
	3231	if (truncated)
	3232	sv_catpvs(dsv, "...");
	3233
	3234	return SvPVX(dsv);
	3235	}
	3236
	3237	/*
	3238	=for apidoc sv_uni_display
	3239
	3240	Build to the scalar dsv a displayable version of the scalar sv,
	3241	the displayable version being at most pvlim bytes long
	3242	(if longer, the rest is truncated and "..." will be appended).
	3243
	3244	The flags argument is as in pv_uni_display().
	3245
	3246	The pointer to the PV of the dsv is returned.
	3247
	3248	=cut
	3249	*/
	3250	char *
	3251	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	3252	{
	3253	PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
	3254
	3255	return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
	3256	SvCUR(ssv), pvlim, flags);
	3257	}
	3258
	3259	/*
	3260	=for apidoc foldEQ_utf8
	3261
	3262	Returns true if the leading portions of the strings s1 and s2 (either or both
	3263	of which may be in UTF-8) are the same case-insensitively; false otherwise.
	3264	How far into the strings to compare is determined by other input parameters.
	3265
	3266	If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
	3267	otherwise it is assumed to be in native 8-bit encoding. Correspondingly for u2
	3268	with respect to s2.
	3269
	3270	If the byte length l1 is non-zero, it says how far into s1 to check for fold
	3271	equality. In other words, s1+l1 will be used as a goal to reach. The
	3272	scan will not be considered to be a match unless the goal is reached, and
	3273	scanning won't continue past that goal. Correspondingly for l2 with respect to
	3274	s2.
	3275
	3276	If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
	3277	considered an end pointer beyond which scanning of s1 will not continue under
	3278	any circumstances. This means that if both l1 and pe1 are specified, and pe1
	3279	is less than s1+l1, the match will never be successful because it can never
	3280	get as far as its goal (and in fact is asserted against). Correspondingly for
	3281	pe2 with respect to s2.
	3282
	3283	At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be
	3284	non-zero), and if both do, both have to be
	3285	reached for a successful match. Also, if the fold of a character is multiple
	3286	characters, all of them must be matched (see tr21 reference below for
	3287	'folding').
	3288
	3289	Upon a successful match, if pe1 is non-NULL,
	3290	it will be set to point to the beginning of the I<next> character of s1 beyond
	3291	what was matched. Correspondingly for pe2 and s2.
	3292
	3293	For case-insensitiveness, the "casefolding" of Unicode is used
	3294	instead of upper/lowercasing both the characters, see
	3295	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	3296
	3297	=cut */
	3298
	3299	/* A flags parameter has been added which may change, and hence isn't
	3300	* externally documented. Currently it is:
	3301	* 0 for as-documented above
	3302	* FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
	3303	ASCII one, to not match
	3304	* FOLDEQ_UTF8_LOCALE meaning that locale rules are to be used for code
	3305	* points below 256; unicode rules for above 255; and
	3306	* folds that cross those boundaries are disallowed,
	3307	* like the NOMIX_ASCII option
	3308	*/
	3309	I32
	3310	Perl_foldEQ_utf8_flags(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2, U32 flags)
	3311	{
	3312	dVAR;
	3313	register const U8 p1 = (const U8)s1; /* Point to current char */
	3314	register const U8 p2 = (const U8)s2;
	3315	register const U8 g1 = NULL; / goal for s1 */
	3316	register const U8 *g2 = NULL;
	3317	register const U8 e1 = NULL; / Don't scan s1 past this */
	3318	register U8 f1 = NULL; / Point to current folded */
	3319	register const U8 *e2 = NULL;
	3320	register U8 *f2 = NULL;
	3321	STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
	3322	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	3323	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	3324	U8 natbuf[2]; /* Holds native 8-bit char converted to utf8;
	3325	these always fit in 2 bytes */
	3326
	3327	PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
	3328
	3329	if (pe1) {
	3330	e1 = (U8*)pe1;
	3331	}
	3332
	3333	if (l1) {
	3334	g1 = (const U8*)s1 + l1;
	3335	}
	3336
	3337	if (pe2) {
	3338	e2 = (U8*)pe2;
	3339	}
	3340
	3341	if (l2) {
	3342	g2 = (const U8*)s2 + l2;
	3343	}
	3344
	3345	/* Must have at least one goal */
	3346	assert(g1 \|\| g2);
	3347
	3348	if (g1) {
	3349
	3350	/* Will never match if goal is out-of-bounds */
	3351	assert(! e1 \|\| e1 >= g1);
	3352
	3353	/* Here, there isn't an end pointer, or it is beyond the goal. We
	3354	* only go as far as the goal */
	3355	e1 = g1;
	3356	}
	3357	else {
	3358	assert(e1); /* Must have an end for looking at s1 */
	3359	}
	3360
	3361	/* Same for goal for s2 */
	3362	if (g2) {
	3363	assert(! e2 \|\| e2 >= g2);
	3364	e2 = g2;
	3365	}
	3366	else {
	3367	assert(e2);
	3368	}
	3369
	3370	/* Look through both strings, a character at a time */
	3371	while (p1 < e1 && p2 < e2) {
	3372
	3373	/* If at the beginning of a new character in s1, get its fold to use
	3374	* and the length of the fold. (exception: locale rules just get the
	3375	* character to a single byte) */
	3376	if (n1 == 0) {
	3377
	3378	/* If in locale matching, we use two sets of rules, depending on if
	3379	* the code point is above or below 255. Here, we test for and
	3380	* handle locale rules */
	3381	if ((flags & FOLDEQ_UTF8_LOCALE)
	3382	&& (! u1 \|\| UTF8_IS_INVARIANT(p1) \|\| UTF8_IS_DOWNGRADEABLE_START(p1)))
	3383	{
	3384	/* There is no mixing of code points above and below 255. */
	3385	if (u2 && (! UTF8_IS_INVARIANT(*p2)
	3386	&& ! UTF8_IS_DOWNGRADEABLE_START(*p2)))
	3387	{
	3388	return 0;
	3389	}
	3390
	3391	/* We handle locale rules by converting, if necessary, the code
	3392	* point to a single byte. */
	3393	if (! u1 \|\| UTF8_IS_INVARIANT(*p1)) {
	3394	foldbuf1 = p1;
	3395	}
	3396	else {
	3397	foldbuf1 = TWO_BYTE_UTF8_TO_UNI(p1, *(p1 + 1));
	3398	}
	3399	n1 = 1;
	3400	}
	3401	else if (isASCII(p1)) { / Note, that here won't be both ASCII
	3402	and using locale rules */
	3403
	3404	/* If trying to mix non- with ASCII, and not supposed to, fail */
	3405	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
	3406	return 0;
	3407	}
	3408	n1 = 1;
	3409	foldbuf1 = toLOWER(p1); /* Folds in the ASCII range are
	3410	just lowercased */
	3411	}
	3412	else if (u1) {
	3413	to_utf8_fold(p1, foldbuf1, &n1);
	3414	}
	3415	else { /* Not utf8, convert to it first and then get fold */
	3416	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
	3417	to_utf8_fold(natbuf, foldbuf1, &n1);
	3418	}
	3419	f1 = foldbuf1;
	3420	}
	3421
	3422	if (n2 == 0) { /* Same for s2 */
	3423	if ((flags & FOLDEQ_UTF8_LOCALE)
	3424	&& (! u2 \|\| UTF8_IS_INVARIANT(p2) \|\| UTF8_IS_DOWNGRADEABLE_START(p2)))
	3425	{
	3426	/* Here, the next char in s2 is < 256. We've already worked on
	3427	* s1, and if it isn't also < 256, can't match */
	3428	if (u1 && (! UTF8_IS_INVARIANT(*p1)
	3429	&& ! UTF8_IS_DOWNGRADEABLE_START(*p1)))
	3430	{
	3431	return 0;
	3432	}
	3433	if (! u2 \|\| UTF8_IS_INVARIANT(*p2)) {
	3434	foldbuf2 = p2;
	3435	}
	3436	else {
	3437	foldbuf2 = TWO_BYTE_UTF8_TO_UNI(p2, *(p2 + 1));
	3438	}
	3439
	3440	/* Use another function to handle locale rules. We've made
	3441	* sure that both characters to compare are single bytes */
	3442	if (! foldEQ_locale((char ) f1, (char ) foldbuf2, 1)) {
	3443	return 0;
	3444	}
	3445	n1 = n2 = 0;
	3446	}
	3447	else if (isASCII(*p2)) {
	3448	if (flags && ! isASCII(*p1)) {
	3449	return 0;
	3450	}
	3451	n2 = 1;
	3452	foldbuf2 = toLOWER(p2);
	3453	}
	3454	else if (u2) {
	3455	to_utf8_fold(p2, foldbuf2, &n2);
	3456	}
	3457	else {
	3458	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
	3459	to_utf8_fold(natbuf, foldbuf2, &n2);
	3460	}
	3461	f2 = foldbuf2;
	3462	}
	3463
	3464	/* Here f1 and f2 point to the beginning of the strings to compare.
	3465	* These strings are the folds of the input characters, stored in utf8.
	3466	*/
	3467
	3468	/* While there is more to look for in both folds, see if they
	3469	* continue to match */
	3470	while (n1 && n2) {
	3471	U8 fold_length = UTF8SKIP(f1);
	3472	if (fold_length != UTF8SKIP(f2)
	3473	\|\| (fold_length == 1 && f1 != f2) /* Short circuit memNE
	3474	function call for single
	3475	character */
	3476	\|\| memNE((char)f1, (char)f2, fold_length))
	3477	{
	3478	return 0; /* mismatch */
	3479	}
	3480
	3481	/* Here, they matched, advance past them */
	3482	n1 -= fold_length;
	3483	f1 += fold_length;
	3484	n2 -= fold_length;
	3485	f2 += fold_length;
	3486	}
	3487
	3488	/* When reach the end of any fold, advance the input past it */
	3489	if (n1 == 0) {
	3490	p1 += u1 ? UTF8SKIP(p1) : 1;
	3491	}
	3492	if (n2 == 0) {
	3493	p2 += u2 ? UTF8SKIP(p2) : 1;
	3494	}
	3495	} /* End of loop through both strings */
	3496
	3497	/* A match is defined by each scan that specified an explicit length
	3498	* reaching its final goal, and the other not having matched a partial
	3499	* character (which can happen when the fold of a character is more than one
	3500	* character). */
	3501	if (! ((g1 == 0 \|\| p1 == g1) && (g2 == 0 \|\| p2 == g2)) \|\| n1 \|\| n2) {
	3502	return 0;
	3503	}
	3504
	3505	/* Successful match. Set output pointers */
	3506	if (pe1) {
	3507	pe1 = (char)p1;
	3508	}
	3509	if (pe2) {
	3510	pe2 = (char)p2;
	3511	}
	3512	return 1;
	3513	}
	3514
	3515	/*
	3516	* Local variables:
	3517	* c-indentation-style: bsd
	3518	* c-basic-offset: 4
	3519	* indent-tabs-mode: t
	3520	* End:
	3521	*
	3522	* ex: set ts=8 sts=4 sw=4 noet:
	3523	*/