perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
	17	*
	18	* 'Well do I understand your speech,' he answered in the same language;
	19	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	20	* as is the custom in the West, if you wish to be answered?'
	21	* --Gandalf, addressing Théoden's door wardens
	22	*
	23	* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	24	*
	25	* ...the travellers perceived that the floor was paved with stones of many
	26	* hues; branching runes and strange devices intertwined beneath their feet.
	27	*
	28	* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	29	*/
	30
	31	#include "EXTERN.h"
	32	#define PERL_IN_UTF8_C
	33	#include "perl.h"
	34
	35	#ifndef EBCDIC
	36	/* Separate prototypes needed because in ASCII systems these are
	37	* usually macros but they still are compiled as code, too. */
	38	PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags);
	39	PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
	40	#endif
	41
	42	static const char unees[] =
	43	"Malformed UTF-8 character (unexpected end of string)";
	44
	45	/*
	46	=head1 Unicode Support
	47
	48	This file contains various utility functions for manipulating UTF8-encoded
	49	strings. For the uninitiated, this is a method of representing arbitrary
	50	Unicode characters as a variable number of bytes, in such a way that
	51	characters in the ASCII range are unmodified, and a zero byte never appears
	52	within non-zero characters.
	53
	54	=cut
	55	*/
	56
	57	/*
	58	=for apidoc is_ascii_string
	59
	60	Returns true if the first C<len> bytes of the given string are the same whether
	61	or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
	62	is, if they are invariant. On ASCII-ish machines, only ASCII characters
	63	fit this definition, hence the function's name.
	64
	65	If C<len> is 0, it will be calculated using C<strlen(s)>.
	66
	67	See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
	68
	69	=cut
	70	*/
	71
	72	bool
	73	Perl_is_ascii_string(const U8 *s, STRLEN len)
	74	{
	75	const U8* const send = s + (len ? len : strlen((const char *)s));
	76	const U8* x = s;
	77
	78	PERL_ARGS_ASSERT_IS_ASCII_STRING;
	79
	80	for (; x < send; ++x) {
	81	if (!UTF8_IS_INVARIANT(*x))
	82	break;
	83	}
	84
	85	return x == send;
	86	}
	87
	88	/*
	89	=for apidoc uvuni_to_utf8_flags
	90
	91	Adds the UTF-8 representation of the code point C<uv> to the end
	92	of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
	93	bytes available. The return value is the pointer to the byte after the
	94	end of the new character. In other words,
	95
	96	d = uvuni_to_utf8_flags(d, uv, flags);
	97
	98	or, in most cases,
	99
	100	d = uvuni_to_utf8(d, uv);
	101
	102	(which is equivalent to)
	103
	104	d = uvuni_to_utf8_flags(d, uv, 0);
	105
	106	This is the recommended Unicode-aware way of saying
	107
	108	*(d++) = uv;
	109
	110	This function will convert to UTF-8 (and not warn) even code points that aren't
	111	legal Unicode or are problematic, unless C<flags> contains one or more of the
	112	following flags.
	113	If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
	114	the function will raise a warning, provided UTF8 warnings are enabled. If instead
	115	UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
	116	If both flags are set, the function will both warn and return NULL.
	117
	118	The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
	119	affect how the function handles a Unicode non-character. And, likewise for the
	120	UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, and code points that are
	121	above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
	122	even less portable) can be warned and/or disallowed even if other above-Unicode
	123	code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
	124	flags.
	125
	126	And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
	127	above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
	128	DISALLOW flags.
	129
	130
	131	=cut
	132	*/
	133
	134	U8 *
	135	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	136	{
	137	PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
	138
	139	if (ckWARN_d(WARN_UTF8)) {
	140	if (UNICODE_IS_SURROGATE(uv)) {
	141	if (flags & UNICODE_WARN_SURROGATE) {
	142	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	143	"UTF-16 surrogate U+%04"UVXf, uv);
	144	}
	145	if (flags & UNICODE_DISALLOW_SURROGATE) {
	146	return NULL;
	147	}
	148	}
	149	else if (UNICODE_IS_SUPER(uv)) {
	150	if (flags & UNICODE_WARN_SUPER
	151	\|\| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
	152	{
	153	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	154	"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
	155	}
	156	if (flags & UNICODE_DISALLOW_SUPER
	157	\|\| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
	158	{
	159	return NULL;
	160	}
	161	}
	162	else if (UNICODE_IS_NONCHAR(uv)) {
	163	if (flags & UNICODE_WARN_NONCHAR) {
	164	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	165	"Unicode non-character U+%04"UVXf" is illegal for open interchange",
	166	uv);
	167	}
	168	if (flags & UNICODE_DISALLOW_NONCHAR) {
	169	return NULL;
	170	}
	171	}
	172	}
	173	if (UNI_IS_INVARIANT(uv)) {
	174	*d++ = (U8)UTF_TO_NATIVE(uv);
	175	return d;
	176	}
	177	#if defined(EBCDIC)
	178	else {
	179	STRLEN len = UNISKIP(uv);
	180	U8 *p = d+len-1;
	181	while (p > d) {
	182	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	183	uv >>= UTF_ACCUMULATION_SHIFT;
	184	}
	185	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	186	return d+len;
	187	}
	188	#else /* Non loop style */
	189	if (uv < 0x800) {
	190	*d++ = (U8)(( uv >> 6) \| 0xc0);
	191	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	192	return d;
	193	}
	194	if (uv < 0x10000) {
	195	*d++ = (U8)(( uv >> 12) \| 0xe0);
	196	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	197	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	198	return d;
	199	}
	200	if (uv < 0x200000) {
	201	*d++ = (U8)(( uv >> 18) \| 0xf0);
	202	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	203	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	204	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	205	return d;
	206	}
	207	if (uv < 0x4000000) {
	208	*d++ = (U8)(( uv >> 24) \| 0xf8);
	209	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	210	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	211	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	212	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	213	return d;
	214	}
	215	if (uv < 0x80000000) {
	216	*d++ = (U8)(( uv >> 30) \| 0xfc);
	217	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	218	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	219	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	220	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	221	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	222	return d;
	223	}
	224	#ifdef HAS_QUAD
	225	if (uv < UTF8_QUAD_MAX)
	226	#endif
	227	{
	228	d++ = 0xfe; / Can't match U+FEFF! */
	229	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	230	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	231	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	232	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	233	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	234	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	235	return d;
	236	}
	237	#ifdef HAS_QUAD
	238	{
	239	d++ = 0xff; / Can't match U+FFFE! */
	240	d++ = 0x80; / 6 Reserved bits */
	241	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	242	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
	243	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
	244	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
	245	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
	246	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	247	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	248	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	249	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	250	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	251	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	252	return d;
	253	}
	254	#endif
	255	#endif /* Loop style */
	256	}
	257
	258	/*
	259
	260	Tests if some arbitrary number of bytes begins in a valid UTF-8
	261	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	262	UTF-8 character. The actual number of bytes in the UTF-8 character
	263	will be returned if it is valid, otherwise 0.
	264
	265	This is the "slow" version as opposed to the "fast" version which is
	266	the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
	267	difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
	268	or less you should use the IS_UTF8_CHAR(), for lengths of five or more
	269	you should use the _slow(). In practice this means that the _slow()
	270	will be used very rarely, since the maximum Unicode code point (as of
	271	Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
	272	the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
	273	five bytes or more.
	274
	275	=cut */
	276	STATIC STRLEN
	277	S_is_utf8_char_slow(const U8 *s, const STRLEN len)
	278	{
	279	U8 u = *s;
	280	STRLEN slen;
	281	UV uv, ouv;
	282
	283	PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
	284
	285	if (UTF8_IS_INVARIANT(u))
	286	return 1;
	287
	288	if (!UTF8_IS_START(u))
	289	return 0;
	290
	291	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	292	return 0;
	293
	294	slen = len - 1;
	295	s++;
	296	#ifdef EBCDIC
	297	u = NATIVE_TO_UTF(u);
	298	#endif
	299	u &= UTF_START_MASK(len);
	300	uv = u;
	301	ouv = uv;
	302	while (slen--) {
	303	if (!UTF8_IS_CONTINUATION(*s))
	304	return 0;
	305	uv = UTF8_ACCUMULATE(uv, *s);
	306	if (uv < ouv)
	307	return 0;
	308	ouv = uv;
	309	s++;
	310	}
	311
	312	if ((STRLEN)UNISKIP(uv) < len)
	313	return 0;
	314
	315	return len;
	316	}
	317
	318	/*
	319	=for apidoc is_utf8_char
	320
	321	Tests if some arbitrary number of bytes begins in a valid UTF-8
	322	character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
	323	character is a valid UTF-8 character. The actual number of bytes in the UTF-8
	324	character will be returned if it is valid, otherwise 0.
	325
	326	=cut */
	327	STRLEN
	328	Perl_is_utf8_char(const U8 *s)
	329	{
	330	const STRLEN len = UTF8SKIP(s);
	331
	332	PERL_ARGS_ASSERT_IS_UTF8_CHAR;
	333	#ifdef IS_UTF8_CHAR
	334	if (IS_UTF8_CHAR_FAST(len))
	335	return IS_UTF8_CHAR(s, len) ? len : 0;
	336	#endif /* #ifdef IS_UTF8_CHAR */
	337	return is_utf8_char_slow(s, len);
	338	}
	339
	340
	341	/*
	342	=for apidoc is_utf8_string
	343
	344	Returns true if first C<len> bytes of the given string form a valid
	345	UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
	346	using C<strlen(s)>. Note that 'a valid UTF-8 string' does not mean 'a
	347	string that contains code points above 0x7F encoded in UTF-8' because a
	348	valid ASCII string is a valid UTF-8 string.
	349
	350	See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
	351
	352	=cut
	353	*/
	354
	355	bool
	356	Perl_is_utf8_string(const U8 *s, STRLEN len)
	357	{
	358	const U8* const send = s + (len ? len : strlen((const char *)s));
	359	const U8* x = s;
	360
	361	PERL_ARGS_ASSERT_IS_UTF8_STRING;
	362
	363	while (x < send) {
	364	STRLEN c;
	365	/* Inline the easy bits of is_utf8_char() here for speed... */
	366	if (UTF8_IS_INVARIANT(*x))
	367	c = 1;
	368	else if (!UTF8_IS_START(*x))
	369	goto out;
	370	else {
	371	/* ... and call is_utf8_char() only if really needed. */
	372	#ifdef IS_UTF8_CHAR
	373	c = UTF8SKIP(x);
	374	if (IS_UTF8_CHAR_FAST(c)) {
	375	if (!IS_UTF8_CHAR(x, c))
	376	c = 0;
	377	}
	378	else
	379	c = is_utf8_char_slow(x, c);
	380	#else
	381	c = is_utf8_char(x);
	382	#endif /* #ifdef IS_UTF8_CHAR */
	383	if (!c)
	384	goto out;
	385	}
	386	x += c;
	387	}
	388
	389	out:
	390	if (x != send)
	391	return FALSE;
	392
	393	return TRUE;
	394	}
	395
	396	/*
	397	Implemented as a macro in utf8.h
	398
	399	=for apidoc is_utf8_string_loc
	400
	401	Like is_utf8_string() but stores the location of the failure (in the
	402	case of "utf8ness failure") or the location s+len (in the case of
	403	"utf8ness success") in the C<ep>.
	404
	405	See also is_utf8_string_loclen() and is_utf8_string().
	406
	407	=for apidoc is_utf8_string_loclen
	408
	409	Like is_utf8_string() but stores the location of the failure (in the
	410	case of "utf8ness failure") or the location s+len (in the case of
	411	"utf8ness success") in the C<ep>, and the number of UTF-8
	412	encoded characters in the C<el>.
	413
	414	See also is_utf8_string_loc() and is_utf8_string().
	415
	416	=cut
	417	*/
	418
	419	bool
	420	Perl_is_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	421	{
	422	const U8* const send = s + (len ? len : strlen((const char *)s));
	423	const U8* x = s;
	424	STRLEN c;
	425	STRLEN outlen = 0;
	426
	427	PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
	428
	429	while (x < send) {
	430	/* Inline the easy bits of is_utf8_char() here for speed... */
	431	if (UTF8_IS_INVARIANT(*x))
	432	c = 1;
	433	else if (!UTF8_IS_START(*x))
	434	goto out;
	435	else {
	436	/* ... and call is_utf8_char() only if really needed. */
	437	#ifdef IS_UTF8_CHAR
	438	c = UTF8SKIP(x);
	439	if (IS_UTF8_CHAR_FAST(c)) {
	440	if (!IS_UTF8_CHAR(x, c))
	441	c = 0;
	442	} else
	443	c = is_utf8_char_slow(x, c);
	444	#else
	445	c = is_utf8_char(x);
	446	#endif /* #ifdef IS_UTF8_CHAR */
	447	if (!c)
	448	goto out;
	449	}
	450	x += c;
	451	outlen++;
	452	}
	453
	454	out:
	455	if (el)
	456	*el = outlen;
	457
	458	if (ep)
	459	*ep = x;
	460	return (x == send);
	461	}
	462
	463	/*
	464
	465	=for apidoc utf8n_to_uvuni
	466
	467	Bottom level UTF-8 decode routine.
	468	Returns the code point value of the first character in the string C<s>
	469	which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding and no longer than
	470	C<curlen> bytes; C<retlen> will be set to the length, in bytes, of that
	471	character.
	472
	473	The value of C<flags> determines the behavior when C<s> does not point to a
	474	well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
	475	C<retlen> is set to the expected length of the UTF-8 character in bytes, zero
	476	is returned, and if UTF-8 warnings haven't been lexically disabled, a warning
	477	is raised.
	478
	479	Various ALLOW flags can be set in C<flags> to allow (and not warn on)
	480	individual types of malformations, such as the sequence being overlong (that
	481	is, when there is a shorter sequence that can express the same code point;
	482	overlong sequences are expressly forbidden in the UTF-8 standard due to
	483	potential security issues). Another malformation example is the first byte of
	484	a character not being a legal first byte. See F<utf8.h> for the list of such
	485	flags. Of course, the value returned by this function under such conditions is
	486	not reliable.
	487
	488	The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
	489	flags) malformation is found. If this flag is set, the routine assumes that
	490	the caller will raise a warning, and this function will silently just set
	491	C<retlen> to C<-1> and return zero.
	492
	493	Certain code points are considered problematic. These are Unicode surrogates,
	494	Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
	495	By default these are considered regular code points, but certain situations
	496	warrant special handling for them. If C<flags> contains
	497	UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
	498	malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE,
	499	UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
	500	maximum) can be set to disallow these categories individually.
	501
	502	The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
	503	UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
	504	for their respective categories, but otherwise the code points are considered
	505	valid (not malformations). To get a category to both be treated as a
	506	malformation and raise a warning, specify both the WARN and DISALLOW flags.
	507	(But note that warnings are not raised if lexically disabled nor if
	508	UTF8_CHECK_ONLY is also specified.)
	509
	510	Very large code points (above 0x7FFF_FFFF) are considered more problematic than
	511	the others that are above the Unicode legal maximum. There are several
	512	reasons, one of which is that the original UTF-8 specification never went above
	513	this number (the current 0x10FFFF limit was imposed later). The UTF-8 encoding
	514	on ASCII platforms for these large code point begins with a byte containing
	515	0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to be treated as
	516	malformations, while allowing smaller above-Unicode code points. (Of course
	517	UTF8_DISALLOW_SUPER will treat all above-Unicode code points, including these,
	518	as malformations.) Similarly, UTF8_WARN_FE_FF acts just like the other WARN
	519	flags, but applies just to these code points.
	520
	521	All other code points corresponding to Unicode characters, including private
	522	use and those yet to be assigned, are never considered malformed and never
	523	warn.
	524
	525	Most code should use utf8_to_uvchr() rather than call this directly.
	526
	527	=cut
	528	*/
	529
	530	UV
	531	Perl_utf8n_to_uvuni(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	532	{
	533	dVAR;
	534	const U8 * const s0 = s;
	535	UV uv = *s, ouv = 0;
	536	STRLEN len = 1;
	537	bool dowarn = ckWARN_d(WARN_UTF8);
	538	const UV startbyte = *s;
	539	STRLEN expectlen = 0;
	540	U32 warning = 0;
	541	SV* sv = NULL;
	542
	543	PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
	544
	545	/* This list is a superset of the UTF8_ALLOW_XXX. */
	546
	547	#define UTF8_WARN_EMPTY 1
	548	#define UTF8_WARN_CONTINUATION 2
	549	#define UTF8_WARN_NON_CONTINUATION 3
	550	#define UTF8_WARN_SHORT 4
	551	#define UTF8_WARN_OVERFLOW 5
	552	#define UTF8_WARN_LONG 6
	553
	554	if (curlen == 0 &&
	555	!(flags & UTF8_ALLOW_EMPTY)) {
	556	warning = UTF8_WARN_EMPTY;
	557	goto malformed;
	558	}
	559
	560	if (UTF8_IS_INVARIANT(uv)) {
	561	if (retlen)
	562	*retlen = 1;
	563	return (UV) (NATIVE_TO_UTF(*s));
	564	}
	565
	566	if (UTF8_IS_CONTINUATION(uv) &&
	567	!(flags & UTF8_ALLOW_CONTINUATION)) {
	568	warning = UTF8_WARN_CONTINUATION;
	569	goto malformed;
	570	}
	571
	572	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	573	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	574	warning = UTF8_WARN_NON_CONTINUATION;
	575	goto malformed;
	576	}
	577
	578	#ifdef EBCDIC
	579	uv = NATIVE_TO_UTF(uv);
	580	#else
	581	if (uv == 0xfe \|\| uv == 0xff) {
	582	if (flags & (UTF8_WARN_SUPER\|UTF8_WARN_FE_FF)) {
	583	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
	584	flags &= ~UTF8_WARN_SUPER; /* Only warn once on this problem */
	585	}
	586	if (flags & (UTF8_DISALLOW_SUPER\|UTF8_DISALLOW_FE_FF)) {
	587	goto malformed;
	588	}
	589	}
	590	#endif
	591
	592	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	593	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	594	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	595	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	596	#ifdef EBCDIC
	597	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	598	else { len = 7; uv &= 0x01; }
	599	#else
	600	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	601	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	602	else { len = 13; uv = 0; } /* whoa! */
	603	#endif
	604
	605	if (retlen)
	606	*retlen = len;
	607
	608	expectlen = len;
	609
	610	if ((curlen < expectlen) &&
	611	!(flags & UTF8_ALLOW_SHORT)) {
	612	warning = UTF8_WARN_SHORT;
	613	goto malformed;
	614	}
	615
	616	len--;
	617	s++;
	618	ouv = uv; /* ouv is the value from the previous iteration */
	619
	620	while (len--) {
	621	if (!UTF8_IS_CONTINUATION(*s) &&
	622	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	623	s--;
	624	warning = UTF8_WARN_NON_CONTINUATION;
	625	goto malformed;
	626	}
	627	else
	628	uv = UTF8_ACCUMULATE(uv, *s);
	629	if (!(uv > ouv)) { /* If the value didn't grow from the previous
	630	iteration, something is horribly wrong */
	631	/* These cannot be allowed. */
	632	if (uv == ouv) {
	633	if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
	634	warning = UTF8_WARN_LONG;
	635	goto malformed;
	636	}
	637	}
	638	else { /* uv < ouv */
	639	/* This cannot be allowed. */
	640	warning = UTF8_WARN_OVERFLOW;
	641	goto malformed;
	642	}
	643	}
	644	s++;
	645	ouv = uv;
	646	}
	647
	648	if ((expectlen > (STRLEN)UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) {
	649	warning = UTF8_WARN_LONG;
	650	goto malformed;
	651	} else if (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE\|UTF8_WARN_ILLEGAL_INTERCHANGE)) {
	652	if (UNICODE_IS_SURROGATE(uv)) {
	653	if ((flags & (UTF8_WARN_SURROGATE\|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE) {
	654	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
	655	}
	656	if (flags & UTF8_DISALLOW_SURROGATE) {
	657	goto disallowed;
	658	}
	659	}
	660	else if (UNICODE_IS_NONCHAR(uv)) {
	661	if ((flags & (UTF8_WARN_NONCHAR\|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR ) {
	662	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
	663	}
	664	if (flags & UTF8_DISALLOW_NONCHAR) {
	665	goto disallowed;
	666	}
	667	}
	668	else if ((uv > PERL_UNICODE_MAX)) {
	669	if ((flags & (UTF8_WARN_SUPER\|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER) {
	670	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
	671	}
	672	if (flags & UTF8_DISALLOW_SUPER) {
	673	goto disallowed;
	674	}
	675	}
	676
	677	/* Here, this is not considered a malformed character, so drop through
	678	* to return it */
	679	}
	680
	681	return uv;
	682
	683	disallowed: /* Is disallowed, but otherwise not malformed. 'sv' will have been
	684	set if there is to be a warning. */
	685	if (!sv) {
	686	dowarn = 0;
	687	}
	688
	689	malformed:
	690
	691	if (flags & UTF8_CHECK_ONLY) {
	692	if (retlen)
	693	*retlen = ((STRLEN) -1);
	694	return 0;
	695	}
	696
	697	if (dowarn) {
	698	if (! sv) {
	699	sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
	700	}
	701
	702	switch (warning) {
	703	case 0: /* Intentionally empty. */ break;
	704	case UTF8_WARN_EMPTY:
	705	sv_catpvs(sv, "(empty string)");
	706	break;
	707	case UTF8_WARN_CONTINUATION:
	708	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	709	break;
	710	case UTF8_WARN_NON_CONTINUATION:
	711	if (s == s0)
	712	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	713	(UV)s[1], startbyte);
	714	else {
	715	const int len = (int)(s-s0);
	716	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	717	(UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
	718	}
	719
	720	break;
	721	case UTF8_WARN_SHORT:
	722	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	723	(int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
	724	expectlen = curlen; /* distance for caller to skip */
	725	break;
	726	case UTF8_WARN_OVERFLOW:
	727	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	728	ouv, *s, startbyte);
	729	break;
	730	case UTF8_WARN_LONG:
	731	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	732	(int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	733	break;
	734	default:
	735	sv_catpvs(sv, "(unknown reason)");
	736	break;
	737	}
	738
	739	if (sv) {
	740	const char * const s = SvPVX_const(sv);
	741
	742	if (PL_op)
	743	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	744	"%s in %s", s, OP_DESC(PL_op));
	745	else
	746	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
	747	}
	748	}
	749
	750	if (retlen)
	751	*retlen = expectlen ? expectlen : len;
	752
	753	return 0;
	754	}
	755
	756	/*
	757	=for apidoc utf8_to_uvchr
	758
	759	Returns the native code point of the first character in the string C<s>
	760	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	761	length, in bytes, of that character.
	762
	763	If C<s> does not point to a well-formed UTF-8 character, zero is
	764	returned and retlen is set, if possible, to -1.
	765
	766	=cut
	767	*/
	768
	769
	770	UV
	771	Perl_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
	772	{
	773	PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
	774
	775	return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
	776	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	777	}
	778
	779	/*
	780	=for apidoc utf8_to_uvuni
	781
	782	Returns the Unicode code point of the first character in the string C<s>
	783	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	784	length, in bytes, of that character.
	785
	786	This function should only be used when the returned UV is considered
	787	an index into the Unicode semantic tables (e.g. swashes).
	788
	789	If C<s> does not point to a well-formed UTF-8 character, zero is
	790	returned and retlen is set, if possible, to -1.
	791
	792	=cut
	793	*/
	794
	795	UV
	796	Perl_utf8_to_uvuni(pTHX_ const U8 s, STRLEN retlen)
	797	{
	798	PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
	799
	800	/* Call the low level routine asking for checks */
	801	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
	802	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	803	}
	804
	805	/*
	806	=for apidoc utf8_length
	807
	808	Return the length of the UTF-8 char encoded string C<s> in characters.
	809	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	810	up past C<e>, croaks.
	811
	812	=cut
	813	*/
	814
	815	STRLEN
	816	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	817	{
	818	dVAR;
	819	STRLEN len = 0;
	820
	821	PERL_ARGS_ASSERT_UTF8_LENGTH;
	822
	823	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	824	* the bitops (especially ~) can create illegal UTF-8.
	825	* In other words: in Perl UTF-8 is not just for Unicode. */
	826
	827	if (e < s)
	828	goto warn_and_return;
	829	while (s < e) {
	830	if (!UTF8_IS_INVARIANT(*s))
	831	s += UTF8SKIP(s);
	832	else
	833	s++;
	834	len++;
	835	}
	836
	837	if (e != s) {
	838	len--;
	839	warn_and_return:
	840	if (PL_op)
	841	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	842	"%s in %s", unees, OP_DESC(PL_op));
	843	else
	844	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
	845	}
	846
	847	return len;
	848	}
	849
	850	/*
	851	=for apidoc utf8_distance
	852
	853	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	854	and C<b>.
	855
	856	WARNING: use only if you know that the pointers point inside the
	857	same UTF-8 buffer.
	858
	859	=cut
	860	*/
	861
	862	IV
	863	Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
	864	{
	865	PERL_ARGS_ASSERT_UTF8_DISTANCE;
	866
	867	return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
	868	}
	869
	870	/*
	871	=for apidoc utf8_hop
	872
	873	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	874	forward or backward.
	875
	876	WARNING: do not use the following unless you know C<off> is within
	877	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	878	on the first byte of character or just after the last byte of a character.
	879
	880	=cut
	881	*/
	882
	883	U8 *
	884	Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
	885	{
	886	PERL_ARGS_ASSERT_UTF8_HOP;
	887
	888	PERL_UNUSED_CONTEXT;
	889	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	890	* the bitops (especially ~) can create illegal UTF-8.
	891	* In other words: in Perl UTF-8 is not just for Unicode. */
	892
	893	if (off >= 0) {
	894	while (off--)
	895	s += UTF8SKIP(s);
	896	}
	897	else {
	898	while (off++) {
	899	s--;
	900	while (UTF8_IS_CONTINUATION(*s))
	901	s--;
	902	}
	903	}
	904	return (U8 *)s;
	905	}
	906
	907	/*
	908	=for apidoc bytes_cmp_utf8
	909
	910	Compares the sequence of characters (stored as octets) in b, blen with the
	911	sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
	912	equal, -1 or -2 if the first string is less than the second string, +1 or +2
	913	if the first string is greater than the second string.
	914
	915	-1 or +1 is returned if the shorter string was identical to the start of the
	916	longer string. -2 or +2 is returned if there was a difference between characters
	917	within the strings.
	918
	919	=cut
	920	*/
	921
	922	int
	923	Perl_bytes_cmp_utf8(pTHX_ const U8 b, STRLEN blen, const U8 u, STRLEN ulen)
	924	{
	925	const U8 *const bend = b + blen;
	926	const U8 *const uend = u + ulen;
	927
	928	PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
	929
	930	PERL_UNUSED_CONTEXT;
	931
	932	while (b < bend && u < uend) {
	933	U8 c = *u++;
	934	if (!UTF8_IS_INVARIANT(c)) {
	935	if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	936	if (u < uend) {
	937	U8 c1 = *u++;
	938	if (UTF8_IS_CONTINUATION(c1)) {
	939	c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, c1));
	940	} else {
	941	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	942	"Malformed UTF-8 character "
	943	"(unexpected non-continuation byte 0x%02x"
	944	", immediately after start byte 0x%02x)"
	945	/* Dear diag.t, it's in the pod. */
	946	"%s%s", c1, c,
	947	PL_op ? " in " : "",
	948	PL_op ? OP_DESC(PL_op) : "");
	949	return -2;
	950	}
	951	} else {
	952	if (PL_op)
	953	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	954	"%s in %s", unees, OP_DESC(PL_op));
	955	else
	956	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
	957	return -2; /* Really want to return undef :-) */
	958	}
	959	} else {
	960	return -2;
	961	}
	962	}
	963	if (*b != c) {
	964	return *b < c ? -2 : +2;
	965	}
	966	++b;
	967	}
	968
	969	if (b == bend && u == uend)
	970	return 0;
	971
	972	return b < bend ? +1 : -1;
	973	}
	974
	975	/*
	976	=for apidoc utf8_to_bytes
	977
	978	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	979	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	980	updates len to contain the new length.
	981	Returns zero on failure, setting C<len> to -1.
	982
	983	If you need a copy of the string, see C<bytes_from_utf8>.
	984
	985	=cut
	986	*/
	987
	988	U8 *
	989	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	990	{
	991	U8 * const save = s;
	992	U8 * const send = s + *len;
	993	U8 *d;
	994
	995	PERL_ARGS_ASSERT_UTF8_TO_BYTES;
	996
	997	/* ensure valid UTF-8 and chars < 256 before updating string */
	998	while (s < send) {
	999	U8 c = *s++;
	1000
	1001	if (!UTF8_IS_INVARIANT(c) &&
	1002	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	1003	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	1004	*len = ((STRLEN) -1);
	1005	return 0;
	1006	}
	1007	}
	1008
	1009	d = s = save;
	1010	while (s < send) {
	1011	STRLEN ulen;
	1012	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	1013	s += ulen;
	1014	}
	1015	*d = '\0';
	1016	*len = d - save;
	1017	return save;
	1018	}
	1019
	1020	/*
	1021	=for apidoc bytes_from_utf8
	1022
	1023	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	1024	Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	1025	the newly-created string, and updates C<len> to contain the new
	1026	length. Returns the original string if no conversion occurs, C<len>
	1027	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	1028	0 if C<s> is converted or consisted entirely of characters that are invariant
	1029	in utf8 (i.e., US-ASCII on non-EBCDIC machines).
	1030
	1031	=cut
	1032	*/
	1033
	1034	U8 *
	1035	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN len, bool *is_utf8)
	1036	{
	1037	U8 *d;
	1038	const U8 *start = s;
	1039	const U8 *send;
	1040	I32 count = 0;
	1041
	1042	PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
	1043
	1044	PERL_UNUSED_CONTEXT;
	1045	if (!*is_utf8)
	1046	return (U8 *)start;
	1047
	1048	/* ensure valid UTF-8 and chars < 256 before converting string */
	1049	for (send = s + *len; s < send;) {
	1050	U8 c = *s++;
	1051	if (!UTF8_IS_INVARIANT(c)) {
	1052	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	1053	(c = *s++) && UTF8_IS_CONTINUATION(c))
	1054	count++;
	1055	else
	1056	return (U8 *)start;
	1057	}
	1058	}
	1059
	1060	*is_utf8 = FALSE;
	1061
	1062	Newx(d, (*len) - count + 1, U8);
	1063	s = start; start = d;
	1064	while (s < send) {
	1065	U8 c = *s++;
	1066	if (!UTF8_IS_INVARIANT(c)) {
	1067	/* Then it is two-byte encoded */
	1068	c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++));
	1069	}
	1070	*d++ = c;
	1071	}
	1072	*d = '\0';
	1073	*len = d - start;
	1074	return (U8 *)start;
	1075	}
	1076
	1077	/*
	1078	=for apidoc bytes_to_utf8
	1079
	1080	Converts a string C<s> of length C<len> bytes from the native encoding into
	1081	UTF-8.
	1082	Returns a pointer to the newly-created string, and sets C<len> to
	1083	reflect the new length in bytes.
	1084
	1085	A NUL character will be written after the end of the string.
	1086
	1087	If you want to convert to UTF-8 from encodings other than
	1088	the native (Latin1 or EBCDIC),
	1089	see sv_recode_to_utf8().
	1090
	1091	=cut
	1092	*/
	1093
	1094	U8*
	1095	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN len)
	1096	{
	1097	const U8 * const send = s + (*len);
	1098	U8 *d;
	1099	U8 *dst;
	1100
	1101	PERL_ARGS_ASSERT_BYTES_TO_UTF8;
	1102	PERL_UNUSED_CONTEXT;
	1103
	1104	Newx(d, (len) 2 + 1, U8);
	1105	dst = d;
	1106
	1107	while (s < send) {
	1108	const UV uv = NATIVE_TO_ASCII(*s++);
	1109	if (UNI_IS_INVARIANT(uv))
	1110	*d++ = (U8)UTF_TO_NATIVE(uv);
	1111	else {
	1112	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
	1113	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
	1114	}
	1115	}
	1116	*d = '\0';
	1117	*len = d-dst;
	1118	return dst;
	1119	}
	1120
	1121	/*
	1122	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	1123	*
	1124	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	1125	* We optimize for native, for obvious reasons. */
	1126
	1127	U8*
	1128	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1129	{
	1130	U8* pend;
	1131	U8* dstart = d;
	1132
	1133	PERL_ARGS_ASSERT_UTF16_TO_UTF8;
	1134
	1135	if (bytelen & 1)
	1136	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
	1137
	1138	pend = p + bytelen;
	1139
	1140	while (p < pend) {
	1141	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	1142	p += 2;
	1143	if (uv < 0x80) {
	1144	#ifdef EBCDIC
	1145	*d++ = UNI_TO_NATIVE(uv);
	1146	#else
	1147	*d++ = (U8)uv;
	1148	#endif
	1149	continue;
	1150	}
	1151	if (uv < 0x800) {
	1152	*d++ = (U8)(( uv >> 6) \| 0xc0);
	1153	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1154	continue;
	1155	}
	1156	if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */
	1157	if (p >= pend) {
	1158	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1159	} else {
	1160	UV low = (p[0] << 8) + p[1];
	1161	p += 2;
	1162	if (low < 0xdc00 \|\| low > 0xdfff)
	1163	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1164	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	1165	}
	1166	} else if (uv >= 0xdc00 && uv <= 0xdfff) {
	1167	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1168	}
	1169	if (uv < 0x10000) {
	1170	*d++ = (U8)(( uv >> 12) \| 0xe0);
	1171	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1172	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1173	continue;
	1174	}
	1175	else {
	1176	*d++ = (U8)(( uv >> 18) \| 0xf0);
	1177	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	1178	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1179	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1180	continue;
	1181	}
	1182	}
	1183	*newlen = d - dstart;
	1184	return d;
	1185	}
	1186
	1187	/* Note: this one is slightly destructive of the source. */
	1188
	1189	U8*
	1190	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1191	{
	1192	U8* s = (U8*)p;
	1193	U8* const send = s + bytelen;
	1194
	1195	PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
	1196
	1197	if (bytelen & 1)
	1198	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
	1199	(UV)bytelen);
	1200
	1201	while (s < send) {
	1202	const U8 tmp = s[0];
	1203	s[0] = s[1];
	1204	s[1] = tmp;
	1205	s += 2;
	1206	}
	1207	return utf16_to_utf8(p, d, bytelen, newlen);
	1208	}
	1209
	1210	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	1211
	1212	bool
	1213	Perl_is_uni_alnum(pTHX_ UV c)
	1214	{
	1215	U8 tmpbuf[UTF8_MAXBYTES+1];
	1216	uvchr_to_utf8(tmpbuf, c);
	1217	return is_utf8_alnum(tmpbuf);
	1218	}
	1219
	1220	bool
	1221	Perl_is_uni_idfirst(pTHX_ UV c)
	1222	{
	1223	U8 tmpbuf[UTF8_MAXBYTES+1];
	1224	uvchr_to_utf8(tmpbuf, c);
	1225	return is_utf8_idfirst(tmpbuf);
	1226	}
	1227
	1228	bool
	1229	Perl_is_uni_alpha(pTHX_ UV c)
	1230	{
	1231	U8 tmpbuf[UTF8_MAXBYTES+1];
	1232	uvchr_to_utf8(tmpbuf, c);
	1233	return is_utf8_alpha(tmpbuf);
	1234	}
	1235
	1236	bool
	1237	Perl_is_uni_ascii(pTHX_ UV c)
	1238	{
	1239	U8 tmpbuf[UTF8_MAXBYTES+1];
	1240	uvchr_to_utf8(tmpbuf, c);
	1241	return is_utf8_ascii(tmpbuf);
	1242	}
	1243
	1244	bool
	1245	Perl_is_uni_space(pTHX_ UV c)
	1246	{
	1247	U8 tmpbuf[UTF8_MAXBYTES+1];
	1248	uvchr_to_utf8(tmpbuf, c);
	1249	return is_utf8_space(tmpbuf);
	1250	}
	1251
	1252	bool
	1253	Perl_is_uni_digit(pTHX_ UV c)
	1254	{
	1255	U8 tmpbuf[UTF8_MAXBYTES+1];
	1256	uvchr_to_utf8(tmpbuf, c);
	1257	return is_utf8_digit(tmpbuf);
	1258	}
	1259
	1260	bool
	1261	Perl_is_uni_upper(pTHX_ UV c)
	1262	{
	1263	U8 tmpbuf[UTF8_MAXBYTES+1];
	1264	uvchr_to_utf8(tmpbuf, c);
	1265	return is_utf8_upper(tmpbuf);
	1266	}
	1267
	1268	bool
	1269	Perl_is_uni_lower(pTHX_ UV c)
	1270	{
	1271	U8 tmpbuf[UTF8_MAXBYTES+1];
	1272	uvchr_to_utf8(tmpbuf, c);
	1273	return is_utf8_lower(tmpbuf);
	1274	}
	1275
	1276	bool
	1277	Perl_is_uni_cntrl(pTHX_ UV c)
	1278	{
	1279	U8 tmpbuf[UTF8_MAXBYTES+1];
	1280	uvchr_to_utf8(tmpbuf, c);
	1281	return is_utf8_cntrl(tmpbuf);
	1282	}
	1283
	1284	bool
	1285	Perl_is_uni_graph(pTHX_ UV c)
	1286	{
	1287	U8 tmpbuf[UTF8_MAXBYTES+1];
	1288	uvchr_to_utf8(tmpbuf, c);
	1289	return is_utf8_graph(tmpbuf);
	1290	}
	1291
	1292	bool
	1293	Perl_is_uni_print(pTHX_ UV c)
	1294	{
	1295	U8 tmpbuf[UTF8_MAXBYTES+1];
	1296	uvchr_to_utf8(tmpbuf, c);
	1297	return is_utf8_print(tmpbuf);
	1298	}
	1299
	1300	bool
	1301	Perl_is_uni_punct(pTHX_ UV c)
	1302	{
	1303	U8 tmpbuf[UTF8_MAXBYTES+1];
	1304	uvchr_to_utf8(tmpbuf, c);
	1305	return is_utf8_punct(tmpbuf);
	1306	}
	1307
	1308	bool
	1309	Perl_is_uni_xdigit(pTHX_ UV c)
	1310	{
	1311	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1312	uvchr_to_utf8(tmpbuf, c);
	1313	return is_utf8_xdigit(tmpbuf);
	1314	}
	1315
	1316	UV
	1317	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	1318	{
	1319	PERL_ARGS_ASSERT_TO_UNI_UPPER;
	1320
	1321	uvchr_to_utf8(p, c);
	1322	return to_utf8_upper(p, p, lenp);
	1323	}
	1324
	1325	UV
	1326	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	1327	{
	1328	PERL_ARGS_ASSERT_TO_UNI_TITLE;
	1329
	1330	uvchr_to_utf8(p, c);
	1331	return to_utf8_title(p, p, lenp);
	1332	}
	1333
	1334	UV
	1335	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1336	{
	1337	PERL_ARGS_ASSERT_TO_UNI_LOWER;
	1338
	1339	uvchr_to_utf8(p, c);
	1340	return to_utf8_lower(p, p, lenp);
	1341	}
	1342
	1343	UV
	1344	Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
	1345	{
	1346	PERL_ARGS_ASSERT_TO_UNI_FOLD;
	1347
	1348	uvchr_to_utf8(p, c);
	1349	return to_utf8_fold(p, p, lenp);
	1350	}
	1351
	1352	/* for now these all assume no locale info available for Unicode > 255 */
	1353
	1354	bool
	1355	Perl_is_uni_alnum_lc(pTHX_ UV c)
	1356	{
	1357	return is_uni_alnum(c); /* XXX no locale support yet */
	1358	}
	1359
	1360	bool
	1361	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1362	{
	1363	return is_uni_idfirst(c); /* XXX no locale support yet */
	1364	}
	1365
	1366	bool
	1367	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1368	{
	1369	return is_uni_alpha(c); /* XXX no locale support yet */
	1370	}
	1371
	1372	bool
	1373	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1374	{
	1375	return is_uni_ascii(c); /* XXX no locale support yet */
	1376	}
	1377
	1378	bool
	1379	Perl_is_uni_space_lc(pTHX_ UV c)
	1380	{
	1381	return is_uni_space(c); /* XXX no locale support yet */
	1382	}
	1383
	1384	bool
	1385	Perl_is_uni_digit_lc(pTHX_ UV c)
	1386	{
	1387	return is_uni_digit(c); /* XXX no locale support yet */
	1388	}
	1389
	1390	bool
	1391	Perl_is_uni_upper_lc(pTHX_ UV c)
	1392	{
	1393	return is_uni_upper(c); /* XXX no locale support yet */
	1394	}
	1395
	1396	bool
	1397	Perl_is_uni_lower_lc(pTHX_ UV c)
	1398	{
	1399	return is_uni_lower(c); /* XXX no locale support yet */
	1400	}
	1401
	1402	bool
	1403	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1404	{
	1405	return is_uni_cntrl(c); /* XXX no locale support yet */
	1406	}
	1407
	1408	bool
	1409	Perl_is_uni_graph_lc(pTHX_ UV c)
	1410	{
	1411	return is_uni_graph(c); /* XXX no locale support yet */
	1412	}
	1413
	1414	bool
	1415	Perl_is_uni_print_lc(pTHX_ UV c)
	1416	{
	1417	return is_uni_print(c); /* XXX no locale support yet */
	1418	}
	1419
	1420	bool
	1421	Perl_is_uni_punct_lc(pTHX_ UV c)
	1422	{
	1423	return is_uni_punct(c); /* XXX no locale support yet */
	1424	}
	1425
	1426	bool
	1427	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1428	{
	1429	return is_uni_xdigit(c); /* XXX no locale support yet */
	1430	}
	1431
	1432	U32
	1433	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1434	{
	1435	/* XXX returns only the first character -- do not use XXX */
	1436	/* XXX no locale support yet */
	1437	STRLEN len;
	1438	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1439	return (U32)to_uni_upper(c, tmpbuf, &len);
	1440	}
	1441
	1442	U32
	1443	Perl_to_uni_title_lc(pTHX_ U32 c)
	1444	{
	1445	/* XXX returns only the first character XXX -- do not use XXX */
	1446	/* XXX no locale support yet */
	1447	STRLEN len;
	1448	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1449	return (U32)to_uni_title(c, tmpbuf, &len);
	1450	}
	1451
	1452	U32
	1453	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1454	{
	1455	/* XXX returns only the first character -- do not use XXX */
	1456	/* XXX no locale support yet */
	1457	STRLEN len;
	1458	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1459	return (U32)to_uni_lower(c, tmpbuf, &len);
	1460	}
	1461
	1462	static bool
	1463	S_is_utf8_common(pTHX_ const U8 const p, SV *swash,
	1464	const char *const swashname)
	1465	{
	1466	dVAR;
	1467
	1468	PERL_ARGS_ASSERT_IS_UTF8_COMMON;
	1469
	1470	if (!is_utf8_char(p))
	1471	return FALSE;
	1472	if (!*swash)
	1473	*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
	1474	return swash_fetch(*swash, p, TRUE) != 0;
	1475	}
	1476
	1477	bool
	1478	Perl_is_utf8_alnum(pTHX_ const U8 *p)
	1479	{
	1480	dVAR;
	1481
	1482	PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
	1483
	1484	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1485	* descendant of isalnum(3), in other words, it doesn't
	1486	* contain the '_'. --jhi */
	1487	return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
	1488	}
	1489
	1490	bool
	1491	Perl_is_utf8_idfirst(pTHX_ const U8 p) / The naming is historical. */
	1492	{
	1493	dVAR;
	1494
	1495	PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
	1496
	1497	if (*p == '_')
	1498	return TRUE;
	1499	/* is_utf8_idstart would be more logical. */
	1500	return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
	1501	}
	1502
	1503	bool
	1504	Perl_is_utf8_idcont(pTHX_ const U8 *p)
	1505	{
	1506	dVAR;
	1507
	1508	PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
	1509
	1510	if (*p == '_')
	1511	return TRUE;
	1512	return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
	1513	}
	1514
	1515	bool
	1516	Perl_is_utf8_alpha(pTHX_ const U8 *p)
	1517	{
	1518	dVAR;
	1519
	1520	PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
	1521
	1522	return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
	1523	}
	1524
	1525	bool
	1526	Perl_is_utf8_ascii(pTHX_ const U8 *p)
	1527	{
	1528	dVAR;
	1529
	1530	PERL_ARGS_ASSERT_IS_UTF8_ASCII;
	1531
	1532	return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
	1533	}
	1534
	1535	bool
	1536	Perl_is_utf8_space(pTHX_ const U8 *p)
	1537	{
	1538	dVAR;
	1539
	1540	PERL_ARGS_ASSERT_IS_UTF8_SPACE;
	1541
	1542	return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
	1543	}
	1544
	1545	bool
	1546	Perl_is_utf8_perl_space(pTHX_ const U8 *p)
	1547	{
	1548	dVAR;
	1549
	1550	PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
	1551
	1552	return is_utf8_common(p, &PL_utf8_perl_space, "IsPerlSpace");
	1553	}
	1554
	1555	bool
	1556	Perl_is_utf8_perl_word(pTHX_ const U8 *p)
	1557	{
	1558	dVAR;
	1559
	1560	PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
	1561
	1562	return is_utf8_common(p, &PL_utf8_perl_word, "IsPerlWord");
	1563	}
	1564
	1565	bool
	1566	Perl_is_utf8_digit(pTHX_ const U8 *p)
	1567	{
	1568	dVAR;
	1569
	1570	PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
	1571
	1572	return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
	1573	}
	1574
	1575	bool
	1576	Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
	1577	{
	1578	dVAR;
	1579
	1580	PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
	1581
	1582	return is_utf8_common(p, &PL_utf8_posix_digit, "IsPosixDigit");
	1583	}
	1584
	1585	bool
	1586	Perl_is_utf8_upper(pTHX_ const U8 *p)
	1587	{
	1588	dVAR;
	1589
	1590	PERL_ARGS_ASSERT_IS_UTF8_UPPER;
	1591
	1592	return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
	1593	}
	1594
	1595	bool
	1596	Perl_is_utf8_lower(pTHX_ const U8 *p)
	1597	{
	1598	dVAR;
	1599
	1600	PERL_ARGS_ASSERT_IS_UTF8_LOWER;
	1601
	1602	return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
	1603	}
	1604
	1605	bool
	1606	Perl_is_utf8_cntrl(pTHX_ const U8 *p)
	1607	{
	1608	dVAR;
	1609
	1610	PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
	1611
	1612	return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
	1613	}
	1614
	1615	bool
	1616	Perl_is_utf8_graph(pTHX_ const U8 *p)
	1617	{
	1618	dVAR;
	1619
	1620	PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
	1621
	1622	return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
	1623	}
	1624
	1625	bool
	1626	Perl_is_utf8_print(pTHX_ const U8 *p)
	1627	{
	1628	dVAR;
	1629
	1630	PERL_ARGS_ASSERT_IS_UTF8_PRINT;
	1631
	1632	return is_utf8_common(p, &PL_utf8_print, "IsPrint");
	1633	}
	1634
	1635	bool
	1636	Perl_is_utf8_punct(pTHX_ const U8 *p)
	1637	{
	1638	dVAR;
	1639
	1640	PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
	1641
	1642	return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
	1643	}
	1644
	1645	bool
	1646	Perl_is_utf8_xdigit(pTHX_ const U8 *p)
	1647	{
	1648	dVAR;
	1649
	1650	PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
	1651
	1652	return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
	1653	}
	1654
	1655	bool
	1656	Perl_is_utf8_mark(pTHX_ const U8 *p)
	1657	{
	1658	dVAR;
	1659
	1660	PERL_ARGS_ASSERT_IS_UTF8_MARK;
	1661
	1662	return is_utf8_common(p, &PL_utf8_mark, "IsM");
	1663	}
	1664
	1665	bool
	1666	Perl_is_utf8_X_begin(pTHX_ const U8 *p)
	1667	{
	1668	dVAR;
	1669
	1670	PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
	1671
	1672	return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
	1673	}
	1674
	1675	bool
	1676	Perl_is_utf8_X_extend(pTHX_ const U8 *p)
	1677	{
	1678	dVAR;
	1679
	1680	PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
	1681
	1682	return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
	1683	}
	1684
	1685	bool
	1686	Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
	1687	{
	1688	dVAR;
	1689
	1690	PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
	1691
	1692	return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
	1693	}
	1694
	1695	bool
	1696	Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
	1697	{
	1698	dVAR;
	1699
	1700	PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
	1701
	1702	return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
	1703	}
	1704
	1705	bool
	1706	Perl_is_utf8_X_L(pTHX_ const U8 *p)
	1707	{
	1708	dVAR;
	1709
	1710	PERL_ARGS_ASSERT_IS_UTF8_X_L;
	1711
	1712	return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
	1713	}
	1714
	1715	bool
	1716	Perl_is_utf8_X_LV(pTHX_ const U8 *p)
	1717	{
	1718	dVAR;
	1719
	1720	PERL_ARGS_ASSERT_IS_UTF8_X_LV;
	1721
	1722	return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
	1723	}
	1724
	1725	bool
	1726	Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
	1727	{
	1728	dVAR;
	1729
	1730	PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
	1731
	1732	return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
	1733	}
	1734
	1735	bool
	1736	Perl_is_utf8_X_T(pTHX_ const U8 *p)
	1737	{
	1738	dVAR;
	1739
	1740	PERL_ARGS_ASSERT_IS_UTF8_X_T;
	1741
	1742	return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
	1743	}
	1744
	1745	bool
	1746	Perl_is_utf8_X_V(pTHX_ const U8 *p)
	1747	{
	1748	dVAR;
	1749
	1750	PERL_ARGS_ASSERT_IS_UTF8_X_V;
	1751
	1752	return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
	1753	}
	1754
	1755	bool
	1756	Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
	1757	{
	1758	dVAR;
	1759
	1760	PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
	1761
	1762	return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
	1763	}
	1764
	1765	/*
	1766	=for apidoc to_utf8_case
	1767
	1768	The "p" contains the pointer to the UTF-8 string encoding
	1769	the character that is being converted.
	1770
	1771	The "ustrp" is a pointer to the character buffer to put the
	1772	conversion result to. The "lenp" is a pointer to the length
	1773	of the result.
	1774
	1775	The "swashp" is a pointer to the swash to use.
	1776
	1777	Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
	1778	and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
	1779	but not always, a multicharacter mapping), is tried first.
	1780
	1781	The "special" is a string like "utf8::ToSpecLower", which means the
	1782	hash %utf8::ToSpecLower. The access to the hash is through
	1783	Perl_to_utf8_case().
	1784
	1785	The "normal" is a string like "ToLower" which means the swash
	1786	%utf8::ToLower.
	1787
	1788	=cut */
	1789
	1790	UV
	1791	Perl_to_utf8_case(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp,
	1792	SV *swashp, const char normal, const char *special)
	1793	{
	1794	dVAR;
	1795	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1796	STRLEN len = 0;
	1797	const UV uv0 = utf8_to_uvchr(p, NULL);
	1798	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
	1799	* are necessary in EBCDIC, they are redundant no-ops
	1800	* in ASCII-ish platforms, and hopefully optimized away. */
	1801	const UV uv1 = NATIVE_TO_UNI(uv0);
	1802
	1803	PERL_ARGS_ASSERT_TO_UTF8_CASE;
	1804
	1805	/* Note that swash_fetch() doesn't output warnings for these because it
	1806	* assumes we will */
	1807	if (uv1 >= UNICODE_SURROGATE_FIRST && ckWARN_d(WARN_UTF8)) {
	1808	if (uv1 <= UNICODE_SURROGATE_LAST) {
	1809	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	1810	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	1811	"Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
	1812	}
	1813	else if (UNICODE_IS_SUPER(uv1)) {
	1814	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	1815	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	1816	"Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
	1817	}
	1818
	1819	/* Note that non-characters are perfectly legal, so no warning should
	1820	* be given */
	1821	}
	1822
	1823	uvuni_to_utf8(tmpbuf, uv1);
	1824
	1825	if (!swashp) / load on-demand */
	1826	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	1827	/* This is the beginnings of a skeleton of code to read the info section
	1828	* that is in all the swashes in case we ever want to do that, so one can
	1829	* read things whose maps aren't code points, and whose default if missing
	1830	* is not to the code point itself. This was just to see if it actually
	1831	* worked. Details on what the possibilities are are in perluniprops.pod
	1832	HV * const hv = get_hv("utf8::SwashInfo", 0);
	1833	if (hv) {
	1834	SV **svp;
	1835	svp = hv_fetch(hv, (const char*)normal, strlen(normal), FALSE);
	1836	const char *s;
	1837
	1838	HV * const this_hash = SvRV(*svp);
	1839	svp = hv_fetch(this_hash, "type", strlen("type"), FALSE);
	1840	s = SvPV_const(*svp, len);
	1841	}
	1842	}*/
	1843
	1844	if (special) {
	1845	/* It might be "special" (sometimes, but not always,
	1846	* a multicharacter mapping) */
	1847	HV * const hv = get_hv(special, 0);
	1848	SV **svp;
	1849
	1850	if (hv &&
	1851	(svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
	1852	(*svp)) {
	1853	const char *s;
	1854
	1855	s = SvPV_const(*svp, len);
	1856	if (len == 1)
	1857	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
	1858	else {
	1859	#ifdef EBCDIC
	1860	/* If we have EBCDIC we need to remap the characters
	1861	* since any characters in the low 256 are Unicode
	1862	* code points, not EBCDIC. */
	1863	U8 t = (U8)s, tend = t + len, d;
	1864
	1865	d = tmpbuf;
	1866	if (SvUTF8(*svp)) {
	1867	STRLEN tlen = 0;
	1868
	1869	while (t < tend) {
	1870	const UV c = utf8_to_uvchr(t, &tlen);
	1871	if (tlen > 0) {
	1872	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	1873	t += tlen;
	1874	}
	1875	else
	1876	break;
	1877	}
	1878	}
	1879	else {
	1880	while (t < tend) {
	1881	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	1882	t++;
	1883	}
	1884	}
	1885	len = d - tmpbuf;
	1886	Copy(tmpbuf, ustrp, len, U8);
	1887	#else
	1888	Copy(s, ustrp, len, U8);
	1889	#endif
	1890	}
	1891	}
	1892	}
	1893
	1894	if (!len && *swashp) {
	1895	const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
	1896
	1897	if (uv2) {
	1898	/* It was "normal" (a single character mapping). */
	1899	const UV uv3 = UNI_TO_NATIVE(uv2);
	1900	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	1901	}
	1902	}
	1903
	1904	if (!len) /* Neither: just copy. In other words, there was no mapping
	1905	defined, which means that the code point maps to itself */
	1906	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
	1907
	1908	if (lenp)
	1909	*lenp = len;
	1910
	1911	return len ? utf8_to_uvchr(ustrp, 0) : 0;
	1912	}
	1913
	1914	/*
	1915	=for apidoc to_utf8_upper
	1916
	1917	Convert the UTF-8 encoded character at p to its uppercase version and
	1918	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1919	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1920	the uppercase version may be longer than the original character.
	1921
	1922	The first character of the uppercased version is returned
	1923	(but note, as explained above, that there may be more.)
	1924
	1925	=cut */
	1926
	1927	UV
	1928	Perl_to_utf8_upper(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1929	{
	1930	dVAR;
	1931
	1932	PERL_ARGS_ASSERT_TO_UTF8_UPPER;
	1933
	1934	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1935	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
	1936	}
	1937
	1938	/*
	1939	=for apidoc to_utf8_title
	1940
	1941	Convert the UTF-8 encoded character at p to its titlecase version and
	1942	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1943	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1944	titlecase version may be longer than the original character.
	1945
	1946	The first character of the titlecased version is returned
	1947	(but note, as explained above, that there may be more.)
	1948
	1949	=cut */
	1950
	1951	UV
	1952	Perl_to_utf8_title(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1953	{
	1954	dVAR;
	1955
	1956	PERL_ARGS_ASSERT_TO_UTF8_TITLE;
	1957
	1958	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1959	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
	1960	}
	1961
	1962	/*
	1963	=for apidoc to_utf8_lower
	1964
	1965	Convert the UTF-8 encoded character at p to its lowercase version and
	1966	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1967	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1968	lowercase version may be longer than the original character.
	1969
	1970	The first character of the lowercased version is returned
	1971	(but note, as explained above, that there may be more.)
	1972
	1973	=cut */
	1974
	1975	UV
	1976	Perl_to_utf8_lower(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1977	{
	1978	dVAR;
	1979
	1980	PERL_ARGS_ASSERT_TO_UTF8_LOWER;
	1981
	1982	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1983	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
	1984	}
	1985
	1986	/*
	1987	=for apidoc to_utf8_fold
	1988
	1989	Convert the UTF-8 encoded character at p to its foldcase version and
	1990	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1991	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1992	foldcase version may be longer than the original character (up to
	1993	three characters).
	1994
	1995	The first character of the foldcased version is returned
	1996	(but note, as explained above, that there may be more.)
	1997
	1998	=cut */
	1999
	2000	UV
	2001	Perl_to_utf8_fold(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	2002	{
	2003	dVAR;
	2004
	2005	PERL_ARGS_ASSERT_TO_UTF8_FOLD;
	2006
	2007	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	2008	&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
	2009	}
	2010
	2011	/* Note:
	2012	* A "swash" is a swatch hash.
	2013	* A "swatch" is a bit vector generated by utf8.c:S_swash_get().
	2014	* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
	2015	* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
	2016	*/
	2017	SV*
	2018	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
	2019	{
	2020	dVAR;
	2021	SV* retval;
	2022	dSP;
	2023	const size_t pkg_len = strlen(pkg);
	2024	const size_t name_len = strlen(name);
	2025	HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
	2026	SV* errsv_save;
	2027	GV *method;
	2028
	2029	PERL_ARGS_ASSERT_SWASH_INIT;
	2030
	2031	PUSHSTACKi(PERLSI_MAGIC);
	2032	ENTER;
	2033	SAVEHINTS();
	2034	save_re_context();
	2035	method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
	2036	if (!method) { /* demand load utf8 */
	2037	ENTER;
	2038	errsv_save = newSVsv(ERRSV);
	2039	/* It is assumed that callers of this routine are not passing in any
	2040	user derived data. */
	2041	/* Need to do this after save_re_context() as it will set PL_tainted to
	2042	1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
	2043	Even line to create errsv_save can turn on PL_tainted. */
	2044	SAVEBOOL(PL_tainted);
	2045	PL_tainted = 0;
	2046	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	2047	NULL);
	2048	if (!SvTRUE(ERRSV))
	2049	sv_setsv(ERRSV, errsv_save);
	2050	SvREFCNT_dec(errsv_save);
	2051	LEAVE;
	2052	}
	2053	SPAGAIN;
	2054	PUSHMARK(SP);
	2055	EXTEND(SP,5);
	2056	mPUSHp(pkg, pkg_len);
	2057	mPUSHp(name, name_len);
	2058	PUSHs(listsv);
	2059	mPUSHi(minbits);
	2060	mPUSHi(none);
	2061	PUTBACK;
	2062	errsv_save = newSVsv(ERRSV);
	2063	/* If we already have a pointer to the method, no need to use call_method()
	2064	to repeat the lookup. */
	2065	if (method ? call_sv(MUTABLE_SV(method), G_SCALAR)
	2066	: call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR \| G_METHOD))
	2067	retval = newSVsv(*PL_stack_sp--);
	2068	else
	2069	retval = &PL_sv_undef;
	2070	if (!SvTRUE(ERRSV))
	2071	sv_setsv(ERRSV, errsv_save);
	2072	SvREFCNT_dec(errsv_save);
	2073	LEAVE;
	2074	POPSTACK;
	2075	if (IN_PERL_COMPILETIME) {
	2076	CopHINTS_set(PL_curcop, PL_hints);
	2077	}
	2078	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	2079	if (SvPOK(retval))
	2080	Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
	2081	SVfARG(retval));
	2082	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	2083	}
	2084	return retval;
	2085	}
	2086
	2087
	2088	/* This API is wrong for special case conversions since we may need to
	2089	* return several Unicode characters for a single Unicode character
	2090	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	2091	* the lower-level routine, and it is similarly broken for returning
	2092	* multiple values. --jhi
	2093	* For those, you should use to_utf8_case() instead */
	2094	/* SWASHGET has since been recast as S_swash_get in this file. */
	2095
	2096	/* Note:
	2097	* Returns the value of property/mapping C<swash> for the first character
	2098	* of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
	2099	* assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
	2100	* assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
	2101	*/
	2102	UV
	2103	Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
	2104	{
	2105	dVAR;
	2106	HV *const hv = MUTABLE_HV(SvRV(swash));
	2107	U32 klen;
	2108	U32 off;
	2109	STRLEN slen;
	2110	STRLEN needents;
	2111	const U8 *tmps = NULL;
	2112	U32 bit;
	2113	SV *swatch;
	2114	U8 tmputf8[2];
	2115	const UV c = NATIVE_TO_ASCII(*ptr);
	2116
	2117	PERL_ARGS_ASSERT_SWASH_FETCH;
	2118
	2119	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	2120	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
	2121	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
	2122	ptr = tmputf8;
	2123	}
	2124	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	2125	* then the "swatch" is a vec() for all the chars which start
	2126	* with 0xAA..0xYY
	2127	* So the key in the hash (klen) is length of encoded char -1
	2128	*/
	2129	klen = UTF8SKIP(ptr) - 1;
	2130	off = ptr[klen];
	2131
	2132	if (klen == 0) {
	2133	/* If char is invariant then swatch is for all the invariant chars
	2134	* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
	2135	*/
	2136	needents = UTF_CONTINUATION_MARK;
	2137	off = NATIVE_TO_UTF(ptr[klen]);
	2138	}
	2139	else {
	2140	/* If char is encoded then swatch is for the prefix */
	2141	needents = (1 << UTF_ACCUMULATION_SHIFT);
	2142	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	2143	if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_UTF8)) {
	2144	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
	2145
	2146	/* This outputs warnings for binary properties only, assuming that
	2147	* to_utf8_case() will output any. Also, surrogates aren't checked
	2148	* for, as that would warn on things like /\p{Gc=Cs}/ */
	2149	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2150	if (SvUV(*bitssvp) == 1) {
	2151	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	2152	"Code point 0x%04"UVXf" is not Unicode, no properties match it; all inverse properties do", code_point);
	2153	}
	2154	}
	2155	}
	2156
	2157	/*
	2158	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	2159	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	2160	* it's nothing to sniff at.) Pity we usually come through at least
	2161	* two function calls to get here...
	2162	*
	2163	* NB: this code assumes that swatches are never modified, once generated!
	2164	*/
	2165
	2166	if (hv == PL_last_swash_hv &&
	2167	klen == PL_last_swash_klen &&
	2168	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	2169	{
	2170	tmps = PL_last_swash_tmps;
	2171	slen = PL_last_swash_slen;
	2172	}
	2173	else {
	2174	/* Try our second-level swatch cache, kept in a hash. */
	2175	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	2176
	2177	/* If not cached, generate it via swash_get */
	2178	if (!svp \|\| !SvPOK(*svp)
	2179	\|\| !(tmps = (const U8)SvPV_const(svp, slen))) {
	2180	/* We use utf8n_to_uvuni() as we want an index into
	2181	Unicode tables, not a native character number.
	2182	*/
	2183	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
	2184	ckWARN(WARN_UTF8) ?
	2185	0 : UTF8_ALLOW_ANY);
	2186	swatch = swash_get(swash,
	2187	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	2188	(klen) ? (code_point & ~(needents - 1)) : 0,
	2189	needents);
	2190
	2191	if (IN_PERL_COMPILETIME)
	2192	CopHINTS_set(PL_curcop, PL_hints);
	2193
	2194	svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
	2195
	2196	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
	2197	\|\| (slen << 3) < needents)
	2198	Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
	2199	}
	2200
	2201	PL_last_swash_hv = hv;
	2202	assert(klen <= sizeof(PL_last_swash_key));
	2203	PL_last_swash_klen = (U8)klen;
	2204	/* FIXME change interpvar.h? */
	2205	PL_last_swash_tmps = (U8 *) tmps;
	2206	PL_last_swash_slen = slen;
	2207	if (klen)
	2208	Copy(ptr, PL_last_swash_key, klen, U8);
	2209	}
	2210
	2211	switch ((int)((slen << 3) / needents)) {
	2212	case 1:
	2213	bit = 1 << (off & 7);
	2214	off >>= 3;
	2215	return (tmps[off] & bit) != 0;
	2216	case 8:
	2217	return tmps[off];
	2218	case 16:
	2219	off <<= 1;
	2220	return (tmps[off] << 8) + tmps[off + 1] ;
	2221	case 32:
	2222	off <<= 2;
	2223	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	2224	}
	2225	Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
	2226	NORETURN_FUNCTION_END;
	2227	}
	2228
	2229	/* Read a single line of the main body of the swash input text. These are of
	2230	* the form:
	2231	* 0053 0056 0073
	2232	* where each number is hex. The first two numbers form the minimum and
	2233	* maximum of a range, and the third is the value associated with the range.
	2234	* Not all swashes should have a third number
	2235	*
	2236	* On input: l points to the beginning of the line to be examined; it points
	2237	* to somewhere in the string of the whole input text, and is
	2238	* terminated by a \n or the null string terminator.
	2239	* lend points to the null terminator of that string
	2240	* wants_value is non-zero if the swash expects a third number
	2241	* typestr is the name of the swash's mapping, like 'ToLower'
	2242	* On output: min, max, and *val are set to the values read from the line.
	2243	* returns a pointer just beyond the line examined. If there was no
	2244	* valid min number on the line, returns lend+1
	2245	*/
	2246
	2247	STATIC U8*
	2248	S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
	2249	const bool wants_value, const U8* const typestr)
	2250	{
	2251	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
	2252	STRLEN numlen; /* Length of the number */
	2253	I32 flags = PERL_SCAN_SILENT_ILLDIGIT \| PERL_SCAN_DISALLOW_PREFIX;
	2254
	2255	/* nl points to the next \n in the scan */
	2256	U8* const nl = (U8*)memchr(l, '\n', lend - l);
	2257
	2258	/* Get the first number on the line: the range minimum */
	2259	numlen = lend - l;
	2260	min = grok_hex((char )l, &numlen, &flags, NULL);
	2261	if (numlen) /* If found a hex number, position past it */
	2262	l += numlen;
	2263	else if (nl) { /* Else, go handle next line, if any */
	2264	return nl + 1; /* 1 is length of "\n" */
	2265	}
	2266	else { /* Else, no next line */
	2267	return lend + 1; /* to LIST's end at which \n is not found */
	2268	}
	2269
	2270	/* The max range value follows, separated by a BLANK */
	2271	if (isBLANK(*l)) {
	2272	++l;
	2273	flags = PERL_SCAN_SILENT_ILLDIGIT \| PERL_SCAN_DISALLOW_PREFIX;
	2274	numlen = lend - l;
	2275	max = grok_hex((char )l, &numlen, &flags, NULL);
	2276	if (numlen)
	2277	l += numlen;
	2278	else /* If no value here, it is a single element range */
	2279	max = min;
	2280
	2281	/* Non-binary tables have a third entry: what the first element of the
	2282	* range maps to */
	2283	if (wants_value) {
	2284	if (isBLANK(*l)) {
	2285	++l;
	2286	flags = PERL_SCAN_SILENT_ILLDIGIT \|
	2287	PERL_SCAN_DISALLOW_PREFIX;
	2288	numlen = lend - l;
	2289	val = grok_hex((char )l, &numlen, &flags, NULL);
	2290	if (numlen)
	2291	l += numlen;
	2292	else
	2293	*val = 0;
	2294	}
	2295	else {
	2296	*val = 0;
	2297	if (typeto) {
	2298	Perl_croak(aTHX_ "%s: illegal mapping '%s'",
	2299	typestr, l);
	2300	}
	2301	}
	2302	}
	2303	else
	2304	val = 0; / bits == 1, then any val should be ignored */
	2305	}
	2306	else { /* Nothing following range min, should be single element with no
	2307	mapping expected */
	2308	max = min;
	2309	if (wants_value) {
	2310	*val = 0;
	2311	if (typeto) {
	2312	Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
	2313	}
	2314	}
	2315	else
	2316	val = 0; / bits == 1, then val should be ignored */
	2317	}
	2318
	2319	/* Position to next line if any, or EOF */
	2320	if (nl)
	2321	l = nl + 1;
	2322	else
	2323	l = lend;
	2324
	2325	return l;
	2326	}
	2327
	2328	/* Note:
	2329	* Returns a swatch (a bit vector string) for a code point sequence
	2330	* that starts from the value C<start> and comprises the number C<span>.
	2331	* A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
	2332	* Should be used via swash_fetch, which will cache the swatch in C<swash>.
	2333	*/
	2334	STATIC SV*
	2335	S_swash_get(pTHX_ SV* swash, UV start, UV span)
	2336	{
	2337	SV *swatch;
	2338	U8 l, lend, x, xend, *s;
	2339	STRLEN lcur, xcur, scur;
	2340	HV *const hv = MUTABLE_HV(SvRV(swash));
	2341
	2342	/* The string containing the main body of the table */
	2343	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	2344
	2345	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2346	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2347	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	2348	SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	2349	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	2350	const STRLEN bits = SvUV(*bitssvp);
	2351	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2352	const UV none = SvUV(*nonesvp);
	2353	const UV end = start + span;
	2354
	2355	PERL_ARGS_ASSERT_SWASH_GET;
	2356
	2357	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	2358	Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
	2359	(UV)bits);
	2360	}
	2361
	2362	/* create and initialize $swatch */
	2363	scur = octets ? (span * octets) : (span + 7) / 8;
	2364	swatch = newSV(scur);
	2365	SvPOK_on(swatch);
	2366	s = (U8*)SvPVX(swatch);
	2367	if (octets && none) {
	2368	const U8* const e = s + scur;
	2369	while (s < e) {
	2370	if (bits == 8)
	2371	*s++ = (U8)(none & 0xff);
	2372	else if (bits == 16) {
	2373	*s++ = (U8)((none >> 8) & 0xff);
	2374	*s++ = (U8)( none & 0xff);
	2375	}
	2376	else if (bits == 32) {
	2377	*s++ = (U8)((none >> 24) & 0xff);
	2378	*s++ = (U8)((none >> 16) & 0xff);
	2379	*s++ = (U8)((none >> 8) & 0xff);
	2380	*s++ = (U8)( none & 0xff);
	2381	}
	2382	}
	2383	*s = '\0';
	2384	}
	2385	else {
	2386	(void)memzero((U8*)s, scur + 1);
	2387	}
	2388	SvCUR_set(swatch, scur);
	2389	s = (U8*)SvPVX(swatch);
	2390
	2391	/* read $swash->{LIST} */
	2392	l = (U8)SvPV(listsvp, lcur);
	2393	lend = l + lcur;
	2394	while (l < lend) {
	2395	UV min, max, val;
	2396	l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
	2397	cBOOL(octets), typestr);
	2398	if (l > lend) {
	2399	break;
	2400	}
	2401
	2402	/* If looking for something beyond this range, go try the next one */
	2403	if (max < start)
	2404	continue;
	2405
	2406	if (octets) {
	2407	UV key;
	2408	if (min < start) {
	2409	if (!none \|\| val < none) {
	2410	val += start - min;
	2411	}
	2412	min = start;
	2413	}
	2414	for (key = min; key <= max; key++) {
	2415	STRLEN offset;
	2416	if (key >= end)
	2417	goto go_out_list;
	2418	/* offset must be non-negative (start <= min <= key < end) */
	2419	offset = octets * (key - start);
	2420	if (bits == 8)
	2421	s[offset] = (U8)(val & 0xff);
	2422	else if (bits == 16) {
	2423	s[offset ] = (U8)((val >> 8) & 0xff);
	2424	s[offset + 1] = (U8)( val & 0xff);
	2425	}
	2426	else if (bits == 32) {
	2427	s[offset ] = (U8)((val >> 24) & 0xff);
	2428	s[offset + 1] = (U8)((val >> 16) & 0xff);
	2429	s[offset + 2] = (U8)((val >> 8) & 0xff);
	2430	s[offset + 3] = (U8)( val & 0xff);
	2431	}
	2432
	2433	if (!none \|\| val < none)
	2434	++val;
	2435	}
	2436	}
	2437	else { /* bits == 1, then val should be ignored */
	2438	UV key;
	2439	if (min < start)
	2440	min = start;
	2441	for (key = min; key <= max; key++) {
	2442	const STRLEN offset = (STRLEN)(key - start);
	2443	if (key >= end)
	2444	goto go_out_list;
	2445	s[offset >> 3] \|= 1 << (offset & 7);
	2446	}
	2447	}
	2448	} /* while */
	2449	go_out_list:
	2450
	2451	/* read $swash->{EXTRAS} */
	2452	x = (U8)SvPV(extssvp, xcur);
	2453	xend = x + xcur;
	2454	while (x < xend) {
	2455	STRLEN namelen;
	2456	U8 *namestr;
	2457	SV** othersvp;
	2458	HV* otherhv;
	2459	STRLEN otherbits;
	2460	SV *otherbitssvp, other;
	2461	U8 s, o, *nl;
	2462	STRLEN slen, olen;
	2463
	2464	const U8 opc = *x++;
	2465	if (opc == '\n')
	2466	continue;
	2467
	2468	nl = (U8*)memchr(x, '\n', xend - x);
	2469
	2470	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	2471	if (nl) {
	2472	x = nl + 1; /* 1 is length of "\n" */
	2473	continue;
	2474	}
	2475	else {
	2476	x = xend; /* to EXTRAS' end at which \n is not found */
	2477	break;
	2478	}
	2479	}
	2480
	2481	namestr = x;
	2482	if (nl) {
	2483	namelen = nl - namestr;
	2484	x = nl + 1;
	2485	}
	2486	else {
	2487	namelen = xend - namestr;
	2488	x = xend;
	2489	}
	2490
	2491	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	2492	otherhv = MUTABLE_HV(SvRV(*othersvp));
	2493	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	2494	otherbits = (STRLEN)SvUV(*otherbitssvp);
	2495	if (bits < otherbits)
	2496	Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
	2497
	2498	/* The "other" swatch must be destroyed after. */
	2499	other = swash_get(*othersvp, start, span);
	2500	o = (U8*)SvPV(other, olen);
	2501
	2502	if (!olen)
	2503	Perl_croak(aTHX_ "panic: swash_get got improper swatch");
	2504
	2505	s = (U8*)SvPV(swatch, slen);
	2506	if (bits == 1 && otherbits == 1) {
	2507	if (slen != olen)
	2508	Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
	2509
	2510	switch (opc) {
	2511	case '+':
	2512	while (slen--)
	2513	s++ \|= o++;
	2514	break;
	2515	case '!':
	2516	while (slen--)
	2517	s++ \|= ~o++;
	2518	break;
	2519	case '-':
	2520	while (slen--)
	2521	s++ &= ~o++;
	2522	break;
	2523	case '&':
	2524	while (slen--)
	2525	s++ &= o++;
	2526	break;
	2527	default:
	2528	break;
	2529	}
	2530	}
	2531	else {
	2532	STRLEN otheroctets = otherbits >> 3;
	2533	STRLEN offset = 0;
	2534	U8* const send = s + slen;
	2535
	2536	while (s < send) {
	2537	UV otherval = 0;
	2538
	2539	if (otherbits == 1) {
	2540	otherval = (o[offset >> 3] >> (offset & 7)) & 1;
	2541	++offset;
	2542	}
	2543	else {
	2544	STRLEN vlen = otheroctets;
	2545	otherval = *o++;
	2546	while (--vlen) {
	2547	otherval <<= 8;
	2548	otherval \|= *o++;
	2549	}
	2550	}
	2551
	2552	if (opc == '+' && otherval)
	2553	NOOP; /* replace with otherval */
	2554	else if (opc == '!' && !otherval)
	2555	otherval = 1;
	2556	else if (opc == '-' && otherval)
	2557	otherval = 0;
	2558	else if (opc == '&' && !otherval)
	2559	otherval = 0;
	2560	else {
	2561	s += octets; /* no replacement */
	2562	continue;
	2563	}
	2564
	2565	if (bits == 8)
	2566	*s++ = (U8)( otherval & 0xff);
	2567	else if (bits == 16) {
	2568	*s++ = (U8)((otherval >> 8) & 0xff);
	2569	*s++ = (U8)( otherval & 0xff);
	2570	}
	2571	else if (bits == 32) {
	2572	*s++ = (U8)((otherval >> 24) & 0xff);
	2573	*s++ = (U8)((otherval >> 16) & 0xff);
	2574	*s++ = (U8)((otherval >> 8) & 0xff);
	2575	*s++ = (U8)( otherval & 0xff);
	2576	}
	2577	}
	2578	}
	2579	sv_free(other); /* through with it! */
	2580	} /* while */
	2581	return swatch;
	2582	}
	2583
	2584	HV*
	2585	Perl__swash_inversion_hash(pTHX_ SV* const swash)
	2586	{
	2587
	2588	/* Subject to change or removal. For use only in one place in regexec.c
	2589	*
	2590	* Returns a hash which is the inversion and closure of a swash mapping.
	2591	* For example, consider the input lines:
	2592	* 004B 006B
	2593	* 004C 006C
	2594	* 212A 006B
	2595	*
	2596	* The returned hash would have two keys, the utf8 for 006B and the utf8 for
	2597	* 006C. The value for each key is an array. For 006C, the array would
	2598	* have a two elements, the utf8 for itself, and for 004C. For 006B, there
	2599	* would be three elements in its array, the utf8 for 006B, 004B and 212A.
	2600	*
	2601	* Essentially, for any code point, it gives all the code points that map to
	2602	* it, or the list of 'froms' for that point.
	2603	*
	2604	* Currently it only looks at the main body of the swash, and ignores any
	2605	* additions or deletions from other swashes */
	2606
	2607	U8 l, lend;
	2608	STRLEN lcur;
	2609	HV *const hv = MUTABLE_HV(SvRV(swash));
	2610
	2611	/* The string containing the main body of the table */
	2612	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	2613
	2614	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2615	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2616	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	2617	/SV* const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
	2618	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	2619	const STRLEN bits = SvUV(*bitssvp);
	2620	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2621	const UV none = SvUV(*nonesvp);
	2622
	2623	HV* ret = newHV();
	2624
	2625	PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
	2626
	2627	/* Must have at least 8 bits to get the mappings */
	2628	if (bits != 8 && bits != 16 && bits != 32) {
	2629	Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
	2630	(UV)bits);
	2631	}
	2632
	2633	/* read $swash->{LIST} */
	2634	l = (U8)SvPV(listsvp, lcur);
	2635	lend = l + lcur;
	2636
	2637	/* Go through each input line */
	2638	while (l < lend) {
	2639	UV min, max, val;
	2640	UV inverse;
	2641	l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
	2642	cBOOL(octets), typestr);
	2643	if (l > lend) {
	2644	break;
	2645	}
	2646
	2647	/* Each element in the range is to be inverted */
	2648	for (inverse = min; inverse <= max; inverse++) {
	2649	AV* list;
	2650	SV* element;
	2651	SV** listp;
	2652	IV i;
	2653	bool found_key = FALSE;
	2654
	2655	/* The key is the inverse mapping */
	2656	char key[UTF8_MAXBYTES+1];
	2657	char* key_end = (char ) uvuni_to_utf8((U8) key, val);
	2658	STRLEN key_len = key_end - key;
	2659
	2660	/* Get the list for the map */
	2661	if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
	2662	list = (AV) listp;
	2663	}
	2664	else { /* No entry yet for it: create one */
	2665	list = newAV();
	2666	if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
	2667	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	2668	}
	2669	}
	2670
	2671	for (i = 0; i < av_len(list); i++) {
	2672	SV** entryp = av_fetch(list, i, FALSE);
	2673	SV* entry;
	2674	if (entryp == NULL) {
	2675	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	2676	}
	2677	entry = *entryp;
	2678	if (SvUV(entry) == val) {
	2679	found_key = TRUE;
	2680	break;
	2681	}
	2682	}
	2683
	2684	/* Make sure there is a mapping to itself on the list */
	2685	if (! found_key) {
	2686	element = newSVuv(val);
	2687	av_push(list, element);
	2688	}
	2689
	2690
	2691	/* Simply add the value to the list */
	2692	element = newSVuv(inverse);
	2693	av_push(list, element);
	2694
	2695	/* swash_get() increments the value of val for each element in the
	2696	* range. That makes more compact tables possible. You can
	2697	* express the capitalization, for example, of all consecutive
	2698	* letters with a single line: 0061\t007A\t0041 This maps 0061 to
	2699	* 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
	2700	* and it's not documented, and perhaps not even currently used,
	2701	* but I copied the semantics from swash_get(), just in case */
	2702	if (!none \|\| val < none) {
	2703	++val;
	2704	}
	2705	}
	2706	}
	2707
	2708	return ret;
	2709	}
	2710
	2711	HV*
	2712	Perl__swash_to_invlist(pTHX_ SV* const swash)
	2713	{
	2714
	2715	/* Subject to change or removal. For use only in one place in regcomp.c */
	2716
	2717	U8 l, lend;
	2718	char *loc;
	2719	STRLEN lcur;
	2720	HV *const hv = MUTABLE_HV(SvRV(swash));
	2721	UV elements = 0; /* Number of elements in the inversion list */
	2722	U8 empty[] = "";
	2723
	2724	/* The string containing the main body of the table */
	2725	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	2726	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2727	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2728
	2729	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	2730	const STRLEN bits = SvUV(*bitssvp);
	2731	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2732
	2733	HV* invlist;
	2734
	2735	PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
	2736
	2737	/* read $swash->{LIST} */
	2738	if (SvPOK(*listsvp)) {
	2739	l = (U8)SvPV(listsvp, lcur);
	2740	}
	2741	else {
	2742	/* LIST legitimately doesn't contain a string during compilation phases
	2743	* of Perl itself, before the Unicode tables are generated. In this
	2744	* case, just fake things up by creating an empty list */
	2745	l = empty;
	2746	lcur = 0;
	2747	}
	2748	loc = (char *) l;
	2749	lend = l + lcur;
	2750
	2751	/* Scan the input to count the number of lines to preallocate array size
	2752	* based on worst possible case, which is each line in the input creates 2
	2753	* elements in the inversion list: 1) the beginning of a range in the list;
	2754	* 2) the beginning of a range not in the list. */
	2755	while ((loc = (strchr(loc, '\n'))) != NULL) {
	2756	elements += 2;
	2757	loc++;
	2758	}
	2759
	2760	/* If the ending is somehow corrupt and isn't a new line, add another
	2761	* element for the final range that isn't in the inversion list */
	2762	if (! (lend == '\n' \|\| (lend == '\0' && *(lend - 1) == '\n'))) {
	2763	elements++;
	2764	}
	2765
	2766	invlist = _new_invlist(elements);
	2767
	2768	/* Now go through the input again, adding each range to the list */
	2769	while (l < lend) {
	2770	UV start, end;
	2771	UV val; /* Not used by this function */
	2772
	2773	l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
	2774	cBOOL(octets), typestr);
	2775
	2776	if (l > lend) {
	2777	break;
	2778	}
	2779
	2780	_append_range_to_invlist(invlist, start, end);
	2781	}
	2782
	2783	return invlist;
	2784	}
	2785
	2786	/*
	2787	=for apidoc uvchr_to_utf8
	2788
	2789	Adds the UTF-8 representation of the Native code point C<uv> to the end
	2790	of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
	2791	bytes available. The return value is the pointer to the byte after the
	2792	end of the new character. In other words,
	2793
	2794	d = uvchr_to_utf8(d, uv);
	2795
	2796	is the recommended wide native character-aware way of saying
	2797
	2798	*(d++) = uv;
	2799
	2800	=cut
	2801	*/
	2802
	2803	/* On ASCII machines this is normally a macro but we want a
	2804	real function in case XS code wants it
	2805	*/
	2806	U8 *
	2807	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	2808	{
	2809	PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
	2810
	2811	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
	2812	}
	2813
	2814	U8 *
	2815	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	2816	{
	2817	PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
	2818
	2819	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
	2820	}
	2821
	2822	/*
	2823	=for apidoc utf8n_to_uvchr
	2824
	2825	Returns the native character value of the first character in the string
	2826	C<s>
	2827	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	2828	length, in bytes, of that character.
	2829
	2830	length and flags are the same as utf8n_to_uvuni().
	2831
	2832	=cut
	2833	*/
	2834	/* On ASCII machines this is normally a macro but we want
	2835	a real function in case XS code wants it
	2836	*/
	2837	UV
	2838	Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen,
	2839	U32 flags)
	2840	{
	2841	const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	2842
	2843	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
	2844
	2845	return UNI_TO_NATIVE(uv);
	2846	}
	2847
	2848	bool
	2849	Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
	2850	{
	2851	/* May change: warns if surrogates, non-character code points, or
	2852	* non-Unicode code points are in s which has length len. Returns TRUE if
	2853	* none found; FALSE otherwise. The only other validity check is to make
	2854	* sure that this won't exceed the string's length */
	2855
	2856	const U8* const e = s + len;
	2857	bool ok = TRUE;
	2858
	2859	PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
	2860
	2861	while (s < e) {
	2862	if (UTF8SKIP(s) > len) {
	2863	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	2864	"%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
	2865	return FALSE;
	2866	}
	2867	if (*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE) {
	2868	STRLEN char_len;
	2869	if (UTF8_IS_SUPER(s)) {
	2870	UV uv = utf8_to_uvchr(s, &char_len);
	2871	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	2872	"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
	2873	ok = FALSE;
	2874	}
	2875	else if (UTF8_IS_SURROGATE(s)) {
	2876	UV uv = utf8_to_uvchr(s, &char_len);
	2877	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	2878	"Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
	2879	ok = FALSE;
	2880	}
	2881	else if
	2882	(UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
	2883	{
	2884	UV uv = utf8_to_uvchr(s, &char_len);
	2885	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	2886	"Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
	2887	ok = FALSE;
	2888	}
	2889	}
	2890	s += UTF8SKIP(s);
	2891	}
	2892
	2893	return ok;
	2894	}
	2895
	2896	/*
	2897	=for apidoc pv_uni_display
	2898
	2899	Build to the scalar dsv a displayable version of the string spv,
	2900	length len, the displayable version being at most pvlim bytes long
	2901	(if longer, the rest is truncated and "..." will be appended).
	2902
	2903	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	2904	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	2905	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	2906	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	2907	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	2908	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	2909
	2910	The pointer to the PV of the dsv is returned.
	2911
	2912	=cut */
	2913	char *
	2914	Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	2915	{
	2916	int truncated = 0;
	2917	const char s, e;
	2918
	2919	PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
	2920
	2921	sv_setpvs(dsv, "");
	2922	SvUTF8_off(dsv);
	2923	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	2924	UV u;
	2925	/* This serves double duty as a flag and a character to print after
	2926	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	2927	*/
	2928	char ok = 0;
	2929
	2930	if (pvlim && SvCUR(dsv) >= pvlim) {
	2931	truncated++;
	2932	break;
	2933	}
	2934	u = utf8_to_uvchr((U8*)s, 0);
	2935	if (u < 256) {
	2936	const unsigned char c = (unsigned char)u & 0xFF;
	2937	if (flags & UNI_DISPLAY_BACKSLASH) {
	2938	switch (c) {
	2939	case '\n':
	2940	ok = 'n'; break;
	2941	case '\r':
	2942	ok = 'r'; break;
	2943	case '\t':
	2944	ok = 't'; break;
	2945	case '\f':
	2946	ok = 'f'; break;
	2947	case '\a':
	2948	ok = 'a'; break;
	2949	case '\\':
	2950	ok = '\\'; break;
	2951	default: break;
	2952	}
	2953	if (ok) {
	2954	const char string = ok;
	2955	sv_catpvs(dsv, "\\");
	2956	sv_catpvn(dsv, &string, 1);
	2957	}
	2958	}
	2959	/* isPRINT() is the locale-blind version. */
	2960	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	2961	const char string = c;
	2962	sv_catpvn(dsv, &string, 1);
	2963	ok = 1;
	2964	}
	2965	}
	2966	if (!ok)
	2967	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	2968	}
	2969	if (truncated)
	2970	sv_catpvs(dsv, "...");
	2971
	2972	return SvPVX(dsv);
	2973	}
	2974
	2975	/*
	2976	=for apidoc sv_uni_display
	2977
	2978	Build to the scalar dsv a displayable version of the scalar sv,
	2979	the displayable version being at most pvlim bytes long
	2980	(if longer, the rest is truncated and "..." will be appended).
	2981
	2982	The flags argument is as in pv_uni_display().
	2983
	2984	The pointer to the PV of the dsv is returned.
	2985
	2986	=cut
	2987	*/
	2988	char *
	2989	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	2990	{
	2991	PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
	2992
	2993	return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
	2994	SvCUR(ssv), pvlim, flags);
	2995	}
	2996
	2997	/*
	2998	=for apidoc foldEQ_utf8
	2999
	3000	Returns true if the leading portions of the strings s1 and s2 (either or both
	3001	of which may be in UTF-8) are the same case-insensitively; false otherwise.
	3002	How far into the strings to compare is determined by other input parameters.
	3003
	3004	If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
	3005	otherwise it is assumed to be in native 8-bit encoding. Correspondingly for u2
	3006	with respect to s2.
	3007
	3008	If the byte length l1 is non-zero, it says how far into s1 to check for fold
	3009	equality. In other words, s1+l1 will be used as a goal to reach. The
	3010	scan will not be considered to be a match unless the goal is reached, and
	3011	scanning won't continue past that goal. Correspondingly for l2 with respect to
	3012	s2.
	3013
	3014	If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
	3015	considered an end pointer beyond which scanning of s1 will not continue under
	3016	any circumstances. This means that if both l1 and pe1 are specified, and pe1
	3017	is less than s1+l1, the match will never be successful because it can never
	3018	get as far as its goal (and in fact is asserted against). Correspondingly for
	3019	pe2 with respect to s2.
	3020
	3021	At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be
	3022	non-zero), and if both do, both have to be
	3023	reached for a successful match. Also, if the fold of a character is multiple
	3024	characters, all of them must be matched (see tr21 reference below for
	3025	'folding').
	3026
	3027	Upon a successful match, if pe1 is non-NULL,
	3028	it will be set to point to the beginning of the I<next> character of s1 beyond
	3029	what was matched. Correspondingly for pe2 and s2.
	3030
	3031	For case-insensitiveness, the "casefolding" of Unicode is used
	3032	instead of upper/lowercasing both the characters, see
	3033	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	3034
	3035	=cut */
	3036	I32
	3037	Perl_foldEQ_utf8(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2)
	3038	{
	3039	dVAR;
	3040	register const U8 p1 = (const U8)s1; /* Point to current char */
	3041	register const U8 p2 = (const U8)s2;
	3042	register const U8 g1 = NULL; / goal for s1 */
	3043	register const U8 *g2 = NULL;
	3044	register const U8 e1 = NULL; / Don't scan s1 past this */
	3045	register U8 f1 = NULL; / Point to current folded */
	3046	register const U8 *e2 = NULL;
	3047	register U8 *f2 = NULL;
	3048	STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
	3049	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	3050	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	3051	U8 natbuf[2]; /* Holds native 8-bit char converted to utf8;
	3052	these always fit in 2 bytes */
	3053
	3054	PERL_ARGS_ASSERT_FOLDEQ_UTF8;
	3055
	3056	if (pe1) {
	3057	e1 = (U8*)pe1;
	3058	}
	3059
	3060	if (l1) {
	3061	g1 = (const U8*)s1 + l1;
	3062	}
	3063
	3064	if (pe2) {
	3065	e2 = (U8*)pe2;
	3066	}
	3067
	3068	if (l2) {
	3069	g2 = (const U8*)s2 + l2;
	3070	}
	3071
	3072	/* Must have at least one goal */
	3073	assert(g1 \|\| g2);
	3074
	3075	if (g1) {
	3076
	3077	/* Will never match if goal is out-of-bounds */
	3078	assert(! e1 \|\| e1 >= g1);
	3079
	3080	/* Here, there isn't an end pointer, or it is beyond the goal. We
	3081	* only go as far as the goal */
	3082	e1 = g1;
	3083	}
	3084	else {
	3085	assert(e1); /* Must have an end for looking at s1 */
	3086	}
	3087
	3088	/* Same for goal for s2 */
	3089	if (g2) {
	3090	assert(! e2 \|\| e2 >= g2);
	3091	e2 = g2;
	3092	}
	3093	else {
	3094	assert(e2);
	3095	}
	3096
	3097	/* Look through both strings, a character at a time */
	3098	while (p1 < e1 && p2 < e2) {
	3099
	3100	/* If at the beginning of a new character in s1, get its fold to use
	3101	* and the length of the fold */
	3102	if (n1 == 0) {
	3103	if (u1) {
	3104	to_utf8_fold(p1, foldbuf1, &n1);
	3105	}
	3106	else { /* Not utf8, convert to it first and then get fold */
	3107	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
	3108	to_utf8_fold(natbuf, foldbuf1, &n1);
	3109	}
	3110	f1 = foldbuf1;
	3111	}
	3112
	3113	if (n2 == 0) { /* Same for s2 */
	3114	if (u2) {
	3115	to_utf8_fold(p2, foldbuf2, &n2);
	3116	}
	3117	else {
	3118	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
	3119	to_utf8_fold(natbuf, foldbuf2, &n2);
	3120	}
	3121	f2 = foldbuf2;
	3122	}
	3123
	3124	/* While there is more to look for in both folds, see if they
	3125	* continue to match */
	3126	while (n1 && n2) {
	3127	U8 fold_length = UTF8SKIP(f1);
	3128	if (fold_length != UTF8SKIP(f2)
	3129	\|\| (fold_length == 1 && f1 != f2) /* Short circuit memNE
	3130	function call for single
	3131	character */
	3132	\|\| memNE((char)f1, (char)f2, fold_length))
	3133	{
	3134	return 0; /* mismatch */
	3135	}
	3136
	3137	/* Here, they matched, advance past them */
	3138	n1 -= fold_length;
	3139	f1 += fold_length;
	3140	n2 -= fold_length;
	3141	f2 += fold_length;
	3142	}
	3143
	3144	/* When reach the end of any fold, advance the input past it */
	3145	if (n1 == 0) {
	3146	p1 += u1 ? UTF8SKIP(p1) : 1;
	3147	}
	3148	if (n2 == 0) {
	3149	p2 += u2 ? UTF8SKIP(p2) : 1;
	3150	}
	3151	} /* End of loop through both strings */
	3152
	3153	/* A match is defined by each scan that specified an explicit length
	3154	* reaching its final goal, and the other not having matched a partial
	3155	* character (which can happen when the fold of a character is more than one
	3156	* character). */
	3157	if (! ((g1 == 0 \|\| p1 == g1) && (g2 == 0 \|\| p2 == g2)) \|\| n1 \|\| n2) {
	3158	return 0;
	3159	}
	3160
	3161	/* Successful match. Set output pointers */
	3162	if (pe1) {
	3163	pe1 = (char)p1;
	3164	}
	3165	if (pe2) {
	3166	pe2 = (char)p2;
	3167	}
	3168	return 1;
	3169	}
	3170
	3171	/*
	3172	* Local variables:
	3173	* c-indentation-style: bsd
	3174	* c-basic-offset: 4
	3175	* indent-tabs-mode: t
	3176	* End:
	3177	*
	3178	* ex: set ts=8 sts=4 sw=4 noet:
	3179	*/