perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
	17	*
	18	* 'Well do I understand your speech,' he answered in the same language;
	19	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	20	* as is the custom in the West, if you wish to be answered?'
	21	* --Gandalf, addressing Théoden's door wardens
	22	*
	23	* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	24	*
	25	* ...the travellers perceived that the floor was paved with stones of many
	26	* hues; branching runes and strange devices intertwined beneath their feet.
	27	*
	28	* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	29	*/
	30
	31	#include "EXTERN.h"
	32	#define PERL_IN_UTF8_C
	33	#include "perl.h"
	34
	35	#ifndef EBCDIC
	36	/* Separate prototypes needed because in ASCII systems these are
	37	* usually macros but they still are compiled as code, too. */
	38	PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags);
	39	PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
	40	#endif
	41
	42	static const char unees[] =
	43	"Malformed UTF-8 character (unexpected end of string)";
	44
	45	/*
	46	=head1 Unicode Support
	47
	48	This file contains various utility functions for manipulating UTF8-encoded
	49	strings. For the uninitiated, this is a method of representing arbitrary
	50	Unicode characters as a variable number of bytes, in such a way that
	51	characters in the ASCII range are unmodified, and a zero byte never appears
	52	within non-zero characters.
	53
	54	=cut
	55	*/
	56
	57	/*
	58	=for apidoc is_ascii_string
	59
	60	Returns true if the first C<len> bytes of the given string are the same whether
	61	or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
	62	is, if they are invariant. On ASCII-ish machines, only ASCII characters
	63	fit this definition, hence the function's name.
	64
	65	If C<len> is 0, it will be calculated using C<strlen(s)>.
	66
	67	See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
	68
	69	=cut
	70	*/
	71
	72	bool
	73	Perl_is_ascii_string(const U8 *s, STRLEN len)
	74	{
	75	const U8* const send = s + (len ? len : strlen((const char *)s));
	76	const U8* x = s;
	77
	78	PERL_ARGS_ASSERT_IS_ASCII_STRING;
	79
	80	for (; x < send; ++x) {
	81	if (!UTF8_IS_INVARIANT(*x))
	82	break;
	83	}
	84
	85	return x == send;
	86	}
	87
	88	/*
	89	=for apidoc uvuni_to_utf8_flags
	90
	91	Adds the UTF-8 representation of the code point C<uv> to the end
	92	of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
	93	bytes available. The return value is the pointer to the byte after the
	94	end of the new character. In other words,
	95
	96	d = uvuni_to_utf8_flags(d, uv, flags);
	97
	98	or, in most cases,
	99
	100	d = uvuni_to_utf8(d, uv);
	101
	102	(which is equivalent to)
	103
	104	d = uvuni_to_utf8_flags(d, uv, 0);
	105
	106	This is the recommended Unicode-aware way of saying
	107
	108	*(d++) = uv;
	109
	110	This function will convert to UTF-8 (and not warn) even code points that aren't
	111	legal Unicode or are problematic, unless C<flags> contains one or more of the
	112	following flags.
	113	If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
	114	the function will raise a warning, provided UTF8 warnings are enabled. If instead
	115	UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
	116	If both flags are set, the function will both warn and return NULL.
	117
	118	The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
	119	affect how the function handles a Unicode non-character. And, likewise for the
	120	UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, and code points that are
	121	above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
	122	even less portable) can be warned and/or disallowed even if other above-Unicode
	123	code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
	124	flags.
	125
	126	And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
	127	above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
	128	DISALLOW flags.
	129
	130
	131	=cut
	132	*/
	133
	134	U8 *
	135	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	136	{
	137	PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
	138
	139	if (ckWARN_d(WARN_UTF8)) {
	140	if (UNICODE_IS_SURROGATE(uv)) {
	141	if (flags & UNICODE_WARN_SURROGATE) {
	142	Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
	143	"UTF-16 surrogate U+%04"UVXf, uv);
	144	}
	145	if (flags & UNICODE_DISALLOW_SURROGATE) {
	146	return NULL;
	147	}
	148	}
	149	else if (UNICODE_IS_SUPER(uv)) {
	150	if (flags & UNICODE_WARN_SUPER
	151	\|\| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
	152	{
	153	Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
	154	"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
	155	}
	156	if (flags & UNICODE_DISALLOW_SUPER
	157	\|\| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
	158	{
	159	return NULL;
	160	}
	161	}
	162	else if (UNICODE_IS_NONCHAR(uv)) {
	163	if (flags & UNICODE_WARN_NONCHAR) {
	164	Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
	165	"Unicode non-character U+%04"UVXf" is illegal for open interchange",
	166	uv);
	167	}
	168	if (flags & UNICODE_DISALLOW_NONCHAR) {
	169	return NULL;
	170	}
	171	}
	172	}
	173	if (UNI_IS_INVARIANT(uv)) {
	174	*d++ = (U8)UTF_TO_NATIVE(uv);
	175	return d;
	176	}
	177	#if defined(EBCDIC)
	178	else {
	179	STRLEN len = UNISKIP(uv);
	180	U8 *p = d+len-1;
	181	while (p > d) {
	182	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	183	uv >>= UTF_ACCUMULATION_SHIFT;
	184	}
	185	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	186	return d+len;
	187	}
	188	#else /* Non loop style */
	189	if (uv < 0x800) {
	190	*d++ = (U8)(( uv >> 6) \| 0xc0);
	191	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	192	return d;
	193	}
	194	if (uv < 0x10000) {
	195	*d++ = (U8)(( uv >> 12) \| 0xe0);
	196	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	197	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	198	return d;
	199	}
	200	if (uv < 0x200000) {
	201	*d++ = (U8)(( uv >> 18) \| 0xf0);
	202	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	203	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	204	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	205	return d;
	206	}
	207	if (uv < 0x4000000) {
	208	*d++ = (U8)(( uv >> 24) \| 0xf8);
	209	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	210	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	211	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	212	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	213	return d;
	214	}
	215	if (uv < 0x80000000) {
	216	*d++ = (U8)(( uv >> 30) \| 0xfc);
	217	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	218	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	219	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	220	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	221	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	222	return d;
	223	}
	224	#ifdef HAS_QUAD
	225	if (uv < UTF8_QUAD_MAX)
	226	#endif
	227	{
	228	d++ = 0xfe; / Can't match U+FEFF! */
	229	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	230	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	231	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	232	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	233	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	234	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	235	return d;
	236	}
	237	#ifdef HAS_QUAD
	238	{
	239	d++ = 0xff; / Can't match U+FFFE! */
	240	d++ = 0x80; / 6 Reserved bits */
	241	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	242	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
	243	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
	244	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
	245	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
	246	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	247	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	248	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	249	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	250	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	251	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	252	return d;
	253	}
	254	#endif
	255	#endif /* Loop style */
	256	}
	257
	258	/*
	259
	260	Tests if some arbitrary number of bytes begins in a valid UTF-8
	261	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	262	UTF-8 character. The actual number of bytes in the UTF-8 character
	263	will be returned if it is valid, otherwise 0.
	264
	265	This is the "slow" version as opposed to the "fast" version which is
	266	the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
	267	difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
	268	or less you should use the IS_UTF8_CHAR(), for lengths of five or more
	269	you should use the _slow(). In practice this means that the _slow()
	270	will be used very rarely, since the maximum Unicode code point (as of
	271	Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
	272	the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
	273	five bytes or more.
	274
	275	=cut */
	276	STATIC STRLEN
	277	S_is_utf8_char_slow(const U8 *s, const STRLEN len)
	278	{
	279	U8 u = *s;
	280	STRLEN slen;
	281	UV uv, ouv;
	282
	283	PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
	284
	285	if (UTF8_IS_INVARIANT(u))
	286	return 1;
	287
	288	if (!UTF8_IS_START(u))
	289	return 0;
	290
	291	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	292	return 0;
	293
	294	slen = len - 1;
	295	s++;
	296	#ifdef EBCDIC
	297	u = NATIVE_TO_UTF(u);
	298	#endif
	299	u &= UTF_START_MASK(len);
	300	uv = u;
	301	ouv = uv;
	302	while (slen--) {
	303	if (!UTF8_IS_CONTINUATION(*s))
	304	return 0;
	305	uv = UTF8_ACCUMULATE(uv, *s);
	306	if (uv < ouv)
	307	return 0;
	308	ouv = uv;
	309	s++;
	310	}
	311
	312	if ((STRLEN)UNISKIP(uv) < len)
	313	return 0;
	314
	315	return len;
	316	}
	317
	318	/*
	319	=for apidoc is_utf8_char
	320
	321	Tests if some arbitrary number of bytes begins in a valid UTF-8
	322	character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
	323	character is a valid UTF-8 character. The actual number of bytes in the UTF-8
	324	character will be returned if it is valid, otherwise 0.
	325
	326	=cut */
	327	STRLEN
	328	Perl_is_utf8_char(const U8 *s)
	329	{
	330	const STRLEN len = UTF8SKIP(s);
	331
	332	PERL_ARGS_ASSERT_IS_UTF8_CHAR;
	333	#ifdef IS_UTF8_CHAR
	334	if (IS_UTF8_CHAR_FAST(len))
	335	return IS_UTF8_CHAR(s, len) ? len : 0;
	336	#endif /* #ifdef IS_UTF8_CHAR */
	337	return is_utf8_char_slow(s, len);
	338	}
	339
	340
	341	/*
	342	=for apidoc is_utf8_string
	343
	344	Returns true if first C<len> bytes of the given string form a valid
	345	UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
	346	using C<strlen(s)>. Note that 'a valid UTF-8 string' does not mean 'a
	347	string that contains code points above 0x7F encoded in UTF-8' because a
	348	valid ASCII string is a valid UTF-8 string.
	349
	350	See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
	351
	352	=cut
	353	*/
	354
	355	bool
	356	Perl_is_utf8_string(const U8 *s, STRLEN len)
	357	{
	358	const U8* const send = s + (len ? len : strlen((const char *)s));
	359	const U8* x = s;
	360
	361	PERL_ARGS_ASSERT_IS_UTF8_STRING;
	362
	363	while (x < send) {
	364	STRLEN c;
	365	/* Inline the easy bits of is_utf8_char() here for speed... */
	366	if (UTF8_IS_INVARIANT(*x))
	367	c = 1;
	368	else if (!UTF8_IS_START(*x))
	369	goto out;
	370	else {
	371	/* ... and call is_utf8_char() only if really needed. */
	372	#ifdef IS_UTF8_CHAR
	373	c = UTF8SKIP(x);
	374	if (IS_UTF8_CHAR_FAST(c)) {
	375	if (!IS_UTF8_CHAR(x, c))
	376	c = 0;
	377	}
	378	else
	379	c = is_utf8_char_slow(x, c);
	380	#else
	381	c = is_utf8_char(x);
	382	#endif /* #ifdef IS_UTF8_CHAR */
	383	if (!c)
	384	goto out;
	385	}
	386	x += c;
	387	}
	388
	389	out:
	390	if (x != send)
	391	return FALSE;
	392
	393	return TRUE;
	394	}
	395
	396	/*
	397	Implemented as a macro in utf8.h
	398
	399	=for apidoc is_utf8_string_loc
	400
	401	Like is_utf8_string() but stores the location of the failure (in the
	402	case of "utf8ness failure") or the location s+len (in the case of
	403	"utf8ness success") in the C<ep>.
	404
	405	See also is_utf8_string_loclen() and is_utf8_string().
	406
	407	=for apidoc is_utf8_string_loclen
	408
	409	Like is_utf8_string() but stores the location of the failure (in the
	410	case of "utf8ness failure") or the location s+len (in the case of
	411	"utf8ness success") in the C<ep>, and the number of UTF-8
	412	encoded characters in the C<el>.
	413
	414	See also is_utf8_string_loc() and is_utf8_string().
	415
	416	=cut
	417	*/
	418
	419	bool
	420	Perl_is_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	421	{
	422	const U8* const send = s + (len ? len : strlen((const char *)s));
	423	const U8* x = s;
	424	STRLEN c;
	425	STRLEN outlen = 0;
	426
	427	PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
	428
	429	while (x < send) {
	430	/* Inline the easy bits of is_utf8_char() here for speed... */
	431	if (UTF8_IS_INVARIANT(*x))
	432	c = 1;
	433	else if (!UTF8_IS_START(*x))
	434	goto out;
	435	else {
	436	/* ... and call is_utf8_char() only if really needed. */
	437	#ifdef IS_UTF8_CHAR
	438	c = UTF8SKIP(x);
	439	if (IS_UTF8_CHAR_FAST(c)) {
	440	if (!IS_UTF8_CHAR(x, c))
	441	c = 0;
	442	} else
	443	c = is_utf8_char_slow(x, c);
	444	#else
	445	c = is_utf8_char(x);
	446	#endif /* #ifdef IS_UTF8_CHAR */
	447	if (!c)
	448	goto out;
	449	}
	450	x += c;
	451	outlen++;
	452	}
	453
	454	out:
	455	if (el)
	456	*el = outlen;
	457
	458	if (ep)
	459	*ep = x;
	460	return (x == send);
	461	}
	462
	463	/*
	464
	465	=for apidoc utf8n_to_uvuni
	466
	467	Bottom level UTF-8 decode routine.
	468	Returns the code point value of the first character in the string C<s>
	469	which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding and no longer than
	470	C<curlen> bytes; C<retlen> will be set to the length, in bytes, of that
	471	character.
	472
	473	The value of C<flags> determines the behavior when C<s> does not point to a
	474	well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
	475	C<retlen> is set to the expected length of the UTF-8 character in bytes, zero
	476	is returned, and if UTF-8 warnings haven't been lexically disabled, a warning
	477	is raised.
	478
	479	Various ALLOW flags can be set in C<flags> to allow (and not warn on)
	480	individual types of malformations, such as the sequence being overlong (that
	481	is, when there is a shorter sequence that can express the same code point;
	482	overlong sequences are expressly forbidden in the UTF-8 standard due to
	483	potential security issues). Another malformation example is the first byte of
	484	a character not being a legal first byte. See F<utf8.h> for the list of such
	485	flags. Of course, the value returned by this function under such conditions is
	486	not reliable.
	487
	488	The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
	489	flags) malformation is found. If this flag is set, the routine assumes that
	490	the caller will raise a warning, and this function will silently just set
	491	C<retlen> to C<-1> and return zero.
	492
	493	Certain code points are considered problematic. These are Unicode surrogates,
	494	Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
	495	By default these are considered regular code points, but certain situations
	496	warrant special handling for them. If C<flags> contains
	497	UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
	498	malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE,
	499	UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
	500	maximum) can be set to disallow these categories individually.
	501
	502	The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
	503	UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
	504	for their respective categories, but otherwise the code points are considered
	505	valid (not malformations). To get a category to both be treated as a
	506	malformation and raise a warning, specify both the WARN and DISALLOW flags.
	507	(But note that warnings are not raised if lexically disabled nor if
	508	UTF8_CHECK_ONLY is also specified.)
	509
	510	Very large code points (above 0x7FFF_FFFF) are considered more problematic than
	511	the others that are above the Unicode legal maximum. There are several
	512	reasons, one of which is that the original UTF-8 specification never went above
	513	this number (the current 0x10FFFF limit was imposed later). The UTF-8 encoding
	514	on ASCII platforms for these large code points begins with a byte containing
	515	0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to be treated as
	516	malformations, while allowing smaller above-Unicode code points. (Of course
	517	UTF8_DISALLOW_SUPER will treat all above-Unicode code points, including these,
	518	as malformations.) Similarly, UTF8_WARN_FE_FF acts just like the other WARN
	519	flags, but applies just to these code points.
	520
	521	All other code points corresponding to Unicode characters, including private
	522	use and those yet to be assigned, are never considered malformed and never
	523	warn.
	524
	525	Most code should use utf8_to_uvchr() rather than call this directly.
	526
	527	=cut
	528	*/
	529
	530	UV
	531	Perl_utf8n_to_uvuni(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	532	{
	533	dVAR;
	534	const U8 * const s0 = s;
	535	UV uv = *s, ouv = 0;
	536	STRLEN len = 1;
	537	bool dowarn = ckWARN_d(WARN_UTF8);
	538	const UV startbyte = *s;
	539	STRLEN expectlen = 0;
	540	U32 warning = 0;
	541	SV* sv = NULL;
	542
	543	PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
	544
	545	/* This list is a superset of the UTF8_ALLOW_XXX. */
	546
	547	#define UTF8_WARN_EMPTY 1
	548	#define UTF8_WARN_CONTINUATION 2
	549	#define UTF8_WARN_NON_CONTINUATION 3
	550	#define UTF8_WARN_SHORT 4
	551	#define UTF8_WARN_OVERFLOW 5
	552	#define UTF8_WARN_LONG 6
	553
	554	if (curlen == 0 &&
	555	!(flags & UTF8_ALLOW_EMPTY)) {
	556	warning = UTF8_WARN_EMPTY;
	557	goto malformed;
	558	}
	559
	560	if (UTF8_IS_INVARIANT(uv)) {
	561	if (retlen)
	562	*retlen = 1;
	563	return (UV) (NATIVE_TO_UTF(*s));
	564	}
	565
	566	if (UTF8_IS_CONTINUATION(uv) &&
	567	!(flags & UTF8_ALLOW_CONTINUATION)) {
	568	warning = UTF8_WARN_CONTINUATION;
	569	goto malformed;
	570	}
	571
	572	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	573	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	574	warning = UTF8_WARN_NON_CONTINUATION;
	575	goto malformed;
	576	}
	577
	578	#ifdef EBCDIC
	579	uv = NATIVE_TO_UTF(uv);
	580	#else
	581	if (uv == 0xfe \|\| uv == 0xff) {
	582	if (flags & (UTF8_WARN_SUPER\|UTF8_WARN_FE_FF)) {
	583	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
	584	flags &= ~UTF8_WARN_SUPER; /* Only warn once on this problem */
	585	}
	586	if (flags & (UTF8_DISALLOW_SUPER\|UTF8_DISALLOW_FE_FF)) {
	587	goto malformed;
	588	}
	589	}
	590	#endif
	591
	592	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	593	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	594	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	595	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	596	#ifdef EBCDIC
	597	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	598	else { len = 7; uv &= 0x01; }
	599	#else
	600	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	601	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	602	else { len = 13; uv = 0; } /* whoa! */
	603	#endif
	604
	605	if (retlen)
	606	*retlen = len;
	607
	608	expectlen = len;
	609
	610	if ((curlen < expectlen) &&
	611	!(flags & UTF8_ALLOW_SHORT)) {
	612	warning = UTF8_WARN_SHORT;
	613	goto malformed;
	614	}
	615
	616	len--;
	617	s++;
	618	ouv = uv; /* ouv is the value from the previous iteration */
	619
	620	while (len--) {
	621	if (!UTF8_IS_CONTINUATION(*s) &&
	622	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	623	s--;
	624	warning = UTF8_WARN_NON_CONTINUATION;
	625	goto malformed;
	626	}
	627	else
	628	uv = UTF8_ACCUMULATE(uv, *s);
	629	if (!(uv > ouv)) { /* If the value didn't grow from the previous
	630	iteration, something is horribly wrong */
	631	/* These cannot be allowed. */
	632	if (uv == ouv) {
	633	if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
	634	warning = UTF8_WARN_LONG;
	635	goto malformed;
	636	}
	637	}
	638	else { /* uv < ouv */
	639	/* This cannot be allowed. */
	640	warning = UTF8_WARN_OVERFLOW;
	641	goto malformed;
	642	}
	643	}
	644	s++;
	645	ouv = uv;
	646	}
	647
	648	if ((expectlen > (STRLEN)UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) {
	649	warning = UTF8_WARN_LONG;
	650	goto malformed;
	651	} else if (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE\|UTF8_WARN_ILLEGAL_INTERCHANGE)) {
	652	if (UNICODE_IS_SURROGATE(uv)) {
	653	if ((flags & (UTF8_WARN_SURROGATE\|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE) {
	654	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
	655	}
	656	if (flags & UTF8_DISALLOW_SURROGATE) {
	657	goto disallowed;
	658	}
	659	}
	660	else if (UNICODE_IS_NONCHAR(uv)) {
	661	if ((flags & (UTF8_WARN_NONCHAR\|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR ) {
	662	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
	663	}
	664	if (flags & UTF8_DISALLOW_NONCHAR) {
	665	goto disallowed;
	666	}
	667	}
	668	else if ((uv > PERL_UNICODE_MAX)) {
	669	if ((flags & (UTF8_WARN_SUPER\|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER) {
	670	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
	671	}
	672	if (flags & UTF8_DISALLOW_SUPER) {
	673	goto disallowed;
	674	}
	675	}
	676
	677	/* Here, this is not considered a malformed character, so drop through
	678	* to return it */
	679	}
	680
	681	return uv;
	682
	683	disallowed: /* Is disallowed, but otherwise not malformed. 'sv' will have been
	684	set if there is to be a warning. */
	685	if (!sv) {
	686	dowarn = 0;
	687	}
	688
	689	malformed:
	690
	691	if (flags & UTF8_CHECK_ONLY) {
	692	if (retlen)
	693	*retlen = ((STRLEN) -1);
	694	return 0;
	695	}
	696
	697	if (dowarn) {
	698	if (! sv) {
	699	sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
	700	}
	701
	702	switch (warning) {
	703	case 0: /* Intentionally empty. */ break;
	704	case UTF8_WARN_EMPTY:
	705	sv_catpvs(sv, "(empty string)");
	706	break;
	707	case UTF8_WARN_CONTINUATION:
	708	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	709	break;
	710	case UTF8_WARN_NON_CONTINUATION:
	711	if (s == s0)
	712	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	713	(UV)s[1], startbyte);
	714	else {
	715	const int len = (int)(s-s0);
	716	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	717	(UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
	718	}
	719
	720	break;
	721	case UTF8_WARN_SHORT:
	722	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	723	(int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
	724	expectlen = curlen; /* distance for caller to skip */
	725	break;
	726	case UTF8_WARN_OVERFLOW:
	727	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	728	ouv, *s, startbyte);
	729	break;
	730	case UTF8_WARN_LONG:
	731	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	732	(int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	733	break;
	734	default:
	735	sv_catpvs(sv, "(unknown reason)");
	736	break;
	737	}
	738
	739	if (sv) {
	740	const char * const s = SvPVX_const(sv);
	741
	742	if (PL_op)
	743	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	744	"%s in %s", s, OP_DESC(PL_op));
	745	else
	746	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
	747	}
	748	}
	749
	750	if (retlen)
	751	*retlen = expectlen ? expectlen : len;
	752
	753	return 0;
	754	}
	755
	756	/*
	757	=for apidoc utf8_to_uvchr
	758
	759	Returns the native code point of the first character in the string C<s>
	760	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	761	length, in bytes, of that character.
	762
	763	If C<s> does not point to a well-formed UTF-8 character, zero is
	764	returned and retlen is set, if possible, to -1.
	765
	766	=cut
	767	*/
	768
	769
	770	UV
	771	Perl_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
	772	{
	773	PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
	774
	775	return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
	776	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	777	}
	778
	779	/*
	780	=for apidoc utf8_to_uvuni
	781
	782	Returns the Unicode code point of the first character in the string C<s>
	783	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	784	length, in bytes, of that character.
	785
	786	This function should only be used when the returned UV is considered
	787	an index into the Unicode semantic tables (e.g. swashes).
	788
	789	If C<s> does not point to a well-formed UTF-8 character, zero is
	790	returned and retlen is set, if possible, to -1.
	791
	792	=cut
	793	*/
	794
	795	UV
	796	Perl_utf8_to_uvuni(pTHX_ const U8 s, STRLEN retlen)
	797	{
	798	PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
	799
	800	/* Call the low level routine asking for checks */
	801	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
	802	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	803	}
	804
	805	/*
	806	=for apidoc utf8_length
	807
	808	Return the length of the UTF-8 char encoded string C<s> in characters.
	809	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	810	up past C<e>, croaks.
	811
	812	=cut
	813	*/
	814
	815	STRLEN
	816	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	817	{
	818	dVAR;
	819	STRLEN len = 0;
	820
	821	PERL_ARGS_ASSERT_UTF8_LENGTH;
	822
	823	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	824	* the bitops (especially ~) can create illegal UTF-8.
	825	* In other words: in Perl UTF-8 is not just for Unicode. */
	826
	827	if (e < s)
	828	goto warn_and_return;
	829	while (s < e) {
	830	if (!UTF8_IS_INVARIANT(*s))
	831	s += UTF8SKIP(s);
	832	else
	833	s++;
	834	len++;
	835	}
	836
	837	if (e != s) {
	838	len--;
	839	warn_and_return:
	840	if (PL_op)
	841	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	842	"%s in %s", unees, OP_DESC(PL_op));
	843	else
	844	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	845	}
	846
	847	return len;
	848	}
	849
	850	/*
	851	=for apidoc utf8_distance
	852
	853	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	854	and C<b>.
	855
	856	WARNING: use only if you know that the pointers point inside the
	857	same UTF-8 buffer.
	858
	859	=cut
	860	*/
	861
	862	IV
	863	Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
	864	{
	865	PERL_ARGS_ASSERT_UTF8_DISTANCE;
	866
	867	return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
	868	}
	869
	870	/*
	871	=for apidoc utf8_hop
	872
	873	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	874	forward or backward.
	875
	876	WARNING: do not use the following unless you know C<off> is within
	877	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	878	on the first byte of character or just after the last byte of a character.
	879
	880	=cut
	881	*/
	882
	883	U8 *
	884	Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
	885	{
	886	PERL_ARGS_ASSERT_UTF8_HOP;
	887
	888	PERL_UNUSED_CONTEXT;
	889	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	890	* the bitops (especially ~) can create illegal UTF-8.
	891	* In other words: in Perl UTF-8 is not just for Unicode. */
	892
	893	if (off >= 0) {
	894	while (off--)
	895	s += UTF8SKIP(s);
	896	}
	897	else {
	898	while (off++) {
	899	s--;
	900	while (UTF8_IS_CONTINUATION(*s))
	901	s--;
	902	}
	903	}
	904	return (U8 *)s;
	905	}
	906
	907	/*
	908	=for apidoc bytes_cmp_utf8
	909
	910	Compares the sequence of characters (stored as octets) in b, blen with the
	911	sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
	912	equal, -1 or -2 if the first string is less than the second string, +1 or +2
	913	if the first string is greater than the second string.
	914
	915	-1 or +1 is returned if the shorter string was identical to the start of the
	916	longer string. -2 or +2 is returned if there was a difference between characters
	917	within the strings.
	918
	919	=cut
	920	*/
	921
	922	int
	923	Perl_bytes_cmp_utf8(pTHX_ const U8 b, STRLEN blen, const U8 u, STRLEN ulen)
	924	{
	925	const U8 *const bend = b + blen;
	926	const U8 *const uend = u + ulen;
	927
	928	PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
	929
	930	PERL_UNUSED_CONTEXT;
	931
	932	while (b < bend && u < uend) {
	933	U8 c = *u++;
	934	if (!UTF8_IS_INVARIANT(c)) {
	935	if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	936	if (u < uend) {
	937	U8 c1 = *u++;
	938	if (UTF8_IS_CONTINUATION(c1)) {
	939	c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, c1));
	940	} else {
	941	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	942	"Malformed UTF-8 character "
	943	"(unexpected non-continuation byte 0x%02x"
	944	", immediately after start byte 0x%02x)"
	945	/* Dear diag.t, it's in the pod. */
	946	"%s%s", c1, c,
	947	PL_op ? " in " : "",
	948	PL_op ? OP_DESC(PL_op) : "");
	949	return -2;
	950	}
	951	} else {
	952	if (PL_op)
	953	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	954	"%s in %s", unees, OP_DESC(PL_op));
	955	else
	956	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	957	return -2; /* Really want to return undef :-) */
	958	}
	959	} else {
	960	return -2;
	961	}
	962	}
	963	if (*b != c) {
	964	return *b < c ? -2 : +2;
	965	}
	966	++b;
	967	}
	968
	969	if (b == bend && u == uend)
	970	return 0;
	971
	972	return b < bend ? +1 : -1;
	973	}
	974
	975	/*
	976	=for apidoc utf8_to_bytes
	977
	978	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	979	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	980	updates len to contain the new length.
	981	Returns zero on failure, setting C<len> to -1.
	982
	983	If you need a copy of the string, see C<bytes_from_utf8>.
	984
	985	=cut
	986	*/
	987
	988	U8 *
	989	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	990	{
	991	U8 * const save = s;
	992	U8 * const send = s + *len;
	993	U8 *d;
	994
	995	PERL_ARGS_ASSERT_UTF8_TO_BYTES;
	996
	997	/* ensure valid UTF-8 and chars < 256 before updating string */
	998	while (s < send) {
	999	U8 c = *s++;
	1000
	1001	if (!UTF8_IS_INVARIANT(c) &&
	1002	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	1003	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	1004	*len = ((STRLEN) -1);
	1005	return 0;
	1006	}
	1007	}
	1008
	1009	d = s = save;
	1010	while (s < send) {
	1011	STRLEN ulen;
	1012	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	1013	s += ulen;
	1014	}
	1015	*d = '\0';
	1016	*len = d - save;
	1017	return save;
	1018	}
	1019
	1020	/*
	1021	=for apidoc bytes_from_utf8
	1022
	1023	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	1024	Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	1025	the newly-created string, and updates C<len> to contain the new
	1026	length. Returns the original string if no conversion occurs, C<len>
	1027	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	1028	0 if C<s> is converted or consisted entirely of characters that are invariant
	1029	in utf8 (i.e., US-ASCII on non-EBCDIC machines).
	1030
	1031	=cut
	1032	*/
	1033
	1034	U8 *
	1035	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN len, bool *is_utf8)
	1036	{
	1037	U8 *d;
	1038	const U8 *start = s;
	1039	const U8 *send;
	1040	I32 count = 0;
	1041
	1042	PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
	1043
	1044	PERL_UNUSED_CONTEXT;
	1045	if (!*is_utf8)
	1046	return (U8 *)start;
	1047
	1048	/* ensure valid UTF-8 and chars < 256 before converting string */
	1049	for (send = s + *len; s < send;) {
	1050	U8 c = *s++;
	1051	if (!UTF8_IS_INVARIANT(c)) {
	1052	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	1053	(c = *s++) && UTF8_IS_CONTINUATION(c))
	1054	count++;
	1055	else
	1056	return (U8 *)start;
	1057	}
	1058	}
	1059
	1060	*is_utf8 = FALSE;
	1061
	1062	Newx(d, (*len) - count + 1, U8);
	1063	s = start; start = d;
	1064	while (s < send) {
	1065	U8 c = *s++;
	1066	if (!UTF8_IS_INVARIANT(c)) {
	1067	/* Then it is two-byte encoded */
	1068	c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++));
	1069	}
	1070	*d++ = c;
	1071	}
	1072	*d = '\0';
	1073	*len = d - start;
	1074	return (U8 *)start;
	1075	}
	1076
	1077	/*
	1078	=for apidoc bytes_to_utf8
	1079
	1080	Converts a string C<s> of length C<len> bytes from the native encoding into
	1081	UTF-8.
	1082	Returns a pointer to the newly-created string, and sets C<len> to
	1083	reflect the new length in bytes.
	1084
	1085	A NUL character will be written after the end of the string.
	1086
	1087	If you want to convert to UTF-8 from encodings other than
	1088	the native (Latin1 or EBCDIC),
	1089	see sv_recode_to_utf8().
	1090
	1091	=cut
	1092	*/
	1093
	1094	U8*
	1095	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN len)
	1096	{
	1097	const U8 * const send = s + (*len);
	1098	U8 *d;
	1099	U8 *dst;
	1100
	1101	PERL_ARGS_ASSERT_BYTES_TO_UTF8;
	1102	PERL_UNUSED_CONTEXT;
	1103
	1104	Newx(d, (len) 2 + 1, U8);
	1105	dst = d;
	1106
	1107	while (s < send) {
	1108	const UV uv = NATIVE_TO_ASCII(*s++);
	1109	if (UNI_IS_INVARIANT(uv))
	1110	*d++ = (U8)UTF_TO_NATIVE(uv);
	1111	else {
	1112	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
	1113	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
	1114	}
	1115	}
	1116	*d = '\0';
	1117	*len = d-dst;
	1118	return dst;
	1119	}
	1120
	1121	/*
	1122	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	1123	*
	1124	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	1125	* We optimize for native, for obvious reasons. */
	1126
	1127	U8*
	1128	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1129	{
	1130	U8* pend;
	1131	U8* dstart = d;
	1132
	1133	PERL_ARGS_ASSERT_UTF16_TO_UTF8;
	1134
	1135	if (bytelen & 1)
	1136	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
	1137
	1138	pend = p + bytelen;
	1139
	1140	while (p < pend) {
	1141	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	1142	p += 2;
	1143	if (uv < 0x80) {
	1144	#ifdef EBCDIC
	1145	*d++ = UNI_TO_NATIVE(uv);
	1146	#else
	1147	*d++ = (U8)uv;
	1148	#endif
	1149	continue;
	1150	}
	1151	if (uv < 0x800) {
	1152	*d++ = (U8)(( uv >> 6) \| 0xc0);
	1153	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1154	continue;
	1155	}
	1156	if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */
	1157	if (p >= pend) {
	1158	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1159	} else {
	1160	UV low = (p[0] << 8) + p[1];
	1161	p += 2;
	1162	if (low < 0xdc00 \|\| low > 0xdfff)
	1163	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1164	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	1165	}
	1166	} else if (uv >= 0xdc00 && uv <= 0xdfff) {
	1167	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1168	}
	1169	if (uv < 0x10000) {
	1170	*d++ = (U8)(( uv >> 12) \| 0xe0);
	1171	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1172	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1173	continue;
	1174	}
	1175	else {
	1176	*d++ = (U8)(( uv >> 18) \| 0xf0);
	1177	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	1178	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1179	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1180	continue;
	1181	}
	1182	}
	1183	*newlen = d - dstart;
	1184	return d;
	1185	}
	1186
	1187	/* Note: this one is slightly destructive of the source. */
	1188
	1189	U8*
	1190	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1191	{
	1192	U8* s = (U8*)p;
	1193	U8* const send = s + bytelen;
	1194
	1195	PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
	1196
	1197	if (bytelen & 1)
	1198	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
	1199	(UV)bytelen);
	1200
	1201	while (s < send) {
	1202	const U8 tmp = s[0];
	1203	s[0] = s[1];
	1204	s[1] = tmp;
	1205	s += 2;
	1206	}
	1207	return utf16_to_utf8(p, d, bytelen, newlen);
	1208	}
	1209
	1210	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	1211
	1212	bool
	1213	Perl_is_uni_alnum(pTHX_ UV c)
	1214	{
	1215	U8 tmpbuf[UTF8_MAXBYTES+1];
	1216	uvchr_to_utf8(tmpbuf, c);
	1217	return is_utf8_alnum(tmpbuf);
	1218	}
	1219
	1220	bool
	1221	Perl_is_uni_idfirst(pTHX_ UV c)
	1222	{
	1223	U8 tmpbuf[UTF8_MAXBYTES+1];
	1224	uvchr_to_utf8(tmpbuf, c);
	1225	return is_utf8_idfirst(tmpbuf);
	1226	}
	1227
	1228	bool
	1229	Perl_is_uni_alpha(pTHX_ UV c)
	1230	{
	1231	U8 tmpbuf[UTF8_MAXBYTES+1];
	1232	uvchr_to_utf8(tmpbuf, c);
	1233	return is_utf8_alpha(tmpbuf);
	1234	}
	1235
	1236	bool
	1237	Perl_is_uni_ascii(pTHX_ UV c)
	1238	{
	1239	U8 tmpbuf[UTF8_MAXBYTES+1];
	1240	uvchr_to_utf8(tmpbuf, c);
	1241	return is_utf8_ascii(tmpbuf);
	1242	}
	1243
	1244	bool
	1245	Perl_is_uni_space(pTHX_ UV c)
	1246	{
	1247	U8 tmpbuf[UTF8_MAXBYTES+1];
	1248	uvchr_to_utf8(tmpbuf, c);
	1249	return is_utf8_space(tmpbuf);
	1250	}
	1251
	1252	bool
	1253	Perl_is_uni_digit(pTHX_ UV c)
	1254	{
	1255	U8 tmpbuf[UTF8_MAXBYTES+1];
	1256	uvchr_to_utf8(tmpbuf, c);
	1257	return is_utf8_digit(tmpbuf);
	1258	}
	1259
	1260	bool
	1261	Perl_is_uni_upper(pTHX_ UV c)
	1262	{
	1263	U8 tmpbuf[UTF8_MAXBYTES+1];
	1264	uvchr_to_utf8(tmpbuf, c);
	1265	return is_utf8_upper(tmpbuf);
	1266	}
	1267
	1268	bool
	1269	Perl_is_uni_lower(pTHX_ UV c)
	1270	{
	1271	U8 tmpbuf[UTF8_MAXBYTES+1];
	1272	uvchr_to_utf8(tmpbuf, c);
	1273	return is_utf8_lower(tmpbuf);
	1274	}
	1275
	1276	bool
	1277	Perl_is_uni_cntrl(pTHX_ UV c)
	1278	{
	1279	U8 tmpbuf[UTF8_MAXBYTES+1];
	1280	uvchr_to_utf8(tmpbuf, c);
	1281	return is_utf8_cntrl(tmpbuf);
	1282	}
	1283
	1284	bool
	1285	Perl_is_uni_graph(pTHX_ UV c)
	1286	{
	1287	U8 tmpbuf[UTF8_MAXBYTES+1];
	1288	uvchr_to_utf8(tmpbuf, c);
	1289	return is_utf8_graph(tmpbuf);
	1290	}
	1291
	1292	bool
	1293	Perl_is_uni_print(pTHX_ UV c)
	1294	{
	1295	U8 tmpbuf[UTF8_MAXBYTES+1];
	1296	uvchr_to_utf8(tmpbuf, c);
	1297	return is_utf8_print(tmpbuf);
	1298	}
	1299
	1300	bool
	1301	Perl_is_uni_punct(pTHX_ UV c)
	1302	{
	1303	U8 tmpbuf[UTF8_MAXBYTES+1];
	1304	uvchr_to_utf8(tmpbuf, c);
	1305	return is_utf8_punct(tmpbuf);
	1306	}
	1307
	1308	bool
	1309	Perl_is_uni_xdigit(pTHX_ UV c)
	1310	{
	1311	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1312	uvchr_to_utf8(tmpbuf, c);
	1313	return is_utf8_xdigit(tmpbuf);
	1314	}
	1315
	1316	UV
	1317	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	1318	{
	1319	PERL_ARGS_ASSERT_TO_UNI_UPPER;
	1320
	1321	uvchr_to_utf8(p, c);
	1322	return to_utf8_upper(p, p, lenp);
	1323	}
	1324
	1325	UV
	1326	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	1327	{
	1328	PERL_ARGS_ASSERT_TO_UNI_TITLE;
	1329
	1330	uvchr_to_utf8(p, c);
	1331	return to_utf8_title(p, p, lenp);
	1332	}
	1333
	1334	UV
	1335	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1336	{
	1337	PERL_ARGS_ASSERT_TO_UNI_LOWER;
	1338
	1339	uvchr_to_utf8(p, c);
	1340	return to_utf8_lower(p, p, lenp);
	1341	}
	1342
	1343	UV
	1344	Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
	1345	{
	1346	PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
	1347
	1348	uvchr_to_utf8(p, c);
	1349	return _to_utf8_fold_flags(p, p, lenp, flags);
	1350	}
	1351
	1352	/* for now these all assume no locale info available for Unicode > 255 */
	1353
	1354	bool
	1355	Perl_is_uni_alnum_lc(pTHX_ UV c)
	1356	{
	1357	return is_uni_alnum(c); /* XXX no locale support yet */
	1358	}
	1359
	1360	bool
	1361	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1362	{
	1363	return is_uni_idfirst(c); /* XXX no locale support yet */
	1364	}
	1365
	1366	bool
	1367	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1368	{
	1369	return is_uni_alpha(c); /* XXX no locale support yet */
	1370	}
	1371
	1372	bool
	1373	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1374	{
	1375	return is_uni_ascii(c); /* XXX no locale support yet */
	1376	}
	1377
	1378	bool
	1379	Perl_is_uni_space_lc(pTHX_ UV c)
	1380	{
	1381	return is_uni_space(c); /* XXX no locale support yet */
	1382	}
	1383
	1384	bool
	1385	Perl_is_uni_digit_lc(pTHX_ UV c)
	1386	{
	1387	return is_uni_digit(c); /* XXX no locale support yet */
	1388	}
	1389
	1390	bool
	1391	Perl_is_uni_upper_lc(pTHX_ UV c)
	1392	{
	1393	return is_uni_upper(c); /* XXX no locale support yet */
	1394	}
	1395
	1396	bool
	1397	Perl_is_uni_lower_lc(pTHX_ UV c)
	1398	{
	1399	return is_uni_lower(c); /* XXX no locale support yet */
	1400	}
	1401
	1402	bool
	1403	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1404	{
	1405	return is_uni_cntrl(c); /* XXX no locale support yet */
	1406	}
	1407
	1408	bool
	1409	Perl_is_uni_graph_lc(pTHX_ UV c)
	1410	{
	1411	return is_uni_graph(c); /* XXX no locale support yet */
	1412	}
	1413
	1414	bool
	1415	Perl_is_uni_print_lc(pTHX_ UV c)
	1416	{
	1417	return is_uni_print(c); /* XXX no locale support yet */
	1418	}
	1419
	1420	bool
	1421	Perl_is_uni_punct_lc(pTHX_ UV c)
	1422	{
	1423	return is_uni_punct(c); /* XXX no locale support yet */
	1424	}
	1425
	1426	bool
	1427	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1428	{
	1429	return is_uni_xdigit(c); /* XXX no locale support yet */
	1430	}
	1431
	1432	U32
	1433	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1434	{
	1435	/* XXX returns only the first character -- do not use XXX */
	1436	/* XXX no locale support yet */
	1437	STRLEN len;
	1438	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1439	return (U32)to_uni_upper(c, tmpbuf, &len);
	1440	}
	1441
	1442	U32
	1443	Perl_to_uni_title_lc(pTHX_ U32 c)
	1444	{
	1445	/* XXX returns only the first character XXX -- do not use XXX */
	1446	/* XXX no locale support yet */
	1447	STRLEN len;
	1448	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1449	return (U32)to_uni_title(c, tmpbuf, &len);
	1450	}
	1451
	1452	U32
	1453	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1454	{
	1455	/* XXX returns only the first character -- do not use XXX */
	1456	/* XXX no locale support yet */
	1457	STRLEN len;
	1458	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1459	return (U32)to_uni_lower(c, tmpbuf, &len);
	1460	}
	1461
	1462	static bool
	1463	S_is_utf8_common(pTHX_ const U8 const p, SV *swash,
	1464	const char *const swashname)
	1465	{
	1466	dVAR;
	1467
	1468	PERL_ARGS_ASSERT_IS_UTF8_COMMON;
	1469
	1470	if (!is_utf8_char(p))
	1471	return FALSE;
	1472	if (!*swash)
	1473	*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
	1474	return swash_fetch(*swash, p, TRUE) != 0;
	1475	}
	1476
	1477	bool
	1478	Perl_is_utf8_alnum(pTHX_ const U8 *p)
	1479	{
	1480	dVAR;
	1481
	1482	PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
	1483
	1484	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1485	* descendant of isalnum(3), in other words, it doesn't
	1486	* contain the '_'. --jhi */
	1487	return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
	1488	}
	1489
	1490	bool
	1491	Perl_is_utf8_idfirst(pTHX_ const U8 p) / The naming is historical. */
	1492	{
	1493	dVAR;
	1494
	1495	PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
	1496
	1497	if (*p == '_')
	1498	return TRUE;
	1499	/* is_utf8_idstart would be more logical. */
	1500	return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
	1501	}
	1502
	1503	bool
	1504	Perl_is_utf8_xidfirst(pTHX_ const U8 p) / The naming is historical. */
	1505	{
	1506	dVAR;
	1507
	1508	PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
	1509
	1510	if (*p == '_')
	1511	return TRUE;
	1512	/* is_utf8_idstart would be more logical. */
	1513	return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
	1514	}
	1515
	1516	bool
	1517	Perl_is_utf8_idcont(pTHX_ const U8 *p)
	1518	{
	1519	dVAR;
	1520
	1521	PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
	1522
	1523	if (*p == '_')
	1524	return TRUE;
	1525	return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
	1526	}
	1527
	1528	bool
	1529	Perl_is_utf8_xidcont(pTHX_ const U8 *p)
	1530	{
	1531	dVAR;
	1532
	1533	PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
	1534
	1535	if (*p == '_')
	1536	return TRUE;
	1537	return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
	1538	}
	1539
	1540	bool
	1541	Perl_is_utf8_alpha(pTHX_ const U8 *p)
	1542	{
	1543	dVAR;
	1544
	1545	PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
	1546
	1547	return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
	1548	}
	1549
	1550	bool
	1551	Perl_is_utf8_ascii(pTHX_ const U8 *p)
	1552	{
	1553	dVAR;
	1554
	1555	PERL_ARGS_ASSERT_IS_UTF8_ASCII;
	1556
	1557	return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
	1558	}
	1559
	1560	bool
	1561	Perl_is_utf8_space(pTHX_ const U8 *p)
	1562	{
	1563	dVAR;
	1564
	1565	PERL_ARGS_ASSERT_IS_UTF8_SPACE;
	1566
	1567	return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
	1568	}
	1569
	1570	bool
	1571	Perl_is_utf8_perl_space(pTHX_ const U8 *p)
	1572	{
	1573	dVAR;
	1574
	1575	PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
	1576
	1577	return is_utf8_common(p, &PL_utf8_perl_space, "IsPerlSpace");
	1578	}
	1579
	1580	bool
	1581	Perl_is_utf8_perl_word(pTHX_ const U8 *p)
	1582	{
	1583	dVAR;
	1584
	1585	PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
	1586
	1587	return is_utf8_common(p, &PL_utf8_perl_word, "IsPerlWord");
	1588	}
	1589
	1590	bool
	1591	Perl_is_utf8_digit(pTHX_ const U8 *p)
	1592	{
	1593	dVAR;
	1594
	1595	PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
	1596
	1597	return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
	1598	}
	1599
	1600	bool
	1601	Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
	1602	{
	1603	dVAR;
	1604
	1605	PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
	1606
	1607	return is_utf8_common(p, &PL_utf8_posix_digit, "IsPosixDigit");
	1608	}
	1609
	1610	bool
	1611	Perl_is_utf8_upper(pTHX_ const U8 *p)
	1612	{
	1613	dVAR;
	1614
	1615	PERL_ARGS_ASSERT_IS_UTF8_UPPER;
	1616
	1617	return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
	1618	}
	1619
	1620	bool
	1621	Perl_is_utf8_lower(pTHX_ const U8 *p)
	1622	{
	1623	dVAR;
	1624
	1625	PERL_ARGS_ASSERT_IS_UTF8_LOWER;
	1626
	1627	return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
	1628	}
	1629
	1630	bool
	1631	Perl_is_utf8_cntrl(pTHX_ const U8 *p)
	1632	{
	1633	dVAR;
	1634
	1635	PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
	1636
	1637	return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
	1638	}
	1639
	1640	bool
	1641	Perl_is_utf8_graph(pTHX_ const U8 *p)
	1642	{
	1643	dVAR;
	1644
	1645	PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
	1646
	1647	return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
	1648	}
	1649
	1650	bool
	1651	Perl_is_utf8_print(pTHX_ const U8 *p)
	1652	{
	1653	dVAR;
	1654
	1655	PERL_ARGS_ASSERT_IS_UTF8_PRINT;
	1656
	1657	return is_utf8_common(p, &PL_utf8_print, "IsPrint");
	1658	}
	1659
	1660	bool
	1661	Perl_is_utf8_punct(pTHX_ const U8 *p)
	1662	{
	1663	dVAR;
	1664
	1665	PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
	1666
	1667	return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
	1668	}
	1669
	1670	bool
	1671	Perl_is_utf8_xdigit(pTHX_ const U8 *p)
	1672	{
	1673	dVAR;
	1674
	1675	PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
	1676
	1677	return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
	1678	}
	1679
	1680	bool
	1681	Perl_is_utf8_mark(pTHX_ const U8 *p)
	1682	{
	1683	dVAR;
	1684
	1685	PERL_ARGS_ASSERT_IS_UTF8_MARK;
	1686
	1687	return is_utf8_common(p, &PL_utf8_mark, "IsM");
	1688	}
	1689
	1690	bool
	1691	Perl_is_utf8_X_begin(pTHX_ const U8 *p)
	1692	{
	1693	dVAR;
	1694
	1695	PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
	1696
	1697	return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
	1698	}
	1699
	1700	bool
	1701	Perl_is_utf8_X_extend(pTHX_ const U8 *p)
	1702	{
	1703	dVAR;
	1704
	1705	PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
	1706
	1707	return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
	1708	}
	1709
	1710	bool
	1711	Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
	1712	{
	1713	dVAR;
	1714
	1715	PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
	1716
	1717	return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
	1718	}
	1719
	1720	bool
	1721	Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
	1722	{
	1723	dVAR;
	1724
	1725	PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
	1726
	1727	return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
	1728	}
	1729
	1730	bool
	1731	Perl_is_utf8_X_L(pTHX_ const U8 *p)
	1732	{
	1733	dVAR;
	1734
	1735	PERL_ARGS_ASSERT_IS_UTF8_X_L;
	1736
	1737	return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
	1738	}
	1739
	1740	bool
	1741	Perl_is_utf8_X_LV(pTHX_ const U8 *p)
	1742	{
	1743	dVAR;
	1744
	1745	PERL_ARGS_ASSERT_IS_UTF8_X_LV;
	1746
	1747	return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
	1748	}
	1749
	1750	bool
	1751	Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
	1752	{
	1753	dVAR;
	1754
	1755	PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
	1756
	1757	return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
	1758	}
	1759
	1760	bool
	1761	Perl_is_utf8_X_T(pTHX_ const U8 *p)
	1762	{
	1763	dVAR;
	1764
	1765	PERL_ARGS_ASSERT_IS_UTF8_X_T;
	1766
	1767	return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
	1768	}
	1769
	1770	bool
	1771	Perl_is_utf8_X_V(pTHX_ const U8 *p)
	1772	{
	1773	dVAR;
	1774
	1775	PERL_ARGS_ASSERT_IS_UTF8_X_V;
	1776
	1777	return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
	1778	}
	1779
	1780	bool
	1781	Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
	1782	{
	1783	dVAR;
	1784
	1785	PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
	1786
	1787	return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
	1788	}
	1789
	1790	/*
	1791	=for apidoc to_utf8_case
	1792
	1793	The "p" contains the pointer to the UTF-8 string encoding
	1794	the character that is being converted.
	1795
	1796	The "ustrp" is a pointer to the character buffer to put the
	1797	conversion result to. The "lenp" is a pointer to the length
	1798	of the result.
	1799
	1800	The "swashp" is a pointer to the swash to use.
	1801
	1802	Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
	1803	and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
	1804	but not always, a multicharacter mapping), is tried first.
	1805
	1806	The "special" is a string like "utf8::ToSpecLower", which means the
	1807	hash %utf8::ToSpecLower. The access to the hash is through
	1808	Perl_to_utf8_case().
	1809
	1810	The "normal" is a string like "ToLower" which means the swash
	1811	%utf8::ToLower.
	1812
	1813	=cut */
	1814
	1815	UV
	1816	Perl_to_utf8_case(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp,
	1817	SV *swashp, const char normal, const char *special)
	1818	{
	1819	dVAR;
	1820	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1821	STRLEN len = 0;
	1822	const UV uv0 = utf8_to_uvchr(p, NULL);
	1823	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
	1824	* are necessary in EBCDIC, they are redundant no-ops
	1825	* in ASCII-ish platforms, and hopefully optimized away. */
	1826	const UV uv1 = NATIVE_TO_UNI(uv0);
	1827
	1828	PERL_ARGS_ASSERT_TO_UTF8_CASE;
	1829
	1830	/* Note that swash_fetch() doesn't output warnings for these because it
	1831	* assumes we will */
	1832	if (uv1 >= UNICODE_SURROGATE_FIRST) {
	1833	if (uv1 <= UNICODE_SURROGATE_LAST) {
	1834	if (ckWARN_d(WARN_SURROGATE)) {
	1835	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	1836	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	1837	"Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
	1838	}
	1839	}
	1840	else if (UNICODE_IS_SUPER(uv1)) {
	1841	if (ckWARN_d(WARN_NON_UNICODE)) {
	1842	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	1843	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	1844	"Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
	1845	}
	1846	}
	1847
	1848	/* Note that non-characters are perfectly legal, so no warning should
	1849	* be given */
	1850	}
	1851
	1852	uvuni_to_utf8(tmpbuf, uv1);
	1853
	1854	if (!swashp) / load on-demand */
	1855	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	1856	/* This is the beginnings of a skeleton of code to read the info section
	1857	* that is in all the swashes in case we ever want to do that, so one can
	1858	* read things whose maps aren't code points, and whose default if missing
	1859	* is not to the code point itself. This was just to see if it actually
	1860	* worked. Details on what the possibilities are are in perluniprops.pod
	1861	HV * const hv = get_hv("utf8::SwashInfo", 0);
	1862	if (hv) {
	1863	SV **svp;
	1864	svp = hv_fetch(hv, (const char*)normal, strlen(normal), FALSE);
	1865	const char *s;
	1866
	1867	HV * const this_hash = SvRV(*svp);
	1868	svp = hv_fetch(this_hash, "type", strlen("type"), FALSE);
	1869	s = SvPV_const(*svp, len);
	1870	}
	1871	}*/
	1872
	1873	if (special) {
	1874	/* It might be "special" (sometimes, but not always,
	1875	* a multicharacter mapping) */
	1876	HV * const hv = get_hv(special, 0);
	1877	SV **svp;
	1878
	1879	if (hv &&
	1880	(svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
	1881	(*svp)) {
	1882	const char *s;
	1883
	1884	s = SvPV_const(*svp, len);
	1885	if (len == 1)
	1886	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
	1887	else {
	1888	#ifdef EBCDIC
	1889	/* If we have EBCDIC we need to remap the characters
	1890	* since any characters in the low 256 are Unicode
	1891	* code points, not EBCDIC. */
	1892	U8 t = (U8)s, tend = t + len, d;
	1893
	1894	d = tmpbuf;
	1895	if (SvUTF8(*svp)) {
	1896	STRLEN tlen = 0;
	1897
	1898	while (t < tend) {
	1899	const UV c = utf8_to_uvchr(t, &tlen);
	1900	if (tlen > 0) {
	1901	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	1902	t += tlen;
	1903	}
	1904	else
	1905	break;
	1906	}
	1907	}
	1908	else {
	1909	while (t < tend) {
	1910	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	1911	t++;
	1912	}
	1913	}
	1914	len = d - tmpbuf;
	1915	Copy(tmpbuf, ustrp, len, U8);
	1916	#else
	1917	Copy(s, ustrp, len, U8);
	1918	#endif
	1919	}
	1920	}
	1921	}
	1922
	1923	if (!len && *swashp) {
	1924	const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
	1925
	1926	if (uv2) {
	1927	/* It was "normal" (a single character mapping). */
	1928	const UV uv3 = UNI_TO_NATIVE(uv2);
	1929	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	1930	}
	1931	}
	1932
	1933	if (!len) /* Neither: just copy. In other words, there was no mapping
	1934	defined, which means that the code point maps to itself */
	1935	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
	1936
	1937	if (lenp)
	1938	*lenp = len;
	1939
	1940	return len ? utf8_to_uvchr(ustrp, 0) : 0;
	1941	}
	1942
	1943	/*
	1944	=for apidoc to_utf8_upper
	1945
	1946	Convert the UTF-8 encoded character at p to its uppercase version and
	1947	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1948	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1949	the uppercase version may be longer than the original character.
	1950
	1951	The first character of the uppercased version is returned
	1952	(but note, as explained above, that there may be more.)
	1953
	1954	=cut */
	1955
	1956	UV
	1957	Perl_to_utf8_upper(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1958	{
	1959	dVAR;
	1960
	1961	PERL_ARGS_ASSERT_TO_UTF8_UPPER;
	1962
	1963	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1964	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
	1965	}
	1966
	1967	/*
	1968	=for apidoc to_utf8_title
	1969
	1970	Convert the UTF-8 encoded character at p to its titlecase version and
	1971	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1972	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1973	titlecase version may be longer than the original character.
	1974
	1975	The first character of the titlecased version is returned
	1976	(but note, as explained above, that there may be more.)
	1977
	1978	=cut */
	1979
	1980	UV
	1981	Perl_to_utf8_title(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1982	{
	1983	dVAR;
	1984
	1985	PERL_ARGS_ASSERT_TO_UTF8_TITLE;
	1986
	1987	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1988	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
	1989	}
	1990
	1991	/*
	1992	=for apidoc to_utf8_lower
	1993
	1994	Convert the UTF-8 encoded character at p to its lowercase version and
	1995	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1996	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1997	lowercase version may be longer than the original character.
	1998
	1999	The first character of the lowercased version is returned
	2000	(but note, as explained above, that there may be more.)
	2001
	2002	=cut */
	2003
	2004	UV
	2005	Perl_to_utf8_lower(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	2006	{
	2007	dVAR;
	2008
	2009	PERL_ARGS_ASSERT_TO_UTF8_LOWER;
	2010
	2011	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	2012	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
	2013	}
	2014
	2015	/*
	2016	=for apidoc to_utf8_fold
	2017
	2018	Convert the UTF-8 encoded character at p to its foldcase version and
	2019	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	2020	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	2021	foldcase version may be longer than the original character (up to
	2022	three characters).
	2023
	2024	The first character of the foldcased version is returned
	2025	(but note, as explained above, that there may be more.)
	2026
	2027	=cut */
	2028
	2029	/* Not currently externally documented is 'flags', which currently is non-zero
	2030	* if full case folds are to be used; otherwise simple folds */
	2031
	2032	UV
	2033	Perl__to_utf8_fold_flags(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp, U8 flags)
	2034	{
	2035	const char *specials = (flags) ? "utf8::ToSpecFold" : NULL;
	2036
	2037	dVAR;
	2038
	2039	PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
	2040
	2041	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	2042	&PL_utf8_tofold, "ToFold", specials);
	2043	}
	2044
	2045	/* Note:
	2046	* A "swash" is a swatch hash.
	2047	* A "swatch" is a bit vector generated by utf8.c:S_swash_get().
	2048	* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
	2049	* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
	2050	*/
	2051	SV*
	2052	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
	2053	{
	2054	dVAR;
	2055	SV* retval;
	2056	dSP;
	2057	const size_t pkg_len = strlen(pkg);
	2058	const size_t name_len = strlen(name);
	2059	HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
	2060	SV* errsv_save;
	2061	GV *method;
	2062
	2063	PERL_ARGS_ASSERT_SWASH_INIT;
	2064
	2065	PUSHSTACKi(PERLSI_MAGIC);
	2066	ENTER;
	2067	SAVEHINTS();
	2068	save_re_context();
	2069	method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
	2070	if (!method) { /* demand load utf8 */
	2071	ENTER;
	2072	errsv_save = newSVsv(ERRSV);
	2073	/* It is assumed that callers of this routine are not passing in any
	2074	user derived data. */
	2075	/* Need to do this after save_re_context() as it will set PL_tainted to
	2076	1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
	2077	Even line to create errsv_save can turn on PL_tainted. */
	2078	SAVEBOOL(PL_tainted);
	2079	PL_tainted = 0;
	2080	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	2081	NULL);
	2082	if (!SvTRUE(ERRSV))
	2083	sv_setsv(ERRSV, errsv_save);
	2084	SvREFCNT_dec(errsv_save);
	2085	LEAVE;
	2086	}
	2087	SPAGAIN;
	2088	PUSHMARK(SP);
	2089	EXTEND(SP,5);
	2090	mPUSHp(pkg, pkg_len);
	2091	mPUSHp(name, name_len);
	2092	PUSHs(listsv);
	2093	mPUSHi(minbits);
	2094	mPUSHi(none);
	2095	PUTBACK;
	2096	errsv_save = newSVsv(ERRSV);
	2097	/* If we already have a pointer to the method, no need to use call_method()
	2098	to repeat the lookup. */
	2099	if (method ? call_sv(MUTABLE_SV(method), G_SCALAR)
	2100	: call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR \| G_METHOD))
	2101	retval = newSVsv(*PL_stack_sp--);
	2102	else
	2103	retval = &PL_sv_undef;
	2104	if (!SvTRUE(ERRSV))
	2105	sv_setsv(ERRSV, errsv_save);
	2106	SvREFCNT_dec(errsv_save);
	2107	LEAVE;
	2108	POPSTACK;
	2109	if (IN_PERL_COMPILETIME) {
	2110	CopHINTS_set(PL_curcop, PL_hints);
	2111	}
	2112	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	2113	if (SvPOK(retval))
	2114	Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
	2115	SVfARG(retval));
	2116	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	2117	}
	2118	return retval;
	2119	}
	2120
	2121
	2122	/* This API is wrong for special case conversions since we may need to
	2123	* return several Unicode characters for a single Unicode character
	2124	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	2125	* the lower-level routine, and it is similarly broken for returning
	2126	* multiple values. --jhi
	2127	* For those, you should use to_utf8_case() instead */
	2128	/* Now SWASHGET is recast into S_swash_get in this file. */
	2129
	2130	/* Note:
	2131	* Returns the value of property/mapping C<swash> for the first character
	2132	* of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
	2133	* assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
	2134	* assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
	2135	*/
	2136	UV
	2137	Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
	2138	{
	2139	dVAR;
	2140	HV *const hv = MUTABLE_HV(SvRV(swash));
	2141	U32 klen;
	2142	U32 off;
	2143	STRLEN slen;
	2144	STRLEN needents;
	2145	const U8 *tmps = NULL;
	2146	U32 bit;
	2147	SV *swatch;
	2148	U8 tmputf8[2];
	2149	const UV c = NATIVE_TO_ASCII(*ptr);
	2150
	2151	PERL_ARGS_ASSERT_SWASH_FETCH;
	2152
	2153	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	2154	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
	2155	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
	2156	ptr = tmputf8;
	2157	}
	2158	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	2159	* then the "swatch" is a vec() for all the chars which start
	2160	* with 0xAA..0xYY
	2161	* So the key in the hash (klen) is length of encoded char -1
	2162	*/
	2163	klen = UTF8SKIP(ptr) - 1;
	2164	off = ptr[klen];
	2165
	2166	if (klen == 0) {
	2167	/* If char is invariant then swatch is for all the invariant chars
	2168	* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
	2169	*/
	2170	needents = UTF_CONTINUATION_MARK;
	2171	off = NATIVE_TO_UTF(ptr[klen]);
	2172	}
	2173	else {
	2174	/* If char is encoded then swatch is for the prefix */
	2175	needents = (1 << UTF_ACCUMULATION_SHIFT);
	2176	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	2177	if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_NON_UNICODE)) {
	2178	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
	2179
	2180	/* This outputs warnings for binary properties only, assuming that
	2181	* to_utf8_case() will output any. Also, surrogates aren't checked
	2182	* for, as that would warn on things like /\p{Gc=Cs}/ */
	2183	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2184	if (SvUV(*bitssvp) == 1) {
	2185	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	2186	"Code point 0x%04"UVXf" is not Unicode, no properties match it; all inverse properties do", code_point);
	2187	}
	2188	}
	2189	}
	2190
	2191	/*
	2192	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	2193	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	2194	* it's nothing to sniff at.) Pity we usually come through at least
	2195	* two function calls to get here...
	2196	*
	2197	* NB: this code assumes that swatches are never modified, once generated!
	2198	*/
	2199
	2200	if (hv == PL_last_swash_hv &&
	2201	klen == PL_last_swash_klen &&
	2202	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	2203	{
	2204	tmps = PL_last_swash_tmps;
	2205	slen = PL_last_swash_slen;
	2206	}
	2207	else {
	2208	/* Try our second-level swatch cache, kept in a hash. */
	2209	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	2210
	2211	/* If not cached, generate it via swash_get */
	2212	if (!svp \|\| !SvPOK(*svp)
	2213	\|\| !(tmps = (const U8)SvPV_const(svp, slen))) {
	2214	/* We use utf8n_to_uvuni() as we want an index into
	2215	Unicode tables, not a native character number.
	2216	*/
	2217	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
	2218	ckWARN(WARN_UTF8) ?
	2219	0 : UTF8_ALLOW_ANY);
	2220	swatch = swash_get(swash,
	2221	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	2222	(klen) ? (code_point & ~(needents - 1)) : 0,
	2223	needents);
	2224
	2225	if (IN_PERL_COMPILETIME)
	2226	CopHINTS_set(PL_curcop, PL_hints);
	2227
	2228	svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
	2229
	2230	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
	2231	\|\| (slen << 3) < needents)
	2232	Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
	2233	}
	2234
	2235	PL_last_swash_hv = hv;
	2236	assert(klen <= sizeof(PL_last_swash_key));
	2237	PL_last_swash_klen = (U8)klen;
	2238	/* FIXME change interpvar.h? */
	2239	PL_last_swash_tmps = (U8 *) tmps;
	2240	PL_last_swash_slen = slen;
	2241	if (klen)
	2242	Copy(ptr, PL_last_swash_key, klen, U8);
	2243	}
	2244
	2245	switch ((int)((slen << 3) / needents)) {
	2246	case 1:
	2247	bit = 1 << (off & 7);
	2248	off >>= 3;
	2249	return (tmps[off] & bit) != 0;
	2250	case 8:
	2251	return tmps[off];
	2252	case 16:
	2253	off <<= 1;
	2254	return (tmps[off] << 8) + tmps[off + 1] ;
	2255	case 32:
	2256	off <<= 2;
	2257	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	2258	}
	2259	Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
	2260	NORETURN_FUNCTION_END;
	2261	}
	2262
	2263	/* Read a single line of the main body of the swash input text. These are of
	2264	* the form:
	2265	* 0053 0056 0073
	2266	* where each number is hex. The first two numbers form the minimum and
	2267	* maximum of a range, and the third is the value associated with the range.
	2268	* Not all swashes should have a third number
	2269	*
	2270	* On input: l points to the beginning of the line to be examined; it points
	2271	* to somewhere in the string of the whole input text, and is
	2272	* terminated by a \n or the null string terminator.
	2273	* lend points to the null terminator of that string
	2274	* wants_value is non-zero if the swash expects a third number
	2275	* typestr is the name of the swash's mapping, like 'ToLower'
	2276	* On output: min, max, and *val are set to the values read from the line.
	2277	* returns a pointer just beyond the line examined. If there was no
	2278	* valid min number on the line, returns lend+1
	2279	*/
	2280
	2281	STATIC U8*
	2282	S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
	2283	const bool wants_value, const U8* const typestr)
	2284	{
	2285	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
	2286	STRLEN numlen; /* Length of the number */
	2287	I32 flags = PERL_SCAN_SILENT_ILLDIGIT \| PERL_SCAN_DISALLOW_PREFIX;
	2288
	2289	/* nl points to the next \n in the scan */
	2290	U8* const nl = (U8*)memchr(l, '\n', lend - l);
	2291
	2292	/* Get the first number on the line: the range minimum */
	2293	numlen = lend - l;
	2294	min = grok_hex((char )l, &numlen, &flags, NULL);
	2295	if (numlen) /* If found a hex number, position past it */
	2296	l += numlen;
	2297	else if (nl) { /* Else, go handle next line, if any */
	2298	return nl + 1; /* 1 is length of "\n" */
	2299	}
	2300	else { /* Else, no next line */
	2301	return lend + 1; /* to LIST's end at which \n is not found */
	2302	}
	2303
	2304	/* The max range value follows, separated by a BLANK */
	2305	if (isBLANK(*l)) {
	2306	++l;
	2307	flags = PERL_SCAN_SILENT_ILLDIGIT \| PERL_SCAN_DISALLOW_PREFIX;
	2308	numlen = lend - l;
	2309	max = grok_hex((char )l, &numlen, &flags, NULL);
	2310	if (numlen)
	2311	l += numlen;
	2312	else /* If no value here, it is a single element range */
	2313	max = min;
	2314
	2315	/* Non-binary tables have a third entry: what the first element of the
	2316	* range maps to */
	2317	if (wants_value) {
	2318	if (isBLANK(*l)) {
	2319	++l;
	2320	flags = PERL_SCAN_SILENT_ILLDIGIT \|
	2321	PERL_SCAN_DISALLOW_PREFIX;
	2322	numlen = lend - l;
	2323	val = grok_hex((char )l, &numlen, &flags, NULL);
	2324	if (numlen)
	2325	l += numlen;
	2326	else
	2327	*val = 0;
	2328	}
	2329	else {
	2330	*val = 0;
	2331	if (typeto) {
	2332	Perl_croak(aTHX_ "%s: illegal mapping '%s'",
	2333	typestr, l);
	2334	}
	2335	}
	2336	}
	2337	else
	2338	val = 0; / bits == 1, then any val should be ignored */
	2339	}
	2340	else { /* Nothing following range min, should be single element with no
	2341	mapping expected */
	2342	max = min;
	2343	if (wants_value) {
	2344	*val = 0;
	2345	if (typeto) {
	2346	Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
	2347	}
	2348	}
	2349	else
	2350	val = 0; / bits == 1, then val should be ignored */
	2351	}
	2352
	2353	/* Position to next line if any, or EOF */
	2354	if (nl)
	2355	l = nl + 1;
	2356	else
	2357	l = lend;
	2358
	2359	return l;
	2360	}
	2361
	2362	/* Note:
	2363	* Returns a swatch (a bit vector string) for a code point sequence
	2364	* that starts from the value C<start> and comprises the number C<span>.
	2365	* A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
	2366	* Should be used via swash_fetch, which will cache the swatch in C<swash>.
	2367	*/
	2368	STATIC SV*
	2369	S_swash_get(pTHX_ SV* swash, UV start, UV span)
	2370	{
	2371	SV *swatch;
	2372	U8 l, lend, x, xend, *s;
	2373	STRLEN lcur, xcur, scur;
	2374	HV *const hv = MUTABLE_HV(SvRV(swash));
	2375
	2376	/* The string containing the main body of the table */
	2377	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	2378
	2379	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2380	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2381	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	2382	SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	2383	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	2384	const STRLEN bits = SvUV(*bitssvp);
	2385	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2386	const UV none = SvUV(*nonesvp);
	2387	const UV end = start + span;
	2388
	2389	PERL_ARGS_ASSERT_SWASH_GET;
	2390
	2391	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	2392	Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
	2393	(UV)bits);
	2394	}
	2395
	2396	/* create and initialize $swatch */
	2397	scur = octets ? (span * octets) : (span + 7) / 8;
	2398	swatch = newSV(scur);
	2399	SvPOK_on(swatch);
	2400	s = (U8*)SvPVX(swatch);
	2401	if (octets && none) {
	2402	const U8* const e = s + scur;
	2403	while (s < e) {
	2404	if (bits == 8)
	2405	*s++ = (U8)(none & 0xff);
	2406	else if (bits == 16) {
	2407	*s++ = (U8)((none >> 8) & 0xff);
	2408	*s++ = (U8)( none & 0xff);
	2409	}
	2410	else if (bits == 32) {
	2411	*s++ = (U8)((none >> 24) & 0xff);
	2412	*s++ = (U8)((none >> 16) & 0xff);
	2413	*s++ = (U8)((none >> 8) & 0xff);
	2414	*s++ = (U8)( none & 0xff);
	2415	}
	2416	}
	2417	*s = '\0';
	2418	}
	2419	else {
	2420	(void)memzero((U8*)s, scur + 1);
	2421	}
	2422	SvCUR_set(swatch, scur);
	2423	s = (U8*)SvPVX(swatch);
	2424
	2425	/* read $swash->{LIST} */
	2426	l = (U8)SvPV(listsvp, lcur);
	2427	lend = l + lcur;
	2428	while (l < lend) {
	2429	UV min, max, val;
	2430	l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
	2431	cBOOL(octets), typestr);
	2432	if (l > lend) {
	2433	break;
	2434	}
	2435
	2436	/* If looking for something beyond this range, go try the next one */
	2437	if (max < start)
	2438	continue;
	2439
	2440	if (octets) {
	2441	UV key;
	2442	if (min < start) {
	2443	if (!none \|\| val < none) {
	2444	val += start - min;
	2445	}
	2446	min = start;
	2447	}
	2448	for (key = min; key <= max; key++) {
	2449	STRLEN offset;
	2450	if (key >= end)
	2451	goto go_out_list;
	2452	/* offset must be non-negative (start <= min <= key < end) */
	2453	offset = octets * (key - start);
	2454	if (bits == 8)
	2455	s[offset] = (U8)(val & 0xff);
	2456	else if (bits == 16) {
	2457	s[offset ] = (U8)((val >> 8) & 0xff);
	2458	s[offset + 1] = (U8)( val & 0xff);
	2459	}
	2460	else if (bits == 32) {
	2461	s[offset ] = (U8)((val >> 24) & 0xff);
	2462	s[offset + 1] = (U8)((val >> 16) & 0xff);
	2463	s[offset + 2] = (U8)((val >> 8) & 0xff);
	2464	s[offset + 3] = (U8)( val & 0xff);
	2465	}
	2466
	2467	if (!none \|\| val < none)
	2468	++val;
	2469	}
	2470	}
	2471	else { /* bits == 1, then val should be ignored */
	2472	UV key;
	2473	if (min < start)
	2474	min = start;
	2475	for (key = min; key <= max; key++) {
	2476	const STRLEN offset = (STRLEN)(key - start);
	2477	if (key >= end)
	2478	goto go_out_list;
	2479	s[offset >> 3] \|= 1 << (offset & 7);
	2480	}
	2481	}
	2482	} /* while */
	2483	go_out_list:
	2484
	2485	/* read $swash->{EXTRAS} */
	2486	x = (U8)SvPV(extssvp, xcur);
	2487	xend = x + xcur;
	2488	while (x < xend) {
	2489	STRLEN namelen;
	2490	U8 *namestr;
	2491	SV** othersvp;
	2492	HV* otherhv;
	2493	STRLEN otherbits;
	2494	SV *otherbitssvp, other;
	2495	U8 s, o, *nl;
	2496	STRLEN slen, olen;
	2497
	2498	const U8 opc = *x++;
	2499	if (opc == '\n')
	2500	continue;
	2501
	2502	nl = (U8*)memchr(x, '\n', xend - x);
	2503
	2504	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	2505	if (nl) {
	2506	x = nl + 1; /* 1 is length of "\n" */
	2507	continue;
	2508	}
	2509	else {
	2510	x = xend; /* to EXTRAS' end at which \n is not found */
	2511	break;
	2512	}
	2513	}
	2514
	2515	namestr = x;
	2516	if (nl) {
	2517	namelen = nl - namestr;
	2518	x = nl + 1;
	2519	}
	2520	else {
	2521	namelen = xend - namestr;
	2522	x = xend;
	2523	}
	2524
	2525	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	2526	otherhv = MUTABLE_HV(SvRV(*othersvp));
	2527	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	2528	otherbits = (STRLEN)SvUV(*otherbitssvp);
	2529	if (bits < otherbits)
	2530	Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
	2531
	2532	/* The "other" swatch must be destroyed after. */
	2533	other = swash_get(*othersvp, start, span);
	2534	o = (U8*)SvPV(other, olen);
	2535
	2536	if (!olen)
	2537	Perl_croak(aTHX_ "panic: swash_get got improper swatch");
	2538
	2539	s = (U8*)SvPV(swatch, slen);
	2540	if (bits == 1 && otherbits == 1) {
	2541	if (slen != olen)
	2542	Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
	2543
	2544	switch (opc) {
	2545	case '+':
	2546	while (slen--)
	2547	s++ \|= o++;
	2548	break;
	2549	case '!':
	2550	while (slen--)
	2551	s++ \|= ~o++;
	2552	break;
	2553	case '-':
	2554	while (slen--)
	2555	s++ &= ~o++;
	2556	break;
	2557	case '&':
	2558	while (slen--)
	2559	s++ &= o++;
	2560	break;
	2561	default:
	2562	break;
	2563	}
	2564	}
	2565	else {
	2566	STRLEN otheroctets = otherbits >> 3;
	2567	STRLEN offset = 0;
	2568	U8* const send = s + slen;
	2569
	2570	while (s < send) {
	2571	UV otherval = 0;
	2572
	2573	if (otherbits == 1) {
	2574	otherval = (o[offset >> 3] >> (offset & 7)) & 1;
	2575	++offset;
	2576	}
	2577	else {
	2578	STRLEN vlen = otheroctets;
	2579	otherval = *o++;
	2580	while (--vlen) {
	2581	otherval <<= 8;
	2582	otherval \|= *o++;
	2583	}
	2584	}
	2585
	2586	if (opc == '+' && otherval)
	2587	NOOP; /* replace with otherval */
	2588	else if (opc == '!' && !otherval)
	2589	otherval = 1;
	2590	else if (opc == '-' && otherval)
	2591	otherval = 0;
	2592	else if (opc == '&' && !otherval)
	2593	otherval = 0;
	2594	else {
	2595	s += octets; /* no replacement */
	2596	continue;
	2597	}
	2598
	2599	if (bits == 8)
	2600	*s++ = (U8)( otherval & 0xff);
	2601	else if (bits == 16) {
	2602	*s++ = (U8)((otherval >> 8) & 0xff);
	2603	*s++ = (U8)( otherval & 0xff);
	2604	}
	2605	else if (bits == 32) {
	2606	*s++ = (U8)((otherval >> 24) & 0xff);
	2607	*s++ = (U8)((otherval >> 16) & 0xff);
	2608	*s++ = (U8)((otherval >> 8) & 0xff);
	2609	*s++ = (U8)( otherval & 0xff);
	2610	}
	2611	}
	2612	}
	2613	sv_free(other); /* through with it! */
	2614	} /* while */
	2615	return swatch;
	2616	}
	2617
	2618	HV*
	2619	Perl__swash_inversion_hash(pTHX_ SV* const swash)
	2620	{
	2621
	2622	/* Subject to change or removal. For use only in one place in regexec.c
	2623	*
	2624	* Returns a hash which is the inversion and closure of a swash mapping.
	2625	* For example, consider the input lines:
	2626	* 004B 006B
	2627	* 004C 006C
	2628	* 212A 006B
	2629	*
	2630	* The returned hash would have two keys, the utf8 for 006B and the utf8 for
	2631	* 006C. The value for each key is an array. For 006C, the array would
	2632	* have a two elements, the utf8 for itself, and for 004C. For 006B, there
	2633	* would be three elements in its array, the utf8 for 006B, 004B and 212A.
	2634	*
	2635	* Essentially, for any code point, it gives all the code points that map to
	2636	* it, or the list of 'froms' for that point.
	2637	*
	2638	* Currently it only looks at the main body of the swash, and ignores any
	2639	* additions or deletions from other swashes */
	2640
	2641	U8 l, lend;
	2642	STRLEN lcur;
	2643	HV *const hv = MUTABLE_HV(SvRV(swash));
	2644
	2645	/* The string containing the main body of the table */
	2646	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	2647
	2648	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2649	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2650	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	2651	/SV* const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
	2652	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	2653	const STRLEN bits = SvUV(*bitssvp);
	2654	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2655	const UV none = SvUV(*nonesvp);
	2656
	2657	HV* ret = newHV();
	2658
	2659	PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
	2660
	2661	/* Must have at least 8 bits to get the mappings */
	2662	if (bits != 8 && bits != 16 && bits != 32) {
	2663	Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
	2664	(UV)bits);
	2665	}
	2666
	2667	/* read $swash->{LIST} */
	2668	l = (U8)SvPV(listsvp, lcur);
	2669	lend = l + lcur;
	2670
	2671	/* Go through each input line */
	2672	while (l < lend) {
	2673	UV min, max, val;
	2674	UV inverse;
	2675	l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
	2676	cBOOL(octets), typestr);
	2677	if (l > lend) {
	2678	break;
	2679	}
	2680
	2681	/* Each element in the range is to be inverted */
	2682	for (inverse = min; inverse <= max; inverse++) {
	2683	AV* list;
	2684	SV* element;
	2685	SV** listp;
	2686	IV i;
	2687	bool found_key = FALSE;
	2688
	2689	/* The key is the inverse mapping */
	2690	char key[UTF8_MAXBYTES+1];
	2691	char* key_end = (char ) uvuni_to_utf8((U8) key, val);
	2692	STRLEN key_len = key_end - key;
	2693
	2694	/* Get the list for the map */
	2695	if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
	2696	list = (AV) listp;
	2697	}
	2698	else { /* No entry yet for it: create one */
	2699	list = newAV();
	2700	if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
	2701	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	2702	}
	2703	}
	2704
	2705	for (i = 0; i < av_len(list); i++) {
	2706	SV** entryp = av_fetch(list, i, FALSE);
	2707	SV* entry;
	2708	if (entryp == NULL) {
	2709	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	2710	}
	2711	entry = *entryp;
	2712	if (SvUV(entry) == val) {
	2713	found_key = TRUE;
	2714	break;
	2715	}
	2716	}
	2717
	2718	/* Make sure there is a mapping to itself on the list */
	2719	if (! found_key) {
	2720	element = newSVuv(val);
	2721	av_push(list, element);
	2722	}
	2723
	2724
	2725	/* Simply add the value to the list */
	2726	element = newSVuv(inverse);
	2727	av_push(list, element);
	2728
	2729	/* swash_get() increments the value of val for each element in the
	2730	* range. That makes more compact tables possible. You can
	2731	* express the capitalization, for example, of all consecutive
	2732	* letters with a single line: 0061\t007A\t0041 This maps 0061 to
	2733	* 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
	2734	* and it's not documented, and perhaps not even currently used,
	2735	* but I copied the semantics from swash_get(), just in case */
	2736	if (!none \|\| val < none) {
	2737	++val;
	2738	}
	2739	}
	2740	}
	2741
	2742	return ret;
	2743	}
	2744
	2745	HV*
	2746	Perl__swash_to_invlist(pTHX_ SV* const swash)
	2747	{
	2748
	2749	/* Subject to change or removal. For use only in one place in regcomp.c */
	2750
	2751	U8 l, lend;
	2752	char *loc;
	2753	STRLEN lcur;
	2754	HV *const hv = MUTABLE_HV(SvRV(swash));
	2755	UV elements = 0; /* Number of elements in the inversion list */
	2756	U8 empty[] = "";
	2757
	2758	/* The string containing the main body of the table */
	2759	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	2760	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2761	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2762
	2763	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	2764	const STRLEN bits = SvUV(*bitssvp);
	2765	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2766
	2767	HV* invlist;
	2768
	2769	PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
	2770
	2771	/* read $swash->{LIST} */
	2772	if (SvPOK(*listsvp)) {
	2773	l = (U8)SvPV(listsvp, lcur);
	2774	}
	2775	else {
	2776	/* LIST legitimately doesn't contain a string during compilation phases
	2777	* of Perl itself, before the Unicode tables are generated. In this
	2778	* case, just fake things up by creating an empty list */
	2779	l = empty;
	2780	lcur = 0;
	2781	}
	2782	loc = (char *) l;
	2783	lend = l + lcur;
	2784
	2785	/* Scan the input to count the number of lines to preallocate array size
	2786	* based on worst possible case, which is each line in the input creates 2
	2787	* elements in the inversion list: 1) the beginning of a range in the list;
	2788	* 2) the beginning of a range not in the list. */
	2789	while ((loc = (strchr(loc, '\n'))) != NULL) {
	2790	elements += 2;
	2791	loc++;
	2792	}
	2793
	2794	/* If the ending is somehow corrupt and isn't a new line, add another
	2795	* element for the final range that isn't in the inversion list */
	2796	if (! (lend == '\n' \|\| (lend == '\0' && *(lend - 1) == '\n'))) {
	2797	elements++;
	2798	}
	2799
	2800	invlist = _new_invlist(elements);
	2801
	2802	/* Now go through the input again, adding each range to the list */
	2803	while (l < lend) {
	2804	UV start, end;
	2805	UV val; /* Not used by this function */
	2806
	2807	l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
	2808	cBOOL(octets), typestr);
	2809
	2810	if (l > lend) {
	2811	break;
	2812	}
	2813
	2814	_append_range_to_invlist(invlist, start, end);
	2815	}
	2816
	2817	return invlist;
	2818	}
	2819
	2820	/*
	2821	=for apidoc uvchr_to_utf8
	2822
	2823	Adds the UTF-8 representation of the Native code point C<uv> to the end
	2824	of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
	2825	bytes available. The return value is the pointer to the byte after the
	2826	end of the new character. In other words,
	2827
	2828	d = uvchr_to_utf8(d, uv);
	2829
	2830	is the recommended wide native character-aware way of saying
	2831
	2832	*(d++) = uv;
	2833
	2834	=cut
	2835	*/
	2836
	2837	/* On ASCII machines this is normally a macro but we want a
	2838	real function in case XS code wants it
	2839	*/
	2840	U8 *
	2841	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	2842	{
	2843	PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
	2844
	2845	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
	2846	}
	2847
	2848	U8 *
	2849	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	2850	{
	2851	PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
	2852
	2853	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
	2854	}
	2855
	2856	/*
	2857	=for apidoc utf8n_to_uvchr
	2858
	2859	Returns the native character value of the first character in the string
	2860	C<s>
	2861	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	2862	length, in bytes, of that character.
	2863
	2864	length and flags are the same as utf8n_to_uvuni().
	2865
	2866	=cut
	2867	*/
	2868	/* On ASCII machines this is normally a macro but we want
	2869	a real function in case XS code wants it
	2870	*/
	2871	UV
	2872	Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen,
	2873	U32 flags)
	2874	{
	2875	const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	2876
	2877	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
	2878
	2879	return UNI_TO_NATIVE(uv);
	2880	}
	2881
	2882	bool
	2883	Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
	2884	{
	2885	/* May change: warns if surrogates, non-character code points, or
	2886	* non-Unicode code points are in s which has length len. Returns TRUE if
	2887	* none found; FALSE otherwise. The only other validity check is to make
	2888	* sure that this won't exceed the string's length */
	2889
	2890	const U8* const e = s + len;
	2891	bool ok = TRUE;
	2892
	2893	PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
	2894
	2895	while (s < e) {
	2896	if (UTF8SKIP(s) > len) {
	2897	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	2898	"%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
	2899	return FALSE;
	2900	}
	2901	if (*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE) {
	2902	STRLEN char_len;
	2903	if (UTF8_IS_SUPER(s)) {
	2904	if (ckWARN_d(WARN_NON_UNICODE)) {
	2905	UV uv = utf8_to_uvchr(s, &char_len);
	2906	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	2907	"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
	2908	ok = FALSE;
	2909	}
	2910	}
	2911	else if (UTF8_IS_SURROGATE(s)) {
	2912	if (ckWARN_d(WARN_SURROGATE)) {
	2913	UV uv = utf8_to_uvchr(s, &char_len);
	2914	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	2915	"Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
	2916	ok = FALSE;
	2917	}
	2918	}
	2919	else if
	2920	((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
	2921	&& (ckWARN_d(WARN_NONCHAR)))
	2922	{
	2923	UV uv = utf8_to_uvchr(s, &char_len);
	2924	Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
	2925	"Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
	2926	ok = FALSE;
	2927	}
	2928	}
	2929	s += UTF8SKIP(s);
	2930	}
	2931
	2932	return ok;
	2933	}
	2934
	2935	/*
	2936	=for apidoc pv_uni_display
	2937
	2938	Build to the scalar dsv a displayable version of the string spv,
	2939	length len, the displayable version being at most pvlim bytes long
	2940	(if longer, the rest is truncated and "..." will be appended).
	2941
	2942	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	2943	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	2944	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	2945	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	2946	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	2947	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	2948
	2949	The pointer to the PV of the dsv is returned.
	2950
	2951	=cut */
	2952	char *
	2953	Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	2954	{
	2955	int truncated = 0;
	2956	const char s, e;
	2957
	2958	PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
	2959
	2960	sv_setpvs(dsv, "");
	2961	SvUTF8_off(dsv);
	2962	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	2963	UV u;
	2964	/* This serves double duty as a flag and a character to print after
	2965	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	2966	*/
	2967	char ok = 0;
	2968
	2969	if (pvlim && SvCUR(dsv) >= pvlim) {
	2970	truncated++;
	2971	break;
	2972	}
	2973	u = utf8_to_uvchr((U8*)s, 0);
	2974	if (u < 256) {
	2975	const unsigned char c = (unsigned char)u & 0xFF;
	2976	if (flags & UNI_DISPLAY_BACKSLASH) {
	2977	switch (c) {
	2978	case '\n':
	2979	ok = 'n'; break;
	2980	case '\r':
	2981	ok = 'r'; break;
	2982	case '\t':
	2983	ok = 't'; break;
	2984	case '\f':
	2985	ok = 'f'; break;
	2986	case '\a':
	2987	ok = 'a'; break;
	2988	case '\\':
	2989	ok = '\\'; break;
	2990	default: break;
	2991	}
	2992	if (ok) {
	2993	const char string = ok;
	2994	sv_catpvs(dsv, "\\");
	2995	sv_catpvn(dsv, &string, 1);
	2996	}
	2997	}
	2998	/* isPRINT() is the locale-blind version. */
	2999	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	3000	const char string = c;
	3001	sv_catpvn(dsv, &string, 1);
	3002	ok = 1;
	3003	}
	3004	}
	3005	if (!ok)
	3006	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	3007	}
	3008	if (truncated)
	3009	sv_catpvs(dsv, "...");
	3010
	3011	return SvPVX(dsv);
	3012	}
	3013
	3014	/*
	3015	=for apidoc sv_uni_display
	3016
	3017	Build to the scalar dsv a displayable version of the scalar sv,
	3018	the displayable version being at most pvlim bytes long
	3019	(if longer, the rest is truncated and "..." will be appended).
	3020
	3021	The flags argument is as in pv_uni_display().
	3022
	3023	The pointer to the PV of the dsv is returned.
	3024
	3025	=cut
	3026	*/
	3027	char *
	3028	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	3029	{
	3030	PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
	3031
	3032	return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
	3033	SvCUR(ssv), pvlim, flags);
	3034	}
	3035
	3036	/*
	3037	=for apidoc foldEQ_utf8
	3038
	3039	Returns true if the leading portions of the strings s1 and s2 (either or both
	3040	of which may be in UTF-8) are the same case-insensitively; false otherwise.
	3041	How far into the strings to compare is determined by other input parameters.
	3042
	3043	If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
	3044	otherwise it is assumed to be in native 8-bit encoding. Correspondingly for u2
	3045	with respect to s2.
	3046
	3047	If the byte length l1 is non-zero, it says how far into s1 to check for fold
	3048	equality. In other words, s1+l1 will be used as a goal to reach. The
	3049	scan will not be considered to be a match unless the goal is reached, and
	3050	scanning won't continue past that goal. Correspondingly for l2 with respect to
	3051	s2.
	3052
	3053	If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
	3054	considered an end pointer beyond which scanning of s1 will not continue under
	3055	any circumstances. This means that if both l1 and pe1 are specified, and pe1
	3056	is less than s1+l1, the match will never be successful because it can never
	3057	get as far as its goal (and in fact is asserted against). Correspondingly for
	3058	pe2 with respect to s2.
	3059
	3060	At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be
	3061	non-zero), and if both do, both have to be
	3062	reached for a successful match. Also, if the fold of a character is multiple
	3063	characters, all of them must be matched (see tr21 reference below for
	3064	'folding').
	3065
	3066	Upon a successful match, if pe1 is non-NULL,
	3067	it will be set to point to the beginning of the I<next> character of s1 beyond
	3068	what was matched. Correspondingly for pe2 and s2.
	3069
	3070	For case-insensitiveness, the "casefolding" of Unicode is used
	3071	instead of upper/lowercasing both the characters, see
	3072	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	3073
	3074	=cut */
	3075
	3076	/* A flags parameter has been added which may change, and hence isn't
	3077	* externally documented. Currently it is:
	3078	* 0 for as-documented above
	3079	* FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
	3080	ASCII one, to not match
	3081	* FOLDEQ_UTF8_LOCALE meaning that locale rules are to be used for code
	3082	* points below 256; unicode rules for above 255; and
	3083	* folds that cross those boundaries are disallowed,
	3084	* like the NOMIX_ASCII option
	3085	*/
	3086	I32
	3087	Perl_foldEQ_utf8_flags(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2, U32 flags)
	3088	{
	3089	dVAR;
	3090	register const U8 p1 = (const U8)s1; /* Point to current char */
	3091	register const U8 p2 = (const U8)s2;
	3092	register const U8 g1 = NULL; / goal for s1 */
	3093	register const U8 *g2 = NULL;
	3094	register const U8 e1 = NULL; / Don't scan s1 past this */
	3095	register U8 f1 = NULL; / Point to current folded */
	3096	register const U8 *e2 = NULL;
	3097	register U8 *f2 = NULL;
	3098	STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
	3099	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	3100	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	3101	U8 natbuf[2]; /* Holds native 8-bit char converted to utf8;
	3102	these always fit in 2 bytes */
	3103
	3104	PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
	3105
	3106	if (pe1) {
	3107	e1 = (U8*)pe1;
	3108	}
	3109
	3110	if (l1) {
	3111	g1 = (const U8*)s1 + l1;
	3112	}
	3113
	3114	if (pe2) {
	3115	e2 = (U8*)pe2;
	3116	}
	3117
	3118	if (l2) {
	3119	g2 = (const U8*)s2 + l2;
	3120	}
	3121
	3122	/* Must have at least one goal */
	3123	assert(g1 \|\| g2);
	3124
	3125	if (g1) {
	3126
	3127	/* Will never match if goal is out-of-bounds */
	3128	assert(! e1 \|\| e1 >= g1);
	3129
	3130	/* Here, there isn't an end pointer, or it is beyond the goal. We
	3131	* only go as far as the goal */
	3132	e1 = g1;
	3133	}
	3134	else {
	3135	assert(e1); /* Must have an end for looking at s1 */
	3136	}
	3137
	3138	/* Same for goal for s2 */
	3139	if (g2) {
	3140	assert(! e2 \|\| e2 >= g2);
	3141	e2 = g2;
	3142	}
	3143	else {
	3144	assert(e2);
	3145	}
	3146
	3147	/* Look through both strings, a character at a time */
	3148	while (p1 < e1 && p2 < e2) {
	3149
	3150	/* If at the beginning of a new character in s1, get its fold to use
	3151	* and the length of the fold. (exception: locale rules just get the
	3152	* character to a single byte) */
	3153	if (n1 == 0) {
	3154
	3155	/* If in locale matching, we use two sets of rules, depending on if
	3156	* the code point is above or below 255. Here, we test for and
	3157	* handle locale rules */
	3158	if ((flags & FOLDEQ_UTF8_LOCALE)
	3159	&& (! u1 \|\| UTF8_IS_INVARIANT(p1) \|\| UTF8_IS_DOWNGRADEABLE_START(p1)))
	3160	{
	3161	/* There is no mixing of code points above and below 255. */
	3162	if (u2 && (! UTF8_IS_INVARIANT(*p2)
	3163	&& ! UTF8_IS_DOWNGRADEABLE_START(*p2)))
	3164	{
	3165	return 0;
	3166	}
	3167
	3168	/* We handle locale rules by converting, if necessary, the code
	3169	* point to a single byte. */
	3170	if (! u1 \|\| UTF8_IS_INVARIANT(*p1)) {
	3171	foldbuf1 = p1;
	3172	}
	3173	else {
	3174	foldbuf1 = TWO_BYTE_UTF8_TO_UNI(p1, *(p1 + 1));
	3175	}
	3176	n1 = 1;
	3177	}
	3178	else if (isASCII(p1)) { / Note, that here won't be both ASCII
	3179	and using locale rules */
	3180
	3181	/* If trying to mix non- with ASCII, and not supposed to, fail */
	3182	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
	3183	return 0;
	3184	}
	3185	n1 = 1;
	3186	foldbuf1 = toLOWER(p1); /* Folds in the ASCII range are
	3187	just lowercased */
	3188	}
	3189	else if (u1) {
	3190	to_utf8_fold(p1, foldbuf1, &n1);
	3191	}
	3192	else { /* Not utf8, convert to it first and then get fold */
	3193	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
	3194	to_utf8_fold(natbuf, foldbuf1, &n1);
	3195	}
	3196	f1 = foldbuf1;
	3197	}
	3198
	3199	if (n2 == 0) { /* Same for s2 */
	3200	if ((flags & FOLDEQ_UTF8_LOCALE)
	3201	&& (! u2 \|\| UTF8_IS_INVARIANT(p2) \|\| UTF8_IS_DOWNGRADEABLE_START(p2)))
	3202	{
	3203	/* Here, the next char in s2 is < 256. We've already worked on
	3204	* s1, and if it isn't also < 256, can't match */
	3205	if (u1 && (! UTF8_IS_INVARIANT(*p1)
	3206	&& ! UTF8_IS_DOWNGRADEABLE_START(*p1)))
	3207	{
	3208	return 0;
	3209	}
	3210	if (! u2 \|\| UTF8_IS_INVARIANT(*p2)) {
	3211	foldbuf2 = p2;
	3212	}
	3213	else {
	3214	foldbuf2 = TWO_BYTE_UTF8_TO_UNI(p2, *(p2 + 1));
	3215	}
	3216
	3217	/* Use another function to handle locale rules. We've made
	3218	* sure that both characters to compare are single bytes */
	3219	if (! foldEQ_locale((char ) f1, (char ) foldbuf2, 1)) {
	3220	return 0;
	3221	}
	3222	n1 = n2 = 0;
	3223	}
	3224	else if (isASCII(*p2)) {
	3225	if (flags && ! isASCII(*p1)) {
	3226	return 0;
	3227	}
	3228	n2 = 1;
	3229	foldbuf2 = toLOWER(p2);
	3230	}
	3231	else if (u2) {
	3232	to_utf8_fold(p2, foldbuf2, &n2);
	3233	}
	3234	else {
	3235	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
	3236	to_utf8_fold(natbuf, foldbuf2, &n2);
	3237	}
	3238	f2 = foldbuf2;
	3239	}
	3240
	3241	/* Here f1 and f2 point to the beginning of the strings to compare.
	3242	* These strings are the folds of the input characters, stored in utf8.
	3243	*/
	3244
	3245	/* While there is more to look for in both folds, see if they
	3246	* continue to match */
	3247	while (n1 && n2) {
	3248	U8 fold_length = UTF8SKIP(f1);
	3249	if (fold_length != UTF8SKIP(f2)
	3250	\|\| (fold_length == 1 && f1 != f2) /* Short circuit memNE
	3251	function call for single
	3252	character */
	3253	\|\| memNE((char)f1, (char)f2, fold_length))
	3254	{
	3255	return 0; /* mismatch */
	3256	}
	3257
	3258	/* Here, they matched, advance past them */
	3259	n1 -= fold_length;
	3260	f1 += fold_length;
	3261	n2 -= fold_length;
	3262	f2 += fold_length;
	3263	}
	3264
	3265	/* When reach the end of any fold, advance the input past it */
	3266	if (n1 == 0) {
	3267	p1 += u1 ? UTF8SKIP(p1) : 1;
	3268	}
	3269	if (n2 == 0) {
	3270	p2 += u2 ? UTF8SKIP(p2) : 1;
	3271	}
	3272	} /* End of loop through both strings */
	3273
	3274	/* A match is defined by each scan that specified an explicit length
	3275	* reaching its final goal, and the other not having matched a partial
	3276	* character (which can happen when the fold of a character is more than one
	3277	* character). */
	3278	if (! ((g1 == 0 \|\| p1 == g1) && (g2 == 0 \|\| p2 == g2)) \|\| n1 \|\| n2) {
	3279	return 0;
	3280	}
	3281
	3282	/* Successful match. Set output pointers */
	3283	if (pe1) {
	3284	pe1 = (char)p1;
	3285	}
	3286	if (pe2) {
	3287	pe2 = (char)p2;
	3288	}
	3289	return 1;
	3290	}
	3291
	3292	/*
	3293	* Local variables:
	3294	* c-indentation-style: bsd
	3295	* c-basic-offset: 4
	3296	* indent-tabs-mode: t
	3297	* End:
	3298	*
	3299	* ex: set ts=8 sts=4 sw=4 noet:
	3300	*/