perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
	17	*
	18	* 'Well do I understand your speech,' he answered in the same language;
	19	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	20	* as is the custom in the West, if you wish to be answered?'
	21	* --Gandalf, addressing Théoden's door wardens
	22	*
	23	* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	24	*
	25	* ...the travellers perceived that the floor was paved with stones of many
	26	* hues; branching runes and strange devices intertwined beneath their feet.
	27	*
	28	* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	29	*/
	30
	31	#include "EXTERN.h"
	32	#define PERL_IN_UTF8_C
	33	#include "perl.h"
	34
	35	#ifndef EBCDIC
	36	/* Separate prototypes needed because in ASCII systems these are
	37	* usually macros but they still are compiled as code, too. */
	38	PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags);
	39	PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
	40	#endif
	41
	42	static const char unees[] =
	43	"Malformed UTF-8 character (unexpected end of string)";
	44
	45	/*
	46	=head1 Unicode Support
	47
	48	This file contains various utility functions for manipulating UTF8-encoded
	49	strings. For the uninitiated, this is a method of representing arbitrary
	50	Unicode characters as a variable number of bytes, in such a way that
	51	characters in the ASCII range are unmodified, and a zero byte never appears
	52	within non-zero characters.
	53
	54	=cut
	55	*/
	56
	57	/*
	58	=for apidoc is_ascii_string
	59
	60	Returns true if the first C<len> bytes of the given string are the same whether
	61	or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
	62	is, if they are invariant. On ASCII-ish machines, only ASCII characters
	63	fit this definition, hence the function's name.
	64
	65	See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
	66
	67	=cut
	68	*/
	69
	70	bool
	71	Perl_is_ascii_string(const U8 *s, STRLEN len)
	72	{
	73	const U8* const send = s + (len ? len : strlen((const char *)s));
	74	const U8* x = s;
	75
	76	PERL_ARGS_ASSERT_IS_ASCII_STRING;
	77
	78	for (; x < send; ++x) {
	79	if (!UTF8_IS_INVARIANT(*x))
	80	break;
	81	}
	82
	83	return x == send;
	84	}
	85
	86	/*
	87	=for apidoc uvuni_to_utf8_flags
	88
	89	Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
	90	of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
	91	bytes available. The return value is the pointer to the byte after the
	92	end of the new character. In other words,
	93
	94	d = uvuni_to_utf8_flags(d, uv, flags);
	95
	96	or, in most cases,
	97
	98	d = uvuni_to_utf8(d, uv);
	99
	100	(which is equivalent to)
	101
	102	d = uvuni_to_utf8_flags(d, uv, 0);
	103
	104	is the recommended Unicode-aware way of saying
	105
	106	*(d++) = uv;
	107
	108	=cut
	109	*/
	110
	111	U8 *
	112	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	113	{
	114	PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
	115
	116	if (ckWARN(WARN_UTF8)) {
	117	if (UNICODE_IS_SURROGATE(uv) &&
	118	!(flags & UNICODE_ALLOW_SURROGATE))
	119	Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
	120	else if (
	121	((uv >= 0xFDD0 && uv <= 0xFDEF &&
	122	!(flags & UNICODE_ALLOW_FDD0))
	123	\|\|
	124	((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
	125	!(flags & UNICODE_ALLOW_FFFF))) &&
	126	/* UNICODE_ALLOW_SUPER includes
	127	* FFFEs and FFFFs beyond 0x10FFFF. */
	128	((uv <= PERL_UNICODE_MAX) \|\|
	129	!(flags & UNICODE_ALLOW_SUPER))
	130	)
	131	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	132	"Unicode non-character 0x%04"UVxf" is illegal for interchange", uv);
	133	}
	134	if (UNI_IS_INVARIANT(uv)) {
	135	*d++ = (U8)UTF_TO_NATIVE(uv);
	136	return d;
	137	}
	138	#if defined(EBCDIC)
	139	else {
	140	STRLEN len = UNISKIP(uv);
	141	U8 *p = d+len-1;
	142	while (p > d) {
	143	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	144	uv >>= UTF_ACCUMULATION_SHIFT;
	145	}
	146	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	147	return d+len;
	148	}
	149	#else /* Non loop style */
	150	if (uv < 0x800) {
	151	*d++ = (U8)(( uv >> 6) \| 0xc0);
	152	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	153	return d;
	154	}
	155	if (uv < 0x10000) {
	156	*d++ = (U8)(( uv >> 12) \| 0xe0);
	157	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	158	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	159	return d;
	160	}
	161	if (uv < 0x200000) {
	162	*d++ = (U8)(( uv >> 18) \| 0xf0);
	163	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	164	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	165	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	166	return d;
	167	}
	168	if (uv < 0x4000000) {
	169	*d++ = (U8)(( uv >> 24) \| 0xf8);
	170	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	171	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	172	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	173	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	174	return d;
	175	}
	176	if (uv < 0x80000000) {
	177	*d++ = (U8)(( uv >> 30) \| 0xfc);
	178	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	179	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	180	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	181	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	182	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	183	return d;
	184	}
	185	#ifdef HAS_QUAD
	186	if (uv < UTF8_QUAD_MAX)
	187	#endif
	188	{
	189	d++ = 0xfe; / Can't match U+FEFF! */
	190	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	191	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	192	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	193	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	194	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	195	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	196	return d;
	197	}
	198	#ifdef HAS_QUAD
	199	{
	200	d++ = 0xff; / Can't match U+FFFE! */
	201	d++ = 0x80; / 6 Reserved bits */
	202	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	203	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
	204	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
	205	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
	206	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
	207	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	208	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	209	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	210	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	211	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	212	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	213	return d;
	214	}
	215	#endif
	216	#endif /* Loop style */
	217	}
	218
	219	/*
	220
	221	Tests if some arbitrary number of bytes begins in a valid UTF-8
	222	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	223	UTF-8 character. The actual number of bytes in the UTF-8 character
	224	will be returned if it is valid, otherwise 0.
	225
	226	This is the "slow" version as opposed to the "fast" version which is
	227	the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
	228	difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
	229	or less you should use the IS_UTF8_CHAR(), for lengths of five or more
	230	you should use the _slow(). In practice this means that the _slow()
	231	will be used very rarely, since the maximum Unicode code point (as of
	232	Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
	233	the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
	234	five bytes or more.
	235
	236	=cut */
	237	STATIC STRLEN
	238	S_is_utf8_char_slow(const U8 *s, const STRLEN len)
	239	{
	240	U8 u = *s;
	241	STRLEN slen;
	242	UV uv, ouv;
	243
	244	PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
	245
	246	if (UTF8_IS_INVARIANT(u))
	247	return 1;
	248
	249	if (!UTF8_IS_START(u))
	250	return 0;
	251
	252	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	253	return 0;
	254
	255	slen = len - 1;
	256	s++;
	257	#ifdef EBCDIC
	258	u = NATIVE_TO_UTF(u);
	259	#endif
	260	u &= UTF_START_MASK(len);
	261	uv = u;
	262	ouv = uv;
	263	while (slen--) {
	264	if (!UTF8_IS_CONTINUATION(*s))
	265	return 0;
	266	uv = UTF8_ACCUMULATE(uv, *s);
	267	if (uv < ouv)
	268	return 0;
	269	ouv = uv;
	270	s++;
	271	}
	272
	273	if ((STRLEN)UNISKIP(uv) < len)
	274	return 0;
	275
	276	return len;
	277	}
	278
	279	/*
	280	=for apidoc is_utf8_char
	281
	282	Tests if some arbitrary number of bytes begins in a valid UTF-8
	283	character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
	284	character is a valid UTF-8 character. The actual number of bytes in the UTF-8
	285	character will be returned if it is valid, otherwise 0.
	286
	287	=cut */
	288	STRLEN
	289	Perl_is_utf8_char(const U8 *s)
	290	{
	291	const STRLEN len = UTF8SKIP(s);
	292
	293	PERL_ARGS_ASSERT_IS_UTF8_CHAR;
	294	#ifdef IS_UTF8_CHAR
	295	if (IS_UTF8_CHAR_FAST(len))
	296	return IS_UTF8_CHAR(s, len) ? len : 0;
	297	#endif /* #ifdef IS_UTF8_CHAR */
	298	return is_utf8_char_slow(s, len);
	299	}
	300
	301
	302	/*
	303	=for apidoc is_utf8_string
	304
	305	Returns true if first C<len> bytes of the given string form a valid
	306	UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
	307	not mean 'a string that contains code points above 0x7F encoded in UTF-8'
	308	because a valid ASCII string is a valid UTF-8 string.
	309
	310	See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
	311
	312	=cut
	313	*/
	314
	315	bool
	316	Perl_is_utf8_string(const U8 *s, STRLEN len)
	317	{
	318	const U8* const send = s + (len ? len : strlen((const char *)s));
	319	const U8* x = s;
	320
	321	PERL_ARGS_ASSERT_IS_UTF8_STRING;
	322
	323	while (x < send) {
	324	STRLEN c;
	325	/* Inline the easy bits of is_utf8_char() here for speed... */
	326	if (UTF8_IS_INVARIANT(*x))
	327	c = 1;
	328	else if (!UTF8_IS_START(*x))
	329	goto out;
	330	else {
	331	/* ... and call is_utf8_char() only if really needed. */
	332	#ifdef IS_UTF8_CHAR
	333	c = UTF8SKIP(x);
	334	if (IS_UTF8_CHAR_FAST(c)) {
	335	if (!IS_UTF8_CHAR(x, c))
	336	c = 0;
	337	}
	338	else
	339	c = is_utf8_char_slow(x, c);
	340	#else
	341	c = is_utf8_char(x);
	342	#endif /* #ifdef IS_UTF8_CHAR */
	343	if (!c)
	344	goto out;
	345	}
	346	x += c;
	347	}
	348
	349	out:
	350	if (x != send)
	351	return FALSE;
	352
	353	return TRUE;
	354	}
	355
	356	/*
	357	Implemented as a macro in utf8.h
	358
	359	=for apidoc is_utf8_string_loc
	360
	361	Like is_utf8_string() but stores the location of the failure (in the
	362	case of "utf8ness failure") or the location s+len (in the case of
	363	"utf8ness success") in the C<ep>.
	364
	365	See also is_utf8_string_loclen() and is_utf8_string().
	366
	367	=for apidoc is_utf8_string_loclen
	368
	369	Like is_utf8_string() but stores the location of the failure (in the
	370	case of "utf8ness failure") or the location s+len (in the case of
	371	"utf8ness success") in the C<ep>, and the number of UTF-8
	372	encoded characters in the C<el>.
	373
	374	See also is_utf8_string_loc() and is_utf8_string().
	375
	376	=cut
	377	*/
	378
	379	bool
	380	Perl_is_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	381	{
	382	const U8* const send = s + (len ? len : strlen((const char *)s));
	383	const U8* x = s;
	384	STRLEN c;
	385	STRLEN outlen = 0;
	386
	387	PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
	388
	389	while (x < send) {
	390	/* Inline the easy bits of is_utf8_char() here for speed... */
	391	if (UTF8_IS_INVARIANT(*x))
	392	c = 1;
	393	else if (!UTF8_IS_START(*x))
	394	goto out;
	395	else {
	396	/* ... and call is_utf8_char() only if really needed. */
	397	#ifdef IS_UTF8_CHAR
	398	c = UTF8SKIP(x);
	399	if (IS_UTF8_CHAR_FAST(c)) {
	400	if (!IS_UTF8_CHAR(x, c))
	401	c = 0;
	402	} else
	403	c = is_utf8_char_slow(x, c);
	404	#else
	405	c = is_utf8_char(x);
	406	#endif /* #ifdef IS_UTF8_CHAR */
	407	if (!c)
	408	goto out;
	409	}
	410	x += c;
	411	outlen++;
	412	}
	413
	414	out:
	415	if (el)
	416	*el = outlen;
	417
	418	if (ep)
	419	*ep = x;
	420	return (x == send);
	421	}
	422
	423	/*
	424
	425	=for apidoc utf8n_to_uvuni
	426
	427	Bottom level UTF-8 decode routine.
	428	Returns the Unicode code point value of the first character in the string C<s>
	429	which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
	430	C<retlen> will be set to the length, in bytes, of that character.
	431
	432	If C<s> does not point to a well-formed UTF-8 character, the behaviour
	433	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
	434	it is assumed that the caller will raise a warning, and this function
	435	will silently just set C<retlen> to C<-1> and return zero. If the
	436	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
	437	malformations will be given, C<retlen> will be set to the expected
	438	length of the UTF-8 character in bytes, and zero will be returned.
	439
	440	The C<flags> can also contain various flags to allow deviations from
	441	the strict UTF-8 encoding (see F<utf8.h>).
	442
	443	Most code should use utf8_to_uvchr() rather than call this directly.
	444
	445	=cut
	446	*/
	447
	448	UV
	449	Perl_utf8n_to_uvuni(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	450	{
	451	dVAR;
	452	const U8 * const s0 = s;
	453	UV uv = *s, ouv = 0;
	454	STRLEN len = 1;
	455	const bool dowarn = ckWARN_d(WARN_UTF8);
	456	const UV startbyte = *s;
	457	STRLEN expectlen = 0;
	458	U32 warning = 0;
	459	SV* sv;
	460
	461	PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
	462
	463	/* This list is a superset of the UTF8_ALLOW_XXX. BUT it isn't, eg SUPER missing XXX */
	464
	465	#define UTF8_WARN_EMPTY 1
	466	#define UTF8_WARN_CONTINUATION 2
	467	#define UTF8_WARN_NON_CONTINUATION 3
	468	#define UTF8_WARN_FE_FF 4
	469	#define UTF8_WARN_SHORT 5
	470	#define UTF8_WARN_OVERFLOW 6
	471	#define UTF8_WARN_SURROGATE 7
	472	#define UTF8_WARN_LONG 8
	473	#define UTF8_WARN_FFFF 9 /* Also FFFE. */
	474
	475	if (curlen == 0 &&
	476	!(flags & UTF8_ALLOW_EMPTY)) {
	477	warning = UTF8_WARN_EMPTY;
	478	goto malformed;
	479	}
	480
	481	if (UTF8_IS_INVARIANT(uv)) {
	482	if (retlen)
	483	*retlen = 1;
	484	return (UV) (NATIVE_TO_UTF(*s));
	485	}
	486
	487	if (UTF8_IS_CONTINUATION(uv) &&
	488	!(flags & UTF8_ALLOW_CONTINUATION)) {
	489	warning = UTF8_WARN_CONTINUATION;
	490	goto malformed;
	491	}
	492
	493	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	494	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	495	warning = UTF8_WARN_NON_CONTINUATION;
	496	goto malformed;
	497	}
	498
	499	#ifdef EBCDIC
	500	uv = NATIVE_TO_UTF(uv);
	501	#else
	502	if ((uv == 0xfe \|\| uv == 0xff) &&
	503	!(flags & UTF8_ALLOW_FE_FF)) {
	504	warning = UTF8_WARN_FE_FF;
	505	goto malformed;
	506	}
	507	#endif
	508
	509	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	510	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	511	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	512	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	513	#ifdef EBCDIC
	514	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	515	else { len = 7; uv &= 0x01; }
	516	#else
	517	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	518	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	519	else { len = 13; uv = 0; } /* whoa! */
	520	#endif
	521
	522	if (retlen)
	523	*retlen = len;
	524
	525	expectlen = len;
	526
	527	if ((curlen < expectlen) &&
	528	!(flags & UTF8_ALLOW_SHORT)) {
	529	warning = UTF8_WARN_SHORT;
	530	goto malformed;
	531	}
	532
	533	len--;
	534	s++;
	535	ouv = uv;
	536
	537	while (len--) {
	538	if (!UTF8_IS_CONTINUATION(*s) &&
	539	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	540	s--;
	541	warning = UTF8_WARN_NON_CONTINUATION;
	542	goto malformed;
	543	}
	544	else
	545	uv = UTF8_ACCUMULATE(uv, *s);
	546	if (!(uv > ouv)) {
	547	/* These cannot be allowed. */
	548	if (uv == ouv) {
	549	if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
	550	warning = UTF8_WARN_LONG;
	551	goto malformed;
	552	}
	553	}
	554	else { /* uv < ouv */
	555	/* This cannot be allowed. */
	556	warning = UTF8_WARN_OVERFLOW;
	557	goto malformed;
	558	}
	559	}
	560	s++;
	561	ouv = uv;
	562	}
	563
	564	if (UNICODE_IS_SURROGATE(uv) &&
	565	!(flags & UTF8_ALLOW_SURROGATE)) {
	566	warning = UTF8_WARN_SURROGATE;
	567	goto malformed;
	568	} else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
	569	!(flags & UTF8_ALLOW_LONG)) {
	570	warning = UTF8_WARN_LONG;
	571	goto malformed;
	572	} else if (UNICODE_IS_ILLEGAL(uv) &&
	573	!(flags & UTF8_ALLOW_FFFF)) {
	574	warning = UTF8_WARN_FFFF;
	575	goto malformed;
	576	}
	577
	578	return uv;
	579
	580	malformed:
	581
	582	if (flags & UTF8_CHECK_ONLY) {
	583	if (retlen)
	584	*retlen = ((STRLEN) -1);
	585	return 0;
	586	}
	587
	588	if (dowarn) {
	589	if (warning == UTF8_WARN_FFFF) {
	590	sv = newSVpvs_flags("Unicode non-character ", SVs_TEMP);
	591	Perl_sv_catpvf(aTHX_ sv, "0x%04"UVxf" is illegal for interchange", uv);
	592	}
	593	else {
	594	sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
	595
	596	switch (warning) {
	597	case 0: /* Intentionally empty. */ break;
	598	case UTF8_WARN_EMPTY:
	599	sv_catpvs(sv, "(empty string)");
	600	break;
	601	case UTF8_WARN_CONTINUATION:
	602	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	603	break;
	604	case UTF8_WARN_NON_CONTINUATION:
	605	if (s == s0)
	606	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	607	(UV)s[1], startbyte);
	608	else {
	609	const int len = (int)(s-s0);
	610	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	611	(UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
	612	}
	613
	614	break;
	615	case UTF8_WARN_FE_FF:
	616	Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
	617	break;
	618	case UTF8_WARN_SHORT:
	619	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	620	(int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
	621	expectlen = curlen; /* distance for caller to skip */
	622	break;
	623	case UTF8_WARN_OVERFLOW:
	624	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	625	ouv, *s, startbyte);
	626	break;
	627	case UTF8_WARN_SURROGATE:
	628	Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
	629	break;
	630	case UTF8_WARN_LONG:
	631	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	632	(int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	633	break;
	634	default:
	635	sv_catpvs(sv, "(unknown reason)");
	636	break;
	637	}
	638	}
	639
	640	if (warning) {
	641	const char * const s = SvPVX_const(sv);
	642
	643	if (PL_op)
	644	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	645	"%s in %s", s, OP_DESC(PL_op));
	646	else
	647	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
	648	}
	649	}
	650
	651	if (retlen)
	652	*retlen = expectlen ? expectlen : len;
	653
	654	return 0;
	655	}
	656
	657	/*
	658	=for apidoc utf8_to_uvchr
	659
	660	Returns the native character value of the first character in the string C<s>
	661	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	662	length, in bytes, of that character.
	663
	664	If C<s> does not point to a well-formed UTF-8 character, zero is
	665	returned and retlen is set, if possible, to -1.
	666
	667	=cut
	668	*/
	669
	670	UV
	671	Perl_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
	672	{
	673	PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
	674
	675	return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
	676	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	677	}
	678
	679	/*
	680	=for apidoc utf8_to_uvuni
	681
	682	Returns the Unicode code point of the first character in the string C<s>
	683	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	684	length, in bytes, of that character.
	685
	686	This function should only be used when the returned UV is considered
	687	an index into the Unicode semantic tables (e.g. swashes).
	688
	689	If C<s> does not point to a well-formed UTF-8 character, zero is
	690	returned and retlen is set, if possible, to -1.
	691
	692	=cut
	693	*/
	694
	695	UV
	696	Perl_utf8_to_uvuni(pTHX_ const U8 s, STRLEN retlen)
	697	{
	698	PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
	699
	700	/* Call the low level routine asking for checks */
	701	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
	702	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	703	}
	704
	705	/*
	706	=for apidoc utf8_length
	707
	708	Return the length of the UTF-8 char encoded string C<s> in characters.
	709	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	710	up past C<e>, croaks.
	711
	712	=cut
	713	*/
	714
	715	STRLEN
	716	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	717	{
	718	dVAR;
	719	STRLEN len = 0;
	720
	721	PERL_ARGS_ASSERT_UTF8_LENGTH;
	722
	723	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	724	* the bitops (especially ~) can create illegal UTF-8.
	725	* In other words: in Perl UTF-8 is not just for Unicode. */
	726
	727	if (e < s)
	728	goto warn_and_return;
	729	while (s < e) {
	730	if (!UTF8_IS_INVARIANT(*s))
	731	s += UTF8SKIP(s);
	732	else
	733	s++;
	734	len++;
	735	}
	736
	737	if (e != s) {
	738	len--;
	739	warn_and_return:
	740	if (PL_op)
	741	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	742	"%s in %s", unees, OP_DESC(PL_op));
	743	else
	744	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
	745	}
	746
	747	return len;
	748	}
	749
	750	/*
	751	=for apidoc utf8_distance
	752
	753	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	754	and C<b>.
	755
	756	WARNING: use only if you know that the pointers point inside the
	757	same UTF-8 buffer.
	758
	759	=cut
	760	*/
	761
	762	IV
	763	Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
	764	{
	765	PERL_ARGS_ASSERT_UTF8_DISTANCE;
	766
	767	return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
	768	}
	769
	770	/*
	771	=for apidoc utf8_hop
	772
	773	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	774	forward or backward.
	775
	776	WARNING: do not use the following unless you know C<off> is within
	777	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	778	on the first byte of character or just after the last byte of a character.
	779
	780	=cut
	781	*/
	782
	783	U8 *
	784	Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
	785	{
	786	PERL_ARGS_ASSERT_UTF8_HOP;
	787
	788	PERL_UNUSED_CONTEXT;
	789	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	790	* the bitops (especially ~) can create illegal UTF-8.
	791	* In other words: in Perl UTF-8 is not just for Unicode. */
	792
	793	if (off >= 0) {
	794	while (off--)
	795	s += UTF8SKIP(s);
	796	}
	797	else {
	798	while (off++) {
	799	s--;
	800	while (UTF8_IS_CONTINUATION(*s))
	801	s--;
	802	}
	803	}
	804	return (U8 *)s;
	805	}
	806
	807	/*
	808	=for apidoc utf8_to_bytes
	809
	810	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	811	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	812	updates len to contain the new length.
	813	Returns zero on failure, setting C<len> to -1.
	814
	815	If you need a copy of the string, see C<bytes_from_utf8>.
	816
	817	=cut
	818	*/
	819
	820	U8 *
	821	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	822	{
	823	U8 * const save = s;
	824	U8 * const send = s + *len;
	825	U8 *d;
	826
	827	PERL_ARGS_ASSERT_UTF8_TO_BYTES;
	828
	829	/* ensure valid UTF-8 and chars < 256 before updating string */
	830	while (s < send) {
	831	U8 c = *s++;
	832
	833	if (!UTF8_IS_INVARIANT(c) &&
	834	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	835	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	836	*len = ((STRLEN) -1);
	837	return 0;
	838	}
	839	}
	840
	841	d = s = save;
	842	while (s < send) {
	843	STRLEN ulen;
	844	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	845	s += ulen;
	846	}
	847	*d = '\0';
	848	*len = d - save;
	849	return save;
	850	}
	851
	852	/*
	853	=for apidoc bytes_from_utf8
	854
	855	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	856	Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	857	the newly-created string, and updates C<len> to contain the new
	858	length. Returns the original string if no conversion occurs, in which case
	859	C<len> is unchanged. Does nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	860	0 if C<s> is converted or consisted entirely of characters that are invariant
	861	in utf8 (i.e., US-ASCII on non-EBCDIC machines).
	862
	863	=cut
	864	*/
	865
	866	U8 *
	867	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN len, bool *is_utf8)
	868	{
	869	U8 *d;
	870	const U8 *start = s;
	871	const U8 *send;
	872	I32 count = 0;
	873
	874	PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
	875
	876	PERL_UNUSED_CONTEXT;
	877	if (!*is_utf8)
	878	return (U8 *)start;
	879
	880	/* ensure valid UTF-8 and chars < 256 before converting string */
	881	for (send = s + *len; s < send;) {
	882	U8 c = *s++;
	883	if (!UTF8_IS_INVARIANT(c)) {
	884	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	885	(c = *s++) && UTF8_IS_CONTINUATION(c))
	886	count++;
	887	else
	888	return (U8 *)start;
	889	}
	890	}
	891
	892	*is_utf8 = FALSE;
	893
	894	Newx(d, (*len) - count + 1, U8);
	895	s = start; start = d;
	896	while (s < send) {
	897	U8 c = *s++;
	898	if (!UTF8_IS_INVARIANT(c)) {
	899	/* Then it is two-byte encoded */
	900	c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
	901	c = ASCII_TO_NATIVE(c);
	902	}
	903	*d++ = c;
	904	}
	905	*d = '\0';
	906	*len = d - start;
	907	return (U8 *)start;
	908	}
	909
	910	/*
	911	=for apidoc bytes_to_utf8
	912
	913	Converts a string C<s> of length C<len> from the native encoding into UTF-8.
	914	Returns a pointer to the newly-created string, and sets C<len> to
	915	reflect the new length.
	916
	917	A NUL character will be written after the end of the string.
	918
	919	If you want to convert to UTF-8 from encodings other than
	920	the native (Latin1 or EBCDIC),
	921	see sv_recode_to_utf8().
	922
	923	=cut
	924	*/
	925
	926	U8*
	927	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN len)
	928	{
	929	const U8 * const send = s + (*len);
	930	U8 *d;
	931	U8 *dst;
	932
	933	PERL_ARGS_ASSERT_BYTES_TO_UTF8;
	934	PERL_UNUSED_CONTEXT;
	935
	936	Newx(d, (len) 2 + 1, U8);
	937	dst = d;
	938
	939	while (s < send) {
	940	const UV uv = NATIVE_TO_ASCII(*s++);
	941	if (UNI_IS_INVARIANT(uv))
	942	*d++ = (U8)UTF_TO_NATIVE(uv);
	943	else {
	944	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
	945	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
	946	}
	947	}
	948	*d = '\0';
	949	*len = d-dst;
	950	return dst;
	951	}
	952
	953	/*
	954	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	955	*
	956	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	957	* We optimize for native, for obvious reasons. */
	958
	959	U8*
	960	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	961	{
	962	U8* pend;
	963	U8* dstart = d;
	964
	965	PERL_ARGS_ASSERT_UTF16_TO_UTF8;
	966
	967	if (bytelen & 1)
	968	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
	969
	970	pend = p + bytelen;
	971
	972	while (p < pend) {
	973	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	974	p += 2;
	975	if (uv < 0x80) {
	976	#ifdef EBCDIC
	977	*d++ = UNI_TO_NATIVE(uv);
	978	#else
	979	*d++ = (U8)uv;
	980	#endif
	981	continue;
	982	}
	983	if (uv < 0x800) {
	984	*d++ = (U8)(( uv >> 6) \| 0xc0);
	985	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	986	continue;
	987	}
	988	if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */
	989	if (p >= pend) {
	990	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	991	} else {
	992	UV low = (p[0] << 8) + p[1];
	993	p += 2;
	994	if (low < 0xdc00 \|\| low > 0xdfff)
	995	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	996	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	997	}
	998	} else if (uv >= 0xdc00 && uv <= 0xdfff) {
	999	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1000	}
	1001	if (uv < 0x10000) {
	1002	*d++ = (U8)(( uv >> 12) \| 0xe0);
	1003	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1004	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1005	continue;
	1006	}
	1007	else {
	1008	*d++ = (U8)(( uv >> 18) \| 0xf0);
	1009	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	1010	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1011	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1012	continue;
	1013	}
	1014	}
	1015	*newlen = d - dstart;
	1016	return d;
	1017	}
	1018
	1019	/* Note: this one is slightly destructive of the source. */
	1020
	1021	U8*
	1022	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1023	{
	1024	U8* s = (U8*)p;
	1025	U8* const send = s + bytelen;
	1026
	1027	PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
	1028
	1029	if (bytelen & 1)
	1030	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
	1031	(UV)bytelen);
	1032
	1033	while (s < send) {
	1034	const U8 tmp = s[0];
	1035	s[0] = s[1];
	1036	s[1] = tmp;
	1037	s += 2;
	1038	}
	1039	return utf16_to_utf8(p, d, bytelen, newlen);
	1040	}
	1041
	1042	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	1043
	1044	bool
	1045	Perl_is_uni_alnum(pTHX_ UV c)
	1046	{
	1047	U8 tmpbuf[UTF8_MAXBYTES+1];
	1048	uvchr_to_utf8(tmpbuf, c);
	1049	return is_utf8_alnum(tmpbuf);
	1050	}
	1051
	1052	bool
	1053	Perl_is_uni_idfirst(pTHX_ UV c)
	1054	{
	1055	U8 tmpbuf[UTF8_MAXBYTES+1];
	1056	uvchr_to_utf8(tmpbuf, c);
	1057	return is_utf8_idfirst(tmpbuf);
	1058	}
	1059
	1060	bool
	1061	Perl_is_uni_alpha(pTHX_ UV c)
	1062	{
	1063	U8 tmpbuf[UTF8_MAXBYTES+1];
	1064	uvchr_to_utf8(tmpbuf, c);
	1065	return is_utf8_alpha(tmpbuf);
	1066	}
	1067
	1068	bool
	1069	Perl_is_uni_ascii(pTHX_ UV c)
	1070	{
	1071	U8 tmpbuf[UTF8_MAXBYTES+1];
	1072	uvchr_to_utf8(tmpbuf, c);
	1073	return is_utf8_ascii(tmpbuf);
	1074	}
	1075
	1076	bool
	1077	Perl_is_uni_space(pTHX_ UV c)
	1078	{
	1079	U8 tmpbuf[UTF8_MAXBYTES+1];
	1080	uvchr_to_utf8(tmpbuf, c);
	1081	return is_utf8_space(tmpbuf);
	1082	}
	1083
	1084	bool
	1085	Perl_is_uni_digit(pTHX_ UV c)
	1086	{
	1087	U8 tmpbuf[UTF8_MAXBYTES+1];
	1088	uvchr_to_utf8(tmpbuf, c);
	1089	return is_utf8_digit(tmpbuf);
	1090	}
	1091
	1092	bool
	1093	Perl_is_uni_upper(pTHX_ UV c)
	1094	{
	1095	U8 tmpbuf[UTF8_MAXBYTES+1];
	1096	uvchr_to_utf8(tmpbuf, c);
	1097	return is_utf8_upper(tmpbuf);
	1098	}
	1099
	1100	bool
	1101	Perl_is_uni_lower(pTHX_ UV c)
	1102	{
	1103	U8 tmpbuf[UTF8_MAXBYTES+1];
	1104	uvchr_to_utf8(tmpbuf, c);
	1105	return is_utf8_lower(tmpbuf);
	1106	}
	1107
	1108	bool
	1109	Perl_is_uni_cntrl(pTHX_ UV c)
	1110	{
	1111	U8 tmpbuf[UTF8_MAXBYTES+1];
	1112	uvchr_to_utf8(tmpbuf, c);
	1113	return is_utf8_cntrl(tmpbuf);
	1114	}
	1115
	1116	bool
	1117	Perl_is_uni_graph(pTHX_ UV c)
	1118	{
	1119	U8 tmpbuf[UTF8_MAXBYTES+1];
	1120	uvchr_to_utf8(tmpbuf, c);
	1121	return is_utf8_graph(tmpbuf);
	1122	}
	1123
	1124	bool
	1125	Perl_is_uni_print(pTHX_ UV c)
	1126	{
	1127	U8 tmpbuf[UTF8_MAXBYTES+1];
	1128	uvchr_to_utf8(tmpbuf, c);
	1129	return is_utf8_print(tmpbuf);
	1130	}
	1131
	1132	bool
	1133	Perl_is_uni_punct(pTHX_ UV c)
	1134	{
	1135	U8 tmpbuf[UTF8_MAXBYTES+1];
	1136	uvchr_to_utf8(tmpbuf, c);
	1137	return is_utf8_punct(tmpbuf);
	1138	}
	1139
	1140	bool
	1141	Perl_is_uni_xdigit(pTHX_ UV c)
	1142	{
	1143	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1144	uvchr_to_utf8(tmpbuf, c);
	1145	return is_utf8_xdigit(tmpbuf);
	1146	}
	1147
	1148	UV
	1149	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	1150	{
	1151	PERL_ARGS_ASSERT_TO_UNI_UPPER;
	1152
	1153	uvchr_to_utf8(p, c);
	1154	return to_utf8_upper(p, p, lenp);
	1155	}
	1156
	1157	UV
	1158	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	1159	{
	1160	PERL_ARGS_ASSERT_TO_UNI_TITLE;
	1161
	1162	uvchr_to_utf8(p, c);
	1163	return to_utf8_title(p, p, lenp);
	1164	}
	1165
	1166	UV
	1167	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1168	{
	1169	PERL_ARGS_ASSERT_TO_UNI_LOWER;
	1170
	1171	uvchr_to_utf8(p, c);
	1172	return to_utf8_lower(p, p, lenp);
	1173	}
	1174
	1175	UV
	1176	Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
	1177	{
	1178	PERL_ARGS_ASSERT_TO_UNI_FOLD;
	1179
	1180	uvchr_to_utf8(p, c);
	1181	return to_utf8_fold(p, p, lenp);
	1182	}
	1183
	1184	/* for now these all assume no locale info available for Unicode > 255 */
	1185
	1186	bool
	1187	Perl_is_uni_alnum_lc(pTHX_ UV c)
	1188	{
	1189	return is_uni_alnum(c); /* XXX no locale support yet */
	1190	}
	1191
	1192	bool
	1193	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1194	{
	1195	return is_uni_idfirst(c); /* XXX no locale support yet */
	1196	}
	1197
	1198	bool
	1199	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1200	{
	1201	return is_uni_alpha(c); /* XXX no locale support yet */
	1202	}
	1203
	1204	bool
	1205	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1206	{
	1207	return is_uni_ascii(c); /* XXX no locale support yet */
	1208	}
	1209
	1210	bool
	1211	Perl_is_uni_space_lc(pTHX_ UV c)
	1212	{
	1213	return is_uni_space(c); /* XXX no locale support yet */
	1214	}
	1215
	1216	bool
	1217	Perl_is_uni_digit_lc(pTHX_ UV c)
	1218	{
	1219	return is_uni_digit(c); /* XXX no locale support yet */
	1220	}
	1221
	1222	bool
	1223	Perl_is_uni_upper_lc(pTHX_ UV c)
	1224	{
	1225	return is_uni_upper(c); /* XXX no locale support yet */
	1226	}
	1227
	1228	bool
	1229	Perl_is_uni_lower_lc(pTHX_ UV c)
	1230	{
	1231	return is_uni_lower(c); /* XXX no locale support yet */
	1232	}
	1233
	1234	bool
	1235	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1236	{
	1237	return is_uni_cntrl(c); /* XXX no locale support yet */
	1238	}
	1239
	1240	bool
	1241	Perl_is_uni_graph_lc(pTHX_ UV c)
	1242	{
	1243	return is_uni_graph(c); /* XXX no locale support yet */
	1244	}
	1245
	1246	bool
	1247	Perl_is_uni_print_lc(pTHX_ UV c)
	1248	{
	1249	return is_uni_print(c); /* XXX no locale support yet */
	1250	}
	1251
	1252	bool
	1253	Perl_is_uni_punct_lc(pTHX_ UV c)
	1254	{
	1255	return is_uni_punct(c); /* XXX no locale support yet */
	1256	}
	1257
	1258	bool
	1259	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1260	{
	1261	return is_uni_xdigit(c); /* XXX no locale support yet */
	1262	}
	1263
	1264	U32
	1265	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1266	{
	1267	/* XXX returns only the first character -- do not use XXX */
	1268	/* XXX no locale support yet */
	1269	STRLEN len;
	1270	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1271	return (U32)to_uni_upper(c, tmpbuf, &len);
	1272	}
	1273
	1274	U32
	1275	Perl_to_uni_title_lc(pTHX_ U32 c)
	1276	{
	1277	/* XXX returns only the first character XXX -- do not use XXX */
	1278	/* XXX no locale support yet */
	1279	STRLEN len;
	1280	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1281	return (U32)to_uni_title(c, tmpbuf, &len);
	1282	}
	1283
	1284	U32
	1285	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1286	{
	1287	/* XXX returns only the first character -- do not use XXX */
	1288	/* XXX no locale support yet */
	1289	STRLEN len;
	1290	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1291	return (U32)to_uni_lower(c, tmpbuf, &len);
	1292	}
	1293
	1294	static bool
	1295	S_is_utf8_common(pTHX_ const U8 const p, SV *swash,
	1296	const char *const swashname)
	1297	{
	1298	dVAR;
	1299
	1300	PERL_ARGS_ASSERT_IS_UTF8_COMMON;
	1301
	1302	if (!is_utf8_char(p))
	1303	return FALSE;
	1304	if (!*swash)
	1305	*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
	1306	return swash_fetch(*swash, p, TRUE) != 0;
	1307	}
	1308
	1309	bool
	1310	Perl_is_utf8_alnum(pTHX_ const U8 *p)
	1311	{
	1312	dVAR;
	1313
	1314	PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
	1315
	1316	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1317	* descendant of isalnum(3), in other words, it doesn't
	1318	* contain the '_'. --jhi */
	1319	return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
	1320	}
	1321
	1322	bool
	1323	Perl_is_utf8_idfirst(pTHX_ const U8 p) / The naming is historical. */
	1324	{
	1325	dVAR;
	1326
	1327	PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
	1328
	1329	if (*p == '_')
	1330	return TRUE;
	1331	/* is_utf8_idstart would be more logical. */
	1332	return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
	1333	}
	1334
	1335	bool
	1336	Perl_is_utf8_idcont(pTHX_ const U8 *p)
	1337	{
	1338	dVAR;
	1339
	1340	PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
	1341
	1342	if (*p == '_')
	1343	return TRUE;
	1344	return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
	1345	}
	1346
	1347	bool
	1348	Perl_is_utf8_alpha(pTHX_ const U8 *p)
	1349	{
	1350	dVAR;
	1351
	1352	PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
	1353
	1354	return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
	1355	}
	1356
	1357	bool
	1358	Perl_is_utf8_ascii(pTHX_ const U8 *p)
	1359	{
	1360	dVAR;
	1361
	1362	PERL_ARGS_ASSERT_IS_UTF8_ASCII;
	1363
	1364	return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
	1365	}
	1366
	1367	bool
	1368	Perl_is_utf8_space(pTHX_ const U8 *p)
	1369	{
	1370	dVAR;
	1371
	1372	PERL_ARGS_ASSERT_IS_UTF8_SPACE;
	1373
	1374	return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
	1375	}
	1376
	1377	bool
	1378	Perl_is_utf8_perl_space(pTHX_ const U8 *p)
	1379	{
	1380	dVAR;
	1381
	1382	PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
	1383
	1384	return is_utf8_common(p, &PL_utf8_perl_space, "IsPerlSpace");
	1385	}
	1386
	1387	bool
	1388	Perl_is_utf8_perl_word(pTHX_ const U8 *p)
	1389	{
	1390	dVAR;
	1391
	1392	PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
	1393
	1394	return is_utf8_common(p, &PL_utf8_perl_word, "IsPerlWord");
	1395	}
	1396
	1397	bool
	1398	Perl_is_utf8_digit(pTHX_ const U8 *p)
	1399	{
	1400	dVAR;
	1401
	1402	PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
	1403
	1404	return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
	1405	}
	1406
	1407	bool
	1408	Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
	1409	{
	1410	dVAR;
	1411
	1412	PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
	1413
	1414	return is_utf8_common(p, &PL_utf8_posix_digit, "IsPosixDigit");
	1415	}
	1416
	1417	bool
	1418	Perl_is_utf8_upper(pTHX_ const U8 *p)
	1419	{
	1420	dVAR;
	1421
	1422	PERL_ARGS_ASSERT_IS_UTF8_UPPER;
	1423
	1424	return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
	1425	}
	1426
	1427	bool
	1428	Perl_is_utf8_lower(pTHX_ const U8 *p)
	1429	{
	1430	dVAR;
	1431
	1432	PERL_ARGS_ASSERT_IS_UTF8_LOWER;
	1433
	1434	return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
	1435	}
	1436
	1437	bool
	1438	Perl_is_utf8_cntrl(pTHX_ const U8 *p)
	1439	{
	1440	dVAR;
	1441
	1442	PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
	1443
	1444	return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
	1445	}
	1446
	1447	bool
	1448	Perl_is_utf8_graph(pTHX_ const U8 *p)
	1449	{
	1450	dVAR;
	1451
	1452	PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
	1453
	1454	return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
	1455	}
	1456
	1457	bool
	1458	Perl_is_utf8_print(pTHX_ const U8 *p)
	1459	{
	1460	dVAR;
	1461
	1462	PERL_ARGS_ASSERT_IS_UTF8_PRINT;
	1463
	1464	return is_utf8_common(p, &PL_utf8_print, "IsPrint");
	1465	}
	1466
	1467	bool
	1468	Perl_is_utf8_punct(pTHX_ const U8 *p)
	1469	{
	1470	dVAR;
	1471
	1472	PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
	1473
	1474	return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
	1475	}
	1476
	1477	bool
	1478	Perl_is_utf8_xdigit(pTHX_ const U8 *p)
	1479	{
	1480	dVAR;
	1481
	1482	PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
	1483
	1484	return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
	1485	}
	1486
	1487	bool
	1488	Perl_is_utf8_mark(pTHX_ const U8 *p)
	1489	{
	1490	dVAR;
	1491
	1492	PERL_ARGS_ASSERT_IS_UTF8_MARK;
	1493
	1494	return is_utf8_common(p, &PL_utf8_mark, "IsM");
	1495	}
	1496
	1497	bool
	1498	Perl_is_utf8_X_begin(pTHX_ const U8 *p)
	1499	{
	1500	dVAR;
	1501
	1502	PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
	1503
	1504	return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
	1505	}
	1506
	1507	bool
	1508	Perl_is_utf8_X_extend(pTHX_ const U8 *p)
	1509	{
	1510	dVAR;
	1511
	1512	PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
	1513
	1514	return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
	1515	}
	1516
	1517	bool
	1518	Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
	1519	{
	1520	dVAR;
	1521
	1522	PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
	1523
	1524	return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
	1525	}
	1526
	1527	bool
	1528	Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
	1529	{
	1530	dVAR;
	1531
	1532	PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
	1533
	1534	return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
	1535	}
	1536
	1537	bool
	1538	Perl_is_utf8_X_L(pTHX_ const U8 *p)
	1539	{
	1540	dVAR;
	1541
	1542	PERL_ARGS_ASSERT_IS_UTF8_X_L;
	1543
	1544	return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
	1545	}
	1546
	1547	bool
	1548	Perl_is_utf8_X_LV(pTHX_ const U8 *p)
	1549	{
	1550	dVAR;
	1551
	1552	PERL_ARGS_ASSERT_IS_UTF8_X_LV;
	1553
	1554	return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
	1555	}
	1556
	1557	bool
	1558	Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
	1559	{
	1560	dVAR;
	1561
	1562	PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
	1563
	1564	return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
	1565	}
	1566
	1567	bool
	1568	Perl_is_utf8_X_T(pTHX_ const U8 *p)
	1569	{
	1570	dVAR;
	1571
	1572	PERL_ARGS_ASSERT_IS_UTF8_X_T;
	1573
	1574	return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
	1575	}
	1576
	1577	bool
	1578	Perl_is_utf8_X_V(pTHX_ const U8 *p)
	1579	{
	1580	dVAR;
	1581
	1582	PERL_ARGS_ASSERT_IS_UTF8_X_V;
	1583
	1584	return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
	1585	}
	1586
	1587	bool
	1588	Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
	1589	{
	1590	dVAR;
	1591
	1592	PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
	1593
	1594	return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
	1595	}
	1596
	1597	/*
	1598	=for apidoc to_utf8_case
	1599
	1600	The "p" contains the pointer to the UTF-8 string encoding
	1601	the character that is being converted.
	1602
	1603	The "ustrp" is a pointer to the character buffer to put the
	1604	conversion result to. The "lenp" is a pointer to the length
	1605	of the result.
	1606
	1607	The "swashp" is a pointer to the swash to use.
	1608
	1609	Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
	1610	and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
	1611	but not always, a multicharacter mapping), is tried first.
	1612
	1613	The "special" is a string like "utf8::ToSpecLower", which means the
	1614	hash %utf8::ToSpecLower. The access to the hash is through
	1615	Perl_to_utf8_case().
	1616
	1617	The "normal" is a string like "ToLower" which means the swash
	1618	%utf8::ToLower.
	1619
	1620	=cut */
	1621
	1622	UV
	1623	Perl_to_utf8_case(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp,
	1624	SV *swashp, const char normal, const char *special)
	1625	{
	1626	dVAR;
	1627	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1628	STRLEN len = 0;
	1629	const UV uv0 = utf8_to_uvchr(p, NULL);
	1630	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
	1631	* are necessary in EBCDIC, they are redundant no-ops
	1632	* in ASCII-ish platforms, and hopefully optimized away. */
	1633	const UV uv1 = NATIVE_TO_UNI(uv0);
	1634
	1635	PERL_ARGS_ASSERT_TO_UTF8_CASE;
	1636
	1637	uvuni_to_utf8(tmpbuf, uv1);
	1638
	1639	if (!swashp) / load on-demand */
	1640	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	1641	/* This is the beginnings of a skeleton of code to read the info section
	1642	* that is in all the swashes in case we ever want to do that, so one can
	1643	* read things whose maps aren't code points, and whose default if missing
	1644	* is not to the code point itself. This was just to see if it actually
	1645	* worked. Details on what the possibilities are are in perluniprops.pod
	1646	HV * const hv = get_hv("utf8::SwashInfo", 0);
	1647	if (hv) {
	1648	SV **svp;
	1649	svp = hv_fetch(hv, (const char*)normal, strlen(normal), FALSE);
	1650	const char *s;
	1651
	1652	HV * const this_hash = SvRV(*svp);
	1653	svp = hv_fetch(this_hash, "type", strlen("type"), FALSE);
	1654	s = SvPV_const(*svp, len);
	1655	}
	1656	}*/
	1657
	1658	if (special) {
	1659	/* It might be "special" (sometimes, but not always,
	1660	* a multicharacter mapping) */
	1661	HV * const hv = get_hv(special, 0);
	1662	SV **svp;
	1663
	1664	if (hv &&
	1665	(svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
	1666	(*svp)) {
	1667	const char *s;
	1668
	1669	s = SvPV_const(*svp, len);
	1670	if (len == 1)
	1671	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
	1672	else {
	1673	#ifdef EBCDIC
	1674	/* If we have EBCDIC we need to remap the characters
	1675	* since any characters in the low 256 are Unicode
	1676	* code points, not EBCDIC. */
	1677	U8 t = (U8)s, tend = t + len, d;
	1678
	1679	d = tmpbuf;
	1680	if (SvUTF8(*svp)) {
	1681	STRLEN tlen = 0;
	1682
	1683	while (t < tend) {
	1684	const UV c = utf8_to_uvchr(t, &tlen);
	1685	if (tlen > 0) {
	1686	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	1687	t += tlen;
	1688	}
	1689	else
	1690	break;
	1691	}
	1692	}
	1693	else {
	1694	while (t < tend) {
	1695	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	1696	t++;
	1697	}
	1698	}
	1699	len = d - tmpbuf;
	1700	Copy(tmpbuf, ustrp, len, U8);
	1701	#else
	1702	Copy(s, ustrp, len, U8);
	1703	#endif
	1704	}
	1705	}
	1706	}
	1707
	1708	if (!len && *swashp) {
	1709	const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
	1710
	1711	if (uv2) {
	1712	/* It was "normal" (a single character mapping). */
	1713	const UV uv3 = UNI_TO_NATIVE(uv2);
	1714	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	1715	}
	1716	}
	1717
	1718	if (!len) /* Neither: just copy. In other words, there was no mapping
	1719	defined, which means that the code point maps to itself */
	1720	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
	1721
	1722	if (lenp)
	1723	*lenp = len;
	1724
	1725	return len ? utf8_to_uvchr(ustrp, 0) : 0;
	1726	}
	1727
	1728	/*
	1729	=for apidoc to_utf8_upper
	1730
	1731	Convert the UTF-8 encoded character at p to its uppercase version and
	1732	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1733	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1734	the uppercase version may be longer than the original character.
	1735
	1736	The first character of the uppercased version is returned
	1737	(but note, as explained above, that there may be more.)
	1738
	1739	=cut */
	1740
	1741	UV
	1742	Perl_to_utf8_upper(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1743	{
	1744	dVAR;
	1745
	1746	PERL_ARGS_ASSERT_TO_UTF8_UPPER;
	1747
	1748	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1749	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
	1750	}
	1751
	1752	/*
	1753	=for apidoc to_utf8_title
	1754
	1755	Convert the UTF-8 encoded character at p to its titlecase version and
	1756	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1757	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1758	titlecase version may be longer than the original character.
	1759
	1760	The first character of the titlecased version is returned
	1761	(but note, as explained above, that there may be more.)
	1762
	1763	=cut */
	1764
	1765	UV
	1766	Perl_to_utf8_title(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1767	{
	1768	dVAR;
	1769
	1770	PERL_ARGS_ASSERT_TO_UTF8_TITLE;
	1771
	1772	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1773	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
	1774	}
	1775
	1776	/*
	1777	=for apidoc to_utf8_lower
	1778
	1779	Convert the UTF-8 encoded character at p to its lowercase version and
	1780	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1781	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1782	lowercase version may be longer than the original character.
	1783
	1784	The first character of the lowercased version is returned
	1785	(but note, as explained above, that there may be more.)
	1786
	1787	=cut */
	1788
	1789	UV
	1790	Perl_to_utf8_lower(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1791	{
	1792	dVAR;
	1793
	1794	PERL_ARGS_ASSERT_TO_UTF8_LOWER;
	1795
	1796	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1797	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
	1798	}
	1799
	1800	/*
	1801	=for apidoc to_utf8_fold
	1802
	1803	Convert the UTF-8 encoded character at p to its foldcase version and
	1804	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1805	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1806	foldcase version may be longer than the original character (up to
	1807	three characters).
	1808
	1809	The first character of the foldcased version is returned
	1810	(but note, as explained above, that there may be more.)
	1811
	1812	=cut */
	1813
	1814	UV
	1815	Perl_to_utf8_fold(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1816	{
	1817	dVAR;
	1818
	1819	PERL_ARGS_ASSERT_TO_UTF8_FOLD;
	1820
	1821	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1822	&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
	1823	}
	1824
	1825	/* Note:
	1826	* A "swash" is a swatch hash.
	1827	* A "swatch" is a bit vector generated by utf8.c:S_swash_get().
	1828	* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
	1829	* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
	1830	*/
	1831	SV*
	1832	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
	1833	{
	1834	dVAR;
	1835	SV* retval;
	1836	dSP;
	1837	const size_t pkg_len = strlen(pkg);
	1838	const size_t name_len = strlen(name);
	1839	HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
	1840	SV* errsv_save;
	1841
	1842	PERL_ARGS_ASSERT_SWASH_INIT;
	1843
	1844	PUSHSTACKi(PERLSI_MAGIC);
	1845	ENTER;
	1846	SAVEHINTS();
	1847	save_re_context();
	1848	if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
	1849	ENTER;
	1850	errsv_save = newSVsv(ERRSV);
	1851	/* It is assumed that callers of this routine are not passing in any
	1852	user derived data. */
	1853	/* Need to do this after save_re_context() as it will set PL_tainted to
	1854	1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
	1855	Even line to create errsv_save can turn on PL_tainted. */
	1856	SAVEBOOL(PL_tainted);
	1857	PL_tainted = 0;
	1858	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	1859	NULL);
	1860	if (!SvTRUE(ERRSV))
	1861	sv_setsv(ERRSV, errsv_save);
	1862	SvREFCNT_dec(errsv_save);
	1863	LEAVE;
	1864	}
	1865	SPAGAIN;
	1866	PUSHMARK(SP);
	1867	EXTEND(SP,5);
	1868	mPUSHp(pkg, pkg_len);
	1869	mPUSHp(name, name_len);
	1870	PUSHs(listsv);
	1871	mPUSHi(minbits);
	1872	mPUSHi(none);
	1873	PUTBACK;
	1874	errsv_save = newSVsv(ERRSV);
	1875	if (call_method("SWASHNEW", G_SCALAR))
	1876	retval = newSVsv(*PL_stack_sp--);
	1877	else
	1878	retval = &PL_sv_undef;
	1879	if (!SvTRUE(ERRSV))
	1880	sv_setsv(ERRSV, errsv_save);
	1881	SvREFCNT_dec(errsv_save);
	1882	LEAVE;
	1883	POPSTACK;
	1884	if (IN_PERL_COMPILETIME) {
	1885	CopHINTS_set(PL_curcop, PL_hints);
	1886	}
	1887	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	1888	if (SvPOK(retval))
	1889	Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
	1890	SVfARG(retval));
	1891	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	1892	}
	1893	return retval;
	1894	}
	1895
	1896
	1897	/* This API is wrong for special case conversions since we may need to
	1898	* return several Unicode characters for a single Unicode character
	1899	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	1900	* the lower-level routine, and it is similarly broken for returning
	1901	* multiple values. --jhi */
	1902	/* Now SWASHGET is recast into S_swash_get in this file. */
	1903
	1904	/* Note:
	1905	* Returns the value of property/mapping C<swash> for the first character
	1906	* of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
	1907	* assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
	1908	* assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
	1909	*/
	1910	UV
	1911	Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
	1912	{
	1913	dVAR;
	1914	HV *const hv = MUTABLE_HV(SvRV(swash));
	1915	U32 klen;
	1916	U32 off;
	1917	STRLEN slen;
	1918	STRLEN needents;
	1919	const U8 *tmps = NULL;
	1920	U32 bit;
	1921	SV *swatch;
	1922	U8 tmputf8[2];
	1923	const UV c = NATIVE_TO_ASCII(*ptr);
	1924
	1925	PERL_ARGS_ASSERT_SWASH_FETCH;
	1926
	1927	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	1928	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
	1929	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
	1930	ptr = tmputf8;
	1931	}
	1932	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	1933	* then the "swatch" is a vec() for all the chars which start
	1934	* with 0xAA..0xYY
	1935	* So the key in the hash (klen) is length of encoded char -1
	1936	*/
	1937	klen = UTF8SKIP(ptr) - 1;
	1938	off = ptr[klen];
	1939
	1940	if (klen == 0) {
	1941	/* If char is invariant then swatch is for all the invariant chars
	1942	* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
	1943	*/
	1944	needents = UTF_CONTINUATION_MARK;
	1945	off = NATIVE_TO_UTF(ptr[klen]);
	1946	}
	1947	else {
	1948	/* If char is encoded then swatch is for the prefix */
	1949	needents = (1 << UTF_ACCUMULATION_SHIFT);
	1950	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	1951	}
	1952
	1953	/*
	1954	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	1955	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	1956	* it's nothing to sniff at.) Pity we usually come through at least
	1957	* two function calls to get here...
	1958	*
	1959	* NB: this code assumes that swatches are never modified, once generated!
	1960	*/
	1961
	1962	if (hv == PL_last_swash_hv &&
	1963	klen == PL_last_swash_klen &&
	1964	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	1965	{
	1966	tmps = PL_last_swash_tmps;
	1967	slen = PL_last_swash_slen;
	1968	}
	1969	else {
	1970	/* Try our second-level swatch cache, kept in a hash. */
	1971	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	1972
	1973	/* If not cached, generate it via swash_get */
	1974	if (!svp \|\| !SvPOK(*svp)
	1975	\|\| !(tmps = (const U8)SvPV_const(svp, slen))) {
	1976	/* We use utf8n_to_uvuni() as we want an index into
	1977	Unicode tables, not a native character number.
	1978	*/
	1979	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
	1980	ckWARN(WARN_UTF8) ?
	1981	0 : UTF8_ALLOW_ANY);
	1982	swatch = swash_get(swash,
	1983	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	1984	(klen) ? (code_point & ~(needents - 1)) : 0,
	1985	needents);
	1986
	1987	if (IN_PERL_COMPILETIME)
	1988	CopHINTS_set(PL_curcop, PL_hints);
	1989
	1990	svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
	1991
	1992	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
	1993	\|\| (slen << 3) < needents)
	1994	Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
	1995	}
	1996
	1997	PL_last_swash_hv = hv;
	1998	assert(klen <= sizeof(PL_last_swash_key));
	1999	PL_last_swash_klen = (U8)klen;
	2000	/* FIXME change interpvar.h? */
	2001	PL_last_swash_tmps = (U8 *) tmps;
	2002	PL_last_swash_slen = slen;
	2003	if (klen)
	2004	Copy(ptr, PL_last_swash_key, klen, U8);
	2005	}
	2006
	2007	switch ((int)((slen << 3) / needents)) {
	2008	case 1:
	2009	bit = 1 << (off & 7);
	2010	off >>= 3;
	2011	return (tmps[off] & bit) != 0;
	2012	case 8:
	2013	return tmps[off];
	2014	case 16:
	2015	off <<= 1;
	2016	return (tmps[off] << 8) + tmps[off + 1] ;
	2017	case 32:
	2018	off <<= 2;
	2019	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	2020	}
	2021	Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
	2022	NORETURN_FUNCTION_END;
	2023	}
	2024
	2025	/* Note:
	2026	* Returns a swatch (a bit vector string) for a code point sequence
	2027	* that starts from the value C<start> and comprises the number C<span>.
	2028	* A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
	2029	* Should be used via swash_fetch, which will cache the swatch in C<swash>.
	2030	*/
	2031	STATIC SV*
	2032	S_swash_get(pTHX_ SV* swash, UV start, UV span)
	2033	{
	2034	SV *swatch;
	2035	U8 l, lend, x, xend, *s;
	2036	STRLEN lcur, xcur, scur;
	2037	HV *const hv = MUTABLE_HV(SvRV(swash));
	2038	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	2039	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2040	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2041	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	2042	SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	2043	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	2044	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
	2045	const STRLEN bits = SvUV(*bitssvp);
	2046	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2047	const UV none = SvUV(*nonesvp);
	2048	const UV end = start + span;
	2049
	2050	PERL_ARGS_ASSERT_SWASH_GET;
	2051
	2052	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	2053	Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
	2054	(UV)bits);
	2055	}
	2056
	2057	/* create and initialize $swatch */
	2058	scur = octets ? (span * octets) : (span + 7) / 8;
	2059	swatch = newSV(scur);
	2060	SvPOK_on(swatch);
	2061	s = (U8*)SvPVX(swatch);
	2062	if (octets && none) {
	2063	const U8* const e = s + scur;
	2064	while (s < e) {
	2065	if (bits == 8)
	2066	*s++ = (U8)(none & 0xff);
	2067	else if (bits == 16) {
	2068	*s++ = (U8)((none >> 8) & 0xff);
	2069	*s++ = (U8)( none & 0xff);
	2070	}
	2071	else if (bits == 32) {
	2072	*s++ = (U8)((none >> 24) & 0xff);
	2073	*s++ = (U8)((none >> 16) & 0xff);
	2074	*s++ = (U8)((none >> 8) & 0xff);
	2075	*s++ = (U8)( none & 0xff);
	2076	}
	2077	}
	2078	*s = '\0';
	2079	}
	2080	else {
	2081	(void)memzero((U8*)s, scur + 1);
	2082	}
	2083	SvCUR_set(swatch, scur);
	2084	s = (U8*)SvPVX(swatch);
	2085
	2086	/* read $swash->{LIST} */
	2087	l = (U8)SvPV(listsvp, lcur);
	2088	lend = l + lcur;
	2089	while (l < lend) {
	2090	UV min, max, val;
	2091	STRLEN numlen;
	2092	I32 flags = PERL_SCAN_SILENT_ILLDIGIT \| PERL_SCAN_DISALLOW_PREFIX;
	2093
	2094	U8* const nl = (U8*)memchr(l, '\n', lend - l);
	2095
	2096	numlen = lend - l;
	2097	min = grok_hex((char *)l, &numlen, &flags, NULL);
	2098	if (numlen)
	2099	l += numlen;
	2100	else if (nl) {
	2101	l = nl + 1; /* 1 is length of "\n" */
	2102	continue;
	2103	}
	2104	else {
	2105	l = lend; /* to LIST's end at which \n is not found */
	2106	break;
	2107	}
	2108
	2109	if (isBLANK(*l)) {
	2110	++l;
	2111	flags = PERL_SCAN_SILENT_ILLDIGIT \| PERL_SCAN_DISALLOW_PREFIX;
	2112	numlen = lend - l;
	2113	max = grok_hex((char *)l, &numlen, &flags, NULL);
	2114	if (numlen)
	2115	l += numlen;
	2116	else
	2117	max = min;
	2118
	2119	if (octets) {
	2120	if (isBLANK(*l)) {
	2121	++l;
	2122	flags = PERL_SCAN_SILENT_ILLDIGIT \|
	2123	PERL_SCAN_DISALLOW_PREFIX;
	2124	numlen = lend - l;
	2125	val = grok_hex((char *)l, &numlen, &flags, NULL);
	2126	if (numlen)
	2127	l += numlen;
	2128	else
	2129	val = 0;
	2130	}
	2131	else {
	2132	val = 0;
	2133	if (typeto) {
	2134	Perl_croak(aTHX_ "%s: illegal mapping '%s'",
	2135	typestr, l);
	2136	}
	2137	}
	2138	}
	2139	else
	2140	val = 0; /* bits == 1, then val should be ignored */
	2141	}
	2142	else {
	2143	max = min;
	2144	if (octets) {
	2145	val = 0;
	2146	if (typeto) {
	2147	Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
	2148	}
	2149	}
	2150	else
	2151	val = 0; /* bits == 1, then val should be ignored */
	2152	}
	2153
	2154	if (nl)
	2155	l = nl + 1;
	2156	else
	2157	l = lend;
	2158
	2159	if (max < start)
	2160	continue;
	2161
	2162	if (octets) {
	2163	UV key;
	2164	if (min < start) {
	2165	if (!none \|\| val < none) {
	2166	val += start - min;
	2167	}
	2168	min = start;
	2169	}
	2170	for (key = min; key <= max; key++) {
	2171	STRLEN offset;
	2172	if (key >= end)
	2173	goto go_out_list;
	2174	/* offset must be non-negative (start <= min <= key < end) */
	2175	offset = octets * (key - start);
	2176	if (bits == 8)
	2177	s[offset] = (U8)(val & 0xff);
	2178	else if (bits == 16) {
	2179	s[offset ] = (U8)((val >> 8) & 0xff);
	2180	s[offset + 1] = (U8)( val & 0xff);
	2181	}
	2182	else if (bits == 32) {
	2183	s[offset ] = (U8)((val >> 24) & 0xff);
	2184	s[offset + 1] = (U8)((val >> 16) & 0xff);
	2185	s[offset + 2] = (U8)((val >> 8) & 0xff);
	2186	s[offset + 3] = (U8)( val & 0xff);
	2187	}
	2188
	2189	if (!none \|\| val < none)
	2190	++val;
	2191	}
	2192	}
	2193	else { /* bits == 1, then val should be ignored */
	2194	UV key;
	2195	if (min < start)
	2196	min = start;
	2197	for (key = min; key <= max; key++) {
	2198	const STRLEN offset = (STRLEN)(key - start);
	2199	if (key >= end)
	2200	goto go_out_list;
	2201	s[offset >> 3] \|= 1 << (offset & 7);
	2202	}
	2203	}
	2204	} /* while */
	2205	go_out_list:
	2206
	2207	/* read $swash->{EXTRAS} */
	2208	x = (U8)SvPV(extssvp, xcur);
	2209	xend = x + xcur;
	2210	while (x < xend) {
	2211	STRLEN namelen;
	2212	U8 *namestr;
	2213	SV** othersvp;
	2214	HV* otherhv;
	2215	STRLEN otherbits;
	2216	SV *otherbitssvp, other;
	2217	U8 s, o, *nl;
	2218	STRLEN slen, olen;
	2219
	2220	const U8 opc = *x++;
	2221	if (opc == '\n')
	2222	continue;
	2223
	2224	nl = (U8*)memchr(x, '\n', xend - x);
	2225
	2226	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	2227	if (nl) {
	2228	x = nl + 1; /* 1 is length of "\n" */
	2229	continue;
	2230	}
	2231	else {
	2232	x = xend; /* to EXTRAS' end at which \n is not found */
	2233	break;
	2234	}
	2235	}
	2236
	2237	namestr = x;
	2238	if (nl) {
	2239	namelen = nl - namestr;
	2240	x = nl + 1;
	2241	}
	2242	else {
	2243	namelen = xend - namestr;
	2244	x = xend;
	2245	}
	2246
	2247	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	2248	otherhv = MUTABLE_HV(SvRV(*othersvp));
	2249	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	2250	otherbits = (STRLEN)SvUV(*otherbitssvp);
	2251	if (bits < otherbits)
	2252	Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
	2253
	2254	/* The "other" swatch must be destroyed after. */
	2255	other = swash_get(*othersvp, start, span);
	2256	o = (U8*)SvPV(other, olen);
	2257
	2258	if (!olen)
	2259	Perl_croak(aTHX_ "panic: swash_get got improper swatch");
	2260
	2261	s = (U8*)SvPV(swatch, slen);
	2262	if (bits == 1 && otherbits == 1) {
	2263	if (slen != olen)
	2264	Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
	2265
	2266	switch (opc) {
	2267	case '+':
	2268	while (slen--)
	2269	s++ \|= o++;
	2270	break;
	2271	case '!':
	2272	while (slen--)
	2273	s++ \|= ~o++;
	2274	break;
	2275	case '-':
	2276	while (slen--)
	2277	s++ &= ~o++;
	2278	break;
	2279	case '&':
	2280	while (slen--)
	2281	s++ &= o++;
	2282	break;
	2283	default:
	2284	break;
	2285	}
	2286	}
	2287	else {
	2288	STRLEN otheroctets = otherbits >> 3;
	2289	STRLEN offset = 0;
	2290	U8* const send = s + slen;
	2291
	2292	while (s < send) {
	2293	UV otherval = 0;
	2294
	2295	if (otherbits == 1) {
	2296	otherval = (o[offset >> 3] >> (offset & 7)) & 1;
	2297	++offset;
	2298	}
	2299	else {
	2300	STRLEN vlen = otheroctets;
	2301	otherval = *o++;
	2302	while (--vlen) {
	2303	otherval <<= 8;
	2304	otherval \|= *o++;
	2305	}
	2306	}
	2307
	2308	if (opc == '+' && otherval)
	2309	NOOP; /* replace with otherval */
	2310	else if (opc == '!' && !otherval)
	2311	otherval = 1;
	2312	else if (opc == '-' && otherval)
	2313	otherval = 0;
	2314	else if (opc == '&' && !otherval)
	2315	otherval = 0;
	2316	else {
	2317	s += octets; /* no replacement */
	2318	continue;
	2319	}
	2320
	2321	if (bits == 8)
	2322	*s++ = (U8)( otherval & 0xff);
	2323	else if (bits == 16) {
	2324	*s++ = (U8)((otherval >> 8) & 0xff);
	2325	*s++ = (U8)( otherval & 0xff);
	2326	}
	2327	else if (bits == 32) {
	2328	*s++ = (U8)((otherval >> 24) & 0xff);
	2329	*s++ = (U8)((otherval >> 16) & 0xff);
	2330	*s++ = (U8)((otherval >> 8) & 0xff);
	2331	*s++ = (U8)( otherval & 0xff);
	2332	}
	2333	}
	2334	}
	2335	sv_free(other); /* through with it! */
	2336	} /* while */
	2337	return swatch;
	2338	}
	2339
	2340	/*
	2341	=for apidoc uvchr_to_utf8
	2342
	2343	Adds the UTF-8 representation of the Native codepoint C<uv> to the end
	2344	of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
	2345	bytes available. The return value is the pointer to the byte after the
	2346	end of the new character. In other words,
	2347
	2348	d = uvchr_to_utf8(d, uv);
	2349
	2350	is the recommended wide native character-aware way of saying
	2351
	2352	*(d++) = uv;
	2353
	2354	=cut
	2355	*/
	2356
	2357	/* On ASCII machines this is normally a macro but we want a
	2358	real function in case XS code wants it
	2359	*/
	2360	U8 *
	2361	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	2362	{
	2363	PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
	2364
	2365	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
	2366	}
	2367
	2368	U8 *
	2369	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	2370	{
	2371	PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
	2372
	2373	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
	2374	}
	2375
	2376	/*
	2377	=for apidoc utf8n_to_uvchr
	2378	flags
	2379
	2380	Returns the native character value of the first character in the string
	2381	C<s>
	2382	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	2383	length, in bytes, of that character.
	2384
	2385	Allows length and flags to be passed to low level routine.
	2386
	2387	=cut
	2388	*/
	2389	/* On ASCII machines this is normally a macro but we want
	2390	a real function in case XS code wants it
	2391	*/
	2392	UV
	2393	Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen,
	2394	U32 flags)
	2395	{
	2396	const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	2397
	2398	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
	2399
	2400	return UNI_TO_NATIVE(uv);
	2401	}
	2402
	2403	/*
	2404	=for apidoc pv_uni_display
	2405
	2406	Build to the scalar dsv a displayable version of the string spv,
	2407	length len, the displayable version being at most pvlim bytes long
	2408	(if longer, the rest is truncated and "..." will be appended).
	2409
	2410	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	2411	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	2412	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	2413	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	2414	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	2415	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	2416
	2417	The pointer to the PV of the dsv is returned.
	2418
	2419	=cut */
	2420	char *
	2421	Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	2422	{
	2423	int truncated = 0;
	2424	const char s, e;
	2425
	2426	PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
	2427
	2428	sv_setpvs(dsv, "");
	2429	SvUTF8_off(dsv);
	2430	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	2431	UV u;
	2432	/* This serves double duty as a flag and a character to print after
	2433	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	2434	*/
	2435	char ok = 0;
	2436
	2437	if (pvlim && SvCUR(dsv) >= pvlim) {
	2438	truncated++;
	2439	break;
	2440	}
	2441	u = utf8_to_uvchr((U8*)s, 0);
	2442	if (u < 256) {
	2443	const unsigned char c = (unsigned char)u & 0xFF;
	2444	if (flags & UNI_DISPLAY_BACKSLASH) {
	2445	switch (c) {
	2446	case '\n':
	2447	ok = 'n'; break;
	2448	case '\r':
	2449	ok = 'r'; break;
	2450	case '\t':
	2451	ok = 't'; break;
	2452	case '\f':
	2453	ok = 'f'; break;
	2454	case '\a':
	2455	ok = 'a'; break;
	2456	case '\\':
	2457	ok = '\\'; break;
	2458	default: break;
	2459	}
	2460	if (ok) {
	2461	const char string = ok;
	2462	sv_catpvs(dsv, "\\");
	2463	sv_catpvn(dsv, &string, 1);
	2464	}
	2465	}
	2466	/* isPRINT() is the locale-blind version. */
	2467	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	2468	const char string = c;
	2469	sv_catpvn(dsv, &string, 1);
	2470	ok = 1;
	2471	}
	2472	}
	2473	if (!ok)
	2474	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	2475	}
	2476	if (truncated)
	2477	sv_catpvs(dsv, "...");
	2478
	2479	return SvPVX(dsv);
	2480	}
	2481
	2482	/*
	2483	=for apidoc sv_uni_display
	2484
	2485	Build to the scalar dsv a displayable version of the scalar sv,
	2486	the displayable version being at most pvlim bytes long
	2487	(if longer, the rest is truncated and "..." will be appended).
	2488
	2489	The flags argument is as in pv_uni_display().
	2490
	2491	The pointer to the PV of the dsv is returned.
	2492
	2493	=cut
	2494	*/
	2495	char *
	2496	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	2497	{
	2498	PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
	2499
	2500	return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
	2501	SvCUR(ssv), pvlim, flags);
	2502	}
	2503
	2504	/*
	2505	=for apidoc foldEQ_utf8
	2506
	2507	Returns true if the leading portions of the strings s1 and s2 (either or both
	2508	of which may be in UTF-8) are the same case-insensitively; false otherwise.
	2509	How far into the strings to compare is determined by other input parameters.
	2510
	2511	If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
	2512	otherwise it is assumed to be in native 8-bit encoding. Correspondingly for u2
	2513	with respect to s2.
	2514
	2515	If the byte length l1 is non-zero, it says how far into s1 to check for fold
	2516	equality. In other words, s1+l1 will be used as a goal to reach. The
	2517	scan will not be considered to be a match unless the goal is reached, and
	2518	scanning won't continue past that goal. Correspondingly for l2 with respect to
	2519	s2.
	2520
	2521	If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
	2522	considered an end pointer beyond which scanning of s1 will not continue under
	2523	any circumstances. This means that if both l1 and pe1 are specified, and pe1
	2524	is less than s1+l1, the match will never be successful because it can never
	2525	get as far as its goal (and in fact is asserted against). Correspondingly for
	2526	pe2 with respect to s2.
	2527
	2528	At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be
	2529	non-zero), and if both do, both have to be
	2530	reached for a successful match. Also, if the fold of a character is multiple
	2531	characters, all of them must be matched (see tr21 reference below for
	2532	'folding').
	2533
	2534	Upon a successful match, if pe1 is non-NULL,
	2535	it will be set to point to the beginning of the I<next> character of s1 beyond
	2536	what was matched. Correspondingly for pe2 and s2.
	2537
	2538	For case-insensitiveness, the "casefolding" of Unicode is used
	2539	instead of upper/lowercasing both the characters, see
	2540	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	2541
	2542	=cut */
	2543	I32
	2544	Perl_foldEQ_utf8(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2)
	2545	{
	2546	dVAR;
	2547	register const U8 p1 = (const U8)s1; /* Point to current char */
	2548	register const U8 p2 = (const U8)s2;
	2549	register const U8 g1 = NULL; / goal for s1 */
	2550	register const U8 *g2 = NULL;
	2551	register const U8 e1 = NULL; / Don't scan s1 past this */
	2552	register U8 f1 = NULL; / Point to current folded */
	2553	register const U8 *e2 = NULL;
	2554	register U8 *f2 = NULL;
	2555	STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
	2556	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	2557	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	2558	U8 natbuf[2]; /* Holds native 8-bit char converted to utf8;
	2559	these always fit in 2 bytes */
	2560
	2561	PERL_ARGS_ASSERT_FOLDEQ_UTF8;
	2562
	2563	if (pe1) {
	2564	e1 = (U8*)pe1;
	2565	}
	2566
	2567	if (l1) {
	2568	g1 = (const U8*)s1 + l1;
	2569	}
	2570
	2571	if (pe2) {
	2572	e2 = (U8*)pe2;
	2573	}
	2574
	2575	if (l2) {
	2576	g2 = (const U8*)s2 + l2;
	2577	}
	2578
	2579	/* Must have at least one goal */
	2580	assert(g1 \|\| g2);
	2581
	2582	if (g1) {
	2583
	2584	/* Will never match if goal is out-of-bounds */
	2585	assert(! e1 \|\| e1 >= g1);
	2586
	2587	/* Here, there isn't an end pointer, or it is beyond the goal. We
	2588	* only go as far as the goal */
	2589	e1 = g1;
	2590	}
	2591	else {
	2592	assert(e1); /* Must have an end for looking at s1 */
	2593	}
	2594
	2595	/* Same for goal for s2 */
	2596	if (g2) {
	2597	assert(! e2 \|\| e2 >= g2);
	2598	e2 = g2;
	2599	}
	2600	else {
	2601	assert(e2);
	2602	}
	2603
	2604	/* Look through both strings, a character at a time */
	2605	while (p1 < e1 && p2 < e2) {
	2606
	2607	/* If at the beginning of a new character in s1, get its fold to use
	2608	* and the length of the fold */
	2609	if (n1 == 0) {
	2610	if (u1) {
	2611	to_utf8_fold(p1, foldbuf1, &n1);
	2612	}
	2613	else { /* Not utf8, convert to it first and then get fold */
	2614	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
	2615	to_utf8_fold(natbuf, foldbuf1, &n1);
	2616	}
	2617	f1 = foldbuf1;
	2618	}
	2619
	2620	if (n2 == 0) { /* Same for s2 */
	2621	if (u2) {
	2622	to_utf8_fold(p2, foldbuf2, &n2);
	2623	}
	2624	else {
	2625	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
	2626	to_utf8_fold(natbuf, foldbuf2, &n2);
	2627	}
	2628	f2 = foldbuf2;
	2629	}
	2630
	2631	/* While there is more to look for in both folds, see if they
	2632	* continue to match */
	2633	while (n1 && n2) {
	2634	U8 fold_length = UTF8SKIP(f1);
	2635	if (fold_length != UTF8SKIP(f2)
	2636	\|\| (fold_length == 1 && f1 != f2) /* Short circuit memNE
	2637	function call for single
	2638	character */
	2639	\|\| memNE((char)f1, (char)f2, fold_length))
	2640	{
	2641	return 0; /* mismatch */
	2642	}
	2643
	2644	/* Here, they matched, advance past them */
	2645	n1 -= fold_length;
	2646	f1 += fold_length;
	2647	n2 -= fold_length;
	2648	f2 += fold_length;
	2649	}
	2650
	2651	/* When reach the end of any fold, advance the input past it */
	2652	if (n1 == 0) {
	2653	p1 += u1 ? UTF8SKIP(p1) : 1;
	2654	}
	2655	if (n2 == 0) {
	2656	p2 += u2 ? UTF8SKIP(p2) : 1;
	2657	}
	2658	} /* End of loop through both strings */
	2659
	2660	/* A match is defined by each scan that specified an explicit length
	2661	* reaching its final goal, and the other not having matched a partial
	2662	* character (which can happen when the fold of a character is more than one
	2663	* character). */
	2664	if (! ((g1 == 0 \|\| p1 == g1) && (g2 == 0 \|\| p2 == g2)) \|\| n1 \|\| n2) {
	2665	return 0;
	2666	}
	2667
	2668	/* Successful match. Set output pointers */
	2669	if (pe1) {
	2670	pe1 = (char)p1;
	2671	}
	2672	if (pe2) {
	2673	pe2 = (char)p2;
	2674	}
	2675	return 1;
	2676	}
	2677
	2678	/*
	2679	* Local variables:
	2680	* c-indentation-style: bsd
	2681	* c-basic-offset: 4
	2682	* indent-tabs-mode: t
	2683	* End:
	2684	*
	2685	* ex: set ts=8 sts=4 sw=4 noet:
	2686	*/