perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (c) 1998-2002, Larry Wall
	4	*
	5	* You may distribute under the terms of either the GNU General Public
	6	* License or the Artistic License, as specified in the README file.
	7	*
	8	*/
	9
	10	/*
	11	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	12	* heard of that we don't want to see any closer; and that's the one place
	13	* we're trying to get to! And that's just where we can't get, nohow.'
	14	*
	15	* 'Well do I understand your speech,' he answered in the same language;
	16	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	17	* as is the custom in the West, if you wish to be answered?'
	18	*
	19	* ...the travellers perceived that the floor was paved with stones of many
	20	* hues; branching runes and strange devices intertwined beneath their feet.
	21	*/
	22
	23	#include "EXTERN.h"
	24	#define PERL_IN_UTF8_C
	25	#include "perl.h"
	26
	27	/*
	28	=head1 Unicode Support
	29
	30	=for apidoc A\|U8 \|uvuni_to_utf8_flags\|U8 d\|UV uv\|UV flags
	31
	32	Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
	33	of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
	34	bytes available. The return value is the pointer to the byte after the
	35	end of the new character. In other words,
	36
	37	d = uvuni_to_utf8_flags(d, uv, flags);
	38
	39	or, in most cases,
	40
	41	d = uvuni_to_utf8(d, uv);
	42
	43	(which is equivalent to)
	44
	45	d = uvuni_to_utf8_flags(d, uv, 0);
	46
	47	is the recommended Unicode-aware way of saying
	48
	49	*(d++) = uv;
	50
	51	=cut
	52	*/
	53
	54	U8 *
	55	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	56	{
	57	if (ckWARN(WARN_UTF8)) {
	58	if (UNICODE_IS_SURROGATE(uv) &&
	59	!(flags & UNICODE_ALLOW_SURROGATE))
	60	Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
	61	else if (
	62	((uv >= 0xFDD0 && uv <= 0xFDEF &&
	63	!(flags & UNICODE_ALLOW_FDD0))
	64	\|\|
	65	((uv & 0xFFFF) == 0xFFFE &&
	66	!(flags & UNICODE_ALLOW_FFFE))
	67	\|\|
	68	((uv & 0xFFFF) == 0xFFFF &&
	69	!(flags & UNICODE_ALLOW_FFFF))) &&
	70	/* UNICODE_ALLOW_SUPER includes
	71	* FFFEs and FFFFs beyond 0x10FFFF. */
	72	((uv <= PERL_UNICODE_MAX) \|\|
	73	!(flags & UNICODE_ALLOW_SUPER))
	74	)
	75	Perl_warner(aTHX_ WARN_UTF8,
	76	"Unicode character 0x%04"UVxf" is illegal", uv);
	77	}
	78	if (UNI_IS_INVARIANT(uv)) {
	79	*d++ = UTF_TO_NATIVE(uv);
	80	return d;
	81	}
	82	#if defined(EBCDIC)
	83	else {
	84	STRLEN len = UNISKIP(uv);
	85	U8 *p = d+len-1;
	86	while (p > d) {
	87	*p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	88	uv >>= UTF_ACCUMULATION_SHIFT;
	89	}
	90	*p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	91	return d+len;
	92	}
	93	#else /* Non loop style */
	94	if (uv < 0x800) {
	95	*d++ = (( uv >> 6) \| 0xc0);
	96	*d++ = (( uv & 0x3f) \| 0x80);
	97	return d;
	98	}
	99	if (uv < 0x10000) {
	100	*d++ = (( uv >> 12) \| 0xe0);
	101	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	102	*d++ = (( uv & 0x3f) \| 0x80);
	103	return d;
	104	}
	105	if (uv < 0x200000) {
	106	*d++ = (( uv >> 18) \| 0xf0);
	107	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	108	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	109	*d++ = (( uv & 0x3f) \| 0x80);
	110	return d;
	111	}
	112	if (uv < 0x4000000) {
	113	*d++ = (( uv >> 24) \| 0xf8);
	114	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	115	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	116	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	117	*d++ = (( uv & 0x3f) \| 0x80);
	118	return d;
	119	}
	120	if (uv < 0x80000000) {
	121	*d++ = (( uv >> 30) \| 0xfc);
	122	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
	123	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	124	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	125	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	126	*d++ = (( uv & 0x3f) \| 0x80);
	127	return d;
	128	}
	129	#ifdef HAS_QUAD
	130	if (uv < UTF8_QUAD_MAX)
	131	#endif
	132	{
	133	d++ = 0xfe; / Can't match U+FEFF! */
	134	*d++ = (((uv >> 30) & 0x3f) \| 0x80);
	135	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
	136	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	137	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	138	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	139	*d++ = (( uv & 0x3f) \| 0x80);
	140	return d;
	141	}
	142	#ifdef HAS_QUAD
	143	{
	144	d++ = 0xff; / Can't match U+FFFE! */
	145	d++ = 0x80; / 6 Reserved bits */
	146	d++ = (((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	147	*d++ = (((uv >> 54) & 0x3f) \| 0x80);
	148	*d++ = (((uv >> 48) & 0x3f) \| 0x80);
	149	*d++ = (((uv >> 42) & 0x3f) \| 0x80);
	150	*d++ = (((uv >> 36) & 0x3f) \| 0x80);
	151	*d++ = (((uv >> 30) & 0x3f) \| 0x80);
	152	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
	153	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	154	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	155	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	156	*d++ = (( uv & 0x3f) \| 0x80);
	157	return d;
	158	}
	159	#endif
	160	#endif /* Loop style */
	161	}
	162
	163	U8 *
	164	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
	165	{
	166	return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
	167	}
	168
	169
	170	/*
	171	=for apidoc A\|STRLEN\|is_utf8_char\|U8 *s
	172
	173	Tests if some arbitrary number of bytes begins in a valid UTF-8
	174	character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
	175	The actual number of bytes in the UTF-8 character will be returned if
	176	it is valid, otherwise 0.
	177
	178	=cut
	179	*/
	180	STRLEN
	181	Perl_is_utf8_char(pTHX_ U8 *s)
	182	{
	183	U8 u = *s;
	184	STRLEN slen, len;
	185	UV uv, ouv;
	186
	187	if (UTF8_IS_INVARIANT(u))
	188	return 1;
	189
	190	if (!UTF8_IS_START(u))
	191	return 0;
	192
	193	len = UTF8SKIP(s);
	194
	195	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	196	return 0;
	197
	198	slen = len - 1;
	199	s++;
	200	u &= UTF_START_MASK(len);
	201	uv = u;
	202	ouv = uv;
	203	while (slen--) {
	204	if (!UTF8_IS_CONTINUATION(*s))
	205	return 0;
	206	uv = UTF8_ACCUMULATE(uv, *s);
	207	if (uv < ouv)
	208	return 0;
	209	ouv = uv;
	210	s++;
	211	}
	212
	213	if (UNISKIP(uv) < len)
	214	return 0;
	215
	216	return len;
	217	}
	218
	219	/*
	220	=for apidoc A\|bool\|is_utf8_string\|U8 *s\|STRLEN len
	221
	222	Returns true if first C<len> bytes of the given string form a valid UTF8
	223	string, false otherwise. Note that 'a valid UTF8 string' does not mean
	224	'a string that contains UTF8' because a valid ASCII string is a valid
	225	UTF8 string.
	226
	227	=cut
	228	*/
	229
	230	bool
	231	Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
	232	{
	233	U8* x = s;
	234	U8* send;
	235	STRLEN c;
	236
	237	if (!len)
	238	len = strlen((char *)s);
	239	send = s + len;
	240
	241	while (x < send) {
	242	c = is_utf8_char(x);
	243	if (!c)
	244	return FALSE;
	245	x += c;
	246	}
	247	if (x != send)
	248	return FALSE;
	249
	250	return TRUE;
	251	}
	252
	253	/*
	254	=for apidoc A\|UV\|utf8n_to_uvuni\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
	255
	256	Bottom level UTF-8 decode routine.
	257	Returns the unicode code point value of the first character in the string C<s>
	258	which is assumed to be in UTF8 encoding and no longer than C<curlen>;
	259	C<retlen> will be set to the length, in bytes, of that character.
	260
	261	If C<s> does not point to a well-formed UTF8 character, the behaviour
	262	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
	263	it is assumed that the caller will raise a warning, and this function
	264	will silently just set C<retlen> to C<-1> and return zero. If the
	265	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
	266	malformations will be given, C<retlen> will be set to the expected
	267	length of the UTF-8 character in bytes, and zero will be returned.
	268
	269	The C<flags> can also contain various flags to allow deviations from
	270	the strict UTF-8 encoding (see F<utf8.h>).
	271
	272	Most code should use utf8_to_uvchr() rather than call this directly.
	273
	274	=cut
	275	*/
	276
	277	UV
	278	Perl_utf8n_to_uvuni(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	279	{
	280	U8 *s0 = s;
	281	UV uv = *s, ouv = 0;
	282	STRLEN len = 1;
	283	bool dowarn = ckWARN_d(WARN_UTF8);
	284	UV startbyte = *s;
	285	STRLEN expectlen = 0;
	286	U32 warning = 0;
	287
	288	/* This list is a superset of the UTF8_ALLOW_XXX. */
	289
	290	#define UTF8_WARN_EMPTY 1
	291	#define UTF8_WARN_CONTINUATION 2
	292	#define UTF8_WARN_NON_CONTINUATION 3
	293	#define UTF8_WARN_FE_FF 4
	294	#define UTF8_WARN_SHORT 5
	295	#define UTF8_WARN_OVERFLOW 6
	296	#define UTF8_WARN_SURROGATE 7
	297	#define UTF8_WARN_BOM 8
	298	#define UTF8_WARN_LONG 9
	299	#define UTF8_WARN_FFFF 10
	300
	301	if (curlen == 0 &&
	302	!(flags & UTF8_ALLOW_EMPTY)) {
	303	warning = UTF8_WARN_EMPTY;
	304	goto malformed;
	305	}
	306
	307	if (UTF8_IS_INVARIANT(uv)) {
	308	if (retlen)
	309	*retlen = 1;
	310	return (UV) (NATIVE_TO_UTF(*s));
	311	}
	312
	313	if (UTF8_IS_CONTINUATION(uv) &&
	314	!(flags & UTF8_ALLOW_CONTINUATION)) {
	315	warning = UTF8_WARN_CONTINUATION;
	316	goto malformed;
	317	}
	318
	319	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	320	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	321	warning = UTF8_WARN_NON_CONTINUATION;
	322	goto malformed;
	323	}
	324
	325	#ifdef EBCDIC
	326	uv = NATIVE_TO_UTF(uv);
	327	#else
	328	if ((uv == 0xfe \|\| uv == 0xff) &&
	329	!(flags & UTF8_ALLOW_FE_FF)) {
	330	warning = UTF8_WARN_FE_FF;
	331	goto malformed;
	332	}
	333	#endif
	334
	335	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	336	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	337	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	338	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	339	#ifdef EBCDIC
	340	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	341	else { len = 7; uv &= 0x01; }
	342	#else
	343	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	344	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	345	else { len = 13; uv = 0; } /* whoa! */
	346	#endif
	347
	348	if (retlen)
	349	*retlen = len;
	350
	351	expectlen = len;
	352
	353	if ((curlen < expectlen) &&
	354	!(flags & UTF8_ALLOW_SHORT)) {
	355	warning = UTF8_WARN_SHORT;
	356	goto malformed;
	357	}
	358
	359	len--;
	360	s++;
	361	ouv = uv;
	362
	363	while (len--) {
	364	if (!UTF8_IS_CONTINUATION(*s) &&
	365	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	366	s--;
	367	warning = UTF8_WARN_NON_CONTINUATION;
	368	goto malformed;
	369	}
	370	else
	371	uv = UTF8_ACCUMULATE(uv, *s);
	372	if (!(uv > ouv)) {
	373	/* These cannot be allowed. */
	374	if (uv == ouv) {
	375	if (!(flags & UTF8_ALLOW_LONG)) {
	376	warning = UTF8_WARN_LONG;
	377	goto malformed;
	378	}
	379	}
	380	else { /* uv < ouv */
	381	/* This cannot be allowed. */
	382	warning = UTF8_WARN_OVERFLOW;
	383	goto malformed;
	384	}
	385	}
	386	s++;
	387	ouv = uv;
	388	}
	389
	390	if (UNICODE_IS_SURROGATE(uv) &&
	391	!(flags & UTF8_ALLOW_SURROGATE)) {
	392	warning = UTF8_WARN_SURROGATE;
	393	goto malformed;
	394	} else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
	395	!(flags & UTF8_ALLOW_BOM)) {
	396	warning = UTF8_WARN_BOM;
	397	goto malformed;
	398	} else if ((expectlen > UNISKIP(uv)) &&
	399	!(flags & UTF8_ALLOW_LONG)) {
	400	warning = UTF8_WARN_LONG;
	401	goto malformed;
	402	} else if (UNICODE_IS_ILLEGAL(uv) &&
	403	!(flags & UTF8_ALLOW_FFFF)) {
	404	warning = UTF8_WARN_FFFF;
	405	goto malformed;
	406	}
	407
	408	return uv;
	409
	410	malformed:
	411
	412	if (flags & UTF8_CHECK_ONLY) {
	413	if (retlen)
	414	*retlen = -1;
	415	return 0;
	416	}
	417
	418	if (dowarn) {
	419	SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
	420
	421	switch (warning) {
	422	case 0: /* Intentionally empty. */ break;
	423	case UTF8_WARN_EMPTY:
	424	Perl_sv_catpvf(aTHX_ sv, "(empty string)");
	425	break;
	426	case UTF8_WARN_CONTINUATION:
	427	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	428	break;
	429	case UTF8_WARN_NON_CONTINUATION:
	430	if (s == s0)
	431	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	432	(UV)s[1], startbyte);
	433	else
	434	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	435	(UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
	436
	437	break;
	438	case UTF8_WARN_FE_FF:
	439	Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
	440	break;
	441	case UTF8_WARN_SHORT:
	442	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	443	curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
	444	expectlen = curlen; /* distance for caller to skip */
	445	break;
	446	case UTF8_WARN_OVERFLOW:
	447	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	448	ouv, *s, startbyte);
	449	break;
	450	case UTF8_WARN_SURROGATE:
	451	Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
	452	break;
	453	case UTF8_WARN_BOM:
	454	Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
	455	break;
	456	case UTF8_WARN_LONG:
	457	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	458	expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	459	break;
	460	case UTF8_WARN_FFFF:
	461	Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
	462	break;
	463	default:
	464	Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
	465	break;
	466	}
	467
	468	if (warning) {
	469	char *s = SvPVX(sv);
	470
	471	if (PL_op)
	472	Perl_warner(aTHX_ WARN_UTF8,
	473	"%s in %s", s, OP_DESC(PL_op));
	474	else
	475	Perl_warner(aTHX_ WARN_UTF8, "%s", s);
	476	}
	477	}
	478
	479	if (retlen)
	480	*retlen = expectlen ? expectlen : len;
	481
	482	return 0;
	483	}
	484
	485	/*
	486	=for apidoc A\|UV\|utf8_to_uvchr\|U8 s\|STRLEN retlen
	487
	488	Returns the native character value of the first character in the string C<s>
	489	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
	490	length, in bytes, of that character.
	491
	492	If C<s> does not point to a well-formed UTF8 character, zero is
	493	returned and retlen is set, if possible, to -1.
	494
	495	=cut
	496	*/
	497
	498	UV
	499	Perl_utf8_to_uvchr(pTHX_ U8 s, STRLEN retlen)
	500	{
	501	return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
	502	}
	503
	504	/*
	505	=for apidoc A\|UV\|utf8_to_uvuni\|U8 s\|STRLEN retlen
	506
	507	Returns the Unicode code point of the first character in the string C<s>
	508	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
	509	length, in bytes, of that character.
	510
	511	This function should only be used when returned UV is considered
	512	an index into the Unicode semantic tables (e.g. swashes).
	513
	514	If C<s> does not point to a well-formed UTF8 character, zero is
	515	returned and retlen is set, if possible, to -1.
	516
	517	=cut
	518	*/
	519
	520	UV
	521	Perl_utf8_to_uvuni(pTHX_ U8 s, STRLEN retlen)
	522	{
	523	/* Call the low level routine asking for checks */
	524	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
	525	}
	526
	527	/*
	528	=for apidoc A\|STRLEN\|utf8_length\|U8 s\|U8 e
	529
	530	Return the length of the UTF-8 char encoded string C<s> in characters.
	531	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	532	up past C<e>, croaks.
	533
	534	=cut
	535	*/
	536
	537	STRLEN
	538	Perl_utf8_length(pTHX_ U8 s, U8 e)
	539	{
	540	STRLEN len = 0;
	541
	542	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	543	* the bitops (especially ~) can create illegal UTF-8.
	544	* In other words: in Perl UTF-8 is not just for Unicode. */
	545
	546	if (e < s)
	547	Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
	548	while (s < e) {
	549	U8 t = UTF8SKIP(s);
	550
	551	if (e - s < t)
	552	Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
	553	s += t;
	554	len++;
	555	}
	556
	557	return len;
	558	}
	559
	560	/*
	561	=for apidoc A\|IV\|utf8_distance\|U8 a\|U8 b
	562
	563	Returns the number of UTF8 characters between the UTF-8 pointers C<a>
	564	and C<b>.
	565
	566	WARNING: use only if you know that the pointers point inside the
	567	same UTF-8 buffer.
	568
	569	=cut
	570	*/
	571
	572	IV
	573	Perl_utf8_distance(pTHX_ U8 a, U8 b)
	574	{
	575	IV off = 0;
	576
	577	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	578	* the bitops (especially ~) can create illegal UTF-8.
	579	* In other words: in Perl UTF-8 is not just for Unicode. */
	580
	581	if (a < b) {
	582	while (a < b) {
	583	U8 c = UTF8SKIP(a);
	584
	585	if (b - a < c)
	586	Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
	587	a += c;
	588	off--;
	589	}
	590	}
	591	else {
	592	while (b < a) {
	593	U8 c = UTF8SKIP(b);
	594
	595	if (a - b < c)
	596	Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
	597	b += c;
	598	off++;
	599	}
	600	}
	601
	602	return off;
	603	}
	604
	605	/*
	606	=for apidoc A\|U8 \|utf8_hop\|U8 s\|I32 off
	607
	608	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	609	forward or backward.
	610
	611	WARNING: do not use the following unless you know C<off> is within
	612	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	613	on the first byte of character or just after the last byte of a character.
	614
	615	=cut
	616	*/
	617
	618	U8 *
	619	Perl_utf8_hop(pTHX_ U8 *s, I32 off)
	620	{
	621	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	622	* the bitops (especially ~) can create illegal UTF-8.
	623	* In other words: in Perl UTF-8 is not just for Unicode. */
	624
	625	if (off >= 0) {
	626	while (off--)
	627	s += UTF8SKIP(s);
	628	}
	629	else {
	630	while (off++) {
	631	s--;
	632	while (UTF8_IS_CONTINUATION(*s))
	633	s--;
	634	}
	635	}
	636	return s;
	637	}
	638
	639	/*
	640	=for apidoc A\|U8 \|utf8_to_bytes\|U8 s\|STRLEN *len
	641
	642	Converts a string C<s> of length C<len> from UTF8 into byte encoding.
	643	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	644	updates len to contain the new length.
	645	Returns zero on failure, setting C<len> to -1.
	646
	647	=cut
	648	*/
	649
	650	U8 *
	651	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	652	{
	653	U8 *send;
	654	U8 *d;
	655	U8 *save = s;
	656
	657	/* ensure valid UTF8 and chars < 256 before updating string */
	658	for (send = s + *len; s < send; ) {
	659	U8 c = *s++;
	660
	661	if (!UTF8_IS_INVARIANT(c) &&
	662	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	663	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	664	*len = -1;
	665	return 0;
	666	}
	667	}
	668
	669	d = s = save;
	670	while (s < send) {
	671	STRLEN ulen;
	672	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	673	s += ulen;
	674	}
	675	*d = '\0';
	676	*len = d - save;
	677	return save;
	678	}
	679
	680	/*
	681	=for apidoc A\|U8 \|bytes_from_utf8\|U8 s\|STRLEN len\|bool is_utf8
	682
	683	Converts a string C<s> of length C<len> from UTF8 into byte encoding.
	684	Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	685	the newly-created string, and updates C<len> to contain the new
	686	length. Returns the original string if no conversion occurs, C<len>
	687	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	688	0 if C<s> is converted or contains all 7bit characters.
	689
	690	=cut
	691	*/
	692
	693	U8 *
	694	Perl_bytes_from_utf8(pTHX_ U8 s, STRLEN len, bool *is_utf8)
	695	{
	696	U8 *d;
	697	U8 *start = s;
	698	U8 *send;
	699	I32 count = 0;
	700
	701	if (!*is_utf8)
	702	return start;
	703
	704	/* ensure valid UTF8 and chars < 256 before converting string */
	705	for (send = s + *len; s < send;) {
	706	U8 c = *s++;
	707	if (!UTF8_IS_INVARIANT(c)) {
	708	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	709	(c = *s++) && UTF8_IS_CONTINUATION(c))
	710	count++;
	711	else
	712	return start;
	713	}
	714	}
	715
	716	*is_utf8 = 0;
	717
	718	Newz(801, d, (*len) - count + 1, U8);
	719	s = start; start = d;
	720	while (s < send) {
	721	U8 c = *s++;
	722	if (!UTF8_IS_INVARIANT(c)) {
	723	/* Then it is two-byte encoded */
	724	c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
	725	c = ASCII_TO_NATIVE(c);
	726	}
	727	*d++ = c;
	728	}
	729	*d = '\0';
	730	*len = d - start;
	731	return start;
	732	}
	733
	734	/*
	735	=for apidoc A\|U8 \|bytes_to_utf8\|U8 s\|STRLEN *len
	736
	737	Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
	738	Returns a pointer to the newly-created string, and sets C<len> to
	739	reflect the new length.
	740
	741	=cut
	742	*/
	743
	744	U8*
	745	Perl_bytes_to_utf8(pTHX_ U8 s, STRLEN len)
	746	{
	747	U8 *send;
	748	U8 *d;
	749	U8 *dst;
	750	send = s + (*len);
	751
	752	Newz(801, d, (len) 2 + 1, U8);
	753	dst = d;
	754
	755	while (s < send) {
	756	UV uv = NATIVE_TO_ASCII(*s++);
	757	if (UNI_IS_INVARIANT(uv))
	758	*d++ = UTF_TO_NATIVE(uv);
	759	else {
	760	*d++ = UTF8_EIGHT_BIT_HI(uv);
	761	*d++ = UTF8_EIGHT_BIT_LO(uv);
	762	}
	763	}
	764	*d = '\0';
	765	*len = d-dst;
	766	return dst;
	767	}
	768
	769	/*
	770	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	771	*
	772	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	773	* We optimize for native, for obvious reasons. */
	774
	775	U8*
	776	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	777	{
	778	U8* pend;
	779	U8* dstart = d;
	780
	781	if (bytelen & 1)
	782	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
	783
	784	pend = p + bytelen;
	785
	786	while (p < pend) {
	787	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	788	p += 2;
	789	if (uv < 0x80) {
	790	*d++ = uv;
	791	continue;
	792	}
	793	if (uv < 0x800) {
	794	*d++ = (( uv >> 6) \| 0xc0);
	795	*d++ = (( uv & 0x3f) \| 0x80);
	796	continue;
	797	}
	798	if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
	799	UV low = *p++;
	800	if (low < 0xdc00 \|\| low >= 0xdfff)
	801	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	802	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	803	}
	804	if (uv < 0x10000) {
	805	*d++ = (( uv >> 12) \| 0xe0);
	806	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	807	*d++ = (( uv & 0x3f) \| 0x80);
	808	continue;
	809	}
	810	else {
	811	*d++ = (( uv >> 18) \| 0xf0);
	812	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	813	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	814	*d++ = (( uv & 0x3f) \| 0x80);
	815	continue;
	816	}
	817	}
	818	*newlen = d - dstart;
	819	return d;
	820	}
	821
	822	/* Note: this one is slightly destructive of the source. */
	823
	824	U8*
	825	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	826	{
	827	U8* s = (U8*)p;
	828	U8* send = s + bytelen;
	829	while (s < send) {
	830	U8 tmp = s[0];
	831	s[0] = s[1];
	832	s[1] = tmp;
	833	s += 2;
	834	}
	835	return utf16_to_utf8(p, d, bytelen, newlen);
	836	}
	837
	838	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	839
	840	bool
	841	Perl_is_uni_alnum(pTHX_ UV c)
	842	{
	843	U8 tmpbuf[UTF8_MAXLEN+1];
	844	uvchr_to_utf8(tmpbuf, c);
	845	return is_utf8_alnum(tmpbuf);
	846	}
	847
	848	bool
	849	Perl_is_uni_alnumc(pTHX_ UV c)
	850	{
	851	U8 tmpbuf[UTF8_MAXLEN+1];
	852	uvchr_to_utf8(tmpbuf, c);
	853	return is_utf8_alnumc(tmpbuf);
	854	}
	855
	856	bool
	857	Perl_is_uni_idfirst(pTHX_ UV c)
	858	{
	859	U8 tmpbuf[UTF8_MAXLEN+1];
	860	uvchr_to_utf8(tmpbuf, c);
	861	return is_utf8_idfirst(tmpbuf);
	862	}
	863
	864	bool
	865	Perl_is_uni_alpha(pTHX_ UV c)
	866	{
	867	U8 tmpbuf[UTF8_MAXLEN+1];
	868	uvchr_to_utf8(tmpbuf, c);
	869	return is_utf8_alpha(tmpbuf);
	870	}
	871
	872	bool
	873	Perl_is_uni_ascii(pTHX_ UV c)
	874	{
	875	U8 tmpbuf[UTF8_MAXLEN+1];
	876	uvchr_to_utf8(tmpbuf, c);
	877	return is_utf8_ascii(tmpbuf);
	878	}
	879
	880	bool
	881	Perl_is_uni_space(pTHX_ UV c)
	882	{
	883	U8 tmpbuf[UTF8_MAXLEN+1];
	884	uvchr_to_utf8(tmpbuf, c);
	885	return is_utf8_space(tmpbuf);
	886	}
	887
	888	bool
	889	Perl_is_uni_digit(pTHX_ UV c)
	890	{
	891	U8 tmpbuf[UTF8_MAXLEN+1];
	892	uvchr_to_utf8(tmpbuf, c);
	893	return is_utf8_digit(tmpbuf);
	894	}
	895
	896	bool
	897	Perl_is_uni_upper(pTHX_ UV c)
	898	{
	899	U8 tmpbuf[UTF8_MAXLEN+1];
	900	uvchr_to_utf8(tmpbuf, c);
	901	return is_utf8_upper(tmpbuf);
	902	}
	903
	904	bool
	905	Perl_is_uni_lower(pTHX_ UV c)
	906	{
	907	U8 tmpbuf[UTF8_MAXLEN+1];
	908	uvchr_to_utf8(tmpbuf, c);
	909	return is_utf8_lower(tmpbuf);
	910	}
	911
	912	bool
	913	Perl_is_uni_cntrl(pTHX_ UV c)
	914	{
	915	U8 tmpbuf[UTF8_MAXLEN+1];
	916	uvchr_to_utf8(tmpbuf, c);
	917	return is_utf8_cntrl(tmpbuf);
	918	}
	919
	920	bool
	921	Perl_is_uni_graph(pTHX_ UV c)
	922	{
	923	U8 tmpbuf[UTF8_MAXLEN+1];
	924	uvchr_to_utf8(tmpbuf, c);
	925	return is_utf8_graph(tmpbuf);
	926	}
	927
	928	bool
	929	Perl_is_uni_print(pTHX_ UV c)
	930	{
	931	U8 tmpbuf[UTF8_MAXLEN+1];
	932	uvchr_to_utf8(tmpbuf, c);
	933	return is_utf8_print(tmpbuf);
	934	}
	935
	936	bool
	937	Perl_is_uni_punct(pTHX_ UV c)
	938	{
	939	U8 tmpbuf[UTF8_MAXLEN+1];
	940	uvchr_to_utf8(tmpbuf, c);
	941	return is_utf8_punct(tmpbuf);
	942	}
	943
	944	bool
	945	Perl_is_uni_xdigit(pTHX_ UV c)
	946	{
	947	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	948	uvchr_to_utf8(tmpbuf, c);
	949	return is_utf8_xdigit(tmpbuf);
	950	}
	951
	952	UV
	953	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	954	{
	955	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	956	uvchr_to_utf8(tmpbuf, c);
	957	return to_utf8_upper(tmpbuf, p, lenp);
	958	}
	959
	960	UV
	961	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	962	{
	963	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	964	uvchr_to_utf8(tmpbuf, c);
	965	return to_utf8_title(tmpbuf, p, lenp);
	966	}
	967
	968	UV
	969	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	970	{
	971	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	972	uvchr_to_utf8(tmpbuf, c);
	973	return to_utf8_lower(tmpbuf, p, lenp);
	974	}
	975
	976	UV
	977	Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
	978	{
	979	U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
	980	uvchr_to_utf8(tmpbuf, c);
	981	return to_utf8_fold(tmpbuf, p, lenp);
	982	}
	983
	984	/* for now these all assume no locale info available for Unicode > 255 */
	985
	986	bool
	987	Perl_is_uni_alnum_lc(pTHX_ UV c)
	988	{
	989	return is_uni_alnum(c); /* XXX no locale support yet */
	990	}
	991
	992	bool
	993	Perl_is_uni_alnumc_lc(pTHX_ UV c)
	994	{
	995	return is_uni_alnumc(c); /* XXX no locale support yet */
	996	}
	997
	998	bool
	999	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1000	{
	1001	return is_uni_idfirst(c); /* XXX no locale support yet */
	1002	}
	1003
	1004	bool
	1005	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1006	{
	1007	return is_uni_alpha(c); /* XXX no locale support yet */
	1008	}
	1009
	1010	bool
	1011	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1012	{
	1013	return is_uni_ascii(c); /* XXX no locale support yet */
	1014	}
	1015
	1016	bool
	1017	Perl_is_uni_space_lc(pTHX_ UV c)
	1018	{
	1019	return is_uni_space(c); /* XXX no locale support yet */
	1020	}
	1021
	1022	bool
	1023	Perl_is_uni_digit_lc(pTHX_ UV c)
	1024	{
	1025	return is_uni_digit(c); /* XXX no locale support yet */
	1026	}
	1027
	1028	bool
	1029	Perl_is_uni_upper_lc(pTHX_ UV c)
	1030	{
	1031	return is_uni_upper(c); /* XXX no locale support yet */
	1032	}
	1033
	1034	bool
	1035	Perl_is_uni_lower_lc(pTHX_ UV c)
	1036	{
	1037	return is_uni_lower(c); /* XXX no locale support yet */
	1038	}
	1039
	1040	bool
	1041	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1042	{
	1043	return is_uni_cntrl(c); /* XXX no locale support yet */
	1044	}
	1045
	1046	bool
	1047	Perl_is_uni_graph_lc(pTHX_ UV c)
	1048	{
	1049	return is_uni_graph(c); /* XXX no locale support yet */
	1050	}
	1051
	1052	bool
	1053	Perl_is_uni_print_lc(pTHX_ UV c)
	1054	{
	1055	return is_uni_print(c); /* XXX no locale support yet */
	1056	}
	1057
	1058	bool
	1059	Perl_is_uni_punct_lc(pTHX_ UV c)
	1060	{
	1061	return is_uni_punct(c); /* XXX no locale support yet */
	1062	}
	1063
	1064	bool
	1065	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1066	{
	1067	return is_uni_xdigit(c); /* XXX no locale support yet */
	1068	}
	1069
	1070	U32
	1071	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1072	{
	1073	/* XXX returns only the first character -- do not use XXX */
	1074	/* XXX no locale support yet */
	1075	STRLEN len;
	1076	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	1077	return (U32)to_uni_upper(c, tmpbuf, &len);
	1078	}
	1079
	1080	U32
	1081	Perl_to_uni_title_lc(pTHX_ U32 c)
	1082	{
	1083	/* XXX returns only the first character XXX -- do not use XXX */
	1084	/* XXX no locale support yet */
	1085	STRLEN len;
	1086	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	1087	return (U32)to_uni_title(c, tmpbuf, &len);
	1088	}
	1089
	1090	U32
	1091	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1092	{
	1093	/* XXX returns only the first character -- do not use XXX */
	1094	/* XXX no locale support yet */
	1095	STRLEN len;
	1096	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	1097	return (U32)to_uni_lower(c, tmpbuf, &len);
	1098	}
	1099
	1100	bool
	1101	Perl_is_utf8_alnum(pTHX_ U8 *p)
	1102	{
	1103	if (!is_utf8_char(p))
	1104	return FALSE;
	1105	if (!PL_utf8_alnum)
	1106	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1107	* descendant of isalnum(3), in other words, it doesn't
	1108	* contain the '_'. --jhi */
	1109	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
	1110	return swash_fetch(PL_utf8_alnum, p, TRUE);
	1111	/* return p == '_' \|\| is_utf8_alpha(p) \|\| is_utf8_digit(p); /
	1112	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	1113	if (!PL_utf8_alnum)
	1114	PL_utf8_alnum = swash_init("utf8", "",
	1115	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	1116	return swash_fetch(PL_utf8_alnum, p, TRUE);
	1117	#endif
	1118	}
	1119
	1120	bool
	1121	Perl_is_utf8_alnumc(pTHX_ U8 *p)
	1122	{
	1123	if (!is_utf8_char(p))
	1124	return FALSE;
	1125	if (!PL_utf8_alnum)
	1126	PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
	1127	return swash_fetch(PL_utf8_alnum, p, TRUE);
	1128	/* return is_utf8_alpha(p) \|\| is_utf8_digit(p); */
	1129	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	1130	if (!PL_utf8_alnum)
	1131	PL_utf8_alnum = swash_init("utf8", "",
	1132	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	1133	return swash_fetch(PL_utf8_alnum, p, TRUE);
	1134	#endif
	1135	}
	1136
	1137	bool
	1138	Perl_is_utf8_idfirst(pTHX_ U8 *p)
	1139	{
	1140	return *p == '_' \|\| is_utf8_alpha(p);
	1141	}
	1142
	1143	bool
	1144	Perl_is_utf8_alpha(pTHX_ U8 *p)
	1145	{
	1146	if (!is_utf8_char(p))
	1147	return FALSE;
	1148	if (!PL_utf8_alpha)
	1149	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
	1150	return swash_fetch(PL_utf8_alpha, p, TRUE);
	1151	}
	1152
	1153	bool
	1154	Perl_is_utf8_ascii(pTHX_ U8 *p)
	1155	{
	1156	if (!is_utf8_char(p))
	1157	return FALSE;
	1158	if (!PL_utf8_ascii)
	1159	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
	1160	return swash_fetch(PL_utf8_ascii, p, TRUE);
	1161	}
	1162
	1163	bool
	1164	Perl_is_utf8_space(pTHX_ U8 *p)
	1165	{
	1166	if (!is_utf8_char(p))
	1167	return FALSE;
	1168	if (!PL_utf8_space)
	1169	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
	1170	return swash_fetch(PL_utf8_space, p, TRUE);
	1171	}
	1172
	1173	bool
	1174	Perl_is_utf8_digit(pTHX_ U8 *p)
	1175	{
	1176	if (!is_utf8_char(p))
	1177	return FALSE;
	1178	if (!PL_utf8_digit)
	1179	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
	1180	return swash_fetch(PL_utf8_digit, p, TRUE);
	1181	}
	1182
	1183	bool
	1184	Perl_is_utf8_upper(pTHX_ U8 *p)
	1185	{
	1186	if (!is_utf8_char(p))
	1187	return FALSE;
	1188	if (!PL_utf8_upper)
	1189	PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
	1190	return swash_fetch(PL_utf8_upper, p, TRUE);
	1191	}
	1192
	1193	bool
	1194	Perl_is_utf8_lower(pTHX_ U8 *p)
	1195	{
	1196	if (!is_utf8_char(p))
	1197	return FALSE;
	1198	if (!PL_utf8_lower)
	1199	PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
	1200	return swash_fetch(PL_utf8_lower, p, TRUE);
	1201	}
	1202
	1203	bool
	1204	Perl_is_utf8_cntrl(pTHX_ U8 *p)
	1205	{
	1206	if (!is_utf8_char(p))
	1207	return FALSE;
	1208	if (!PL_utf8_cntrl)
	1209	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
	1210	return swash_fetch(PL_utf8_cntrl, p, TRUE);
	1211	}
	1212
	1213	bool
	1214	Perl_is_utf8_graph(pTHX_ U8 *p)
	1215	{
	1216	if (!is_utf8_char(p))
	1217	return FALSE;
	1218	if (!PL_utf8_graph)
	1219	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
	1220	return swash_fetch(PL_utf8_graph, p, TRUE);
	1221	}
	1222
	1223	bool
	1224	Perl_is_utf8_print(pTHX_ U8 *p)
	1225	{
	1226	if (!is_utf8_char(p))
	1227	return FALSE;
	1228	if (!PL_utf8_print)
	1229	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
	1230	return swash_fetch(PL_utf8_print, p, TRUE);
	1231	}
	1232
	1233	bool
	1234	Perl_is_utf8_punct(pTHX_ U8 *p)
	1235	{
	1236	if (!is_utf8_char(p))
	1237	return FALSE;
	1238	if (!PL_utf8_punct)
	1239	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
	1240	return swash_fetch(PL_utf8_punct, p, TRUE);
	1241	}
	1242
	1243	bool
	1244	Perl_is_utf8_xdigit(pTHX_ U8 *p)
	1245	{
	1246	if (!is_utf8_char(p))
	1247	return FALSE;
	1248	if (!PL_utf8_xdigit)
	1249	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
	1250	return swash_fetch(PL_utf8_xdigit, p, TRUE);
	1251	}
	1252
	1253	bool
	1254	Perl_is_utf8_mark(pTHX_ U8 *p)
	1255	{
	1256	if (!is_utf8_char(p))
	1257	return FALSE;
	1258	if (!PL_utf8_mark)
	1259	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
	1260	return swash_fetch(PL_utf8_mark, p, TRUE);
	1261	}
	1262
	1263	/*
	1264	=for apidoc A\|UV\|to_utf8_case\|U8 p\|U8 ustrp\|STRLEN lenp\|SV swash\|char normal\|char *special
	1265
	1266	The "p" contains the pointer to the UTF-8 string encoding
	1267	the character that is being converted.
	1268
	1269	The "ustrp" is a pointer to the character buffer to put the
	1270	conversion result to. The "lenp" is a pointer to the length
	1271	of the result.
	1272
	1273	The "swashp" is a pointer to the swash to use.
	1274
	1275	Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
	1276	and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
	1277	but not always, a multicharacter mapping), is tried first.
	1278
	1279	The "special" is a string like "utf8::ToSpecLower", which means the
	1280	hash %utf8::ToSpecLower. The access to the hash is through
	1281	Perl_to_utf8_case().
	1282
	1283	The "normal" is a string like "ToLower" which means the swash
	1284	%utf8::ToLower.
	1285
	1286	=cut */
	1287
	1288	UV
	1289	Perl_to_utf8_case(pTHX_ U8 p, U8 ustrp, STRLEN lenp, SV swashp, char normal, char *special)
	1290	{
	1291	UV uv0, uv1;
	1292	U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
	1293	STRLEN len = 0;
	1294
	1295	uv0 = utf8_to_uvchr(p, 0);
	1296	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
	1297	* are necessary in EBCDIC, they are redundant no-ops
	1298	* in ASCII-ish platforms, and hopefully optimized away. */
	1299	uv1 = NATIVE_TO_UNI(uv0);
	1300	uvuni_to_utf8(tmpbuf, uv1);
	1301
	1302	if (!swashp) / load on-demand */
	1303	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	1304
	1305	if (special) {
	1306	/* It might be "special" (sometimes, but not always,
	1307	* a multicharacter mapping) */
	1308	HV *hv;
	1309	SV *keysv;
	1310	HE *he;
	1311	SV *val;
	1312
	1313	if ((hv = get_hv(special, FALSE)) &&
	1314	(keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
	1315	(he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
	1316	(val = HeVAL(he))) {
	1317	char *s;
	1318
	1319	s = SvPV(val, len);
	1320	if (len == 1)
	1321	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
	1322	else {
	1323	#ifdef EBCDIC
	1324	/* If we have EBCDIC we need to remap the characters
	1325	* since any characters in the low 256 are Unicode
	1326	* code points, not EBCDIC. */
	1327	U8 t = (U8)s, tend = t + len, d;
	1328
	1329	d = tmpbuf;
	1330	if (SvUTF8(val)) {
	1331	STRLEN tlen = 0;
	1332
	1333	while (t < tend) {
	1334	UV c = utf8_to_uvchr(t, &tlen);
	1335	if (tlen > 0) {
	1336	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	1337	t += tlen;
	1338	}
	1339	else
	1340	break;
	1341	}
	1342	}
	1343	else {
	1344	while (t < tend) {
	1345	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	1346	t++;
	1347	}
	1348	}
	1349	len = d - tmpbuf;
	1350	Copy(tmpbuf, ustrp, len, U8);
	1351	#else
	1352	Copy(s, ustrp, len, U8);
	1353	#endif
	1354	}
	1355	}
	1356	}
	1357
	1358	if (!len && *swashp) {
	1359	UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
	1360
	1361	if (uv2) {
	1362	/* It was "normal" (a single character mapping). */
	1363	UV uv3 = UNI_TO_NATIVE(uv2);
	1364
	1365	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	1366	}
	1367	}
	1368
	1369	if (!len) /* Neither: just copy. */
	1370	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
	1371
	1372	if (lenp)
	1373	*lenp = len;
	1374
	1375	return len ? utf8_to_uvchr(ustrp, 0) : 0;
	1376	}
	1377
	1378	/*
	1379	=for apidoc A\|UV\|to_utf8_upper\|U8 p\|U8 ustrp\|STRLEN *lenp
	1380
	1381	Convert the UTF-8 encoded character at p to its uppercase version and
	1382	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1383	that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
	1384	uppercase version may be longer than the original character (up to two
	1385	characters).
	1386
	1387	The first character of the uppercased version is returned
	1388	(but note, as explained above, that there may be more.)
	1389
	1390	=cut */
	1391
	1392	UV
	1393	Perl_to_utf8_upper(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1394	{
	1395	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1396	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
	1397	}
	1398
	1399	/*
	1400	=for apidoc A\|UV\|to_utf8_title\|U8 p\|U8 ustrp\|STRLEN *lenp
	1401
	1402	Convert the UTF-8 encoded character at p to its titlecase version and
	1403	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1404	that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
	1405	titlecase version may be longer than the original character (up to two
	1406	characters).
	1407
	1408	The first character of the titlecased version is returned
	1409	(but note, as explained above, that there may be more.)
	1410
	1411	=cut */
	1412
	1413	UV
	1414	Perl_to_utf8_title(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1415	{
	1416	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1417	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
	1418	}
	1419
	1420	/*
	1421	=for apidoc A\|UV\|to_utf8_lower\|U8 p\|U8 ustrp\|STRLEN *lenp
	1422
	1423	Convert the UTF-8 encoded character at p to its lowercase version and
	1424	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1425	that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
	1426	lowercase version may be longer than the original character (up to two
	1427	characters).
	1428
	1429	The first character of the lowercased version is returned
	1430	(but note, as explained above, that there may be more.)
	1431
	1432	=cut */
	1433
	1434	UV
	1435	Perl_to_utf8_lower(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1436	{
	1437	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1438	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
	1439	}
	1440
	1441	/*
	1442	=for apidoc A\|UV\|to_utf8_fold\|U8 p\|U8 ustrp\|STRLEN *lenp
	1443
	1444	Convert the UTF-8 encoded character at p to its foldcase version and
	1445	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1446	that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
	1447	foldcase version may be longer than the original character (up to
	1448	three characters).
	1449
	1450	The first character of the foldcased version is returned
	1451	(but note, as explained above, that there may be more.)
	1452
	1453	=cut */
	1454
	1455	UV
	1456	Perl_to_utf8_fold(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1457	{
	1458	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1459	&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
	1460	}
	1461
	1462	/* a "swash" is a swatch hash */
	1463
	1464	SV*
	1465	Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
	1466	{
	1467	SV* retval;
	1468	SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
	1469	dSP;
	1470	HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
	1471	SV* errsv_save;
	1472
	1473	if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
	1474	ENTER;
	1475	errsv_save = newSVsv(ERRSV);
	1476	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
	1477	if (!SvTRUE(ERRSV))
	1478	sv_setsv(ERRSV, errsv_save);
	1479	SvREFCNT_dec(errsv_save);
	1480	LEAVE;
	1481	}
	1482	SPAGAIN;
	1483	PUSHSTACKi(PERLSI_MAGIC);
	1484	PUSHMARK(SP);
	1485	EXTEND(SP,5);
	1486	PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
	1487	PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
	1488	PUSHs(listsv);
	1489	PUSHs(sv_2mortal(newSViv(minbits)));
	1490	PUSHs(sv_2mortal(newSViv(none)));
	1491	PUTBACK;
	1492	ENTER;
	1493	SAVEI32(PL_hints);
	1494	PL_hints = 0;
	1495	save_re_context();
	1496	if (PL_curcop == &PL_compiling)
	1497	/* XXX ought to be handled by lex_start */
	1498	sv_setpv(tokenbufsv, PL_tokenbuf);
	1499	errsv_save = newSVsv(ERRSV);
	1500	if (call_method("SWASHNEW", G_SCALAR))
	1501	retval = newSVsv(*PL_stack_sp--);
	1502	else
	1503	retval = &PL_sv_undef;
	1504	if (!SvTRUE(ERRSV))
	1505	sv_setsv(ERRSV, errsv_save);
	1506	SvREFCNT_dec(errsv_save);
	1507	LEAVE;
	1508	POPSTACK;
	1509	if (PL_curcop == &PL_compiling) {
	1510	STRLEN len;
	1511	char* pv = SvPV(tokenbufsv, len);
	1512
	1513	Copy(pv, PL_tokenbuf, len+1, char);
	1514	PL_curcop->op_private = PL_hints;
	1515	}
	1516	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV)
	1517	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	1518	return retval;
	1519	}
	1520
	1521
	1522	/* This API is wrong for special case conversions since we may need to
	1523	* return several Unicode characters for a single Unicode character
	1524	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	1525	* the lower-level routine, and it is similarly broken for returning
	1526	* multiple values. --jhi */
	1527	UV
	1528	Perl_swash_fetch(pTHX_ SV sv, U8 ptr, bool do_utf8)
	1529	{
	1530	HV* hv = (HV*)SvRV(sv);
	1531	U32 klen;
	1532	U32 off;
	1533	STRLEN slen;
	1534	STRLEN needents;
	1535	U8 *tmps = NULL;
	1536	U32 bit;
	1537	SV *retval;
	1538	U8 tmputf8[2];
	1539	UV c = NATIVE_TO_ASCII(*ptr);
	1540
	1541	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	1542	tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
	1543	tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
	1544	ptr = tmputf8;
	1545	}
	1546	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	1547	* then the "swatch" is a vec() for al the chars which start
	1548	* with 0xAA..0xYY
	1549	* So the key in the hash (klen) is length of encoded char -1
	1550	*/
	1551	klen = UTF8SKIP(ptr) - 1;
	1552	off = ptr[klen];
	1553
	1554	if (klen == 0)
	1555	{
	1556	/* If char in invariant then swatch is for all the invariant chars
	1557	* In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
	1558	*/
	1559	needents = UTF_CONTINUATION_MARK;
	1560	off = NATIVE_TO_UTF(ptr[klen]);
	1561	}
	1562	else
	1563	{
	1564	/* If char is encoded then swatch is for the prefix */
	1565	needents = (1 << UTF_ACCUMULATION_SHIFT);
	1566	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	1567	}
	1568
	1569	/*
	1570	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	1571	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	1572	* it's nothing to sniff at.) Pity we usually come through at least
	1573	* two function calls to get here...
	1574	*
	1575	* NB: this code assumes that swatches are never modified, once generated!
	1576	*/
	1577
	1578	if (hv == PL_last_swash_hv &&
	1579	klen == PL_last_swash_klen &&
	1580	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	1581	{
	1582	tmps = PL_last_swash_tmps;
	1583	slen = PL_last_swash_slen;
	1584	}
	1585	else {
	1586	/* Try our second-level swatch cache, kept in a hash. */
	1587	SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
	1588
	1589	/* If not cached, generate it via utf8::SWASHGET */
	1590	if (!svp \|\| !SvPOK(svp) \|\| !(tmps = (U8)SvPV(*svp, slen))) {
	1591	dSP;
	1592	/* We use utf8n_to_uvuni() as we want an index into
	1593	Unicode tables, not a native character number.
	1594	*/
	1595	UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
	1596	SV *errsv_save;
	1597	ENTER;
	1598	SAVETMPS;
	1599	save_re_context();
	1600	PUSHSTACKi(PERLSI_MAGIC);
	1601	PUSHMARK(SP);
	1602	EXTEND(SP,3);
	1603	PUSHs((SV*)sv);
	1604	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	1605	PUSHs(sv_2mortal(newSViv((klen) ?
	1606	(code_point & ~(needents - 1)) : 0)));
	1607	PUSHs(sv_2mortal(newSViv(needents)));
	1608	PUTBACK;
	1609	errsv_save = newSVsv(ERRSV);
	1610	if (call_method("SWASHGET", G_SCALAR))
	1611	retval = newSVsv(*PL_stack_sp--);
	1612	else
	1613	retval = &PL_sv_undef;
	1614	if (!SvTRUE(ERRSV))
	1615	sv_setsv(ERRSV, errsv_save);
	1616	SvREFCNT_dec(errsv_save);
	1617	POPSTACK;
	1618	FREETMPS;
	1619	LEAVE;
	1620	if (PL_curcop == &PL_compiling)
	1621	PL_curcop->op_private = PL_hints;
	1622
	1623	svp = hv_store(hv, (char*)ptr, klen, retval, 0);
	1624
	1625	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen)) \|\| (slen << 3) < needents)
	1626	Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
	1627	}
	1628
	1629	PL_last_swash_hv = hv;
	1630	PL_last_swash_klen = klen;
	1631	PL_last_swash_tmps = tmps;
	1632	PL_last_swash_slen = slen;
	1633	if (klen)
	1634	Copy(ptr, PL_last_swash_key, klen, U8);
	1635	}
	1636
	1637	switch ((int)((slen << 3) / needents)) {
	1638	case 1:
	1639	bit = 1 << (off & 7);
	1640	off >>= 3;
	1641	return (tmps[off] & bit) != 0;
	1642	case 8:
	1643	return tmps[off];
	1644	case 16:
	1645	off <<= 1;
	1646	return (tmps[off] << 8) + tmps[off + 1] ;
	1647	case 32:
	1648	off <<= 2;
	1649	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	1650	}
	1651	Perl_croak(aTHX_ "panic: swash_fetch");
	1652	return 0;
	1653	}
	1654
	1655
	1656	/*
	1657	=for apidoc A\|U8 \|uvchr_to_utf8\|U8 d\|UV uv
	1658
	1659	Adds the UTF8 representation of the Native codepoint C<uv> to the end
	1660	of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
	1661	bytes available. The return value is the pointer to the byte after the
	1662	end of the new character. In other words,
	1663
	1664	d = uvchr_to_utf8(d, uv);
	1665
	1666	is the recommended wide native character-aware way of saying
	1667
	1668	*(d++) = uv;
	1669
	1670	=cut
	1671	*/
	1672
	1673	/* On ASCII machines this is normally a macro but we want a
	1674	real function in case XS code wants it
	1675	*/
	1676	#undef Perl_uvchr_to_utf8
	1677	U8 *
	1678	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	1679	{
	1680	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
	1681	}
	1682
	1683	U8 *
	1684	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	1685	{
	1686	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
	1687	}
	1688
	1689	/*
	1690	=for apidoc A\|UV\|utf8n_to_uvchr\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
	1691
	1692	Returns the native character value of the first character in the string C<s>
	1693	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
	1694	length, in bytes, of that character.
	1695
	1696	Allows length and flags to be passed to low level routine.
	1697
	1698	=cut
	1699	*/
	1700	/* On ASCII machines this is normally a macro but we want
	1701	a real function in case XS code wants it
	1702	*/
	1703	#undef Perl_utf8n_to_uvchr
	1704	UV
	1705	Perl_utf8n_to_uvchr(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	1706	{
	1707	UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	1708	return UNI_TO_NATIVE(uv);
	1709	}
	1710
	1711	/*
	1712	=for apidoc A\|char \|pv_uni_display\|SV dsv\|U8 *spv\|STRLEN len\|STRLEN pvlim\|UV flags
	1713
	1714	Build to the scalar dsv a displayable version of the string spv,
	1715	length len, the displayable version being at most pvlim bytes long
	1716	(if longer, the rest is truncated and "..." will be appended).
	1717
	1718	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	1719	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	1720	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	1721	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	1722	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	1723	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	1724
	1725	The pointer to the PV of the dsv is returned.
	1726
	1727	=cut */
	1728	char *
	1729	Perl_pv_uni_display(pTHX_ SV dsv, U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	1730	{
	1731	int truncated = 0;
	1732	char s, e;
	1733
	1734	sv_setpvn(dsv, "", 0);
	1735	for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	1736	UV u;
	1737	bool ok = FALSE;
	1738
	1739	if (pvlim && SvCUR(dsv) >= pvlim) {
	1740	truncated++;
	1741	break;
	1742	}
	1743	u = utf8_to_uvchr((U8*)s, 0);
	1744	if (u < 256) {
	1745	if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
	1746	switch (u & 0xFF) {
	1747	case '\n':
	1748	Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
	1749	case '\r':
	1750	Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
	1751	case '\t':
	1752	Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
	1753	case '\f':
	1754	Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
	1755	case '\a':
	1756	Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
	1757	case '\\':
	1758	Perl_sv_catpvf(aTHX_ dsv, "\\" ); ok = TRUE; break;
	1759	default: break;
	1760	}
	1761	}
	1762	/* isPRINT() is the locale-blind version. */
	1763	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
	1764	Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
	1765	ok = TRUE;
	1766	}
	1767	}
	1768	if (!ok)
	1769	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	1770	}
	1771	if (truncated)
	1772	sv_catpvn(dsv, "...", 3);
	1773
	1774	return SvPVX(dsv);
	1775	}
	1776
	1777	/*
	1778	=for apidoc A\|char \|sv_uni_display\|SV dsv\|SV *ssv\|STRLEN pvlim\|UV flags
	1779
	1780	Build to the scalar dsv a displayable version of the scalar sv,
	1781	the displayable version being at most pvlim bytes long
	1782	(if longer, the rest is truncated and "..." will be appended).
	1783
	1784	The flags argument is as in pv_uni_display().
	1785
	1786	The pointer to the PV of the dsv is returned.
	1787
	1788	=cut */
	1789	char *
	1790	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	1791	{
	1792	return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
	1793	pvlim, flags);
	1794	}
	1795
	1796	/*
	1797	=for apidoc A\|I32\|ibcmp_utf8\|const char s1\|char pe1\|register UV l1\|bool u1\|const char s2\|char **pe2\|register UV l2\|bool u2
	1798
	1799	Return true if the strings s1 and s2 differ case-insensitively, false
	1800	if not (if they are equal case-insensitively). If u1 is true, the
	1801	string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
	1802	the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
	1803	are false, the respective string is assumed to be in native 8-bit
	1804	encoding.
	1805
	1806	If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
	1807	in there (they will point at the beginning of the I<next> character).
	1808	If the pointers behind pe1 or pe2 are non-NULL, they are the end
	1809	pointers beyond which scanning will not continue under any
	1810	circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
	1811	s2+l2 will be used as goal end pointers that will also stop the scan,
	1812	and which qualify towards defining a successful match: all the scans
	1813	that define an explicit length must reach their goal pointers for
	1814	a match to succeed).
	1815
	1816	For case-insensitiveness, the "casefolding" of Unicode is used
	1817	instead of upper/lowercasing both the characters, see
	1818	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	1819
	1820	=cut */
	1821	I32
	1822	Perl_ibcmp_utf8(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2)
	1823	{
	1824	register U8 p1 = (U8)s1;
	1825	register U8 p2 = (U8)s2;
	1826	register U8 e1 = 0, f1 = 0, *q1 = 0;
	1827	register U8 e2 = 0, f2 = 0, *q2 = 0;
	1828	STRLEN n1 = 0, n2 = 0;
	1829	U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
	1830	U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
	1831	U8 natbuf[1+1];
	1832	STRLEN foldlen1, foldlen2;
	1833	bool match;
	1834
	1835	if (pe1)
	1836	e1 = (U8*)pe1;
	1837	if (e1 == 0 \|\| (l1 && l1 < e1 - (U8*)s1))
	1838	f1 = (U8*)s1 + l1;
	1839	if (pe2)
	1840	e2 = (U8*)pe2;
	1841	if (e2 == 0 \|\| (l2 && l2 < e2 - (U8*)s2))
	1842	f2 = (U8*)s2 + l2;
	1843
	1844	if ((e1 == 0 && f1 == 0) \|\| (e2 == 0 && f2 == 0) \|\| (f1 == 0 && f2 == 0))
	1845	return 1; /* mismatch; possible infinite loop or false positive */
	1846
	1847	if (!u1 \|\| !u2)
	1848	natbuf[1] = 0; /* Need to terminate the buffer. */
	1849
	1850	while ((e1 == 0 \|\| p1 < e1) &&
	1851	(f1 == 0 \|\| p1 < f1) &&
	1852	(e2 == 0 \|\| p2 < e2) &&
	1853	(f2 == 0 \|\| p2 < f2)) {
	1854	if (n1 == 0) {
	1855	if (u1)
	1856	to_utf8_fold(p1, foldbuf1, &foldlen1);
	1857	else {
	1858	natbuf[0] = *p1;
	1859	to_utf8_fold(natbuf, foldbuf1, &foldlen1);
	1860	}
	1861	q1 = foldbuf1;
	1862	n1 = foldlen1;
	1863	}
	1864	if (n2 == 0) {
	1865	if (u2)
	1866	to_utf8_fold(p2, foldbuf2, &foldlen2);
	1867	else {
	1868	natbuf[0] = *p2;
	1869	to_utf8_fold(natbuf, foldbuf2, &foldlen2);
	1870	}
	1871	q2 = foldbuf2;
	1872	n2 = foldlen2;
	1873	}
	1874	while (n1 && n2) {
	1875	if ( UTF8SKIP(q1) != UTF8SKIP(q2) \|\|
	1876	(UTF8SKIP(q1) == 1 && q1 != q2) \|\|
	1877	memNE((char)q1, (char)q2, UTF8SKIP(q1)) )
	1878	return 1; /* mismatch */
	1879	n1 -= UTF8SKIP(q1);
	1880	q1 += UTF8SKIP(q1);
	1881	n2 -= UTF8SKIP(q2);
	1882	q2 += UTF8SKIP(q2);
	1883	}
	1884	if (n1 == 0)
	1885	p1 += u1 ? UTF8SKIP(p1) : 1;
	1886	if (n2 == 0)
	1887	p2 += u2 ? UTF8SKIP(p2) : 1;
	1888
	1889	}
	1890
	1891	/* A match is defined by all the scans that specified
	1892	* an explicit length reaching their final goals. */
	1893	match = (f1 == 0 \|\| p1 == f1) && (f2 == 0 \|\| p2 == f2);
	1894
	1895	if (match) {
	1896	if (pe1)
	1897	pe1 = (char)p1;
	1898	if (pe2)
	1899	pe2 = (char)p2;
	1900	}
	1901
	1902	return match ? 0 : 1; /* 0 match, 1 mismatch */
	1903	}
	1904