perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (c) 1998-2002, Larry Wall
	4	*
	5	* You may distribute under the terms of either the GNU General Public
	6	* License or the Artistic License, as specified in the README file.
	7	*
	8	*/
	9
	10	/*
	11	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	12	* heard of that we don't want to see any closer; and that's the one place
	13	* we're trying to get to! And that's just where we can't get, nohow.'
	14	*
	15	* 'Well do I understand your speech,' he answered in the same language;
	16	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	17	* as is the custom in the West, if you wish to be answered?'
	18	*
	19	* ...the travellers perceived that the floor was paved with stones of many
	20	* hues; branching runes and strange devices intertwined beneath their feet.
	21	*/
	22
	23	#include "EXTERN.h"
	24	#define PERL_IN_UTF8_C
	25	#include "perl.h"
	26
	27	/*
	28	=head1 Unicode Support
	29
	30	=for apidoc A\|U8 \|uvuni_to_utf8_flags\|U8 d\|UV uv\|UV flags
	31
	32	Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
	33	of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
	34	bytes available. The return value is the pointer to the byte after the
	35	end of the new character. In other words,
	36
	37	d = uvuni_to_utf8_flags(d, uv, flags);
	38
	39	or, in most cases,
	40
	41	d = uvuni_to_utf8(d, uv);
	42
	43	(which is equivalent to)
	44
	45	d = uvuni_to_utf8_flags(d, uv, 0);
	46
	47	is the recommended Unicode-aware way of saying
	48
	49	*(d++) = uv;
	50
	51	=cut
	52	*/
	53
	54	U8 *
	55	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	56	{
	57	if (ckWARN(WARN_UTF8)) {
	58	if (UNICODE_IS_SURROGATE(uv) &&
	59	!(flags & UNICODE_ALLOW_SURROGATE))
	60	Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
	61	else if (
	62	((uv >= 0xFDD0 && uv <= 0xFDEF &&
	63	!(flags & UNICODE_ALLOW_FDD0))
	64	\|\|
	65	((uv & 0xFFFF) == 0xFFFE &&
	66	!(flags & UNICODE_ALLOW_FFFE))
	67	\|\|
	68	((uv & 0xFFFF) == 0xFFFF &&
	69	!(flags & UNICODE_ALLOW_FFFF))) &&
	70	/* UNICODE_ALLOW_SUPER includes
	71	* FFFEs and FFFFs beyond 0x10FFFF. */
	72	((uv <= PERL_UNICODE_MAX) \|\|
	73	!(flags & UNICODE_ALLOW_SUPER))
	74	)
	75	Perl_warner(aTHX_ WARN_UTF8,
	76	"Unicode character 0x%04"UVxf" is illegal", uv);
	77	}
	78	if (UNI_IS_INVARIANT(uv)) {
	79	*d++ = UTF_TO_NATIVE(uv);
	80	return d;
	81	}
	82	#if defined(EBCDIC)
	83	else {
	84	STRLEN len = UNISKIP(uv);
	85	U8 *p = d+len-1;
	86	while (p > d) {
	87	*p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	88	uv >>= UTF_ACCUMULATION_SHIFT;
	89	}
	90	*p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	91	return d+len;
	92	}
	93	#else /* Non loop style */
	94	if (uv < 0x800) {
	95	*d++ = (( uv >> 6) \| 0xc0);
	96	*d++ = (( uv & 0x3f) \| 0x80);
	97	return d;
	98	}
	99	if (uv < 0x10000) {
	100	*d++ = (( uv >> 12) \| 0xe0);
	101	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	102	*d++ = (( uv & 0x3f) \| 0x80);
	103	return d;
	104	}
	105	if (uv < 0x200000) {
	106	*d++ = (( uv >> 18) \| 0xf0);
	107	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	108	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	109	*d++ = (( uv & 0x3f) \| 0x80);
	110	return d;
	111	}
	112	if (uv < 0x4000000) {
	113	*d++ = (( uv >> 24) \| 0xf8);
	114	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	115	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	116	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	117	*d++ = (( uv & 0x3f) \| 0x80);
	118	return d;
	119	}
	120	if (uv < 0x80000000) {
	121	*d++ = (( uv >> 30) \| 0xfc);
	122	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
	123	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	124	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	125	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	126	*d++ = (( uv & 0x3f) \| 0x80);
	127	return d;
	128	}
	129	#ifdef HAS_QUAD
	130	if (uv < UTF8_QUAD_MAX)
	131	#endif
	132	{
	133	d++ = 0xfe; / Can't match U+FEFF! */
	134	*d++ = (((uv >> 30) & 0x3f) \| 0x80);
	135	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
	136	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	137	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	138	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	139	*d++ = (( uv & 0x3f) \| 0x80);
	140	return d;
	141	}
	142	#ifdef HAS_QUAD
	143	{
	144	d++ = 0xff; / Can't match U+FFFE! */
	145	d++ = 0x80; / 6 Reserved bits */
	146	d++ = (((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	147	*d++ = (((uv >> 54) & 0x3f) \| 0x80);
	148	*d++ = (((uv >> 48) & 0x3f) \| 0x80);
	149	*d++ = (((uv >> 42) & 0x3f) \| 0x80);
	150	*d++ = (((uv >> 36) & 0x3f) \| 0x80);
	151	*d++ = (((uv >> 30) & 0x3f) \| 0x80);
	152	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
	153	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	154	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	155	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	156	*d++ = (( uv & 0x3f) \| 0x80);
	157	return d;
	158	}
	159	#endif
	160	#endif /* Loop style */
	161	}
	162
	163	U8 *
	164	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
	165	{
	166	return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
	167	}
	168
	169
	170	/*
	171	=for apidoc A\|STRLEN\|is_utf8_char\|U8 *s
	172
	173	Tests if some arbitrary number of bytes begins in a valid UTF-8
	174	character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
	175	The actual number of bytes in the UTF-8 character will be returned if
	176	it is valid, otherwise 0.
	177
	178	=cut
	179	*/
	180	STRLEN
	181	Perl_is_utf8_char(pTHX_ U8 *s)
	182	{
	183	U8 u = *s;
	184	STRLEN slen, len;
	185	UV uv, ouv;
	186
	187	if (UTF8_IS_INVARIANT(u))
	188	return 1;
	189
	190	if (!UTF8_IS_START(u))
	191	return 0;
	192
	193	len = UTF8SKIP(s);
	194
	195	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	196	return 0;
	197
	198	slen = len - 1;
	199	s++;
	200	u &= UTF_START_MASK(len);
	201	uv = u;
	202	ouv = uv;
	203	while (slen--) {
	204	if (!UTF8_IS_CONTINUATION(*s))
	205	return 0;
	206	uv = UTF8_ACCUMULATE(uv, *s);
	207	if (uv < ouv)
	208	return 0;
	209	ouv = uv;
	210	s++;
	211	}
	212
	213	if (UNISKIP(uv) < len)
	214	return 0;
	215
	216	return len;
	217	}
	218
	219	/*
	220	=for apidoc A\|bool\|is_utf8_string\|U8 *s\|STRLEN len
	221
	222	Returns true if first C<len> bytes of the given string form a valid UTF8
	223	string, false otherwise. Note that 'a valid UTF8 string' does not mean
	224	'a string that contains UTF8' because a valid ASCII string is a valid
	225	UTF8 string.
	226
	227	=cut
	228	*/
	229
	230	bool
	231	Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
	232	{
	233	U8* x = s;
	234	U8* send;
	235	STRLEN c;
	236
	237	if (!len)
	238	len = strlen((char *)s);
	239	send = s + len;
	240
	241	while (x < send) {
	242	c = is_utf8_char(x);
	243	if (!c)
	244	return FALSE;
	245	x += c;
	246	}
	247	if (x != send)
	248	return FALSE;
	249
	250	return TRUE;
	251	}
	252
	253	/*
	254	=for apidoc A\|UV\|utf8n_to_uvuni\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
	255
	256	Bottom level UTF-8 decode routine.
	257	Returns the unicode code point value of the first character in the string C<s>
	258	which is assumed to be in UTF8 encoding and no longer than C<curlen>;
	259	C<retlen> will be set to the length, in bytes, of that character.
	260
	261	If C<s> does not point to a well-formed UTF8 character, the behaviour
	262	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
	263	it is assumed that the caller will raise a warning, and this function
	264	will silently just set C<retlen> to C<-1> and return zero. If the
	265	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
	266	malformations will be given, C<retlen> will be set to the expected
	267	length of the UTF-8 character in bytes, and zero will be returned.
	268
	269	The C<flags> can also contain various flags to allow deviations from
	270	the strict UTF-8 encoding (see F<utf8.h>).
	271
	272	Most code should use utf8_to_uvchr() rather than call this directly.
	273
	274	=cut
	275	*/
	276
	277	UV
	278	Perl_utf8n_to_uvuni(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	279	{
	280	U8 *s0 = s;
	281	UV uv = *s, ouv = 0;
	282	STRLEN len = 1;
	283	bool dowarn = ckWARN_d(WARN_UTF8);
	284	UV startbyte = *s;
	285	STRLEN expectlen = 0;
	286	U32 warning = 0;
	287
	288	/* This list is a superset of the UTF8_ALLOW_XXX. */
	289
	290	#define UTF8_WARN_EMPTY 1
	291	#define UTF8_WARN_CONTINUATION 2
	292	#define UTF8_WARN_NON_CONTINUATION 3
	293	#define UTF8_WARN_FE_FF 4
	294	#define UTF8_WARN_SHORT 5
	295	#define UTF8_WARN_OVERFLOW 6
	296	#define UTF8_WARN_SURROGATE 7
	297	#define UTF8_WARN_BOM 8
	298	#define UTF8_WARN_LONG 9
	299	#define UTF8_WARN_FFFF 10
	300
	301	if (curlen == 0 &&
	302	!(flags & UTF8_ALLOW_EMPTY)) {
	303	warning = UTF8_WARN_EMPTY;
	304	goto malformed;
	305	}
	306
	307	if (UTF8_IS_INVARIANT(uv)) {
	308	if (retlen)
	309	*retlen = 1;
	310	return (UV) (NATIVE_TO_UTF(*s));
	311	}
	312
	313	if (UTF8_IS_CONTINUATION(uv) &&
	314	!(flags & UTF8_ALLOW_CONTINUATION)) {
	315	warning = UTF8_WARN_CONTINUATION;
	316	goto malformed;
	317	}
	318
	319	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	320	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	321	warning = UTF8_WARN_NON_CONTINUATION;
	322	goto malformed;
	323	}
	324
	325	#ifdef EBCDIC
	326	uv = NATIVE_TO_UTF(uv);
	327	#else
	328	if ((uv == 0xfe \|\| uv == 0xff) &&
	329	!(flags & UTF8_ALLOW_FE_FF)) {
	330	warning = UTF8_WARN_FE_FF;
	331	goto malformed;
	332	}
	333	#endif
	334
	335	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	336	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	337	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	338	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	339	#ifdef EBCDIC
	340	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	341	else { len = 7; uv &= 0x01; }
	342	#else
	343	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	344	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	345	else { len = 13; uv = 0; } /* whoa! */
	346	#endif
	347
	348	if (retlen)
	349	*retlen = len;
	350
	351	expectlen = len;
	352
	353	if ((curlen < expectlen) &&
	354	!(flags & UTF8_ALLOW_SHORT)) {
	355	warning = UTF8_WARN_SHORT;
	356	goto malformed;
	357	}
	358
	359	len--;
	360	s++;
	361	ouv = uv;
	362
	363	while (len--) {
	364	if (!UTF8_IS_CONTINUATION(*s) &&
	365	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	366	s--;
	367	warning = UTF8_WARN_NON_CONTINUATION;
	368	goto malformed;
	369	}
	370	else
	371	uv = UTF8_ACCUMULATE(uv, *s);
	372	if (!(uv > ouv)) {
	373	/* These cannot be allowed. */
	374	if (uv == ouv) {
	375	if (!(flags & UTF8_ALLOW_LONG)) {
	376	warning = UTF8_WARN_LONG;
	377	goto malformed;
	378	}
	379	}
	380	else { /* uv < ouv */
	381	/* This cannot be allowed. */
	382	warning = UTF8_WARN_OVERFLOW;
	383	goto malformed;
	384	}
	385	}
	386	s++;
	387	ouv = uv;
	388	}
	389
	390	if (UNICODE_IS_SURROGATE(uv) &&
	391	!(flags & UTF8_ALLOW_SURROGATE)) {
	392	warning = UTF8_WARN_SURROGATE;
	393	goto malformed;
	394	} else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
	395	!(flags & UTF8_ALLOW_BOM)) {
	396	warning = UTF8_WARN_BOM;
	397	goto malformed;
	398	} else if ((expectlen > UNISKIP(uv)) &&
	399	!(flags & UTF8_ALLOW_LONG)) {
	400	warning = UTF8_WARN_LONG;
	401	goto malformed;
	402	} else if (UNICODE_IS_ILLEGAL(uv) &&
	403	!(flags & UTF8_ALLOW_FFFF)) {
	404	warning = UTF8_WARN_FFFF;
	405	goto malformed;
	406	}
	407
	408	return uv;
	409
	410	malformed:
	411
	412	if (flags & UTF8_CHECK_ONLY) {
	413	if (retlen)
	414	*retlen = -1;
	415	return 0;
	416	}
	417
	418	if (dowarn) {
	419	SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
	420
	421	switch (warning) {
	422	case 0: /* Intentionally empty. */ break;
	423	case UTF8_WARN_EMPTY:
	424	Perl_sv_catpvf(aTHX_ sv, "(empty string)");
	425	break;
	426	case UTF8_WARN_CONTINUATION:
	427	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	428	break;
	429	case UTF8_WARN_NON_CONTINUATION:
	430	if (s == s0)
	431	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	432	(UV)s[1], startbyte);
	433	else
	434	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	435	(UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
	436
	437	break;
	438	case UTF8_WARN_FE_FF:
	439	Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
	440	break;
	441	case UTF8_WARN_SHORT:
	442	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	443	curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
	444	expectlen = curlen; /* distance for caller to skip */
	445	break;
	446	case UTF8_WARN_OVERFLOW:
	447	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	448	ouv, *s, startbyte);
	449	break;
	450	case UTF8_WARN_SURROGATE:
	451	Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
	452	break;
	453	case UTF8_WARN_BOM:
	454	Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
	455	break;
	456	case UTF8_WARN_LONG:
	457	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	458	expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	459	break;
	460	case UTF8_WARN_FFFF:
	461	Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
	462	break;
	463	default:
	464	Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
	465	break;
	466	}
	467
	468	if (warning) {
	469	char *s = SvPVX(sv);
	470
	471	if (PL_op)
	472	Perl_warner(aTHX_ WARN_UTF8,
	473	"%s in %s", s, OP_DESC(PL_op));
	474	else
	475	Perl_warner(aTHX_ WARN_UTF8, "%s", s);
	476	}
	477	}
	478
	479	if (retlen)
	480	*retlen = expectlen ? expectlen : len;
	481
	482	return 0;
	483	}
	484
	485	/*
	486	=for apidoc A\|UV\|utf8_to_uvchr\|U8 s\|STRLEN retlen
	487
	488	Returns the native character value of the first character in the string C<s>
	489	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
	490	length, in bytes, of that character.
	491
	492	If C<s> does not point to a well-formed UTF8 character, zero is
	493	returned and retlen is set, if possible, to -1.
	494
	495	=cut
	496	*/
	497
	498	UV
	499	Perl_utf8_to_uvchr(pTHX_ U8 s, STRLEN retlen)
	500	{
	501	return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
	502	}
	503
	504	/*
	505	=for apidoc A\|UV\|utf8_to_uvuni\|U8 s\|STRLEN retlen
	506
	507	Returns the Unicode code point of the first character in the string C<s>
	508	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
	509	length, in bytes, of that character.
	510
	511	This function should only be used when returned UV is considered
	512	an index into the Unicode semantic tables (e.g. swashes).
	513
	514	If C<s> does not point to a well-formed UTF8 character, zero is
	515	returned and retlen is set, if possible, to -1.
	516
	517	=cut
	518	*/
	519
	520	UV
	521	Perl_utf8_to_uvuni(pTHX_ U8 s, STRLEN retlen)
	522	{
	523	/* Call the low level routine asking for checks */
	524	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
	525	}
	526
	527	/*
	528	=for apidoc A\|STRLEN\|utf8_length\|U8 s\|U8 e
	529
	530	Return the length of the UTF-8 char encoded string C<s> in characters.
	531	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	532	up past C<e>, croaks.
	533
	534	=cut
	535	*/
	536
	537	STRLEN
	538	Perl_utf8_length(pTHX_ U8 s, U8 e)
	539	{
	540	STRLEN len = 0;
	541
	542	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	543	* the bitops (especially ~) can create illegal UTF-8.
	544	* In other words: in Perl UTF-8 is not just for Unicode. */
	545
	546	if (e < s)
	547	Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
	548	while (s < e) {
	549	U8 t = UTF8SKIP(s);
	550
	551	if (e - s < t)
	552	Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
	553	s += t;
	554	len++;
	555	}
	556
	557	return len;
	558	}
	559
	560	/*
	561	=for apidoc A\|IV\|utf8_distance\|U8 a\|U8 b
	562
	563	Returns the number of UTF8 characters between the UTF-8 pointers C<a>
	564	and C<b>.
	565
	566	WARNING: use only if you know that the pointers point inside the
	567	same UTF-8 buffer.
	568
	569	=cut
	570	*/
	571
	572	IV
	573	Perl_utf8_distance(pTHX_ U8 a, U8 b)
	574	{
	575	IV off = 0;
	576
	577	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	578	* the bitops (especially ~) can create illegal UTF-8.
	579	* In other words: in Perl UTF-8 is not just for Unicode. */
	580
	581	if (a < b) {
	582	while (a < b) {
	583	U8 c = UTF8SKIP(a);
	584
	585	if (b - a < c)
	586	Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
	587	a += c;
	588	off--;
	589	}
	590	}
	591	else {
	592	while (b < a) {
	593	U8 c = UTF8SKIP(b);
	594
	595	if (a - b < c)
	596	Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
	597	b += c;
	598	off++;
	599	}
	600	}
	601
	602	return off;
	603	}
	604
	605	/*
	606	=for apidoc A\|U8 \|utf8_hop\|U8 s\|I32 off
	607
	608	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	609	forward or backward.
	610
	611	WARNING: do not use the following unless you know C<off> is within
	612	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	613	on the first byte of character or just after the last byte of a character.
	614
	615	=cut
	616	*/
	617
	618	U8 *
	619	Perl_utf8_hop(pTHX_ U8 *s, I32 off)
	620	{
	621	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	622	* the bitops (especially ~) can create illegal UTF-8.
	623	* In other words: in Perl UTF-8 is not just for Unicode. */
	624
	625	if (off >= 0) {
	626	while (off--)
	627	s += UTF8SKIP(s);
	628	}
	629	else {
	630	while (off++) {
	631	s--;
	632	while (UTF8_IS_CONTINUATION(*s))
	633	s--;
	634	}
	635	}
	636	return s;
	637	}
	638
	639	/*
	640	=for apidoc A\|U8 \|utf8_to_bytes\|U8 s\|STRLEN *len
	641
	642	Converts a string C<s> of length C<len> from UTF8 into byte encoding.
	643	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	644	updates len to contain the new length.
	645	Returns zero on failure, setting C<len> to -1.
	646
	647	=cut
	648	*/
	649
	650	U8 *
	651	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	652	{
	653	U8 *send;
	654	U8 *d;
	655	U8 *save = s;
	656
	657	/* ensure valid UTF8 and chars < 256 before updating string */
	658	for (send = s + *len; s < send; ) {
	659	U8 c = *s++;
	660
	661	if (!UTF8_IS_INVARIANT(c) &&
	662	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	663	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	664	*len = -1;
	665	return 0;
	666	}
	667	}
	668
	669	d = s = save;
	670	while (s < send) {
	671	STRLEN ulen;
	672	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	673	s += ulen;
	674	}
	675	*d = '\0';
	676	*len = d - save;
	677	return save;
	678	}
	679
	680	/*
	681	=for apidoc A\|U8 \|bytes_from_utf8\|U8 s\|STRLEN len\|bool is_utf8
	682
	683	Converts a string C<s> of length C<len> from UTF8 into byte encoding.
	684	Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	685	the newly-created string, and updates C<len> to contain the new
	686	length. Returns the original string if no conversion occurs, C<len>
	687	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	688	0 if C<s> is converted or contains all 7bit characters.
	689
	690	=cut
	691	*/
	692
	693	U8 *
	694	Perl_bytes_from_utf8(pTHX_ U8 s, STRLEN len, bool *is_utf8)
	695	{
	696	U8 *d;
	697	U8 *start = s;
	698	U8 *send;
	699	I32 count = 0;
	700
	701	if (!*is_utf8)
	702	return start;
	703
	704	/* ensure valid UTF8 and chars < 256 before converting string */
	705	for (send = s + *len; s < send;) {
	706	U8 c = *s++;
	707	if (!UTF8_IS_INVARIANT(c)) {
	708	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	709	(c = *s++) && UTF8_IS_CONTINUATION(c))
	710	count++;
	711	else
	712	return start;
	713	}
	714	}
	715
	716	*is_utf8 = 0;
	717
	718	Newz(801, d, (*len) - count + 1, U8);
	719	s = start; start = d;
	720	while (s < send) {
	721	U8 c = *s++;
	722	if (!UTF8_IS_INVARIANT(c)) {
	723	/* Then it is two-byte encoded */
	724	c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
	725	c = ASCII_TO_NATIVE(c);
	726	}
	727	*d++ = c;
	728	}
	729	*d = '\0';
	730	*len = d - start;
	731	return start;
	732	}
	733
	734	/*
	735	=for apidoc A\|U8 \|bytes_to_utf8\|U8 s\|STRLEN *len
	736
	737	Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
	738	Returns a pointer to the newly-created string, and sets C<len> to
	739	reflect the new length.
	740
	741	=cut
	742	*/
	743
	744	U8*
	745	Perl_bytes_to_utf8(pTHX_ U8 s, STRLEN len)
	746	{
	747	U8 *send;
	748	U8 *d;
	749	U8 *dst;
	750	send = s + (*len);
	751
	752	Newz(801, d, (len) 2 + 1, U8);
	753	dst = d;
	754
	755	while (s < send) {
	756	UV uv = NATIVE_TO_ASCII(*s++);
	757	if (UNI_IS_INVARIANT(uv))
	758	*d++ = UTF_TO_NATIVE(uv);
	759	else {
	760	*d++ = UTF8_EIGHT_BIT_HI(uv);
	761	*d++ = UTF8_EIGHT_BIT_LO(uv);
	762	}
	763	}
	764	*d = '\0';
	765	*len = d-dst;
	766	return dst;
	767	}
	768
	769	/*
	770	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	771	*
	772	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	773	* We optimize for native, for obvious reasons. */
	774
	775	U8*
	776	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	777	{
	778	U8* pend;
	779	U8* dstart = d;
	780
	781	if (bytelen & 1)
	782	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
	783
	784	pend = p + bytelen;
	785
	786	while (p < pend) {
	787	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	788	p += 2;
	789	if (uv < 0x80) {
	790	*d++ = uv;
	791	continue;
	792	}
	793	if (uv < 0x800) {
	794	*d++ = (( uv >> 6) \| 0xc0);
	795	*d++ = (( uv & 0x3f) \| 0x80);
	796	continue;
	797	}
	798	if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
	799	UV low = *p++;
	800	if (low < 0xdc00 \|\| low >= 0xdfff)
	801	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	802	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	803	}
	804	if (uv < 0x10000) {
	805	*d++ = (( uv >> 12) \| 0xe0);
	806	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	807	*d++ = (( uv & 0x3f) \| 0x80);
	808	continue;
	809	}
	810	else {
	811	*d++ = (( uv >> 18) \| 0xf0);
	812	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	813	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	814	*d++ = (( uv & 0x3f) \| 0x80);
	815	continue;
	816	}
	817	}
	818	*newlen = d - dstart;
	819	return d;
	820	}
	821
	822	/* Note: this one is slightly destructive of the source. */
	823
	824	U8*
	825	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	826	{
	827	U8* s = (U8*)p;
	828	U8* send = s + bytelen;
	829	while (s < send) {
	830	U8 tmp = s[0];
	831	s[0] = s[1];
	832	s[1] = tmp;
	833	s += 2;
	834	}
	835	return utf16_to_utf8(p, d, bytelen, newlen);
	836	}
	837
	838	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	839
	840	bool
	841	Perl_is_uni_alnum(pTHX_ UV c)
	842	{
	843	U8 tmpbuf[UTF8_MAXLEN+1];
	844	uvchr_to_utf8(tmpbuf, (UV)c);
	845	return is_utf8_alnum(tmpbuf);
	846	}
	847
	848	bool
	849	Perl_is_uni_alnumc(pTHX_ UV c)
	850	{
	851	U8 tmpbuf[UTF8_MAXLEN+1];
	852	uvchr_to_utf8(tmpbuf, (UV)c);
	853	return is_utf8_alnumc(tmpbuf);
	854	}
	855
	856	bool
	857	Perl_is_uni_idfirst(pTHX_ UV c)
	858	{
	859	U8 tmpbuf[UTF8_MAXLEN+1];
	860	uvchr_to_utf8(tmpbuf, (UV)c);
	861	return is_utf8_idfirst(tmpbuf);
	862	}
	863
	864	bool
	865	Perl_is_uni_alpha(pTHX_ UV c)
	866	{
	867	U8 tmpbuf[UTF8_MAXLEN+1];
	868	uvchr_to_utf8(tmpbuf, (UV)c);
	869	return is_utf8_alpha(tmpbuf);
	870	}
	871
	872	bool
	873	Perl_is_uni_ascii(pTHX_ UV c)
	874	{
	875	U8 tmpbuf[UTF8_MAXLEN+1];
	876	uvchr_to_utf8(tmpbuf, (UV)c);
	877	return is_utf8_ascii(tmpbuf);
	878	}
	879
	880	bool
	881	Perl_is_uni_space(pTHX_ UV c)
	882	{
	883	U8 tmpbuf[UTF8_MAXLEN+1];
	884	uvchr_to_utf8(tmpbuf, (UV)c);
	885	return is_utf8_space(tmpbuf);
	886	}
	887
	888	bool
	889	Perl_is_uni_digit(pTHX_ UV c)
	890	{
	891	U8 tmpbuf[UTF8_MAXLEN+1];
	892	uvchr_to_utf8(tmpbuf, (UV)c);
	893	return is_utf8_digit(tmpbuf);
	894	}
	895
	896	bool
	897	Perl_is_uni_upper(pTHX_ UV c)
	898	{
	899	U8 tmpbuf[UTF8_MAXLEN+1];
	900	uvchr_to_utf8(tmpbuf, (UV)c);
	901	return is_utf8_upper(tmpbuf);
	902	}
	903
	904	bool
	905	Perl_is_uni_lower(pTHX_ UV c)
	906	{
	907	U8 tmpbuf[UTF8_MAXLEN+1];
	908	uvchr_to_utf8(tmpbuf, (UV)c);
	909	return is_utf8_lower(tmpbuf);
	910	}
	911
	912	bool
	913	Perl_is_uni_cntrl(pTHX_ UV c)
	914	{
	915	U8 tmpbuf[UTF8_MAXLEN+1];
	916	uvchr_to_utf8(tmpbuf, (UV)c);
	917	return is_utf8_cntrl(tmpbuf);
	918	}
	919
	920	bool
	921	Perl_is_uni_graph(pTHX_ UV c)
	922	{
	923	U8 tmpbuf[UTF8_MAXLEN+1];
	924	uvchr_to_utf8(tmpbuf, (UV)c);
	925	return is_utf8_graph(tmpbuf);
	926	}
	927
	928	bool
	929	Perl_is_uni_print(pTHX_ UV c)
	930	{
	931	U8 tmpbuf[UTF8_MAXLEN+1];
	932	uvchr_to_utf8(tmpbuf, (UV)c);
	933	return is_utf8_print(tmpbuf);
	934	}
	935
	936	bool
	937	Perl_is_uni_punct(pTHX_ UV c)
	938	{
	939	U8 tmpbuf[UTF8_MAXLEN+1];
	940	uvchr_to_utf8(tmpbuf, (UV)c);
	941	return is_utf8_punct(tmpbuf);
	942	}
	943
	944	bool
	945	Perl_is_uni_xdigit(pTHX_ UV c)
	946	{
	947	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	948	uvchr_to_utf8(tmpbuf, (UV)c);
	949	return is_utf8_xdigit(tmpbuf);
	950	}
	951
	952	UV
	953	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	954	{
	955	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	956	uvchr_to_utf8(tmpbuf, (UV)c);
	957	return to_utf8_upper(tmpbuf, p, lenp);
	958	}
	959
	960	UV
	961	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	962	{
	963	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	964	uvchr_to_utf8(tmpbuf, (UV)c);
	965	return to_utf8_title(tmpbuf, p, lenp);
	966	}
	967
	968	UV
	969	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	970	{
	971	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	972	uvchr_to_utf8(tmpbuf, (UV)c);
	973	return to_utf8_lower(tmpbuf, p, lenp);
	974	}
	975
	976	UV
	977	Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
	978	{
	979	U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
	980	uvchr_to_utf8(tmpbuf, (UV)c);
	981	return to_utf8_fold(tmpbuf, p, lenp);
	982	}
	983
	984	/* for now these all assume no locale info available for Unicode > 255 */
	985
	986	bool
	987	Perl_is_uni_alnum_lc(pTHX_ UV c)
	988	{
	989	return is_uni_alnum(c); /* XXX no locale support yet */
	990	}
	991
	992	bool
	993	Perl_is_uni_alnumc_lc(pTHX_ UV c)
	994	{
	995	return is_uni_alnumc(c); /* XXX no locale support yet */
	996	}
	997
	998	bool
	999	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1000	{
	1001	return is_uni_idfirst(c); /* XXX no locale support yet */
	1002	}
	1003
	1004	bool
	1005	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1006	{
	1007	return is_uni_alpha(c); /* XXX no locale support yet */
	1008	}
	1009
	1010	bool
	1011	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1012	{
	1013	return is_uni_ascii(c); /* XXX no locale support yet */
	1014	}
	1015
	1016	bool
	1017	Perl_is_uni_space_lc(pTHX_ UV c)
	1018	{
	1019	return is_uni_space(c); /* XXX no locale support yet */
	1020	}
	1021
	1022	bool
	1023	Perl_is_uni_digit_lc(pTHX_ UV c)
	1024	{
	1025	return is_uni_digit(c); /* XXX no locale support yet */
	1026	}
	1027
	1028	bool
	1029	Perl_is_uni_upper_lc(pTHX_ UV c)
	1030	{
	1031	return is_uni_upper(c); /* XXX no locale support yet */
	1032	}
	1033
	1034	bool
	1035	Perl_is_uni_lower_lc(pTHX_ UV c)
	1036	{
	1037	return is_uni_lower(c); /* XXX no locale support yet */
	1038	}
	1039
	1040	bool
	1041	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1042	{
	1043	return is_uni_cntrl(c); /* XXX no locale support yet */
	1044	}
	1045
	1046	bool
	1047	Perl_is_uni_graph_lc(pTHX_ UV c)
	1048	{
	1049	return is_uni_graph(c); /* XXX no locale support yet */
	1050	}
	1051
	1052	bool
	1053	Perl_is_uni_print_lc(pTHX_ UV c)
	1054	{
	1055	return is_uni_print(c); /* XXX no locale support yet */
	1056	}
	1057
	1058	bool
	1059	Perl_is_uni_punct_lc(pTHX_ UV c)
	1060	{
	1061	return is_uni_punct(c); /* XXX no locale support yet */
	1062	}
	1063
	1064	bool
	1065	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1066	{
	1067	return is_uni_xdigit(c); /* XXX no locale support yet */
	1068	}
	1069
	1070	U32
	1071	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1072	{
	1073	/* XXX returns only the first character -- do not use XXX */
	1074	/* XXX no locale support yet */
	1075	STRLEN len;
	1076	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	1077	return (U32)to_uni_upper(c, tmpbuf, &len);
	1078	}
	1079
	1080	U32
	1081	Perl_to_uni_title_lc(pTHX_ U32 c)
	1082	{
	1083	/* XXX returns only the first character XXX -- do not use XXX */
	1084	/* XXX no locale support yet */
	1085	STRLEN len;
	1086	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	1087	return (U32)to_uni_title(c, tmpbuf, &len);
	1088	}
	1089
	1090	U32
	1091	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1092	{
	1093	/* XXX returns only the first character -- do not use XXX */
	1094	/* XXX no locale support yet */
	1095	STRLEN len;
	1096	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	1097	return (U32)to_uni_lower(c, tmpbuf, &len);
	1098	}
	1099
	1100	bool
	1101	Perl_is_utf8_alnum(pTHX_ U8 *p)
	1102	{
	1103	if (!is_utf8_char(p))
	1104	return FALSE;
	1105	if (!PL_utf8_alnum)
	1106	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1107	* descendant of isalnum(3), in other words, it doesn't
	1108	* contain the '_'. --jhi */
	1109	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
	1110	return swash_fetch(PL_utf8_alnum, p, TRUE);
	1111	/* return p == '_' \|\| is_utf8_alpha(p) \|\| is_utf8_digit(p); /
	1112	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	1113	if (!PL_utf8_alnum)
	1114	PL_utf8_alnum = swash_init("utf8", "",
	1115	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	1116	return swash_fetch(PL_utf8_alnum, p, TRUE);
	1117	#endif
	1118	}
	1119
	1120	bool
	1121	Perl_is_utf8_alnumc(pTHX_ U8 *p)
	1122	{
	1123	if (!is_utf8_char(p))
	1124	return FALSE;
	1125	if (!PL_utf8_alnum)
	1126	PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
	1127	return swash_fetch(PL_utf8_alnum, p, TRUE);
	1128	/* return is_utf8_alpha(p) \|\| is_utf8_digit(p); */
	1129	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	1130	if (!PL_utf8_alnum)
	1131	PL_utf8_alnum = swash_init("utf8", "",
	1132	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	1133	return swash_fetch(PL_utf8_alnum, p, TRUE);
	1134	#endif
	1135	}
	1136
	1137	bool
	1138	Perl_is_utf8_idfirst(pTHX_ U8 *p)
	1139	{
	1140	return *p == '_' \|\| is_utf8_alpha(p);
	1141	}
	1142
	1143	bool
	1144	Perl_is_utf8_alpha(pTHX_ U8 *p)
	1145	{
	1146	if (!is_utf8_char(p))
	1147	return FALSE;
	1148	if (!PL_utf8_alpha)
	1149	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
	1150	return swash_fetch(PL_utf8_alpha, p, TRUE);
	1151	}
	1152
	1153	bool
	1154	Perl_is_utf8_ascii(pTHX_ U8 *p)
	1155	{
	1156	if (!is_utf8_char(p))
	1157	return FALSE;
	1158	if (!PL_utf8_ascii)
	1159	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
	1160	return swash_fetch(PL_utf8_ascii, p, TRUE);
	1161	}
	1162
	1163	bool
	1164	Perl_is_utf8_space(pTHX_ U8 *p)
	1165	{
	1166	if (!is_utf8_char(p))
	1167	return FALSE;
	1168	if (!PL_utf8_space)
	1169	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
	1170	return swash_fetch(PL_utf8_space, p, TRUE);
	1171	}
	1172
	1173	bool
	1174	Perl_is_utf8_digit(pTHX_ U8 *p)
	1175	{
	1176	if (!is_utf8_char(p))
	1177	return FALSE;
	1178	if (!PL_utf8_digit)
	1179	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
	1180	return swash_fetch(PL_utf8_digit, p, TRUE);
	1181	}
	1182
	1183	bool
	1184	Perl_is_utf8_upper(pTHX_ U8 *p)
	1185	{
	1186	if (!is_utf8_char(p))
	1187	return FALSE;
	1188	if (!PL_utf8_upper)
	1189	PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
	1190	return swash_fetch(PL_utf8_upper, p, TRUE);
	1191	}
	1192
	1193	bool
	1194	Perl_is_utf8_lower(pTHX_ U8 *p)
	1195	{
	1196	if (!is_utf8_char(p))
	1197	return FALSE;
	1198	if (!PL_utf8_lower)
	1199	PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
	1200	return swash_fetch(PL_utf8_lower, p, TRUE);
	1201	}
	1202
	1203	bool
	1204	Perl_is_utf8_cntrl(pTHX_ U8 *p)
	1205	{
	1206	if (!is_utf8_char(p))
	1207	return FALSE;
	1208	if (!PL_utf8_cntrl)
	1209	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
	1210	return swash_fetch(PL_utf8_cntrl, p, TRUE);
	1211	}
	1212
	1213	bool
	1214	Perl_is_utf8_graph(pTHX_ U8 *p)
	1215	{
	1216	if (!is_utf8_char(p))
	1217	return FALSE;
	1218	if (!PL_utf8_graph)
	1219	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
	1220	return swash_fetch(PL_utf8_graph, p, TRUE);
	1221	}
	1222
	1223	bool
	1224	Perl_is_utf8_print(pTHX_ U8 *p)
	1225	{
	1226	if (!is_utf8_char(p))
	1227	return FALSE;
	1228	if (!PL_utf8_print)
	1229	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
	1230	return swash_fetch(PL_utf8_print, p, TRUE);
	1231	}
	1232
	1233	bool
	1234	Perl_is_utf8_punct(pTHX_ U8 *p)
	1235	{
	1236	if (!is_utf8_char(p))
	1237	return FALSE;
	1238	if (!PL_utf8_punct)
	1239	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
	1240	return swash_fetch(PL_utf8_punct, p, TRUE);
	1241	}
	1242
	1243	bool
	1244	Perl_is_utf8_xdigit(pTHX_ U8 *p)
	1245	{
	1246	if (!is_utf8_char(p))
	1247	return FALSE;
	1248	if (!PL_utf8_xdigit)
	1249	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
	1250	return swash_fetch(PL_utf8_xdigit, p, TRUE);
	1251	}
	1252
	1253	bool
	1254	Perl_is_utf8_mark(pTHX_ U8 *p)
	1255	{
	1256	if (!is_utf8_char(p))
	1257	return FALSE;
	1258	if (!PL_utf8_mark)
	1259	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
	1260	return swash_fetch(PL_utf8_mark, p, TRUE);
	1261	}
	1262
	1263	/*
	1264	=for apidoc A\|UV\|to_utf8_case\|U8 p\|U8 ustrp\|STRLEN lenp\|SV swash\|char normal\|char *special
	1265
	1266	The "p" contains the pointer to the UTF-8 string encoding
	1267	the character that is being converted.
	1268
	1269	The "ustrp" is a pointer to the character buffer to put the
	1270	conversion result to. The "lenp" is a pointer to the length
	1271	of the result.
	1272
	1273	The "swash" is a pointer to the swash to use.
	1274
	1275	The "normal" is a string like "ToLower" which means the swash
	1276	$utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
	1277	and loaded by SWASHGET, using lib/utf8_heavy.pl.
	1278
	1279	The "special" is a string like "utf8::ToSpecLower", which means
	1280	the hash %utf8::ToSpecLower, which is stored in the same file,
	1281	lib/unicore/To/Lower.pl, and also loaded by SWASHGET. The access
	1282	to the hash is by Perl_to_utf8_case().
	1283
	1284	=cut
	1285	*/
	1286
	1287	UV
	1288	Perl_to_utf8_case(pTHX_ U8 p, U8 ustrp, STRLEN lenp, SV swashp, char normal, char *special)
	1289	{
	1290	UV uv;
	1291
	1292	if (!*swashp)
	1293	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	1294	uv = swash_fetch(*swashp, p, TRUE);
	1295	if (!uv) {
	1296	HV *hv;
	1297	SV *keysv;
	1298	HE *he;
	1299
	1300	uv = utf8_to_uvchr(p, 0);
	1301
	1302	if ((hv = get_hv(special, FALSE)) &&
	1303	(keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
	1304	(he = hv_fetch_ent(hv, keysv, FALSE, 0))) {
	1305	SV *val = HeVAL(he);
	1306	STRLEN len;
	1307	char *s = SvPV(val, len);
	1308
	1309	if (len > 1) {
	1310	Copy(s, ustrp, len, U8);
	1311	#ifdef EBCDIC
	1312	{
	1313	/* If we have EBCDIC we need to remap the
	1314	* characters coming in from the "special"
	1315	* (usually, but not always multicharacter)
	1316	* mapping, since any characters in the low 256
	1317	* are in Unicode code points, not EBCDIC.
	1318	* --jhi */
	1319
	1320	U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
	1321	U8 *d = tmpbuf;
	1322	U8 t, tend;
	1323
	1324	if (SvUTF8(val)) {
	1325	STRLEN tlen = 0;
	1326
	1327	for (t = ustrp, tend = t + len;
	1328	t < tend; t += tlen) {
	1329	UV c = utf8_to_uvchr(t, &tlen);
	1330
	1331	if (tlen > 0)
	1332	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	1333	else
	1334	break;
	1335	}
	1336	} else {
	1337	for (t = ustrp, tend = t + len;
	1338	t < tend; t++)
	1339	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	1340	}
	1341	len = d - tmpbuf;
	1342	Copy(tmpbuf, ustrp, len, U8);
	1343	}
	1344	#endif
	1345	}
	1346	else {
	1347	UV c = UNI_TO_NATIVE((U8)s);
	1348	U8 *d = uvchr_to_utf8(ustrp, c);
	1349
	1350	len = d - ustrp;
	1351	}
	1352	if (lenp)
	1353	*lenp = len;
	1354	return utf8_to_uvchr(ustrp, 0);
	1355	}
	1356	uv = NATIVE_TO_UNI(uv);
	1357	}
	1358	if (lenp)
	1359	*lenp = UNISKIP(uv);
	1360	uvuni_to_utf8(ustrp, uv);
	1361	return uv;
	1362	}
	1363
	1364	/*
	1365	=for apidoc A\|UV\|to_utf8_upper\|U8 p\|U8 ustrp\|STRLEN *lenp
	1366
	1367	Convert the UTF-8 encoded character at p to its uppercase version and
	1368	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1369	that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
	1370	uppercase version may be longer than the original character (up to two
	1371	characters).
	1372
	1373	The first character of the uppercased version is returned
	1374	(but note, as explained above, that there may be more.)
	1375
	1376	=cut */
	1377
	1378	UV
	1379	Perl_to_utf8_upper(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1380	{
	1381	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1382	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
	1383	}
	1384
	1385	/*
	1386	=for apidoc A\|UV\|to_utf8_title\|U8 p\|U8 ustrp\|STRLEN *lenp
	1387
	1388	Convert the UTF-8 encoded character at p to its titlecase version and
	1389	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1390	that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
	1391	titlecase version may be longer than the original character (up to two
	1392	characters).
	1393
	1394	The first character of the titlecased version is returned
	1395	(but note, as explained above, that there may be more.)
	1396
	1397	=cut */
	1398
	1399	UV
	1400	Perl_to_utf8_title(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1401	{
	1402	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1403	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
	1404	}
	1405
	1406	/*
	1407	=for apidoc A\|UV\|to_utf8_lower\|U8 p\|U8 ustrp\|STRLEN *lenp
	1408
	1409	Convert the UTF-8 encoded character at p to its lowercase version and
	1410	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1411	that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
	1412	lowercase version may be longer than the original character (up to two
	1413	characters).
	1414
	1415	The first character of the lowercased version is returned
	1416	(but note, as explained above, that there may be more.)
	1417
	1418	=cut */
	1419
	1420	UV
	1421	Perl_to_utf8_lower(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1422	{
	1423	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1424	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
	1425	}
	1426
	1427	/*
	1428	=for apidoc A\|UV\|to_utf8_fold\|U8 p\|U8 ustrp\|STRLEN *lenp
	1429
	1430	Convert the UTF-8 encoded character at p to its foldcase version and
	1431	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1432	that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
	1433	foldcase version may be longer than the original character (up to
	1434	three characters).
	1435
	1436	The first character of the foldcased version is returned
	1437	(but note, as explained above, that there may be more.)
	1438
	1439	=cut */
	1440
	1441	UV
	1442	Perl_to_utf8_fold(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1443	{
	1444	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1445	&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
	1446	}
	1447
	1448	/* a "swash" is a swatch hash */
	1449
	1450	SV*
	1451	Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
	1452	{
	1453	SV* retval;
	1454	SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
	1455	dSP;
	1456	HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
	1457	SV* errsv_save;
	1458
	1459	if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
	1460	ENTER;
	1461	errsv_save = newSVsv(ERRSV);
	1462	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
	1463	if (!SvTRUE(ERRSV))
	1464	sv_setsv(ERRSV, errsv_save);
	1465	SvREFCNT_dec(errsv_save);
	1466	LEAVE;
	1467	}
	1468	SPAGAIN;
	1469	PUSHSTACKi(PERLSI_MAGIC);
	1470	PUSHMARK(SP);
	1471	EXTEND(SP,5);
	1472	PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
	1473	PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
	1474	PUSHs(listsv);
	1475	PUSHs(sv_2mortal(newSViv(minbits)));
	1476	PUSHs(sv_2mortal(newSViv(none)));
	1477	PUTBACK;
	1478	ENTER;
	1479	SAVEI32(PL_hints);
	1480	PL_hints = 0;
	1481	save_re_context();
	1482	if (PL_curcop == &PL_compiling)
	1483	/* XXX ought to be handled by lex_start */
	1484	sv_setpv(tokenbufsv, PL_tokenbuf);
	1485	errsv_save = newSVsv(ERRSV);
	1486	if (call_method("SWASHNEW", G_SCALAR))
	1487	retval = newSVsv(*PL_stack_sp--);
	1488	else
	1489	retval = &PL_sv_undef;
	1490	if (!SvTRUE(ERRSV))
	1491	sv_setsv(ERRSV, errsv_save);
	1492	SvREFCNT_dec(errsv_save);
	1493	LEAVE;
	1494	POPSTACK;
	1495	if (PL_curcop == &PL_compiling) {
	1496	STRLEN len;
	1497	char* pv = SvPV(tokenbufsv, len);
	1498
	1499	Copy(pv, PL_tokenbuf, len+1, char);
	1500	PL_curcop->op_private = PL_hints;
	1501	}
	1502	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV)
	1503	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	1504	return retval;
	1505	}
	1506
	1507
	1508	/* This API is wrong for special case conversions since we may need to
	1509	* return several Unicode characters for a single Unicode character
	1510	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	1511	* the lower-level routine, and it is similarly broken for returning
	1512	* multiple values. --jhi */
	1513	UV
	1514	Perl_swash_fetch(pTHX_ SV sv, U8 ptr, bool do_utf8)
	1515	{
	1516	HV* hv = (HV*)SvRV(sv);
	1517	U32 klen;
	1518	U32 off;
	1519	STRLEN slen;
	1520	STRLEN needents;
	1521	U8 *tmps = NULL;
	1522	U32 bit;
	1523	SV *retval;
	1524	U8 tmputf8[2];
	1525	UV c = NATIVE_TO_ASCII(*ptr);
	1526
	1527	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	1528	tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
	1529	tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
	1530	ptr = tmputf8;
	1531	}
	1532	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	1533	* then the "swatch" is a vec() for al the chars which start
	1534	* with 0xAA..0xYY
	1535	* So the key in the hash (klen) is length of encoded char -1
	1536	*/
	1537	klen = UTF8SKIP(ptr) - 1;
	1538	off = ptr[klen];
	1539
	1540	if (klen == 0)
	1541	{
	1542	/* If char in invariant then swatch is for all the invariant chars
	1543	* In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
	1544	*/
	1545	needents = UTF_CONTINUATION_MARK;
	1546	off = NATIVE_TO_UTF(ptr[klen]);
	1547	}
	1548	else
	1549	{
	1550	/* If char is encoded then swatch is for the prefix */
	1551	needents = (1 << UTF_ACCUMULATION_SHIFT);
	1552	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	1553	}
	1554
	1555	/*
	1556	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	1557	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	1558	* it's nothing to sniff at.) Pity we usually come through at least
	1559	* two function calls to get here...
	1560	*
	1561	* NB: this code assumes that swatches are never modified, once generated!
	1562	*/
	1563
	1564	if (hv == PL_last_swash_hv &&
	1565	klen == PL_last_swash_klen &&
	1566	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	1567	{
	1568	tmps = PL_last_swash_tmps;
	1569	slen = PL_last_swash_slen;
	1570	}
	1571	else {
	1572	/* Try our second-level swatch cache, kept in a hash. */
	1573	SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
	1574
	1575	/* If not cached, generate it via utf8::SWASHGET */
	1576	if (!svp \|\| !SvPOK(svp) \|\| !(tmps = (U8)SvPV(*svp, slen))) {
	1577	dSP;
	1578	/* We use utf8n_to_uvuni() as we want an index into
	1579	Unicode tables, not a native character number.
	1580	*/
	1581	UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
	1582	SV *errsv_save;
	1583	ENTER;
	1584	SAVETMPS;
	1585	save_re_context();
	1586	PUSHSTACKi(PERLSI_MAGIC);
	1587	PUSHMARK(SP);
	1588	EXTEND(SP,3);
	1589	PUSHs((SV*)sv);
	1590	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	1591	PUSHs(sv_2mortal(newSViv((klen) ?
	1592	(code_point & ~(needents - 1)) : 0)));
	1593	PUSHs(sv_2mortal(newSViv(needents)));
	1594	PUTBACK;
	1595	errsv_save = newSVsv(ERRSV);
	1596	if (call_method("SWASHGET", G_SCALAR))
	1597	retval = newSVsv(*PL_stack_sp--);
	1598	else
	1599	retval = &PL_sv_undef;
	1600	if (!SvTRUE(ERRSV))
	1601	sv_setsv(ERRSV, errsv_save);
	1602	SvREFCNT_dec(errsv_save);
	1603	POPSTACK;
	1604	FREETMPS;
	1605	LEAVE;
	1606	if (PL_curcop == &PL_compiling)
	1607	PL_curcop->op_private = PL_hints;
	1608
	1609	svp = hv_store(hv, (char*)ptr, klen, retval, 0);
	1610
	1611	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen)) \|\| (slen << 3) < needents)
	1612	Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
	1613	}
	1614
	1615	PL_last_swash_hv = hv;
	1616	PL_last_swash_klen = klen;
	1617	PL_last_swash_tmps = tmps;
	1618	PL_last_swash_slen = slen;
	1619	if (klen)
	1620	Copy(ptr, PL_last_swash_key, klen, U8);
	1621	}
	1622
	1623	switch ((int)((slen << 3) / needents)) {
	1624	case 1:
	1625	bit = 1 << (off & 7);
	1626	off >>= 3;
	1627	return (tmps[off] & bit) != 0;
	1628	case 8:
	1629	return tmps[off];
	1630	case 16:
	1631	off <<= 1;
	1632	return (tmps[off] << 8) + tmps[off + 1] ;
	1633	case 32:
	1634	off <<= 2;
	1635	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	1636	}
	1637	Perl_croak(aTHX_ "panic: swash_fetch");
	1638	return 0;
	1639	}
	1640
	1641
	1642	/*
	1643	=for apidoc A\|U8 \|uvchr_to_utf8\|U8 d\|UV uv
	1644
	1645	Adds the UTF8 representation of the Native codepoint C<uv> to the end
	1646	of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
	1647	bytes available. The return value is the pointer to the byte after the
	1648	end of the new character. In other words,
	1649
	1650	d = uvchr_to_utf8(d, uv);
	1651
	1652	is the recommended wide native character-aware way of saying
	1653
	1654	*(d++) = uv;
	1655
	1656	=cut
	1657	*/
	1658
	1659	/* On ASCII machines this is normally a macro but we want a
	1660	real function in case XS code wants it
	1661	*/
	1662	#undef Perl_uvchr_to_utf8
	1663	U8 *
	1664	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	1665	{
	1666	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
	1667	}
	1668
	1669	U8 *
	1670	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	1671	{
	1672	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
	1673	}
	1674
	1675	/*
	1676	=for apidoc A\|UV\|utf8n_to_uvchr\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
	1677
	1678	Returns the native character value of the first character in the string C<s>
	1679	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
	1680	length, in bytes, of that character.
	1681
	1682	Allows length and flags to be passed to low level routine.
	1683
	1684	=cut
	1685	*/
	1686	/* On ASCII machines this is normally a macro but we want
	1687	a real function in case XS code wants it
	1688	*/
	1689	#undef Perl_utf8n_to_uvchr
	1690	UV
	1691	Perl_utf8n_to_uvchr(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	1692	{
	1693	UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	1694	return UNI_TO_NATIVE(uv);
	1695	}
	1696
	1697	/*
	1698	=for apidoc A\|char \|pv_uni_display\|SV dsv\|U8 *spv\|STRLEN len\|STRLEN pvlim\|UV flags
	1699
	1700	Build to the scalar dsv a displayable version of the string spv,
	1701	length len, the displayable version being at most pvlim bytes long
	1702	(if longer, the rest is truncated and "..." will be appended).
	1703
	1704	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	1705	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	1706	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	1707	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	1708	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	1709	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	1710
	1711	The pointer to the PV of the dsv is returned.
	1712
	1713	=cut */
	1714	char *
	1715	Perl_pv_uni_display(pTHX_ SV dsv, U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	1716	{
	1717	int truncated = 0;
	1718	char s, e;
	1719
	1720	sv_setpvn(dsv, "", 0);
	1721	for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	1722	UV u;
	1723	bool ok = FALSE;
	1724
	1725	if (pvlim && SvCUR(dsv) >= pvlim) {
	1726	truncated++;
	1727	break;
	1728	}
	1729	u = utf8_to_uvchr((U8*)s, 0);
	1730	if (u < 256) {
	1731	if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
	1732	switch (u & 0xFF) {
	1733	case '\n':
	1734	Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
	1735	case '\r':
	1736	Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
	1737	case '\t':
	1738	Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
	1739	case '\f':
	1740	Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
	1741	case '\a':
	1742	Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
	1743	case '\\':
	1744	Perl_sv_catpvf(aTHX_ dsv, "\\" ); ok = TRUE; break;
	1745	default: break;
	1746	}
	1747	}
	1748	/* isPRINT() is the locale-blind version. */
	1749	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
	1750	Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
	1751	ok = TRUE;
	1752	}
	1753	}
	1754	if (!ok)
	1755	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	1756	}
	1757	if (truncated)
	1758	sv_catpvn(dsv, "...", 3);
	1759
	1760	return SvPVX(dsv);
	1761	}
	1762
	1763	/*
	1764	=for apidoc A\|char \|sv_uni_display\|SV dsv\|SV *ssv\|STRLEN pvlim\|UV flags
	1765
	1766	Build to the scalar dsv a displayable version of the scalar sv,
	1767	the displayable version being at most pvlim bytes long
	1768	(if longer, the rest is truncated and "..." will be appended).
	1769
	1770	The flags argument is as in pv_uni_display().
	1771
	1772	The pointer to the PV of the dsv is returned.
	1773
	1774	=cut */
	1775	char *
	1776	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	1777	{
	1778	return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
	1779	pvlim, flags);
	1780	}
	1781
	1782	/*
	1783	=for apidoc A\|I32\|ibcmp_utf8\|const char s1\|char pe1\|register UV l1\|bool u1\|const char s2\|char **pe2\|register UV l2\|bool u2
	1784
	1785	Return true if the strings s1 and s2 differ case-insensitively, false
	1786	if not (if they are equal case-insensitively). If u1 is true, the
	1787	string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
	1788	the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
	1789	are false, the respective string is assumed to be in native 8-bit
	1790	encoding.
	1791
	1792	If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
	1793	in there (they will point at the beginning of the I<next> character).
	1794	If the pointers behind pe1 or pe2 are non-NULL, they are the end
	1795	pointers beyond which scanning will not continue under any
	1796	circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
	1797	s2+l2 will be used as goal end pointers that will also stop the scan,
	1798	and which qualify towards defining a successful match: all the scans
	1799	that define an explicit length must reach their goal pointers for
	1800	a match to succeed).
	1801
	1802	For case-insensitiveness, the "casefolding" of Unicode is used
	1803	instead of upper/lowercasing both the characters, see
	1804	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	1805
	1806	=cut */
	1807	I32
	1808	Perl_ibcmp_utf8(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2)
	1809	{
	1810	register U8 p1 = (U8)s1;
	1811	register U8 p2 = (U8)s2;
	1812	register U8 e1 = 0, f1 = 0, *q1 = 0;
	1813	register U8 e2 = 0, f2 = 0, *q2 = 0;
	1814	STRLEN n1 = 0, n2 = 0;
	1815	U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
	1816	U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
	1817	U8 natbuf[1+1];
	1818	STRLEN foldlen1, foldlen2;
	1819	bool match;
	1820
	1821	if (pe1)
	1822	e1 = (U8*)pe1;
	1823	if (e1 == 0 \|\| (l1 && l1 < e1 - (U8*)s1))
	1824	f1 = (U8*)s1 + l1;
	1825	if (pe2)
	1826	e2 = (U8*)pe2;
	1827	if (e2 == 0 \|\| (l2 && l2 < e2 - (U8*)s2))
	1828	f2 = (U8*)s2 + l2;
	1829
	1830	if ((e1 == 0 && f1 == 0) \|\| (e2 == 0 && f2 == 0) \|\| (f1 == 0 && f2 == 0))
	1831	return 1; /* mismatch; possible infinite loop or false positive */
	1832
	1833	if (!u1 \|\| !u2)
	1834	natbuf[1] = 0; /* Need to terminate the buffer. */
	1835
	1836	while ((e1 == 0 \|\| p1 < e1) &&
	1837	(f1 == 0 \|\| p1 < f1) &&
	1838	(e2 == 0 \|\| p2 < e2) &&
	1839	(f2 == 0 \|\| p2 < f2)) {
	1840	if (n1 == 0) {
	1841	if (u1)
	1842	to_utf8_fold(p1, foldbuf1, &foldlen1);
	1843	else {
	1844	natbuf[0] = NATIVE_TO_UNI(*p1);
	1845	to_utf8_fold(natbuf, foldbuf1, &foldlen1);
	1846	}
	1847	q1 = foldbuf1;
	1848	n1 = foldlen1;
	1849	}
	1850	if (n2 == 0) {
	1851	if (u2)
	1852	to_utf8_fold(p2, foldbuf2, &foldlen2);
	1853	else {
	1854	natbuf[0] = NATIVE_TO_UNI(*p2);
	1855	to_utf8_fold(natbuf, foldbuf2, &foldlen2);
	1856	}
	1857	q2 = foldbuf2;
	1858	n2 = foldlen2;
	1859	}
	1860	while (n1 && n2) {
	1861	if ( UTF8SKIP(q1) != UTF8SKIP(q2) \|\|
	1862	(UTF8SKIP(q1) == 1 && q1 != q2) \|\|
	1863	memNE((char)q1, (char)q2, UTF8SKIP(q1)) )
	1864	return 1; /* mismatch */
	1865	n1 -= UTF8SKIP(q1);
	1866	q1 += UTF8SKIP(q1);
	1867	n2 -= UTF8SKIP(q2);
	1868	q2 += UTF8SKIP(q2);
	1869	}
	1870	if (n1 == 0)
	1871	p1 += u1 ? UTF8SKIP(p1) : 1;
	1872	if (n2 == 0)
	1873	p2 += u2 ? UTF8SKIP(p2) : 1;
	1874
	1875	}
	1876
	1877	/* A match is defined by all the scans that specified
	1878	* an explicit length reaching their final goals. */
	1879	match = (f1 == 0 \|\| p1 == f1) && (f2 == 0 \|\| p2 == f2);
	1880
	1881	if (match) {
	1882	if (pe1)
	1883	pe1 = (char)p1;
	1884	if (pe2)
	1885	pe2 = (char)p2;
	1886	}
	1887
	1888	return match ? 0 : 1; /* 0 match, 1 mismatch */
	1889	}
	1890