perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (c) 1998-2003, Larry Wall
	4	*
	5	* You may distribute under the terms of either the GNU General Public
	6	* License or the Artistic License, as specified in the README file.
	7	*
	8	*/
	9
	10	/*
	11	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	12	* heard of that we don't want to see any closer; and that's the one place
	13	* we're trying to get to! And that's just where we can't get, nohow.'
	14	*
	15	* 'Well do I understand your speech,' he answered in the same language;
	16	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	17	* as is the custom in the West, if you wish to be answered?'
	18	*
	19	* ...the travellers perceived that the floor was paved with stones of many
	20	* hues; branching runes and strange devices intertwined beneath their feet.
	21	*/
	22
	23	#include "EXTERN.h"
	24	#define PERL_IN_UTF8_C
	25	#include "perl.h"
	26
	27	static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
	28
	29	/*
	30	=head1 Unicode Support
	31
	32	=for apidoc A\|U8 \|uvuni_to_utf8_flags\|U8 d\|UV uv\|UV flags
	33
	34	Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
	35	of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
	36	bytes available. The return value is the pointer to the byte after the
	37	end of the new character. In other words,
	38
	39	d = uvuni_to_utf8_flags(d, uv, flags);
	40
	41	or, in most cases,
	42
	43	d = uvuni_to_utf8(d, uv);
	44
	45	(which is equivalent to)
	46
	47	d = uvuni_to_utf8_flags(d, uv, 0);
	48
	49	is the recommended Unicode-aware way of saying
	50
	51	*(d++) = uv;
	52
	53	=cut
	54	*/
	55
	56	U8 *
	57	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	58	{
	59	if (ckWARN(WARN_UTF8)) {
	60	if (UNICODE_IS_SURROGATE(uv) &&
	61	!(flags & UNICODE_ALLOW_SURROGATE))
	62	Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
	63	else if (
	64	((uv >= 0xFDD0 && uv <= 0xFDEF &&
	65	!(flags & UNICODE_ALLOW_FDD0))
	66	\|\|
	67	((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
	68	!(flags & UNICODE_ALLOW_FFFF))) &&
	69	/* UNICODE_ALLOW_SUPER includes
	70	* FFFEs and FFFFs beyond 0x10FFFF. */
	71	((uv <= PERL_UNICODE_MAX) \|\|
	72	!(flags & UNICODE_ALLOW_SUPER))
	73	)
	74	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	75	"Unicode character 0x%04"UVxf" is illegal", uv);
	76	}
	77	if (UNI_IS_INVARIANT(uv)) {
	78	*d++ = (U8)UTF_TO_NATIVE(uv);
	79	return d;
	80	}
	81	#if defined(EBCDIC)
	82	else {
	83	STRLEN len = UNISKIP(uv);
	84	U8 *p = d+len-1;
	85	while (p > d) {
	86	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	87	uv >>= UTF_ACCUMULATION_SHIFT;
	88	}
	89	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	90	return d+len;
	91	}
	92	#else /* Non loop style */
	93	if (uv < 0x800) {
	94	*d++ = (U8)(( uv >> 6) \| 0xc0);
	95	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	96	return d;
	97	}
	98	if (uv < 0x10000) {
	99	*d++ = (U8)(( uv >> 12) \| 0xe0);
	100	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	101	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	102	return d;
	103	}
	104	if (uv < 0x200000) {
	105	*d++ = (U8)(( uv >> 18) \| 0xf0);
	106	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	107	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	108	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	109	return d;
	110	}
	111	if (uv < 0x4000000) {
	112	*d++ = (U8)(( uv >> 24) \| 0xf8);
	113	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	114	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	115	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	116	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	117	return d;
	118	}
	119	if (uv < 0x80000000) {
	120	*d++ = (U8)(( uv >> 30) \| 0xfc);
	121	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	122	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	123	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	124	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	125	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	126	return d;
	127	}
	128	#ifdef HAS_QUAD
	129	if (uv < UTF8_QUAD_MAX)
	130	#endif
	131	{
	132	d++ = 0xfe; / Can't match U+FEFF! */
	133	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	134	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	135	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	136	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	137	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	138	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	139	return d;
	140	}
	141	#ifdef HAS_QUAD
	142	{
	143	d++ = 0xff; / Can't match U+FFFE! */
	144	d++ = 0x80; / 6 Reserved bits */
	145	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	146	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
	147	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
	148	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
	149	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
	150	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	151	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	152	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	153	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	154	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	155	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	156	return d;
	157	}
	158	#endif
	159	#endif /* Loop style */
	160	}
	161
	162	U8 *
	163	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
	164	{
	165	return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
	166	}
	167
	168
	169	/*
	170	=for apidoc A\|STRLEN\|is_utf8_char\|U8 *s
	171
	172	Tests if some arbitrary number of bytes begins in a valid UTF-8
	173	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	174	UTF-8 character. The actual number of bytes in the UTF-8 character
	175	will be returned if it is valid, otherwise 0.
	176
	177	=cut */
	178	STRLEN
	179	Perl_is_utf8_char(pTHX_ U8 *s)
	180	{
	181	U8 u = *s;
	182	STRLEN slen, len;
	183	UV uv, ouv;
	184
	185	if (UTF8_IS_INVARIANT(u))
	186	return 1;
	187
	188	if (!UTF8_IS_START(u))
	189	return 0;
	190
	191	len = UTF8SKIP(s);
	192
	193	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	194	return 0;
	195
	196	slen = len - 1;
	197	s++;
	198	u &= UTF_START_MASK(len);
	199	uv = u;
	200	ouv = uv;
	201	while (slen--) {
	202	if (!UTF8_IS_CONTINUATION(*s))
	203	return 0;
	204	uv = UTF8_ACCUMULATE(uv, *s);
	205	if (uv < ouv)
	206	return 0;
	207	ouv = uv;
	208	s++;
	209	}
	210
	211	if ((STRLEN)UNISKIP(uv) < len)
	212	return 0;
	213
	214	return len;
	215	}
	216
	217	/*
	218	=for apidoc A\|bool\|is_utf8_string\|U8 *s\|STRLEN len
	219
	220	Returns true if first C<len> bytes of the given string form a valid
	221	UTF8 string, false otherwise. Note that 'a valid UTF8 string' does
	222	not mean 'a string that contains code points above 0x7F encoded in
	223	UTF8' because a valid ASCII string is a valid UTF8 string.
	224
	225	=cut
	226	*/
	227
	228	bool
	229	Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
	230	{
	231	U8* x = s;
	232	U8* send;
	233	STRLEN c;
	234
	235	if (!len)
	236	len = strlen((char *)s);
	237	send = s + len;
	238
	239	while (x < send) {
	240	/* Inline the easy bits of is_utf8_char() here for speed... */
	241	if (UTF8_IS_INVARIANT(*x))
	242	c = 1;
	243	else if (!UTF8_IS_START(*x))
	244	return FALSE;
	245	else {
	246	/* ... and call is_utf8_char() only if really needed. */
	247	c = is_utf8_char(x);
	248	if (!c)
	249	return FALSE;
	250	}
	251	x += c;
	252	}
	253	if (x != send)
	254	return FALSE;
	255
	256	return TRUE;
	257	}
	258
	259	/*
	260	=for apidoc A\|UV\|utf8n_to_uvuni\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
	261
	262	Bottom level UTF-8 decode routine.
	263	Returns the unicode code point value of the first character in the string C<s>
	264	which is assumed to be in UTF8 encoding and no longer than C<curlen>;
	265	C<retlen> will be set to the length, in bytes, of that character.
	266
	267	If C<s> does not point to a well-formed UTF8 character, the behaviour
	268	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
	269	it is assumed that the caller will raise a warning, and this function
	270	will silently just set C<retlen> to C<-1> and return zero. If the
	271	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
	272	malformations will be given, C<retlen> will be set to the expected
	273	length of the UTF-8 character in bytes, and zero will be returned.
	274
	275	The C<flags> can also contain various flags to allow deviations from
	276	the strict UTF-8 encoding (see F<utf8.h>).
	277
	278	Most code should use utf8_to_uvchr() rather than call this directly.
	279
	280	=cut
	281	*/
	282
	283	UV
	284	Perl_utf8n_to_uvuni(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	285	{
	286	U8 *s0 = s;
	287	UV uv = *s, ouv = 0;
	288	STRLEN len = 1;
	289	bool dowarn = ckWARN_d(WARN_UTF8);
	290	UV startbyte = *s;
	291	STRLEN expectlen = 0;
	292	U32 warning = 0;
	293
	294	/* This list is a superset of the UTF8_ALLOW_XXX. */
	295
	296	#define UTF8_WARN_EMPTY 1
	297	#define UTF8_WARN_CONTINUATION 2
	298	#define UTF8_WARN_NON_CONTINUATION 3
	299	#define UTF8_WARN_FE_FF 4
	300	#define UTF8_WARN_SHORT 5
	301	#define UTF8_WARN_OVERFLOW 6
	302	#define UTF8_WARN_SURROGATE 7
	303	#define UTF8_WARN_LONG 8
	304	#define UTF8_WARN_FFFF 9 /* Also FFFE. */
	305
	306	if (curlen == 0 &&
	307	!(flags & UTF8_ALLOW_EMPTY)) {
	308	warning = UTF8_WARN_EMPTY;
	309	goto malformed;
	310	}
	311
	312	if (UTF8_IS_INVARIANT(uv)) {
	313	if (retlen)
	314	*retlen = 1;
	315	return (UV) (NATIVE_TO_UTF(*s));
	316	}
	317
	318	if (UTF8_IS_CONTINUATION(uv) &&
	319	!(flags & UTF8_ALLOW_CONTINUATION)) {
	320	warning = UTF8_WARN_CONTINUATION;
	321	goto malformed;
	322	}
	323
	324	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	325	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	326	warning = UTF8_WARN_NON_CONTINUATION;
	327	goto malformed;
	328	}
	329
	330	#ifdef EBCDIC
	331	uv = NATIVE_TO_UTF(uv);
	332	#else
	333	if ((uv == 0xfe \|\| uv == 0xff) &&
	334	!(flags & UTF8_ALLOW_FE_FF)) {
	335	warning = UTF8_WARN_FE_FF;
	336	goto malformed;
	337	}
	338	#endif
	339
	340	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	341	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	342	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	343	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	344	#ifdef EBCDIC
	345	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	346	else { len = 7; uv &= 0x01; }
	347	#else
	348	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	349	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	350	else { len = 13; uv = 0; } /* whoa! */
	351	#endif
	352
	353	if (retlen)
	354	*retlen = len;
	355
	356	expectlen = len;
	357
	358	if ((curlen < expectlen) &&
	359	!(flags & UTF8_ALLOW_SHORT)) {
	360	warning = UTF8_WARN_SHORT;
	361	goto malformed;
	362	}
	363
	364	len--;
	365	s++;
	366	ouv = uv;
	367
	368	while (len--) {
	369	if (!UTF8_IS_CONTINUATION(*s) &&
	370	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	371	s--;
	372	warning = UTF8_WARN_NON_CONTINUATION;
	373	goto malformed;
	374	}
	375	else
	376	uv = UTF8_ACCUMULATE(uv, *s);
	377	if (!(uv > ouv)) {
	378	/* These cannot be allowed. */
	379	if (uv == ouv) {
	380	if (!(flags & UTF8_ALLOW_LONG)) {
	381	warning = UTF8_WARN_LONG;
	382	goto malformed;
	383	}
	384	}
	385	else { /* uv < ouv */
	386	/* This cannot be allowed. */
	387	warning = UTF8_WARN_OVERFLOW;
	388	goto malformed;
	389	}
	390	}
	391	s++;
	392	ouv = uv;
	393	}
	394
	395	if (UNICODE_IS_SURROGATE(uv) &&
	396	!(flags & UTF8_ALLOW_SURROGATE)) {
	397	warning = UTF8_WARN_SURROGATE;
	398	goto malformed;
	399	} else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
	400	!(flags & UTF8_ALLOW_LONG)) {
	401	warning = UTF8_WARN_LONG;
	402	goto malformed;
	403	} else if (UNICODE_IS_ILLEGAL(uv) &&
	404	!(flags & UTF8_ALLOW_FFFF)) {
	405	warning = UTF8_WARN_FFFF;
	406	goto malformed;
	407	}
	408
	409	return uv;
	410
	411	malformed:
	412
	413	if (flags & UTF8_CHECK_ONLY) {
	414	if (retlen)
	415	*retlen = -1;
	416	return 0;
	417	}
	418
	419	if (dowarn) {
	420	SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
	421
	422	switch (warning) {
	423	case 0: /* Intentionally empty. */ break;
	424	case UTF8_WARN_EMPTY:
	425	Perl_sv_catpvf(aTHX_ sv, "(empty string)");
	426	break;
	427	case UTF8_WARN_CONTINUATION:
	428	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	429	break;
	430	case UTF8_WARN_NON_CONTINUATION:
	431	if (s == s0)
	432	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	433	(UV)s[1], startbyte);
	434	else
	435	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	436	(UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
	437
	438	break;
	439	case UTF8_WARN_FE_FF:
	440	Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
	441	break;
	442	case UTF8_WARN_SHORT:
	443	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	444	curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
	445	expectlen = curlen; /* distance for caller to skip */
	446	break;
	447	case UTF8_WARN_OVERFLOW:
	448	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	449	ouv, *s, startbyte);
	450	break;
	451	case UTF8_WARN_SURROGATE:
	452	Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
	453	break;
	454	case UTF8_WARN_LONG:
	455	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	456	expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	457	break;
	458	case UTF8_WARN_FFFF:
	459	Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
	460	break;
	461	default:
	462	Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
	463	break;
	464	}
	465
	466	if (warning) {
	467	char *s = SvPVX(sv);
	468
	469	if (PL_op)
	470	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	471	"%s in %s", s, OP_DESC(PL_op));
	472	else
	473	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
	474	}
	475	}
	476
	477	if (retlen)
	478	*retlen = expectlen ? expectlen : len;
	479
	480	return 0;
	481	}
	482
	483	/*
	484	=for apidoc A\|UV\|utf8_to_uvchr\|U8 s\|STRLEN retlen
	485
	486	Returns the native character value of the first character in the string C<s>
	487	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
	488	length, in bytes, of that character.
	489
	490	If C<s> does not point to a well-formed UTF8 character, zero is
	491	returned and retlen is set, if possible, to -1.
	492
	493	=cut
	494	*/
	495
	496	UV
	497	Perl_utf8_to_uvchr(pTHX_ U8 s, STRLEN retlen)
	498	{
	499	return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
	500	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	501	}
	502
	503	/*
	504	=for apidoc A\|UV\|utf8_to_uvuni\|U8 s\|STRLEN retlen
	505
	506	Returns the Unicode code point of the first character in the string C<s>
	507	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
	508	length, in bytes, of that character.
	509
	510	This function should only be used when returned UV is considered
	511	an index into the Unicode semantic tables (e.g. swashes).
	512
	513	If C<s> does not point to a well-formed UTF8 character, zero is
	514	returned and retlen is set, if possible, to -1.
	515
	516	=cut
	517	*/
	518
	519	UV
	520	Perl_utf8_to_uvuni(pTHX_ U8 s, STRLEN retlen)
	521	{
	522	/* Call the low level routine asking for checks */
	523	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
	524	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	525	}
	526
	527	/*
	528	=for apidoc A\|STRLEN\|utf8_length\|U8 s\|U8 e
	529
	530	Return the length of the UTF-8 char encoded string C<s> in characters.
	531	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	532	up past C<e>, croaks.
	533
	534	=cut
	535	*/
	536
	537	STRLEN
	538	Perl_utf8_length(pTHX_ U8 s, U8 e)
	539	{
	540	STRLEN len = 0;
	541
	542	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	543	* the bitops (especially ~) can create illegal UTF-8.
	544	* In other words: in Perl UTF-8 is not just for Unicode. */
	545
	546	if (e < s) {
	547	if (ckWARN_d(WARN_UTF8)) {
	548	if (PL_op)
	549	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	550	"%s in %s", unees, OP_DESC(PL_op));
	551	else
	552	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	553	}
	554	return 0;
	555	}
	556	while (s < e) {
	557	U8 t = UTF8SKIP(s);
	558
	559	if (e - s < t) {
	560	if (ckWARN_d(WARN_UTF8)) {
	561	if (PL_op)
	562	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	563	unees, OP_DESC(PL_op));
	564	else
	565	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	566	}
	567	return len;
	568	}
	569	s += t;
	570	len++;
	571	}
	572
	573	return len;
	574	}
	575
	576	/*
	577	=for apidoc A\|IV\|utf8_distance\|U8 a\|U8 b
	578
	579	Returns the number of UTF8 characters between the UTF-8 pointers C<a>
	580	and C<b>.
	581
	582	WARNING: use only if you know that the pointers point inside the
	583	same UTF-8 buffer.
	584
	585	=cut
	586	*/
	587
	588	IV
	589	Perl_utf8_distance(pTHX_ U8 a, U8 b)
	590	{
	591	IV off = 0;
	592
	593	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	594	* the bitops (especially ~) can create illegal UTF-8.
	595	* In other words: in Perl UTF-8 is not just for Unicode. */
	596
	597	if (a < b) {
	598	while (a < b) {
	599	U8 c = UTF8SKIP(a);
	600
	601	if (b - a < c) {
	602	if (ckWARN_d(WARN_UTF8)) {
	603	if (PL_op)
	604	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	605	"%s in %s", unees, OP_DESC(PL_op));
	606	else
	607	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	608	}
	609	return off;
	610	}
	611	a += c;
	612	off--;
	613	}
	614	}
	615	else {
	616	while (b < a) {
	617	U8 c = UTF8SKIP(b);
	618
	619	if (a - b < c) {
	620	if (ckWARN_d(WARN_UTF8)) {
	621	if (PL_op)
	622	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	623	"%s in %s", unees, OP_DESC(PL_op));
	624	else
	625	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	626	}
	627	return off;
	628	}
	629	b += c;
	630	off++;
	631	}
	632	}
	633
	634	return off;
	635	}
	636
	637	/*
	638	=for apidoc A\|U8 \|utf8_hop\|U8 s\|I32 off
	639
	640	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	641	forward or backward.
	642
	643	WARNING: do not use the following unless you know C<off> is within
	644	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	645	on the first byte of character or just after the last byte of a character.
	646
	647	=cut
	648	*/
	649
	650	U8 *
	651	Perl_utf8_hop(pTHX_ U8 *s, I32 off)
	652	{
	653	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	654	* the bitops (especially ~) can create illegal UTF-8.
	655	* In other words: in Perl UTF-8 is not just for Unicode. */
	656
	657	if (off >= 0) {
	658	while (off--)
	659	s += UTF8SKIP(s);
	660	}
	661	else {
	662	while (off++) {
	663	s--;
	664	while (UTF8_IS_CONTINUATION(*s))
	665	s--;
	666	}
	667	}
	668	return s;
	669	}
	670
	671	/*
	672	=for apidoc A\|U8 \|utf8_to_bytes\|U8 s\|STRLEN *len
	673
	674	Converts a string C<s> of length C<len> from UTF8 into byte encoding.
	675	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	676	updates len to contain the new length.
	677	Returns zero on failure, setting C<len> to -1.
	678
	679	=cut
	680	*/
	681
	682	U8 *
	683	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	684	{
	685	U8 *send;
	686	U8 *d;
	687	U8 *save = s;
	688
	689	/* ensure valid UTF8 and chars < 256 before updating string */
	690	for (send = s + *len; s < send; ) {
	691	U8 c = *s++;
	692
	693	if (!UTF8_IS_INVARIANT(c) &&
	694	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	695	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	696	*len = -1;
	697	return 0;
	698	}
	699	}
	700
	701	d = s = save;
	702	while (s < send) {
	703	STRLEN ulen;
	704	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	705	s += ulen;
	706	}
	707	*d = '\0';
	708	*len = d - save;
	709	return save;
	710	}
	711
	712	/*
	713	=for apidoc A\|U8 \|bytes_from_utf8\|U8 s\|STRLEN len\|bool is_utf8
	714
	715	Converts a string C<s> of length C<len> from UTF8 into byte encoding.
	716	Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	717	the newly-created string, and updates C<len> to contain the new
	718	length. Returns the original string if no conversion occurs, C<len>
	719	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	720	0 if C<s> is converted or contains all 7bit characters.
	721
	722	=cut
	723	*/
	724
	725	U8 *
	726	Perl_bytes_from_utf8(pTHX_ U8 s, STRLEN len, bool *is_utf8)
	727	{
	728	U8 *d;
	729	U8 *start = s;
	730	U8 *send;
	731	I32 count = 0;
	732
	733	if (!*is_utf8)
	734	return start;
	735
	736	/* ensure valid UTF8 and chars < 256 before converting string */
	737	for (send = s + *len; s < send;) {
	738	U8 c = *s++;
	739	if (!UTF8_IS_INVARIANT(c)) {
	740	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	741	(c = *s++) && UTF8_IS_CONTINUATION(c))
	742	count++;
	743	else
	744	return start;
	745	}
	746	}
	747
	748	*is_utf8 = 0;
	749
	750	Newz(801, d, (*len) - count + 1, U8);
	751	s = start; start = d;
	752	while (s < send) {
	753	U8 c = *s++;
	754	if (!UTF8_IS_INVARIANT(c)) {
	755	/* Then it is two-byte encoded */
	756	c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
	757	c = ASCII_TO_NATIVE(c);
	758	}
	759	*d++ = c;
	760	}
	761	*d = '\0';
	762	*len = d - start;
	763	return start;
	764	}
	765
	766	/*
	767	=for apidoc A\|U8 \|bytes_to_utf8\|U8 s\|STRLEN *len
	768
	769	Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
	770	Returns a pointer to the newly-created string, and sets C<len> to
	771	reflect the new length.
	772
	773	If you want to convert to UTF8 from other encodings than ASCII,
	774	see sv_recode_to_utf8().
	775
	776	=cut
	777	*/
	778
	779	U8*
	780	Perl_bytes_to_utf8(pTHX_ U8 s, STRLEN len)
	781	{
	782	U8 *send;
	783	U8 *d;
	784	U8 *dst;
	785	send = s + (*len);
	786
	787	Newz(801, d, (len) 2 + 1, U8);
	788	dst = d;
	789
	790	while (s < send) {
	791	UV uv = NATIVE_TO_ASCII(*s++);
	792	if (UNI_IS_INVARIANT(uv))
	793	*d++ = (U8)UTF_TO_NATIVE(uv);
	794	else {
	795	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
	796	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
	797	}
	798	}
	799	*d = '\0';
	800	*len = d-dst;
	801	return dst;
	802	}
	803
	804	/*
	805	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	806	*
	807	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	808	* We optimize for native, for obvious reasons. */
	809
	810	U8*
	811	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	812	{
	813	U8* pend;
	814	U8* dstart = d;
	815
	816	if (bytelen & 1)
	817	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
	818
	819	pend = p + bytelen;
	820
	821	while (p < pend) {
	822	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	823	p += 2;
	824	if (uv < 0x80) {
	825	*d++ = (U8)uv;
	826	continue;
	827	}
	828	if (uv < 0x800) {
	829	*d++ = (U8)(( uv >> 6) \| 0xc0);
	830	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	831	continue;
	832	}
	833	if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
	834	UV low = *p++;
	835	if (low < 0xdc00 \|\| low >= 0xdfff)
	836	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	837	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	838	}
	839	if (uv < 0x10000) {
	840	*d++ = (U8)(( uv >> 12) \| 0xe0);
	841	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	842	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	843	continue;
	844	}
	845	else {
	846	*d++ = (U8)(( uv >> 18) \| 0xf0);
	847	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	848	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	849	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	850	continue;
	851	}
	852	}
	853	*newlen = d - dstart;
	854	return d;
	855	}
	856
	857	/* Note: this one is slightly destructive of the source. */
	858
	859	U8*
	860	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	861	{
	862	U8* s = (U8*)p;
	863	U8* send = s + bytelen;
	864	while (s < send) {
	865	U8 tmp = s[0];
	866	s[0] = s[1];
	867	s[1] = tmp;
	868	s += 2;
	869	}
	870	return utf16_to_utf8(p, d, bytelen, newlen);
	871	}
	872
	873	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	874
	875	bool
	876	Perl_is_uni_alnum(pTHX_ UV c)
	877	{
	878	U8 tmpbuf[UTF8_MAXLEN+1];
	879	uvchr_to_utf8(tmpbuf, c);
	880	return is_utf8_alnum(tmpbuf);
	881	}
	882
	883	bool
	884	Perl_is_uni_alnumc(pTHX_ UV c)
	885	{
	886	U8 tmpbuf[UTF8_MAXLEN+1];
	887	uvchr_to_utf8(tmpbuf, c);
	888	return is_utf8_alnumc(tmpbuf);
	889	}
	890
	891	bool
	892	Perl_is_uni_idfirst(pTHX_ UV c)
	893	{
	894	U8 tmpbuf[UTF8_MAXLEN+1];
	895	uvchr_to_utf8(tmpbuf, c);
	896	return is_utf8_idfirst(tmpbuf);
	897	}
	898
	899	bool
	900	Perl_is_uni_alpha(pTHX_ UV c)
	901	{
	902	U8 tmpbuf[UTF8_MAXLEN+1];
	903	uvchr_to_utf8(tmpbuf, c);
	904	return is_utf8_alpha(tmpbuf);
	905	}
	906
	907	bool
	908	Perl_is_uni_ascii(pTHX_ UV c)
	909	{
	910	U8 tmpbuf[UTF8_MAXLEN+1];
	911	uvchr_to_utf8(tmpbuf, c);
	912	return is_utf8_ascii(tmpbuf);
	913	}
	914
	915	bool
	916	Perl_is_uni_space(pTHX_ UV c)
	917	{
	918	U8 tmpbuf[UTF8_MAXLEN+1];
	919	uvchr_to_utf8(tmpbuf, c);
	920	return is_utf8_space(tmpbuf);
	921	}
	922
	923	bool
	924	Perl_is_uni_digit(pTHX_ UV c)
	925	{
	926	U8 tmpbuf[UTF8_MAXLEN+1];
	927	uvchr_to_utf8(tmpbuf, c);
	928	return is_utf8_digit(tmpbuf);
	929	}
	930
	931	bool
	932	Perl_is_uni_upper(pTHX_ UV c)
	933	{
	934	U8 tmpbuf[UTF8_MAXLEN+1];
	935	uvchr_to_utf8(tmpbuf, c);
	936	return is_utf8_upper(tmpbuf);
	937	}
	938
	939	bool
	940	Perl_is_uni_lower(pTHX_ UV c)
	941	{
	942	U8 tmpbuf[UTF8_MAXLEN+1];
	943	uvchr_to_utf8(tmpbuf, c);
	944	return is_utf8_lower(tmpbuf);
	945	}
	946
	947	bool
	948	Perl_is_uni_cntrl(pTHX_ UV c)
	949	{
	950	U8 tmpbuf[UTF8_MAXLEN+1];
	951	uvchr_to_utf8(tmpbuf, c);
	952	return is_utf8_cntrl(tmpbuf);
	953	}
	954
	955	bool
	956	Perl_is_uni_graph(pTHX_ UV c)
	957	{
	958	U8 tmpbuf[UTF8_MAXLEN+1];
	959	uvchr_to_utf8(tmpbuf, c);
	960	return is_utf8_graph(tmpbuf);
	961	}
	962
	963	bool
	964	Perl_is_uni_print(pTHX_ UV c)
	965	{
	966	U8 tmpbuf[UTF8_MAXLEN+1];
	967	uvchr_to_utf8(tmpbuf, c);
	968	return is_utf8_print(tmpbuf);
	969	}
	970
	971	bool
	972	Perl_is_uni_punct(pTHX_ UV c)
	973	{
	974	U8 tmpbuf[UTF8_MAXLEN+1];
	975	uvchr_to_utf8(tmpbuf, c);
	976	return is_utf8_punct(tmpbuf);
	977	}
	978
	979	bool
	980	Perl_is_uni_xdigit(pTHX_ UV c)
	981	{
	982	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	983	uvchr_to_utf8(tmpbuf, c);
	984	return is_utf8_xdigit(tmpbuf);
	985	}
	986
	987	UV
	988	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	989	{
	990	uvchr_to_utf8(p, c);
	991	return to_utf8_upper(p, p, lenp);
	992	}
	993
	994	UV
	995	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	996	{
	997	uvchr_to_utf8(p, c);
	998	return to_utf8_title(p, p, lenp);
	999	}
	1000
	1001	UV
	1002	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1003	{
	1004	uvchr_to_utf8(p, c);
	1005	return to_utf8_lower(p, p, lenp);
	1006	}
	1007
	1008	UV
	1009	Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
	1010	{
	1011	uvchr_to_utf8(p, c);
	1012	return to_utf8_fold(p, p, lenp);
	1013	}
	1014
	1015	/* for now these all assume no locale info available for Unicode > 255 */
	1016
	1017	bool
	1018	Perl_is_uni_alnum_lc(pTHX_ UV c)
	1019	{
	1020	return is_uni_alnum(c); /* XXX no locale support yet */
	1021	}
	1022
	1023	bool
	1024	Perl_is_uni_alnumc_lc(pTHX_ UV c)
	1025	{
	1026	return is_uni_alnumc(c); /* XXX no locale support yet */
	1027	}
	1028
	1029	bool
	1030	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1031	{
	1032	return is_uni_idfirst(c); /* XXX no locale support yet */
	1033	}
	1034
	1035	bool
	1036	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1037	{
	1038	return is_uni_alpha(c); /* XXX no locale support yet */
	1039	}
	1040
	1041	bool
	1042	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1043	{
	1044	return is_uni_ascii(c); /* XXX no locale support yet */
	1045	}
	1046
	1047	bool
	1048	Perl_is_uni_space_lc(pTHX_ UV c)
	1049	{
	1050	return is_uni_space(c); /* XXX no locale support yet */
	1051	}
	1052
	1053	bool
	1054	Perl_is_uni_digit_lc(pTHX_ UV c)
	1055	{
	1056	return is_uni_digit(c); /* XXX no locale support yet */
	1057	}
	1058
	1059	bool
	1060	Perl_is_uni_upper_lc(pTHX_ UV c)
	1061	{
	1062	return is_uni_upper(c); /* XXX no locale support yet */
	1063	}
	1064
	1065	bool
	1066	Perl_is_uni_lower_lc(pTHX_ UV c)
	1067	{
	1068	return is_uni_lower(c); /* XXX no locale support yet */
	1069	}
	1070
	1071	bool
	1072	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1073	{
	1074	return is_uni_cntrl(c); /* XXX no locale support yet */
	1075	}
	1076
	1077	bool
	1078	Perl_is_uni_graph_lc(pTHX_ UV c)
	1079	{
	1080	return is_uni_graph(c); /* XXX no locale support yet */
	1081	}
	1082
	1083	bool
	1084	Perl_is_uni_print_lc(pTHX_ UV c)
	1085	{
	1086	return is_uni_print(c); /* XXX no locale support yet */
	1087	}
	1088
	1089	bool
	1090	Perl_is_uni_punct_lc(pTHX_ UV c)
	1091	{
	1092	return is_uni_punct(c); /* XXX no locale support yet */
	1093	}
	1094
	1095	bool
	1096	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1097	{
	1098	return is_uni_xdigit(c); /* XXX no locale support yet */
	1099	}
	1100
	1101	U32
	1102	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1103	{
	1104	/* XXX returns only the first character -- do not use XXX */
	1105	/* XXX no locale support yet */
	1106	STRLEN len;
	1107	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	1108	return (U32)to_uni_upper(c, tmpbuf, &len);
	1109	}
	1110
	1111	U32
	1112	Perl_to_uni_title_lc(pTHX_ U32 c)
	1113	{
	1114	/* XXX returns only the first character XXX -- do not use XXX */
	1115	/* XXX no locale support yet */
	1116	STRLEN len;
	1117	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	1118	return (U32)to_uni_title(c, tmpbuf, &len);
	1119	}
	1120
	1121	U32
	1122	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1123	{
	1124	/* XXX returns only the first character -- do not use XXX */
	1125	/* XXX no locale support yet */
	1126	STRLEN len;
	1127	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
	1128	return (U32)to_uni_lower(c, tmpbuf, &len);
	1129	}
	1130
	1131	bool
	1132	Perl_is_utf8_alnum(pTHX_ U8 *p)
	1133	{
	1134	if (!is_utf8_char(p))
	1135	return FALSE;
	1136	if (!PL_utf8_alnum)
	1137	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1138	* descendant of isalnum(3), in other words, it doesn't
	1139	* contain the '_'. --jhi */
	1140	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
	1141	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1142	/* return p == '_' \|\| is_utf8_alpha(p) \|\| is_utf8_digit(p); /
	1143	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	1144	if (!PL_utf8_alnum)
	1145	PL_utf8_alnum = swash_init("utf8", "",
	1146	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	1147	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1148	#endif
	1149	}
	1150
	1151	bool
	1152	Perl_is_utf8_alnumc(pTHX_ U8 *p)
	1153	{
	1154	if (!is_utf8_char(p))
	1155	return FALSE;
	1156	if (!PL_utf8_alnum)
	1157	PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
	1158	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1159	/* return is_utf8_alpha(p) \|\| is_utf8_digit(p); */
	1160	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	1161	if (!PL_utf8_alnum)
	1162	PL_utf8_alnum = swash_init("utf8", "",
	1163	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	1164	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1165	#endif
	1166	}
	1167
	1168	bool
	1169	Perl_is_utf8_idfirst(pTHX_ U8 p) / The naming is historical. */
	1170	{
	1171	if (*p == '_')
	1172	return TRUE;
	1173	if (!is_utf8_char(p))
	1174	return FALSE;
	1175	if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
	1176	PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
	1177	return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
	1178	}
	1179
	1180	bool
	1181	Perl_is_utf8_idcont(pTHX_ U8 *p)
	1182	{
	1183	if (*p == '_')
	1184	return TRUE;
	1185	if (!is_utf8_char(p))
	1186	return FALSE;
	1187	if (!PL_utf8_idcont)
	1188	PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
	1189	return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
	1190	}
	1191
	1192	bool
	1193	Perl_is_utf8_alpha(pTHX_ U8 *p)
	1194	{
	1195	if (!is_utf8_char(p))
	1196	return FALSE;
	1197	if (!PL_utf8_alpha)
	1198	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
	1199	return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
	1200	}
	1201
	1202	bool
	1203	Perl_is_utf8_ascii(pTHX_ U8 *p)
	1204	{
	1205	if (!is_utf8_char(p))
	1206	return FALSE;
	1207	if (!PL_utf8_ascii)
	1208	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
	1209	return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
	1210	}
	1211
	1212	bool
	1213	Perl_is_utf8_space(pTHX_ U8 *p)
	1214	{
	1215	if (!is_utf8_char(p))
	1216	return FALSE;
	1217	if (!PL_utf8_space)
	1218	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
	1219	return swash_fetch(PL_utf8_space, p, TRUE) != 0;
	1220	}
	1221
	1222	bool
	1223	Perl_is_utf8_digit(pTHX_ U8 *p)
	1224	{
	1225	if (!is_utf8_char(p))
	1226	return FALSE;
	1227	if (!PL_utf8_digit)
	1228	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
	1229	return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
	1230	}
	1231
	1232	bool
	1233	Perl_is_utf8_upper(pTHX_ U8 *p)
	1234	{
	1235	if (!is_utf8_char(p))
	1236	return FALSE;
	1237	if (!PL_utf8_upper)
	1238	PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
	1239	return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
	1240	}
	1241
	1242	bool
	1243	Perl_is_utf8_lower(pTHX_ U8 *p)
	1244	{
	1245	if (!is_utf8_char(p))
	1246	return FALSE;
	1247	if (!PL_utf8_lower)
	1248	PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
	1249	return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
	1250	}
	1251
	1252	bool
	1253	Perl_is_utf8_cntrl(pTHX_ U8 *p)
	1254	{
	1255	if (!is_utf8_char(p))
	1256	return FALSE;
	1257	if (!PL_utf8_cntrl)
	1258	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
	1259	return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
	1260	}
	1261
	1262	bool
	1263	Perl_is_utf8_graph(pTHX_ U8 *p)
	1264	{
	1265	if (!is_utf8_char(p))
	1266	return FALSE;
	1267	if (!PL_utf8_graph)
	1268	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
	1269	return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
	1270	}
	1271
	1272	bool
	1273	Perl_is_utf8_print(pTHX_ U8 *p)
	1274	{
	1275	if (!is_utf8_char(p))
	1276	return FALSE;
	1277	if (!PL_utf8_print)
	1278	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
	1279	return swash_fetch(PL_utf8_print, p, TRUE) != 0;
	1280	}
	1281
	1282	bool
	1283	Perl_is_utf8_punct(pTHX_ U8 *p)
	1284	{
	1285	if (!is_utf8_char(p))
	1286	return FALSE;
	1287	if (!PL_utf8_punct)
	1288	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
	1289	return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
	1290	}
	1291
	1292	bool
	1293	Perl_is_utf8_xdigit(pTHX_ U8 *p)
	1294	{
	1295	if (!is_utf8_char(p))
	1296	return FALSE;
	1297	if (!PL_utf8_xdigit)
	1298	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
	1299	return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
	1300	}
	1301
	1302	bool
	1303	Perl_is_utf8_mark(pTHX_ U8 *p)
	1304	{
	1305	if (!is_utf8_char(p))
	1306	return FALSE;
	1307	if (!PL_utf8_mark)
	1308	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
	1309	return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
	1310	}
	1311
	1312	/*
	1313	=for apidoc A\|UV\|to_utf8_case\|U8 p\|U8 ustrp\|STRLEN lenp\|SV swash\|char normal\|char *special
	1314
	1315	The "p" contains the pointer to the UTF-8 string encoding
	1316	the character that is being converted.
	1317
	1318	The "ustrp" is a pointer to the character buffer to put the
	1319	conversion result to. The "lenp" is a pointer to the length
	1320	of the result.
	1321
	1322	The "swashp" is a pointer to the swash to use.
	1323
	1324	Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
	1325	and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
	1326	but not always, a multicharacter mapping), is tried first.
	1327
	1328	The "special" is a string like "utf8::ToSpecLower", which means the
	1329	hash %utf8::ToSpecLower. The access to the hash is through
	1330	Perl_to_utf8_case().
	1331
	1332	The "normal" is a string like "ToLower" which means the swash
	1333	%utf8::ToLower.
	1334
	1335	=cut */
	1336
	1337	UV
	1338	Perl_to_utf8_case(pTHX_ U8 p, U8 ustrp, STRLEN lenp, SV swashp, char normal, char *special)
	1339	{
	1340	UV uv0, uv1;
	1341	U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
	1342	STRLEN len = 0;
	1343
	1344	uv0 = utf8_to_uvchr(p, 0);
	1345	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
	1346	* are necessary in EBCDIC, they are redundant no-ops
	1347	* in ASCII-ish platforms, and hopefully optimized away. */
	1348	uv1 = NATIVE_TO_UNI(uv0);
	1349	uvuni_to_utf8(tmpbuf, uv1);
	1350
	1351	if (!swashp) / load on-demand */
	1352	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	1353
	1354	if (special) {
	1355	/* It might be "special" (sometimes, but not always,
	1356	* a multicharacter mapping) */
	1357	HV *hv;
	1358	SV *keysv;
	1359	HE *he;
	1360	SV *val;
	1361
	1362	if ((hv = get_hv(special, FALSE)) &&
	1363	(keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
	1364	(he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
	1365	(val = HeVAL(he))) {
	1366	char *s;
	1367
	1368	s = SvPV(val, len);
	1369	if (len == 1)
	1370	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
	1371	else {
	1372	#ifdef EBCDIC
	1373	/* If we have EBCDIC we need to remap the characters
	1374	* since any characters in the low 256 are Unicode
	1375	* code points, not EBCDIC. */
	1376	U8 t = (U8)s, tend = t + len, d;
	1377
	1378	d = tmpbuf;
	1379	if (SvUTF8(val)) {
	1380	STRLEN tlen = 0;
	1381
	1382	while (t < tend) {
	1383	UV c = utf8_to_uvchr(t, &tlen);
	1384	if (tlen > 0) {
	1385	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	1386	t += tlen;
	1387	}
	1388	else
	1389	break;
	1390	}
	1391	}
	1392	else {
	1393	while (t < tend) {
	1394	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	1395	t++;
	1396	}
	1397	}
	1398	len = d - tmpbuf;
	1399	Copy(tmpbuf, ustrp, len, U8);
	1400	#else
	1401	Copy(s, ustrp, len, U8);
	1402	#endif
	1403	}
	1404	}
	1405	}
	1406
	1407	if (!len && *swashp) {
	1408	UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
	1409
	1410	if (uv2) {
	1411	/* It was "normal" (a single character mapping). */
	1412	UV uv3 = UNI_TO_NATIVE(uv2);
	1413
	1414	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	1415	}
	1416	}
	1417
	1418	if (!len) /* Neither: just copy. */
	1419	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
	1420
	1421	if (lenp)
	1422	*lenp = len;
	1423
	1424	return len ? utf8_to_uvchr(ustrp, 0) : 0;
	1425	}
	1426
	1427	/*
	1428	=for apidoc A\|UV\|to_utf8_upper\|U8 p\|U8 ustrp\|STRLEN *lenp
	1429
	1430	Convert the UTF-8 encoded character at p to its uppercase version and
	1431	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1432	that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
	1433	uppercase version may be longer than the original character (up to two
	1434	characters).
	1435
	1436	The first character of the uppercased version is returned
	1437	(but note, as explained above, that there may be more.)
	1438
	1439	=cut */
	1440
	1441	UV
	1442	Perl_to_utf8_upper(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1443	{
	1444	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1445	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
	1446	}
	1447
	1448	/*
	1449	=for apidoc A\|UV\|to_utf8_title\|U8 p\|U8 ustrp\|STRLEN *lenp
	1450
	1451	Convert the UTF-8 encoded character at p to its titlecase version and
	1452	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1453	that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
	1454	titlecase version may be longer than the original character (up to two
	1455	characters).
	1456
	1457	The first character of the titlecased version is returned
	1458	(but note, as explained above, that there may be more.)
	1459
	1460	=cut */
	1461
	1462	UV
	1463	Perl_to_utf8_title(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1464	{
	1465	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1466	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
	1467	}
	1468
	1469	/*
	1470	=for apidoc A\|UV\|to_utf8_lower\|U8 p\|U8 ustrp\|STRLEN *lenp
	1471
	1472	Convert the UTF-8 encoded character at p to its lowercase version and
	1473	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1474	that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
	1475	lowercase version may be longer than the original character (up to two
	1476	characters).
	1477
	1478	The first character of the lowercased version is returned
	1479	(but note, as explained above, that there may be more.)
	1480
	1481	=cut */
	1482
	1483	UV
	1484	Perl_to_utf8_lower(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1485	{
	1486	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1487	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
	1488	}
	1489
	1490	/*
	1491	=for apidoc A\|UV\|to_utf8_fold\|U8 p\|U8 ustrp\|STRLEN *lenp
	1492
	1493	Convert the UTF-8 encoded character at p to its foldcase version and
	1494	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1495	that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
	1496	foldcase version may be longer than the original character (up to
	1497	three characters).
	1498
	1499	The first character of the foldcased version is returned
	1500	(but note, as explained above, that there may be more.)
	1501
	1502	=cut */
	1503
	1504	UV
	1505	Perl_to_utf8_fold(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1506	{
	1507	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1508	&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
	1509	}
	1510
	1511	/* a "swash" is a swatch hash */
	1512
	1513	SV*
	1514	Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
	1515	{
	1516	SV* retval;
	1517	SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
	1518	dSP;
	1519	HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
	1520	SV* errsv_save;
	1521
	1522	if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
	1523	ENTER;
	1524	errsv_save = newSVsv(ERRSV);
	1525	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
	1526	if (!SvTRUE(ERRSV))
	1527	sv_setsv(ERRSV, errsv_save);
	1528	SvREFCNT_dec(errsv_save);
	1529	LEAVE;
	1530	}
	1531	SPAGAIN;
	1532	PUSHSTACKi(PERLSI_MAGIC);
	1533	PUSHMARK(SP);
	1534	EXTEND(SP,5);
	1535	PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
	1536	PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
	1537	PUSHs(listsv);
	1538	PUSHs(sv_2mortal(newSViv(minbits)));
	1539	PUSHs(sv_2mortal(newSViv(none)));
	1540	PUTBACK;
	1541	ENTER;
	1542	SAVEI32(PL_hints);
	1543	PL_hints = 0;
	1544	save_re_context();
	1545	if (PL_curcop == &PL_compiling) {
	1546	/* XXX ought to be handled by lex_start */
	1547	SAVEI32(PL_in_my);
	1548	PL_in_my = 0;
	1549	sv_setpv(tokenbufsv, PL_tokenbuf);
	1550	}
	1551	errsv_save = newSVsv(ERRSV);
	1552	if (call_method("SWASHNEW", G_SCALAR))
	1553	retval = newSVsv(*PL_stack_sp--);
	1554	else
	1555	retval = &PL_sv_undef;
	1556	if (!SvTRUE(ERRSV))
	1557	sv_setsv(ERRSV, errsv_save);
	1558	SvREFCNT_dec(errsv_save);
	1559	LEAVE;
	1560	POPSTACK;
	1561	if (PL_curcop == &PL_compiling) {
	1562	STRLEN len;
	1563	char* pv = SvPV(tokenbufsv, len);
	1564
	1565	Copy(pv, PL_tokenbuf, len+1, char);
	1566	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
	1567	}
	1568	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	1569	if (SvPOK(retval))
	1570	Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
	1571	retval);
	1572	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	1573	}
	1574	return retval;
	1575	}
	1576
	1577
	1578	/* This API is wrong for special case conversions since we may need to
	1579	* return several Unicode characters for a single Unicode character
	1580	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	1581	* the lower-level routine, and it is similarly broken for returning
	1582	* multiple values. --jhi */
	1583	UV
	1584	Perl_swash_fetch(pTHX_ SV sv, U8 ptr, bool do_utf8)
	1585	{
	1586	HV* hv = (HV*)SvRV(sv);
	1587	U32 klen;
	1588	U32 off;
	1589	STRLEN slen;
	1590	STRLEN needents;
	1591	U8 *tmps = NULL;
	1592	U32 bit;
	1593	SV *retval;
	1594	U8 tmputf8[2];
	1595	UV c = NATIVE_TO_ASCII(*ptr);
	1596
	1597	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	1598	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
	1599	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
	1600	ptr = tmputf8;
	1601	}
	1602	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	1603	* then the "swatch" is a vec() for al the chars which start
	1604	* with 0xAA..0xYY
	1605	* So the key in the hash (klen) is length of encoded char -1
	1606	*/
	1607	klen = UTF8SKIP(ptr) - 1;
	1608	off = ptr[klen];
	1609
	1610	if (klen == 0)
	1611	{
	1612	/* If char in invariant then swatch is for all the invariant chars
	1613	* In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
	1614	*/
	1615	needents = UTF_CONTINUATION_MARK;
	1616	off = NATIVE_TO_UTF(ptr[klen]);
	1617	}
	1618	else
	1619	{
	1620	/* If char is encoded then swatch is for the prefix */
	1621	needents = (1 << UTF_ACCUMULATION_SHIFT);
	1622	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	1623	}
	1624
	1625	/*
	1626	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	1627	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	1628	* it's nothing to sniff at.) Pity we usually come through at least
	1629	* two function calls to get here...
	1630	*
	1631	* NB: this code assumes that swatches are never modified, once generated!
	1632	*/
	1633
	1634	if (hv == PL_last_swash_hv &&
	1635	klen == PL_last_swash_klen &&
	1636	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	1637	{
	1638	tmps = PL_last_swash_tmps;
	1639	slen = PL_last_swash_slen;
	1640	}
	1641	else {
	1642	/* Try our second-level swatch cache, kept in a hash. */
	1643	SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
	1644
	1645	/* If not cached, generate it via utf8::SWASHGET */
	1646	if (!svp \|\| !SvPOK(svp) \|\| !(tmps = (U8)SvPV(*svp, slen))) {
	1647	dSP;
	1648	/* We use utf8n_to_uvuni() as we want an index into
	1649	Unicode tables, not a native character number.
	1650	*/
	1651	UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
	1652	ckWARN(WARN_UTF8) ?
	1653	0 : UTF8_ALLOW_ANY);
	1654	SV *errsv_save;
	1655	ENTER;
	1656	SAVETMPS;
	1657	save_re_context();
	1658	PUSHSTACKi(PERLSI_MAGIC);
	1659	PUSHMARK(SP);
	1660	EXTEND(SP,3);
	1661	PUSHs((SV*)sv);
	1662	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	1663	PUSHs(sv_2mortal(newSViv((klen) ?
	1664	(code_point & ~(needents - 1)) : 0)));
	1665	PUSHs(sv_2mortal(newSViv(needents)));
	1666	PUTBACK;
	1667	errsv_save = newSVsv(ERRSV);
	1668	if (call_method("SWASHGET", G_SCALAR))
	1669	retval = newSVsv(*PL_stack_sp--);
	1670	else
	1671	retval = &PL_sv_undef;
	1672	if (!SvTRUE(ERRSV))
	1673	sv_setsv(ERRSV, errsv_save);
	1674	SvREFCNT_dec(errsv_save);
	1675	POPSTACK;
	1676	FREETMPS;
	1677	LEAVE;
	1678	if (PL_curcop == &PL_compiling)
	1679	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
	1680
	1681	svp = hv_store(hv, (char*)ptr, klen, retval, 0);
	1682
	1683	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen)) \|\| (slen << 3) < needents)
	1684	Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
	1685	}
	1686
	1687	PL_last_swash_hv = hv;
	1688	PL_last_swash_klen = klen;
	1689	PL_last_swash_tmps = tmps;
	1690	PL_last_swash_slen = slen;
	1691	if (klen)
	1692	Copy(ptr, PL_last_swash_key, klen, U8);
	1693	}
	1694
	1695	switch ((int)((slen << 3) / needents)) {
	1696	case 1:
	1697	bit = 1 << (off & 7);
	1698	off >>= 3;
	1699	return (tmps[off] & bit) != 0;
	1700	case 8:
	1701	return tmps[off];
	1702	case 16:
	1703	off <<= 1;
	1704	return (tmps[off] << 8) + tmps[off + 1] ;
	1705	case 32:
	1706	off <<= 2;
	1707	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	1708	}
	1709	Perl_croak(aTHX_ "panic: swash_fetch");
	1710	return 0;
	1711	}
	1712
	1713
	1714	/*
	1715	=for apidoc A\|U8 \|uvchr_to_utf8\|U8 d\|UV uv
	1716
	1717	Adds the UTF8 representation of the Native codepoint C<uv> to the end
	1718	of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
	1719	bytes available. The return value is the pointer to the byte after the
	1720	end of the new character. In other words,
	1721
	1722	d = uvchr_to_utf8(d, uv);
	1723
	1724	is the recommended wide native character-aware way of saying
	1725
	1726	*(d++) = uv;
	1727
	1728	=cut
	1729	*/
	1730
	1731	/* On ASCII machines this is normally a macro but we want a
	1732	real function in case XS code wants it
	1733	*/
	1734	#undef Perl_uvchr_to_utf8
	1735	U8 *
	1736	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	1737	{
	1738	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
	1739	}
	1740
	1741	U8 *
	1742	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	1743	{
	1744	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
	1745	}
	1746
	1747	/*
	1748	=for apidoc A\|UV\|utf8n_to_uvchr\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
	1749
	1750	Returns the native character value of the first character in the string C<s>
	1751	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
	1752	length, in bytes, of that character.
	1753
	1754	Allows length and flags to be passed to low level routine.
	1755
	1756	=cut
	1757	*/
	1758	/* On ASCII machines this is normally a macro but we want
	1759	a real function in case XS code wants it
	1760	*/
	1761	#undef Perl_utf8n_to_uvchr
	1762	UV
	1763	Perl_utf8n_to_uvchr(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	1764	{
	1765	UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	1766	return UNI_TO_NATIVE(uv);
	1767	}
	1768
	1769	/*
	1770	=for apidoc A\|char \|pv_uni_display\|SV dsv\|U8 *spv\|STRLEN len\|STRLEN pvlim\|UV flags
	1771
	1772	Build to the scalar dsv a displayable version of the string spv,
	1773	length len, the displayable version being at most pvlim bytes long
	1774	(if longer, the rest is truncated and "..." will be appended).
	1775
	1776	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	1777	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	1778	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	1779	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	1780	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	1781	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	1782
	1783	The pointer to the PV of the dsv is returned.
	1784
	1785	=cut */
	1786	char *
	1787	Perl_pv_uni_display(pTHX_ SV dsv, U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	1788	{
	1789	int truncated = 0;
	1790	char s, e;
	1791
	1792	sv_setpvn(dsv, "", 0);
	1793	for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	1794	UV u;
	1795	bool ok = FALSE;
	1796
	1797	if (pvlim && SvCUR(dsv) >= pvlim) {
	1798	truncated++;
	1799	break;
	1800	}
	1801	u = utf8_to_uvchr((U8*)s, 0);
	1802	if (u < 256) {
	1803	if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
	1804	switch (u & 0xFF) {
	1805	case '\n':
	1806	Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
	1807	case '\r':
	1808	Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
	1809	case '\t':
	1810	Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
	1811	case '\f':
	1812	Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
	1813	case '\a':
	1814	Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
	1815	case '\\':
	1816	Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
	1817	default: break;
	1818	}
	1819	}
	1820	/* isPRINT() is the locale-blind version. */
	1821	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
	1822	Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
	1823	ok = TRUE;
	1824	}
	1825	}
	1826	if (!ok)
	1827	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	1828	}
	1829	if (truncated)
	1830	sv_catpvn(dsv, "...", 3);
	1831
	1832	return SvPVX(dsv);
	1833	}
	1834
	1835	/*
	1836	=for apidoc A\|char \|sv_uni_display\|SV dsv\|SV *ssv\|STRLEN pvlim\|UV flags
	1837
	1838	Build to the scalar dsv a displayable version of the scalar sv,
	1839	the displayable version being at most pvlim bytes long
	1840	(if longer, the rest is truncated and "..." will be appended).
	1841
	1842	The flags argument is as in pv_uni_display().
	1843
	1844	The pointer to the PV of the dsv is returned.
	1845
	1846	=cut */
	1847	char *
	1848	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	1849	{
	1850	return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
	1851	pvlim, flags);
	1852	}
	1853
	1854	/*
	1855	=for apidoc A\|I32\|ibcmp_utf8\|const char s1\|char pe1\|register UV l1\|bool u1\|const char s2\|char **pe2\|register UV l2\|bool u2
	1856
	1857	Return true if the strings s1 and s2 differ case-insensitively, false
	1858	if not (if they are equal case-insensitively). If u1 is true, the
	1859	string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
	1860	the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
	1861	are false, the respective string is assumed to be in native 8-bit
	1862	encoding.
	1863
	1864	If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
	1865	in there (they will point at the beginning of the I<next> character).
	1866	If the pointers behind pe1 or pe2 are non-NULL, they are the end
	1867	pointers beyond which scanning will not continue under any
	1868	circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
	1869	s2+l2 will be used as goal end pointers that will also stop the scan,
	1870	and which qualify towards defining a successful match: all the scans
	1871	that define an explicit length must reach their goal pointers for
	1872	a match to succeed).
	1873
	1874	For case-insensitiveness, the "casefolding" of Unicode is used
	1875	instead of upper/lowercasing both the characters, see
	1876	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	1877
	1878	=cut */
	1879	I32
	1880	Perl_ibcmp_utf8(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2)
	1881	{
	1882	register U8 p1 = (U8)s1;
	1883	register U8 p2 = (U8)s2;
	1884	register U8 e1 = 0, f1 = 0, *q1 = 0;
	1885	register U8 e2 = 0, f2 = 0, *q2 = 0;
	1886	STRLEN n1 = 0, n2 = 0;
	1887	U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
	1888	U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
	1889	U8 natbuf[1+1];
	1890	STRLEN foldlen1, foldlen2;
	1891	bool match;
	1892
	1893	if (pe1)
	1894	e1 = (U8*)pe1;
	1895	if (e1 == 0 \|\| (l1 && l1 < (UV)(e1 - (U8*)s1)))
	1896	f1 = (U8*)s1 + l1;
	1897	if (pe2)
	1898	e2 = (U8*)pe2;
	1899	if (e2 == 0 \|\| (l2 && l2 < (UV)(e2 - (U8*)s2)))
	1900	f2 = (U8*)s2 + l2;
	1901
	1902	if ((e1 == 0 && f1 == 0) \|\| (e2 == 0 && f2 == 0) \|\| (f1 == 0 && f2 == 0))
	1903	return 1; /* mismatch; possible infinite loop or false positive */
	1904
	1905	if (!u1 \|\| !u2)
	1906	natbuf[1] = 0; /* Need to terminate the buffer. */
	1907
	1908	while ((e1 == 0 \|\| p1 < e1) &&
	1909	(f1 == 0 \|\| p1 < f1) &&
	1910	(e2 == 0 \|\| p2 < e2) &&
	1911	(f2 == 0 \|\| p2 < f2)) {
	1912	if (n1 == 0) {
	1913	if (u1)
	1914	to_utf8_fold(p1, foldbuf1, &foldlen1);
	1915	else {
	1916	natbuf[0] = *p1;
	1917	to_utf8_fold(natbuf, foldbuf1, &foldlen1);
	1918	}
	1919	q1 = foldbuf1;
	1920	n1 = foldlen1;
	1921	}
	1922	if (n2 == 0) {
	1923	if (u2)
	1924	to_utf8_fold(p2, foldbuf2, &foldlen2);
	1925	else {
	1926	natbuf[0] = *p2;
	1927	to_utf8_fold(natbuf, foldbuf2, &foldlen2);
	1928	}
	1929	q2 = foldbuf2;
	1930	n2 = foldlen2;
	1931	}
	1932	while (n1 && n2) {
	1933	if ( UTF8SKIP(q1) != UTF8SKIP(q2) \|\|
	1934	(UTF8SKIP(q1) == 1 && q1 != q2) \|\|
	1935	memNE((char)q1, (char)q2, UTF8SKIP(q1)) )
	1936	return 1; /* mismatch */
	1937	n1 -= UTF8SKIP(q1);
	1938	q1 += UTF8SKIP(q1);
	1939	n2 -= UTF8SKIP(q2);
	1940	q2 += UTF8SKIP(q2);
	1941	}
	1942	if (n1 == 0)
	1943	p1 += u1 ? UTF8SKIP(p1) : 1;
	1944	if (n2 == 0)
	1945	p2 += u2 ? UTF8SKIP(p2) : 1;
	1946
	1947	}
	1948
	1949	/* A match is defined by all the scans that specified
	1950	* an explicit length reaching their final goals. */
	1951	match = (f1 == 0 \|\| p1 == f1) && (f2 == 0 \|\| p2 == f2);
	1952
	1953	if (match) {
	1954	if (pe1)
	1955	pe1 = (char)p1;
	1956	if (pe2)
	1957	pe2 = (char)p2;
	1958	}
	1959
	1960	return match ? 0 : 1; /* 0 match, 1 mismatch */
	1961	}
	1962