perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
	4	* others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* 'Well do I understand your speech,' he answered in the same language;
	17	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	18	* as is the custom in the West, if you wish to be answered?'
	19	*
	20	* ...the travellers perceived that the floor was paved with stones of many
	21	* hues; branching runes and strange devices intertwined beneath their feet.
	22	*/
	23
	24	#include "EXTERN.h"
	25	#define PERL_IN_UTF8_C
	26	#include "perl.h"
	27
	28	static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
	29
	30	/*
	31	=head1 Unicode Support
	32
	33	This file contains various utility functions for manipulating UTF8-encoded
	34	strings. For the uninitiated, this is a method of representing arbitrary
	35	Unicode characters as a variable number of bytes, in such a way that
	36	characters in the ASCII range are unmodified, and a zero byte never appears
	37	within non-zero characters.
	38
	39	=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
	40
	41	Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
	42	of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
	43	bytes available. The return value is the pointer to the byte after the
	44	end of the new character. In other words,
	45
	46	d = uvuni_to_utf8_flags(d, uv, flags);
	47
	48	or, in most cases,
	49
	50	d = uvuni_to_utf8(d, uv);
	51
	52	(which is equivalent to)
	53
	54	d = uvuni_to_utf8_flags(d, uv, 0);
	55
	56	is the recommended Unicode-aware way of saying
	57
	58	*(d++) = uv;
	59
	60	=cut
	61	*/
	62
	63	U8 *
	64	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	65	{
	66	if (ckWARN(WARN_UTF8)) {
	67	if (UNICODE_IS_SURROGATE(uv) &&
	68	!(flags & UNICODE_ALLOW_SURROGATE))
	69	Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
	70	else if (
	71	((uv >= 0xFDD0 && uv <= 0xFDEF &&
	72	!(flags & UNICODE_ALLOW_FDD0))
	73	\|\|
	74	((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
	75	!(flags & UNICODE_ALLOW_FFFF))) &&
	76	/* UNICODE_ALLOW_SUPER includes
	77	* FFFEs and FFFFs beyond 0x10FFFF. */
	78	((uv <= PERL_UNICODE_MAX) \|\|
	79	!(flags & UNICODE_ALLOW_SUPER))
	80	)
	81	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	82	"Unicode character 0x%04"UVxf" is illegal", uv);
	83	}
	84	if (UNI_IS_INVARIANT(uv)) {
	85	*d++ = (U8)UTF_TO_NATIVE(uv);
	86	return d;
	87	}
	88	#if defined(EBCDIC)
	89	else {
	90	STRLEN len = UNISKIP(uv);
	91	U8 *p = d+len-1;
	92	while (p > d) {
	93	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	94	uv >>= UTF_ACCUMULATION_SHIFT;
	95	}
	96	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	97	return d+len;
	98	}
	99	#else /* Non loop style */
	100	if (uv < 0x800) {
	101	*d++ = (U8)(( uv >> 6) \| 0xc0);
	102	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	103	return d;
	104	}
	105	if (uv < 0x10000) {
	106	*d++ = (U8)(( uv >> 12) \| 0xe0);
	107	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	108	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	109	return d;
	110	}
	111	if (uv < 0x200000) {
	112	*d++ = (U8)(( uv >> 18) \| 0xf0);
	113	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	114	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	115	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	116	return d;
	117	}
	118	if (uv < 0x4000000) {
	119	*d++ = (U8)(( uv >> 24) \| 0xf8);
	120	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	121	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	122	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	123	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	124	return d;
	125	}
	126	if (uv < 0x80000000) {
	127	*d++ = (U8)(( uv >> 30) \| 0xfc);
	128	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	129	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	130	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	131	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	132	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	133	return d;
	134	}
	135	#ifdef HAS_QUAD
	136	if (uv < UTF8_QUAD_MAX)
	137	#endif
	138	{
	139	d++ = 0xfe; / Can't match U+FEFF! */
	140	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	141	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	142	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	143	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	144	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	145	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	146	return d;
	147	}
	148	#ifdef HAS_QUAD
	149	{
	150	d++ = 0xff; / Can't match U+FFFE! */
	151	d++ = 0x80; / 6 Reserved bits */
	152	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	153	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
	154	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
	155	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
	156	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
	157	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	158	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	159	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	160	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	161	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	162	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	163	return d;
	164	}
	165	#endif
	166	#endif /* Loop style */
	167	}
	168
	169	U8 *
	170	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
	171	{
	172	return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
	173	}
	174
	175
	176	/*
	177	=for apidoc A|STRLEN|is_utf8_char|const U8 *s
	178
	179	Tests if some arbitrary number of bytes begins in a valid UTF-8
	180	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	181	UTF-8 character. The actual number of bytes in the UTF-8 character
	182	will be returned if it is valid, otherwise 0.
	183
	184	=cut */
	185	STRLEN
	186	Perl_is_utf8_char(pTHX_ const U8 *s)
	187	{
	188	U8 u = *s;
	189	STRLEN slen, len;
	190	UV uv, ouv;
	191
	192	if (UTF8_IS_INVARIANT(u))
	193	return 1;
	194
	195	if (!UTF8_IS_START(u))
	196	return 0;
	197
	198	len = UTF8SKIP(s);
	199
	200	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	201	return 0;
	202
	203	slen = len - 1;
	204	s++;
	205	u &= UTF_START_MASK(len);
	206	uv = u;
	207	ouv = uv;
	208	while (slen--) {
	209	if (!UTF8_IS_CONTINUATION(*s))
	210	return 0;
	211	uv = UTF8_ACCUMULATE(uv, *s);
	212	if (uv < ouv)
	213	return 0;
	214	ouv = uv;
	215	s++;
	216	}
	217
	218	if ((STRLEN)UNISKIP(uv) < len)
	219	return 0;
	220
	221	return len;
	222	}
	223
	224	/*
	225	=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
	226
	227	Returns true if first C<len> bytes of the given string form a valid
	228	UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
	229	not mean 'a string that contains code points above 0x7F encoded in UTF-8'
	230	because a valid ASCII string is a valid UTF-8 string.
	231
	232	=cut
	233	*/
	234
	235	bool
	236	Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
	237	{
	238	const U8* x = s;
	239	const U8* send;
	240	STRLEN c;
	241
	242	if (!len && s)
	243	len = strlen((const char *)s);
	244	send = s + len;
	245
	246	while (x < send) {
	247	/* Inline the easy bits of is_utf8_char() here for speed... */
	248	if (UTF8_IS_INVARIANT(*x))
	249	c = 1;
	250	else if (!UTF8_IS_START(*x))
	251	return FALSE;
	252	else {
	253	/* ... and call is_utf8_char() only if really needed. */
	254	c = is_utf8_char(x);
	255	if (!c)
	256	return FALSE;
	257	}
	258	x += c;
	259	}
	260	if (x != send)
	261	return FALSE;
	262
	263	return TRUE;
	264	}
	265
	266	/*
	267	=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **p
	268
	269	Like is_utf8_string but store the location of the failure in
	270	the last argument.
	271
	272	=cut
	273	*/
	274
	275	bool
	276	Perl_is_utf8_string_loc(pTHX_ const U8 s, STRLEN len, const U8 *p)
	277	{
	278	const U8* x = s;
	279	const U8* send;
	280	STRLEN c;
	281
	282	if (!len && s)
	283	len = strlen((const char *)s);
	284	send = s + len;
	285
	286	while (x < send) {
	287	/* Inline the easy bits of is_utf8_char() here for speed... */
	288	if (UTF8_IS_INVARIANT(*x))
	289	c = 1;
	290	else if (!UTF8_IS_START(*x)) {
	291	if (p)
	292	*p = x;
	293	return FALSE;
	294	}
	295	else {
	296	/* ... and call is_utf8_char() only if really needed. */
	297	c = is_utf8_char(x);
	298	if (!c) {
	299	if (p)
	300	*p = x;
	301	return FALSE;
	302	}
	303	}
	304	x += c;
	305	}
	306	if (x != send) {
	307	if (p)
	308	*p = x;
	309	return FALSE;
	310	}
	311
	312	return TRUE;
	313	}
	314
	315	/*
	316	=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
	317
	318	Bottom level UTF-8 decode routine.
	319	Returns the unicode code point value of the first character in the string C<s>
	320	which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
	321	C<retlen> will be set to the length, in bytes, of that character.
	322
	323	If C<s> does not point to a well-formed UTF-8 character, the behaviour
	324	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
	325	it is assumed that the caller will raise a warning, and this function
	326	will silently just set C<retlen> to C<-1> and return zero. If the
	327	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
	328	malformations will be given, C<retlen> will be set to the expected
	329	length of the UTF-8 character in bytes, and zero will be returned.
	330
	331	The C<flags> can also contain various flags to allow deviations from
	332	the strict UTF-8 encoding (see F<utf8.h>).
	333
	334	Most code should use utf8_to_uvchr() rather than call this directly.
	335
	336	=cut
	337	*/
	338
	339	UV
	340	Perl_utf8n_to_uvuni(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	341	{
	342	const U8 *s0 = s;
	343	UV uv = *s, ouv = 0;
	344	STRLEN len = 1;
	345	const bool dowarn = ckWARN_d(WARN_UTF8);
	346	const UV startbyte = *s;
	347	STRLEN expectlen = 0;
	348	U32 warning = 0;
	349
	350	/* This list is a superset of the UTF8_ALLOW_XXX. */
	351
	352	#define UTF8_WARN_EMPTY 1
	353	#define UTF8_WARN_CONTINUATION 2
	354	#define UTF8_WARN_NON_CONTINUATION 3
	355	#define UTF8_WARN_FE_FF 4
	356	#define UTF8_WARN_SHORT 5
	357	#define UTF8_WARN_OVERFLOW 6
	358	#define UTF8_WARN_SURROGATE 7
	359	#define UTF8_WARN_LONG 8
	360	#define UTF8_WARN_FFFF 9 /* Also FFFE. */
	361
	362	if (curlen == 0 &&
	363	!(flags & UTF8_ALLOW_EMPTY)) {
	364	warning = UTF8_WARN_EMPTY;
	365	goto malformed;
	366	}
	367
	368	if (UTF8_IS_INVARIANT(uv)) {
	369	if (retlen)
	370	*retlen = 1;
	371	return (UV) (NATIVE_TO_UTF(*s));
	372	}
	373
	374	if (UTF8_IS_CONTINUATION(uv) &&
	375	!(flags & UTF8_ALLOW_CONTINUATION)) {
	376	warning = UTF8_WARN_CONTINUATION;
	377	goto malformed;
	378	}
	379
	380	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	381	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	382	warning = UTF8_WARN_NON_CONTINUATION;
	383	goto malformed;
	384	}
	385
	386	#ifdef EBCDIC
	387	uv = NATIVE_TO_UTF(uv);
	388	#else
	389	if ((uv == 0xfe \|\| uv == 0xff) &&
	390	!(flags & UTF8_ALLOW_FE_FF)) {
	391	warning = UTF8_WARN_FE_FF;
	392	goto malformed;
	393	}
	394	#endif
	395
	396	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	397	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	398	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	399	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	400	#ifdef EBCDIC
	401	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	402	else { len = 7; uv &= 0x01; }
	403	#else
	404	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	405	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	406	else { len = 13; uv = 0; } /* whoa! */
	407	#endif
	408
	409	if (retlen)
	410	*retlen = len;
	411
	412	expectlen = len;
	413
	414	if ((curlen < expectlen) &&
	415	!(flags & UTF8_ALLOW_SHORT)) {
	416	warning = UTF8_WARN_SHORT;
	417	goto malformed;
	418	}
	419
	420	len--;
	421	s++;
	422	ouv = uv;
	423
	424	while (len--) {
	425	if (!UTF8_IS_CONTINUATION(*s) &&
	426	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	427	s--;
	428	warning = UTF8_WARN_NON_CONTINUATION;
	429	goto malformed;
	430	}
	431	else
	432	uv = UTF8_ACCUMULATE(uv, *s);
	433	if (!(uv > ouv)) {
	434	/* These cannot be allowed. */
	435	if (uv == ouv) {
	436	if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
	437	warning = UTF8_WARN_LONG;
	438	goto malformed;
	439	}
	440	}
	441	else { /* uv < ouv */
	442	/* This cannot be allowed. */
	443	warning = UTF8_WARN_OVERFLOW;
	444	goto malformed;
	445	}
	446	}
	447	s++;
	448	ouv = uv;
	449	}
	450
	451	if (UNICODE_IS_SURROGATE(uv) &&
	452	!(flags & UTF8_ALLOW_SURROGATE)) {
	453	warning = UTF8_WARN_SURROGATE;
	454	goto malformed;
	455	} else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
	456	!(flags & UTF8_ALLOW_LONG)) {
	457	warning = UTF8_WARN_LONG;
	458	goto malformed;
	459	} else if (UNICODE_IS_ILLEGAL(uv) &&
	460	!(flags & UTF8_ALLOW_FFFF)) {
	461	warning = UTF8_WARN_FFFF;
	462	goto malformed;
	463	}
	464
	465	return uv;
	466
	467	malformed:
	468
	469	if (flags & UTF8_CHECK_ONLY) {
	470	if (retlen)
	471	*retlen = -1;
	472	return 0;
	473	}
	474
	475	if (dowarn) {
	476	SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
	477
	478	switch (warning) {
	479	case 0: /* Intentionally empty. */ break;
	480	case UTF8_WARN_EMPTY:
	481	Perl_sv_catpv(aTHX_ sv, "(empty string)");
	482	break;
	483	case UTF8_WARN_CONTINUATION:
	484	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	485	break;
	486	case UTF8_WARN_NON_CONTINUATION:
	487	if (s == s0)
	488	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	489	(UV)s[1], startbyte);
	490	else
	491	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	492	(UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
	493
	494	break;
	495	case UTF8_WARN_FE_FF:
	496	Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
	497	break;
	498	case UTF8_WARN_SHORT:
	499	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	500	curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
	501	expectlen = curlen; /* distance for caller to skip */
	502	break;
	503	case UTF8_WARN_OVERFLOW:
	504	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	505	ouv, *s, startbyte);
	506	break;
	507	case UTF8_WARN_SURROGATE:
	508	Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
	509	break;
	510	case UTF8_WARN_LONG:
	511	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	512	expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	513	break;
	514	case UTF8_WARN_FFFF:
	515	Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
	516	break;
	517	default:
	518	Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
	519	break;
	520	}
	521
	522	if (warning) {
	523	char *s = SvPVX(sv);
	524
	525	if (PL_op)
	526	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	527	"%s in %s", s, OP_DESC(PL_op));
	528	else
	529	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
	530	}
	531	}
	532
	533	if (retlen)
	534	*retlen = expectlen ? expectlen : len;
	535
	536	return 0;
	537	}
	538
	539	/*
	540	=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
	541
	542	Returns the native character value of the first character in the string C<s>
	543	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	544	length, in bytes, of that character.
	545
	546	If C<s> does not point to a well-formed UTF-8 character, zero is
	547	returned and retlen is set, if possible, to -1.
	548
	549	=cut
	550	*/
	551
	552	UV
	553	Perl_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
	554	{
	555	return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
	556	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	557	}
	558
	559	/*
	560	=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
	561
	562	Returns the Unicode code point of the first character in the string C<s>
	563	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	564	length, in bytes, of that character.
	565
	566	This function should only be used when returned UV is considered
	567	an index into the Unicode semantic tables (e.g. swashes).
	568
	569	If C<s> does not point to a well-formed UTF-8 character, zero is
	570	returned and retlen is set, if possible, to -1.
	571
	572	=cut
	573	*/
	574
	575	UV
	576	Perl_utf8_to_uvuni(pTHX_ const U8 s, STRLEN retlen)
	577	{
	578	/* Call the low level routine asking for checks */
	579	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
	580	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	581	}
	582
	583	/*
	584	=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
	585
	586	Return the length of the UTF-8 char encoded string C<s> in characters.
	587	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	588	up past C<e>, issues a UTF-8 warning and returns the count reached so far.
	589
	590	=cut
	591	*/
	592
	593	STRLEN
	594	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	595	{
	596	STRLEN len = 0;
	597
	598	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	599	* the bitops (especially ~) can create illegal UTF-8.
	600	* In other words: in Perl UTF-8 is not just for Unicode. */
	601
	602	if (e < s) {
	603	if (ckWARN_d(WARN_UTF8)) {
	604	if (PL_op)
	605	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	606	"%s in %s", unees, OP_DESC(PL_op));
	607	else
	608	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	609	}
	610	return 0;
	611	}
	612	while (s < e) {
	613	U8 t = UTF8SKIP(s);
	614
	615	if (e - s < t) {
	616	if (ckWARN_d(WARN_UTF8)) {
	617	if (PL_op)
	618	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	619	unees, OP_DESC(PL_op));
	620	else
	621	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	622	}
	623	return len;
	624	}
	625	s += t;
	626	len++;
	627	}
	628
	629	return len;
	630	}
	631
	632	/*
	633	=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
	634
	635	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	636	and C<b>.
	637
	638	WARNING: use only if you know that the pointers point inside the
	639	same UTF-8 buffer.
	640
	641	=cut
	642	*/
	643
	644	IV
	645	Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
	646	{
	647	IV off = 0;
	648
	649	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	650	* the bitops (especially ~) can create illegal UTF-8.
	651	* In other words: in Perl UTF-8 is not just for Unicode. */
	652
	653	if (a < b) {
	654	while (a < b) {
	655	const U8 c = UTF8SKIP(a);
	656
	657	if (b - a < c) {
	658	if (ckWARN_d(WARN_UTF8)) {
	659	if (PL_op)
	660	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	661	"%s in %s", unees, OP_DESC(PL_op));
	662	else
	663	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	664	}
	665	return off;
	666	}
	667	a += c;
	668	off--;
	669	}
	670	}
	671	else {
	672	while (b < a) {
	673	U8 c = UTF8SKIP(b);
	674
	675	if (a - b < c) {
	676	if (ckWARN_d(WARN_UTF8)) {
	677	if (PL_op)
	678	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	679	"%s in %s", unees, OP_DESC(PL_op));
	680	else
	681	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	682	}
	683	return off;
	684	}
	685	b += c;
	686	off++;
	687	}
	688	}
	689
	690	return off;
	691	}
	692
	693	/*
	694	=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
	695
	696	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	697	forward or backward.
	698
	699	WARNING: do not use the following unless you know C<off> is within
	700	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	701	on the first byte of character or just after the last byte of a character.
	702
	703	=cut
	704	*/
	705
	706	U8 *
	707	Perl_utf8_hop(pTHX_ U8 *s, I32 off)
	708	{
	709	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	710	* the bitops (especially ~) can create illegal UTF-8.
	711	* In other words: in Perl UTF-8 is not just for Unicode. */
	712
	713	if (off >= 0) {
	714	while (off--)
	715	s += UTF8SKIP(s);
	716	}
	717	else {
	718	while (off++) {
	719	s--;
	720	while (UTF8_IS_CONTINUATION(*s))
	721	s--;
	722	}
	723	}
	724	return s;
	725	}
	726
	727	/*
	728	=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
	729
	730	Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
	731	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	732	updates len to contain the new length.
	733	Returns zero on failure, setting C<len> to -1.
	734
	735	=cut
	736	*/
	737
	738	U8 *
	739	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	740	{
	741	U8 *send;
	742	U8 *d;
	743	U8 *save = s;
	744
	745	/* ensure valid UTF-8 and chars < 256 before updating string */
	746	for (send = s + *len; s < send; ) {
	747	U8 c = *s++;
	748
	749	if (!UTF8_IS_INVARIANT(c) &&
	750	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	751	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	752	*len = -1;
	753	return 0;
	754	}
	755	}
	756
	757	d = s = save;
	758	while (s < send) {
	759	STRLEN ulen;
	760	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	761	s += ulen;
	762	}
	763	*d = '\0';
	764	*len = d - save;
	765	return save;
	766	}
	767
	768	/*
	769	=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
	770
	771	Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
	772	Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	773	the newly-created string, and updates C<len> to contain the new
	774	length. Returns the original string if no conversion occurs, C<len>
	775	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	776	0 if C<s> is converted or contains all 7bit characters.
	777
	778	=cut
	779	*/
	780
	781	U8 *
	782	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN len, bool *is_utf8)
	783	{
	784	U8 *d;
	785	const U8 *start = s;
	786	const U8 *send;
	787	I32 count = 0;
	788
	789	if (!*is_utf8)
	790	return (U8 *)start;
	791
	792	/* ensure valid UTF-8 and chars < 256 before converting string */
	793	for (send = s + *len; s < send;) {
	794	U8 c = *s++;
	795	if (!UTF8_IS_INVARIANT(c)) {
	796	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	797	(c = *s++) && UTF8_IS_CONTINUATION(c))
	798	count++;
	799	else
	800	return (U8 *)start;
	801	}
	802	}
	803
	804	*is_utf8 = 0;
	805
	806	Newz(801, d, (*len) - count + 1, U8);
	807	s = start; start = d;
	808	while (s < send) {
	809	U8 c = *s++;
	810	if (!UTF8_IS_INVARIANT(c)) {
	811	/* Then it is two-byte encoded */
	812	c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
	813	c = ASCII_TO_NATIVE(c);
	814	}
	815	*d++ = c;
	816	}
	817	*d = '\0';
	818	*len = d - start;
	819	return (U8 *)start;
	820	}
	821
	822	/*
	823	=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
	824
	825	Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
	826	Returns a pointer to the newly-created string, and sets C<len> to
	827	reflect the new length.
	828
	829	If you want to convert to UTF-8 from other encodings than ASCII,
	830	see sv_recode_to_utf8().
	831
	832	=cut
	833	*/
	834
	835	U8*
	836	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN len)
	837	{
	838	const U8 * const send = s + (*len);
	839	U8 *d;
	840	U8 *dst;
	841
	842	Newz(801, d, (len) 2 + 1, U8);
	843	dst = d;
	844
	845	while (s < send) {
	846	const UV uv = NATIVE_TO_ASCII(*s++);
	847	if (UNI_IS_INVARIANT(uv))
	848	*d++ = (U8)UTF_TO_NATIVE(uv);
	849	else {
	850	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
	851	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
	852	}
	853	}
	854	*d = '\0';
	855	*len = d-dst;
	856	return dst;
	857	}
	858
	859	/*
	860	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	861	*
	862	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	863	* We optimize for native, for obvious reasons. */
	864
	865	U8*
	866	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	867	{
	868	U8* pend;
	869	U8* dstart = d;
	870
	871	if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
	872	d[0] = 0;
	873	*newlen = 1;
	874	return d;
	875	}
	876
	877	if (bytelen & 1)
	878	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
	879
	880	pend = p + bytelen;
	881
	882	while (p < pend) {
	883	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	884	p += 2;
	885	if (uv < 0x80) {
	886	*d++ = (U8)uv;
	887	continue;
	888	}
	889	if (uv < 0x800) {
	890	*d++ = (U8)(( uv >> 6) \| 0xc0);
	891	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	892	continue;
	893	}
	894	if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
	895	UV low = (p[0] << 8) + p[1];
	896	p += 2;
	897	if (low < 0xdc00 \|\| low >= 0xdfff)
	898	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	899	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	900	}
	901	if (uv < 0x10000) {
	902	*d++ = (U8)(( uv >> 12) \| 0xe0);
	903	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	904	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	905	continue;
	906	}
	907	else {
	908	*d++ = (U8)(( uv >> 18) \| 0xf0);
	909	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	910	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	911	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	912	continue;
	913	}
	914	}
	915	*newlen = d - dstart;
	916	return d;
	917	}
	918
	919	/* Note: this one is slightly destructive of the source. */
	920
	921	U8*
	922	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	923	{
	924	U8* s = (U8*)p;
	925	U8* send = s + bytelen;
	926	while (s < send) {
	927	U8 tmp = s[0];
	928	s[0] = s[1];
	929	s[1] = tmp;
	930	s += 2;
	931	}
	932	return utf16_to_utf8(p, d, bytelen, newlen);
	933	}
	934
	935	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	936
	937	bool
	938	Perl_is_uni_alnum(pTHX_ UV c)
	939	{
	940	U8 tmpbuf[UTF8_MAXBYTES+1];
	941	uvchr_to_utf8(tmpbuf, c);
	942	return is_utf8_alnum(tmpbuf);
	943	}
	944
	945	bool
	946	Perl_is_uni_alnumc(pTHX_ UV c)
	947	{
	948	U8 tmpbuf[UTF8_MAXBYTES+1];
	949	uvchr_to_utf8(tmpbuf, c);
	950	return is_utf8_alnumc(tmpbuf);
	951	}
	952
	953	bool
	954	Perl_is_uni_idfirst(pTHX_ UV c)
	955	{
	956	U8 tmpbuf[UTF8_MAXBYTES+1];
	957	uvchr_to_utf8(tmpbuf, c);
	958	return is_utf8_idfirst(tmpbuf);
	959	}
	960
	961	bool
	962	Perl_is_uni_alpha(pTHX_ UV c)
	963	{
	964	U8 tmpbuf[UTF8_MAXBYTES+1];
	965	uvchr_to_utf8(tmpbuf, c);
	966	return is_utf8_alpha(tmpbuf);
	967	}
	968
	969	bool
	970	Perl_is_uni_ascii(pTHX_ UV c)
	971	{
	972	U8 tmpbuf[UTF8_MAXBYTES+1];
	973	uvchr_to_utf8(tmpbuf, c);
	974	return is_utf8_ascii(tmpbuf);
	975	}
	976
	977	bool
	978	Perl_is_uni_space(pTHX_ UV c)
	979	{
	980	U8 tmpbuf[UTF8_MAXBYTES+1];
	981	uvchr_to_utf8(tmpbuf, c);
	982	return is_utf8_space(tmpbuf);
	983	}
	984
	985	bool
	986	Perl_is_uni_digit(pTHX_ UV c)
	987	{
	988	U8 tmpbuf[UTF8_MAXBYTES+1];
	989	uvchr_to_utf8(tmpbuf, c);
	990	return is_utf8_digit(tmpbuf);
	991	}
	992
	993	bool
	994	Perl_is_uni_upper(pTHX_ UV c)
	995	{
	996	U8 tmpbuf[UTF8_MAXBYTES+1];
	997	uvchr_to_utf8(tmpbuf, c);
	998	return is_utf8_upper(tmpbuf);
	999	}
	1000
	1001	bool
	1002	Perl_is_uni_lower(pTHX_ UV c)
	1003	{
	1004	U8 tmpbuf[UTF8_MAXBYTES+1];
	1005	uvchr_to_utf8(tmpbuf, c);
	1006	return is_utf8_lower(tmpbuf);
	1007	}
	1008
	1009	bool
	1010	Perl_is_uni_cntrl(pTHX_ UV c)
	1011	{
	1012	U8 tmpbuf[UTF8_MAXBYTES+1];
	1013	uvchr_to_utf8(tmpbuf, c);
	1014	return is_utf8_cntrl(tmpbuf);
	1015	}
	1016
	1017	bool
	1018	Perl_is_uni_graph(pTHX_ UV c)
	1019	{
	1020	U8 tmpbuf[UTF8_MAXBYTES+1];
	1021	uvchr_to_utf8(tmpbuf, c);
	1022	return is_utf8_graph(tmpbuf);
	1023	}
	1024
	1025	bool
	1026	Perl_is_uni_print(pTHX_ UV c)
	1027	{
	1028	U8 tmpbuf[UTF8_MAXBYTES+1];
	1029	uvchr_to_utf8(tmpbuf, c);
	1030	return is_utf8_print(tmpbuf);
	1031	}
	1032
	1033	bool
	1034	Perl_is_uni_punct(pTHX_ UV c)
	1035	{
	1036	U8 tmpbuf[UTF8_MAXBYTES+1];
	1037	uvchr_to_utf8(tmpbuf, c);
	1038	return is_utf8_punct(tmpbuf);
	1039	}
	1040
	1041	bool
	1042	Perl_is_uni_xdigit(pTHX_ UV c)
	1043	{
	1044	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1045	uvchr_to_utf8(tmpbuf, c);
	1046	return is_utf8_xdigit(tmpbuf);
	1047	}
	1048
	1049	UV
	1050	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	1051	{
	1052	uvchr_to_utf8(p, c);
	1053	return to_utf8_upper(p, p, lenp);
	1054	}
	1055
	1056	UV
	1057	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	1058	{
	1059	uvchr_to_utf8(p, c);
	1060	return to_utf8_title(p, p, lenp);
	1061	}
	1062
	1063	UV
	1064	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1065	{
	1066	uvchr_to_utf8(p, c);
	1067	return to_utf8_lower(p, p, lenp);
	1068	}
	1069
	1070	UV
	1071	Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
	1072	{
	1073	uvchr_to_utf8(p, c);
	1074	return to_utf8_fold(p, p, lenp);
	1075	}
	1076
	1077	/* for now these all assume no locale info available for Unicode > 255 */
	1078
	1079	bool
	1080	Perl_is_uni_alnum_lc(pTHX_ UV c)
	1081	{
	1082	return is_uni_alnum(c); /* XXX no locale support yet */
	1083	}
	1084
	1085	bool
	1086	Perl_is_uni_alnumc_lc(pTHX_ UV c)
	1087	{
	1088	return is_uni_alnumc(c); /* XXX no locale support yet */
	1089	}
	1090
	1091	bool
	1092	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1093	{
	1094	return is_uni_idfirst(c); /* XXX no locale support yet */
	1095	}
	1096
	1097	bool
	1098	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1099	{
	1100	return is_uni_alpha(c); /* XXX no locale support yet */
	1101	}
	1102
	1103	bool
	1104	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1105	{
	1106	return is_uni_ascii(c); /* XXX no locale support yet */
	1107	}
	1108
	1109	bool
	1110	Perl_is_uni_space_lc(pTHX_ UV c)
	1111	{
	1112	return is_uni_space(c); /* XXX no locale support yet */
	1113	}
	1114
	1115	bool
	1116	Perl_is_uni_digit_lc(pTHX_ UV c)
	1117	{
	1118	return is_uni_digit(c); /* XXX no locale support yet */
	1119	}
	1120
	1121	bool
	1122	Perl_is_uni_upper_lc(pTHX_ UV c)
	1123	{
	1124	return is_uni_upper(c); /* XXX no locale support yet */
	1125	}
	1126
	1127	bool
	1128	Perl_is_uni_lower_lc(pTHX_ UV c)
	1129	{
	1130	return is_uni_lower(c); /* XXX no locale support yet */
	1131	}
	1132
	1133	bool
	1134	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1135	{
	1136	return is_uni_cntrl(c); /* XXX no locale support yet */
	1137	}
	1138
	1139	bool
	1140	Perl_is_uni_graph_lc(pTHX_ UV c)
	1141	{
	1142	return is_uni_graph(c); /* XXX no locale support yet */
	1143	}
	1144
	1145	bool
	1146	Perl_is_uni_print_lc(pTHX_ UV c)
	1147	{
	1148	return is_uni_print(c); /* XXX no locale support yet */
	1149	}
	1150
	1151	bool
	1152	Perl_is_uni_punct_lc(pTHX_ UV c)
	1153	{
	1154	return is_uni_punct(c); /* XXX no locale support yet */
	1155	}
	1156
	1157	bool
	1158	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1159	{
	1160	return is_uni_xdigit(c); /* XXX no locale support yet */
	1161	}
	1162
	1163	U32
	1164	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1165	{
	1166	/* XXX returns only the first character -- do not use XXX */
	1167	/* XXX no locale support yet */
	1168	STRLEN len;
	1169	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1170	return (U32)to_uni_upper(c, tmpbuf, &len);
	1171	}
	1172
	1173	U32
	1174	Perl_to_uni_title_lc(pTHX_ U32 c)
	1175	{
	1176	/* XXX returns only the first character XXX -- do not use XXX */
	1177	/* XXX no locale support yet */
	1178	STRLEN len;
	1179	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1180	return (U32)to_uni_title(c, tmpbuf, &len);
	1181	}
	1182
	1183	U32
	1184	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1185	{
	1186	/* XXX returns only the first character -- do not use XXX */
	1187	/* XXX no locale support yet */
	1188	STRLEN len;
	1189	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1190	return (U32)to_uni_lower(c, tmpbuf, &len);
	1191	}
	1192
	1193	bool
	1194	Perl_is_utf8_alnum(pTHX_ const U8 *p)
	1195	{
	1196	if (!is_utf8_char(p))
	1197	return FALSE;
	1198	if (!PL_utf8_alnum)
	1199	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1200	* descendant of isalnum(3), in other words, it doesn't
	1201	* contain the '_'. --jhi */
	1202	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
	1203	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1204	/* return p == '_' \|\| is_utf8_alpha(p) \|\| is_utf8_digit(p); /
	1205	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	1206	if (!PL_utf8_alnum)
	1207	PL_utf8_alnum = swash_init("utf8", "",
	1208	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	1209	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1210	#endif
	1211	}
	1212
	1213	bool
	1214	Perl_is_utf8_alnumc(pTHX_ const U8 *p)
	1215	{
	1216	if (!is_utf8_char(p))
	1217	return FALSE;
	1218	if (!PL_utf8_alnum)
	1219	PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
	1220	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1221	/* return is_utf8_alpha(p) \|\| is_utf8_digit(p); */
	1222	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	1223	if (!PL_utf8_alnum)
	1224	PL_utf8_alnum = swash_init("utf8", "",
	1225	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	1226	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1227	#endif
	1228	}
	1229
	1230	bool
	1231	Perl_is_utf8_idfirst(pTHX_ const U8 p) / The naming is historical. */
	1232	{
	1233	if (*p == '_')
	1234	return TRUE;
	1235	if (!is_utf8_char(p))
	1236	return FALSE;
	1237	if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
	1238	PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
	1239	return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
	1240	}
	1241
	1242	bool
	1243	Perl_is_utf8_idcont(pTHX_ const U8 *p)
	1244	{
	1245	if (*p == '_')
	1246	return TRUE;
	1247	if (!is_utf8_char(p))
	1248	return FALSE;
	1249	if (!PL_utf8_idcont)
	1250	PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
	1251	return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
	1252	}
	1253
	1254	bool
	1255	Perl_is_utf8_alpha(pTHX_ const U8 *p)
	1256	{
	1257	if (!is_utf8_char(p))
	1258	return FALSE;
	1259	if (!PL_utf8_alpha)
	1260	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
	1261	return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
	1262	}
	1263
	1264	bool
	1265	Perl_is_utf8_ascii(pTHX_ const U8 *p)
	1266	{
	1267	if (!is_utf8_char(p))
	1268	return FALSE;
	1269	if (!PL_utf8_ascii)
	1270	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
	1271	return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
	1272	}
	1273
	1274	bool
	1275	Perl_is_utf8_space(pTHX_ const U8 *p)
	1276	{
	1277	if (!is_utf8_char(p))
	1278	return FALSE;
	1279	if (!PL_utf8_space)
	1280	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
	1281	return swash_fetch(PL_utf8_space, p, TRUE) != 0;
	1282	}
	1283
	1284	bool
	1285	Perl_is_utf8_digit(pTHX_ const U8 *p)
	1286	{
	1287	if (!is_utf8_char(p))
	1288	return FALSE;
	1289	if (!PL_utf8_digit)
	1290	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
	1291	return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
	1292	}
	1293
	1294	bool
	1295	Perl_is_utf8_upper(pTHX_ const U8 *p)
	1296	{
	1297	if (!is_utf8_char(p))
	1298	return FALSE;
	1299	if (!PL_utf8_upper)
	1300	PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
	1301	return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
	1302	}
	1303
	1304	bool
	1305	Perl_is_utf8_lower(pTHX_ const U8 *p)
	1306	{
	1307	if (!is_utf8_char(p))
	1308	return FALSE;
	1309	if (!PL_utf8_lower)
	1310	PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
	1311	return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
	1312	}
	1313
	1314	bool
	1315	Perl_is_utf8_cntrl(pTHX_ const U8 *p)
	1316	{
	1317	if (!is_utf8_char(p))
	1318	return FALSE;
	1319	if (!PL_utf8_cntrl)
	1320	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
	1321	return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
	1322	}
	1323
	1324	bool
	1325	Perl_is_utf8_graph(pTHX_ const U8 *p)
	1326	{
	1327	if (!is_utf8_char(p))
	1328	return FALSE;
	1329	if (!PL_utf8_graph)
	1330	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
	1331	return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
	1332	}
	1333
	1334	bool
	1335	Perl_is_utf8_print(pTHX_ const U8 *p)
	1336	{
	1337	if (!is_utf8_char(p))
	1338	return FALSE;
	1339	if (!PL_utf8_print)
	1340	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
	1341	return swash_fetch(PL_utf8_print, p, TRUE) != 0;
	1342	}
	1343
	1344	bool
	1345	Perl_is_utf8_punct(pTHX_ const U8 *p)
	1346	{
	1347	if (!is_utf8_char(p))
	1348	return FALSE;
	1349	if (!PL_utf8_punct)
	1350	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
	1351	return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
	1352	}
	1353
	1354	bool
	1355	Perl_is_utf8_xdigit(pTHX_ const U8 *p)
	1356	{
	1357	if (!is_utf8_char(p))
	1358	return FALSE;
	1359	if (!PL_utf8_xdigit)
	1360	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
	1361	return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
	1362	}
	1363
	1364	bool
	1365	Perl_is_utf8_mark(pTHX_ const U8 *p)
	1366	{
	1367	if (!is_utf8_char(p))
	1368	return FALSE;
	1369	if (!PL_utf8_mark)
	1370	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
	1371	return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
	1372	}
	1373
	1374	/*
	1375	=for apidoc A\|UV\|to_utf8_case\|U8 p\|U8 ustrp\|STRLEN lenp\|SV swash\|char normal\|char *special
	1376
	1377	The "p" contains the pointer to the UTF-8 string encoding
	1378	the character that is being converted.
	1379
	1380	The "ustrp" is a pointer to the character buffer to put the
	1381	conversion result to. The "lenp" is a pointer to the length
	1382	of the result.
	1383
	1384	The "swashp" is a pointer to the swash to use.
	1385
	1386	Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
	1387	and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
	1388	but not always, a multicharacter mapping), is tried first.
	1389
	1390	The "special" is a string like "utf8::ToSpecLower", which means the
	1391	hash %utf8::ToSpecLower. The access to the hash is through
	1392	Perl_to_utf8_case().
	1393
	1394	The "normal" is a string like "ToLower" which means the swash
	1395	%utf8::ToLower.
	1396
	1397	=cut */
	1398
	1399	UV
	1400	Perl_to_utf8_case(pTHX_ const U8 p, U8 ustrp, STRLEN lenp, SV swashp, const char normal, const char *special)
	1401	{
	1402	UV uv1;
	1403	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1404	STRLEN len = 0;
	1405
	1406	const UV uv0 = utf8_to_uvchr(p, 0);
	1407	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
	1408	* are necessary in EBCDIC, they are redundant no-ops
	1409	* in ASCII-ish platforms, and hopefully optimized away. */
	1410	uv1 = NATIVE_TO_UNI(uv0);
	1411	uvuni_to_utf8(tmpbuf, uv1);
	1412
	1413	if (!swashp) / load on-demand */
	1414	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	1415
	1416	/* The 0xDF is the only special casing Unicode code point below 0x100. */
	1417	if (special && (uv1 == 0xDF \|\| uv1 > 0xFF)) {
	1418	/* It might be "special" (sometimes, but not always,
	1419	* a multicharacter mapping) */
	1420	HV *hv;
	1421	SV **svp;
	1422
	1423	if ((hv = get_hv(special, FALSE)) &&
	1424	(svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
	1425	(*svp)) {
	1426	char *s;
	1427
	1428	s = SvPV(*svp, len);
	1429	if (len == 1)
	1430	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
	1431	else {
	1432	#ifdef EBCDIC
	1433	/* If we have EBCDIC we need to remap the characters
	1434	* since any characters in the low 256 are Unicode
	1435	* code points, not EBCDIC. */
	1436	U8 t = (U8)s, tend = t + len, d;
	1437
	1438	d = tmpbuf;
	1439	if (SvUTF8(*svp)) {
	1440	STRLEN tlen = 0;
	1441
	1442	while (t < tend) {
	1443	UV c = utf8_to_uvchr(t, &tlen);
	1444	if (tlen > 0) {
	1445	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	1446	t += tlen;
	1447	}
	1448	else
	1449	break;
	1450	}
	1451	}
	1452	else {
	1453	while (t < tend) {
	1454	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	1455	t++;
	1456	}
	1457	}
	1458	len = d - tmpbuf;
	1459	Copy(tmpbuf, ustrp, len, U8);
	1460	#else
	1461	Copy(s, ustrp, len, U8);
	1462	#endif
	1463	}
	1464	}
	1465	}
	1466
	1467	if (!len && *swashp) {
	1468	UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
	1469
	1470	if (uv2) {
	1471	/* It was "normal" (a single character mapping). */
	1472	UV uv3 = UNI_TO_NATIVE(uv2);
	1473
	1474	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	1475	}
	1476	}
	1477
	1478	if (!len) /* Neither: just copy. */
	1479	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
	1480
	1481	if (lenp)
	1482	*lenp = len;
	1483
	1484	return len ? utf8_to_uvchr(ustrp, 0) : 0;
	1485	}
	1486
	1487	/*
	1488	=for apidoc A\|UV\|to_utf8_upper\|const U8 p\|U8 ustrp\|STRLEN *lenp
	1489
	1490	Convert the UTF-8 encoded character at p to its uppercase version and
	1491	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1492	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1493	the uppercase version may be longer than the original character.
	1494
	1495	The first character of the uppercased version is returned
	1496	(but note, as explained above, that there may be more.)
	1497
	1498	=cut */
	1499
	1500	UV
	1501	Perl_to_utf8_upper(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1502	{
	1503	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1504	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
	1505	}
	1506
	1507	/*
	1508	=for apidoc A\|UV\|to_utf8_title\|const U8 p\|U8 ustrp\|STRLEN *lenp
	1509
	1510	Convert the UTF-8 encoded character at p to its titlecase version and
	1511	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1512	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1513	titlecase version may be longer than the original character.
	1514
	1515	The first character of the titlecased version is returned
	1516	(but note, as explained above, that there may be more.)
	1517
	1518	=cut */
	1519
	1520	UV
	1521	Perl_to_utf8_title(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1522	{
	1523	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1524	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
	1525	}
	1526
	1527	/*
	1528	=for apidoc A\|UV\|to_utf8_lower\|const U8 p\|U8 ustrp\|STRLEN *lenp
	1529
	1530	Convert the UTF-8 encoded character at p to its lowercase version and
	1531	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1532	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1533	lowercase version may be longer than the original character.
	1534
	1535	The first character of the lowercased version is returned
	1536	(but note, as explained above, that there may be more.)
	1537
	1538	=cut */
	1539
	1540	UV
	1541	Perl_to_utf8_lower(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1542	{
	1543	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1544	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
	1545	}
	1546
	1547	/*
	1548	=for apidoc A\|UV\|to_utf8_fold\|const U8 p\|U8 ustrp\|STRLEN *lenp
	1549
	1550	Convert the UTF-8 encoded character at p to its foldcase version and
	1551	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1552	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1553	foldcase version may be longer than the original character (up to
	1554	three characters).
	1555
	1556	The first character of the foldcased version is returned
	1557	(but note, as explained above, that there may be more.)
	1558
	1559	=cut */
	1560
	1561	UV
	1562	Perl_to_utf8_fold(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1563	{
	1564	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1565	&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
	1566	}
	1567
	1568	/* a "swash" is a swatch hash */
	1569
	1570	SV*
	1571	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
	1572	{
	1573	SV* retval;
	1574	SV* tokenbufsv = sv_newmortal();
	1575	dSP;
	1576	const size_t pkg_len = strlen(pkg);
	1577	const size_t name_len = strlen(name);
	1578	HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
	1579	SV* errsv_save;
	1580
	1581	PUSHSTACKi(PERLSI_MAGIC);
	1582	ENTER;
	1583	SAVEI32(PL_hints);
	1584	PL_hints = 0;
	1585	save_re_context();
	1586	if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
	1587	ENTER;
	1588	errsv_save = newSVsv(ERRSV);
	1589	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	1590	Nullsv);
	1591	if (!SvTRUE(ERRSV))
	1592	sv_setsv(ERRSV, errsv_save);
	1593	SvREFCNT_dec(errsv_save);
	1594	LEAVE;
	1595	}
	1596	SPAGAIN;
	1597	PUSHMARK(SP);
	1598	EXTEND(SP,5);
	1599	PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
	1600	PUSHs(sv_2mortal(newSVpvn(name, name_len)));
	1601	PUSHs(listsv);
	1602	PUSHs(sv_2mortal(newSViv(minbits)));
	1603	PUSHs(sv_2mortal(newSViv(none)));
	1604	PUTBACK;
	1605	if (IN_PERL_COMPILETIME) {
	1606	/* XXX ought to be handled by lex_start */
	1607	SAVEI32(PL_in_my);
	1608	PL_in_my = 0;
	1609	sv_setpv(tokenbufsv, PL_tokenbuf);
	1610	}
	1611	errsv_save = newSVsv(ERRSV);
	1612	if (call_method("SWASHNEW", G_SCALAR))
	1613	retval = newSVsv(*PL_stack_sp--);
	1614	else
	1615	retval = &PL_sv_undef;
	1616	if (!SvTRUE(ERRSV))
	1617	sv_setsv(ERRSV, errsv_save);
	1618	SvREFCNT_dec(errsv_save);
	1619	LEAVE;
	1620	POPSTACK;
	1621	if (IN_PERL_COMPILETIME) {
	1622	STRLEN len;
	1623	const char* pv = SvPV(tokenbufsv, len);
	1624
	1625	Copy(pv, PL_tokenbuf, len+1, char);
	1626	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
	1627	}
	1628	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	1629	if (SvPOK(retval))
	1630	Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
	1631	retval);
	1632	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	1633	}
	1634	return retval;
	1635	}
	1636
	1637
	1638	/* This API is wrong for special case conversions since we may need to
	1639	* return several Unicode characters for a single Unicode character
	1640	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	1641	* the lower-level routine, and it is similarly broken for returning
	1642	* multiple values. --jhi */
	1643	UV
	1644	Perl_swash_fetch(pTHX_ SV sv, const U8 ptr, bool do_utf8)
	1645	{
	1646	HV* hv = (HV*)SvRV(sv);
	1647	U32 klen;
	1648	U32 off;
	1649	STRLEN slen;
	1650	STRLEN needents;
	1651	U8 *tmps = NULL;
	1652	U32 bit;
	1653	SV *retval;
	1654	U8 tmputf8[2];
	1655	UV c = NATIVE_TO_ASCII(*ptr);
	1656
	1657	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	1658	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
	1659	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
	1660	ptr = tmputf8;
	1661	}
	1662	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	1663	* then the "swatch" is a vec() for al the chars which start
	1664	* with 0xAA..0xYY
	1665	* So the key in the hash (klen) is length of encoded char -1
	1666	*/
	1667	klen = UTF8SKIP(ptr) - 1;
	1668	off = ptr[klen];
	1669
	1670	if (klen == 0)
	1671	{
	1672	/* If char in invariant then swatch is for all the invariant chars
	1673	* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
	1674	*/
	1675	needents = UTF_CONTINUATION_MARK;
	1676	off = NATIVE_TO_UTF(ptr[klen]);
	1677	}
	1678	else
	1679	{
	1680	/* If char is encoded then swatch is for the prefix */
	1681	needents = (1 << UTF_ACCUMULATION_SHIFT);
	1682	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	1683	}
	1684
	1685	/*
	1686	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	1687	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	1688	* it's nothing to sniff at.) Pity we usually come through at least
	1689	* two function calls to get here...
	1690	*
	1691	* NB: this code assumes that swatches are never modified, once generated!
	1692	*/
	1693
	1694	if (hv == PL_last_swash_hv &&
	1695	klen == PL_last_swash_klen &&
	1696	(!klen \|\| memEQ(ptr, PL_last_swash_key, klen)) )
	1697	{
	1698	tmps = PL_last_swash_tmps;
	1699	slen = PL_last_swash_slen;
	1700	}
	1701	else {
	1702	/* Try our second-level swatch cache, kept in a hash. */
	1703	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	1704
	1705	/* If not cached, generate it via utf8::SWASHGET */
	1706	if (!svp \|\| !SvPOK(svp) \|\| !(tmps = (U8)SvPV(*svp, slen))) {
	1707	dSP;
	1708	/* We use utf8n_to_uvuni() as we want an index into
	1709	Unicode tables, not a native character number.
	1710	*/
	1711	UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
	1712	ckWARN(WARN_UTF8) ?
	1713	0 : UTF8_ALLOW_ANY);
	1714	SV *errsv_save;
	1715	ENTER;
	1716	SAVETMPS;
	1717	save_re_context();
	1718	PUSHSTACKi(PERLSI_MAGIC);
	1719	PUSHMARK(SP);
	1720	EXTEND(SP,3);
	1721	PUSHs((SV*)sv);
	1722	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	1723	PUSHs(sv_2mortal(newSViv((klen) ?
	1724	(code_point & ~(needents - 1)) : 0)));
	1725	PUSHs(sv_2mortal(newSViv(needents)));
	1726	PUTBACK;
	1727	errsv_save = newSVsv(ERRSV);
	1728	if (call_method("SWASHGET", G_SCALAR))
	1729	retval = newSVsv(*PL_stack_sp--);
	1730	else
	1731	retval = &PL_sv_undef;
	1732	if (!SvTRUE(ERRSV))
	1733	sv_setsv(ERRSV, errsv_save);
	1734	SvREFCNT_dec(errsv_save);
	1735	POPSTACK;
	1736	FREETMPS;
	1737	LEAVE;
	1738	if (IN_PERL_COMPILETIME)
	1739	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
	1740
	1741	svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
	1742
	1743	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen)) \|\| (slen << 3) < needents)
	1744	Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
	1745	}
	1746
	1747	PL_last_swash_hv = hv;
	1748	PL_last_swash_klen = klen;
	1749	PL_last_swash_tmps = tmps;
	1750	PL_last_swash_slen = slen;
	1751	if (klen)
	1752	Copy(ptr, PL_last_swash_key, klen, U8);
	1753	}
	1754
	1755	switch ((int)((slen << 3) / needents)) {
	1756	case 1:
	1757	bit = 1 << (off & 7);
	1758	off >>= 3;
	1759	return (tmps[off] & bit) != 0;
	1760	case 8:
	1761	return tmps[off];
	1762	case 16:
	1763	off <<= 1;
	1764	return (tmps[off] << 8) + tmps[off + 1] ;
	1765	case 32:
	1766	off <<= 2;
	1767	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	1768	}
	1769	Perl_croak(aTHX_ "panic: swash_fetch");
	1770	return 0;
	1771	}
	1772
	1773
	1774	/*
	1775	=for apidoc A\|U8 \|uvchr_to_utf8\|U8 d\|UV uv
	1776
	1777	Adds the UTF-8 representation of the Native codepoint C<uv> to the end
	1778	of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
	1779	bytes available. The return value is the pointer to the byte after the
	1780	end of the new character. In other words,
	1781
	1782	d = uvchr_to_utf8(d, uv);
	1783
	1784	is the recommended wide native character-aware way of saying
	1785
	1786	*(d++) = uv;
	1787
	1788	=cut
	1789	*/
	1790
	1791	/* On ASCII machines this is normally a macro but we want a
	1792	real function in case XS code wants it
	1793	*/
	1794	#undef Perl_uvchr_to_utf8
	1795	U8 *
	1796	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	1797	{
	1798	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
	1799	}
	1800
	1801	U8 *
	1802	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	1803	{
	1804	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
	1805	}
	1806
	1807	/*
	1808	=for apidoc A\|UV\|utf8n_to_uvchr\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
	1809
	1810	Returns the native character value of the first character in the string C<s>
	1811	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	1812	length, in bytes, of that character.
	1813
	1814	Allows length and flags to be passed to low level routine.
	1815
	1816	=cut
	1817	*/
	1818	/* On ASCII machines this is normally a macro but we want
	1819	a real function in case XS code wants it
	1820	*/
	1821	#undef Perl_utf8n_to_uvchr
	1822	UV
	1823	Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	1824	{
	1825	UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	1826	return UNI_TO_NATIVE(uv);
	1827	}
	1828
	1829	/*
	1830	=for apidoc A\|char \|pv_uni_display\|SV dsv\|U8 *spv\|STRLEN len\|STRLEN pvlim\|UV flags
	1831
	1832	Build to the scalar dsv a displayable version of the string spv,
	1833	length len, the displayable version being at most pvlim bytes long
	1834	(if longer, the rest is truncated and "..." will be appended).
	1835
	1836	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	1837	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	1838	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	1839	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	1840	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	1841	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	1842
	1843	The pointer to the PV of the dsv is returned.
	1844
	1845	=cut */
	1846	char *
	1847	Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	1848	{
	1849	int truncated = 0;
	1850	const char s, e;
	1851
	1852	sv_setpvn(dsv, "", 0);
	1853	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	1854	UV u;
	1855	/* This serves double duty as a flag and a character to print after
	1856	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	1857	*/
	1858	char ok = 0;
	1859
	1860	if (pvlim && SvCUR(dsv) >= pvlim) {
	1861	truncated++;
	1862	break;
	1863	}
	1864	u = utf8_to_uvchr((U8*)s, 0);
	1865	if (u < 256) {
	1866	unsigned char c = (unsigned char)u & 0xFF;
	1867	if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
	1868	switch (c) {
	1869	case '\n':
	1870	ok = 'n'; break;
	1871	case '\r':
	1872	ok = 'r'; break;
	1873	case '\t':
	1874	ok = 't'; break;
	1875	case '\f':
	1876	ok = 'f'; break;
	1877	case '\a':
	1878	ok = 'a'; break;
	1879	case '\\':
	1880	ok = '\\'; break;
	1881	default: break;
	1882	}
	1883	if (ok) {
	1884	Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
	1885	}
	1886	}
	1887	/* isPRINT() is the locale-blind version. */
	1888	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	1889	Perl_sv_catpvf(aTHX_ dsv, "%c", c);
	1890	ok = 1;
	1891	}
	1892	}
	1893	if (!ok)
	1894	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	1895	}
	1896	if (truncated)
	1897	sv_catpvn(dsv, "...", 3);
	1898
	1899	return SvPVX(dsv);
	1900	}
	1901
	1902	/*
	1903	=for apidoc A\|char \|sv_uni_display\|SV dsv\|SV *ssv\|STRLEN pvlim\|UV flags
	1904
	1905	Build to the scalar dsv a displayable version of the scalar sv,
	1906	the displayable version being at most pvlim bytes long
	1907	(if longer, the rest is truncated and "..." will be appended).
	1908
	1909	The flags argument is as in pv_uni_display().
	1910
	1911	The pointer to the PV of the dsv is returned.
	1912
	1913	=cut */
	1914	char *
	1915	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	1916	{
	1917	return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
	1918	pvlim, flags);
	1919	}
	1920
	1921	/*
	1922	=for apidoc A\|I32\|ibcmp_utf8\|const char s1\|char pe1\|register UV l1\|bool u1\|const char s2\|char **pe2\|register UV l2\|bool u2
	1923
	1924	Return true if the strings s1 and s2 differ case-insensitively, false
	1925	if not (if they are equal case-insensitively). If u1 is true, the
	1926	string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
	1927	the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
	1928	are false, the respective string is assumed to be in native 8-bit
	1929	encoding.
	1930
	1931	If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
	1932	in there (they will point at the beginning of the I<next> character).
	1933	If the pointers behind pe1 or pe2 are non-NULL, they are the end
	1934	pointers beyond which scanning will not continue under any
	1935	circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
	1936	s2+l2 will be used as goal end pointers that will also stop the scan,
	1937	and which qualify towards defining a successful match: all the scans
	1938	that define an explicit length must reach their goal pointers for
	1939	a match to succeed).
	1940
	1941	For case-insensitiveness, the "casefolding" of Unicode is used
	1942	instead of upper/lowercasing both the characters, see
	1943	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	1944
	1945	=cut */
	1946	I32
	1947	Perl_ibcmp_utf8(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2)
	1948	{
	1949	register const U8 p1 = (const U8)s1;
	1950	register const U8 p2 = (const U8)s2;
	1951	register const U8 f1 = 0, f2 = 0;
	1952	register U8 e1 = 0, q1 = 0;
	1953	register U8 e2 = 0, q2 = 0;
	1954	STRLEN n1 = 0, n2 = 0;
	1955	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	1956	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	1957	U8 natbuf[1+1];
	1958	STRLEN foldlen1, foldlen2;
	1959	bool match;
	1960
	1961	if (pe1)
	1962	e1 = (U8*)pe1;
	1963	if (e1 == 0 \|\| (l1 && l1 < (UV)(e1 - (const U8*)s1)))
	1964	f1 = (const U8*)s1 + l1;
	1965	if (pe2)
	1966	e2 = (U8*)pe2;
	1967	if (e2 == 0 \|\| (l2 && l2 < (UV)(e2 - (const U8*)s2)))
	1968	f2 = (const U8*)s2 + l2;
	1969
	1970	if ((e1 == 0 && f1 == 0) \|\| (e2 == 0 && f2 == 0) \|\| (f1 == 0 && f2 == 0))
	1971	return 1; /* mismatch; possible infinite loop or false positive */
	1972
	1973	if (!u1 \|\| !u2)
	1974	natbuf[1] = 0; /* Need to terminate the buffer. */
	1975
	1976	while ((e1 == 0 \|\| p1 < e1) &&
	1977	(f1 == 0 \|\| p1 < f1) &&
	1978	(e2 == 0 \|\| p2 < e2) &&
	1979	(f2 == 0 \|\| p2 < f2)) {
	1980	if (n1 == 0) {
	1981	if (u1)
	1982	to_utf8_fold(p1, foldbuf1, &foldlen1);
	1983	else {
	1984	natbuf[0] = *p1;
	1985	to_utf8_fold(natbuf, foldbuf1, &foldlen1);
	1986	}
	1987	q1 = foldbuf1;
	1988	n1 = foldlen1;
	1989	}
	1990	if (n2 == 0) {
	1991	if (u2)
	1992	to_utf8_fold(p2, foldbuf2, &foldlen2);
	1993	else {
	1994	natbuf[0] = *p2;
	1995	to_utf8_fold(natbuf, foldbuf2, &foldlen2);
	1996	}
	1997	q2 = foldbuf2;
	1998	n2 = foldlen2;
	1999	}
	2000	while (n1 && n2) {
	2001	if ( UTF8SKIP(q1) != UTF8SKIP(q2) \|\|
	2002	(UTF8SKIP(q1) == 1 && q1 != q2) \|\|
	2003	memNE((char)q1, (char)q2, UTF8SKIP(q1)) )
	2004	return 1; /* mismatch */
	2005	n1 -= UTF8SKIP(q1);
	2006	q1 += UTF8SKIP(q1);
	2007	n2 -= UTF8SKIP(q2);
	2008	q2 += UTF8SKIP(q2);
	2009	}
	2010	if (n1 == 0)
	2011	p1 += u1 ? UTF8SKIP(p1) : 1;
	2012	if (n2 == 0)
	2013	p2 += u2 ? UTF8SKIP(p2) : 1;
	2014
	2015	}
	2016
	2017	/* A match is defined by all the scans that specified
	2018	* an explicit length reaching their final goals. */
	2019	match = (f1 == 0 \|\| p1 == f1) && (f2 == 0 \|\| p2 == f2);
	2020
	2021	if (match) {
	2022	if (pe1)
	2023	pe1 = (char)p1;
	2024	if (pe2)
	2025	pe2 = (char)p2;
	2026	}
	2027
	2028	return match ? 0 : 1; /* 0 match, 1 mismatch */
	2029	}
	2030
	2031	/*
	2032	* Local variables:
	2033	* c-indentation-style: bsd
	2034	* c-basic-offset: 4
	2035	* indent-tabs-mode: t
	2036	* End:
	2037	*
	2038	* vim: shiftwidth=4:
	2039	*/