perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
	4	* others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* 'Well do I understand your speech,' he answered in the same language;
	17	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	18	* as is the custom in the West, if you wish to be answered?'
	19	*
	20	* ...the travellers perceived that the floor was paved with stones of many
	21	* hues; branching runes and strange devices intertwined beneath their feet.
	22	*/
	23
	24	#include "EXTERN.h"
	25	#define PERL_IN_UTF8_C
	26	#include "perl.h"
	27
	/* Shared warning text for UTF-8 scans that run off the end of the buffer;
	 * never modified, hence const. */
	static const char unees[] = "Malformed UTF-8 character (unexpected end of string)";
	29
	30	/*
	31	=head1 Unicode Support
	32
	33	This file contains various utility functions for manipulating UTF8-encoded
	34	strings. For the uninitiated, this is a method of representing arbitrary
	35	Unicode characters as a variable number of bytes, in such a way that
	36	characters in the ASCII range are unmodified, and a zero byte never appears
	37	within non-zero characters.
	38
	39	=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
	40
	41	Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
	42	of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
	43	bytes available. The return value is the pointer to the byte after the
	44	end of the new character. In other words,
	45
	46	d = uvuni_to_utf8_flags(d, uv, flags);
	47
	48	or, in most cases,
	49
	50	d = uvuni_to_utf8(d, uv);
	51
	52	(which is equivalent to)
	53
	54	d = uvuni_to_utf8_flags(d, uv, 0);
	55
	56	is the recommended Unicode-aware way of saying
	57
	58	*(d++) = uv;
	59
	60	=cut
	61	*/
	62
	63	U8 *
	64	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	65	{
	66	if (ckWARN(WARN_UTF8)) {
	67	if (UNICODE_IS_SURROGATE(uv) &&
	68	!(flags & UNICODE_ALLOW_SURROGATE))
	69	Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
	70	else if (
	71	((uv >= 0xFDD0 && uv <= 0xFDEF &&
	72	!(flags & UNICODE_ALLOW_FDD0))
	73	\|\|
	74	((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
	75	!(flags & UNICODE_ALLOW_FFFF))) &&
	76	/* UNICODE_ALLOW_SUPER includes
	77	* FFFEs and FFFFs beyond 0x10FFFF. */
	78	((uv <= PERL_UNICODE_MAX) \|\|
	79	!(flags & UNICODE_ALLOW_SUPER))
	80	)
	81	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	82	"Unicode character 0x%04"UVxf" is illegal", uv);
	83	}
	84	if (UNI_IS_INVARIANT(uv)) {
	85	*d++ = (U8)UTF_TO_NATIVE(uv);
	86	return d;
	87	}
	88	#if defined(EBCDIC)
	89	else {
	90	STRLEN len = UNISKIP(uv);
	91	U8 *p = d+len-1;
	92	while (p > d) {
	93	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	94	uv >>= UTF_ACCUMULATION_SHIFT;
	95	}
	96	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	97	return d+len;
	98	}
	99	#else /* Non loop style */
	100	if (uv < 0x800) {
	101	*d++ = (U8)(( uv >> 6) \| 0xc0);
	102	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	103	return d;
	104	}
	105	if (uv < 0x10000) {
	106	*d++ = (U8)(( uv >> 12) \| 0xe0);
	107	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	108	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	109	return d;
	110	}
	111	if (uv < 0x200000) {
	112	*d++ = (U8)(( uv >> 18) \| 0xf0);
	113	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	114	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	115	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	116	return d;
	117	}
	118	if (uv < 0x4000000) {
	119	*d++ = (U8)(( uv >> 24) \| 0xf8);
	120	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	121	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	122	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	123	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	124	return d;
	125	}
	126	if (uv < 0x80000000) {
	127	*d++ = (U8)(( uv >> 30) \| 0xfc);
	128	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	129	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	130	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	131	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	132	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	133	return d;
	134	}
	135	#ifdef HAS_QUAD
	136	if (uv < UTF8_QUAD_MAX)
	137	#endif
	138	{
	139	d++ = 0xfe; / Can't match U+FEFF! */
	140	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	141	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	142	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	143	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	144	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	145	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	146	return d;
	147	}
	148	#ifdef HAS_QUAD
	149	{
	150	d++ = 0xff; / Can't match U+FFFE! */
	151	d++ = 0x80; / 6 Reserved bits */
	152	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	153	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
	154	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
	155	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
	156	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
	157	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	158	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	159	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	160	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	161	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	162	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	163	return d;
	164	}
	165	#endif
	166	#endif /* Loop style */
	167	}
	168
	169	U8 *
	170	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
	171	{
	172	return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
	173	}
	174
	175
	176	/*
	177	=for apidoc A|STRLEN|is_utf8_char|U8 *s
	178
	179	Tests if some arbitrary number of bytes begins in a valid UTF-8
	180	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	181	UTF-8 character. The actual number of bytes in the UTF-8 character
	182	will be returned if it is valid, otherwise 0.
	183
	184	=cut */
	185	STRLEN
	186	Perl_is_utf8_char(pTHX_ U8 *s)
	187	{
	188	U8 u = *s;
	189	STRLEN slen, len;
	190	UV uv, ouv;
	191
	192	if (UTF8_IS_INVARIANT(u))
	193	return 1;
	194
	195	if (!UTF8_IS_START(u))
	196	return 0;
	197
	198	len = UTF8SKIP(s);
	199
	200	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	201	return 0;
	202
	203	slen = len - 1;
	204	s++;
	205	u &= UTF_START_MASK(len);
	206	uv = u;
	207	ouv = uv;
	208	while (slen--) {
	209	if (!UTF8_IS_CONTINUATION(*s))
	210	return 0;
	211	uv = UTF8_ACCUMULATE(uv, *s);
	212	if (uv < ouv)
	213	return 0;
	214	ouv = uv;
	215	s++;
	216	}
	217
	218	if ((STRLEN)UNISKIP(uv) < len)
	219	return 0;
	220
	221	return len;
	222	}
	223
	224	/*
	225	=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
	226
	227	Returns true if first C<len> bytes of the given string form a valid
	228	UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
	229	not mean 'a string that contains code points above 0x7F encoded in UTF-8'
	230	because a valid ASCII string is a valid UTF-8 string.
	231
	232	=cut
	233	*/
	234
	235	bool
	236	Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
	237	{
	238	U8* x = s;
	239	U8* send;
	240	STRLEN c;
	241
	242	if (!len && s)
	243	len = strlen((char *)s);
	244	send = s + len;
	245
	246	while (x < send) {
	247	/* Inline the easy bits of is_utf8_char() here for speed... */
	248	if (UTF8_IS_INVARIANT(*x))
	249	c = 1;
	250	else if (!UTF8_IS_START(*x))
	251	return FALSE;
	252	else {
	253	/* ... and call is_utf8_char() only if really needed. */
	254	c = is_utf8_char(x);
	255	if (!c)
	256	return FALSE;
	257	}
	258	x += c;
	259	}
	260	if (x != send)
	261	return FALSE;
	262
	263	return TRUE;
	264	}
	265
	266	/*
	267	=for apidoc A|bool|is_utf8_string_loc|U8 *s|STRLEN len|U8 **p
	268
	269	Like is_utf8_string but store the location of the failure in
	270	the last argument.
	271
	272	=cut
	273	*/
	274
	275	bool
	276	Perl_is_utf8_string_loc(pTHX_ U8 s, STRLEN len, U8 *p)
	277	{
	278	U8* x = s;
	279	U8* send;
	280	STRLEN c;
	281
	282	if (!len && s)
	283	len = strlen((char *)s);
	284	send = s + len;
	285
	286	while (x < send) {
	287	/* Inline the easy bits of is_utf8_char() here for speed... */
	288	if (UTF8_IS_INVARIANT(*x))
	289	c = 1;
	290	else if (!UTF8_IS_START(*x)) {
	291	if (p)
	292	*p = x;
	293	return FALSE;
	294	}
	295	else {
	296	/* ... and call is_utf8_char() only if really needed. */
	297	c = is_utf8_char(x);
	298	if (!c) {
	299	if (p)
	300	*p = x;
	301	return FALSE;
	302	}
	303	}
	304	x += c;
	305	}
	306	if (x != send) {
	307	if (p)
	308	*p = x;
	309	return FALSE;
	310	}
	311
	312	return TRUE;
	313	}
	314
	315	/*
	316	=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
	317
	318	Bottom level UTF-8 decode routine.
	319	Returns the unicode code point value of the first character in the string C<s>
	320	which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
	321	C<retlen> will be set to the length, in bytes, of that character.
	322
	323	If C<s> does not point to a well-formed UTF-8 character, the behaviour
	324	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
	325	it is assumed that the caller will raise a warning, and this function
	326	will silently just set C<retlen> to C<-1> and return zero. If the
	327	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
	328	malformations will be given, C<retlen> will be set to the expected
	329	length of the UTF-8 character in bytes, and zero will be returned.
	330
	331	The C<flags> can also contain various flags to allow deviations from
	332	the strict UTF-8 encoding (see F<utf8.h>).
	333
	334	Most code should use utf8_to_uvchr() rather than call this directly.
	335
	336	=cut
	337	*/
	338
	339	UV
	340	Perl_utf8n_to_uvuni(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	341	{
	342	U8 *s0 = s;
	343	UV uv = *s, ouv = 0;
	344	STRLEN len = 1;
	345	bool dowarn = ckWARN_d(WARN_UTF8);
	346	UV startbyte = *s;
	347	STRLEN expectlen = 0;
	348	U32 warning = 0;
	349
	350	/* This list is a superset of the UTF8_ALLOW_XXX. */
	351
	352	#define UTF8_WARN_EMPTY 1
	353	#define UTF8_WARN_CONTINUATION 2
	354	#define UTF8_WARN_NON_CONTINUATION 3
	355	#define UTF8_WARN_FE_FF 4
	356	#define UTF8_WARN_SHORT 5
	357	#define UTF8_WARN_OVERFLOW 6
	358	#define UTF8_WARN_SURROGATE 7
	359	#define UTF8_WARN_LONG 8
	360	#define UTF8_WARN_FFFF 9 /* Also FFFE. */
	361
	362	if (curlen == 0 &&
	363	!(flags & UTF8_ALLOW_EMPTY)) {
	364	warning = UTF8_WARN_EMPTY;
	365	goto malformed;
	366	}
	367
	368	if (UTF8_IS_INVARIANT(uv)) {
	369	if (retlen)
	370	*retlen = 1;
	371	return (UV) (NATIVE_TO_UTF(*s));
	372	}
	373
	374	if (UTF8_IS_CONTINUATION(uv) &&
	375	!(flags & UTF8_ALLOW_CONTINUATION)) {
	376	warning = UTF8_WARN_CONTINUATION;
	377	goto malformed;
	378	}
	379
	380	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	381	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	382	warning = UTF8_WARN_NON_CONTINUATION;
	383	goto malformed;
	384	}
	385
	386	#ifdef EBCDIC
	387	uv = NATIVE_TO_UTF(uv);
	388	#else
	389	if ((uv == 0xfe \|\| uv == 0xff) &&
	390	!(flags & UTF8_ALLOW_FE_FF)) {
	391	warning = UTF8_WARN_FE_FF;
	392	goto malformed;
	393	}
	394	#endif
	395
	396	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	397	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	398	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	399	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	400	#ifdef EBCDIC
	401	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	402	else { len = 7; uv &= 0x01; }
	403	#else
	404	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	405	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	406	else { len = 13; uv = 0; } /* whoa! */
	407	#endif
	408
	409	if (retlen)
	410	*retlen = len;
	411
	412	expectlen = len;
	413
	414	if ((curlen < expectlen) &&
	415	!(flags & UTF8_ALLOW_SHORT)) {
	416	warning = UTF8_WARN_SHORT;
	417	goto malformed;
	418	}
	419
	420	len--;
	421	s++;
	422	ouv = uv;
	423
	424	while (len--) {
	425	if (!UTF8_IS_CONTINUATION(*s) &&
	426	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	427	s--;
	428	warning = UTF8_WARN_NON_CONTINUATION;
	429	goto malformed;
	430	}
	431	else
	432	uv = UTF8_ACCUMULATE(uv, *s);
	433	if (!(uv > ouv)) {
	434	/* These cannot be allowed. */
	435	if (uv == ouv) {
	436	if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
	437	warning = UTF8_WARN_LONG;
	438	goto malformed;
	439	}
	440	}
	441	else { /* uv < ouv */
	442	/* This cannot be allowed. */
	443	warning = UTF8_WARN_OVERFLOW;
	444	goto malformed;
	445	}
	446	}
	447	s++;
	448	ouv = uv;
	449	}
	450
	451	if (UNICODE_IS_SURROGATE(uv) &&
	452	!(flags & UTF8_ALLOW_SURROGATE)) {
	453	warning = UTF8_WARN_SURROGATE;
	454	goto malformed;
	455	} else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
	456	!(flags & UTF8_ALLOW_LONG)) {
	457	warning = UTF8_WARN_LONG;
	458	goto malformed;
	459	} else if (UNICODE_IS_ILLEGAL(uv) &&
	460	!(flags & UTF8_ALLOW_FFFF)) {
	461	warning = UTF8_WARN_FFFF;
	462	goto malformed;
	463	}
	464
	465	return uv;
	466
	467	malformed:
	468
	469	if (flags & UTF8_CHECK_ONLY) {
	470	if (retlen)
	471	*retlen = -1;
	472	return 0;
	473	}
	474
	475	if (dowarn) {
	476	SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
	477
	478	switch (warning) {
	479	case 0: /* Intentionally empty. */ break;
	480	case UTF8_WARN_EMPTY:
	481	Perl_sv_catpv(aTHX_ sv, "(empty string)");
	482	break;
	483	case UTF8_WARN_CONTINUATION:
	484	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	485	break;
	486	case UTF8_WARN_NON_CONTINUATION:
	487	if (s == s0)
	488	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	489	(UV)s[1], startbyte);
	490	else
	491	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	492	(UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
	493
	494	break;
	495	case UTF8_WARN_FE_FF:
	496	Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
	497	break;
	498	case UTF8_WARN_SHORT:
	499	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	500	curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
	501	expectlen = curlen; /* distance for caller to skip */
	502	break;
	503	case UTF8_WARN_OVERFLOW:
	504	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	505	ouv, *s, startbyte);
	506	break;
	507	case UTF8_WARN_SURROGATE:
	508	Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
	509	break;
	510	case UTF8_WARN_LONG:
	511	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	512	expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	513	break;
	514	case UTF8_WARN_FFFF:
	515	Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
	516	break;
	517	default:
	518	Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
	519	break;
	520	}
	521
	522	if (warning) {
	523	char *s = SvPVX(sv);
	524
	525	if (PL_op)
	526	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	527	"%s in %s", s, OP_DESC(PL_op));
	528	else
	529	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
	530	}
	531	}
	532
	533	if (retlen)
	534	*retlen = expectlen ? expectlen : len;
	535
	536	return 0;
	537	}
	538
	539	/*
	540	=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
	541
	542	Returns the native character value of the first character in the string C<s>
	543	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	544	length, in bytes, of that character.
	545
	546	If C<s> does not point to a well-formed UTF-8 character, zero is
	547	returned and retlen is set, if possible, to -1.
	548
	549	=cut
	550	*/
	551
	552	UV
	553	Perl_utf8_to_uvchr(pTHX_ U8 s, STRLEN retlen)
	554	{
	555	return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
	556	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	557	}
	558
	559	/*
	560	=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
	561
	562	Returns the Unicode code point of the first character in the string C<s>
	563	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	564	length, in bytes, of that character.
	565
	566	This function should only be used when returned UV is considered
	567	an index into the Unicode semantic tables (e.g. swashes).
	568
	569	If C<s> does not point to a well-formed UTF-8 character, zero is
	570	returned and retlen is set, if possible, to -1.
	571
	572	=cut
	573	*/
	574
	575	UV
	576	Perl_utf8_to_uvuni(pTHX_ U8 s, STRLEN retlen)
	577	{
	578	/* Call the low level routine asking for checks */
	579	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
	580	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	581	}
	582
	583	/*
	584	=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
	585
	586	Return the length of the UTF-8 char encoded string C<s> in characters.
	587	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	588	up past C<e>, croaks.
	589
	590	=cut
	591	*/
	592
	593	STRLEN
	594	Perl_utf8_length(pTHX_ U8 s, U8 e)
	595	{
	596	STRLEN len = 0;
	597
	598	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	599	* the bitops (especially ~) can create illegal UTF-8.
	600	* In other words: in Perl UTF-8 is not just for Unicode. */
	601
	602	if (e < s) {
	603	if (ckWARN_d(WARN_UTF8)) {
	604	if (PL_op)
	605	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	606	"%s in %s", unees, OP_DESC(PL_op));
	607	else
	608	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	609	}
	610	return 0;
	611	}
	612	while (s < e) {
	613	U8 t = UTF8SKIP(s);
	614
	615	if (e - s < t) {
	616	if (ckWARN_d(WARN_UTF8)) {
	617	if (PL_op)
	618	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	619	unees, OP_DESC(PL_op));
	620	else
	621	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	622	}
	623	return len;
	624	}
	625	s += t;
	626	len++;
	627	}
	628
	629	return len;
	630	}
	631
	632	/*
	633	=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
	634
	635	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	636	and C<b>.
	637
	638	WARNING: use only if you know that the pointers point inside the
	639	same UTF-8 buffer.
	640
	641	=cut
	642	*/
	643
	644	IV
	645	Perl_utf8_distance(pTHX_ U8 a, U8 b)
	646	{
	647	IV off = 0;
	648
	649	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	650	* the bitops (especially ~) can create illegal UTF-8.
	651	* In other words: in Perl UTF-8 is not just for Unicode. */
	652
	653	if (a < b) {
	654	while (a < b) {
	655	U8 c = UTF8SKIP(a);
	656
	657	if (b - a < c) {
	658	if (ckWARN_d(WARN_UTF8)) {
	659	if (PL_op)
	660	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	661	"%s in %s", unees, OP_DESC(PL_op));
	662	else
	663	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	664	}
	665	return off;
	666	}
	667	a += c;
	668	off--;
	669	}
	670	}
	671	else {
	672	while (b < a) {
	673	U8 c = UTF8SKIP(b);
	674
	675	if (a - b < c) {
	676	if (ckWARN_d(WARN_UTF8)) {
	677	if (PL_op)
	678	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	679	"%s in %s", unees, OP_DESC(PL_op));
	680	else
	681	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	682	}
	683	return off;
	684	}
	685	b += c;
	686	off++;
	687	}
	688	}
	689
	690	return off;
	691	}
	692
	693	/*
	694	=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
	695
	696	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	697	forward or backward.
	698
	699	WARNING: do not use the following unless you know C<off> is within
	700	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	701	on the first byte of character or just after the last byte of a character.
	702
	703	=cut
	704	*/
	705
	706	U8 *
	707	Perl_utf8_hop(pTHX_ U8 *s, I32 off)
	708	{
	709	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	710	* the bitops (especially ~) can create illegal UTF-8.
	711	* In other words: in Perl UTF-8 is not just for Unicode. */
	712
	713	if (off >= 0) {
	714	while (off--)
	715	s += UTF8SKIP(s);
	716	}
	717	else {
	718	while (off++) {
	719	s--;
	720	while (UTF8_IS_CONTINUATION(*s))
	721	s--;
	722	}
	723	}
	724	return s;
	725	}
	726
	727	/*
	728	=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
	729
	730	Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
	731	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	732	updates len to contain the new length.
	733	Returns zero on failure, setting C<len> to -1.
	734
	735	=cut
	736	*/
	737
	738	U8 *
	739	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	740	{
	741	U8 *send;
	742	U8 *d;
	743	U8 *save = s;
	744
	745	/* ensure valid UTF-8 and chars < 256 before updating string */
	746	for (send = s + *len; s < send; ) {
	747	U8 c = *s++;
	748
	749	if (!UTF8_IS_INVARIANT(c) &&
	750	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	751	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	752	*len = -1;
	753	return 0;
	754	}
	755	}
	756
	757	d = s = save;
	758	while (s < send) {
	759	STRLEN ulen;
	760	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	761	s += ulen;
	762	}
	763	*d = '\0';
	764	*len = d - save;
	765	return save;
	766	}
	767
	768	/*
	769	=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
	770
	771	Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
	772	Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	773	the newly-created string, and updates C<len> to contain the new
	774	length. Returns the original string if no conversion occurs, C<len>
	775	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	776	0 if C<s> is converted or contains all 7bit characters.
	777
	778	=cut
	779	*/
	780
	781	U8 *
	782	Perl_bytes_from_utf8(pTHX_ U8 s, STRLEN len, bool *is_utf8)
	783	{
	784	U8 *d;
	785	U8 *start = s;
	786	U8 *send;
	787	I32 count = 0;
	788
	789	if (!*is_utf8)
	790	return start;
	791
	792	/* ensure valid UTF-8 and chars < 256 before converting string */
	793	for (send = s + *len; s < send;) {
	794	U8 c = *s++;
	795	if (!UTF8_IS_INVARIANT(c)) {
	796	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	797	(c = *s++) && UTF8_IS_CONTINUATION(c))
	798	count++;
	799	else
	800	return start;
	801	}
	802	}
	803
	804	*is_utf8 = 0;
	805
	806	Newz(801, d, (*len) - count + 1, U8);
	807	s = start; start = d;
	808	while (s < send) {
	809	U8 c = *s++;
	810	if (!UTF8_IS_INVARIANT(c)) {
	811	/* Then it is two-byte encoded */
	812	c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
	813	c = ASCII_TO_NATIVE(c);
	814	}
	815	*d++ = c;
	816	}
	817	*d = '\0';
	818	*len = d - start;
	819	return start;
	820	}
	821
	822	/*
	823	=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
	824
	825	Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
	826	Returns a pointer to the newly-created string, and sets C<len> to
	827	reflect the new length.
	828
	829	If you want to convert to UTF-8 from other encodings than ASCII,
	830	see sv_recode_to_utf8().
	831
	832	=cut
	833	*/
	834
	835	U8*
	836	Perl_bytes_to_utf8(pTHX_ U8 s, STRLEN len)
	837	{
	838	U8 *send;
	839	U8 *d;
	840	U8 *dst;
	841	send = s + (*len);
	842
	843	Newz(801, d, (len) 2 + 1, U8);
	844	dst = d;
	845
	846	while (s < send) {
	847	UV uv = NATIVE_TO_ASCII(*s++);
	848	if (UNI_IS_INVARIANT(uv))
	849	*d++ = (U8)UTF_TO_NATIVE(uv);
	850	else {
	851	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
	852	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
	853	}
	854	}
	855	*d = '\0';
	856	*len = d-dst;
	857	return dst;
	858	}
	859
	860	/*
	861	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	862	*
	863	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	864	* We optimize for native, for obvious reasons. */
	865
	866	U8*
	867	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	868	{
	869	U8* pend;
	870	U8* dstart = d;
	871
	872	if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
	873	d[0] = 0;
	874	*newlen = 1;
	875	return d;
	876	}
	877
	878	if (bytelen & 1)
	879	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
	880
	881	pend = p + bytelen;
	882
	883	while (p < pend) {
	884	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	885	p += 2;
	886	if (uv < 0x80) {
	887	*d++ = (U8)uv;
	888	continue;
	889	}
	890	if (uv < 0x800) {
	891	*d++ = (U8)(( uv >> 6) \| 0xc0);
	892	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	893	continue;
	894	}
	895	if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
	896	UV low = (p[0] << 8) + p[1];
	897	p += 2;
	898	if (low < 0xdc00 \|\| low >= 0xdfff)
	899	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	900	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	901	}
	902	if (uv < 0x10000) {
	903	*d++ = (U8)(( uv >> 12) \| 0xe0);
	904	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	905	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	906	continue;
	907	}
	908	else {
	909	*d++ = (U8)(( uv >> 18) \| 0xf0);
	910	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	911	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	912	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	913	continue;
	914	}
	915	}
	916	*newlen = d - dstart;
	917	return d;
	918	}
	919
	920	/* Note: this one is slightly destructive of the source. */
	921
	922	U8*
	923	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	924	{
	925	U8* s = (U8*)p;
	926	U8* send = s + bytelen;
	927	while (s < send) {
	928	U8 tmp = s[0];
	929	s[0] = s[1];
	930	s[1] = tmp;
	931	s += 2;
	932	}
	933	return utf16_to_utf8(p, d, bytelen, newlen);
	934	}
	935
	936	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	937
	938	bool
	939	Perl_is_uni_alnum(pTHX_ UV c)
	940	{
	941	U8 tmpbuf[UTF8_MAXBYTES+1];
	942	uvchr_to_utf8(tmpbuf, c);
	943	return is_utf8_alnum(tmpbuf);
	944	}
	945
	946	bool
	947	Perl_is_uni_alnumc(pTHX_ UV c)
	948	{
	949	U8 tmpbuf[UTF8_MAXBYTES+1];
	950	uvchr_to_utf8(tmpbuf, c);
	951	return is_utf8_alnumc(tmpbuf);
	952	}
	953
	954	bool
	955	Perl_is_uni_idfirst(pTHX_ UV c)
	956	{
	957	U8 tmpbuf[UTF8_MAXBYTES+1];
	958	uvchr_to_utf8(tmpbuf, c);
	959	return is_utf8_idfirst(tmpbuf);
	960	}
	961
	962	bool
	963	Perl_is_uni_alpha(pTHX_ UV c)
	964	{
	965	U8 tmpbuf[UTF8_MAXBYTES+1];
	966	uvchr_to_utf8(tmpbuf, c);
	967	return is_utf8_alpha(tmpbuf);
	968	}
	969
	970	bool
	971	Perl_is_uni_ascii(pTHX_ UV c)
	972	{
	973	U8 tmpbuf[UTF8_MAXBYTES+1];
	974	uvchr_to_utf8(tmpbuf, c);
	975	return is_utf8_ascii(tmpbuf);
	976	}
	977
	978	bool
	979	Perl_is_uni_space(pTHX_ UV c)
	980	{
	981	U8 tmpbuf[UTF8_MAXBYTES+1];
	982	uvchr_to_utf8(tmpbuf, c);
	983	return is_utf8_space(tmpbuf);
	984	}
	985
	986	bool
	987	Perl_is_uni_digit(pTHX_ UV c)
	988	{
	989	U8 tmpbuf[UTF8_MAXBYTES+1];
	990	uvchr_to_utf8(tmpbuf, c);
	991	return is_utf8_digit(tmpbuf);
	992	}
	993
	994	bool
	995	Perl_is_uni_upper(pTHX_ UV c)
	996	{
	997	U8 tmpbuf[UTF8_MAXBYTES+1];
	998	uvchr_to_utf8(tmpbuf, c);
	999	return is_utf8_upper(tmpbuf);
	1000	}
	1001
	1002	bool
	1003	Perl_is_uni_lower(pTHX_ UV c)
	1004	{
	1005	U8 tmpbuf[UTF8_MAXBYTES+1];
	1006	uvchr_to_utf8(tmpbuf, c);
	1007	return is_utf8_lower(tmpbuf);
	1008	}
	1009
	1010	bool
	1011	Perl_is_uni_cntrl(pTHX_ UV c)
	1012	{
	1013	U8 tmpbuf[UTF8_MAXBYTES+1];
	1014	uvchr_to_utf8(tmpbuf, c);
	1015	return is_utf8_cntrl(tmpbuf);
	1016	}
	1017
	1018	bool
	1019	Perl_is_uni_graph(pTHX_ UV c)
	1020	{
	1021	U8 tmpbuf[UTF8_MAXBYTES+1];
	1022	uvchr_to_utf8(tmpbuf, c);
	1023	return is_utf8_graph(tmpbuf);
	1024	}
	1025
	1026	bool
	1027	Perl_is_uni_print(pTHX_ UV c)
	1028	{
	1029	U8 tmpbuf[UTF8_MAXBYTES+1];
	1030	uvchr_to_utf8(tmpbuf, c);
	1031	return is_utf8_print(tmpbuf);
	1032	}
	1033
	1034	bool
	1035	Perl_is_uni_punct(pTHX_ UV c)
	1036	{
	1037	U8 tmpbuf[UTF8_MAXBYTES+1];
	1038	uvchr_to_utf8(tmpbuf, c);
	1039	return is_utf8_punct(tmpbuf);
	1040	}
	1041
	1042	bool
	1043	Perl_is_uni_xdigit(pTHX_ UV c)
	1044	{
	1045	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1046	uvchr_to_utf8(tmpbuf, c);
	1047	return is_utf8_xdigit(tmpbuf);
	1048	}
	1049
	1050	UV
	1051	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	1052	{
	1053	uvchr_to_utf8(p, c);
	1054	return to_utf8_upper(p, p, lenp);
	1055	}
	1056
	1057	UV
	1058	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	1059	{
	1060	uvchr_to_utf8(p, c);
	1061	return to_utf8_title(p, p, lenp);
	1062	}
	1063
	1064	UV
	1065	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1066	{
	1067	uvchr_to_utf8(p, c);
	1068	return to_utf8_lower(p, p, lenp);
	1069	}
	1070
	1071	UV
	1072	Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
	1073	{
	1074	uvchr_to_utf8(p, c);
	1075	return to_utf8_fold(p, p, lenp);
	1076	}
	1077
	1078	/* for now these all assume no locale info available for Unicode > 255 */
	1079
	1080	bool
	1081	Perl_is_uni_alnum_lc(pTHX_ UV c)
	1082	{
	1083	return is_uni_alnum(c); /* XXX no locale support yet */
	1084	}
	1085
	1086	bool
	1087	Perl_is_uni_alnumc_lc(pTHX_ UV c)
	1088	{
	1089	return is_uni_alnumc(c); /* XXX no locale support yet */
	1090	}
	1091
	1092	bool
	1093	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1094	{
	1095	return is_uni_idfirst(c); /* XXX no locale support yet */
	1096	}
	1097
	1098	bool
	1099	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1100	{
	1101	return is_uni_alpha(c); /* XXX no locale support yet */
	1102	}
	1103
	1104	bool
	1105	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1106	{
	1107	return is_uni_ascii(c); /* XXX no locale support yet */
	1108	}
	1109
	1110	bool
	1111	Perl_is_uni_space_lc(pTHX_ UV c)
	1112	{
	1113	return is_uni_space(c); /* XXX no locale support yet */
	1114	}
	1115
	1116	bool
	1117	Perl_is_uni_digit_lc(pTHX_ UV c)
	1118	{
	1119	return is_uni_digit(c); /* XXX no locale support yet */
	1120	}
	1121
	1122	bool
	1123	Perl_is_uni_upper_lc(pTHX_ UV c)
	1124	{
	1125	return is_uni_upper(c); /* XXX no locale support yet */
	1126	}
	1127
	1128	bool
	1129	Perl_is_uni_lower_lc(pTHX_ UV c)
	1130	{
	1131	return is_uni_lower(c); /* XXX no locale support yet */
	1132	}
	1133
	1134	bool
	1135	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1136	{
	1137	return is_uni_cntrl(c); /* XXX no locale support yet */
	1138	}
	1139
	1140	bool
	1141	Perl_is_uni_graph_lc(pTHX_ UV c)
	1142	{
	1143	return is_uni_graph(c); /* XXX no locale support yet */
	1144	}
	1145
	1146	bool
	1147	Perl_is_uni_print_lc(pTHX_ UV c)
	1148	{
	1149	return is_uni_print(c); /* XXX no locale support yet */
	1150	}
	1151
	1152	bool
	1153	Perl_is_uni_punct_lc(pTHX_ UV c)
	1154	{
	1155	return is_uni_punct(c); /* XXX no locale support yet */
	1156	}
	1157
	1158	bool
	1159	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1160	{
	1161	return is_uni_xdigit(c); /* XXX no locale support yet */
	1162	}
	1163
	1164	U32
	1165	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1166	{
	1167	/* XXX returns only the first character -- do not use XXX */
	1168	/* XXX no locale support yet */
	1169	STRLEN len;
	1170	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1171	return (U32)to_uni_upper(c, tmpbuf, &len);
	1172	}
	1173
	1174	U32
	1175	Perl_to_uni_title_lc(pTHX_ U32 c)
	1176	{
	1177	/* XXX returns only the first character XXX -- do not use XXX */
	1178	/* XXX no locale support yet */
	1179	STRLEN len;
	1180	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1181	return (U32)to_uni_title(c, tmpbuf, &len);
	1182	}
	1183
	1184	U32
	1185	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1186	{
	1187	/* XXX returns only the first character -- do not use XXX */
	1188	/* XXX no locale support yet */
	1189	STRLEN len;
	1190	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1191	return (U32)to_uni_lower(c, tmpbuf, &len);
	1192	}
	1193
	1194	bool
	1195	Perl_is_utf8_alnum(pTHX_ U8 *p)
	1196	{
	1197	if (!is_utf8_char(p))
	1198	return FALSE;
	1199	if (!PL_utf8_alnum)
	1200	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1201	* descendant of isalnum(3), in other words, it doesn't
	1202	* contain the '_'. --jhi */
	1203	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
	1204	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1205	/* return p == '_' \|\| is_utf8_alpha(p) \|\| is_utf8_digit(p); /
	1206	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	1207	if (!PL_utf8_alnum)
	1208	PL_utf8_alnum = swash_init("utf8", "",
	1209	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	1210	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1211	#endif
	1212	}
	1213
	1214	bool
	1215	Perl_is_utf8_alnumc(pTHX_ U8 *p)
	1216	{
	1217	if (!is_utf8_char(p))
	1218	return FALSE;
	1219	if (!PL_utf8_alnum)
	1220	PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
	1221	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1222	/* return is_utf8_alpha(p) \|\| is_utf8_digit(p); */
	1223	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	1224	if (!PL_utf8_alnum)
	1225	PL_utf8_alnum = swash_init("utf8", "",
	1226	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	1227	return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
	1228	#endif
	1229	}
	1230
	1231	bool
	1232	Perl_is_utf8_idfirst(pTHX_ U8 p) / The naming is historical. */
	1233	{
	1234	if (*p == '_')
	1235	return TRUE;
	1236	if (!is_utf8_char(p))
	1237	return FALSE;
	1238	if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
	1239	PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
	1240	return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
	1241	}
	1242
	1243	bool
	1244	Perl_is_utf8_idcont(pTHX_ U8 *p)
	1245	{
	1246	if (*p == '_')
	1247	return TRUE;
	1248	if (!is_utf8_char(p))
	1249	return FALSE;
	1250	if (!PL_utf8_idcont)
	1251	PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
	1252	return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
	1253	}
	1254
	1255	bool
	1256	Perl_is_utf8_alpha(pTHX_ U8 *p)
	1257	{
	1258	if (!is_utf8_char(p))
	1259	return FALSE;
	1260	if (!PL_utf8_alpha)
	1261	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
	1262	return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
	1263	}
	1264
	1265	bool
	1266	Perl_is_utf8_ascii(pTHX_ U8 *p)
	1267	{
	1268	if (!is_utf8_char(p))
	1269	return FALSE;
	1270	if (!PL_utf8_ascii)
	1271	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
	1272	return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
	1273	}
	1274
	1275	bool
	1276	Perl_is_utf8_space(pTHX_ U8 *p)
	1277	{
	1278	if (!is_utf8_char(p))
	1279	return FALSE;
	1280	if (!PL_utf8_space)
	1281	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
	1282	return swash_fetch(PL_utf8_space, p, TRUE) != 0;
	1283	}
	1284
	1285	bool
	1286	Perl_is_utf8_digit(pTHX_ U8 *p)
	1287	{
	1288	if (!is_utf8_char(p))
	1289	return FALSE;
	1290	if (!PL_utf8_digit)
	1291	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
	1292	return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
	1293	}
	1294
	1295	bool
	1296	Perl_is_utf8_upper(pTHX_ U8 *p)
	1297	{
	1298	if (!is_utf8_char(p))
	1299	return FALSE;
	1300	if (!PL_utf8_upper)
	1301	PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
	1302	return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
	1303	}
	1304
	1305	bool
	1306	Perl_is_utf8_lower(pTHX_ U8 *p)
	1307	{
	1308	if (!is_utf8_char(p))
	1309	return FALSE;
	1310	if (!PL_utf8_lower)
	1311	PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
	1312	return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
	1313	}
	1314
	1315	bool
	1316	Perl_is_utf8_cntrl(pTHX_ U8 *p)
	1317	{
	1318	if (!is_utf8_char(p))
	1319	return FALSE;
	1320	if (!PL_utf8_cntrl)
	1321	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
	1322	return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
	1323	}
	1324
	1325	bool
	1326	Perl_is_utf8_graph(pTHX_ U8 *p)
	1327	{
	1328	if (!is_utf8_char(p))
	1329	return FALSE;
	1330	if (!PL_utf8_graph)
	1331	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
	1332	return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
	1333	}
	1334
	1335	bool
	1336	Perl_is_utf8_print(pTHX_ U8 *p)
	1337	{
	1338	if (!is_utf8_char(p))
	1339	return FALSE;
	1340	if (!PL_utf8_print)
	1341	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
	1342	return swash_fetch(PL_utf8_print, p, TRUE) != 0;
	1343	}
	1344
	1345	bool
	1346	Perl_is_utf8_punct(pTHX_ U8 *p)
	1347	{
	1348	if (!is_utf8_char(p))
	1349	return FALSE;
	1350	if (!PL_utf8_punct)
	1351	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
	1352	return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
	1353	}
	1354
	1355	bool
	1356	Perl_is_utf8_xdigit(pTHX_ U8 *p)
	1357	{
	1358	if (!is_utf8_char(p))
	1359	return FALSE;
	1360	if (!PL_utf8_xdigit)
	1361	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
	1362	return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
	1363	}
	1364
	1365	bool
	1366	Perl_is_utf8_mark(pTHX_ U8 *p)
	1367	{
	1368	if (!is_utf8_char(p))
	1369	return FALSE;
	1370	if (!PL_utf8_mark)
	1371	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
	1372	return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
	1373	}
	1374
	1375	/*
	1376	=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swashp|char *normal|char *special
	1377
	1378	The "p" contains the pointer to the UTF-8 string encoding
	1379	the character that is being converted.
	1380
	1381	The "ustrp" is a pointer to the character buffer to put the
	1382	conversion result to. The "lenp" is a pointer to the length
	1383	of the result.
	1384
	1385	The "swashp" is a pointer to the swash to use.
	1386
	1387	Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
	1388	and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
	1389	but not always, a multicharacter mapping), is tried first.
	1390
	1391	The "special" is a string like "utf8::ToSpecLower", which means the
	1392	hash %utf8::ToSpecLower. The access to the hash is through
	1393	Perl_to_utf8_case().
	1394
	1395	The "normal" is a string like "ToLower" which means the swash
	1396	%utf8::ToLower.
	1397
	1398	=cut */
	1399
	1400	UV
	1401	Perl_to_utf8_case(pTHX_ U8 p, U8 ustrp, STRLEN lenp, SV swashp, char normal, char *special)
	1402	{
	1403	UV uv0, uv1;
	1404	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1405	STRLEN len = 0;
	1406
	1407	uv0 = utf8_to_uvchr(p, 0);
	1408	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
	1409	* are necessary in EBCDIC, they are redundant no-ops
	1410	* in ASCII-ish platforms, and hopefully optimized away. */
	1411	uv1 = NATIVE_TO_UNI(uv0);
	1412	uvuni_to_utf8(tmpbuf, uv1);
	1413
	1414	if (!swashp) / load on-demand */
	1415	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	1416
	1417	/* The 0xDF is the only special casing Unicode code point below 0x100. */
	1418	if (special && (uv1 == 0xDF \|\| uv1 > 0xFF)) {
	1419	/* It might be "special" (sometimes, but not always,
	1420	* a multicharacter mapping) */
	1421	HV *hv;
	1422	SV **svp;
	1423
	1424	if ((hv = get_hv(special, FALSE)) &&
	1425	(svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
	1426	(*svp)) {
	1427	char *s;
	1428
	1429	s = SvPV(*svp, len);
	1430	if (len == 1)
	1431	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
	1432	else {
	1433	#ifdef EBCDIC
	1434	/* If we have EBCDIC we need to remap the characters
	1435	* since any characters in the low 256 are Unicode
	1436	* code points, not EBCDIC. */
	1437	U8 t = (U8)s, tend = t + len, d;
	1438
	1439	d = tmpbuf;
	1440	if (SvUTF8(*svp)) {
	1441	STRLEN tlen = 0;
	1442
	1443	while (t < tend) {
	1444	UV c = utf8_to_uvchr(t, &tlen);
	1445	if (tlen > 0) {
	1446	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	1447	t += tlen;
	1448	}
	1449	else
	1450	break;
	1451	}
	1452	}
	1453	else {
	1454	while (t < tend) {
	1455	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	1456	t++;
	1457	}
	1458	}
	1459	len = d - tmpbuf;
	1460	Copy(tmpbuf, ustrp, len, U8);
	1461	#else
	1462	Copy(s, ustrp, len, U8);
	1463	#endif
	1464	}
	1465	}
	1466	}
	1467
	1468	if (!len && *swashp) {
	1469	UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
	1470
	1471	if (uv2) {
	1472	/* It was "normal" (a single character mapping). */
	1473	UV uv3 = UNI_TO_NATIVE(uv2);
	1474
	1475	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	1476	}
	1477	}
	1478
	1479	if (!len) /* Neither: just copy. */
	1480	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
	1481
	1482	if (lenp)
	1483	*lenp = len;
	1484
	1485	return len ? utf8_to_uvchr(ustrp, 0) : 0;
	1486	}
	1487
	1488	/*
	1489	=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
	1490
	1491	Convert the UTF-8 encoded character at p to its uppercase version and
	1492	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1493	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1494	the uppercase version may be longer than the original character.
	1495
	1496	The first character of the uppercased version is returned
	1497	(but note, as explained above, that there may be more.)
	1498
	1499	=cut */
	1500
	1501	UV
	1502	Perl_to_utf8_upper(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1503	{
	1504	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1505	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
	1506	}
	1507
	1508	/*
	1509	=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
	1510
	1511	Convert the UTF-8 encoded character at p to its titlecase version and
	1512	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1513	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1514	titlecase version may be longer than the original character.
	1515
	1516	The first character of the titlecased version is returned
	1517	(but note, as explained above, that there may be more.)
	1518
	1519	=cut */
	1520
	1521	UV
	1522	Perl_to_utf8_title(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1523	{
	1524	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1525	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
	1526	}
	1527
	1528	/*
	1529	=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
	1530
	1531	Convert the UTF-8 encoded character at p to its lowercase version and
	1532	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1533	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1534	lowercase version may be longer than the original character.
	1535
	1536	The first character of the lowercased version is returned
	1537	(but note, as explained above, that there may be more.)
	1538
	1539	=cut */
	1540
	1541	UV
	1542	Perl_to_utf8_lower(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1543	{
	1544	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1545	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
	1546	}
	1547
	1548	/*
	1549	=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
	1550
	1551	Convert the UTF-8 encoded character at p to its foldcase version and
	1552	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1553	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1554	foldcase version may be longer than the original character (up to
	1555	three characters).
	1556
	1557	The first character of the foldcased version is returned
	1558	(but note, as explained above, that there may be more.)
	1559
	1560	=cut */
	1561
	1562	UV
	1563	Perl_to_utf8_fold(pTHX_ U8 p, U8 ustrp, STRLEN *lenp)
	1564	{
	1565	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1566	&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
	1567	}
	1568
	1569	/* a "swash" is a swatch hash */
	1570
	1571	SV*
	1572	Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
	1573	{
	1574	SV* retval;
	1575	SV* tokenbufsv = sv_newmortal();
	1576	dSP;
	1577	size_t pkg_len = strlen(pkg);
	1578	size_t name_len = strlen(name);
	1579	HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
	1580	SV* errsv_save;
	1581
	1582	if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
	1583	ENTER;
	1584	errsv_save = newSVsv(ERRSV);
	1585	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	1586	Nullsv);
	1587	if (!SvTRUE(ERRSV))
	1588	sv_setsv(ERRSV, errsv_save);
	1589	SvREFCNT_dec(errsv_save);
	1590	LEAVE;
	1591	}
	1592	SPAGAIN;
	1593	PUSHSTACKi(PERLSI_MAGIC);
	1594	PUSHMARK(SP);
	1595	EXTEND(SP,5);
	1596	PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
	1597	PUSHs(sv_2mortal(newSVpvn(name, name_len)));
	1598	PUSHs(listsv);
	1599	PUSHs(sv_2mortal(newSViv(minbits)));
	1600	PUSHs(sv_2mortal(newSViv(none)));
	1601	PUTBACK;
	1602	ENTER;
	1603	SAVEI32(PL_hints);
	1604	PL_hints = 0;
	1605	save_re_context();
	1606	if (IN_PERL_COMPILETIME) {
	1607	/* XXX ought to be handled by lex_start */
	1608	SAVEI32(PL_in_my);
	1609	PL_in_my = 0;
	1610	sv_setpv(tokenbufsv, PL_tokenbuf);
	1611	}
	1612	errsv_save = newSVsv(ERRSV);
	1613	if (call_method("SWASHNEW", G_SCALAR))
	1614	retval = newSVsv(*PL_stack_sp--);
	1615	else
	1616	retval = &PL_sv_undef;
	1617	if (!SvTRUE(ERRSV))
	1618	sv_setsv(ERRSV, errsv_save);
	1619	SvREFCNT_dec(errsv_save);
	1620	LEAVE;
	1621	POPSTACK;
	1622	if (IN_PERL_COMPILETIME) {
	1623	STRLEN len;
	1624	char* pv = SvPV(tokenbufsv, len);
	1625
	1626	Copy(pv, PL_tokenbuf, len+1, char);
	1627	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
	1628	}
	1629	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	1630	if (SvPOK(retval))
	1631	Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
	1632	retval);
	1633	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	1634	}
	1635	return retval;
	1636	}
	1637
	1638
	1639	/* This API is wrong for special case conversions since we may need to
	1640	* return several Unicode characters for a single Unicode character
	1641	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	1642	* the lower-level routine, and it is similarly broken for returning
	1643	* multiple values. --jhi */
	1644	UV
	1645	Perl_swash_fetch(pTHX_ SV sv, U8 ptr, bool do_utf8)
	1646	{
	1647	HV* hv = (HV*)SvRV(sv);
	1648	U32 klen;
	1649	U32 off;
	1650	STRLEN slen;
	1651	STRLEN needents;
	1652	U8 *tmps = NULL;
	1653	U32 bit;
	1654	SV *retval;
	1655	U8 tmputf8[2];
	1656	UV c = NATIVE_TO_ASCII(*ptr);
	1657
	1658	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	1659	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
	1660	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
	1661	ptr = tmputf8;
	1662	}
	1663	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	1664	* then the "swatch" is a vec() for al the chars which start
	1665	* with 0xAA..0xYY
	1666	* So the key in the hash (klen) is length of encoded char -1
	1667	*/
	1668	klen = UTF8SKIP(ptr) - 1;
	1669	off = ptr[klen];
	1670
	1671	if (klen == 0)
	1672	{
	1673	/* If char in invariant then swatch is for all the invariant chars
	1674	* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
	1675	*/
	1676	needents = UTF_CONTINUATION_MARK;
	1677	off = NATIVE_TO_UTF(ptr[klen]);
	1678	}
	1679	else
	1680	{
	1681	/* If char is encoded then swatch is for the prefix */
	1682	needents = (1 << UTF_ACCUMULATION_SHIFT);
	1683	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	1684	}
	1685
	1686	/*
	1687	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	1688	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	1689	* it's nothing to sniff at.) Pity we usually come through at least
	1690	* two function calls to get here...
	1691	*
	1692	* NB: this code assumes that swatches are never modified, once generated!
	1693	*/
	1694
	1695	if (hv == PL_last_swash_hv &&
	1696	klen == PL_last_swash_klen &&
	1697	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	1698	{
	1699	tmps = PL_last_swash_tmps;
	1700	slen = PL_last_swash_slen;
	1701	}
	1702	else {
	1703	/* Try our second-level swatch cache, kept in a hash. */
	1704	SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
	1705
	1706	/* If not cached, generate it via utf8::SWASHGET */
	1707	if (!svp \|\| !SvPOK(svp) \|\| !(tmps = (U8)SvPV(*svp, slen))) {
	1708	dSP;
	1709	/* We use utf8n_to_uvuni() as we want an index into
	1710	Unicode tables, not a native character number.
	1711	*/
	1712	UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
	1713	ckWARN(WARN_UTF8) ?
	1714	0 : UTF8_ALLOW_ANY);
	1715	SV *errsv_save;
	1716	ENTER;
	1717	SAVETMPS;
	1718	save_re_context();
	1719	PUSHSTACKi(PERLSI_MAGIC);
	1720	PUSHMARK(SP);
	1721	EXTEND(SP,3);
	1722	PUSHs((SV*)sv);
	1723	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	1724	PUSHs(sv_2mortal(newSViv((klen) ?
	1725	(code_point & ~(needents - 1)) : 0)));
	1726	PUSHs(sv_2mortal(newSViv(needents)));
	1727	PUTBACK;
	1728	errsv_save = newSVsv(ERRSV);
	1729	if (call_method("SWASHGET", G_SCALAR))
	1730	retval = newSVsv(*PL_stack_sp--);
	1731	else
	1732	retval = &PL_sv_undef;
	1733	if (!SvTRUE(ERRSV))
	1734	sv_setsv(ERRSV, errsv_save);
	1735	SvREFCNT_dec(errsv_save);
	1736	POPSTACK;
	1737	FREETMPS;
	1738	LEAVE;
	1739	if (IN_PERL_COMPILETIME)
	1740	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
	1741
	1742	svp = hv_store(hv, (char*)ptr, klen, retval, 0);
	1743
	1744	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen)) \|\| (slen << 3) < needents)
	1745	Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
	1746	}
	1747
	1748	PL_last_swash_hv = hv;
	1749	PL_last_swash_klen = klen;
	1750	PL_last_swash_tmps = tmps;
	1751	PL_last_swash_slen = slen;
	1752	if (klen)
	1753	Copy(ptr, PL_last_swash_key, klen, U8);
	1754	}
	1755
	1756	switch ((int)((slen << 3) / needents)) {
	1757	case 1:
	1758	bit = 1 << (off & 7);
	1759	off >>= 3;
	1760	return (tmps[off] & bit) != 0;
	1761	case 8:
	1762	return tmps[off];
	1763	case 16:
	1764	off <<= 1;
	1765	return (tmps[off] << 8) + tmps[off + 1] ;
	1766	case 32:
	1767	off <<= 2;
	1768	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	1769	}
	1770	Perl_croak(aTHX_ "panic: swash_fetch");
	1771	return 0;
	1772	}
	1773
	1774
	1775	/*
	1776	=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
	1777
	1778	Adds the UTF-8 representation of the Native codepoint C<uv> to the end
	1779	of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
	1780	bytes available. The return value is the pointer to the byte after the
	1781	end of the new character. In other words,
	1782
	1783	d = uvchr_to_utf8(d, uv);
	1784
	1785	is the recommended wide native character-aware way of saying
	1786
	1787	*(d++) = uv;
	1788
	1789	=cut
	1790	*/
	1791
	1792	/* On ASCII machines this is normally a macro but we want a
	1793	real function in case XS code wants it
	1794	*/
	1795	#undef Perl_uvchr_to_utf8
	1796	U8 *
	1797	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	1798	{
	1799	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
	1800	}
	1801
	1802	U8 *
	1803	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	1804	{
	1805	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
	1806	}
	1807
	1808	/*
	1809	=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
	1810
	1811	Returns the native character value of the first character in the string C<s>
	1812	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	1813	length, in bytes, of that character.
	1814
	1815	Allows length and flags to be passed to low level routine.
	1816
	1817	=cut
	1818	*/
	1819	/* On ASCII machines this is normally a macro but we want
	1820	a real function in case XS code wants it
	1821	*/
	1822	#undef Perl_utf8n_to_uvchr
	1823	UV
	1824	Perl_utf8n_to_uvchr(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	1825	{
	1826	UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	1827	return UNI_TO_NATIVE(uv);
	1828	}
	1829
	1830	/*
	1831	=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
	1832
	1833	Build to the scalar dsv a displayable version of the string spv,
	1834	length len, the displayable version being at most pvlim bytes long
	1835	(if longer, the rest is truncated and "..." will be appended).
	1836
	1837	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	1838	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	1839	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	1840	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	1841	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	1842	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	1843
	1844	The pointer to the PV of the dsv is returned.
	1845
	1846	=cut */
	1847	char *
	1848	Perl_pv_uni_display(pTHX_ SV dsv, U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	1849	{
	1850	int truncated = 0;
	1851	char s, e;
	1852
	1853	sv_setpvn(dsv, "", 0);
	1854	for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	1855	UV u;
	1856	/* This serves double duty as a flag and a character to print after
	1857	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	1858	*/
	1859	char ok = 0;
	1860
	1861	if (pvlim && SvCUR(dsv) >= pvlim) {
	1862	truncated++;
	1863	break;
	1864	}
	1865	u = utf8_to_uvchr((U8*)s, 0);
	1866	if (u < 256) {
	1867	unsigned char c = u & 0xFF;
	1868	if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
	1869	switch (c) {
	1870	case '\n':
	1871	ok = 'n'; break;
	1872	case '\r':
	1873	ok = 'r'; break;
	1874	case '\t':
	1875	ok = 't'; break;
	1876	case '\f':
	1877	ok = 'f'; break;
	1878	case '\a':
	1879	ok = 'a'; break;
	1880	case '\\':
	1881	ok = '\\'; break;
	1882	default: break;
	1883	}
	1884	if (ok) {
	1885	Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
	1886	}
	1887	}
	1888	/* isPRINT() is the locale-blind version. */
	1889	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	1890	Perl_sv_catpvf(aTHX_ dsv, "%c", c);
	1891	ok = 1;
	1892	}
	1893	}
	1894	if (!ok)
	1895	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	1896	}
	1897	if (truncated)
	1898	sv_catpvn(dsv, "...", 3);
	1899
	1900	return SvPVX(dsv);
	1901	}
	1902
	1903	/*
	1904	=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
	1905
	1906	Build to the scalar dsv a displayable version of the scalar sv,
	1907	the displayable version being at most pvlim bytes long
	1908	(if longer, the rest is truncated and "..." will be appended).
	1909
	1910	The flags argument is as in pv_uni_display().
	1911
	1912	The pointer to the PV of the dsv is returned.
	1913
	1914	=cut */
	1915	char *
	1916	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	1917	{
	1918	return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
	1919	pvlim, flags);
	1920	}
	1921
	1922	/*
	1923	=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
	1924
	1925	Return true if the strings s1 and s2 differ case-insensitively, false
	1926	if not (if they are equal case-insensitively). If u1 is true, the
	1927	string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
	1928	the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
	1929	are false, the respective string is assumed to be in native 8-bit
	1930	encoding.
	1931
	1932	If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
	1933	in there (they will point at the beginning of the I<next> character).
	1934	If the pointers behind pe1 or pe2 are non-NULL, they are the end
	1935	pointers beyond which scanning will not continue under any
	1936	circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
	1937	s2+l2 will be used as goal end pointers that will also stop the scan,
	1938	and which qualify towards defining a successful match: all the scans
	1939	that define an explicit length must reach their goal pointers for
	1940	a match to succeed).
	1941
	1942	For case-insensitiveness, the "casefolding" of Unicode is used
	1943	instead of upper/lowercasing both the characters, see
	1944	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	1945
	1946	=cut */
	1947	I32
	1948	Perl_ibcmp_utf8(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2)
	1949	{
	1950	register U8 p1 = (U8)s1;
	1951	register U8 p2 = (U8)s2;
	1952	register U8 e1 = 0, f1 = 0, *q1 = 0;
	1953	register U8 e2 = 0, f2 = 0, *q2 = 0;
	1954	STRLEN n1 = 0, n2 = 0;
	1955	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	1956	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	1957	U8 natbuf[1+1];
	1958	STRLEN foldlen1, foldlen2;
	1959	bool match;
	1960
	1961	if (pe1)
	1962	e1 = (U8*)pe1;
	1963	if (e1 == 0 \|\| (l1 && l1 < (UV)(e1 - (U8*)s1)))
	1964	f1 = (U8*)s1 + l1;
	1965	if (pe2)
	1966	e2 = (U8*)pe2;
	1967	if (e2 == 0 \|\| (l2 && l2 < (UV)(e2 - (U8*)s2)))
	1968	f2 = (U8*)s2 + l2;
	1969
	1970	if ((e1 == 0 && f1 == 0) \|\| (e2 == 0 && f2 == 0) \|\| (f1 == 0 && f2 == 0))
	1971	return 1; /* mismatch; possible infinite loop or false positive */
	1972
	1973	if (!u1 \|\| !u2)
	1974	natbuf[1] = 0; /* Need to terminate the buffer. */
	1975
	1976	while ((e1 == 0 \|\| p1 < e1) &&
	1977	(f1 == 0 \|\| p1 < f1) &&
	1978	(e2 == 0 \|\| p2 < e2) &&
	1979	(f2 == 0 \|\| p2 < f2)) {
	1980	if (n1 == 0) {
	1981	if (u1)
	1982	to_utf8_fold(p1, foldbuf1, &foldlen1);
	1983	else {
	1984	natbuf[0] = *p1;
	1985	to_utf8_fold(natbuf, foldbuf1, &foldlen1);
	1986	}
	1987	q1 = foldbuf1;
	1988	n1 = foldlen1;
	1989	}
	1990	if (n2 == 0) {
	1991	if (u2)
	1992	to_utf8_fold(p2, foldbuf2, &foldlen2);
	1993	else {
	1994	natbuf[0] = *p2;
	1995	to_utf8_fold(natbuf, foldbuf2, &foldlen2);
	1996	}
	1997	q2 = foldbuf2;
	1998	n2 = foldlen2;
	1999	}
	2000	while (n1 && n2) {
	2001	if ( UTF8SKIP(q1) != UTF8SKIP(q2) \|\|
	2002	(UTF8SKIP(q1) == 1 && q1 != q2) \|\|
	2003	memNE((char)q1, (char)q2, UTF8SKIP(q1)) )
	2004	return 1; /* mismatch */
	2005	n1 -= UTF8SKIP(q1);
	2006	q1 += UTF8SKIP(q1);
	2007	n2 -= UTF8SKIP(q2);
	2008	q2 += UTF8SKIP(q2);
	2009	}
	2010	if (n1 == 0)
	2011	p1 += u1 ? UTF8SKIP(p1) : 1;
	2012	if (n2 == 0)
	2013	p2 += u2 ? UTF8SKIP(p2) : 1;
	2014
	2015	}
	2016
	2017	/* A match is defined by all the scans that specified
	2018	* an explicit length reaching their final goals. */
	2019	match = (f1 == 0 \|\| p1 == f1) && (f2 == 0 \|\| p2 == f2);
	2020
	2021	if (match) {
	2022	if (pe1)
	2023	pe1 = (char)p1;
	2024	if (pe2)
	2025	pe2 = (char)p2;
	2026	}
	2027
	2028	return match ? 0 : 1; /* 0 match, 1 mismatch */
	2029	}
	2030
	2031	/*
	2032	* Local variables:
	2033	* c-indentation-style: bsd
	2034	* c-basic-offset: 4
	2035	* indent-tabs-mode: t
	2036	* End:
	2037	*
	2038	* vim: shiftwidth=4:
	2039	*/