perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* 'Well do I understand your speech,' he answered in the same language;
	17	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	18	* as is the custom in the West, if you wish to be answered?'
	19	*
	20	* ...the travellers perceived that the floor was paved with stones of many
	21	* hues; branching runes and strange devices intertwined beneath their feet.
	22	*/
	23
	24	#include "EXTERN.h"
	25	#define PERL_IN_UTF8_C
	26	#include "perl.h"
	27
	28	#ifndef EBCDIC
	29	/* Separate prototypes needed because in ASCII systems these
	30	* usually macros but they still are compiled as code, too. */
	31	PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags);
	32	PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
	33	#endif
	34
	35	static const char unees[] =
	36	"Malformed UTF-8 character (unexpected end of string)";
	37
	38	/*
	39	=head1 Unicode Support
	40
	41	This file contains various utility functions for manipulating UTF8-encoded
	42	strings. For the uninitiated, this is a method of representing arbitrary
	43	Unicode characters as a variable number of bytes, in such a way that
	44	characters in the ASCII range are unmodified, and a zero byte never appears
	45	within non-zero characters.
	46
	47	=for apidoc A\|U8 \|uvuni_to_utf8_flags\|U8 d\|UV uv\|UV flags
	48
	49	Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
	50	of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
	51	bytes available. The return value is the pointer to the byte after the
	52	end of the new character. In other words,
	53
	54	d = uvuni_to_utf8_flags(d, uv, flags);
	55
	56	or, in most cases,
	57
	58	d = uvuni_to_utf8(d, uv);
	59
	60	(which is equivalent to)
	61
	62	d = uvuni_to_utf8_flags(d, uv, 0);
	63
	64	is the recommended Unicode-aware way of saying
	65
	66	*(d++) = uv;
	67
	68	=cut
	69	*/
	70
	71	U8 *
	72	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	73	{
	74	if (ckWARN(WARN_UTF8)) {
	75	if (UNICODE_IS_SURROGATE(uv) &&
	76	!(flags & UNICODE_ALLOW_SURROGATE))
	77	Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
	78	else if (
	79	((uv >= 0xFDD0 && uv <= 0xFDEF &&
	80	!(flags & UNICODE_ALLOW_FDD0))
	81	\|\|
	82	((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
	83	!(flags & UNICODE_ALLOW_FFFF))) &&
	84	/* UNICODE_ALLOW_SUPER includes
	85	* FFFEs and FFFFs beyond 0x10FFFF. */
	86	((uv <= PERL_UNICODE_MAX) \|\|
	87	!(flags & UNICODE_ALLOW_SUPER))
	88	)
	89	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	90	"Unicode character 0x%04"UVxf" is illegal", uv);
	91	}
	92	if (UNI_IS_INVARIANT(uv)) {
	93	*d++ = (U8)UTF_TO_NATIVE(uv);
	94	return d;
	95	}
	96	#if defined(EBCDIC)
	97	else {
	98	STRLEN len = UNISKIP(uv);
	99	U8 *p = d+len-1;
	100	while (p > d) {
	101	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	102	uv >>= UTF_ACCUMULATION_SHIFT;
	103	}
	104	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	105	return d+len;
	106	}
	107	#else /* Non loop style */
	108	if (uv < 0x800) {
	109	*d++ = (U8)(( uv >> 6) \| 0xc0);
	110	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	111	return d;
	112	}
	113	if (uv < 0x10000) {
	114	*d++ = (U8)(( uv >> 12) \| 0xe0);
	115	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	116	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	117	return d;
	118	}
	119	if (uv < 0x200000) {
	120	*d++ = (U8)(( uv >> 18) \| 0xf0);
	121	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	122	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	123	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	124	return d;
	125	}
	126	if (uv < 0x4000000) {
	127	*d++ = (U8)(( uv >> 24) \| 0xf8);
	128	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	129	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	130	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	131	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	132	return d;
	133	}
	134	if (uv < 0x80000000) {
	135	*d++ = (U8)(( uv >> 30) \| 0xfc);
	136	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	137	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	138	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	139	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	140	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	141	return d;
	142	}
	143	#ifdef HAS_QUAD
	144	if (uv < UTF8_QUAD_MAX)
	145	#endif
	146	{
	147	d++ = 0xfe; / Can't match U+FEFF! */
	148	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	149	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	150	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	151	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	152	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	153	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	154	return d;
	155	}
	156	#ifdef HAS_QUAD
	157	{
	158	d++ = 0xff; / Can't match U+FFFE! */
	159	d++ = 0x80; / 6 Reserved bits */
	160	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	161	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
	162	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
	163	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
	164	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
	165	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	166	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	167	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	168	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	169	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	170	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	171	return d;
	172	}
	173	#endif
	174	#endif /* Loop style */
	175	}
	176
	177	/*
	178
	179	Tests if some arbitrary number of bytes begins in a valid UTF-8
	180	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	181	UTF-8 character. The actual number of bytes in the UTF-8 character
	182	will be returned if it is valid, otherwise 0.
	183
	184	This is the "slow" version as opposed to the "fast" version which is
	185	the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
	186	difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
	187	or less you should use the IS_UTF8_CHAR(), for lengths of five or more
	188	you should use the _slow(). In practice this means that the _slow()
	189	will be used very rarely, since the maximum Unicode code point (as of
	190	Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
	191	the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
	192	five bytes or more.
	193
	194	=cut */
	195	STATIC STRLEN
	196	S_is_utf8_char_slow(const U8 *s, const STRLEN len)
	197	{
	198	U8 u = *s;
	199	STRLEN slen;
	200	UV uv, ouv;
	201
	202	if (UTF8_IS_INVARIANT(u))
	203	return 1;
	204
	205	if (!UTF8_IS_START(u))
	206	return 0;
	207
	208	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	209	return 0;
	210
	211	slen = len - 1;
	212	s++;
	213	#ifdef EBCDIC
	214	u = NATIVE_TO_UTF(u);
	215	#endif
	216	u &= UTF_START_MASK(len);
	217	uv = u;
	218	ouv = uv;
	219	while (slen--) {
	220	if (!UTF8_IS_CONTINUATION(*s))
	221	return 0;
	222	uv = UTF8_ACCUMULATE(uv, *s);
	223	if (uv < ouv)
	224	return 0;
	225	ouv = uv;
	226	s++;
	227	}
	228
	229	if ((STRLEN)UNISKIP(uv) < len)
	230	return 0;
	231
	232	return len;
	233	}
	234
	235	/*
	236	=for apidoc A\|STRLEN\|is_utf8_char\|const U8 *s
	237
	238	Tests if some arbitrary number of bytes begins in a valid UTF-8
	239	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	240	UTF-8 character. The actual number of bytes in the UTF-8 character
	241	will be returned if it is valid, otherwise 0.
	242
	243	=cut */
	244	STRLEN
	245	Perl_is_utf8_char(pTHX_ const U8 *s)
	246	{
	247	const STRLEN len = UTF8SKIP(s);
	248	PERL_UNUSED_CONTEXT;
	249	#ifdef IS_UTF8_CHAR
	250	if (IS_UTF8_CHAR_FAST(len))
	251	return IS_UTF8_CHAR(s, len) ? len : 0;
	252	#endif /* #ifdef IS_UTF8_CHAR */
	253	return is_utf8_char_slow(s, len);
	254	}
	255
	256	/*
	257	=for apidoc A\|bool\|is_utf8_string\|const U8 *s\|STRLEN len
	258
	259	Returns true if first C<len> bytes of the given string form a valid
	260	UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
	261	not mean 'a string that contains code points above 0x7F encoded in UTF-8'
	262	because a valid ASCII string is a valid UTF-8 string.
	263
	264	See also is_utf8_string_loclen() and is_utf8_string_loc().
	265
	266	=cut
	267	*/
	268
	269	bool
	270	Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
	271	{
	272	const U8* const send = s + (len ? len : strlen((const char *)s));
	273	const U8* x = s;
	274
	275	PERL_UNUSED_CONTEXT;
	276
	277	while (x < send) {
	278	STRLEN c;
	279	/* Inline the easy bits of is_utf8_char() here for speed... */
	280	if (UTF8_IS_INVARIANT(*x))
	281	c = 1;
	282	else if (!UTF8_IS_START(*x))
	283	goto out;
	284	else {
	285	/* ... and call is_utf8_char() only if really needed. */
	286	#ifdef IS_UTF8_CHAR
	287	c = UTF8SKIP(x);
	288	if (IS_UTF8_CHAR_FAST(c)) {
	289	if (!IS_UTF8_CHAR(x, c))
	290	c = 0;
	291	}
	292	else
	293	c = is_utf8_char_slow(x, c);
	294	#else
	295	c = is_utf8_char(x);
	296	#endif /* #ifdef IS_UTF8_CHAR */
	297	if (!c)
	298	goto out;
	299	}
	300	x += c;
	301	}
	302
	303	out:
	304	if (x != send)
	305	return FALSE;
	306
	307	return TRUE;
	308	}
	309
	310	/*
	311	Implemented as a macro in utf8.h
	312
	313	=for apidoc A\|bool\|is_utf8_string_loc\|const U8 s\|STRLEN len\|const U8 *ep
	314
	315	Like is_utf8_string() but stores the location of the failure (in the
	316	case of "utf8ness failure") or the location s+len (in the case of
	317	"utf8ness success") in the C<ep>.
	318
	319	See also is_utf8_string_loclen() and is_utf8_string().
	320
	321	=for apidoc A\|bool\|is_utf8_string_loclen\|const U8 s\|STRLEN len\|const U8 ep\|const STRLEN el
	322
	323	Like is_utf8_string() but stores the location of the failure (in the
	324	case of "utf8ness failure") or the location s+len (in the case of
	325	"utf8ness success") in the C<ep>, and the number of UTF-8
	326	encoded characters in the C<el>.
	327
	328	See also is_utf8_string_loc() and is_utf8_string().
	329
	330	=cut
	331	*/
	332
	333	bool
	334	Perl_is_utf8_string_loclen(pTHX_ const U8 s, STRLEN len, const U8 ep, STRLEN el)
	335	{
	336	const U8* const send = s + (len ? len : strlen((const char *)s));
	337	const U8* x = s;
	338	STRLEN c;
	339	STRLEN outlen = 0;
	340	PERL_UNUSED_CONTEXT;
	341
	342	while (x < send) {
	343	/* Inline the easy bits of is_utf8_char() here for speed... */
	344	if (UTF8_IS_INVARIANT(*x))
	345	c = 1;
	346	else if (!UTF8_IS_START(*x))
	347	goto out;
	348	else {
	349	/* ... and call is_utf8_char() only if really needed. */
	350	#ifdef IS_UTF8_CHAR
	351	c = UTF8SKIP(x);
	352	if (IS_UTF8_CHAR_FAST(c)) {
	353	if (!IS_UTF8_CHAR(x, c))
	354	c = 0;
	355	} else
	356	c = is_utf8_char_slow(x, c);
	357	#else
	358	c = is_utf8_char(x);
	359	#endif /* #ifdef IS_UTF8_CHAR */
	360	if (!c)
	361	goto out;
	362	}
	363	x += c;
	364	outlen++;
	365	}
	366
	367	out:
	368	if (el)
	369	*el = outlen;
	370
	371	if (ep)
	372	*ep = x;
	373	return (x == send);
	374	}
	375
	376	/*
	377
	378	=for apidoc A\|UV\|utf8n_to_uvuni\|const U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
	379
	380	Bottom level UTF-8 decode routine.
	381	Returns the unicode code point value of the first character in the string C<s>
	382	which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
	383	C<retlen> will be set to the length, in bytes, of that character.
	384
	385	If C<s> does not point to a well-formed UTF-8 character, the behaviour
	386	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
	387	it is assumed that the caller will raise a warning, and this function
	388	will silently just set C<retlen> to C<-1> and return zero. If the
	389	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
	390	malformations will be given, C<retlen> will be set to the expected
	391	length of the UTF-8 character in bytes, and zero will be returned.
	392
	393	The C<flags> can also contain various flags to allow deviations from
	394	the strict UTF-8 encoding (see F<utf8.h>).
	395
	396	Most code should use utf8_to_uvchr() rather than call this directly.
	397
	398	=cut
	399	*/
	400
	401	UV
	402	Perl_utf8n_to_uvuni(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	403	{
	404	dVAR;
	405	const U8 * const s0 = s;
	406	UV uv = *s, ouv = 0;
	407	STRLEN len = 1;
	408	const bool dowarn = ckWARN_d(WARN_UTF8);
	409	const UV startbyte = *s;
	410	STRLEN expectlen = 0;
	411	U32 warning = 0;
	412
	413	/* This list is a superset of the UTF8_ALLOW_XXX. */
	414
	415	#define UTF8_WARN_EMPTY 1
	416	#define UTF8_WARN_CONTINUATION 2
	417	#define UTF8_WARN_NON_CONTINUATION 3
	418	#define UTF8_WARN_FE_FF 4
	419	#define UTF8_WARN_SHORT 5
	420	#define UTF8_WARN_OVERFLOW 6
	421	#define UTF8_WARN_SURROGATE 7
	422	#define UTF8_WARN_LONG 8
	423	#define UTF8_WARN_FFFF 9 /* Also FFFE. */
	424
	425	if (curlen == 0 &&
	426	!(flags & UTF8_ALLOW_EMPTY)) {
	427	warning = UTF8_WARN_EMPTY;
	428	goto malformed;
	429	}
	430
	431	if (UTF8_IS_INVARIANT(uv)) {
	432	if (retlen)
	433	*retlen = 1;
	434	return (UV) (NATIVE_TO_UTF(*s));
	435	}
	436
	437	if (UTF8_IS_CONTINUATION(uv) &&
	438	!(flags & UTF8_ALLOW_CONTINUATION)) {
	439	warning = UTF8_WARN_CONTINUATION;
	440	goto malformed;
	441	}
	442
	443	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	444	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	445	warning = UTF8_WARN_NON_CONTINUATION;
	446	goto malformed;
	447	}
	448
	449	#ifdef EBCDIC
	450	uv = NATIVE_TO_UTF(uv);
	451	#else
	452	if ((uv == 0xfe \|\| uv == 0xff) &&
	453	!(flags & UTF8_ALLOW_FE_FF)) {
	454	warning = UTF8_WARN_FE_FF;
	455	goto malformed;
	456	}
	457	#endif
	458
	459	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	460	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	461	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	462	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	463	#ifdef EBCDIC
	464	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	465	else { len = 7; uv &= 0x01; }
	466	#else
	467	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	468	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	469	else { len = 13; uv = 0; } /* whoa! */
	470	#endif
	471
	472	if (retlen)
	473	*retlen = len;
	474
	475	expectlen = len;
	476
	477	if ((curlen < expectlen) &&
	478	!(flags & UTF8_ALLOW_SHORT)) {
	479	warning = UTF8_WARN_SHORT;
	480	goto malformed;
	481	}
	482
	483	len--;
	484	s++;
	485	ouv = uv;
	486
	487	while (len--) {
	488	if (!UTF8_IS_CONTINUATION(*s) &&
	489	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	490	s--;
	491	warning = UTF8_WARN_NON_CONTINUATION;
	492	goto malformed;
	493	}
	494	else
	495	uv = UTF8_ACCUMULATE(uv, *s);
	496	if (!(uv > ouv)) {
	497	/* These cannot be allowed. */
	498	if (uv == ouv) {
	499	if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
	500	warning = UTF8_WARN_LONG;
	501	goto malformed;
	502	}
	503	}
	504	else { /* uv < ouv */
	505	/* This cannot be allowed. */
	506	warning = UTF8_WARN_OVERFLOW;
	507	goto malformed;
	508	}
	509	}
	510	s++;
	511	ouv = uv;
	512	}
	513
	514	if (UNICODE_IS_SURROGATE(uv) &&
	515	!(flags & UTF8_ALLOW_SURROGATE)) {
	516	warning = UTF8_WARN_SURROGATE;
	517	goto malformed;
	518	} else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
	519	!(flags & UTF8_ALLOW_LONG)) {
	520	warning = UTF8_WARN_LONG;
	521	goto malformed;
	522	} else if (UNICODE_IS_ILLEGAL(uv) &&
	523	!(flags & UTF8_ALLOW_FFFF)) {
	524	warning = UTF8_WARN_FFFF;
	525	goto malformed;
	526	}
	527
	528	return uv;
	529
	530	malformed:
	531
	532	if (flags & UTF8_CHECK_ONLY) {
	533	if (retlen)
	534	*retlen = ((STRLEN) -1);
	535	return 0;
	536	}
	537
	538	if (dowarn) {
	539	SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
	540
	541	switch (warning) {
	542	case 0: /* Intentionally empty. */ break;
	543	case UTF8_WARN_EMPTY:
	544	sv_catpvs(sv, "(empty string)");
	545	break;
	546	case UTF8_WARN_CONTINUATION:
	547	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	548	break;
	549	case UTF8_WARN_NON_CONTINUATION:
	550	if (s == s0)
	551	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	552	(UV)s[1], startbyte);
	553	else {
	554	const int len = (int)(s-s0);
	555	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	556	(UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
	557	}
	558
	559	break;
	560	case UTF8_WARN_FE_FF:
	561	Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
	562	break;
	563	case UTF8_WARN_SHORT:
	564	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	565	(int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
	566	expectlen = curlen; /* distance for caller to skip */
	567	break;
	568	case UTF8_WARN_OVERFLOW:
	569	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	570	ouv, *s, startbyte);
	571	break;
	572	case UTF8_WARN_SURROGATE:
	573	Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
	574	break;
	575	case UTF8_WARN_LONG:
	576	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	577	(int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	578	break;
	579	case UTF8_WARN_FFFF:
	580	Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
	581	break;
	582	default:
	583	sv_catpvs(sv, "(unknown reason)");
	584	break;
	585	}
	586
	587	if (warning) {
	588	const char * const s = SvPVX_const(sv);
	589
	590	if (PL_op)
	591	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	592	"%s in %s", s, OP_DESC(PL_op));
	593	else
	594	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
	595	}
	596	}
	597
	598	if (retlen)
	599	*retlen = expectlen ? expectlen : len;
	600
	601	return 0;
	602	}
	603
	604	/*
	605	=for apidoc A\|UV\|utf8_to_uvchr\|const U8 s\|STRLEN retlen
	606
	607	Returns the native character value of the first character in the string C<s>
	608	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	609	length, in bytes, of that character.
	610
	611	If C<s> does not point to a well-formed UTF-8 character, zero is
	612	returned and retlen is set, if possible, to -1.
	613
	614	=cut
	615	*/
	616
	617	UV
	618	Perl_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
	619	{
	620	return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
	621	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	622	}
	623
	624	/*
	625	=for apidoc A\|UV\|utf8_to_uvuni\|const U8 s\|STRLEN retlen
	626
	627	Returns the Unicode code point of the first character in the string C<s>
	628	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	629	length, in bytes, of that character.
	630
	631	This function should only be used when returned UV is considered
	632	an index into the Unicode semantic tables (e.g. swashes).
	633
	634	If C<s> does not point to a well-formed UTF-8 character, zero is
	635	returned and retlen is set, if possible, to -1.
	636
	637	=cut
	638	*/
	639
	640	UV
	641	Perl_utf8_to_uvuni(pTHX_ const U8 s, STRLEN retlen)
	642	{
	643	/* Call the low level routine asking for checks */
	644	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
	645	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	646	}
	647
	648	/*
	649	=for apidoc A\|STRLEN\|utf8_length\|const U8 s\|const U8 e
	650
	651	Return the length of the UTF-8 char encoded string C<s> in characters.
	652	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	653	up past C<e>, croaks.
	654
	655	=cut
	656	*/
	657
	658	STRLEN
	659	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	660	{
	661	dVAR;
	662	STRLEN len = 0;
	663	U8 t = 0;
	664
	665	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	666	* the bitops (especially ~) can create illegal UTF-8.
	667	* In other words: in Perl UTF-8 is not just for Unicode. */
	668
	669	if (e < s)
	670	goto warn_and_return;
	671	while (s < e) {
	672	t = UTF8SKIP(s);
	673	if (e - s < t) {
	674	warn_and_return:
	675	if (ckWARN_d(WARN_UTF8)) {
	676	if (PL_op)
	677	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	678	"%s in %s", unees, OP_DESC(PL_op));
	679	else
	680	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	681	}
	682	return len;
	683	}
	684	s += t;
	685	len++;
	686	}
	687
	688	return len;
	689	}
	690
	691	/*
	692	=for apidoc A\|IV\|utf8_distance\|const U8 a\|const U8 b
	693
	694	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	695	and C<b>.
	696
	697	WARNING: use only if you know that the pointers point inside the
	698	same UTF-8 buffer.
	699
	700	=cut
	701	*/
	702
	703	IV
	704	Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
	705	{
	706	return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
	707	}
	708
	709	/*
	710	=for apidoc A\|U8 \|utf8_hop\|U8 s\|I32 off
	711
	712	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	713	forward or backward.
	714
	715	WARNING: do not use the following unless you know C<off> is within
	716	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	717	on the first byte of character or just after the last byte of a character.
	718
	719	=cut
	720	*/
	721
	722	U8 *
	723	Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
	724	{
	725	PERL_UNUSED_CONTEXT;
	726	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	727	* the bitops (especially ~) can create illegal UTF-8.
	728	* In other words: in Perl UTF-8 is not just for Unicode. */
	729
	730	if (off >= 0) {
	731	while (off--)
	732	s += UTF8SKIP(s);
	733	}
	734	else {
	735	while (off++) {
	736	s--;
	737	while (UTF8_IS_CONTINUATION(*s))
	738	s--;
	739	}
	740	}
	741	return (U8 *)s;
	742	}
	743
	744	/*
	745	=for apidoc A\|U8 \|utf8_to_bytes\|U8 s\|STRLEN *len
	746
	747	Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
	748	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	749	updates len to contain the new length.
	750	Returns zero on failure, setting C<len> to -1.
	751
	752	If you need a copy of the string, see C<bytes_from_utf8>.
	753
	754	=cut
	755	*/
	756
	757	U8 *
	758	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	759	{
	760	U8 * const save = s;
	761	U8 * const send = s + *len;
	762	U8 *d;
	763
	764	/* ensure valid UTF-8 and chars < 256 before updating string */
	765	while (s < send) {
	766	U8 c = *s++;
	767
	768	if (!UTF8_IS_INVARIANT(c) &&
	769	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	770	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	771	*len = ((STRLEN) -1);
	772	return 0;
	773	}
	774	}
	775
	776	d = s = save;
	777	while (s < send) {
	778	STRLEN ulen;
	779	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	780	s += ulen;
	781	}
	782	*d = '\0';
	783	*len = d - save;
	784	return save;
	785	}
	786
	787	/*
	788	=for apidoc A\|U8 \|bytes_from_utf8\|const U8 s\|STRLEN len\|bool is_utf8
	789
	790	Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
	791	Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	792	the newly-created string, and updates C<len> to contain the new
	793	length. Returns the original string if no conversion occurs, C<len>
	794	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	795	0 if C<s> is converted or contains all 7bit characters.
	796
	797	=cut
	798	*/
	799
	800	U8 *
	801	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN len, bool *is_utf8)
	802	{
	803	U8 *d;
	804	const U8 *start = s;
	805	const U8 *send;
	806	I32 count = 0;
	807
	808	PERL_UNUSED_CONTEXT;
	809	if (!*is_utf8)
	810	return (U8 *)start;
	811
	812	/* ensure valid UTF-8 and chars < 256 before converting string */
	813	for (send = s + *len; s < send;) {
	814	U8 c = *s++;
	815	if (!UTF8_IS_INVARIANT(c)) {
	816	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	817	(c = *s++) && UTF8_IS_CONTINUATION(c))
	818	count++;
	819	else
	820	return (U8 *)start;
	821	}
	822	}
	823
	824	*is_utf8 = FALSE;
	825
	826	Newx(d, (*len) - count + 1, U8);
	827	s = start; start = d;
	828	while (s < send) {
	829	U8 c = *s++;
	830	if (!UTF8_IS_INVARIANT(c)) {
	831	/* Then it is two-byte encoded */
	832	c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
	833	c = ASCII_TO_NATIVE(c);
	834	}
	835	*d++ = c;
	836	}
	837	*d = '\0';
	838	*len = d - start;
	839	return (U8 *)start;
	840	}
	841
	842	/*
	843	=for apidoc A\|U8 \|bytes_to_utf8\|const U8 s\|STRLEN *len
	844
	845	Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
	846	Returns a pointer to the newly-created string, and sets C<len> to
	847	reflect the new length.
	848
	849	If you want to convert to UTF-8 from other encodings than ASCII,
	850	see sv_recode_to_utf8().
	851
	852	=cut
	853	*/
	854
	855	U8*
	856	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN len)
	857	{
	858	const U8 * const send = s + (*len);
	859	U8 *d;
	860	U8 *dst;
	861	PERL_UNUSED_CONTEXT;
	862
	863	Newx(d, (len) 2 + 1, U8);
	864	dst = d;
	865
	866	while (s < send) {
	867	const UV uv = NATIVE_TO_ASCII(*s++);
	868	if (UNI_IS_INVARIANT(uv))
	869	*d++ = (U8)UTF_TO_NATIVE(uv);
	870	else {
	871	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
	872	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
	873	}
	874	}
	875	*d = '\0';
	876	*len = d-dst;
	877	return dst;
	878	}
	879
	880	/*
	881	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	882	*
	883	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	884	* We optimize for native, for obvious reasons. */
	885
	886	U8*
	887	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	888	{
	889	U8* pend;
	890	U8* dstart = d;
	891
	892	if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
	893	d[0] = 0;
	894	*newlen = 1;
	895	return d;
	896	}
	897
	898	if (bytelen & 1)
	899	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
	900
	901	pend = p + bytelen;
	902
	903	while (p < pend) {
	904	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	905	p += 2;
	906	if (uv < 0x80) {
	907	#ifdef EBCDIC
	908	*d++ = UNI_TO_NATIVE(uv);
	909	#else
	910	*d++ = (U8)uv;
	911	#endif
	912	continue;
	913	}
	914	if (uv < 0x800) {
	915	*d++ = (U8)(( uv >> 6) \| 0xc0);
	916	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	917	continue;
	918	}
	919	if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
	920	UV low = (p[0] << 8) + p[1];
	921	p += 2;
	922	if (low < 0xdc00 \|\| low >= 0xdfff)
	923	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	924	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	925	}
	926	if (uv < 0x10000) {
	927	*d++ = (U8)(( uv >> 12) \| 0xe0);
	928	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	929	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	930	continue;
	931	}
	932	else {
	933	*d++ = (U8)(( uv >> 18) \| 0xf0);
	934	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	935	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	936	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	937	continue;
	938	}
	939	}
	940	*newlen = d - dstart;
	941	return d;
	942	}
	943
	944	/* Note: this one is slightly destructive of the source. */
	945
	946	U8*
	947	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	948	{
	949	U8* s = (U8*)p;
	950	U8* const send = s + bytelen;
	951	while (s < send) {
	952	const U8 tmp = s[0];
	953	s[0] = s[1];
	954	s[1] = tmp;
	955	s += 2;
	956	}
	957	return utf16_to_utf8(p, d, bytelen, newlen);
	958	}
	959
	960	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	961
	962	bool
	963	Perl_is_uni_alnum(pTHX_ UV c)
	964	{
	965	U8 tmpbuf[UTF8_MAXBYTES+1];
	966	uvchr_to_utf8(tmpbuf, c);
	967	return is_utf8_alnum(tmpbuf);
	968	}
	969
	970	bool
	971	Perl_is_uni_alnumc(pTHX_ UV c)
	972	{
	973	U8 tmpbuf[UTF8_MAXBYTES+1];
	974	uvchr_to_utf8(tmpbuf, c);
	975	return is_utf8_alnumc(tmpbuf);
	976	}
	977
	978	bool
	979	Perl_is_uni_idfirst(pTHX_ UV c)
	980	{
	981	U8 tmpbuf[UTF8_MAXBYTES+1];
	982	uvchr_to_utf8(tmpbuf, c);
	983	return is_utf8_idfirst(tmpbuf);
	984	}
	985
	986	bool
	987	Perl_is_uni_alpha(pTHX_ UV c)
	988	{
	989	U8 tmpbuf[UTF8_MAXBYTES+1];
	990	uvchr_to_utf8(tmpbuf, c);
	991	return is_utf8_alpha(tmpbuf);
	992	}
	993
	994	bool
	995	Perl_is_uni_ascii(pTHX_ UV c)
	996	{
	997	U8 tmpbuf[UTF8_MAXBYTES+1];
	998	uvchr_to_utf8(tmpbuf, c);
	999	return is_utf8_ascii(tmpbuf);
	1000	}
	1001
	1002	bool
	1003	Perl_is_uni_space(pTHX_ UV c)
	1004	{
	1005	U8 tmpbuf[UTF8_MAXBYTES+1];
	1006	uvchr_to_utf8(tmpbuf, c);
	1007	return is_utf8_space(tmpbuf);
	1008	}
	1009
	1010	bool
	1011	Perl_is_uni_digit(pTHX_ UV c)
	1012	{
	1013	U8 tmpbuf[UTF8_MAXBYTES+1];
	1014	uvchr_to_utf8(tmpbuf, c);
	1015	return is_utf8_digit(tmpbuf);
	1016	}
	1017
	1018	bool
	1019	Perl_is_uni_upper(pTHX_ UV c)
	1020	{
	1021	U8 tmpbuf[UTF8_MAXBYTES+1];
	1022	uvchr_to_utf8(tmpbuf, c);
	1023	return is_utf8_upper(tmpbuf);
	1024	}
	1025
	1026	bool
	1027	Perl_is_uni_lower(pTHX_ UV c)
	1028	{
	1029	U8 tmpbuf[UTF8_MAXBYTES+1];
	1030	uvchr_to_utf8(tmpbuf, c);
	1031	return is_utf8_lower(tmpbuf);
	1032	}
	1033
	1034	bool
	1035	Perl_is_uni_cntrl(pTHX_ UV c)
	1036	{
	1037	U8 tmpbuf[UTF8_MAXBYTES+1];
	1038	uvchr_to_utf8(tmpbuf, c);
	1039	return is_utf8_cntrl(tmpbuf);
	1040	}
	1041
	1042	bool
	1043	Perl_is_uni_graph(pTHX_ UV c)
	1044	{
	1045	U8 tmpbuf[UTF8_MAXBYTES+1];
	1046	uvchr_to_utf8(tmpbuf, c);
	1047	return is_utf8_graph(tmpbuf);
	1048	}
	1049
	1050	bool
	1051	Perl_is_uni_print(pTHX_ UV c)
	1052	{
	1053	U8 tmpbuf[UTF8_MAXBYTES+1];
	1054	uvchr_to_utf8(tmpbuf, c);
	1055	return is_utf8_print(tmpbuf);
	1056	}
	1057
	1058	bool
	1059	Perl_is_uni_punct(pTHX_ UV c)
	1060	{
	1061	U8 tmpbuf[UTF8_MAXBYTES+1];
	1062	uvchr_to_utf8(tmpbuf, c);
	1063	return is_utf8_punct(tmpbuf);
	1064	}
	1065
	1066	bool
	1067	Perl_is_uni_xdigit(pTHX_ UV c)
	1068	{
	1069	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1070	uvchr_to_utf8(tmpbuf, c);
	1071	return is_utf8_xdigit(tmpbuf);
	1072	}
	1073
	1074	UV
	1075	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	1076	{
	1077	uvchr_to_utf8(p, c);
	1078	return to_utf8_upper(p, p, lenp);
	1079	}
	1080
	1081	UV
	1082	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	1083	{
	1084	uvchr_to_utf8(p, c);
	1085	return to_utf8_title(p, p, lenp);
	1086	}
	1087
	1088	UV
	1089	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1090	{
	1091	uvchr_to_utf8(p, c);
	1092	return to_utf8_lower(p, p, lenp);
	1093	}
	1094
	1095	UV
	1096	Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
	1097	{
	1098	uvchr_to_utf8(p, c);
	1099	return to_utf8_fold(p, p, lenp);
	1100	}
	1101
	1102	/* for now these all assume no locale info available for Unicode > 255 */
	1103
	1104	bool
	1105	Perl_is_uni_alnum_lc(pTHX_ UV c)
	1106	{
	1107	return is_uni_alnum(c); /* XXX no locale support yet */
	1108	}
	1109
	1110	bool
	1111	Perl_is_uni_alnumc_lc(pTHX_ UV c)
	1112	{
	1113	return is_uni_alnumc(c); /* XXX no locale support yet */
	1114	}
	1115
	1116	bool
	1117	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1118	{
	1119	return is_uni_idfirst(c); /* XXX no locale support yet */
	1120	}
	1121
	1122	bool
	1123	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1124	{
	1125	return is_uni_alpha(c); /* XXX no locale support yet */
	1126	}
	1127
	1128	bool
	1129	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1130	{
	1131	return is_uni_ascii(c); /* XXX no locale support yet */
	1132	}
	1133
	1134	bool
	1135	Perl_is_uni_space_lc(pTHX_ UV c)
	1136	{
	1137	return is_uni_space(c); /* XXX no locale support yet */
	1138	}
	1139
	1140	bool
	1141	Perl_is_uni_digit_lc(pTHX_ UV c)
	1142	{
	1143	return is_uni_digit(c); /* XXX no locale support yet */
	1144	}
	1145
	1146	bool
	1147	Perl_is_uni_upper_lc(pTHX_ UV c)
	1148	{
	1149	return is_uni_upper(c); /* XXX no locale support yet */
	1150	}
	1151
	1152	bool
	1153	Perl_is_uni_lower_lc(pTHX_ UV c)
	1154	{
	1155	return is_uni_lower(c); /* XXX no locale support yet */
	1156	}
	1157
	1158	bool
	1159	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1160	{
	1161	return is_uni_cntrl(c); /* XXX no locale support yet */
	1162	}
	1163
	1164	bool
	1165	Perl_is_uni_graph_lc(pTHX_ UV c)
	1166	{
	1167	return is_uni_graph(c); /* XXX no locale support yet */
	1168	}
	1169
	1170	bool
	1171	Perl_is_uni_print_lc(pTHX_ UV c)
	1172	{
	1173	return is_uni_print(c); /* XXX no locale support yet */
	1174	}
	1175
	1176	bool
	1177	Perl_is_uni_punct_lc(pTHX_ UV c)
	1178	{
	1179	return is_uni_punct(c); /* XXX no locale support yet */
	1180	}
	1181
	1182	bool
	1183	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1184	{
	1185	return is_uni_xdigit(c); /* XXX no locale support yet */
	1186	}
	1187
	1188	U32
	1189	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1190	{
	1191	/* XXX returns only the first character -- do not use XXX */
	1192	/* XXX no locale support yet */
	1193	STRLEN len;
	1194	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1195	return (U32)to_uni_upper(c, tmpbuf, &len);
	1196	}
	1197
	1198	U32
	1199	Perl_to_uni_title_lc(pTHX_ U32 c)
	1200	{
	1201	/* XXX returns only the first character XXX -- do not use XXX */
	1202	/* XXX no locale support yet */
	1203	STRLEN len;
	1204	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1205	return (U32)to_uni_title(c, tmpbuf, &len);
	1206	}
	1207
	1208	U32
	1209	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1210	{
	1211	/* XXX returns only the first character -- do not use XXX */
	1212	/* XXX no locale support yet */
	1213	STRLEN len;
	1214	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1215	return (U32)to_uni_lower(c, tmpbuf, &len);
	1216	}
	1217
	1218	static bool
	1219	S_is_utf8_common(pTHX_ const U8 const p, SV *swash,
	1220	const char *const swashname)
	1221	{
	1222	dVAR;
	1223	if (!is_utf8_char(p))
	1224	return FALSE;
	1225	if (!*swash)
	1226	*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
	1227	return swash_fetch(*swash, p, TRUE) != 0;
	1228	}
	1229
	1230	bool
	1231	Perl_is_utf8_alnum(pTHX_ const U8 *p)
	1232	{
	1233	dVAR;
	1234	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1235	* descendant of isalnum(3), in other words, it doesn't
	1236	* contain the '_'. --jhi */
	1237	return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
	1238	}
	1239
	1240	bool
	1241	Perl_is_utf8_alnumc(pTHX_ const U8 *p)
	1242	{
	1243	dVAR;
	1244	return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
	1245	}
	1246
	1247	bool
	1248	Perl_is_utf8_idfirst(pTHX_ const U8 p) / The naming is historical. */
	1249	{
	1250	dVAR;
	1251	if (*p == '_')
	1252	return TRUE;
	1253	/* is_utf8_idstart would be more logical. */
	1254	return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
	1255	}
	1256
	1257	bool
	1258	Perl_is_utf8_idcont(pTHX_ const U8 *p)
	1259	{
	1260	dVAR;
	1261	if (*p == '_')
	1262	return TRUE;
	1263	return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
	1264	}
	1265
	1266	bool
	1267	Perl_is_utf8_alpha(pTHX_ const U8 *p)
	1268	{
	1269	dVAR;
	1270	return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
	1271	}
	1272
	1273	bool
	1274	Perl_is_utf8_ascii(pTHX_ const U8 *p)
	1275	{
	1276	dVAR;
	1277	return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
	1278	}
	1279
	1280	bool
	1281	Perl_is_utf8_space(pTHX_ const U8 *p)
	1282	{
	1283	dVAR;
	1284	return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
	1285	}
	1286
	1287	bool
	1288	Perl_is_utf8_digit(pTHX_ const U8 *p)
	1289	{
	1290	dVAR;
	1291	return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
	1292	}
	1293
	1294	bool
	1295	Perl_is_utf8_upper(pTHX_ const U8 *p)
	1296	{
	1297	dVAR;
	1298	return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
	1299	}
	1300
	1301	bool
	1302	Perl_is_utf8_lower(pTHX_ const U8 *p)
	1303	{
	1304	dVAR;
	1305	return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
	1306	}
	1307
	1308	bool
	1309	Perl_is_utf8_cntrl(pTHX_ const U8 *p)
	1310	{
	1311	dVAR;
	1312	return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
	1313	}
	1314
	1315	bool
	1316	Perl_is_utf8_graph(pTHX_ const U8 *p)
	1317	{
	1318	dVAR;
	1319	return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
	1320	}
	1321
	1322	bool
	1323	Perl_is_utf8_print(pTHX_ const U8 *p)
	1324	{
	1325	dVAR;
	1326	return is_utf8_common(p, &PL_utf8_print, "IsPrint");
	1327	}
	1328
	1329	bool
	1330	Perl_is_utf8_punct(pTHX_ const U8 *p)
	1331	{
	1332	dVAR;
	1333	return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
	1334	}
	1335
	1336	bool
	1337	Perl_is_utf8_xdigit(pTHX_ const U8 *p)
	1338	{
	1339	dVAR;
	1340	return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
	1341	}
	1342
	1343	bool
	1344	Perl_is_utf8_mark(pTHX_ const U8 *p)
	1345	{
	1346	dVAR;
	1347	return is_utf8_common(p, &PL_utf8_mark, "IsM");
	1348	}
	1349
	1350	/*
	1351	=for apidoc A\|UV\|to_utf8_case\|U8 p\|U8 ustrp\|STRLEN lenp\|SV swash\|char normal\|char *special
	1352
	1353	The "p" contains the pointer to the UTF-8 string encoding
	1354	the character that is being converted.
	1355
	1356	The "ustrp" is a pointer to the character buffer to put the
	1357	conversion result to. The "lenp" is a pointer to the length
	1358	of the result.
	1359
	1360	The "swashp" is a pointer to the swash to use.
	1361
	1362	Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
	1363	and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
	1364	but not always, a multicharacter mapping), is tried first.
	1365
	1366	The "special" is a string like "utf8::ToSpecLower", which means the
	1367	hash %utf8::ToSpecLower. The access to the hash is through
	1368	Perl_to_utf8_case().
	1369
	1370	The "normal" is a string like "ToLower" which means the swash
	1371	%utf8::ToLower.
	1372
	1373	=cut */
	1374
	1375	UV
	1376	Perl_to_utf8_case(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp,
	1377	SV *swashp, const char normal, const char *special)
	1378	{
	1379	dVAR;
	1380	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1381	STRLEN len = 0;
	1382
	1383	const UV uv0 = utf8_to_uvchr(p, NULL);
	1384	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
	1385	* are necessary in EBCDIC, they are redundant no-ops
	1386	* in ASCII-ish platforms, and hopefully optimized away. */
	1387	const UV uv1 = NATIVE_TO_UNI(uv0);
	1388	uvuni_to_utf8(tmpbuf, uv1);
	1389
	1390	if (!swashp) / load on-demand */
	1391	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	1392
	1393	/* The 0xDF is the only special casing Unicode code point below 0x100. */
	1394	if (special && (uv1 == 0xDF \|\| uv1 > 0xFF)) {
	1395	/* It might be "special" (sometimes, but not always,
	1396	* a multicharacter mapping) */
	1397	HV * const hv = get_hv(special, FALSE);
	1398	SV **svp;
	1399
	1400	if (hv &&
	1401	(svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
	1402	(*svp)) {
	1403	const char *s;
	1404
	1405	s = SvPV_const(*svp, len);
	1406	if (len == 1)
	1407	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
	1408	else {
	1409	#ifdef EBCDIC
	1410	/* If we have EBCDIC we need to remap the characters
	1411	* since any characters in the low 256 are Unicode
	1412	* code points, not EBCDIC. */
	1413	U8 t = (U8)s, tend = t + len, d;
	1414
	1415	d = tmpbuf;
	1416	if (SvUTF8(*svp)) {
	1417	STRLEN tlen = 0;
	1418
	1419	while (t < tend) {
	1420	const UV c = utf8_to_uvchr(t, &tlen);
	1421	if (tlen > 0) {
	1422	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	1423	t += tlen;
	1424	}
	1425	else
	1426	break;
	1427	}
	1428	}
	1429	else {
	1430	while (t < tend) {
	1431	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	1432	t++;
	1433	}
	1434	}
	1435	len = d - tmpbuf;
	1436	Copy(tmpbuf, ustrp, len, U8);
	1437	#else
	1438	Copy(s, ustrp, len, U8);
	1439	#endif
	1440	}
	1441	}
	1442	}
	1443
	1444	if (!len && *swashp) {
	1445	const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
	1446
	1447	if (uv2) {
	1448	/* It was "normal" (a single character mapping). */
	1449	const UV uv3 = UNI_TO_NATIVE(uv2);
	1450	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	1451	}
	1452	}
	1453
	1454	if (!len) /* Neither: just copy. */
	1455	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
	1456
	1457	if (lenp)
	1458	*lenp = len;
	1459
	1460	return len ? utf8_to_uvchr(ustrp, 0) : 0;
	1461	}
	1462
	1463	/*
	1464	=for apidoc A\|UV\|to_utf8_upper\|const U8 p\|U8 ustrp\|STRLEN *lenp
	1465
	1466	Convert the UTF-8 encoded character at p to its uppercase version and
	1467	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1468	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1469	the uppercase version may be longer than the original character.
	1470
	1471	The first character of the uppercased version is returned
	1472	(but note, as explained above, that there may be more.)
	1473
	1474	=cut */
	1475
	1476	UV
	1477	Perl_to_utf8_upper(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1478	{
	1479	dVAR;
	1480	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1481	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
	1482	}
	1483
	1484	/*
	1485	=for apidoc A\|UV\|to_utf8_title\|const U8 p\|U8 ustrp\|STRLEN *lenp
	1486
	1487	Convert the UTF-8 encoded character at p to its titlecase version and
	1488	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1489	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1490	titlecase version may be longer than the original character.
	1491
	1492	The first character of the titlecased version is returned
	1493	(but note, as explained above, that there may be more.)
	1494
	1495	=cut */
	1496
	1497	UV
	1498	Perl_to_utf8_title(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1499	{
	1500	dVAR;
	1501	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1502	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
	1503	}
	1504
	1505	/*
	1506	=for apidoc A\|UV\|to_utf8_lower\|const U8 p\|U8 ustrp\|STRLEN *lenp
	1507
	1508	Convert the UTF-8 encoded character at p to its lowercase version and
	1509	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1510	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1511	lowercase version may be longer than the original character.
	1512
	1513	The first character of the lowercased version is returned
	1514	(but note, as explained above, that there may be more.)
	1515
	1516	=cut */
	1517
	1518	UV
	1519	Perl_to_utf8_lower(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1520	{
	1521	dVAR;
	1522	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1523	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
	1524	}
	1525
	1526	/*
	1527	=for apidoc A\|UV\|to_utf8_fold\|const U8 p\|U8 ustrp\|STRLEN *lenp
	1528
	1529	Convert the UTF-8 encoded character at p to its foldcase version and
	1530	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1531	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1532	foldcase version may be longer than the original character (up to
	1533	three characters).
	1534
	1535	The first character of the foldcased version is returned
	1536	(but note, as explained above, that there may be more.)
	1537
	1538	=cut */
	1539
	1540	UV
	1541	Perl_to_utf8_fold(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1542	{
	1543	dVAR;
	1544	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1545	&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
	1546	}
	1547
	1548	/* Note:
	1549	* A "swash" is a swatch hash.
	1550	* A "swatch" is a bit vector generated by utf8.c:S_swash_get().
	1551	* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
	1552	* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
	1553	*/
	1554	SV*
	1555	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
	1556	{
	1557	dVAR;
	1558	SV* retval;
	1559	SV* const tokenbufsv = sv_newmortal();
	1560	dSP;
	1561	const size_t pkg_len = strlen(pkg);
	1562	const size_t name_len = strlen(name);
	1563	HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
	1564	SV* errsv_save;
	1565
	1566	PUSHSTACKi(PERLSI_MAGIC);
	1567	ENTER;
	1568	SAVEI32(PL_hints);
	1569	PL_hints = 0;
	1570	save_re_context();
	1571	if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
	1572	ENTER;
	1573	errsv_save = newSVsv(ERRSV);
	1574	/* It is assumed that callers of this routine are not passing in any
	1575	user derived data. */
	1576	/* Need to do this after save_re_context() as it will set PL_tainted to
	1577	1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
	1578	Even line to create errsv_save can turn on PL_tainted. */
	1579	SAVEBOOL(PL_tainted);
	1580	PL_tainted = 0;
	1581	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	1582	NULL);
	1583	if (!SvTRUE(ERRSV))
	1584	sv_setsv(ERRSV, errsv_save);
	1585	SvREFCNT_dec(errsv_save);
	1586	LEAVE;
	1587	}
	1588	SPAGAIN;
	1589	PUSHMARK(SP);
	1590	EXTEND(SP,5);
	1591	PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
	1592	PUSHs(sv_2mortal(newSVpvn(name, name_len)));
	1593	PUSHs(listsv);
	1594	PUSHs(sv_2mortal(newSViv(minbits)));
	1595	PUSHs(sv_2mortal(newSViv(none)));
	1596	PUTBACK;
	1597	if (IN_PERL_COMPILETIME) {
	1598	/* XXX ought to be handled by lex_start */
	1599	SAVEI32(PL_in_my);
	1600	PL_in_my = 0;
	1601	sv_setpv(tokenbufsv, PL_tokenbuf);
	1602	}
	1603	errsv_save = newSVsv(ERRSV);
	1604	if (call_method("SWASHNEW", G_SCALAR))
	1605	retval = newSVsv(*PL_stack_sp--);
	1606	else
	1607	retval = &PL_sv_undef;
	1608	if (!SvTRUE(ERRSV))
	1609	sv_setsv(ERRSV, errsv_save);
	1610	SvREFCNT_dec(errsv_save);
	1611	LEAVE;
	1612	POPSTACK;
	1613	if (IN_PERL_COMPILETIME) {
	1614	STRLEN len;
	1615	const char* const pv = SvPV_const(tokenbufsv, len);
	1616
	1617	Copy(pv, PL_tokenbuf, len+1, char);
	1618	CopHINTS_set(PL_curcop, PL_hints);
	1619	}
	1620	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	1621	if (SvPOK(retval))
	1622	Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
	1623	(void*)retval);
	1624	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	1625	}
	1626	return retval;
	1627	}
	1628
	1629
	1630	/* This API is wrong for special case conversions since we may need to
	1631	* return several Unicode characters for a single Unicode character
	1632	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	1633	* the lower-level routine, and it is similarly broken for returning
	1634	* multiple values. --jhi */
	1635	/* Now SWASHGET is recasted into S_swash_get in this file. */
	1636
	1637	/* Note:
	1638	* Returns the value of property/mapping C<swash> for the first character
	1639	* of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
	1640	* assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
	1641	* assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
	1642	*/
	1643	UV
	1644	Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
	1645	{
	1646	dVAR;
	1647	HV* const hv = (HV*)SvRV(swash);
	1648	U32 klen;
	1649	U32 off;
	1650	STRLEN slen;
	1651	STRLEN needents;
	1652	const U8 *tmps = NULL;
	1653	U32 bit;
	1654	SV *swatch;
	1655	U8 tmputf8[2];
	1656	const UV c = NATIVE_TO_ASCII(*ptr);
	1657
	1658	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	1659	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
	1660	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
	1661	ptr = tmputf8;
	1662	}
	1663	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	1664	* then the "swatch" is a vec() for al the chars which start
	1665	* with 0xAA..0xYY
	1666	* So the key in the hash (klen) is length of encoded char -1
	1667	*/
	1668	klen = UTF8SKIP(ptr) - 1;
	1669	off = ptr[klen];
	1670
	1671	if (klen == 0) {
	1672	/* If char in invariant then swatch is for all the invariant chars
	1673	* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
	1674	*/
	1675	needents = UTF_CONTINUATION_MARK;
	1676	off = NATIVE_TO_UTF(ptr[klen]);
	1677	}
	1678	else {
	1679	/* If char is encoded then swatch is for the prefix */
	1680	needents = (1 << UTF_ACCUMULATION_SHIFT);
	1681	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	1682	}
	1683
	1684	/*
	1685	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	1686	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	1687	* it's nothing to sniff at.) Pity we usually come through at least
	1688	* two function calls to get here...
	1689	*
	1690	* NB: this code assumes that swatches are never modified, once generated!
	1691	*/
	1692
	1693	if (hv == PL_last_swash_hv &&
	1694	klen == PL_last_swash_klen &&
	1695	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	1696	{
	1697	tmps = PL_last_swash_tmps;
	1698	slen = PL_last_swash_slen;
	1699	}
	1700	else {
	1701	/* Try our second-level swatch cache, kept in a hash. */
	1702	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	1703
	1704	/* If not cached, generate it via swash_get */
	1705	if (!svp \|\| !SvPOK(*svp)
	1706	\|\| !(tmps = (const U8)SvPV_const(svp, slen))) {
	1707	/* We use utf8n_to_uvuni() as we want an index into
	1708	Unicode tables, not a native character number.
	1709	*/
	1710	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
	1711	ckWARN(WARN_UTF8) ?
	1712	0 : UTF8_ALLOW_ANY);
	1713	swatch = swash_get(swash,
	1714	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	1715	(klen) ? (code_point & ~(needents - 1)) : 0,
	1716	needents);
	1717
	1718	if (IN_PERL_COMPILETIME)
	1719	CopHINTS_set(PL_curcop, PL_hints);
	1720
	1721	svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
	1722
	1723	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
	1724	\|\| (slen << 3) < needents)
	1725	Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
	1726	}
	1727
	1728	PL_last_swash_hv = hv;
	1729	PL_last_swash_klen = klen;
	1730	/* FIXME change interpvar.h? */
	1731	PL_last_swash_tmps = (U8 *) tmps;
	1732	PL_last_swash_slen = slen;
	1733	if (klen)
	1734	Copy(ptr, PL_last_swash_key, klen, U8);
	1735	}
	1736
	1737	switch ((int)((slen << 3) / needents)) {
	1738	case 1:
	1739	bit = 1 << (off & 7);
	1740	off >>= 3;
	1741	return (tmps[off] & bit) != 0;
	1742	case 8:
	1743	return tmps[off];
	1744	case 16:
	1745	off <<= 1;
	1746	return (tmps[off] << 8) + tmps[off + 1] ;
	1747	case 32:
	1748	off <<= 2;
	1749	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	1750	}
	1751	Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
	1752	NORETURN_FUNCTION_END;
	1753	}
	1754
	1755	/* Note:
	1756	* Returns a swatch (a bit vector string) for a code point sequence
	1757	* that starts from the value C<start> and comprises the number C<span>.
	1758	* A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
	1759	* Should be used via swash_fetch, which will cache the swatch in C<swash>.
	1760	*/
	1761	STATIC SV*
	1762	S_swash_get(pTHX_ SV* swash, UV start, UV span)
	1763	{
	1764	SV *swatch;
	1765	U8 l, lend, x, xend, *s;
	1766	STRLEN lcur, xcur, scur;
	1767
	1768	HV* const hv = (HV*)SvRV(swash);
	1769	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	1770	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	1771	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	1772	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	1773	SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	1774	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	1775	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
	1776	const STRLEN bits = SvUV(*bitssvp);
	1777	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	1778	const UV none = SvUV(*nonesvp);
	1779	const UV end = start + span;
	1780
	1781	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	1782	Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
	1783	(UV)bits);
	1784	}
	1785
	1786	/* create and initialize $swatch */
	1787	swatch = newSVpvs("");
	1788	scur = octets ? (span * octets) : (span + 7) / 8;
	1789	SvGROW(swatch, scur + 1);
	1790	s = (U8*)SvPVX(swatch);
	1791	if (octets && none) {
	1792	const U8* const e = s + scur;
	1793	while (s < e) {
	1794	if (bits == 8)
	1795	*s++ = (U8)(none & 0xff);
	1796	else if (bits == 16) {
	1797	*s++ = (U8)((none >> 8) & 0xff);
	1798	*s++ = (U8)( none & 0xff);
	1799	}
	1800	else if (bits == 32) {
	1801	*s++ = (U8)((none >> 24) & 0xff);
	1802	*s++ = (U8)((none >> 16) & 0xff);
	1803	*s++ = (U8)((none >> 8) & 0xff);
	1804	*s++ = (U8)( none & 0xff);
	1805	}
	1806	}
	1807	*s = '\0';
	1808	}
	1809	else {
	1810	(void)memzero((U8*)s, scur + 1);
	1811	}
	1812	SvCUR_set(swatch, scur);
	1813	s = (U8*)SvPVX(swatch);
	1814
	1815	/* read $swash->{LIST} */
	1816	l = (U8)SvPV(listsvp, lcur);
	1817	lend = l + lcur;
	1818	while (l < lend) {
	1819	UV min, max, val;
	1820	STRLEN numlen;
	1821	I32 flags = PERL_SCAN_SILENT_ILLDIGIT \| PERL_SCAN_DISALLOW_PREFIX;
	1822
	1823	U8* const nl = (U8*)memchr(l, '\n', lend - l);
	1824
	1825	numlen = lend - l;
	1826	min = grok_hex((char *)l, &numlen, &flags, NULL);
	1827	if (numlen)
	1828	l += numlen;
	1829	else if (nl) {
	1830	l = nl + 1; /* 1 is length of "\n" */
	1831	continue;
	1832	}
	1833	else {
	1834	l = lend; /* to LIST's end at which \n is not found */
	1835	break;
	1836	}
	1837
	1838	if (isBLANK(*l)) {
	1839	++l;
	1840	flags = PERL_SCAN_SILENT_ILLDIGIT \| PERL_SCAN_DISALLOW_PREFIX;
	1841	numlen = lend - l;
	1842	max = grok_hex((char *)l, &numlen, &flags, NULL);
	1843	if (numlen)
	1844	l += numlen;
	1845	else
	1846	max = min;
	1847
	1848	if (octets) {
	1849	if (isBLANK(*l)) {
	1850	++l;
	1851	flags = PERL_SCAN_SILENT_ILLDIGIT \|
	1852	PERL_SCAN_DISALLOW_PREFIX;
	1853	numlen = lend - l;
	1854	val = grok_hex((char *)l, &numlen, &flags, NULL);
	1855	if (numlen)
	1856	l += numlen;
	1857	else
	1858	val = 0;
	1859	}
	1860	else {
	1861	val = 0;
	1862	if (typeto) {
	1863	Perl_croak(aTHX_ "%s: illegal mapping '%s'",
	1864	typestr, l);
	1865	}
	1866	}
	1867	}
	1868	else
	1869	val = 0; /* bits == 1, then val should be ignored */
	1870	}
	1871	else {
	1872	max = min;
	1873	if (octets) {
	1874	val = 0;
	1875	if (typeto) {
	1876	Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
	1877	}
	1878	}
	1879	else
	1880	val = 0; /* bits == 1, then val should be ignored */
	1881	}
	1882
	1883	if (nl)
	1884	l = nl + 1;
	1885	else
	1886	l = lend;
	1887
	1888	if (max < start)
	1889	continue;
	1890
	1891	if (octets) {
	1892	UV key;
	1893	if (min < start) {
	1894	if (!none \|\| val < none) {
	1895	val += start - min;
	1896	}
	1897	min = start;
	1898	}
	1899	for (key = min; key <= max; key++) {
	1900	STRLEN offset;
	1901	if (key >= end)
	1902	goto go_out_list;
	1903	/* offset must be non-negative (start <= min <= key < end) */
	1904	offset = octets * (key - start);
	1905	if (bits == 8)
	1906	s[offset] = (U8)(val & 0xff);
	1907	else if (bits == 16) {
	1908	s[offset ] = (U8)((val >> 8) & 0xff);
	1909	s[offset + 1] = (U8)( val & 0xff);
	1910	}
	1911	else if (bits == 32) {
	1912	s[offset ] = (U8)((val >> 24) & 0xff);
	1913	s[offset + 1] = (U8)((val >> 16) & 0xff);
	1914	s[offset + 2] = (U8)((val >> 8) & 0xff);
	1915	s[offset + 3] = (U8)( val & 0xff);
	1916	}
	1917
	1918	if (!none \|\| val < none)
	1919	++val;
	1920	}
	1921	}
	1922	else { /* bits == 1, then val should be ignored */
	1923	UV key;
	1924	if (min < start)
	1925	min = start;
	1926	for (key = min; key <= max; key++) {
	1927	const STRLEN offset = (STRLEN)(key - start);
	1928	if (key >= end)
	1929	goto go_out_list;
	1930	s[offset >> 3] \|= 1 << (offset & 7);
	1931	}
	1932	}
	1933	} /* while */
	1934	go_out_list:
	1935
	1936	/* read $swash->{EXTRAS} */
	1937	x = (U8)SvPV(extssvp, xcur);
	1938	xend = x + xcur;
	1939	while (x < xend) {
	1940	STRLEN namelen;
	1941	U8 *namestr;
	1942	SV** othersvp;
	1943	HV* otherhv;
	1944	STRLEN otherbits;
	1945	SV *otherbitssvp, other;
	1946	U8 s, o, *nl;
	1947	STRLEN slen, olen;
	1948
	1949	const U8 opc = *x++;
	1950	if (opc == '\n')
	1951	continue;
	1952
	1953	nl = (U8*)memchr(x, '\n', xend - x);
	1954
	1955	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	1956	if (nl) {
	1957	x = nl + 1; /* 1 is length of "\n" */
	1958	continue;
	1959	}
	1960	else {
	1961	x = xend; /* to EXTRAS' end at which \n is not found */
	1962	break;
	1963	}
	1964	}
	1965
	1966	namestr = x;
	1967	if (nl) {
	1968	namelen = nl - namestr;
	1969	x = nl + 1;
	1970	}
	1971	else {
	1972	namelen = xend - namestr;
	1973	x = xend;
	1974	}
	1975
	1976	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	1977	otherhv = (HV)SvRV(othersvp);
	1978	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	1979	otherbits = (STRLEN)SvUV(*otherbitssvp);
	1980	if (bits < otherbits)
	1981	Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
	1982
	1983	/* The "other" swatch must be destroyed after. */
	1984	other = swash_get(*othersvp, start, span);
	1985	o = (U8*)SvPV(other, olen);
	1986
	1987	if (!olen)
	1988	Perl_croak(aTHX_ "panic: swash_get got improper swatch");
	1989
	1990	s = (U8*)SvPV(swatch, slen);
	1991	if (bits == 1 && otherbits == 1) {
	1992	if (slen != olen)
	1993	Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
	1994
	1995	switch (opc) {
	1996	case '+':
	1997	while (slen--)
	1998	s++ \|= o++;
	1999	break;
	2000	case '!':
	2001	while (slen--)
	2002	s++ \|= ~o++;
	2003	break;
	2004	case '-':
	2005	while (slen--)
	2006	s++ &= ~o++;
	2007	break;
	2008	case '&':
	2009	while (slen--)
	2010	s++ &= o++;
	2011	break;
	2012	default:
	2013	break;
	2014	}
	2015	}
	2016	else {
	2017	STRLEN otheroctets = otherbits >> 3;
	2018	STRLEN offset = 0;
	2019	U8* const send = s + slen;
	2020
	2021	while (s < send) {
	2022	UV otherval = 0;
	2023
	2024	if (otherbits == 1) {
	2025	otherval = (o[offset >> 3] >> (offset & 7)) & 1;
	2026	++offset;
	2027	}
	2028	else {
	2029	STRLEN vlen = otheroctets;
	2030	otherval = *o++;
	2031	while (--vlen) {
	2032	otherval <<= 8;
	2033	otherval \|= *o++;
	2034	}
	2035	}
	2036
	2037	if (opc == '+' && otherval)
	2038	NOOP; /* replace with otherval */
	2039	else if (opc == '!' && !otherval)
	2040	otherval = 1;
	2041	else if (opc == '-' && otherval)
	2042	otherval = 0;
	2043	else if (opc == '&' && !otherval)
	2044	otherval = 0;
	2045	else {
	2046	s += octets; /* no replacement */
	2047	continue;
	2048	}
	2049
	2050	if (bits == 8)
	2051	*s++ = (U8)( otherval & 0xff);
	2052	else if (bits == 16) {
	2053	*s++ = (U8)((otherval >> 8) & 0xff);
	2054	*s++ = (U8)( otherval & 0xff);
	2055	}
	2056	else if (bits == 32) {
	2057	*s++ = (U8)((otherval >> 24) & 0xff);
	2058	*s++ = (U8)((otherval >> 16) & 0xff);
	2059	*s++ = (U8)((otherval >> 8) & 0xff);
	2060	*s++ = (U8)( otherval & 0xff);
	2061	}
	2062	}
	2063	}
	2064	sv_free(other); /* through with it! */
	2065	} /* while */
	2066	return swatch;
	2067	}
	2068
	2069	/*
	2070	=for apidoc A\|U8 \|uvchr_to_utf8\|U8 d\|UV uv
	2071
	2072	Adds the UTF-8 representation of the Native codepoint C<uv> to the end
	2073	of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
	2074	bytes available. The return value is the pointer to the byte after the
	2075	end of the new character. In other words,
	2076
	2077	d = uvchr_to_utf8(d, uv);
	2078
	2079	is the recommended wide native character-aware way of saying
	2080
	2081	*(d++) = uv;
	2082
	2083	=cut
	2084	*/
	2085
	2086	/* On ASCII machines this is normally a macro but we want a
	2087	real function in case XS code wants it
	2088	*/
	2089	U8 *
	2090	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	2091	{
	2092	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
	2093	}
	2094
	2095	U8 *
	2096	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	2097	{
	2098	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
	2099	}
	2100
	2101	/*
	2102	=for apidoc A\|UV\|utf8n_to_uvchr\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32
	2103	flags
	2104
	2105	Returns the native character value of the first character in the string
	2106	C<s>
	2107	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	2108	length, in bytes, of that character.
	2109
	2110	Allows length and flags to be passed to low level routine.
	2111
	2112	=cut
	2113	*/
	2114	/* On ASCII machines this is normally a macro but we want
	2115	a real function in case XS code wants it
	2116	*/
	2117	UV
	2118	Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen,
	2119	U32 flags)
	2120	{
	2121	const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	2122	return UNI_TO_NATIVE(uv);
	2123	}
	2124
	2125	/*
	2126	=for apidoc A\|char \|pv_uni_display\|SV dsv\|U8 *spv\|STRLEN len\|STRLEN pvlim\|UV flags
	2127
	2128	Build to the scalar dsv a displayable version of the string spv,
	2129	length len, the displayable version being at most pvlim bytes long
	2130	(if longer, the rest is truncated and "..." will be appended).
	2131
	2132	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	2133	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	2134	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	2135	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	2136	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	2137	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	2138
	2139	The pointer to the PV of the dsv is returned.
	2140
	2141	=cut */
	2142	char *
	2143	Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	2144	{
	2145	int truncated = 0;
	2146	const char s, e;
	2147
	2148	sv_setpvn(dsv, "", 0);
	2149	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	2150	UV u;
	2151	/* This serves double duty as a flag and a character to print after
	2152	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	2153	*/
	2154	char ok = 0;
	2155
	2156	if (pvlim && SvCUR(dsv) >= pvlim) {
	2157	truncated++;
	2158	break;
	2159	}
	2160	u = utf8_to_uvchr((U8*)s, 0);
	2161	if (u < 256) {
	2162	const unsigned char c = (unsigned char)u & 0xFF;
	2163	if (flags & UNI_DISPLAY_BACKSLASH) {
	2164	switch (c) {
	2165	case '\n':
	2166	ok = 'n'; break;
	2167	case '\r':
	2168	ok = 'r'; break;
	2169	case '\t':
	2170	ok = 't'; break;
	2171	case '\f':
	2172	ok = 'f'; break;
	2173	case '\a':
	2174	ok = 'a'; break;
	2175	case '\\':
	2176	ok = '\\'; break;
	2177	default: break;
	2178	}
	2179	if (ok) {
	2180	Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
	2181	}
	2182	}
	2183	/* isPRINT() is the locale-blind version. */
	2184	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	2185	Perl_sv_catpvf(aTHX_ dsv, "%c", c);
	2186	ok = 1;
	2187	}
	2188	}
	2189	if (!ok)
	2190	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	2191	}
	2192	if (truncated)
	2193	sv_catpvs(dsv, "...");
	2194
	2195	return SvPVX(dsv);
	2196	}
	2197
	2198	/*
	2199	=for apidoc A\|char \|sv_uni_display\|SV dsv\|SV *ssv\|STRLEN pvlim\|UV flags
	2200
	2201	Build to the scalar dsv a displayable version of the scalar sv,
	2202	the displayable version being at most pvlim bytes long
	2203	(if longer, the rest is truncated and "..." will be appended).
	2204
	2205	The flags argument is as in pv_uni_display().
	2206
	2207	The pointer to the PV of the dsv is returned.
	2208
	2209	=cut
	2210	*/
	2211	char *
	2212	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	2213	{
	2214	return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
	2215	SvCUR(ssv), pvlim, flags);
	2216	}
	2217
	2218	/*
	2219	=for apidoc A\|I32\|ibcmp_utf8\|const char s1\|char pe1\|register UV l1\|bool u1\|const char s2\|char **pe2\|register UV l2\|bool u2
	2220
	2221	Return true if the strings s1 and s2 differ case-insensitively, false
	2222	if not (if they are equal case-insensitively). If u1 is true, the
	2223	string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
	2224	the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
	2225	are false, the respective string is assumed to be in native 8-bit
	2226	encoding.
	2227
	2228	If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
	2229	in there (they will point at the beginning of the I<next> character).
	2230	If the pointers behind pe1 or pe2 are non-NULL, they are the end
	2231	pointers beyond which scanning will not continue under any
	2232	circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
	2233	s2+l2 will be used as goal end pointers that will also stop the scan,
	2234	and which qualify towards defining a successful match: all the scans
	2235	that define an explicit length must reach their goal pointers for
	2236	a match to succeed).
	2237
	2238	For case-insensitiveness, the "casefolding" of Unicode is used
	2239	instead of upper/lowercasing both the characters, see
	2240	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	2241
	2242	=cut */
	2243	I32
	2244	Perl_ibcmp_utf8(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2)
	2245	{
	2246	dVAR;
	2247	register const U8 p1 = (const U8)s1;
	2248	register const U8 p2 = (const U8)s2;
	2249	register const U8 *f1 = NULL;
	2250	register const U8 *f2 = NULL;
	2251	register U8 *e1 = NULL;
	2252	register U8 *q1 = NULL;
	2253	register U8 *e2 = NULL;
	2254	register U8 *q2 = NULL;
	2255	STRLEN n1 = 0, n2 = 0;
	2256	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	2257	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	2258	U8 natbuf[1+1];
	2259	STRLEN foldlen1, foldlen2;
	2260	bool match;
	2261
	2262	if (pe1)
	2263	e1 = (U8*)pe1;
	2264	if (e1 == 0 \|\| (l1 && l1 < (UV)(e1 - (const U8*)s1)))
	2265	f1 = (const U8*)s1 + l1;
	2266	if (pe2)
	2267	e2 = (U8*)pe2;
	2268	if (e2 == 0 \|\| (l2 && l2 < (UV)(e2 - (const U8*)s2)))
	2269	f2 = (const U8*)s2 + l2;
	2270
	2271	if ((e1 == 0 && f1 == 0) \|\| (e2 == 0 && f2 == 0) \|\| (f1 == 0 && f2 == 0))
	2272	return 1; /* mismatch; possible infinite loop or false positive */
	2273
	2274	if (!u1 \|\| !u2)
	2275	natbuf[1] = 0; /* Need to terminate the buffer. */
	2276
	2277	while ((e1 == 0 \|\| p1 < e1) &&
	2278	(f1 == 0 \|\| p1 < f1) &&
	2279	(e2 == 0 \|\| p2 < e2) &&
	2280	(f2 == 0 \|\| p2 < f2)) {
	2281	if (n1 == 0) {
	2282	if (u1)
	2283	to_utf8_fold(p1, foldbuf1, &foldlen1);
	2284	else {
	2285	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
	2286	to_utf8_fold(natbuf, foldbuf1, &foldlen1);
	2287	}
	2288	q1 = foldbuf1;
	2289	n1 = foldlen1;
	2290	}
	2291	if (n2 == 0) {
	2292	if (u2)
	2293	to_utf8_fold(p2, foldbuf2, &foldlen2);
	2294	else {
	2295	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
	2296	to_utf8_fold(natbuf, foldbuf2, &foldlen2);
	2297	}
	2298	q2 = foldbuf2;
	2299	n2 = foldlen2;
	2300	}
	2301	while (n1 && n2) {
	2302	if ( UTF8SKIP(q1) != UTF8SKIP(q2) \|\|
	2303	(UTF8SKIP(q1) == 1 && q1 != q2) \|\|
	2304	memNE((char)q1, (char)q2, UTF8SKIP(q1)) )
	2305	return 1; /* mismatch */
	2306	n1 -= UTF8SKIP(q1);
	2307	q1 += UTF8SKIP(q1);
	2308	n2 -= UTF8SKIP(q2);
	2309	q2 += UTF8SKIP(q2);
	2310	}
	2311	if (n1 == 0)
	2312	p1 += u1 ? UTF8SKIP(p1) : 1;
	2313	if (n2 == 0)
	2314	p2 += u2 ? UTF8SKIP(p2) : 1;
	2315
	2316	}
	2317
	2318	/* A match is defined by all the scans that specified
	2319	* an explicit length reaching their final goals. */
	2320	match = (f1 == 0 \|\| p1 == f1) && (f2 == 0 \|\| p2 == f2);
	2321
	2322	if (match) {
	2323	if (pe1)
	2324	pe1 = (char)p1;
	2325	if (pe2)
	2326	pe2 = (char)p2;
	2327	}
	2328
	2329	return match ? 0 : 1; /* 0 match, 1 mismatch */
	2330	}
	2331
	2332	/*
	2333	* Local variables:
	2334	* c-indentation-style: bsd
	2335	* c-basic-offset: 4
	2336	* indent-tabs-mode: t
	2337	* End:
	2338	*
	2339	* ex: set ts=8 sts=4 sw=4 noet:
	2340	*/