perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
	4	* others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* 'Well do I understand your speech,' he answered in the same language;
	17	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	18	* as is the custom in the West, if you wish to be answered?'
	19	*
	20	* ...the travellers perceived that the floor was paved with stones of many
	21	* hues; branching runes and strange devices intertwined beneath their feet.
	22	*/
	23
	24	#include "EXTERN.h"
	25	#define PERL_IN_UTF8_C
	26	#include "perl.h"
	27
	28	static const char unees[] =
	29	"Malformed UTF-8 character (unexpected end of string)";
	30
	31	/*
	32	=head1 Unicode Support
	33
	34	This file contains various utility functions for manipulating UTF8-encoded
	35	strings. For the uninitiated, this is a method of representing arbitrary
	36	Unicode characters as a variable number of bytes, in such a way that
	37	characters in the ASCII range are unmodified, and a zero byte never appears
	38	within non-zero characters.
	39
	40	=for apidoc A\|U8 \|uvuni_to_utf8_flags\|U8 d\|UV uv\|UV flags
	41
	42	Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
	43	of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
	44	bytes available. The return value is the pointer to the byte after the
	45	end of the new character. In other words,
	46
	47	d = uvuni_to_utf8_flags(d, uv, flags);
	48
	49	or, in most cases,
	50
	51	d = uvuni_to_utf8(d, uv);
	52
	53	(which is equivalent to)
	54
	55	d = uvuni_to_utf8_flags(d, uv, 0);
	56
	57	is the recommended Unicode-aware way of saying
	58
	59	*(d++) = uv;
	60
	61	=cut
	62	*/
	63
	64	U8 *
	65	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	66	{
	67	if (ckWARN(WARN_UTF8)) {
	68	if (UNICODE_IS_SURROGATE(uv) &&
	69	!(flags & UNICODE_ALLOW_SURROGATE))
	70	Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
	71	else if (
	72	((uv >= 0xFDD0 && uv <= 0xFDEF &&
	73	!(flags & UNICODE_ALLOW_FDD0))
	74	\|\|
	75	((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
	76	!(flags & UNICODE_ALLOW_FFFF))) &&
	77	/* UNICODE_ALLOW_SUPER includes
	78	* FFFEs and FFFFs beyond 0x10FFFF. */
	79	((uv <= PERL_UNICODE_MAX) \|\|
	80	!(flags & UNICODE_ALLOW_SUPER))
	81	)
	82	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	83	"Unicode character 0x%04"UVxf" is illegal", uv);
	84	}
	85	if (UNI_IS_INVARIANT(uv)) {
	86	*d++ = (U8)UTF_TO_NATIVE(uv);
	87	return d;
	88	}
	89	#if defined(EBCDIC)
	90	else {
	91	STRLEN len = UNISKIP(uv);
	92	U8 *p = d+len-1;
	93	while (p > d) {
	94	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	95	uv >>= UTF_ACCUMULATION_SHIFT;
	96	}
	97	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	98	return d+len;
	99	}
	100	#else /* Non loop style */
	101	if (uv < 0x800) {
	102	*d++ = (U8)(( uv >> 6) \| 0xc0);
	103	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	104	return d;
	105	}
	106	if (uv < 0x10000) {
	107	*d++ = (U8)(( uv >> 12) \| 0xe0);
	108	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	109	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	110	return d;
	111	}
	112	if (uv < 0x200000) {
	113	*d++ = (U8)(( uv >> 18) \| 0xf0);
	114	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	115	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	116	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	117	return d;
	118	}
	119	if (uv < 0x4000000) {
	120	*d++ = (U8)(( uv >> 24) \| 0xf8);
	121	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	122	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	123	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	124	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	125	return d;
	126	}
	127	if (uv < 0x80000000) {
	128	*d++ = (U8)(( uv >> 30) \| 0xfc);
	129	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	130	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	131	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	132	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	133	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	134	return d;
	135	}
	136	#ifdef HAS_QUAD
	137	if (uv < UTF8_QUAD_MAX)
	138	#endif
	139	{
	140	d++ = 0xfe; / Can't match U+FEFF! */
	141	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	142	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	143	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	144	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	145	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	146	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	147	return d;
	148	}
	149	#ifdef HAS_QUAD
	150	{
	151	d++ = 0xff; / Can't match U+FFFE! */
	152	d++ = 0x80; / 6 Reserved bits */
	153	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	154	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
	155	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
	156	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
	157	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
	158	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	159	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	160	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	161	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	162	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	163	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	164	return d;
	165	}
	166	#endif
	167	#endif /* Loop style */
	168	}
	169
	170	/*
	171
	172	Tests if some arbitrary number of bytes begins in a valid UTF-8
	173	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	174	UTF-8 character. The actual number of bytes in the UTF-8 character
	175	will be returned if it is valid, otherwise 0.
	176
	177	This is the "slow" version as opposed to the "fast" version which is
	178	the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
	179	difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
	180	or less you should use the IS_UTF8_CHAR(), for lengths of five or more
	181	you should use the _slow(). In practice this means that the _slow()
	182	will be used very rarely, since the maximum Unicode code point (as of
	183	Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
	184	the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
	185	five bytes or more.
	186
	187	=cut */
	188	STATIC STRLEN
	189	S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
	190	{
	191	U8 u = *s;
	192	STRLEN slen;
	193	UV uv, ouv;
	194
	195	if (UTF8_IS_INVARIANT(u))
	196	return 1;
	197
	198	if (!UTF8_IS_START(u))
	199	return 0;
	200
	201	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	202	return 0;
	203
	204	slen = len - 1;
	205	s++;
	206	#ifdef EBCDIC
	207	u = NATIVE_TO_UTF(u);
	208	#endif
	209	u &= UTF_START_MASK(len);
	210	uv = u;
	211	ouv = uv;
	212	while (slen--) {
	213	if (!UTF8_IS_CONTINUATION(*s))
	214	return 0;
	215	uv = UTF8_ACCUMULATE(uv, *s);
	216	if (uv < ouv)
	217	return 0;
	218	ouv = uv;
	219	s++;
	220	}
	221
	222	if ((STRLEN)UNISKIP(uv) < len)
	223	return 0;
	224
	225	return len;
	226	}
	227
	228	/*
	229	=for apidoc A\|STRLEN\|is_utf8_char\|const U8 *s
	230
	231	Tests if some arbitrary number of bytes begins in a valid UTF-8
	232	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	233	UTF-8 character. The actual number of bytes in the UTF-8 character
	234	will be returned if it is valid, otherwise 0.
	235
	236	=cut */
	237	STRLEN
	238	Perl_is_utf8_char(pTHX_ const U8 *s)
	239	{
	240	const STRLEN len = UTF8SKIP(s);
	241	#ifdef IS_UTF8_CHAR
	242	if (IS_UTF8_CHAR_FAST(len))
	243	return IS_UTF8_CHAR(s, len) ? len : 0;
	244	#endif /* #ifdef IS_UTF8_CHAR */
	245	return is_utf8_char_slow(s, len);
	246	}
	247
	248	/*
	249	=for apidoc A\|bool\|is_utf8_string\|const U8 *s\|STRLEN len
	250
	251	Returns true if first C<len> bytes of the given string form a valid
	252	UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
	253	not mean 'a string that contains code points above 0x7F encoded in UTF-8'
	254	because a valid ASCII string is a valid UTF-8 string.
	255
	256	See also is_utf8_string_loclen() and is_utf8_string_loc().
	257
	258	=cut
	259	*/
	260
	261	bool
	262	Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
	263	{
	264	const U8* x = s;
	265	const U8* send;
	266
	267	if (!len)
	268	len = strlen((const char *)s);
	269	send = s + len;
	270
	271	while (x < send) {
	272	STRLEN c;
	273	/* Inline the easy bits of is_utf8_char() here for speed... */
	274	if (UTF8_IS_INVARIANT(*x))
	275	c = 1;
	276	else if (!UTF8_IS_START(*x))
	277	goto out;
	278	else {
	279	/* ... and call is_utf8_char() only if really needed. */
	280	#ifdef IS_UTF8_CHAR
	281	c = UTF8SKIP(x);
	282	if (IS_UTF8_CHAR_FAST(c)) {
	283	if (!IS_UTF8_CHAR(x, c))
	284	goto out;
	285	} else if (!is_utf8_char_slow(x, c))
	286	goto out;
	287	#else
	288	c = is_utf8_char(x);
	289	#endif /* #ifdef IS_UTF8_CHAR */
	290	if (!c)
	291	goto out;
	292	}
	293	x += c;
	294	}
	295
	296	out:
	297	if (x != send)
	298	return FALSE;
	299
	300	return TRUE;
	301	}
	302
	303	/*
	304	Implemented as a macro in utf8.h
	305
	306	=for apidoc A\|bool\|is_utf8_string_loc\|const U8 s\|STRLEN len\|const U8 *ep
	307
	308	Like is_utf8_string() but stores the location of the failure (in the
	309	case of "utf8ness failure") or the location s+len (in the case of
	310	"utf8ness success") in the C<ep>.
	311
	312	See also is_utf8_string_loclen() and is_utf8_string().
	313
	314	=for apidoc A\|bool\|is_utf8_string_loclen\|const U8 s\|STRLEN len\|const U8 ep\|const STRLEN el
	315
	316	Like is_utf8_string() but stores the location of the failure (in the
	317	case of "utf8ness failure") or the location s+len (in the case of
	318	"utf8ness success") in the C<ep>, and the number of UTF-8
	319	encoded characters in the C<el>.
	320
	321	See also is_utf8_string_loc() and is_utf8_string().
	322
	323	=cut
	324	*/
	325
	326	bool
	327	Perl_is_utf8_string_loclen(pTHX_ const U8 s, STRLEN len, const U8 ep, STRLEN el)
	328	{
	329	const U8* x = s;
	330	const U8* send;
	331	STRLEN c;
	332
	333	if (!len)
	334	len = strlen((const char *)s);
	335	send = s + len;
	336	if (el)
	337	*el = 0;
	338
	339	while (x < send) {
	340	/* Inline the easy bits of is_utf8_char() here for speed... */
	341	if (UTF8_IS_INVARIANT(*x))
	342	c = 1;
	343	else if (!UTF8_IS_START(*x))
	344	goto out;
	345	else {
	346	/* ... and call is_utf8_char() only if really needed. */
	347	#ifdef IS_UTF8_CHAR
	348	c = UTF8SKIP(x);
	349	if (IS_UTF8_CHAR_FAST(c)) {
	350	if (!IS_UTF8_CHAR(x, c))
	351	c = 0;
	352	} else
	353	c = is_utf8_char_slow(x, c);
	354	#else
	355	c = is_utf8_char(x);
	356	#endif /* #ifdef IS_UTF8_CHAR */
	357	if (!c)
	358	goto out;
	359	}
	360	x += c;
	361	if (el)
	362	(*el)++;
	363	}
	364
	365	out:
	366	if (ep)
	367	*ep = x;
	368	if (x != send)
	369	return FALSE;
	370
	371	return TRUE;
	372	}
	373
	374	/*
	375
	376	=for apidoc A\|UV\|utf8n_to_uvuni\|const U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
	377
	378	Bottom level UTF-8 decode routine.
	379	Returns the unicode code point value of the first character in the string C<s>
	380	which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
	381	C<retlen> will be set to the length, in bytes, of that character.
	382
	383	If C<s> does not point to a well-formed UTF-8 character, the behaviour
	384	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
	385	it is assumed that the caller will raise a warning, and this function
	386	will silently just set C<retlen> to C<-1> and return zero. If the
	387	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
	388	malformations will be given, C<retlen> will be set to the expected
	389	length of the UTF-8 character in bytes, and zero will be returned.
	390
	391	The C<flags> can also contain various flags to allow deviations from
	392	the strict UTF-8 encoding (see F<utf8.h>).
	393
	394	Most code should use utf8_to_uvchr() rather than call this directly.
	395
	396	=cut
	397	*/
	398
	399	UV
	400	Perl_utf8n_to_uvuni(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	401	{
	402	const U8 *s0 = s;
	403	UV uv = *s, ouv = 0;
	404	STRLEN len = 1;
	405	const bool dowarn = ckWARN_d(WARN_UTF8);
	406	const UV startbyte = *s;
	407	STRLEN expectlen = 0;
	408	U32 warning = 0;
	409
	410	/* This list is a superset of the UTF8_ALLOW_XXX. */
	411
	412	#define UTF8_WARN_EMPTY 1
	413	#define UTF8_WARN_CONTINUATION 2
	414	#define UTF8_WARN_NON_CONTINUATION 3
	415	#define UTF8_WARN_FE_FF 4
	416	#define UTF8_WARN_SHORT 5
	417	#define UTF8_WARN_OVERFLOW 6
	418	#define UTF8_WARN_SURROGATE 7
	419	#define UTF8_WARN_LONG 8
	420	#define UTF8_WARN_FFFF 9 /* Also FFFE. */
	421
	422	if (curlen == 0 &&
	423	!(flags & UTF8_ALLOW_EMPTY)) {
	424	warning = UTF8_WARN_EMPTY;
	425	goto malformed;
	426	}
	427
	428	if (UTF8_IS_INVARIANT(uv)) {
	429	if (retlen)
	430	*retlen = 1;
	431	return (UV) (NATIVE_TO_UTF(*s));
	432	}
	433
	434	if (UTF8_IS_CONTINUATION(uv) &&
	435	!(flags & UTF8_ALLOW_CONTINUATION)) {
	436	warning = UTF8_WARN_CONTINUATION;
	437	goto malformed;
	438	}
	439
	440	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	441	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	442	warning = UTF8_WARN_NON_CONTINUATION;
	443	goto malformed;
	444	}
	445
	446	#ifdef EBCDIC
	447	uv = NATIVE_TO_UTF(uv);
	448	#else
	449	if ((uv == 0xfe \|\| uv == 0xff) &&
	450	!(flags & UTF8_ALLOW_FE_FF)) {
	451	warning = UTF8_WARN_FE_FF;
	452	goto malformed;
	453	}
	454	#endif
	455
	456	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	457	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	458	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	459	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	460	#ifdef EBCDIC
	461	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	462	else { len = 7; uv &= 0x01; }
	463	#else
	464	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	465	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	466	else { len = 13; uv = 0; } /* whoa! */
	467	#endif
	468
	469	if (retlen)
	470	*retlen = len;
	471
	472	expectlen = len;
	473
	474	if ((curlen < expectlen) &&
	475	!(flags & UTF8_ALLOW_SHORT)) {
	476	warning = UTF8_WARN_SHORT;
	477	goto malformed;
	478	}
	479
	480	len--;
	481	s++;
	482	ouv = uv;
	483
	484	while (len--) {
	485	if (!UTF8_IS_CONTINUATION(*s) &&
	486	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	487	s--;
	488	warning = UTF8_WARN_NON_CONTINUATION;
	489	goto malformed;
	490	}
	491	else
	492	uv = UTF8_ACCUMULATE(uv, *s);
	493	if (!(uv > ouv)) {
	494	/* These cannot be allowed. */
	495	if (uv == ouv) {
	496	if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
	497	warning = UTF8_WARN_LONG;
	498	goto malformed;
	499	}
	500	}
	501	else { /* uv < ouv */
	502	/* This cannot be allowed. */
	503	warning = UTF8_WARN_OVERFLOW;
	504	goto malformed;
	505	}
	506	}
	507	s++;
	508	ouv = uv;
	509	}
	510
	511	if (UNICODE_IS_SURROGATE(uv) &&
	512	!(flags & UTF8_ALLOW_SURROGATE)) {
	513	warning = UTF8_WARN_SURROGATE;
	514	goto malformed;
	515	} else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
	516	!(flags & UTF8_ALLOW_LONG)) {
	517	warning = UTF8_WARN_LONG;
	518	goto malformed;
	519	} else if (UNICODE_IS_ILLEGAL(uv) &&
	520	!(flags & UTF8_ALLOW_FFFF)) {
	521	warning = UTF8_WARN_FFFF;
	522	goto malformed;
	523	}
	524
	525	return uv;
	526
	527	malformed:
	528
	529	if (flags & UTF8_CHECK_ONLY) {
	530	if (retlen)
	531	*retlen = -1;
	532	return 0;
	533	}
	534
	535	if (dowarn) {
	536	SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
	537
	538	switch (warning) {
	539	case 0: /* Intentionally empty. */ break;
	540	case UTF8_WARN_EMPTY:
	541	Perl_sv_catpv(aTHX_ sv, "(empty string)");
	542	break;
	543	case UTF8_WARN_CONTINUATION:
	544	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	545	break;
	546	case UTF8_WARN_NON_CONTINUATION:
	547	if (s == s0)
	548	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	549	(UV)s[1], startbyte);
	550	else {
	551	const int len = (int)(s-s0);
	552	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	553	(UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
	554	}
	555
	556	break;
	557	case UTF8_WARN_FE_FF:
	558	Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
	559	break;
	560	case UTF8_WARN_SHORT:
	561	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	562	(int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
	563	expectlen = curlen; /* distance for caller to skip */
	564	break;
	565	case UTF8_WARN_OVERFLOW:
	566	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	567	ouv, *s, startbyte);
	568	break;
	569	case UTF8_WARN_SURROGATE:
	570	Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
	571	break;
	572	case UTF8_WARN_LONG:
	573	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	574	(int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	575	break;
	576	case UTF8_WARN_FFFF:
	577	Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
	578	break;
	579	default:
	580	Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
	581	break;
	582	}
	583
	584	if (warning) {
	585	const char * const s = SvPVX_const(sv);
	586
	587	if (PL_op)
	588	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	589	"%s in %s", s, OP_DESC(PL_op));
	590	else
	591	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
	592	}
	593	}
	594
	595	if (retlen)
	596	*retlen = expectlen ? expectlen : len;
	597
	598	return 0;
	599	}
	600
	601	/*
	602	=for apidoc A\|UV\|utf8_to_uvchr\|const U8 s\|STRLEN retlen
	603
	604	Returns the native character value of the first character in the string C<s>
	605	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	606	length, in bytes, of that character.
	607
	608	If C<s> does not point to a well-formed UTF-8 character, zero is
	609	returned and retlen is set, if possible, to -1.
	610
	611	=cut
	612	*/
	613
	614	UV
	615	Perl_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
	616	{
	617	return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
	618	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	619	}
	620
	621	/*
	622	=for apidoc A\|UV\|utf8_to_uvuni\|const U8 s\|STRLEN retlen
	623
	624	Returns the Unicode code point of the first character in the string C<s>
	625	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	626	length, in bytes, of that character.
	627
	628	This function should only be used when returned UV is considered
	629	an index into the Unicode semantic tables (e.g. swashes).
	630
	631	If C<s> does not point to a well-formed UTF-8 character, zero is
	632	returned and retlen is set, if possible, to -1.
	633
	634	=cut
	635	*/
	636
	637	UV
	638	Perl_utf8_to_uvuni(pTHX_ const U8 s, STRLEN retlen)
	639	{
	640	/* Call the low level routine asking for checks */
	641	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
	642	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	643	}
	644
	645	/*
	646	=for apidoc A\|STRLEN\|utf8_length\|const U8 s\|const U8 e
	647
	648	Return the length of the UTF-8 char encoded string C<s> in characters.
	649	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	650	up past C<e>, croaks.
	651
	652	=cut
	653	*/
	654
	655	STRLEN
	656	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	657	{
	658	STRLEN len = 0;
	659
	660	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	661	* the bitops (especially ~) can create illegal UTF-8.
	662	* In other words: in Perl UTF-8 is not just for Unicode. */
	663
	664	if (e < s)
	665	goto warn_and_return;
	666	while (s < e) {
	667	const U8 t = UTF8SKIP(s);
	668	if (e - s < t) {
	669	warn_and_return:
	670	if (ckWARN_d(WARN_UTF8)) {
	671	if (PL_op)
	672	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	673	"%s in %s", unees, OP_DESC(PL_op));
	674	else
	675	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	676	}
	677	return len;
	678	}
	679	s += t;
	680	len++;
	681	}
	682
	683	return len;
	684	}
	685
	686	/*
	687	=for apidoc A\|IV\|utf8_distance\|const U8 a\|const U8 b
	688
	689	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	690	and C<b>.
	691
	692	WARNING: use only if you know that the pointers point inside the
	693	same UTF-8 buffer.
	694
	695	=cut
	696	*/
	697
	698	IV
	699	Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
	700	{
	701	IV off = 0;
	702
	703	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	704	* the bitops (especially ~) can create illegal UTF-8.
	705	* In other words: in Perl UTF-8 is not just for Unicode. */
	706
	707	if (a < b) {
	708	while (a < b) {
	709	const U8 c = UTF8SKIP(a);
	710	if (b - a < c)
	711	goto warn_and_return;
	712	a += c;
	713	off--;
	714	}
	715	}
	716	else {
	717	while (b < a) {
	718	const U8 c = UTF8SKIP(b);
	719
	720	if (a - b < c) {
	721	warn_and_return:
	722	if (ckWARN_d(WARN_UTF8)) {
	723	if (PL_op)
	724	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	725	"%s in %s", unees, OP_DESC(PL_op));
	726	else
	727	Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
	728	}
	729	return off;
	730	}
	731	b += c;
	732	off++;
	733	}
	734	}
	735
	736	return off;
	737	}
	738
	739	/*
	740	=for apidoc A\|U8 \|utf8_hop\|U8 s\|I32 off
	741
	742	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	743	forward or backward.
	744
	745	WARNING: do not use the following unless you know C<off> is within
	746	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	747	on the first byte of character or just after the last byte of a character.
	748
	749	=cut
	750	*/
	751
	752	U8 *
	753	Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
	754	{
	755	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	756	* the bitops (especially ~) can create illegal UTF-8.
	757	* In other words: in Perl UTF-8 is not just for Unicode. */
	758
	759	if (off >= 0) {
	760	while (off--)
	761	s += UTF8SKIP(s);
	762	}
	763	else {
	764	while (off++) {
	765	s--;
	766	while (UTF8_IS_CONTINUATION(*s))
	767	s--;
	768	}
	769	}
	770	return (U8 *)s;
	771	}
	772
	773	/*
	774	=for apidoc A\|U8 \|utf8_to_bytes\|U8 s\|STRLEN *len
	775
	776	Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
	777	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	778	updates len to contain the new length.
	779	Returns zero on failure, setting C<len> to -1.
	780
	781	=cut
	782	*/
	783
	784	U8 *
	785	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	786	{
	787	U8 *send;
	788	U8 *d;
	789	U8 *save = s;
	790
	791	/* ensure valid UTF-8 and chars < 256 before updating string */
	792	for (send = s + *len; s < send; ) {
	793	U8 c = *s++;
	794
	795	if (!UTF8_IS_INVARIANT(c) &&
	796	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	797	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	798	*len = -1;
	799	return 0;
	800	}
	801	}
	802
	803	d = s = save;
	804	while (s < send) {
	805	STRLEN ulen;
	806	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	807	s += ulen;
	808	}
	809	*d = '\0';
	810	*len = d - save;
	811	return save;
	812	}
	813
	814	/*
	815	=for apidoc A\|U8 \|bytes_from_utf8\|const U8 s\|STRLEN len\|bool is_utf8
	816
	817	Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
	818	Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	819	the newly-created string, and updates C<len> to contain the new
	820	length. Returns the original string if no conversion occurs, C<len>
	821	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	822	0 if C<s> is converted or contains all 7bit characters.
	823
	824	=cut
	825	*/
	826
	827	U8 *
	828	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN len, bool *is_utf8)
	829	{
	830	U8 *d;
	831	const U8 *start = s;
	832	const U8 *send;
	833	I32 count = 0;
	834
	835	if (!*is_utf8)
	836	return (U8 *)start;
	837
	838	/* ensure valid UTF-8 and chars < 256 before converting string */
	839	for (send = s + *len; s < send;) {
	840	U8 c = *s++;
	841	if (!UTF8_IS_INVARIANT(c)) {
	842	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	843	(c = *s++) && UTF8_IS_CONTINUATION(c))
	844	count++;
	845	else
	846	return (U8 *)start;
	847	}
	848	}
	849
	850	*is_utf8 = 0;
	851
	852	Newxz(d, (*len) - count + 1, U8);
	853	s = start; start = d;
	854	while (s < send) {
	855	U8 c = *s++;
	856	if (!UTF8_IS_INVARIANT(c)) {
	857	/* Then it is two-byte encoded */
	858	c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
	859	c = ASCII_TO_NATIVE(c);
	860	}
	861	*d++ = c;
	862	}
	863	*d = '\0';
	864	*len = d - start;
	865	return (U8 *)start;
	866	}
	867
	868	/*
	869	=for apidoc A\|U8 \|bytes_to_utf8\|const U8 s\|STRLEN *len
	870
	871	Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
	872	Returns a pointer to the newly-created string, and sets C<len> to
	873	reflect the new length.
	874
	875	If you want to convert to UTF-8 from other encodings than ASCII,
	876	see sv_recode_to_utf8().
	877
	878	=cut
	879	*/
	880
	881	U8*
	882	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN len)
	883	{
	884	const U8 * const send = s + (*len);
	885	U8 *d;
	886	U8 *dst;
	887
	888	Newxz(d, (len) 2 + 1, U8);
	889	dst = d;
	890
	891	while (s < send) {
	892	const UV uv = NATIVE_TO_ASCII(*s++);
	893	if (UNI_IS_INVARIANT(uv))
	894	*d++ = (U8)UTF_TO_NATIVE(uv);
	895	else {
	896	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
	897	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
	898	}
	899	}
	900	*d = '\0';
	901	*len = d-dst;
	902	return dst;
	903	}
	904
	905	/*
	906	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	907	*
	908	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	909	* We optimize for native, for obvious reasons. */
	910
	911	U8*
	912	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	913	{
	914	U8* pend;
	915	U8* dstart = d;
	916
	917	if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
	918	d[0] = 0;
	919	*newlen = 1;
	920	return d;
	921	}
	922
	923	if (bytelen & 1)
	924	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
	925
	926	pend = p + bytelen;
	927
	928	while (p < pend) {
	929	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	930	p += 2;
	931	if (uv < 0x80) {
	932	*d++ = (U8)uv;
	933	continue;
	934	}
	935	if (uv < 0x800) {
	936	*d++ = (U8)(( uv >> 6) \| 0xc0);
	937	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	938	continue;
	939	}
	940	if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
	941	UV low = (p[0] << 8) + p[1];
	942	p += 2;
	943	if (low < 0xdc00 \|\| low >= 0xdfff)
	944	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	945	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	946	}
	947	if (uv < 0x10000) {
	948	*d++ = (U8)(( uv >> 12) \| 0xe0);
	949	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	950	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	951	continue;
	952	}
	953	else {
	954	*d++ = (U8)(( uv >> 18) \| 0xf0);
	955	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	956	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	957	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	958	continue;
	959	}
	960	}
	961	*newlen = d - dstart;
	962	return d;
	963	}
	964
	965	/* Note: this one is slightly destructive of the source. */
	966
	967	U8*
	968	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	969	{
	970	U8* s = (U8*)p;
	971	U8* send = s + bytelen;
	972	while (s < send) {
	973	U8 tmp = s[0];
	974	s[0] = s[1];
	975	s[1] = tmp;
	976	s += 2;
	977	}
	978	return utf16_to_utf8(p, d, bytelen, newlen);
	979	}
	980
	981	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	982
	983	bool
	984	Perl_is_uni_alnum(pTHX_ UV c)
	985	{
	986	U8 tmpbuf[UTF8_MAXBYTES+1];
	987	uvchr_to_utf8(tmpbuf, c);
	988	return is_utf8_alnum(tmpbuf);
	989	}
	990
	991	bool
	992	Perl_is_uni_alnumc(pTHX_ UV c)
	993	{
	994	U8 tmpbuf[UTF8_MAXBYTES+1];
	995	uvchr_to_utf8(tmpbuf, c);
	996	return is_utf8_alnumc(tmpbuf);
	997	}
	998
	999	bool
	1000	Perl_is_uni_idfirst(pTHX_ UV c)
	1001	{
	1002	U8 tmpbuf[UTF8_MAXBYTES+1];
	1003	uvchr_to_utf8(tmpbuf, c);
	1004	return is_utf8_idfirst(tmpbuf);
	1005	}
	1006
	1007	bool
	1008	Perl_is_uni_alpha(pTHX_ UV c)
	1009	{
	1010	U8 tmpbuf[UTF8_MAXBYTES+1];
	1011	uvchr_to_utf8(tmpbuf, c);
	1012	return is_utf8_alpha(tmpbuf);
	1013	}
	1014
	1015	bool
	1016	Perl_is_uni_ascii(pTHX_ UV c)
	1017	{
	1018	U8 tmpbuf[UTF8_MAXBYTES+1];
	1019	uvchr_to_utf8(tmpbuf, c);
	1020	return is_utf8_ascii(tmpbuf);
	1021	}
	1022
	1023	bool
	1024	Perl_is_uni_space(pTHX_ UV c)
	1025	{
	1026	U8 tmpbuf[UTF8_MAXBYTES+1];
	1027	uvchr_to_utf8(tmpbuf, c);
	1028	return is_utf8_space(tmpbuf);
	1029	}
	1030
	1031	bool
	1032	Perl_is_uni_digit(pTHX_ UV c)
	1033	{
	1034	U8 tmpbuf[UTF8_MAXBYTES+1];
	1035	uvchr_to_utf8(tmpbuf, c);
	1036	return is_utf8_digit(tmpbuf);
	1037	}
	1038
	1039	bool
	1040	Perl_is_uni_upper(pTHX_ UV c)
	1041	{
	1042	U8 tmpbuf[UTF8_MAXBYTES+1];
	1043	uvchr_to_utf8(tmpbuf, c);
	1044	return is_utf8_upper(tmpbuf);
	1045	}
	1046
	1047	bool
	1048	Perl_is_uni_lower(pTHX_ UV c)
	1049	{
	1050	U8 tmpbuf[UTF8_MAXBYTES+1];
	1051	uvchr_to_utf8(tmpbuf, c);
	1052	return is_utf8_lower(tmpbuf);
	1053	}
	1054
	1055	bool
	1056	Perl_is_uni_cntrl(pTHX_ UV c)
	1057	{
	1058	U8 tmpbuf[UTF8_MAXBYTES+1];
	1059	uvchr_to_utf8(tmpbuf, c);
	1060	return is_utf8_cntrl(tmpbuf);
	1061	}
	1062
	1063	bool
	1064	Perl_is_uni_graph(pTHX_ UV c)
	1065	{
	1066	U8 tmpbuf[UTF8_MAXBYTES+1];
	1067	uvchr_to_utf8(tmpbuf, c);
	1068	return is_utf8_graph(tmpbuf);
	1069	}
	1070
	1071	bool
	1072	Perl_is_uni_print(pTHX_ UV c)
	1073	{
	1074	U8 tmpbuf[UTF8_MAXBYTES+1];
	1075	uvchr_to_utf8(tmpbuf, c);
	1076	return is_utf8_print(tmpbuf);
	1077	}
	1078
	1079	bool
	1080	Perl_is_uni_punct(pTHX_ UV c)
	1081	{
	1082	U8 tmpbuf[UTF8_MAXBYTES+1];
	1083	uvchr_to_utf8(tmpbuf, c);
	1084	return is_utf8_punct(tmpbuf);
	1085	}
	1086
	1087	bool
	1088	Perl_is_uni_xdigit(pTHX_ UV c)
	1089	{
	1090	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1091	uvchr_to_utf8(tmpbuf, c);
	1092	return is_utf8_xdigit(tmpbuf);
	1093	}
	1094
	1095	UV
	1096	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	1097	{
	1098	uvchr_to_utf8(p, c);
	1099	return to_utf8_upper(p, p, lenp);
	1100	}
	1101
	1102	UV
	1103	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	1104	{
	1105	uvchr_to_utf8(p, c);
	1106	return to_utf8_title(p, p, lenp);
	1107	}
	1108
	1109	UV
	1110	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1111	{
	1112	uvchr_to_utf8(p, c);
	1113	return to_utf8_lower(p, p, lenp);
	1114	}
	1115
	1116	UV
	1117	Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
	1118	{
	1119	uvchr_to_utf8(p, c);
	1120	return to_utf8_fold(p, p, lenp);
	1121	}
	1122
	1123	/* for now these all assume no locale info available for Unicode > 255 */
	1124
	1125	bool
	1126	Perl_is_uni_alnum_lc(pTHX_ UV c)
	1127	{
	1128	return is_uni_alnum(c); /* XXX no locale support yet */
	1129	}
	1130
	1131	bool
	1132	Perl_is_uni_alnumc_lc(pTHX_ UV c)
	1133	{
	1134	return is_uni_alnumc(c); /* XXX no locale support yet */
	1135	}
	1136
	1137	bool
	1138	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1139	{
	1140	return is_uni_idfirst(c); /* XXX no locale support yet */
	1141	}
	1142
	1143	bool
	1144	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1145	{
	1146	return is_uni_alpha(c); /* XXX no locale support yet */
	1147	}
	1148
	1149	bool
	1150	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1151	{
	1152	return is_uni_ascii(c); /* XXX no locale support yet */
	1153	}
	1154
	1155	bool
	1156	Perl_is_uni_space_lc(pTHX_ UV c)
	1157	{
	1158	return is_uni_space(c); /* XXX no locale support yet */
	1159	}
	1160
	1161	bool
	1162	Perl_is_uni_digit_lc(pTHX_ UV c)
	1163	{
	1164	return is_uni_digit(c); /* XXX no locale support yet */
	1165	}
	1166
	1167	bool
	1168	Perl_is_uni_upper_lc(pTHX_ UV c)
	1169	{
	1170	return is_uni_upper(c); /* XXX no locale support yet */
	1171	}
	1172
	1173	bool
	1174	Perl_is_uni_lower_lc(pTHX_ UV c)
	1175	{
	1176	return is_uni_lower(c); /* XXX no locale support yet */
	1177	}
	1178
	1179	bool
	1180	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1181	{
	1182	return is_uni_cntrl(c); /* XXX no locale support yet */
	1183	}
	1184
	1185	bool
	1186	Perl_is_uni_graph_lc(pTHX_ UV c)
	1187	{
	1188	return is_uni_graph(c); /* XXX no locale support yet */
	1189	}
	1190
	1191	bool
	1192	Perl_is_uni_print_lc(pTHX_ UV c)
	1193	{
	1194	return is_uni_print(c); /* XXX no locale support yet */
	1195	}
	1196
	1197	bool
	1198	Perl_is_uni_punct_lc(pTHX_ UV c)
	1199	{
	1200	return is_uni_punct(c); /* XXX no locale support yet */
	1201	}
	1202
	1203	bool
	1204	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1205	{
	1206	return is_uni_xdigit(c); /* XXX no locale support yet */
	1207	}
	1208
	1209	U32
	1210	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1211	{
	1212	/* XXX returns only the first character -- do not use XXX */
	1213	/* XXX no locale support yet */
	1214	STRLEN len;
	1215	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1216	return (U32)to_uni_upper(c, tmpbuf, &len);
	1217	}
	1218
	1219	U32
	1220	Perl_to_uni_title_lc(pTHX_ U32 c)
	1221	{
	1222	/* XXX returns only the first character XXX -- do not use XXX */
	1223	/* XXX no locale support yet */
	1224	STRLEN len;
	1225	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1226	return (U32)to_uni_title(c, tmpbuf, &len);
	1227	}
	1228
	1229	U32
	1230	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1231	{
	1232	/* XXX returns only the first character -- do not use XXX */
	1233	/* XXX no locale support yet */
	1234	STRLEN len;
	1235	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1236	return (U32)to_uni_lower(c, tmpbuf, &len);
	1237	}
	1238
	1239	static bool
	1240	S_is_utf8_common(pTHX_ const U8 const p, SV *swash,
	1241	const char *const swashname)
	1242	{
	1243	if (!is_utf8_char(p))
	1244	return FALSE;
	1245	if (!*swash)
	1246	*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
	1247	return swash_fetch(*swash, p, TRUE) != 0;
	1248	}
	1249
	1250	bool
	1251	Perl_is_utf8_alnum(pTHX_ const U8 *p)
	1252	{
	1253	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1254	* descendant of isalnum(3), in other words, it doesn't
	1255	* contain the '_'. --jhi */
	1256	return S_is_utf8_common(aTHX_ p, &PL_utf8_alnum, "IsWord");
	1257	}
	1258
	1259	bool
	1260	Perl_is_utf8_alnumc(pTHX_ const U8 *p)
	1261	{
	1262	return S_is_utf8_common(aTHX_ p, &PL_utf8_alnumc, "IsAlnumC");
	1263	}
	1264
	1265	bool
	1266	Perl_is_utf8_idfirst(pTHX_ const U8 p) / The naming is historical. */
	1267	{
	1268	if (*p == '_')
	1269	return TRUE;
	1270	/* is_utf8_idstart would be more logical. */
	1271	return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart");
	1272	}
	1273
	1274	bool
	1275	Perl_is_utf8_idcont(pTHX_ const U8 *p)
	1276	{
	1277	if (*p == '_')
	1278	return TRUE;
	1279	return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue");
	1280	}
	1281
	1282	bool
	1283	Perl_is_utf8_alpha(pTHX_ const U8 *p)
	1284	{
	1285	return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha");
	1286	}
	1287
	1288	bool
	1289	Perl_is_utf8_ascii(pTHX_ const U8 *p)
	1290	{
	1291	return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii");
	1292	}
	1293
	1294	bool
	1295	Perl_is_utf8_space(pTHX_ const U8 *p)
	1296	{
	1297	return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl");
	1298	}
	1299
	1300	bool
	1301	Perl_is_utf8_digit(pTHX_ const U8 *p)
	1302	{
	1303	return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit");
	1304	}
	1305
	1306	bool
	1307	Perl_is_utf8_upper(pTHX_ const U8 *p)
	1308	{
	1309	return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase");
	1310	}
	1311
	1312	bool
	1313	Perl_is_utf8_lower(pTHX_ const U8 *p)
	1314	{
	1315	return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase");
	1316	}
	1317
	1318	bool
	1319	Perl_is_utf8_cntrl(pTHX_ const U8 *p)
	1320	{
	1321	return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl");
	1322	}
	1323
	1324	bool
	1325	Perl_is_utf8_graph(pTHX_ const U8 *p)
	1326	{
	1327	return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph");
	1328	}
	1329
	1330	bool
	1331	Perl_is_utf8_print(pTHX_ const U8 *p)
	1332	{
	1333	return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint");
	1334	}
	1335
	1336	bool
	1337	Perl_is_utf8_punct(pTHX_ const U8 *p)
	1338	{
	1339	return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct");
	1340	}
	1341
	1342	bool
	1343	Perl_is_utf8_xdigit(pTHX_ const U8 *p)
	1344	{
	1345	return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit");
	1346	}
	1347
	1348	bool
	1349	Perl_is_utf8_mark(pTHX_ const U8 *p)
	1350	{
	1351	return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM");
	1352	}
	1353
	1354	/*
	1355	=for apidoc A\|UV\|to_utf8_case\|U8 p\|U8 ustrp\|STRLEN lenp\|SV swash\|char normal\|char *special
	1356
	1357	The "p" contains the pointer to the UTF-8 string encoding
	1358	the character that is being converted.
	1359
	1360	The "ustrp" is a pointer to the character buffer to put the
	1361	conversion result to. The "lenp" is a pointer to the length
	1362	of the result.
	1363
	1364	The "swashp" is a pointer to the swash to use.
	1365
	1366	Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
	1367	and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
	1368	but not always, a multicharacter mapping), is tried first.
	1369
	1370	The "special" is a string like "utf8::ToSpecLower", which means the
	1371	hash %utf8::ToSpecLower. The access to the hash is through
	1372	Perl_to_utf8_case().
	1373
	1374	The "normal" is a string like "ToLower" which means the swash
	1375	%utf8::ToLower.
	1376
	1377	=cut */
	1378
	1379	UV
	1380	Perl_to_utf8_case(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp,
	1381	SV *swashp, const char normal, const char *special)
	1382	{
	1383	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1384	STRLEN len = 0;
	1385
	1386	const UV uv0 = utf8_to_uvchr(p, NULL);
	1387	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
	1388	* are necessary in EBCDIC, they are redundant no-ops
	1389	* in ASCII-ish platforms, and hopefully optimized away. */
	1390	const UV uv1 = NATIVE_TO_UNI(uv0);
	1391	uvuni_to_utf8(tmpbuf, uv1);
	1392
	1393	if (!swashp) / load on-demand */
	1394	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	1395
	1396	/* The 0xDF is the only special casing Unicode code point below 0x100. */
	1397	if (special && (uv1 == 0xDF \|\| uv1 > 0xFF)) {
	1398	/* It might be "special" (sometimes, but not always,
	1399	* a multicharacter mapping) */
	1400	HV *hv;
	1401	SV **svp;
	1402
	1403	if ((hv = get_hv(special, FALSE)) &&
	1404	(svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
	1405	(*svp)) {
	1406	const char *s;
	1407
	1408	s = SvPV_const(*svp, len);
	1409	if (len == 1)
	1410	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
	1411	else {
	1412	#ifdef EBCDIC
	1413	/* If we have EBCDIC we need to remap the characters
	1414	* since any characters in the low 256 are Unicode
	1415	* code points, not EBCDIC. */
	1416	U8 t = (U8)s, tend = t + len, d;
	1417
	1418	d = tmpbuf;
	1419	if (SvUTF8(*svp)) {
	1420	STRLEN tlen = 0;
	1421
	1422	while (t < tend) {
	1423	UV c = utf8_to_uvchr(t, &tlen);
	1424	if (tlen > 0) {
	1425	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	1426	t += tlen;
	1427	}
	1428	else
	1429	break;
	1430	}
	1431	}
	1432	else {
	1433	while (t < tend) {
	1434	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	1435	t++;
	1436	}
	1437	}
	1438	len = d - tmpbuf;
	1439	Copy(tmpbuf, ustrp, len, U8);
	1440	#else
	1441	Copy(s, ustrp, len, U8);
	1442	#endif
	1443	}
	1444	}
	1445	}
	1446
	1447	if (!len && *swashp) {
	1448	UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
	1449
	1450	if (uv2) {
	1451	/* It was "normal" (a single character mapping). */
	1452	UV uv3 = UNI_TO_NATIVE(uv2);
	1453
	1454	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	1455	}
	1456	}
	1457
	1458	if (!len) /* Neither: just copy. */
	1459	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
	1460
	1461	if (lenp)
	1462	*lenp = len;
	1463
	1464	return len ? utf8_to_uvchr(ustrp, 0) : 0;
	1465	}
	1466
	1467	/*
	1468	=for apidoc A\|UV\|to_utf8_upper\|const U8 p\|U8 ustrp\|STRLEN *lenp
	1469
	1470	Convert the UTF-8 encoded character at p to its uppercase version and
	1471	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1472	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1473	the uppercase version may be longer than the original character.
	1474
	1475	The first character of the uppercased version is returned
	1476	(but note, as explained above, that there may be more.)
	1477
	1478	=cut */
	1479
	1480	UV
	1481	Perl_to_utf8_upper(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1482	{
	1483	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1484	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
	1485	}
	1486
	1487	/*
	1488	=for apidoc A\|UV\|to_utf8_title\|const U8 p\|U8 ustrp\|STRLEN *lenp
	1489
	1490	Convert the UTF-8 encoded character at p to its titlecase version and
	1491	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1492	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1493	titlecase version may be longer than the original character.
	1494
	1495	The first character of the titlecased version is returned
	1496	(but note, as explained above, that there may be more.)
	1497
	1498	=cut */
	1499
	1500	UV
	1501	Perl_to_utf8_title(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1502	{
	1503	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1504	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
	1505	}
	1506
	1507	/*
	1508	=for apidoc A\|UV\|to_utf8_lower\|const U8 p\|U8 ustrp\|STRLEN *lenp
	1509
	1510	Convert the UTF-8 encoded character at p to its lowercase version and
	1511	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1512	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1513	lowercase version may be longer than the original character.
	1514
	1515	The first character of the lowercased version is returned
	1516	(but note, as explained above, that there may be more.)
	1517
	1518	=cut */
	1519
	1520	UV
	1521	Perl_to_utf8_lower(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1522	{
	1523	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1524	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
	1525	}
	1526
	1527	/*
	1528	=for apidoc A\|UV\|to_utf8_fold\|const U8 p\|U8 ustrp\|STRLEN *lenp
	1529
	1530	Convert the UTF-8 encoded character at p to its foldcase version and
	1531	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1532	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1533	foldcase version may be longer than the original character (up to
	1534	three characters).
	1535
	1536	The first character of the foldcased version is returned
	1537	(but note, as explained above, that there may be more.)
	1538
	1539	=cut */
	1540
	1541	UV
	1542	Perl_to_utf8_fold(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1543	{
	1544	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1545	&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
	1546	}
	1547
	1548	/* Note:
	1549	* A "swash" is a swatch hash.
	1550	* A "swatch" is a bit vector generated by utf8.c:S_swash_get().
	1551	* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
	1552	* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
	1553	*/
	1554	SV*
	1555	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
	1556	{
	1557	dVAR;
	1558	SV* retval;
	1559	SV* const tokenbufsv = sv_newmortal();
	1560	dSP;
	1561	const size_t pkg_len = strlen(pkg);
	1562	const size_t name_len = strlen(name);
	1563	HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
	1564	SV* errsv_save;
	1565
	1566	PUSHSTACKi(PERLSI_MAGIC);
	1567	ENTER;
	1568	SAVEI32(PL_hints);
	1569	PL_hints = 0;
	1570	save_re_context();
	1571	if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
	1572	ENTER;
	1573	errsv_save = newSVsv(ERRSV);
	1574	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	1575	Nullsv);
	1576	if (!SvTRUE(ERRSV))
	1577	sv_setsv(ERRSV, errsv_save);
	1578	SvREFCNT_dec(errsv_save);
	1579	LEAVE;
	1580	}
	1581	SPAGAIN;
	1582	PUSHMARK(SP);
	1583	EXTEND(SP,5);
	1584	PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
	1585	PUSHs(sv_2mortal(newSVpvn(name, name_len)));
	1586	PUSHs(listsv);
	1587	PUSHs(sv_2mortal(newSViv(minbits)));
	1588	PUSHs(sv_2mortal(newSViv(none)));
	1589	PUTBACK;
	1590	if (IN_PERL_COMPILETIME) {
	1591	/* XXX ought to be handled by lex_start */
	1592	SAVEI32(PL_in_my);
	1593	PL_in_my = 0;
	1594	sv_setpv(tokenbufsv, PL_tokenbuf);
	1595	}
	1596	errsv_save = newSVsv(ERRSV);
	1597	if (call_method("SWASHNEW", G_SCALAR))
	1598	retval = newSVsv(*PL_stack_sp--);
	1599	else
	1600	retval = &PL_sv_undef;
	1601	if (!SvTRUE(ERRSV))
	1602	sv_setsv(ERRSV, errsv_save);
	1603	SvREFCNT_dec(errsv_save);
	1604	LEAVE;
	1605	POPSTACK;
	1606	if (IN_PERL_COMPILETIME) {
	1607	STRLEN len;
	1608	const char* const pv = SvPV_const(tokenbufsv, len);
	1609
	1610	Copy(pv, PL_tokenbuf, len+1, char);
	1611	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
	1612	}
	1613	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	1614	if (SvPOK(retval))
	1615	Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
	1616	retval);
	1617	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	1618	}
	1619	return retval;
	1620	}
	1621
	1622
	1623	/* This API is wrong for special case conversions since we may need to
	1624	* return several Unicode characters for a single Unicode character
	1625	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	1626	* the lower-level routine, and it is similarly broken for returning
	1627	* multiple values. --jhi */
	1628	/* Now SWASHGET is recasted into S_swash_get in this file. */
	1629
	1630	/* Note:
	1631	* Returns the value of property/mapping C<swash> for the first character
	1632	* of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
	1633	* assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
	1634	* assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
	1635	*/
	1636	UV
	1637	Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
	1638	{
	1639	dVAR;
	1640	HV* const hv = (HV*)SvRV(swash);
	1641	U32 klen;
	1642	U32 off;
	1643	STRLEN slen;
	1644	STRLEN needents;
	1645	const U8 *tmps = NULL;
	1646	U32 bit;
	1647	SV *swatch;
	1648	U8 tmputf8[2];
	1649	UV c = NATIVE_TO_ASCII(*ptr);
	1650
	1651	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	1652	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
	1653	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
	1654	ptr = tmputf8;
	1655	}
	1656	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	1657	* then the "swatch" is a vec() for al the chars which start
	1658	* with 0xAA..0xYY
	1659	* So the key in the hash (klen) is length of encoded char -1
	1660	*/
	1661	klen = UTF8SKIP(ptr) - 1;
	1662	off = ptr[klen];
	1663
	1664	if (klen == 0) {
	1665	/* If char in invariant then swatch is for all the invariant chars
	1666	* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
	1667	*/
	1668	needents = UTF_CONTINUATION_MARK;
	1669	off = NATIVE_TO_UTF(ptr[klen]);
	1670	}
	1671	else {
	1672	/* If char is encoded then swatch is for the prefix */
	1673	needents = (1 << UTF_ACCUMULATION_SHIFT);
	1674	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	1675	}
	1676
	1677	/*
	1678	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	1679	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	1680	* it's nothing to sniff at.) Pity we usually come through at least
	1681	* two function calls to get here...
	1682	*
	1683	* NB: this code assumes that swatches are never modified, once generated!
	1684	*/
	1685
	1686	if (hv == PL_last_swash_hv &&
	1687	klen == PL_last_swash_klen &&
	1688	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	1689	{
	1690	tmps = PL_last_swash_tmps;
	1691	slen = PL_last_swash_slen;
	1692	}
	1693	else {
	1694	/* Try our second-level swatch cache, kept in a hash. */
	1695	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	1696
	1697	/* If not cached, generate it via swash_get */
	1698	if (!svp \|\| !SvPOK(*svp)
	1699	\|\| !(tmps = (const U8)SvPV_const(svp, slen))) {
	1700	/* We use utf8n_to_uvuni() as we want an index into
	1701	Unicode tables, not a native character number.
	1702	*/
	1703	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
	1704	ckWARN(WARN_UTF8) ?
	1705	0 : UTF8_ALLOW_ANY);
	1706	swatch = swash_get(swash,
	1707	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	1708	(klen) ? (code_point & ~(needents - 1)) : 0,
	1709	needents);
	1710
	1711	if (IN_PERL_COMPILETIME)
	1712	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
	1713
	1714	svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
	1715
	1716	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
	1717	\|\| (slen << 3) < needents)
	1718	Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
	1719	}
	1720
	1721	PL_last_swash_hv = hv;
	1722	PL_last_swash_klen = klen;
	1723	/* FIXME change interpvar.h? */
	1724	PL_last_swash_tmps = (U8 *) tmps;
	1725	PL_last_swash_slen = slen;
	1726	if (klen)
	1727	Copy(ptr, PL_last_swash_key, klen, U8);
	1728	}
	1729
	1730	switch ((int)((slen << 3) / needents)) {
	1731	case 1:
	1732	bit = 1 << (off & 7);
	1733	off >>= 3;
	1734	return (tmps[off] & bit) != 0;
	1735	case 8:
	1736	return tmps[off];
	1737	case 16:
	1738	off <<= 1;
	1739	return (tmps[off] << 8) + tmps[off + 1] ;
	1740	case 32:
	1741	off <<= 2;
	1742	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	1743	}
	1744	Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
	1745	return 0;
	1746	}
	1747
	1748	/* Note:
	1749	* Returns a swatch (a bit vector string) for a code point sequence
	1750	* that starts from the value C<start> and comprises the number C<span>.
	1751	* A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
	1752	* Should be used via swash_fetch, which will cache the swatch in C<swash>.
	1753	*/
	1754	STATIC SV*
	1755	S_swash_get(pTHX_ SV* swash, UV start, UV span)
	1756	{
	1757	SV *swatch;
	1758	U8 l, lend, x, xend, *s;
	1759	STRLEN lcur, xcur, scur;
	1760
	1761	HV* const hv = (HV*)SvRV(swash);
	1762	SV** const listsvp = hv_fetch(hv, "LIST", 4, FALSE);
	1763	SV** const typesvp = hv_fetch(hv, "TYPE", 4, FALSE);
	1764	SV** const bitssvp = hv_fetch(hv, "BITS", 4, FALSE);
	1765	SV** const nonesvp = hv_fetch(hv, "NONE", 4, FALSE);
	1766	SV** const extssvp = hv_fetch(hv, "EXTRAS", 6, FALSE);
	1767	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	1768	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
	1769	const STRLEN bits = SvUV(*bitssvp);
	1770	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	1771	const UV none = SvUV(*nonesvp);
	1772	const UV end = start + span;
	1773
	1774	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	1775	Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
	1776	(UV)bits);
	1777	}
	1778
	1779	/* create and initialize $swatch */
	1780	swatch = newSVpvn("",0);
	1781	scur = octets ? (span * octets) : (span + 7) / 8;
	1782	SvGROW(swatch, scur + 1);
	1783	s = (U8*)SvPVX(swatch);
	1784	if (octets && none) {
	1785	const U8* const e = s + scur;
	1786	while (s < e) {
	1787	if (bits == 8)
	1788	*s++ = (U8)(none & 0xff);
	1789	else if (bits == 16) {
	1790	*s++ = (U8)((none >> 8) & 0xff);
	1791	*s++ = (U8)( none & 0xff);
	1792	}
	1793	else if (bits == 32) {
	1794	*s++ = (U8)((none >> 24) & 0xff);
	1795	*s++ = (U8)((none >> 16) & 0xff);
	1796	*s++ = (U8)((none >> 8) & 0xff);
	1797	*s++ = (U8)( none & 0xff);
	1798	}
	1799	}
	1800	*s = '\0';
	1801	}
	1802	else {
	1803	(void)memzero((U8*)s, scur + 1);
	1804	}
	1805	SvCUR_set(swatch, scur);
	1806	s = (U8*)SvPVX(swatch);
	1807
	1808	/* read $swash->{LIST} */
	1809	l = (U8)SvPV(listsvp, lcur);
	1810	lend = l + lcur;
	1811	while (l < lend) {
	1812	UV min, max, val, key;
	1813	STRLEN numlen;
	1814	I32 flags = PERL_SCAN_SILENT_ILLDIGIT \| PERL_SCAN_DISALLOW_PREFIX;
	1815
	1816	U8* const nl = (U8*)memchr(l, '\n', lend - l);
	1817
	1818	numlen = lend - l;
	1819	min = grok_hex((char *)l, &numlen, &flags, NULL);
	1820	if (numlen)
	1821	l += numlen;
	1822	else if (nl) {
	1823	l = nl + 1; /* 1 is length of "\n" */
	1824	continue;
	1825	}
	1826	else {
	1827	l = lend; /* to LIST's end at which \n is not found */
	1828	break;
	1829	}
	1830
	1831	if (isBLANK(*l)) {
	1832	++l;
	1833	flags = PERL_SCAN_SILENT_ILLDIGIT \| PERL_SCAN_DISALLOW_PREFIX;
	1834	numlen = lend - l;
	1835	max = grok_hex((char *)l, &numlen, &flags, NULL);
	1836	if (numlen)
	1837	l += numlen;
	1838	else
	1839	max = min;
	1840
	1841	if (octets) {
	1842	if (isBLANK(*l)) {
	1843	++l;
	1844	flags = PERL_SCAN_SILENT_ILLDIGIT \|
	1845	PERL_SCAN_DISALLOW_PREFIX;
	1846	numlen = lend - l;
	1847	val = grok_hex((char *)l, &numlen, &flags, NULL);
	1848	if (numlen)
	1849	l += numlen;
	1850	else
	1851	val = 0;
	1852	}
	1853	else {
	1854	val = 0;
	1855	if (typeto) {
	1856	Perl_croak(aTHX_ "%s: illegal mapping '%s'",
	1857	typestr, l);
	1858	}
	1859	}
	1860	}
	1861	else
	1862	val = 0; /* bits == 1, then val should be ignored */
	1863	}
	1864	else {
	1865	max = min;
	1866	if (octets) {
	1867	val = 0;
	1868	if (typeto) {
	1869	Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
	1870	}
	1871	}
	1872	else
	1873	val = 0; /* bits == 1, then val should be ignored */
	1874	}
	1875
	1876	if (nl)
	1877	l = nl + 1;
	1878	else
	1879	l = lend;
	1880
	1881	if (max < start)
	1882	continue;
	1883
	1884	if (octets) {
	1885	if (min < start) {
	1886	if (!none \|\| val < none) {
	1887	val += start - min;
	1888	}
	1889	min = start;
	1890	}
	1891	for (key = min; key <= max; key++) {
	1892	STRLEN offset;
	1893	if (key >= end)
	1894	goto go_out_list;
	1895	/* offset must be non-negative (start <= min <= key < end) */
	1896	offset = octets * (key - start);
	1897	if (bits == 8)
	1898	s[offset] = (U8)(val & 0xff);
	1899	else if (bits == 16) {
	1900	s[offset ] = (U8)((val >> 8) & 0xff);
	1901	s[offset + 1] = (U8)( val & 0xff);
	1902	}
	1903	else if (bits == 32) {
	1904	s[offset ] = (U8)((val >> 24) & 0xff);
	1905	s[offset + 1] = (U8)((val >> 16) & 0xff);
	1906	s[offset + 2] = (U8)((val >> 8) & 0xff);
	1907	s[offset + 3] = (U8)( val & 0xff);
	1908	}
	1909
	1910	if (!none \|\| val < none)
	1911	++val;
	1912	}
	1913	}
	1914	else { /* bits == 1, then val should be ignored */
	1915	if (min < start)
	1916	min = start;
	1917	for (key = min; key <= max; key++) {
	1918	const STRLEN offset = (STRLEN)(key - start);
	1919	if (key >= end)
	1920	goto go_out_list;
	1921	s[offset >> 3] \|= 1 << (offset & 7);
	1922	}
	1923	}
	1924	} /* while */
	1925	go_out_list:
	1926
	1927	/* read $swash->{EXTRAS} */
	1928	x = (U8)SvPV(extssvp, xcur);
	1929	xend = x + xcur;
	1930	while (x < xend) {
	1931	STRLEN namelen;
	1932	U8 *namestr;
	1933	SV** othersvp;
	1934	HV* otherhv;
	1935	STRLEN otherbits;
	1936	SV *otherbitssvp, other;
	1937	U8 s, o, *nl;
	1938	STRLEN slen, olen;
	1939
	1940	U8 opc = *x++;
	1941	if (opc == '\n')
	1942	continue;
	1943
	1944	nl = (U8*)memchr(x, '\n', xend - x);
	1945
	1946	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	1947	if (nl) {
	1948	x = nl + 1; /* 1 is length of "\n" */
	1949	continue;
	1950	}
	1951	else {
	1952	x = xend; /* to EXTRAS' end at which \n is not found */
	1953	break;
	1954	}
	1955	}
	1956
	1957	namestr = x;
	1958	if (nl) {
	1959	namelen = nl - namestr;
	1960	x = nl + 1;
	1961	}
	1962	else {
	1963	namelen = xend - namestr;
	1964	x = xend;
	1965	}
	1966
	1967	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	1968	otherhv = (HV)SvRV(othersvp);
	1969	otherbitssvp = hv_fetch(otherhv, "BITS", 4, FALSE);
	1970	otherbits = (STRLEN)SvUV(*otherbitssvp);
	1971	if (bits < otherbits)
	1972	Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
	1973
	1974	/* The "other" swatch must be destroyed after. */
	1975	other = swash_get(*othersvp, start, span);
	1976	o = (U8*)SvPV(other, olen);
	1977
	1978	if (!olen)
	1979	Perl_croak(aTHX_ "panic: swash_get got improper swatch");
	1980
	1981	s = (U8*)SvPV(swatch, slen);
	1982	if (bits == 1 && otherbits == 1) {
	1983	if (slen != olen)
	1984	Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
	1985
	1986	switch (opc) {
	1987	case '+':
	1988	while (slen--)
	1989	s++ \|= o++;
	1990	break;
	1991	case '!':
	1992	while (slen--)
	1993	s++ \|= ~o++;
	1994	break;
	1995	case '-':
	1996	while (slen--)
	1997	s++ &= ~o++;
	1998	break;
	1999	case '&':
	2000	while (slen--)
	2001	s++ &= o++;
	2002	break;
	2003	default:
	2004	break;
	2005	}
	2006	}
	2007	else {
	2008	STRLEN otheroctets = otherbits >> 3;
	2009	STRLEN offset = 0;
	2010	U8* send = s + slen;
	2011
	2012	while (s < send) {
	2013	UV otherval = 0;
	2014
	2015	if (otherbits == 1) {
	2016	otherval = (o[offset >> 3] >> (offset & 7)) & 1;
	2017	++offset;
	2018	}
	2019	else {
	2020	STRLEN vlen = otheroctets;
	2021	otherval = *o++;
	2022	while (--vlen) {
	2023	otherval <<= 8;
	2024	otherval \|= *o++;
	2025	}
	2026	}
	2027
	2028	if (opc == '+' && otherval)
	2029	; /* replace with otherval */
	2030	else if (opc == '!' && !otherval)
	2031	otherval = 1;
	2032	else if (opc == '-' && otherval)
	2033	otherval = 0;
	2034	else if (opc == '&' && !otherval)
	2035	otherval = 0;
	2036	else {
	2037	s += octets; /* no replacement */
	2038	continue;
	2039	}
	2040
	2041	if (bits == 8)
	2042	*s++ = (U8)( otherval & 0xff);
	2043	else if (bits == 16) {
	2044	*s++ = (U8)((otherval >> 8) & 0xff);
	2045	*s++ = (U8)( otherval & 0xff);
	2046	}
	2047	else if (bits == 32) {
	2048	*s++ = (U8)((otherval >> 24) & 0xff);
	2049	*s++ = (U8)((otherval >> 16) & 0xff);
	2050	*s++ = (U8)((otherval >> 8) & 0xff);
	2051	*s++ = (U8)( otherval & 0xff);
	2052	}
	2053	}
	2054	}
	2055	sv_free(other); /* through with it! */
	2056	} /* while */
	2057	return swatch;
	2058	}
	2059
	2060	/*
	2061	=for apidoc A\|U8 \|uvchr_to_utf8\|U8 d\|UV uv
	2062
	2063	Adds the UTF-8 representation of the Native codepoint C<uv> to the end
	2064	of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
	2065	bytes available. The return value is the pointer to the byte after the
	2066	end of the new character. In other words,
	2067
	2068	d = uvchr_to_utf8(d, uv);
	2069
	2070	is the recommended wide native character-aware way of saying
	2071
	2072	*(d++) = uv;
	2073
	2074	=cut
	2075	*/
	2076
	2077	/* On ASCII machines this is normally a macro but we want a
	2078	real function in case XS code wants it
	2079	*/
	2080	U8 *
	2081	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	2082	{
	2083	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
	2084	}
	2085
	2086	U8 *
	2087	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	2088	{
	2089	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
	2090	}
	2091
	2092	/*
	2093	=for apidoc A\|UV\|utf8n_to_uvchr\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32
	2094	flags
	2095
	2096	Returns the native character value of the first character in the string
	2097	C<s>
	2098	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	2099	length, in bytes, of that character.
	2100
	2101	Allows length and flags to be passed to low level routine.
	2102
	2103	=cut
	2104	*/
	2105	/* On ASCII machines this is normally a macro but we want
	2106	a real function in case XS code wants it
	2107	*/
	2108	UV
	2109	Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen,
	2110	U32 flags)
	2111	{
	2112	const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	2113	return UNI_TO_NATIVE(uv);
	2114	}
	2115
	2116	/*
	2117	=for apidoc A\|char \|pv_uni_display\|SV dsv\|U8 *spv\|STRLEN len\|STRLEN pvlim\|UV flags
	2118
	2119	Build to the scalar dsv a displayable version of the string spv,
	2120	length len, the displayable version being at most pvlim bytes long
	2121	(if longer, the rest is truncated and "..." will be appended).
	2122
	2123	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	2124	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	2125	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	2126	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	2127	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	2128	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	2129
	2130	The pointer to the PV of the dsv is returned.
	2131
	2132	=cut */
	2133	char *
	2134	Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	2135	{
	2136	int truncated = 0;
	2137	const char s, e;
	2138
	2139	sv_setpvn(dsv, "", 0);
	2140	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	2141	UV u;
	2142	/* This serves double duty as a flag and a character to print after
	2143	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	2144	*/
	2145	char ok = 0;
	2146
	2147	if (pvlim && SvCUR(dsv) >= pvlim) {
	2148	truncated++;
	2149	break;
	2150	}
	2151	u = utf8_to_uvchr((U8*)s, 0);
	2152	if (u < 256) {
	2153	const unsigned char c = (unsigned char)u & 0xFF;
	2154	if (flags & UNI_DISPLAY_BACKSLASH) {
	2155	switch (c) {
	2156	case '\n':
	2157	ok = 'n'; break;
	2158	case '\r':
	2159	ok = 'r'; break;
	2160	case '\t':
	2161	ok = 't'; break;
	2162	case '\f':
	2163	ok = 'f'; break;
	2164	case '\a':
	2165	ok = 'a'; break;
	2166	case '\\':
	2167	ok = '\\'; break;
	2168	default: break;
	2169	}
	2170	if (ok) {
	2171	Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
	2172	}
	2173	}
	2174	/* isPRINT() is the locale-blind version. */
	2175	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	2176	Perl_sv_catpvf(aTHX_ dsv, "%c", c);
	2177	ok = 1;
	2178	}
	2179	}
	2180	if (!ok)
	2181	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	2182	}
	2183	if (truncated)
	2184	sv_catpvn(dsv, "...", 3);
	2185
	2186	return SvPVX(dsv);
	2187	}
	2188
	2189	/*
	2190	=for apidoc A\|char \|sv_uni_display\|SV dsv\|SV *ssv\|STRLEN pvlim\|UV flags
	2191
	2192	Build to the scalar dsv a displayable version of the scalar sv,
	2193	the displayable version being at most pvlim bytes long
	2194	(if longer, the rest is truncated and "..." will be appended).
	2195
	2196	The flags argument is as in pv_uni_display().
	2197
	2198	The pointer to the PV of the dsv is returned.
	2199
	2200	=cut */
	2201	char *
	2202	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	2203	{
	2204	return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
	2205	SvCUR(ssv), pvlim, flags);
	2206	}
	2207
	2208	/*
	2209	=for apidoc A\|I32\|ibcmp_utf8\|const char s1\|char pe1\|register UV l1\|bool u1\|const char s2\|char **pe2\|register UV l2\|bool u2
	2210
	2211	Return true if the strings s1 and s2 differ case-insensitively, false
	2212	if not (if they are equal case-insensitively). If u1 is true, the
	2213	string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
	2214	the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
	2215	are false, the respective string is assumed to be in native 8-bit
	2216	encoding.
	2217
	2218	If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
	2219	in there (they will point at the beginning of the I<next> character).
	2220	If the pointers behind pe1 or pe2 are non-NULL, they are the end
	2221	pointers beyond which scanning will not continue under any
	2222	circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
	2223	s2+l2 will be used as goal end pointers that will also stop the scan,
	2224	and which qualify towards defining a successful match: all the scans
	2225	that define an explicit length must reach their goal pointers for
	2226	a match to succeed).
	2227
	2228	For case-insensitiveness, the "casefolding" of Unicode is used
	2229	instead of upper/lowercasing both the characters, see
	2230	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	2231
	2232	=cut */
	2233	I32
	2234	Perl_ibcmp_utf8(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2)
	2235	{
	2236	register const U8 p1 = (const U8)s1;
	2237	register const U8 p2 = (const U8)s2;
	2238	register const U8 f1 = 0, f2 = 0;
	2239	register U8 e1 = 0, q1 = 0;
	2240	register U8 e2 = 0, q2 = 0;
	2241	STRLEN n1 = 0, n2 = 0;
	2242	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	2243	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	2244	U8 natbuf[1+1];
	2245	STRLEN foldlen1, foldlen2;
	2246	bool match;
	2247
	2248	if (pe1)
	2249	e1 = (U8*)pe1;
	2250	if (e1 == 0 \|\| (l1 && l1 < (UV)(e1 - (const U8*)s1)))
	2251	f1 = (const U8*)s1 + l1;
	2252	if (pe2)
	2253	e2 = (U8*)pe2;
	2254	if (e2 == 0 \|\| (l2 && l2 < (UV)(e2 - (const U8*)s2)))
	2255	f2 = (const U8*)s2 + l2;
	2256
	2257	if ((e1 == 0 && f1 == 0) \|\| (e2 == 0 && f2 == 0) \|\| (f1 == 0 && f2 == 0))
	2258	return 1; /* mismatch; possible infinite loop or false positive */
	2259
	2260	if (!u1 \|\| !u2)
	2261	natbuf[1] = 0; /* Need to terminate the buffer. */
	2262
	2263	while ((e1 == 0 \|\| p1 < e1) &&
	2264	(f1 == 0 \|\| p1 < f1) &&
	2265	(e2 == 0 \|\| p2 < e2) &&
	2266	(f2 == 0 \|\| p2 < f2)) {
	2267	if (n1 == 0) {
	2268	if (u1)
	2269	to_utf8_fold(p1, foldbuf1, &foldlen1);
	2270	else {
	2271	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
	2272	to_utf8_fold(natbuf, foldbuf1, &foldlen1);
	2273	}
	2274	q1 = foldbuf1;
	2275	n1 = foldlen1;
	2276	}
	2277	if (n2 == 0) {
	2278	if (u2)
	2279	to_utf8_fold(p2, foldbuf2, &foldlen2);
	2280	else {
	2281	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
	2282	to_utf8_fold(natbuf, foldbuf2, &foldlen2);
	2283	}
	2284	q2 = foldbuf2;
	2285	n2 = foldlen2;
	2286	}
	2287	while (n1 && n2) {
	2288	if ( UTF8SKIP(q1) != UTF8SKIP(q2) \|\|
	2289	(UTF8SKIP(q1) == 1 && q1 != q2) \|\|
	2290	memNE((char)q1, (char)q2, UTF8SKIP(q1)) )
	2291	return 1; /* mismatch */
	2292	n1 -= UTF8SKIP(q1);
	2293	q1 += UTF8SKIP(q1);
	2294	n2 -= UTF8SKIP(q2);
	2295	q2 += UTF8SKIP(q2);
	2296	}
	2297	if (n1 == 0)
	2298	p1 += u1 ? UTF8SKIP(p1) : 1;
	2299	if (n2 == 0)
	2300	p2 += u2 ? UTF8SKIP(p2) : 1;
	2301
	2302	}
	2303
	2304	/* A match is defined by all the scans that specified
	2305	* an explicit length reaching their final goals. */
	2306	match = (f1 == 0 \|\| p1 == f1) && (f2 == 0 \|\| p2 == f2);
	2307
	2308	if (match) {
	2309	if (pe1)
	2310	pe1 = (char)p1;
	2311	if (pe2)
	2312	pe2 = (char)p2;
	2313	}
	2314
	2315	return match ? 0 : 1; /* 0 match, 1 mismatch */
	2316	}
	2317
	2318	/*
	2319	* Local variables:
	2320	* c-indentation-style: bsd
	2321	* c-basic-offset: 4
	2322	* indent-tabs-mode: t
	2323	* End:
	2324	*
	2325	* ex: set ts=8 sts=4 sw=4 noet:
	2326	*/