perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
	17	*
	18	* 'Well do I understand your speech,' he answered in the same language;
	19	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	20	* as is the custom in the West, if you wish to be answered?'
	21	* --Gandalf, addressing Théoden's door wardens
	22	*
	23	* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	24	*
	25	* ...the travellers perceived that the floor was paved with stones of many
	26	* hues; branching runes and strange devices intertwined beneath their feet.
	27	*
	28	* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	29	*/
	30
	31	#include "EXTERN.h"
	32	#define PERL_IN_UTF8_C
	33	#include "perl.h"
	34
	35	#ifndef EBCDIC
	36	/* Separate prototypes needed because in ASCII systems these
	37	* usually macros but they still are compiled as code, too. */
	38	PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags);
	39	PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
	40	#endif
	41
	42	static const char unees[] =
	43	"Malformed UTF-8 character (unexpected end of string)";
	44
	45	/*
	46	=head1 Unicode Support
	47
	48	This file contains various utility functions for manipulating UTF8-encoded
	49	strings. For the uninitiated, this is a method of representing arbitrary
	50	Unicode characters as a variable number of bytes, in such a way that
	51	characters in the ASCII range are unmodified, and a zero byte never appears
	52	within non-zero characters.
	53
	54	=cut
	55	*/
	56
	57	/*
	58	=for apidoc is_ascii_string
	59
	60	Returns true if first C<len> bytes of the given string are ASCII (i.e. none
	61	of them even raise the question of UTF-8-ness).
	62
	63	See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
	64
	65	=cut
	66	*/
	67
	68	bool
	69	Perl_is_ascii_string(const U8 *s, STRLEN len)
	70	{
	71	const U8* const send = s + (len ? len : strlen((const char *)s));
	72	const U8* x = s;
	73
	74	PERL_ARGS_ASSERT_IS_ASCII_STRING;
	75
	76	for (; x < send; ++x) {
	77	if (!UTF8_IS_INVARIANT(*x))
	78	break;
	79	}
	80
	81	return x == send;
	82	}
	83
	84	/*
	85	=for apidoc uvuni_to_utf8_flags
	86
	87	Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
	88	of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
	89	bytes available. The return value is the pointer to the byte after the
	90	end of the new character. In other words,
	91
	92	d = uvuni_to_utf8_flags(d, uv, flags);
	93
	94	or, in most cases,
	95
	96	d = uvuni_to_utf8(d, uv);
	97
	98	(which is equivalent to)
	99
	100	d = uvuni_to_utf8_flags(d, uv, 0);
	101
	102	is the recommended Unicode-aware way of saying
	103
	104	*(d++) = uv;
	105
	106	=cut
	107	*/
	108
	109	U8 *
	110	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	111	{
	112	PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
	113
	114	if (ckWARN(WARN_UTF8)) {
	115	if (UNICODE_IS_SURROGATE(uv) &&
	116	!(flags & UNICODE_ALLOW_SURROGATE))
	117	Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
	118	else if (
	119	((uv >= 0xFDD0 && uv <= 0xFDEF &&
	120	!(flags & UNICODE_ALLOW_FDD0))
	121	\|\|
	122	((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
	123	!(flags & UNICODE_ALLOW_FFFF))) &&
	124	/* UNICODE_ALLOW_SUPER includes
	125	* FFFEs and FFFFs beyond 0x10FFFF. */
	126	((uv <= PERL_UNICODE_MAX) \|\|
	127	!(flags & UNICODE_ALLOW_SUPER))
	128	)
	129	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	130	"Unicode non-character 0x%04"UVxf" is illegal for interchange", uv);
	131	}
	132	if (UNI_IS_INVARIANT(uv)) {
	133	*d++ = (U8)UTF_TO_NATIVE(uv);
	134	return d;
	135	}
	136	#if defined(EBCDIC)
	137	else {
	138	STRLEN len = UNISKIP(uv);
	139	U8 *p = d+len-1;
	140	while (p > d) {
	141	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	142	uv >>= UTF_ACCUMULATION_SHIFT;
	143	}
	144	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	145	return d+len;
	146	}
	147	#else /* Non loop style */
	148	if (uv < 0x800) {
	149	*d++ = (U8)(( uv >> 6) \| 0xc0);
	150	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	151	return d;
	152	}
	153	if (uv < 0x10000) {
	154	*d++ = (U8)(( uv >> 12) \| 0xe0);
	155	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	156	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	157	return d;
	158	}
	159	if (uv < 0x200000) {
	160	*d++ = (U8)(( uv >> 18) \| 0xf0);
	161	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	162	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	163	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	164	return d;
	165	}
	166	if (uv < 0x4000000) {
	167	*d++ = (U8)(( uv >> 24) \| 0xf8);
	168	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	169	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	170	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	171	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	172	return d;
	173	}
	174	if (uv < 0x80000000) {
	175	*d++ = (U8)(( uv >> 30) \| 0xfc);
	176	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	177	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	178	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	179	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	180	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	181	return d;
	182	}
	183	#ifdef HAS_QUAD
	184	if (uv < UTF8_QUAD_MAX)
	185	#endif
	186	{
	187	d++ = 0xfe; / Can't match U+FEFF! */
	188	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	189	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	190	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	191	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	192	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	193	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	194	return d;
	195	}
	196	#ifdef HAS_QUAD
	197	{
	198	d++ = 0xff; / Can't match U+FFFE! */
	199	d++ = 0x80; / 6 Reserved bits */
	200	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	201	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
	202	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
	203	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
	204	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
	205	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	206	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	207	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	208	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	209	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	210	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	211	return d;
	212	}
	213	#endif
	214	#endif /* Loop style */
	215	}
	216
	217	/*
	218
	219	Tests if some arbitrary number of bytes begins in a valid UTF-8
	220	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	221	UTF-8 character. The actual number of bytes in the UTF-8 character
	222	will be returned if it is valid, otherwise 0.
	223
	224	This is the "slow" version as opposed to the "fast" version which is
	225	the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
	226	difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
	227	or less you should use the IS_UTF8_CHAR(), for lengths of five or more
	228	you should use the _slow(). In practice this means that the _slow()
	229	will be used very rarely, since the maximum Unicode code point (as of
	230	Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
	231	the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
	232	five bytes or more.
	233
	234	=cut */
	235	STATIC STRLEN
	236	S_is_utf8_char_slow(const U8 *s, const STRLEN len)
	237	{
	238	U8 u = *s;
	239	STRLEN slen;
	240	UV uv, ouv;
	241
	242	PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
	243
	244	if (UTF8_IS_INVARIANT(u))
	245	return 1;
	246
	247	if (!UTF8_IS_START(u))
	248	return 0;
	249
	250	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	251	return 0;
	252
	253	slen = len - 1;
	254	s++;
	255	#ifdef EBCDIC
	256	u = NATIVE_TO_UTF(u);
	257	#endif
	258	u &= UTF_START_MASK(len);
	259	uv = u;
	260	ouv = uv;
	261	while (slen--) {
	262	if (!UTF8_IS_CONTINUATION(*s))
	263	return 0;
	264	uv = UTF8_ACCUMULATE(uv, *s);
	265	if (uv < ouv)
	266	return 0;
	267	ouv = uv;
	268	s++;
	269	}
	270
	271	if ((STRLEN)UNISKIP(uv) < len)
	272	return 0;
	273
	274	return len;
	275	}
	276
	277	/*
	278	=for apidoc is_utf8_char
	279
	280	Tests if some arbitrary number of bytes begins in a valid UTF-8
	281	character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
	282	character is a valid UTF-8 character. The actual number of bytes in the UTF-8
	283	character will be returned if it is valid, otherwise 0.
	284
	285	=cut */
	286	STRLEN
	287	Perl_is_utf8_char(const U8 *s)
	288	{
	289	const STRLEN len = UTF8SKIP(s);
	290
	291	PERL_ARGS_ASSERT_IS_UTF8_CHAR;
	292	#ifdef IS_UTF8_CHAR
	293	if (IS_UTF8_CHAR_FAST(len))
	294	return IS_UTF8_CHAR(s, len) ? len : 0;
	295	#endif /* #ifdef IS_UTF8_CHAR */
	296	return is_utf8_char_slow(s, len);
	297	}
	298
	299
	300	/*
	301	=for apidoc is_utf8_string
	302
	303	Returns true if first C<len> bytes of the given string form a valid
	304	UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
	305	not mean 'a string that contains code points above 0x7F encoded in UTF-8'
	306	because a valid ASCII string is a valid UTF-8 string.
	307
	308	See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
	309
	310	=cut
	311	*/
	312
	313	bool
	314	Perl_is_utf8_string(const U8 *s, STRLEN len)
	315	{
	316	const U8* const send = s + (len ? len : strlen((const char *)s));
	317	const U8* x = s;
	318
	319	PERL_ARGS_ASSERT_IS_UTF8_STRING;
	320
	321	while (x < send) {
	322	STRLEN c;
	323	/* Inline the easy bits of is_utf8_char() here for speed... */
	324	if (UTF8_IS_INVARIANT(*x))
	325	c = 1;
	326	else if (!UTF8_IS_START(*x))
	327	goto out;
	328	else {
	329	/* ... and call is_utf8_char() only if really needed. */
	330	#ifdef IS_UTF8_CHAR
	331	c = UTF8SKIP(x);
	332	if (IS_UTF8_CHAR_FAST(c)) {
	333	if (!IS_UTF8_CHAR(x, c))
	334	c = 0;
	335	}
	336	else
	337	c = is_utf8_char_slow(x, c);
	338	#else
	339	c = is_utf8_char(x);
	340	#endif /* #ifdef IS_UTF8_CHAR */
	341	if (!c)
	342	goto out;
	343	}
	344	x += c;
	345	}
	346
	347	out:
	348	if (x != send)
	349	return FALSE;
	350
	351	return TRUE;
	352	}
	353
	354	/*
	355	Implemented as a macro in utf8.h
	356
	357	=for apidoc is_utf8_string_loc
	358
	359	Like is_utf8_string() but stores the location of the failure (in the
	360	case of "utf8ness failure") or the location s+len (in the case of
	361	"utf8ness success") in the C<ep>.
	362
	363	See also is_utf8_string_loclen() and is_utf8_string().
	364
	365	=for apidoc is_utf8_string_loclen
	366
	367	Like is_utf8_string() but stores the location of the failure (in the
	368	case of "utf8ness failure") or the location s+len (in the case of
	369	"utf8ness success") in the C<ep>, and the number of UTF-8
	370	encoded characters in the C<el>.
	371
	372	See also is_utf8_string_loc() and is_utf8_string().
	373
	374	=cut
	375	*/
	376
	377	bool
	378	Perl_is_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	379	{
	380	const U8* const send = s + (len ? len : strlen((const char *)s));
	381	const U8* x = s;
	382	STRLEN c;
	383	STRLEN outlen = 0;
	384
	385	PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
	386
	387	while (x < send) {
	388	/* Inline the easy bits of is_utf8_char() here for speed... */
	389	if (UTF8_IS_INVARIANT(*x))
	390	c = 1;
	391	else if (!UTF8_IS_START(*x))
	392	goto out;
	393	else {
	394	/* ... and call is_utf8_char() only if really needed. */
	395	#ifdef IS_UTF8_CHAR
	396	c = UTF8SKIP(x);
	397	if (IS_UTF8_CHAR_FAST(c)) {
	398	if (!IS_UTF8_CHAR(x, c))
	399	c = 0;
	400	} else
	401	c = is_utf8_char_slow(x, c);
	402	#else
	403	c = is_utf8_char(x);
	404	#endif /* #ifdef IS_UTF8_CHAR */
	405	if (!c)
	406	goto out;
	407	}
	408	x += c;
	409	outlen++;
	410	}
	411
	412	out:
	413	if (el)
	414	*el = outlen;
	415
	416	if (ep)
	417	*ep = x;
	418	return (x == send);
	419	}
	420
	421	/*
	422
	423	=for apidoc utf8n_to_uvuni
	424
	425	Bottom level UTF-8 decode routine.
	426	Returns the Unicode code point value of the first character in the string C<s>
	427	which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
	428	C<retlen> will be set to the length, in bytes, of that character.
	429
	430	If C<s> does not point to a well-formed UTF-8 character, the behaviour
	431	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
	432	it is assumed that the caller will raise a warning, and this function
	433	will silently just set C<retlen> to C<-1> and return zero. If the
	434	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
	435	malformations will be given, C<retlen> will be set to the expected
	436	length of the UTF-8 character in bytes, and zero will be returned.
	437
	438	The C<flags> can also contain various flags to allow deviations from
	439	the strict UTF-8 encoding (see F<utf8.h>).
	440
	441	Most code should use utf8_to_uvchr() rather than call this directly.
	442
	443	=cut
	444	*/
	445
	446	UV
	447	Perl_utf8n_to_uvuni(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	448	{
	449	dVAR;
	450	const U8 * const s0 = s;
	451	UV uv = *s, ouv = 0;
	452	STRLEN len = 1;
	453	const bool dowarn = ckWARN_d(WARN_UTF8);
	454	const UV startbyte = *s;
	455	STRLEN expectlen = 0;
	456	U32 warning = 0;
	457	SV* sv;
	458
	459	PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
	460
	461	/* This list is a superset of the UTF8_ALLOW_XXX. BUT it isn't, eg SUPER missing XXX */
	462
	463	#define UTF8_WARN_EMPTY 1
	464	#define UTF8_WARN_CONTINUATION 2
	465	#define UTF8_WARN_NON_CONTINUATION 3
	466	#define UTF8_WARN_FE_FF 4
	467	#define UTF8_WARN_SHORT 5
	468	#define UTF8_WARN_OVERFLOW 6
	469	#define UTF8_WARN_SURROGATE 7
	470	#define UTF8_WARN_LONG 8
	471	#define UTF8_WARN_FFFF 9 /* Also FFFE. */
	472
	473	if (curlen == 0 &&
	474	!(flags & UTF8_ALLOW_EMPTY)) {
	475	warning = UTF8_WARN_EMPTY;
	476	goto malformed;
	477	}
	478
	479	if (UTF8_IS_INVARIANT(uv)) {
	480	if (retlen)
	481	*retlen = 1;
	482	return (UV) (NATIVE_TO_UTF(*s));
	483	}
	484
	485	if (UTF8_IS_CONTINUATION(uv) &&
	486	!(flags & UTF8_ALLOW_CONTINUATION)) {
	487	warning = UTF8_WARN_CONTINUATION;
	488	goto malformed;
	489	}
	490
	491	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	492	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	493	warning = UTF8_WARN_NON_CONTINUATION;
	494	goto malformed;
	495	}
	496
	497	#ifdef EBCDIC
	498	uv = NATIVE_TO_UTF(uv);
	499	#else
	500	if ((uv == 0xfe \|\| uv == 0xff) &&
	501	!(flags & UTF8_ALLOW_FE_FF)) {
	502	warning = UTF8_WARN_FE_FF;
	503	goto malformed;
	504	}
	505	#endif
	506
	507	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	508	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	509	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	510	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	511	#ifdef EBCDIC
	512	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	513	else { len = 7; uv &= 0x01; }
	514	#else
	515	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	516	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	517	else { len = 13; uv = 0; } /* whoa! */
	518	#endif
	519
	520	if (retlen)
	521	*retlen = len;
	522
	523	expectlen = len;
	524
	525	if ((curlen < expectlen) &&
	526	!(flags & UTF8_ALLOW_SHORT)) {
	527	warning = UTF8_WARN_SHORT;
	528	goto malformed;
	529	}
	530
	531	len--;
	532	s++;
	533	ouv = uv;
	534
	535	while (len--) {
	536	if (!UTF8_IS_CONTINUATION(*s) &&
	537	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	538	s--;
	539	warning = UTF8_WARN_NON_CONTINUATION;
	540	goto malformed;
	541	}
	542	else
	543	uv = UTF8_ACCUMULATE(uv, *s);
	544	if (!(uv > ouv)) {
	545	/* These cannot be allowed. */
	546	if (uv == ouv) {
	547	if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
	548	warning = UTF8_WARN_LONG;
	549	goto malformed;
	550	}
	551	}
	552	else { /* uv < ouv */
	553	/* This cannot be allowed. */
	554	warning = UTF8_WARN_OVERFLOW;
	555	goto malformed;
	556	}
	557	}
	558	s++;
	559	ouv = uv;
	560	}
	561
	562	if (UNICODE_IS_SURROGATE(uv) &&
	563	!(flags & UTF8_ALLOW_SURROGATE)) {
	564	warning = UTF8_WARN_SURROGATE;
	565	goto malformed;
	566	} else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
	567	!(flags & UTF8_ALLOW_LONG)) {
	568	warning = UTF8_WARN_LONG;
	569	goto malformed;
	570	} else if (UNICODE_IS_ILLEGAL(uv) &&
	571	!(flags & UTF8_ALLOW_FFFF)) {
	572	warning = UTF8_WARN_FFFF;
	573	goto malformed;
	574	}
	575
	576	return uv;
	577
	578	malformed:
	579
	580	if (flags & UTF8_CHECK_ONLY) {
	581	if (retlen)
	582	*retlen = ((STRLEN) -1);
	583	return 0;
	584	}
	585
	586	if (dowarn) {
	587	if (warning == UTF8_WARN_FFFF) {
	588	sv = newSVpvs_flags("Unicode non-character ", SVs_TEMP);
	589	Perl_sv_catpvf(aTHX_ sv, "0x%04"UVxf" is illegal for interchange", uv);
	590	}
	591	else {
	592	sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
	593
	594	switch (warning) {
	595	case 0: /* Intentionally empty. */ break;
	596	case UTF8_WARN_EMPTY:
	597	sv_catpvs(sv, "(empty string)");
	598	break;
	599	case UTF8_WARN_CONTINUATION:
	600	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	601	break;
	602	case UTF8_WARN_NON_CONTINUATION:
	603	if (s == s0)
	604	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	605	(UV)s[1], startbyte);
	606	else {
	607	const int len = (int)(s-s0);
	608	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	609	(UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
	610	}
	611
	612	break;
	613	case UTF8_WARN_FE_FF:
	614	Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
	615	break;
	616	case UTF8_WARN_SHORT:
	617	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	618	(int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
	619	expectlen = curlen; /* distance for caller to skip */
	620	break;
	621	case UTF8_WARN_OVERFLOW:
	622	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	623	ouv, *s, startbyte);
	624	break;
	625	case UTF8_WARN_SURROGATE:
	626	Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
	627	break;
	628	case UTF8_WARN_LONG:
	629	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	630	(int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	631	break;
	632	default:
	633	sv_catpvs(sv, "(unknown reason)");
	634	break;
	635	}
	636	}
	637
	638	if (warning) {
	639	const char * const s = SvPVX_const(sv);
	640
	641	if (PL_op)
	642	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	643	"%s in %s", s, OP_DESC(PL_op));
	644	else
	645	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
	646	}
	647	}
	648
	649	if (retlen)
	650	*retlen = expectlen ? expectlen : len;
	651
	652	return 0;
	653	}
	654
	655	/*
	656	=for apidoc utf8_to_uvchr
	657
	658	Returns the native character value of the first character in the string C<s>
	659	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	660	length, in bytes, of that character.
	661
	662	If C<s> does not point to a well-formed UTF-8 character, zero is
	663	returned and retlen is set, if possible, to -1.
	664
	665	=cut
	666	*/
	667
	668	UV
	669	Perl_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
	670	{
	671	PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
	672
	673	return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
	674	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	675	}
	676
	677	/*
	678	=for apidoc utf8_to_uvuni
	679
	680	Returns the Unicode code point of the first character in the string C<s>
	681	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	682	length, in bytes, of that character.
	683
	684	This function should only be used when the returned UV is considered
	685	an index into the Unicode semantic tables (e.g. swashes).
	686
	687	If C<s> does not point to a well-formed UTF-8 character, zero is
	688	returned and retlen is set, if possible, to -1.
	689
	690	=cut
	691	*/
	692
	693	UV
	694	Perl_utf8_to_uvuni(pTHX_ const U8 s, STRLEN retlen)
	695	{
	696	PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
	697
	698	/* Call the low level routine asking for checks */
	699	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
	700	ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	701	}
	702
	703	/*
	704	=for apidoc utf8_length
	705
	706	Return the length of the UTF-8 char encoded string C<s> in characters.
	707	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	708	up past C<e>, croaks.
	709
	710	=cut
	711	*/
	712
	713	STRLEN
	714	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	715	{
	716	dVAR;
	717	STRLEN len = 0;
	718
	719	PERL_ARGS_ASSERT_UTF8_LENGTH;
	720
	721	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	722	* the bitops (especially ~) can create illegal UTF-8.
	723	* In other words: in Perl UTF-8 is not just for Unicode. */
	724
	725	if (e < s)
	726	goto warn_and_return;
	727	while (s < e) {
	728	if (!UTF8_IS_INVARIANT(*s))
	729	s += UTF8SKIP(s);
	730	else
	731	s++;
	732	len++;
	733	}
	734
	735	if (e != s) {
	736	len--;
	737	warn_and_return:
	738	if (PL_op)
	739	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	740	"%s in %s", unees, OP_DESC(PL_op));
	741	else
	742	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
	743	}
	744
	745	return len;
	746	}
	747
	748	/*
	749	=for apidoc utf8_distance
	750
	751	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	752	and C<b>.
	753
	754	WARNING: use only if you know that the pointers point inside the
	755	same UTF-8 buffer.
	756
	757	=cut
	758	*/
	759
	760	IV
	761	Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
	762	{
	763	PERL_ARGS_ASSERT_UTF8_DISTANCE;
	764
	765	return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
	766	}
	767
	768	/*
	769	=for apidoc utf8_hop
	770
	771	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	772	forward or backward.
	773
	774	WARNING: do not use the following unless you know C<off> is within
	775	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	776	on the first byte of character or just after the last byte of a character.
	777
	778	=cut
	779	*/
	780
	781	U8 *
	782	Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
	783	{
	784	PERL_ARGS_ASSERT_UTF8_HOP;
	785
	786	PERL_UNUSED_CONTEXT;
	787	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	788	* the bitops (especially ~) can create illegal UTF-8.
	789	* In other words: in Perl UTF-8 is not just for Unicode. */
	790
	791	if (off >= 0) {
	792	while (off--)
	793	s += UTF8SKIP(s);
	794	}
	795	else {
	796	while (off++) {
	797	s--;
	798	while (UTF8_IS_CONTINUATION(*s))
	799	s--;
	800	}
	801	}
	802	return (U8 *)s;
	803	}
	804
	805	/*
	806	=for apidoc utf8_to_bytes
	807
	808	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	809	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	810	updates len to contain the new length.
	811	Returns zero on failure, setting C<len> to -1.
	812
	813	If you need a copy of the string, see C<bytes_from_utf8>.
	814
	815	=cut
	816	*/
	817
	818	U8 *
	819	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	820	{
	821	U8 * const save = s;
	822	U8 * const send = s + *len;
	823	U8 *d;
	824
	825	PERL_ARGS_ASSERT_UTF8_TO_BYTES;
	826
	827	/* ensure valid UTF-8 and chars < 256 before updating string */
	828	while (s < send) {
	829	U8 c = *s++;
	830
	831	if (!UTF8_IS_INVARIANT(c) &&
	832	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	833	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	834	*len = ((STRLEN) -1);
	835	return 0;
	836	}
	837	}
	838
	839	d = s = save;
	840	while (s < send) {
	841	STRLEN ulen;
	842	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	843	s += ulen;
	844	}
	845	*d = '\0';
	846	*len = d - save;
	847	return save;
	848	}
	849
	850	/*
	851	=for apidoc bytes_from_utf8
	852
	853	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	854	Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	855	the newly-created string, and updates C<len> to contain the new
	856	length. Returns the original string if no conversion occurs, C<len>
	857	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	858	0 if C<s> is converted or consisted entirely of characters that are invariant
	859	in utf8 (i.e., US-ASCII on non-EBCDIC machines).
	860
	861	=cut
	862	*/
	863
	864	U8 *
	865	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN len, bool *is_utf8)
	866	{
	867	U8 *d;
	868	const U8 *start = s;
	869	const U8 *send;
	870	I32 count = 0;
	871
	872	PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
	873
	874	PERL_UNUSED_CONTEXT;
	875	if (!*is_utf8)
	876	return (U8 *)start;
	877
	878	/* ensure valid UTF-8 and chars < 256 before converting string */
	879	for (send = s + *len; s < send;) {
	880	U8 c = *s++;
	881	if (!UTF8_IS_INVARIANT(c)) {
	882	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	883	(c = *s++) && UTF8_IS_CONTINUATION(c))
	884	count++;
	885	else
	886	return (U8 *)start;
	887	}
	888	}
	889
	890	*is_utf8 = FALSE;
	891
	892	Newx(d, (*len) - count + 1, U8);
	893	s = start; start = d;
	894	while (s < send) {
	895	U8 c = *s++;
	896	if (!UTF8_IS_INVARIANT(c)) {
	897	/* Then it is two-byte encoded */
	898	c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
	899	c = ASCII_TO_NATIVE(c);
	900	}
	901	*d++ = c;
	902	}
	903	*d = '\0';
	904	*len = d - start;
	905	return (U8 *)start;
	906	}
	907
	908	/*
	909	=for apidoc bytes_to_utf8
	910
	911	Converts a string C<s> of length C<len> from the native encoding into UTF-8.
	912	Returns a pointer to the newly-created string, and sets C<len> to
	913	reflect the new length.
	914
	915	A NUL character will be written after the end of the string.
	916
	917	If you want to convert to UTF-8 from encodings other than
	918	the native (Latin1 or EBCDIC),
	919	see sv_recode_to_utf8().
	920
	921	=cut
	922	*/
	923
	924	U8*
	925	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN len)
	926	{
	927	const U8 * const send = s + (*len);
	928	U8 *d;
	929	U8 *dst;
	930
	931	PERL_ARGS_ASSERT_BYTES_TO_UTF8;
	932	PERL_UNUSED_CONTEXT;
	933
	934	Newx(d, (len) 2 + 1, U8);
	935	dst = d;
	936
	937	while (s < send) {
	938	const UV uv = NATIVE_TO_ASCII(*s++);
	939	if (UNI_IS_INVARIANT(uv))
	940	*d++ = (U8)UTF_TO_NATIVE(uv);
	941	else {
	942	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
	943	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
	944	}
	945	}
	946	*d = '\0';
	947	*len = d-dst;
	948	return dst;
	949	}
	950
	951	/*
	952	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	953	*
	954	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	955	* We optimize for native, for obvious reasons. */
	956
	957	U8*
	958	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	959	{
	960	U8* pend;
	961	U8* dstart = d;
	962
	963	PERL_ARGS_ASSERT_UTF16_TO_UTF8;
	964
	965	if (bytelen & 1)
	966	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
	967
	968	pend = p + bytelen;
	969
	970	while (p < pend) {
	971	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	972	p += 2;
	973	if (uv < 0x80) {
	974	#ifdef EBCDIC
	975	*d++ = UNI_TO_NATIVE(uv);
	976	#else
	977	*d++ = (U8)uv;
	978	#endif
	979	continue;
	980	}
	981	if (uv < 0x800) {
	982	*d++ = (U8)(( uv >> 6) \| 0xc0);
	983	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	984	continue;
	985	}
	986	if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */
	987	if (p >= pend) {
	988	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	989	} else {
	990	UV low = (p[0] << 8) + p[1];
	991	p += 2;
	992	if (low < 0xdc00 \|\| low > 0xdfff)
	993	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	994	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	995	}
	996	} else if (uv >= 0xdc00 && uv <= 0xdfff) {
	997	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	998	}
	999	if (uv < 0x10000) {
	1000	*d++ = (U8)(( uv >> 12) \| 0xe0);
	1001	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1002	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1003	continue;
	1004	}
	1005	else {
	1006	*d++ = (U8)(( uv >> 18) \| 0xf0);
	1007	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	1008	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1009	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1010	continue;
	1011	}
	1012	}
	1013	*newlen = d - dstart;
	1014	return d;
	1015	}
	1016
	1017	/* Note: this one is slightly destructive of the source. */
	1018
	1019	U8*
	1020	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1021	{
	1022	U8* s = (U8*)p;
	1023	U8* const send = s + bytelen;
	1024
	1025	PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
	1026
	1027	if (bytelen & 1)
	1028	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
	1029	(UV)bytelen);
	1030
	1031	while (s < send) {
	1032	const U8 tmp = s[0];
	1033	s[0] = s[1];
	1034	s[1] = tmp;
	1035	s += 2;
	1036	}
	1037	return utf16_to_utf8(p, d, bytelen, newlen);
	1038	}
	1039
	1040	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	1041
	1042	bool
	1043	Perl_is_uni_alnum(pTHX_ UV c)
	1044	{
	1045	U8 tmpbuf[UTF8_MAXBYTES+1];
	1046	uvchr_to_utf8(tmpbuf, c);
	1047	return is_utf8_alnum(tmpbuf);
	1048	}
	1049
	1050	bool
	1051	Perl_is_uni_idfirst(pTHX_ UV c)
	1052	{
	1053	U8 tmpbuf[UTF8_MAXBYTES+1];
	1054	uvchr_to_utf8(tmpbuf, c);
	1055	return is_utf8_idfirst(tmpbuf);
	1056	}
	1057
	1058	bool
	1059	Perl_is_uni_alpha(pTHX_ UV c)
	1060	{
	1061	U8 tmpbuf[UTF8_MAXBYTES+1];
	1062	uvchr_to_utf8(tmpbuf, c);
	1063	return is_utf8_alpha(tmpbuf);
	1064	}
	1065
	1066	bool
	1067	Perl_is_uni_ascii(pTHX_ UV c)
	1068	{
	1069	U8 tmpbuf[UTF8_MAXBYTES+1];
	1070	uvchr_to_utf8(tmpbuf, c);
	1071	return is_utf8_ascii(tmpbuf);
	1072	}
	1073
	1074	bool
	1075	Perl_is_uni_space(pTHX_ UV c)
	1076	{
	1077	U8 tmpbuf[UTF8_MAXBYTES+1];
	1078	uvchr_to_utf8(tmpbuf, c);
	1079	return is_utf8_space(tmpbuf);
	1080	}
	1081
	1082	bool
	1083	Perl_is_uni_digit(pTHX_ UV c)
	1084	{
	1085	U8 tmpbuf[UTF8_MAXBYTES+1];
	1086	uvchr_to_utf8(tmpbuf, c);
	1087	return is_utf8_digit(tmpbuf);
	1088	}
	1089
	1090	bool
	1091	Perl_is_uni_upper(pTHX_ UV c)
	1092	{
	1093	U8 tmpbuf[UTF8_MAXBYTES+1];
	1094	uvchr_to_utf8(tmpbuf, c);
	1095	return is_utf8_upper(tmpbuf);
	1096	}
	1097
	1098	bool
	1099	Perl_is_uni_lower(pTHX_ UV c)
	1100	{
	1101	U8 tmpbuf[UTF8_MAXBYTES+1];
	1102	uvchr_to_utf8(tmpbuf, c);
	1103	return is_utf8_lower(tmpbuf);
	1104	}
	1105
	1106	bool
	1107	Perl_is_uni_cntrl(pTHX_ UV c)
	1108	{
	1109	U8 tmpbuf[UTF8_MAXBYTES+1];
	1110	uvchr_to_utf8(tmpbuf, c);
	1111	return is_utf8_cntrl(tmpbuf);
	1112	}
	1113
	1114	bool
	1115	Perl_is_uni_graph(pTHX_ UV c)
	1116	{
	1117	U8 tmpbuf[UTF8_MAXBYTES+1];
	1118	uvchr_to_utf8(tmpbuf, c);
	1119	return is_utf8_graph(tmpbuf);
	1120	}
	1121
	1122	bool
	1123	Perl_is_uni_print(pTHX_ UV c)
	1124	{
	1125	U8 tmpbuf[UTF8_MAXBYTES+1];
	1126	uvchr_to_utf8(tmpbuf, c);
	1127	return is_utf8_print(tmpbuf);
	1128	}
	1129
	1130	bool
	1131	Perl_is_uni_punct(pTHX_ UV c)
	1132	{
	1133	U8 tmpbuf[UTF8_MAXBYTES+1];
	1134	uvchr_to_utf8(tmpbuf, c);
	1135	return is_utf8_punct(tmpbuf);
	1136	}
	1137
	1138	bool
	1139	Perl_is_uni_xdigit(pTHX_ UV c)
	1140	{
	1141	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1142	uvchr_to_utf8(tmpbuf, c);
	1143	return is_utf8_xdigit(tmpbuf);
	1144	}
	1145
	1146	UV
	1147	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	1148	{
	1149	PERL_ARGS_ASSERT_TO_UNI_UPPER;
	1150
	1151	uvchr_to_utf8(p, c);
	1152	return to_utf8_upper(p, p, lenp);
	1153	}
	1154
	1155	UV
	1156	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	1157	{
	1158	PERL_ARGS_ASSERT_TO_UNI_TITLE;
	1159
	1160	uvchr_to_utf8(p, c);
	1161	return to_utf8_title(p, p, lenp);
	1162	}
	1163
	1164	UV
	1165	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1166	{
	1167	PERL_ARGS_ASSERT_TO_UNI_LOWER;
	1168
	1169	uvchr_to_utf8(p, c);
	1170	return to_utf8_lower(p, p, lenp);
	1171	}
	1172
	1173	UV
	1174	Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
	1175	{
	1176	PERL_ARGS_ASSERT_TO_UNI_FOLD;
	1177
	1178	uvchr_to_utf8(p, c);
	1179	return to_utf8_fold(p, p, lenp);
	1180	}
	1181
	1182	/* for now these all assume no locale info available for Unicode > 255 */
	1183
	1184	bool
	1185	Perl_is_uni_alnum_lc(pTHX_ UV c)
	1186	{
	1187	return is_uni_alnum(c); /* XXX no locale support yet */
	1188	}
	1189
	1190	bool
	1191	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1192	{
	1193	return is_uni_idfirst(c); /* XXX no locale support yet */
	1194	}
	1195
	1196	bool
	1197	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1198	{
	1199	return is_uni_alpha(c); /* XXX no locale support yet */
	1200	}
	1201
	1202	bool
	1203	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1204	{
	1205	return is_uni_ascii(c); /* XXX no locale support yet */
	1206	}
	1207
	1208	bool
	1209	Perl_is_uni_space_lc(pTHX_ UV c)
	1210	{
	1211	return is_uni_space(c); /* XXX no locale support yet */
	1212	}
	1213
	1214	bool
	1215	Perl_is_uni_digit_lc(pTHX_ UV c)
	1216	{
	1217	return is_uni_digit(c); /* XXX no locale support yet */
	1218	}
	1219
	1220	bool
	1221	Perl_is_uni_upper_lc(pTHX_ UV c)
	1222	{
	1223	return is_uni_upper(c); /* XXX no locale support yet */
	1224	}
	1225
	1226	bool
	1227	Perl_is_uni_lower_lc(pTHX_ UV c)
	1228	{
	1229	return is_uni_lower(c); /* XXX no locale support yet */
	1230	}
	1231
	1232	bool
	1233	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1234	{
	1235	return is_uni_cntrl(c); /* XXX no locale support yet */
	1236	}
	1237
	1238	bool
	1239	Perl_is_uni_graph_lc(pTHX_ UV c)
	1240	{
	1241	return is_uni_graph(c); /* XXX no locale support yet */
	1242	}
	1243
	1244	bool
	1245	Perl_is_uni_print_lc(pTHX_ UV c)
	1246	{
	1247	return is_uni_print(c); /* XXX no locale support yet */
	1248	}
	1249
	1250	bool
	1251	Perl_is_uni_punct_lc(pTHX_ UV c)
	1252	{
	1253	return is_uni_punct(c); /* XXX no locale support yet */
	1254	}
	1255
	1256	bool
	1257	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1258	{
	1259	return is_uni_xdigit(c); /* XXX no locale support yet */
	1260	}
	1261
	1262	U32
	1263	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1264	{
	1265	/* XXX returns only the first character -- do not use XXX */
	1266	/* XXX no locale support yet */
	1267	STRLEN len;
	1268	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1269	return (U32)to_uni_upper(c, tmpbuf, &len);
	1270	}
	1271
	1272	U32
	1273	Perl_to_uni_title_lc(pTHX_ U32 c)
	1274	{
	1275	/* XXX returns only the first character XXX -- do not use XXX */
	1276	/* XXX no locale support yet */
	1277	STRLEN len;
	1278	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1279	return (U32)to_uni_title(c, tmpbuf, &len);
	1280	}
	1281
	1282	U32
	1283	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1284	{
	1285	/* XXX returns only the first character -- do not use XXX */
	1286	/* XXX no locale support yet */
	1287	STRLEN len;
	1288	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1289	return (U32)to_uni_lower(c, tmpbuf, &len);
	1290	}
	1291
	1292	static bool
	1293	S_is_utf8_common(pTHX_ const U8 const p, SV *swash,
	1294	const char *const swashname)
	1295	{
	1296	dVAR;
	1297
	1298	PERL_ARGS_ASSERT_IS_UTF8_COMMON;
	1299
	1300	if (!is_utf8_char(p))
	1301	return FALSE;
	1302	if (!*swash)
	1303	*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
	1304	return swash_fetch(*swash, p, TRUE) != 0;
	1305	}
	1306
	1307	bool
	1308	Perl_is_utf8_alnum(pTHX_ const U8 *p)
	1309	{
	1310	dVAR;
	1311
	1312	PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
	1313
	1314	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1315	* descendant of isalnum(3), in other words, it doesn't
	1316	* contain the '_'. --jhi */
	1317	return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
	1318	}
	1319
	1320	bool
	1321	Perl_is_utf8_idfirst(pTHX_ const U8 p) / The naming is historical. */
	1322	{
	1323	dVAR;
	1324
	1325	PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
	1326
	1327	if (*p == '_')
	1328	return TRUE;
	1329	/* is_utf8_idstart would be more logical. */
	1330	return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
	1331	}
	1332
	1333	bool
	1334	Perl_is_utf8_idcont(pTHX_ const U8 *p)
	1335	{
	1336	dVAR;
	1337
	1338	PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
	1339
	1340	if (*p == '_')
	1341	return TRUE;
	1342	return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
	1343	}
	1344
	1345	bool
	1346	Perl_is_utf8_alpha(pTHX_ const U8 *p)
	1347	{
	1348	dVAR;
	1349
	1350	PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
	1351
	1352	return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
	1353	}
	1354
	1355	bool
	1356	Perl_is_utf8_ascii(pTHX_ const U8 *p)
	1357	{
	1358	dVAR;
	1359
	1360	PERL_ARGS_ASSERT_IS_UTF8_ASCII;
	1361
	1362	return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
	1363	}
	1364
	1365	bool
	1366	Perl_is_utf8_space(pTHX_ const U8 *p)
	1367	{
	1368	dVAR;
	1369
	1370	PERL_ARGS_ASSERT_IS_UTF8_SPACE;
	1371
	1372	return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
	1373	}
	1374
	1375	bool
	1376	Perl_is_utf8_perl_space(pTHX_ const U8 *p)
	1377	{
	1378	dVAR;
	1379
	1380	PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
	1381
	1382	return is_utf8_common(p, &PL_utf8_perl_space, "IsPerlSpace");
	1383	}
	1384
	1385	bool
	1386	Perl_is_utf8_perl_word(pTHX_ const U8 *p)
	1387	{
	1388	dVAR;
	1389
	1390	PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
	1391
	1392	return is_utf8_common(p, &PL_utf8_perl_word, "IsPerlWord");
	1393	}
	1394
	1395	bool
	1396	Perl_is_utf8_digit(pTHX_ const U8 *p)
	1397	{
	1398	dVAR;
	1399
	1400	PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
	1401
	1402	return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
	1403	}
	1404
	1405	bool
	1406	Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
	1407	{
	1408	dVAR;
	1409
	1410	PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
	1411
	1412	return is_utf8_common(p, &PL_utf8_posix_digit, "IsPosixDigit");
	1413	}
	1414
	1415	bool
	1416	Perl_is_utf8_upper(pTHX_ const U8 *p)
	1417	{
	1418	dVAR;
	1419
	1420	PERL_ARGS_ASSERT_IS_UTF8_UPPER;
	1421
	1422	return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
	1423	}
	1424
	1425	bool
	1426	Perl_is_utf8_lower(pTHX_ const U8 *p)
	1427	{
	1428	dVAR;
	1429
	1430	PERL_ARGS_ASSERT_IS_UTF8_LOWER;
	1431
	1432	return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
	1433	}
	1434
	1435	bool
	1436	Perl_is_utf8_cntrl(pTHX_ const U8 *p)
	1437	{
	1438	dVAR;
	1439
	1440	PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
	1441
	1442	return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
	1443	}
	1444
	1445	bool
	1446	Perl_is_utf8_graph(pTHX_ const U8 *p)
	1447	{
	1448	dVAR;
	1449
	1450	PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
	1451
	1452	return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
	1453	}
	1454
	1455	bool
	1456	Perl_is_utf8_print(pTHX_ const U8 *p)
	1457	{
	1458	dVAR;
	1459
	1460	PERL_ARGS_ASSERT_IS_UTF8_PRINT;
	1461
	1462	return is_utf8_common(p, &PL_utf8_print, "IsPrint");
	1463	}
	1464
	1465	bool
	1466	Perl_is_utf8_punct(pTHX_ const U8 *p)
	1467	{
	1468	dVAR;
	1469
	1470	PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
	1471
	1472	return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
	1473	}
	1474
	1475	bool
	1476	Perl_is_utf8_xdigit(pTHX_ const U8 *p)
	1477	{
	1478	dVAR;
	1479
	1480	PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
	1481
	1482	return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
	1483	}
	1484
	1485	bool
	1486	Perl_is_utf8_mark(pTHX_ const U8 *p)
	1487	{
	1488	dVAR;
	1489
	1490	PERL_ARGS_ASSERT_IS_UTF8_MARK;
	1491
	1492	return is_utf8_common(p, &PL_utf8_mark, "IsM");
	1493	}
	1494
	1495	bool
	1496	Perl_is_utf8_X_begin(pTHX_ const U8 *p)
	1497	{
	1498	dVAR;
	1499
	1500	PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
	1501
	1502	return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
	1503	}
	1504
	1505	bool
	1506	Perl_is_utf8_X_extend(pTHX_ const U8 *p)
	1507	{
	1508	dVAR;
	1509
	1510	PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
	1511
	1512	return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
	1513	}
	1514
	1515	bool
	1516	Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
	1517	{
	1518	dVAR;
	1519
	1520	PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
	1521
	1522	return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
	1523	}
	1524
	1525	bool
	1526	Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
	1527	{
	1528	dVAR;
	1529
	1530	PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
	1531
	1532	return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
	1533	}
	1534
	1535	bool
	1536	Perl_is_utf8_X_L(pTHX_ const U8 *p)
	1537	{
	1538	dVAR;
	1539
	1540	PERL_ARGS_ASSERT_IS_UTF8_X_L;
	1541
	1542	return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
	1543	}
	1544
	1545	bool
	1546	Perl_is_utf8_X_LV(pTHX_ const U8 *p)
	1547	{
	1548	dVAR;
	1549
	1550	PERL_ARGS_ASSERT_IS_UTF8_X_LV;
	1551
	1552	return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
	1553	}
	1554
	1555	bool
	1556	Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
	1557	{
	1558	dVAR;
	1559
	1560	PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
	1561
	1562	return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
	1563	}
	1564
	1565	bool
	1566	Perl_is_utf8_X_T(pTHX_ const U8 *p)
	1567	{
	1568	dVAR;
	1569
	1570	PERL_ARGS_ASSERT_IS_UTF8_X_T;
	1571
	1572	return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
	1573	}
	1574
	1575	bool
	1576	Perl_is_utf8_X_V(pTHX_ const U8 *p)
	1577	{
	1578	dVAR;
	1579
	1580	PERL_ARGS_ASSERT_IS_UTF8_X_V;
	1581
	1582	return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
	1583	}
	1584
	1585	bool
	1586	Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
	1587	{
	1588	dVAR;
	1589
	1590	PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
	1591
	1592	return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
	1593	}
	1594
	1595	/*
	1596	=for apidoc to_utf8_case
	1597
	1598	The "p" contains the pointer to the UTF-8 string encoding
	1599	the character that is being converted.
	1600
	1601	The "ustrp" is a pointer to the character buffer to put the
	1602	conversion result to. The "lenp" is a pointer to the length
	1603	of the result.
	1604
	1605	The "swashp" is a pointer to the swash to use.
	1606
	1607	Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
	1608	and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
	1609	but not always, a multicharacter mapping), is tried first.
	1610
	1611	The "special" is a string like "utf8::ToSpecLower", which means the
	1612	hash %utf8::ToSpecLower. The access to the hash is through
	1613	Perl_to_utf8_case().
	1614
	1615	The "normal" is a string like "ToLower" which means the swash
	1616	%utf8::ToLower.
	1617
	1618	=cut */
	1619
	1620	UV
	1621	Perl_to_utf8_case(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp,
	1622	SV *swashp, const char normal, const char *special)
	1623	{
	1624	dVAR;
	1625	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1626	STRLEN len = 0;
	1627	const UV uv0 = utf8_to_uvchr(p, NULL);
	1628	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
	1629	* are necessary in EBCDIC, they are redundant no-ops
	1630	* in ASCII-ish platforms, and hopefully optimized away. */
	1631	const UV uv1 = NATIVE_TO_UNI(uv0);
	1632
	1633	PERL_ARGS_ASSERT_TO_UTF8_CASE;
	1634
	1635	uvuni_to_utf8(tmpbuf, uv1);
	1636
	1637	if (!swashp) / load on-demand */
	1638	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	1639	/* This is the beginnings of a skeleton of code to read the info section
	1640	* that is in all the swashes in case we ever want to do that, so one can
	1641	* read things whose maps aren't code points, and whose default if missing
	1642	* is not to the code point itself. This was just to see if it actually
	1643	* worked. Details on what the possibilities are are in perluniprops.pod
	1644	HV * const hv = get_hv("utf8::SwashInfo", 0);
	1645	if (hv) {
	1646	SV **svp;
	1647	svp = hv_fetch(hv, (const char*)normal, strlen(normal), FALSE);
	1648	const char *s;
	1649
	1650	HV * const this_hash = SvRV(*svp);
	1651	svp = hv_fetch(this_hash, "type", strlen("type"), FALSE);
	1652	s = SvPV_const(*svp, len);
	1653	}
	1654	}*/
	1655
	1656	/* The 0xDF is the only special casing Unicode code point below 0x100. */
	1657	if (special && (uv1 == 0xDF \|\| uv1 > 0xFF)) {
	1658	/* It might be "special" (sometimes, but not always,
	1659	* a multicharacter mapping) */
	1660	HV * const hv = get_hv(special, 0);
	1661	SV **svp;
	1662
	1663	if (hv &&
	1664	(svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
	1665	(*svp)) {
	1666	const char *s;
	1667
	1668	s = SvPV_const(*svp, len);
	1669	if (len == 1)
	1670	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
	1671	else {
	1672	#ifdef EBCDIC
	1673	/* If we have EBCDIC we need to remap the characters
	1674	* since any characters in the low 256 are Unicode
	1675	* code points, not EBCDIC. */
	1676	U8 t = (U8)s, tend = t + len, d;
	1677
	1678	d = tmpbuf;
	1679	if (SvUTF8(*svp)) {
	1680	STRLEN tlen = 0;
	1681
	1682	while (t < tend) {
	1683	const UV c = utf8_to_uvchr(t, &tlen);
	1684	if (tlen > 0) {
	1685	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	1686	t += tlen;
	1687	}
	1688	else
	1689	break;
	1690	}
	1691	}
	1692	else {
	1693	while (t < tend) {
	1694	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	1695	t++;
	1696	}
	1697	}
	1698	len = d - tmpbuf;
	1699	Copy(tmpbuf, ustrp, len, U8);
	1700	#else
	1701	Copy(s, ustrp, len, U8);
	1702	#endif
	1703	}
	1704	}
	1705	}
	1706
	1707	if (!len && *swashp) {
	1708	const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
	1709
	1710	if (uv2) {
	1711	/* It was "normal" (a single character mapping). */
	1712	const UV uv3 = UNI_TO_NATIVE(uv2);
	1713	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	1714	}
	1715	}
	1716
	1717	if (!len) /* Neither: just copy. In other words, there was no mapping
	1718	defined, which means that the code point maps to itself */
	1719	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
	1720
	1721	if (lenp)
	1722	*lenp = len;
	1723
	1724	return len ? utf8_to_uvchr(ustrp, 0) : 0;
	1725	}
	1726
	1727	/*
	1728	=for apidoc to_utf8_upper
	1729
	1730	Convert the UTF-8 encoded character at p to its uppercase version and
	1731	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1732	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1733	the uppercase version may be longer than the original character.
	1734
	1735	The first character of the uppercased version is returned
	1736	(but note, as explained above, that there may be more.)
	1737
	1738	=cut */
	1739
	1740	UV
	1741	Perl_to_utf8_upper(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1742	{
	1743	dVAR;
	1744
	1745	PERL_ARGS_ASSERT_TO_UTF8_UPPER;
	1746
	1747	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1748	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
	1749	}
	1750
	1751	/*
	1752	=for apidoc to_utf8_title
	1753
	1754	Convert the UTF-8 encoded character at p to its titlecase version and
	1755	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1756	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1757	titlecase version may be longer than the original character.
	1758
	1759	The first character of the titlecased version is returned
	1760	(but note, as explained above, that there may be more.)
	1761
	1762	=cut */
	1763
	1764	UV
	1765	Perl_to_utf8_title(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1766	{
	1767	dVAR;
	1768
	1769	PERL_ARGS_ASSERT_TO_UTF8_TITLE;
	1770
	1771	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1772	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
	1773	}
	1774
	1775	/*
	1776	=for apidoc to_utf8_lower
	1777
	1778	Convert the UTF-8 encoded character at p to its lowercase version and
	1779	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1780	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1781	lowercase version may be longer than the original character.
	1782
	1783	The first character of the lowercased version is returned
	1784	(but note, as explained above, that there may be more.)
	1785
	1786	=cut */
	1787
	1788	UV
	1789	Perl_to_utf8_lower(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1790	{
	1791	dVAR;
	1792
	1793	PERL_ARGS_ASSERT_TO_UTF8_LOWER;
	1794
	1795	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1796	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
	1797	}
	1798
	1799	/*
	1800	=for apidoc to_utf8_fold
	1801
	1802	Convert the UTF-8 encoded character at p to its foldcase version and
	1803	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1804	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	1805	foldcase version may be longer than the original character (up to
	1806	three characters).
	1807
	1808	The first character of the foldcased version is returned
	1809	(but note, as explained above, that there may be more.)
	1810
	1811	=cut */
	1812
	1813	UV
	1814	Perl_to_utf8_fold(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1815	{
	1816	dVAR;
	1817
	1818	PERL_ARGS_ASSERT_TO_UTF8_FOLD;
	1819
	1820	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1821	&PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
	1822	}
	1823
	1824	/* Note:
	1825	* A "swash" is a swatch hash.
	1826	* A "swatch" is a bit vector generated by utf8.c:S_swash_get().
	1827	* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
	1828	* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
	1829	*/
	1830	SV*
	1831	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
	1832	{
	1833	dVAR;
	1834	SV* retval;
	1835	dSP;
	1836	const size_t pkg_len = strlen(pkg);
	1837	const size_t name_len = strlen(name);
	1838	HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
	1839	SV* errsv_save;
	1840
	1841	PERL_ARGS_ASSERT_SWASH_INIT;
	1842
	1843	PUSHSTACKi(PERLSI_MAGIC);
	1844	ENTER;
	1845	SAVEHINTS();
	1846	save_re_context();
	1847	if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
	1848	ENTER;
	1849	errsv_save = newSVsv(ERRSV);
	1850	/* It is assumed that callers of this routine are not passing in any
	1851	user derived data. */
	1852	/* Need to do this after save_re_context() as it will set PL_tainted to
	1853	1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
	1854	Even line to create errsv_save can turn on PL_tainted. */
	1855	SAVEBOOL(PL_tainted);
	1856	PL_tainted = 0;
	1857	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	1858	NULL);
	1859	if (!SvTRUE(ERRSV))
	1860	sv_setsv(ERRSV, errsv_save);
	1861	SvREFCNT_dec(errsv_save);
	1862	LEAVE;
	1863	}
	1864	SPAGAIN;
	1865	PUSHMARK(SP);
	1866	EXTEND(SP,5);
	1867	mPUSHp(pkg, pkg_len);
	1868	mPUSHp(name, name_len);
	1869	PUSHs(listsv);
	1870	mPUSHi(minbits);
	1871	mPUSHi(none);
	1872	PUTBACK;
	1873	errsv_save = newSVsv(ERRSV);
	1874	if (call_method("SWASHNEW", G_SCALAR))
	1875	retval = newSVsv(*PL_stack_sp--);
	1876	else
	1877	retval = &PL_sv_undef;
	1878	if (!SvTRUE(ERRSV))
	1879	sv_setsv(ERRSV, errsv_save);
	1880	SvREFCNT_dec(errsv_save);
	1881	LEAVE;
	1882	POPSTACK;
	1883	if (IN_PERL_COMPILETIME) {
	1884	CopHINTS_set(PL_curcop, PL_hints);
	1885	}
	1886	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	1887	if (SvPOK(retval))
	1888	Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
	1889	SVfARG(retval));
	1890	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	1891	}
	1892	return retval;
	1893	}
	1894
	1895
	1896	/* This API is wrong for special case conversions since we may need to
	1897	* return several Unicode characters for a single Unicode character
	1898	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	1899	* the lower-level routine, and it is similarly broken for returning
	1900	* multiple values. --jhi */
	1901	/* Now SWASHGET is recasted into S_swash_get in this file. */
	1902
	1903	/* Note:
	1904	* Returns the value of property/mapping C<swash> for the first character
	1905	* of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
	1906	* assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
	1907	* assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
	1908	*/
	1909	UV
	1910	Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
	1911	{
	1912	dVAR;
	1913	HV *const hv = MUTABLE_HV(SvRV(swash));
	1914	U32 klen;
	1915	U32 off;
	1916	STRLEN slen;
	1917	STRLEN needents;
	1918	const U8 *tmps = NULL;
	1919	U32 bit;
	1920	SV *swatch;
	1921	U8 tmputf8[2];
	1922	const UV c = NATIVE_TO_ASCII(*ptr);
	1923
	1924	PERL_ARGS_ASSERT_SWASH_FETCH;
	1925
	1926	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	1927	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
	1928	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
	1929	ptr = tmputf8;
	1930	}
	1931	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	1932	* then the "swatch" is a vec() for all the chars which start
	1933	* with 0xAA..0xYY
	1934	* So the key in the hash (klen) is length of encoded char -1
	1935	*/
	1936	klen = UTF8SKIP(ptr) - 1;
	1937	off = ptr[klen];
	1938
	1939	if (klen == 0) {
	1940	/* If char is invariant then swatch is for all the invariant chars
	1941	* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
	1942	*/
	1943	needents = UTF_CONTINUATION_MARK;
	1944	off = NATIVE_TO_UTF(ptr[klen]);
	1945	}
	1946	else {
	1947	/* If char is encoded then swatch is for the prefix */
	1948	needents = (1 << UTF_ACCUMULATION_SHIFT);
	1949	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	1950	}
	1951
	1952	/*
	1953	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	1954	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	1955	* it's nothing to sniff at.) Pity we usually come through at least
	1956	* two function calls to get here...
	1957	*
	1958	* NB: this code assumes that swatches are never modified, once generated!
	1959	*/
	1960
	1961	if (hv == PL_last_swash_hv &&
	1962	klen == PL_last_swash_klen &&
	1963	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	1964	{
	1965	tmps = PL_last_swash_tmps;
	1966	slen = PL_last_swash_slen;
	1967	}
	1968	else {
	1969	/* Try our second-level swatch cache, kept in a hash. */
	1970	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	1971
	1972	/* If not cached, generate it via swash_get */
	1973	if (!svp \|\| !SvPOK(*svp)
	1974	\|\| !(tmps = (const U8)SvPV_const(svp, slen))) {
	1975	/* We use utf8n_to_uvuni() as we want an index into
	1976	Unicode tables, not a native character number.
	1977	*/
	1978	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
	1979	ckWARN(WARN_UTF8) ?
	1980	0 : UTF8_ALLOW_ANY);
	1981	swatch = swash_get(swash,
	1982	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	1983	(klen) ? (code_point & ~(needents - 1)) : 0,
	1984	needents);
	1985
	1986	if (IN_PERL_COMPILETIME)
	1987	CopHINTS_set(PL_curcop, PL_hints);
	1988
	1989	svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
	1990
	1991	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
	1992	\|\| (slen << 3) < needents)
	1993	Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
	1994	}
	1995
	1996	PL_last_swash_hv = hv;
	1997	assert(klen <= sizeof(PL_last_swash_key));
	1998	PL_last_swash_klen = (U8)klen;
	1999	/* FIXME change interpvar.h? */
	2000	PL_last_swash_tmps = (U8 *) tmps;
	2001	PL_last_swash_slen = slen;
	2002	if (klen)
	2003	Copy(ptr, PL_last_swash_key, klen, U8);
	2004	}
	2005
	2006	switch ((int)((slen << 3) / needents)) {
	2007	case 1:
	2008	bit = 1 << (off & 7);
	2009	off >>= 3;
	2010	return (tmps[off] & bit) != 0;
	2011	case 8:
	2012	return tmps[off];
	2013	case 16:
	2014	off <<= 1;
	2015	return (tmps[off] << 8) + tmps[off + 1] ;
	2016	case 32:
	2017	off <<= 2;
	2018	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	2019	}
	2020	Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
	2021	NORETURN_FUNCTION_END;
	2022	}
	2023
	2024	/* Note:
	2025	* Returns a swatch (a bit vector string) for a code point sequence
	2026	* that starts from the value C<start> and comprises the number C<span>.
	2027	* A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
	2028	* Should be used via swash_fetch, which will cache the swatch in C<swash>.
	2029	*/
	2030	STATIC SV*
	2031	S_swash_get(pTHX_ SV* swash, UV start, UV span)
	2032	{
	2033	SV *swatch;
	2034	U8 l, lend, x, xend, *s;
	2035	STRLEN lcur, xcur, scur;
	2036	HV *const hv = MUTABLE_HV(SvRV(swash));
	2037	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	2038	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2039	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2040	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	2041	SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	2042	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	2043	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
	2044	const STRLEN bits = SvUV(*bitssvp);
	2045	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2046	const UV none = SvUV(*nonesvp);
	2047	const UV end = start + span;
	2048
	2049	PERL_ARGS_ASSERT_SWASH_GET;
	2050
	2051	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	2052	Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
	2053	(UV)bits);
	2054	}
	2055
	2056	/* create and initialize $swatch */
	2057	scur = octets ? (span * octets) : (span + 7) / 8;
	2058	swatch = newSV(scur);
	2059	SvPOK_on(swatch);
	2060	s = (U8*)SvPVX(swatch);
	2061	if (octets && none) {
	2062	const U8* const e = s + scur;
	2063	while (s < e) {
	2064	if (bits == 8)
	2065	*s++ = (U8)(none & 0xff);
	2066	else if (bits == 16) {
	2067	*s++ = (U8)((none >> 8) & 0xff);
	2068	*s++ = (U8)( none & 0xff);
	2069	}
	2070	else if (bits == 32) {
	2071	*s++ = (U8)((none >> 24) & 0xff);
	2072	*s++ = (U8)((none >> 16) & 0xff);
	2073	*s++ = (U8)((none >> 8) & 0xff);
	2074	*s++ = (U8)( none & 0xff);
	2075	}
	2076	}
	2077	*s = '\0';
	2078	}
	2079	else {
	2080	(void)memzero((U8*)s, scur + 1);
	2081	}
	2082	SvCUR_set(swatch, scur);
	2083	s = (U8*)SvPVX(swatch);
	2084
	2085	/* read $swash->{LIST} */
	2086	l = (U8)SvPV(listsvp, lcur);
	2087	lend = l + lcur;
	2088	while (l < lend) {
	2089	UV min, max, val;
	2090	STRLEN numlen;
	2091	I32 flags = PERL_SCAN_SILENT_ILLDIGIT \| PERL_SCAN_DISALLOW_PREFIX;
	2092
	2093	U8* const nl = (U8*)memchr(l, '\n', lend - l);
	2094
	2095	numlen = lend - l;
	2096	min = grok_hex((char *)l, &numlen, &flags, NULL);
	2097	if (numlen)
	2098	l += numlen;
	2099	else if (nl) {
	2100	l = nl + 1; /* 1 is length of "\n" */
	2101	continue;
	2102	}
	2103	else {
	2104	l = lend; /* to LIST's end at which \n is not found */
	2105	break;
	2106	}
	2107
	2108	if (isBLANK(*l)) {
	2109	++l;
	2110	flags = PERL_SCAN_SILENT_ILLDIGIT \| PERL_SCAN_DISALLOW_PREFIX;
	2111	numlen = lend - l;
	2112	max = grok_hex((char *)l, &numlen, &flags, NULL);
	2113	if (numlen)
	2114	l += numlen;
	2115	else
	2116	max = min;
	2117
	2118	if (octets) {
	2119	if (isBLANK(*l)) {
	2120	++l;
	2121	flags = PERL_SCAN_SILENT_ILLDIGIT \|
	2122	PERL_SCAN_DISALLOW_PREFIX;
	2123	numlen = lend - l;
	2124	val = grok_hex((char *)l, &numlen, &flags, NULL);
	2125	if (numlen)
	2126	l += numlen;
	2127	else
	2128	val = 0;
	2129	}
	2130	else {
	2131	val = 0;
	2132	if (typeto) {
	2133	Perl_croak(aTHX_ "%s: illegal mapping '%s'",
	2134	typestr, l);
	2135	}
	2136	}
	2137	}
	2138	else
	2139	val = 0; /* bits == 1, then val should be ignored */
	2140	}
	2141	else {
	2142	max = min;
	2143	if (octets) {
	2144	val = 0;
	2145	if (typeto) {
	2146	Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
	2147	}
	2148	}
	2149	else
	2150	val = 0; /* bits == 1, then val should be ignored */
	2151	}
	2152
	2153	if (nl)
	2154	l = nl + 1;
	2155	else
	2156	l = lend;
	2157
	2158	if (max < start)
	2159	continue;
	2160
	2161	if (octets) {
	2162	UV key;
	2163	if (min < start) {
	2164	if (!none \|\| val < none) {
	2165	val += start - min;
	2166	}
	2167	min = start;
	2168	}
	2169	for (key = min; key <= max; key++) {
	2170	STRLEN offset;
	2171	if (key >= end)
	2172	goto go_out_list;
	2173	/* offset must be non-negative (start <= min <= key < end) */
	2174	offset = octets * (key - start);
	2175	if (bits == 8)
	2176	s[offset] = (U8)(val & 0xff);
	2177	else if (bits == 16) {
	2178	s[offset ] = (U8)((val >> 8) & 0xff);
	2179	s[offset + 1] = (U8)( val & 0xff);
	2180	}
	2181	else if (bits == 32) {
	2182	s[offset ] = (U8)((val >> 24) & 0xff);
	2183	s[offset + 1] = (U8)((val >> 16) & 0xff);
	2184	s[offset + 2] = (U8)((val >> 8) & 0xff);
	2185	s[offset + 3] = (U8)( val & 0xff);
	2186	}
	2187
	2188	if (!none \|\| val < none)
	2189	++val;
	2190	}
	2191	}
	2192	else { /* bits == 1, then val should be ignored */
	2193	UV key;
	2194	if (min < start)
	2195	min = start;
	2196	for (key = min; key <= max; key++) {
	2197	const STRLEN offset = (STRLEN)(key - start);
	2198	if (key >= end)
	2199	goto go_out_list;
	2200	s[offset >> 3] \|= 1 << (offset & 7);
	2201	}
	2202	}
	2203	} /* while */
	2204	go_out_list:
	2205
	2206	/* read $swash->{EXTRAS} */
	2207	x = (U8)SvPV(extssvp, xcur);
	2208	xend = x + xcur;
	2209	while (x < xend) {
	2210	STRLEN namelen;
	2211	U8 *namestr;
	2212	SV** othersvp;
	2213	HV* otherhv;
	2214	STRLEN otherbits;
	2215	SV *otherbitssvp, other;
	2216	U8 s, o, *nl;
	2217	STRLEN slen, olen;
	2218
	2219	const U8 opc = *x++;
	2220	if (opc == '\n')
	2221	continue;
	2222
	2223	nl = (U8*)memchr(x, '\n', xend - x);
	2224
	2225	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	2226	if (nl) {
	2227	x = nl + 1; /* 1 is length of "\n" */
	2228	continue;
	2229	}
	2230	else {
	2231	x = xend; /* to EXTRAS' end at which \n is not found */
	2232	break;
	2233	}
	2234	}
	2235
	2236	namestr = x;
	2237	if (nl) {
	2238	namelen = nl - namestr;
	2239	x = nl + 1;
	2240	}
	2241	else {
	2242	namelen = xend - namestr;
	2243	x = xend;
	2244	}
	2245
	2246	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	2247	otherhv = MUTABLE_HV(SvRV(*othersvp));
	2248	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	2249	otherbits = (STRLEN)SvUV(*otherbitssvp);
	2250	if (bits < otherbits)
	2251	Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
	2252
	2253	/* The "other" swatch must be destroyed after. */
	2254	other = swash_get(*othersvp, start, span);
	2255	o = (U8*)SvPV(other, olen);
	2256
	2257	if (!olen)
	2258	Perl_croak(aTHX_ "panic: swash_get got improper swatch");
	2259
	2260	s = (U8*)SvPV(swatch, slen);
	2261	if (bits == 1 && otherbits == 1) {
	2262	if (slen != olen)
	2263	Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
	2264
	2265	switch (opc) {
	2266	case '+':
	2267	while (slen--)
	2268	s++ \|= o++;
	2269	break;
	2270	case '!':
	2271	while (slen--)
	2272	s++ \|= ~o++;
	2273	break;
	2274	case '-':
	2275	while (slen--)
	2276	s++ &= ~o++;
	2277	break;
	2278	case '&':
	2279	while (slen--)
	2280	s++ &= o++;
	2281	break;
	2282	default:
	2283	break;
	2284	}
	2285	}
	2286	else {
	2287	STRLEN otheroctets = otherbits >> 3;
	2288	STRLEN offset = 0;
	2289	U8* const send = s + slen;
	2290
	2291	while (s < send) {
	2292	UV otherval = 0;
	2293
	2294	if (otherbits == 1) {
	2295	otherval = (o[offset >> 3] >> (offset & 7)) & 1;
	2296	++offset;
	2297	}
	2298	else {
	2299	STRLEN vlen = otheroctets;
	2300	otherval = *o++;
	2301	while (--vlen) {
	2302	otherval <<= 8;
	2303	otherval \|= *o++;
	2304	}
	2305	}
	2306
	2307	if (opc == '+' && otherval)
	2308	NOOP; /* replace with otherval */
	2309	else if (opc == '!' && !otherval)
	2310	otherval = 1;
	2311	else if (opc == '-' && otherval)
	2312	otherval = 0;
	2313	else if (opc == '&' && !otherval)
	2314	otherval = 0;
	2315	else {
	2316	s += octets; /* no replacement */
	2317	continue;
	2318	}
	2319
	2320	if (bits == 8)
	2321	*s++ = (U8)( otherval & 0xff);
	2322	else if (bits == 16) {
	2323	*s++ = (U8)((otherval >> 8) & 0xff);
	2324	*s++ = (U8)( otherval & 0xff);
	2325	}
	2326	else if (bits == 32) {
	2327	*s++ = (U8)((otherval >> 24) & 0xff);
	2328	*s++ = (U8)((otherval >> 16) & 0xff);
	2329	*s++ = (U8)((otherval >> 8) & 0xff);
	2330	*s++ = (U8)( otherval & 0xff);
	2331	}
	2332	}
	2333	}
	2334	sv_free(other); /* through with it! */
	2335	} /* while */
	2336	return swatch;
	2337	}
	2338
	2339	/*
	2340	=for apidoc uvchr_to_utf8
	2341
	2342	Adds the UTF-8 representation of the Native codepoint C<uv> to the end
	2343	of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
	2344	bytes available. The return value is the pointer to the byte after the
	2345	end of the new character. In other words,
	2346
	2347	d = uvchr_to_utf8(d, uv);
	2348
	2349	is the recommended wide native character-aware way of saying
	2350
	2351	*(d++) = uv;
	2352
	2353	=cut
	2354	*/
	2355
	2356	/* On ASCII machines this is normally a macro but we want a
	2357	real function in case XS code wants it
	2358	*/
	2359	U8 *
	2360	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	2361	{
	2362	PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
	2363
	2364	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
	2365	}
	2366
	2367	U8 *
	2368	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	2369	{
	2370	PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
	2371
	2372	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
	2373	}
	2374
	2375	/*
	2376	=for apidoc utf8n_to_uvchr
	2377	flags
	2378
	2379	Returns the native character value of the first character in the string
	2380	C<s>
	2381	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	2382	length, in bytes, of that character.
	2383
	2384	Allows length and flags to be passed to low level routine.
	2385
	2386	=cut
	2387	*/
	2388	/* On ASCII machines this is normally a macro but we want
	2389	a real function in case XS code wants it
	2390	*/
	2391	UV
	2392	Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen,
	2393	U32 flags)
	2394	{
	2395	const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	2396
	2397	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
	2398
	2399	return UNI_TO_NATIVE(uv);
	2400	}
	2401
	2402	/*
	2403	=for apidoc pv_uni_display
	2404
	2405	Build to the scalar dsv a displayable version of the string spv,
	2406	length len, the displayable version being at most pvlim bytes long
	2407	(if longer, the rest is truncated and "..." will be appended).
	2408
	2409	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	2410	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	2411	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	2412	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	2413	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	2414	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	2415
	2416	The pointer to the PV of the dsv is returned.
	2417
	2418	=cut */
	2419	char *
	2420	Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	2421	{
	2422	int truncated = 0;
	2423	const char s, e;
	2424
	2425	PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
	2426
	2427	sv_setpvs(dsv, "");
	2428	SvUTF8_off(dsv);
	2429	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	2430	UV u;
	2431	/* This serves double duty as a flag and a character to print after
	2432	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	2433	*/
	2434	char ok = 0;
	2435
	2436	if (pvlim && SvCUR(dsv) >= pvlim) {
	2437	truncated++;
	2438	break;
	2439	}
	2440	u = utf8_to_uvchr((U8*)s, 0);
	2441	if (u < 256) {
	2442	const unsigned char c = (unsigned char)u & 0xFF;
	2443	if (flags & UNI_DISPLAY_BACKSLASH) {
	2444	switch (c) {
	2445	case '\n':
	2446	ok = 'n'; break;
	2447	case '\r':
	2448	ok = 'r'; break;
	2449	case '\t':
	2450	ok = 't'; break;
	2451	case '\f':
	2452	ok = 'f'; break;
	2453	case '\a':
	2454	ok = 'a'; break;
	2455	case '\\':
	2456	ok = '\\'; break;
	2457	default: break;
	2458	}
	2459	if (ok) {
	2460	const char string = ok;
	2461	sv_catpvs(dsv, "\\");
	2462	sv_catpvn(dsv, &string, 1);
	2463	}
	2464	}
	2465	/* isPRINT() is the locale-blind version. */
	2466	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	2467	const char string = c;
	2468	sv_catpvn(dsv, &string, 1);
	2469	ok = 1;
	2470	}
	2471	}
	2472	if (!ok)
	2473	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	2474	}
	2475	if (truncated)
	2476	sv_catpvs(dsv, "...");
	2477
	2478	return SvPVX(dsv);
	2479	}
	2480
	2481	/*
	2482	=for apidoc sv_uni_display
	2483
	2484	Build to the scalar dsv a displayable version of the scalar sv,
	2485	the displayable version being at most pvlim bytes long
	2486	(if longer, the rest is truncated and "..." will be appended).
	2487
	2488	The flags argument is as in pv_uni_display().
	2489
	2490	The pointer to the PV of the dsv is returned.
	2491
	2492	=cut
	2493	*/
	2494	char *
	2495	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	2496	{
	2497	PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
	2498
	2499	return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
	2500	SvCUR(ssv), pvlim, flags);
	2501	}
	2502
	2503	/*
	2504	=for apidoc ibcmp_utf8
	2505
	2506	Return true if the strings s1 and s2 differ case-insensitively, false
	2507	if not (if they are equal case-insensitively). If u1 is true, the
	2508	string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
	2509	the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
	2510	are false, the respective string is assumed to be in native 8-bit
	2511	encoding.
	2512
	2513	If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
	2514	in there (they will point at the beginning of the I<next> character).
	2515	If the pointers behind pe1 or pe2 are non-NULL, they are the end
	2516	pointers beyond which scanning will not continue under any
	2517	circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
	2518	s2+l2 will be used as goal end pointers that will also stop the scan,
	2519	and which qualify towards defining a successful match: all the scans
	2520	that define an explicit length must reach their goal pointers for
	2521	a match to succeed).
	2522
	2523	For case-insensitiveness, the "casefolding" of Unicode is used
	2524	instead of upper/lowercasing both the characters, see
	2525	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	2526
	2527	=cut */
	2528	I32
	2529	Perl_ibcmp_utf8(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2)
	2530	{
	2531	dVAR;
	2532	register const U8 p1 = (const U8)s1;
	2533	register const U8 p2 = (const U8)s2;
	2534	register const U8 *f1 = NULL;
	2535	register const U8 *f2 = NULL;
	2536	register U8 *e1 = NULL;
	2537	register U8 *q1 = NULL;
	2538	register U8 *e2 = NULL;
	2539	register U8 *q2 = NULL;
	2540	STRLEN n1 = 0, n2 = 0;
	2541	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	2542	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	2543	U8 natbuf[1+1];
	2544	STRLEN foldlen1, foldlen2;
	2545	bool match;
	2546
	2547	PERL_ARGS_ASSERT_IBCMP_UTF8;
	2548
	2549	if (pe1)
	2550	e1 = (U8*)pe1;
	2551	/* assert(e1 \|\| l1); */
	2552	if (e1 == 0 \|\| (l1 && l1 < (UV)(e1 - (const U8*)s1)))
	2553	f1 = (const U8*)s1 + l1;
	2554	if (pe2)
	2555	e2 = (U8*)pe2;
	2556	/* assert(e2 \|\| l2); */
	2557	if (e2 == 0 \|\| (l2 && l2 < (UV)(e2 - (const U8*)s2)))
	2558	f2 = (const U8*)s2 + l2;
	2559
	2560	/* This shouldn't happen. However, putting an assert() there makes some
	2561	* tests fail. */
	2562	/* assert((e1 == 0 && f1 == 0) \|\| (e2 == 0 && f2 == 0) \|\| (f1 == 0 && f2 == 0)); */
	2563	if ((e1 == 0 && f1 == 0) \|\| (e2 == 0 && f2 == 0) \|\| (f1 == 0 && f2 == 0))
	2564	return 1; /* mismatch; possible infinite loop or false positive */
	2565
	2566	if (!u1 \|\| !u2)
	2567	natbuf[1] = 0; /* Need to terminate the buffer. */
	2568
	2569	while ((e1 == 0 \|\| p1 < e1) &&
	2570	(f1 == 0 \|\| p1 < f1) &&
	2571	(e2 == 0 \|\| p2 < e2) &&
	2572	(f2 == 0 \|\| p2 < f2)) {
	2573	if (n1 == 0) {
	2574	if (u1)
	2575	to_utf8_fold(p1, foldbuf1, &foldlen1);
	2576	else {
	2577	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
	2578	to_utf8_fold(natbuf, foldbuf1, &foldlen1);
	2579	}
	2580	q1 = foldbuf1;
	2581	n1 = foldlen1;
	2582	}
	2583	if (n2 == 0) {
	2584	if (u2)
	2585	to_utf8_fold(p2, foldbuf2, &foldlen2);
	2586	else {
	2587	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
	2588	to_utf8_fold(natbuf, foldbuf2, &foldlen2);
	2589	}
	2590	q2 = foldbuf2;
	2591	n2 = foldlen2;
	2592	}
	2593	while (n1 && n2) {
	2594	if ( UTF8SKIP(q1) != UTF8SKIP(q2) \|\|
	2595	(UTF8SKIP(q1) == 1 && q1 != q2) \|\|
	2596	memNE((char)q1, (char)q2, UTF8SKIP(q1)) )
	2597	return 1; /* mismatch */
	2598	n1 -= UTF8SKIP(q1);
	2599	q1 += UTF8SKIP(q1);
	2600	n2 -= UTF8SKIP(q2);
	2601	q2 += UTF8SKIP(q2);
	2602	}
	2603	if (n1 == 0)
	2604	p1 += u1 ? UTF8SKIP(p1) : 1;
	2605	if (n2 == 0)
	2606	p2 += u2 ? UTF8SKIP(p2) : 1;
	2607
	2608	}
	2609
	2610	/* A match is defined by all the scans that specified
	2611	* an explicit length reaching their final goals. */
	2612	match = (n1 == 0 && n2 == 0 /* Must not match partial char; Bug #72998 */
	2613	&& (f1 == 0 \|\| p1 == f1) && (f2 == 0 \|\| p2 == f2));
	2614
	2615	if (match) {
	2616	if (pe1)
	2617	pe1 = (char)p1;
	2618	if (pe2)
	2619	pe2 = (char)p2;
	2620	}
	2621
	2622	return match ? 0 : 1; /* 0 match, 1 mismatch */
	2623	}
	2624
	2625	/*
	2626	* Local variables:
	2627	* c-indentation-style: bsd
	2628	* c-basic-offset: 4
	2629	* indent-tabs-mode: t
	2630	* End:
	2631	*
	2632	* ex: set ts=8 sts=4 sw=4 noet:
	2633	*/