perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
	17	*
	18	* 'Well do I understand your speech,' he answered in the same language;
	19	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	20	* as is the custom in the West, if you wish to be answered?'
	21	* --Gandalf, addressing Théoden's door wardens
	22	*
	23	* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	24	*
	25	* ...the travellers perceived that the floor was paved with stones of many
	26	* hues; branching runes and strange devices intertwined beneath their feet.
	27	*
	28	* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	29	*/
	30
	31	#include "EXTERN.h"
	32	#define PERL_IN_UTF8_C
	33	#include "perl.h"
	34
	35	#ifndef EBCDIC
	36	/* Separate prototypes needed because in ASCII systems these are
	37	* usually macros but they still are compiled as code, too. */
	38	PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags);
	39	PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
	40	#endif
	41
	/* Warning text shared by the routines in this file that detect a string
	 * ending part-way through a character (e.g. Perl_utf8_length() and
	 * Perl_bytes_cmp_utf8()). */
	static const char unees[] =
	    "Malformed UTF-8 character (unexpected end of string)";
	44
	45	/*
	46	=head1 Unicode Support
	47
	48	This file contains various utility functions for manipulating UTF8-encoded
	49	strings. For the uninitiated, this is a method of representing arbitrary
	50	Unicode characters as a variable number of bytes, in such a way that
	51	characters in the ASCII range are unmodified, and a zero byte never appears
	52	within non-zero characters.
	53
	54	=cut
	55	*/
	56
	57	/*
	58	=for apidoc is_ascii_string
	59
	60	Returns true if the first C<len> bytes of the given string are the same whether
	61	or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
	62	is, if they are invariant. On ASCII-ish machines, only ASCII characters
	63	fit this definition, hence the function's name.
	64
	65	If C<len> is 0, it will be calculated using C<strlen(s)>.
	66
	67	See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
	68
	69	=cut
	70	*/
	71
	72	bool
	73	Perl_is_ascii_string(const U8 *s, STRLEN len)
	74	{
	75	const U8* const send = s + (len ? len : strlen((const char *)s));
	76	const U8* x = s;
	77
	78	PERL_ARGS_ASSERT_IS_ASCII_STRING;
	79
	80	for (; x < send; ++x) {
	81	if (!UTF8_IS_INVARIANT(*x))
	82	break;
	83	}
	84
	85	return x == send;
	86	}
	87
	88	/*
	89	=for apidoc uvuni_to_utf8_flags
	90
	91	Adds the UTF-8 representation of the code point C<uv> to the end
	92	of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
	93	bytes available. The return value is the pointer to the byte after the
	94	end of the new character. In other words,
	95
	96	d = uvuni_to_utf8_flags(d, uv, flags);
	97
	98	or, in most cases,
	99
	100	d = uvuni_to_utf8(d, uv);
	101
	102	(which is equivalent to)
	103
	104	d = uvuni_to_utf8_flags(d, uv, 0);
	105
	106	This is the recommended Unicode-aware way of saying
	107
	108	*(d++) = uv;
	109
	110	This function will convert to UTF-8 (and not warn) even code points that aren't
	111	legal Unicode or are problematic, unless C<flags> contains one or more of the
	112	following flags.
	113	If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
	114	the function will raise a warning, provided UTF8 warnings are enabled. If instead
	115	UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
	116	If both flags are set, the function will both warn and return NULL.
	117
	118	The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
	119	affect how the function handles a Unicode non-character. And, likewise for the
	120	UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, and code points that are
	121	above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
	122	even less portable) can be warned and/or disallowed even if other above-Unicode
	123	code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
	124	flags.
	125
	126	And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
	127	above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
	128	DISALLOW flags.
	129
	130
	131	=cut
	132	*/
	133
	134	U8 *
	135	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	136	{
	137	PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
	138
	139	if (ckWARN_d(WARN_UTF8)) {
	140	if (UNICODE_IS_SURROGATE(uv)) {
	141	if (flags & UNICODE_WARN_SURROGATE) {
	142	Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
	143	"UTF-16 surrogate U+%04"UVXf, uv);
	144	}
	145	if (flags & UNICODE_DISALLOW_SURROGATE) {
	146	return NULL;
	147	}
	148	}
	149	else if (UNICODE_IS_SUPER(uv)) {
	150	if (flags & UNICODE_WARN_SUPER
	151	\|\| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
	152	{
	153	Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
	154	"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
	155	}
	156	if (flags & UNICODE_DISALLOW_SUPER
	157	\|\| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
	158	{
	159	return NULL;
	160	}
	161	}
	162	else if (UNICODE_IS_NONCHAR(uv)) {
	163	if (flags & UNICODE_WARN_NONCHAR) {
	164	Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
	165	"Unicode non-character U+%04"UVXf" is illegal for open interchange",
	166	uv);
	167	}
	168	if (flags & UNICODE_DISALLOW_NONCHAR) {
	169	return NULL;
	170	}
	171	}
	172	}
	173	if (UNI_IS_INVARIANT(uv)) {
	174	*d++ = (U8)UTF_TO_NATIVE(uv);
	175	return d;
	176	}
	177	#if defined(EBCDIC)
	178	else {
	179	STRLEN len = UNISKIP(uv);
	180	U8 *p = d+len-1;
	181	while (p > d) {
	182	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	183	uv >>= UTF_ACCUMULATION_SHIFT;
	184	}
	185	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	186	return d+len;
	187	}
	188	#else /* Non loop style */
	189	if (uv < 0x800) {
	190	*d++ = (U8)(( uv >> 6) \| 0xc0);
	191	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	192	return d;
	193	}
	194	if (uv < 0x10000) {
	195	*d++ = (U8)(( uv >> 12) \| 0xe0);
	196	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	197	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	198	return d;
	199	}
	200	if (uv < 0x200000) {
	201	*d++ = (U8)(( uv >> 18) \| 0xf0);
	202	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	203	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	204	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	205	return d;
	206	}
	207	if (uv < 0x4000000) {
	208	*d++ = (U8)(( uv >> 24) \| 0xf8);
	209	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	210	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	211	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	212	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	213	return d;
	214	}
	215	if (uv < 0x80000000) {
	216	*d++ = (U8)(( uv >> 30) \| 0xfc);
	217	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	218	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	219	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	220	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	221	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	222	return d;
	223	}
	224	#ifdef HAS_QUAD
	225	if (uv < UTF8_QUAD_MAX)
	226	#endif
	227	{
	228	d++ = 0xfe; / Can't match U+FEFF! */
	229	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	230	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	231	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	232	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	233	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	234	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	235	return d;
	236	}
	237	#ifdef HAS_QUAD
	238	{
	239	d++ = 0xff; / Can't match U+FFFE! */
	240	d++ = 0x80; / 6 Reserved bits */
	241	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	242	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
	243	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
	244	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
	245	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
	246	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	247	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	248	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	249	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	250	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	251	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	252	return d;
	253	}
	254	#endif
	255	#endif /* Loop style */
	256	}
	257
	258	/*
	259
	260	Tests if some arbitrary number of bytes begins in a valid UTF-8
	261	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	262	UTF-8 character. The actual number of bytes in the UTF-8 character
	263	will be returned if it is valid, otherwise 0.
	264
	265	This is the "slow" version as opposed to the "fast" version which is
	266	the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
	267	difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
	268	or less you should use the IS_UTF8_CHAR(), for lengths of five or more
	269	you should use the _slow(). In practice this means that the _slow()
	270	will be used very rarely, since the maximum Unicode code point (as of
	271	Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
	272	the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
	273	five bytes or more.
	274
	275	=cut */
	276	STATIC STRLEN
	277	S_is_utf8_char_slow(const U8 *s, const STRLEN len)
	278	{
	279	U8 u = *s;
	280	STRLEN slen;
	281	UV uv, ouv;
	282
	283	PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
	284
	285	if (UTF8_IS_INVARIANT(u))
	286	return 1;
	287
	288	if (!UTF8_IS_START(u))
	289	return 0;
	290
	291	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	292	return 0;
	293
	294	slen = len - 1;
	295	s++;
	296	#ifdef EBCDIC
	297	u = NATIVE_TO_UTF(u);
	298	#endif
	299	u &= UTF_START_MASK(len);
	300	uv = u;
	301	ouv = uv;
	302	while (slen--) {
	303	if (!UTF8_IS_CONTINUATION(*s))
	304	return 0;
	305	uv = UTF8_ACCUMULATE(uv, *s);
	306	if (uv < ouv)
	307	return 0;
	308	ouv = uv;
	309	s++;
	310	}
	311
	312	if ((STRLEN)UNISKIP(uv) < len)
	313	return 0;
	314
	315	return len;
	316	}
	317
	318	/*
	319	=for apidoc is_utf8_char
	320
	321	Tests if some arbitrary number of bytes begins in a valid UTF-8
	322	character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
	323	character is a valid UTF-8 character. The actual number of bytes in the UTF-8
	324	character will be returned if it is valid, otherwise 0.
	325
	326	=cut */
	327	STRLEN
	328	Perl_is_utf8_char(const U8 *s)
	329	{
	330	const STRLEN len = UTF8SKIP(s);
	331
	332	PERL_ARGS_ASSERT_IS_UTF8_CHAR;
	333	#ifdef IS_UTF8_CHAR
	334	if (IS_UTF8_CHAR_FAST(len))
	335	return IS_UTF8_CHAR(s, len) ? len : 0;
	336	#endif /* #ifdef IS_UTF8_CHAR */
	337	return is_utf8_char_slow(s, len);
	338	}
	339
	340
	341	/*
	342	=for apidoc is_utf8_string
	343
	344	Returns true if first C<len> bytes of the given string form a valid
	345	UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
	346	using C<strlen(s)>. Note that 'a valid UTF-8 string' does not mean 'a
	347	string that contains code points above 0x7F encoded in UTF-8' because a
	348	valid ASCII string is a valid UTF-8 string.
	349
	350	See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
	351
	352	=cut
	353	*/
	354
	355	bool
	356	Perl_is_utf8_string(const U8 *s, STRLEN len)
	357	{
	358	const U8* const send = s + (len ? len : strlen((const char *)s));
	359	const U8* x = s;
	360
	361	PERL_ARGS_ASSERT_IS_UTF8_STRING;
	362
	363	while (x < send) {
	364	STRLEN c;
	365	/* Inline the easy bits of is_utf8_char() here for speed... */
	366	if (UTF8_IS_INVARIANT(*x))
	367	c = 1;
	368	else if (!UTF8_IS_START(*x))
	369	goto out;
	370	else {
	371	/* ... and call is_utf8_char() only if really needed. */
	372	#ifdef IS_UTF8_CHAR
	373	c = UTF8SKIP(x);
	374	if (IS_UTF8_CHAR_FAST(c)) {
	375	if (!IS_UTF8_CHAR(x, c))
	376	c = 0;
	377	}
	378	else
	379	c = is_utf8_char_slow(x, c);
	380	#else
	381	c = is_utf8_char(x);
	382	#endif /* #ifdef IS_UTF8_CHAR */
	383	if (!c)
	384	goto out;
	385	}
	386	x += c;
	387	}
	388
	389	out:
	390	if (x != send)
	391	return FALSE;
	392
	393	return TRUE;
	394	}
	395
	396	/*
	397	Implemented as a macro in utf8.h
	398
	399	=for apidoc is_utf8_string_loc
	400
	401	Like is_utf8_string() but stores the location of the failure (in the
	402	case of "utf8ness failure") or the location s+len (in the case of
	403	"utf8ness success") in the C<ep>.
	404
	405	See also is_utf8_string_loclen() and is_utf8_string().
	406
	407	=for apidoc is_utf8_string_loclen
	408
	409	Like is_utf8_string() but stores the location of the failure (in the
	410	case of "utf8ness failure") or the location s+len (in the case of
	411	"utf8ness success") in the C<ep>, and the number of UTF-8
	412	encoded characters in the C<el>.
	413
	414	See also is_utf8_string_loc() and is_utf8_string().
	415
	416	=cut
	417	*/
	418
	419	bool
	420	Perl_is_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	421	{
	422	const U8* const send = s + (len ? len : strlen((const char *)s));
	423	const U8* x = s;
	424	STRLEN c;
	425	STRLEN outlen = 0;
	426
	427	PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
	428
	429	while (x < send) {
	430	/* Inline the easy bits of is_utf8_char() here for speed... */
	431	if (UTF8_IS_INVARIANT(*x))
	432	c = 1;
	433	else if (!UTF8_IS_START(*x))
	434	goto out;
	435	else {
	436	/* ... and call is_utf8_char() only if really needed. */
	437	#ifdef IS_UTF8_CHAR
	438	c = UTF8SKIP(x);
	439	if (IS_UTF8_CHAR_FAST(c)) {
	440	if (!IS_UTF8_CHAR(x, c))
	441	c = 0;
	442	} else
	443	c = is_utf8_char_slow(x, c);
	444	#else
	445	c = is_utf8_char(x);
	446	#endif /* #ifdef IS_UTF8_CHAR */
	447	if (!c)
	448	goto out;
	449	}
	450	x += c;
	451	outlen++;
	452	}
	453
	454	out:
	455	if (el)
	456	*el = outlen;
	457
	458	if (ep)
	459	*ep = x;
	460	return (x == send);
	461	}
	462
	463	/*
	464
	465	=for apidoc utf8n_to_uvuni
	466
	467	Bottom level UTF-8 decode routine.
	468	Returns the code point value of the first character in the string C<s>
	469	which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding and no longer than
	470	C<curlen> bytes; C<retlen> will be set to the length, in bytes, of that
	471	character.
	472
	473	The value of C<flags> determines the behavior when C<s> does not point to a
	474	well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
	475	C<retlen> is set to the expected length of the UTF-8 character in bytes, zero
	476	is returned, and if UTF-8 warnings haven't been lexically disabled, a warning
	477	is raised.
	478
	479	Various ALLOW flags can be set in C<flags> to allow (and not warn on)
	480	individual types of malformations, such as the sequence being overlong (that
	481	is, when there is a shorter sequence that can express the same code point;
	482	overlong sequences are expressly forbidden in the UTF-8 standard due to
	483	potential security issues). Another malformation example is the first byte of
	484	a character not being a legal first byte. See F<utf8.h> for the list of such
	485	flags. Of course, the value returned by this function under such conditions is
	486	not reliable.
	487
	488	The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
	489	flags) malformation is found. If this flag is set, the routine assumes that
	490	the caller will raise a warning, and this function will silently just set
	491	C<retlen> to C<-1> and return zero.
	492
	493	Certain code points are considered problematic. These are Unicode surrogates,
	494	Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
	495	By default these are considered regular code points, but certain situations
	496	warrant special handling for them. If C<flags> contains
	497	UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
	498	malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE,
	499	UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
	500	maximum) can be set to disallow these categories individually.
	501
	502	The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
	503	UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
	504	for their respective categories, but otherwise the code points are considered
	505	valid (not malformations). To get a category to both be treated as a
	506	malformation and raise a warning, specify both the WARN and DISALLOW flags.
	507	(But note that warnings are not raised if lexically disabled nor if
	508	UTF8_CHECK_ONLY is also specified.)
	509
	510	Very large code points (above 0x7FFF_FFFF) are considered more problematic than
	511	the others that are above the Unicode legal maximum. There are several
	512	reasons, one of which is that the original UTF-8 specification never went above
	513	this number (the current 0x10FFFF limit was imposed later). The UTF-8 encoding
	514	on ASCII platforms for these large code point begins with a byte containing
	515	0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to be treated as
	516	malformations, while allowing smaller above-Unicode code points. (Of course
	517	UTF8_DISALLOW_SUPER will treat all above-Unicode code points, including these,
	518	as malformations.) Similarly, UTF8_WARN_FE_FF acts just like the other WARN
	519	flags, but applies just to these code points.
	520
	521	All other code points corresponding to Unicode characters, including private
	522	use and those yet to be assigned, are never considered malformed and never
	523	warn.
	524
	525	Most code should use utf8_to_uvchr() rather than call this directly.
	526
	527	=cut
	528	*/
	529
	530	UV
	531	Perl_utf8n_to_uvuni(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	532	{
	533	dVAR;
	534	const U8 * const s0 = s;
	535	UV uv = *s, ouv = 0;
	536	STRLEN len = 1;
	537	bool dowarn = ckWARN_d(WARN_UTF8);
	538	const UV startbyte = *s;
	539	STRLEN expectlen = 0;
	540	U32 warning = 0;
	541	SV* sv = NULL;
	542
	543	PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
	544
	545	/* This list is a superset of the UTF8_ALLOW_XXX. */
	546
	547	#define UTF8_WARN_EMPTY 1
	548	#define UTF8_WARN_CONTINUATION 2
	549	#define UTF8_WARN_NON_CONTINUATION 3
	550	#define UTF8_WARN_SHORT 4
	551	#define UTF8_WARN_OVERFLOW 5
	552	#define UTF8_WARN_LONG 6
	553
	554	if (curlen == 0 &&
	555	!(flags & UTF8_ALLOW_EMPTY)) {
	556	warning = UTF8_WARN_EMPTY;
	557	goto malformed;
	558	}
	559
	560	if (UTF8_IS_INVARIANT(uv)) {
	561	if (retlen)
	562	*retlen = 1;
	563	return (UV) (NATIVE_TO_UTF(*s));
	564	}
	565
	566	if (UTF8_IS_CONTINUATION(uv) &&
	567	!(flags & UTF8_ALLOW_CONTINUATION)) {
	568	warning = UTF8_WARN_CONTINUATION;
	569	goto malformed;
	570	}
	571
	572	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	573	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	574	warning = UTF8_WARN_NON_CONTINUATION;
	575	goto malformed;
	576	}
	577
	578	#ifdef EBCDIC
	579	uv = NATIVE_TO_UTF(uv);
	580	#else
	581	if (uv == 0xfe \|\| uv == 0xff) {
	582	if (flags & (UTF8_WARN_SUPER\|UTF8_WARN_FE_FF)) {
	583	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
	584	flags &= ~UTF8_WARN_SUPER; /* Only warn once on this problem */
	585	}
	586	if (flags & (UTF8_DISALLOW_SUPER\|UTF8_DISALLOW_FE_FF)) {
	587	goto malformed;
	588	}
	589	}
	590	#endif
	591
	592	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	593	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	594	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	595	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	596	#ifdef EBCDIC
	597	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	598	else { len = 7; uv &= 0x01; }
	599	#else
	600	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	601	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	602	else { len = 13; uv = 0; } /* whoa! */
	603	#endif
	604
	605	if (retlen)
	606	*retlen = len;
	607
	608	expectlen = len;
	609
	610	if ((curlen < expectlen) &&
	611	!(flags & UTF8_ALLOW_SHORT)) {
	612	warning = UTF8_WARN_SHORT;
	613	goto malformed;
	614	}
	615
	616	len--;
	617	s++;
	618	ouv = uv; /* ouv is the value from the previous iteration */
	619
	620	while (len--) {
	621	if (!UTF8_IS_CONTINUATION(*s) &&
	622	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	623	s--;
	624	warning = UTF8_WARN_NON_CONTINUATION;
	625	goto malformed;
	626	}
	627	else
	628	uv = UTF8_ACCUMULATE(uv, *s);
	629	if (!(uv > ouv)) { /* If the value didn't grow from the previous
	630	iteration, something is horribly wrong */
	631	/* These cannot be allowed. */
	632	if (uv == ouv) {
	633	if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
	634	warning = UTF8_WARN_LONG;
	635	goto malformed;
	636	}
	637	}
	638	else { /* uv < ouv */
	639	/* This cannot be allowed. */
	640	warning = UTF8_WARN_OVERFLOW;
	641	goto malformed;
	642	}
	643	}
	644	s++;
	645	ouv = uv;
	646	}
	647
	648	if ((expectlen > (STRLEN)UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) {
	649	warning = UTF8_WARN_LONG;
	650	goto malformed;
	651	} else if (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE\|UTF8_WARN_ILLEGAL_INTERCHANGE)) {
	652	if (UNICODE_IS_SURROGATE(uv)) {
	653	if ((flags & (UTF8_WARN_SURROGATE\|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE) {
	654	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
	655	}
	656	if (flags & UTF8_DISALLOW_SURROGATE) {
	657	goto disallowed;
	658	}
	659	}
	660	else if (UNICODE_IS_NONCHAR(uv)) {
	661	if ((flags & (UTF8_WARN_NONCHAR\|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR ) {
	662	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
	663	}
	664	if (flags & UTF8_DISALLOW_NONCHAR) {
	665	goto disallowed;
	666	}
	667	}
	668	else if ((uv > PERL_UNICODE_MAX)) {
	669	if ((flags & (UTF8_WARN_SUPER\|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER) {
	670	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
	671	}
	672	if (flags & UTF8_DISALLOW_SUPER) {
	673	goto disallowed;
	674	}
	675	}
	676
	677	/* Here, this is not considered a malformed character, so drop through
	678	* to return it */
	679	}
	680
	681	return uv;
	682
	683	disallowed: /* Is disallowed, but otherwise not malformed. 'sv' will have been
	684	set if there is to be a warning. */
	685	if (!sv) {
	686	dowarn = 0;
	687	}
	688
	689	malformed:
	690
	691	if (flags & UTF8_CHECK_ONLY) {
	692	if (retlen)
	693	*retlen = ((STRLEN) -1);
	694	return 0;
	695	}
	696
	697	if (dowarn) {
	698	if (! sv) {
	699	sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
	700	}
	701
	702	switch (warning) {
	703	case 0: /* Intentionally empty. */ break;
	704	case UTF8_WARN_EMPTY:
	705	sv_catpvs(sv, "(empty string)");
	706	break;
	707	case UTF8_WARN_CONTINUATION:
	708	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	709	break;
	710	case UTF8_WARN_NON_CONTINUATION:
	711	if (s == s0)
	712	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	713	(UV)s[1], startbyte);
	714	else {
	715	const int len = (int)(s-s0);
	716	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	717	(UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
	718	}
	719
	720	break;
	721	case UTF8_WARN_SHORT:
	722	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	723	(int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
	724	expectlen = curlen; /* distance for caller to skip */
	725	break;
	726	case UTF8_WARN_OVERFLOW:
	727	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	728	ouv, *s, startbyte);
	729	break;
	730	case UTF8_WARN_LONG:
	731	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	732	(int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	733	break;
	734	default:
	735	sv_catpvs(sv, "(unknown reason)");
	736	break;
	737	}
	738
	739	if (sv) {
	740	const char * const s = SvPVX_const(sv);
	741
	742	if (PL_op)
	743	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	744	"%s in %s", s, OP_DESC(PL_op));
	745	else
	746	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
	747	}
	748	}
	749
	750	if (retlen)
	751	*retlen = expectlen ? expectlen : len;
	752
	753	return 0;
	754	}
	755
	756	/*
	757	=for apidoc utf8_to_uvchr
	758
	759	Returns the native code point of the first character in the string C<s>
	760	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	761	length, in bytes, of that character.
	762
	763	If C<s> does not point to a well-formed UTF-8 character, zero is
	764	returned and retlen is set, if possible, to -1.
	765
	766	=cut
	767	*/
	768
	769
	770	UV
	771	Perl_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
	772	{
	773	PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
	774
	775	return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
	776	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	777	}
	778
	779	/*
	780	=for apidoc utf8_to_uvuni
	781
	782	Returns the Unicode code point of the first character in the string C<s>
	783	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	784	length, in bytes, of that character.
	785
	786	This function should only be used when the returned UV is considered
	787	an index into the Unicode semantic tables (e.g. swashes).
	788
	789	If C<s> does not point to a well-formed UTF-8 character, zero is
	790	returned and retlen is set, if possible, to -1.
	791
	792	=cut
	793	*/
	794
	795	UV
	796	Perl_utf8_to_uvuni(pTHX_ const U8 s, STRLEN retlen)
	797	{
	798	PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
	799
	800	/* Call the low level routine asking for checks */
	801	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
	802	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	803	}
	804
	805	/*
	806	=for apidoc utf8_length
	807
	808	Return the length of the UTF-8 char encoded string C<s> in characters.
	809	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	810	up past C<e>, croaks.
	811
	812	=cut
	813	*/
	814
	815	STRLEN
	816	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	817	{
	818	dVAR;
	819	STRLEN len = 0;
	820
	821	PERL_ARGS_ASSERT_UTF8_LENGTH;
	822
	823	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	824	* the bitops (especially ~) can create illegal UTF-8.
	825	* In other words: in Perl UTF-8 is not just for Unicode. */
	826
	827	if (e < s)
	828	goto warn_and_return;
	829	while (s < e) {
	830	if (!UTF8_IS_INVARIANT(*s))
	831	s += UTF8SKIP(s);
	832	else
	833	s++;
	834	len++;
	835	}
	836
	837	if (e != s) {
	838	len--;
	839	warn_and_return:
	840	if (PL_op)
	841	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	842	"%s in %s", unees, OP_DESC(PL_op));
	843	else
	844	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	845	}
	846
	847	return len;
	848	}
	849
	850	/*
	851	=for apidoc utf8_distance
	852
	853	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	854	and C<b>.
	855
	856	WARNING: use only if you know that the pointers point inside the
	857	same UTF-8 buffer.
	858
	859	=cut
	860	*/
	861
	862	IV
	863	Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
	864	{
	865	PERL_ARGS_ASSERT_UTF8_DISTANCE;
	866
	867	return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
	868	}
	869
	870	/*
	871	=for apidoc utf8_hop
	872
	873	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	874	forward or backward.
	875
	876	WARNING: do not use the following unless you know C<off> is within
	877	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	878	on the first byte of character or just after the last byte of a character.
	879
	880	=cut
	881	*/
	882
	883	U8 *
	884	Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
	885	{
	886	PERL_ARGS_ASSERT_UTF8_HOP;
	887
	888	PERL_UNUSED_CONTEXT;
	889	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	890	* the bitops (especially ~) can create illegal UTF-8.
	891	* In other words: in Perl UTF-8 is not just for Unicode. */
	892
	893	if (off >= 0) {
	894	while (off--)
	895	s += UTF8SKIP(s);
	896	}
	897	else {
	898	while (off++) {
	899	s--;
	900	while (UTF8_IS_CONTINUATION(*s))
	901	s--;
	902	}
	903	}
	904	return (U8 *)s;
	905	}
	906
	907	/*
	908	=for apidoc bytes_cmp_utf8
	909
	910	Compares the sequence of characters (stored as octets) in b, blen with the
	911	sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
	912	equal, -1 or -2 if the first string is less than the second string, +1 or +2
	913	if the first string is greater than the second string.
	914
	915	-1 or +1 is returned if the shorter string was identical to the start of the
	916	longer string. -2 or +2 is returned if there was a difference between characters
	917	within the strings.
	918
	919	=cut
	920	*/
	921
	922	int
	923	Perl_bytes_cmp_utf8(pTHX_ const U8 b, STRLEN blen, const U8 u, STRLEN ulen)
	924	{
	925	const U8 *const bend = b + blen;
	926	const U8 *const uend = u + ulen;
	927
	928	PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
	929
	930	PERL_UNUSED_CONTEXT;
	931
	932	while (b < bend && u < uend) {
	933	U8 c = *u++;
	934	if (!UTF8_IS_INVARIANT(c)) {
	935	if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	936	if (u < uend) {
	937	U8 c1 = *u++;
	938	if (UTF8_IS_CONTINUATION(c1)) {
	939	c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, c1));
	940	} else {
	941	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	942	"Malformed UTF-8 character "
	943	"(unexpected non-continuation byte 0x%02x"
	944	", immediately after start byte 0x%02x)"
	945	/* Dear diag.t, it's in the pod. */
	946	"%s%s", c1, c,
	947	PL_op ? " in " : "",
	948	PL_op ? OP_DESC(PL_op) : "");
	949	return -2;
	950	}
	951	} else {
	952	if (PL_op)
	953	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	954	"%s in %s", unees, OP_DESC(PL_op));
	955	else
	956	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	957	return -2; /* Really want to return undef :-) */
	958	}
	959	} else {
	960	return -2;
	961	}
	962	}
	963	if (*b != c) {
	964	return *b < c ? -2 : +2;
	965	}
	966	++b;
	967	}
	968
	969	if (b == bend && u == uend)
	970	return 0;
	971
	972	return b < bend ? +1 : -1;
	973	}
	974
	975	/*
	976	=for apidoc utf8_to_bytes
	977
	978	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	979	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	980	updates len to contain the new length.
	981	Returns zero on failure, setting C<len> to -1.
	982
	983	If you need a copy of the string, see C<bytes_from_utf8>.
	984
	985	=cut
	986	*/
	987
	988	U8 *
	989	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	990	{
	991	U8 * const save = s;
	992	U8 * const send = s + *len;
	993	U8 *d;
	994
	995	PERL_ARGS_ASSERT_UTF8_TO_BYTES;
	996
	997	/* ensure valid UTF-8 and chars < 256 before updating string */
	998	while (s < send) {
	999	U8 c = *s++;
	1000
	1001	if (!UTF8_IS_INVARIANT(c) &&
	1002	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	1003	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	1004	*len = ((STRLEN) -1);
	1005	return 0;
	1006	}
	1007	}
	1008
	1009	d = s = save;
	1010	while (s < send) {
	1011	STRLEN ulen;
	1012	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	1013	s += ulen;
	1014	}
	1015	*d = '\0';
	1016	*len = d - save;
	1017	return save;
	1018	}
	1019
	1020	/*
	1021	=for apidoc bytes_from_utf8
	1022
	1023	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	1024	Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	1025	the newly-created string, and updates C<len> to contain the new
	1026	length. Returns the original string if no conversion occurs, C<len>
	1027	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	1028	0 if C<s> is converted or consisted entirely of characters that are invariant
	1029	in utf8 (i.e., US-ASCII on non-EBCDIC machines).
	1030
	1031	=cut
	1032	*/
	1033
	1034	U8 *
	1035	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN len, bool *is_utf8)
	1036	{
	1037	U8 *d;
	1038	const U8 *start = s;
	1039	const U8 *send;
	1040	I32 count = 0;
	1041
	1042	PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
	1043
	1044	PERL_UNUSED_CONTEXT;
	1045	if (!*is_utf8)
	1046	return (U8 *)start;
	1047
	1048	/* ensure valid UTF-8 and chars < 256 before converting string */
	1049	for (send = s + *len; s < send;) {
	1050	U8 c = *s++;
	1051	if (!UTF8_IS_INVARIANT(c)) {
	1052	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	1053	(c = *s++) && UTF8_IS_CONTINUATION(c))
	1054	count++;
	1055	else
	1056	return (U8 *)start;
	1057	}
	1058	}
	1059
	1060	*is_utf8 = FALSE;
	1061
	1062	Newx(d, (*len) - count + 1, U8);
	1063	s = start; start = d;
	1064	while (s < send) {
	1065	U8 c = *s++;
	1066	if (!UTF8_IS_INVARIANT(c)) {
	1067	/* Then it is two-byte encoded */
	1068	c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++));
	1069	}
	1070	*d++ = c;
	1071	}
	1072	*d = '\0';
	1073	*len = d - start;
	1074	return (U8 *)start;
	1075	}
	1076
	1077	/*
	1078	=for apidoc bytes_to_utf8
	1079
	1080	Converts a string C<s> of length C<len> bytes from the native encoding into
	1081	UTF-8.
	1082	Returns a pointer to the newly-created string, and sets C<len> to
	1083	reflect the new length in bytes.
	1084
	1085	A NUL character will be written after the end of the string.
	1086
	1087	If you want to convert to UTF-8 from encodings other than
	1088	the native (Latin1 or EBCDIC),
	1089	see sv_recode_to_utf8().
	1090
	1091	=cut
	1092	*/
	1093
	1094	/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
	1095	likewise need duplication. */
	1096
	1097	U8*
	1098	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN len)
	1099	{
	1100	const U8 * const send = s + (*len);
	1101	U8 *d;
	1102	U8 *dst;
	1103
	1104	PERL_ARGS_ASSERT_BYTES_TO_UTF8;
	1105	PERL_UNUSED_CONTEXT;
	1106
	1107	Newx(d, (len) 2 + 1, U8);
	1108	dst = d;
	1109
	1110	while (s < send) {
	1111	const UV uv = NATIVE_TO_ASCII(*s++);
	1112	if (UNI_IS_INVARIANT(uv))
	1113	*d++ = (U8)UTF_TO_NATIVE(uv);
	1114	else {
	1115	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
	1116	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
	1117	}
	1118	}
	1119	*d = '\0';
	1120	*len = d-dst;
	1121	return dst;
	1122	}
	1123
	1124	/*
	1125	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	1126	*
	1127	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	1128	* We optimize for native, for obvious reasons. */
	1129
	1130	U8*
	1131	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1132	{
	1133	U8* pend;
	1134	U8* dstart = d;
	1135
	1136	PERL_ARGS_ASSERT_UTF16_TO_UTF8;
	1137
	1138	if (bytelen & 1)
	1139	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
	1140
	1141	pend = p + bytelen;
	1142
	1143	while (p < pend) {
	1144	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	1145	p += 2;
	1146	if (uv < 0x80) {
	1147	#ifdef EBCDIC
	1148	*d++ = UNI_TO_NATIVE(uv);
	1149	#else
	1150	*d++ = (U8)uv;
	1151	#endif
	1152	continue;
	1153	}
	1154	if (uv < 0x800) {
	1155	*d++ = (U8)(( uv >> 6) \| 0xc0);
	1156	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1157	continue;
	1158	}
	1159	if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */
	1160	if (p >= pend) {
	1161	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1162	} else {
	1163	UV low = (p[0] << 8) + p[1];
	1164	p += 2;
	1165	if (low < 0xdc00 \|\| low > 0xdfff)
	1166	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1167	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	1168	}
	1169	} else if (uv >= 0xdc00 && uv <= 0xdfff) {
	1170	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1171	}
	1172	if (uv < 0x10000) {
	1173	*d++ = (U8)(( uv >> 12) \| 0xe0);
	1174	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1175	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1176	continue;
	1177	}
	1178	else {
	1179	*d++ = (U8)(( uv >> 18) \| 0xf0);
	1180	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	1181	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1182	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1183	continue;
	1184	}
	1185	}
	1186	*newlen = d - dstart;
	1187	return d;
	1188	}
	1189
	1190	/* Note: this one is slightly destructive of the source. */
	1191
	1192	U8*
	1193	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1194	{
	1195	U8* s = (U8*)p;
	1196	U8* const send = s + bytelen;
	1197
	1198	PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
	1199
	1200	if (bytelen & 1)
	1201	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
	1202	(UV)bytelen);
	1203
	1204	while (s < send) {
	1205	const U8 tmp = s[0];
	1206	s[0] = s[1];
	1207	s[1] = tmp;
	1208	s += 2;
	1209	}
	1210	return utf16_to_utf8(p, d, bytelen, newlen);
	1211	}
	1212
	1213	/* for now these are all defined (inefficiently) in terms of the utf8 versions.
	1214	* Note that the macros in handy.h that call these short-circuit calling them
	1215	* for Latin-1 range inputs */
	1216
	1217	bool
	1218	Perl_is_uni_alnum(pTHX_ UV c)
	1219	{
	1220	U8 tmpbuf[UTF8_MAXBYTES+1];
	1221	uvchr_to_utf8(tmpbuf, c);
	1222	return is_utf8_alnum(tmpbuf);
	1223	}
	1224
	1225	bool
	1226	Perl_is_uni_idfirst(pTHX_ UV c)
	1227	{
	1228	U8 tmpbuf[UTF8_MAXBYTES+1];
	1229	uvchr_to_utf8(tmpbuf, c);
	1230	return is_utf8_idfirst(tmpbuf);
	1231	}
	1232
	1233	bool
	1234	Perl_is_uni_alpha(pTHX_ UV c)
	1235	{
	1236	U8 tmpbuf[UTF8_MAXBYTES+1];
	1237	uvchr_to_utf8(tmpbuf, c);
	1238	return is_utf8_alpha(tmpbuf);
	1239	}
	1240
	1241	bool
	1242	Perl_is_uni_ascii(pTHX_ UV c)
	1243	{
	1244	return isASCII(c);
	1245	}
	1246
	1247	bool
	1248	Perl_is_uni_space(pTHX_ UV c)
	1249	{
	1250	U8 tmpbuf[UTF8_MAXBYTES+1];
	1251	uvchr_to_utf8(tmpbuf, c);
	1252	return is_utf8_space(tmpbuf);
	1253	}
	1254
	1255	bool
	1256	Perl_is_uni_digit(pTHX_ UV c)
	1257	{
	1258	U8 tmpbuf[UTF8_MAXBYTES+1];
	1259	uvchr_to_utf8(tmpbuf, c);
	1260	return is_utf8_digit(tmpbuf);
	1261	}
	1262
	1263	bool
	1264	Perl_is_uni_upper(pTHX_ UV c)
	1265	{
	1266	U8 tmpbuf[UTF8_MAXBYTES+1];
	1267	uvchr_to_utf8(tmpbuf, c);
	1268	return is_utf8_upper(tmpbuf);
	1269	}
	1270
	1271	bool
	1272	Perl_is_uni_lower(pTHX_ UV c)
	1273	{
	1274	U8 tmpbuf[UTF8_MAXBYTES+1];
	1275	uvchr_to_utf8(tmpbuf, c);
	1276	return is_utf8_lower(tmpbuf);
	1277	}
	1278
	1279	bool
	1280	Perl_is_uni_cntrl(pTHX_ UV c)
	1281	{
	1282	return isCNTRL_L1(c);
	1283	}
	1284
	1285	bool
	1286	Perl_is_uni_graph(pTHX_ UV c)
	1287	{
	1288	U8 tmpbuf[UTF8_MAXBYTES+1];
	1289	uvchr_to_utf8(tmpbuf, c);
	1290	return is_utf8_graph(tmpbuf);
	1291	}
	1292
	1293	bool
	1294	Perl_is_uni_print(pTHX_ UV c)
	1295	{
	1296	U8 tmpbuf[UTF8_MAXBYTES+1];
	1297	uvchr_to_utf8(tmpbuf, c);
	1298	return is_utf8_print(tmpbuf);
	1299	}
	1300
	1301	bool
	1302	Perl_is_uni_punct(pTHX_ UV c)
	1303	{
	1304	U8 tmpbuf[UTF8_MAXBYTES+1];
	1305	uvchr_to_utf8(tmpbuf, c);
	1306	return is_utf8_punct(tmpbuf);
	1307	}
	1308
	1309	bool
	1310	Perl_is_uni_xdigit(pTHX_ UV c)
	1311	{
	1312	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1313	uvchr_to_utf8(tmpbuf, c);
	1314	return is_utf8_xdigit(tmpbuf);
	1315	}
	1316
	1317
	1318	UV
	1319	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	1320	{
	1321	/* Convert the Unicode character whose ordinal is c to its uppercase
	1322	* version and store that in UTF-8 in p and its length in bytes in lenp.
	1323	* Note that the p needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1324	* the changed version may be longer than the original character.
	1325	*
	1326	* The ordinal of the first character of the changed version is returned
	1327	* (but note, as explained above, that there may be more.) */
	1328
	1329	PERL_ARGS_ASSERT_TO_UNI_UPPER;
	1330
	1331	uvchr_to_utf8(p, c);
	1332	return to_utf8_upper(p, p, lenp);
	1333	}
	1334
	1335	UV
	1336	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	1337	{
	1338	PERL_ARGS_ASSERT_TO_UNI_TITLE;
	1339
	1340	uvchr_to_utf8(p, c);
	1341	return to_utf8_title(p, p, lenp);
	1342	}
	1343
	1344	UV
	1345	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1346	{
	1347	PERL_ARGS_ASSERT_TO_UNI_LOWER;
	1348
	1349	if (c > 255) {
	1350	uvchr_to_utf8(p, c);
	1351	return to_utf8_lower(p, p, lenp);
	1352	}
	1353
	1354	/* We have the latin1-range values compiled into the core, so just use
	1355	* those, converting the result to utf8 */
	1356	c = toLOWER_LATIN1(c);
	1357	if (UNI_IS_INVARIANT(c)) {
	1358	*p = c;
	1359	*lenp = 1;
	1360	}
	1361	else {
	1362	*p = UTF8_TWO_BYTE_HI(c);
	1363	*(p+1) = UTF8_TWO_BYTE_LO(c);
	1364	*lenp = 2;
	1365	}
	1366	return c;
	1367	}
	1368
	1369	UV
	1370	Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
	1371	{
	1372	PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
	1373
	1374	uvchr_to_utf8(p, c);
	1375	return _to_utf8_fold_flags(p, p, lenp, flags);
	1376	}
	1377
	1378	/* for now these all assume no locale info available for Unicode > 255 */
	1379
	1380	bool
	1381	Perl_is_uni_alnum_lc(pTHX_ UV c)
	1382	{
	1383	return is_uni_alnum(c); /* XXX no locale support yet */
	1384	}
	1385
	1386	bool
	1387	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1388	{
	1389	return is_uni_idfirst(c); /* XXX no locale support yet */
	1390	}
	1391
	1392	bool
	1393	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1394	{
	1395	return is_uni_alpha(c); /* XXX no locale support yet */
	1396	}
	1397
	1398	bool
	1399	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1400	{
	1401	return is_uni_ascii(c); /* XXX no locale support yet */
	1402	}
	1403
	1404	bool
	1405	Perl_is_uni_space_lc(pTHX_ UV c)
	1406	{
	1407	return is_uni_space(c); /* XXX no locale support yet */
	1408	}
	1409
	1410	bool
	1411	Perl_is_uni_digit_lc(pTHX_ UV c)
	1412	{
	1413	return is_uni_digit(c); /* XXX no locale support yet */
	1414	}
	1415
	1416	bool
	1417	Perl_is_uni_upper_lc(pTHX_ UV c)
	1418	{
	1419	return is_uni_upper(c); /* XXX no locale support yet */
	1420	}
	1421
	1422	bool
	1423	Perl_is_uni_lower_lc(pTHX_ UV c)
	1424	{
	1425	return is_uni_lower(c); /* XXX no locale support yet */
	1426	}
	1427
	1428	bool
	1429	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1430	{
	1431	return is_uni_cntrl(c); /* XXX no locale support yet */
	1432	}
	1433
	1434	bool
	1435	Perl_is_uni_graph_lc(pTHX_ UV c)
	1436	{
	1437	return is_uni_graph(c); /* XXX no locale support yet */
	1438	}
	1439
	1440	bool
	1441	Perl_is_uni_print_lc(pTHX_ UV c)
	1442	{
	1443	return is_uni_print(c); /* XXX no locale support yet */
	1444	}
	1445
	1446	bool
	1447	Perl_is_uni_punct_lc(pTHX_ UV c)
	1448	{
	1449	return is_uni_punct(c); /* XXX no locale support yet */
	1450	}
	1451
	1452	bool
	1453	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1454	{
	1455	return is_uni_xdigit(c); /* XXX no locale support yet */
	1456	}
	1457
	1458	U32
	1459	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1460	{
	1461	/* XXX returns only the first character -- do not use XXX */
	1462	/* XXX no locale support yet */
	1463	STRLEN len;
	1464	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1465	return (U32)to_uni_upper(c, tmpbuf, &len);
	1466	}
	1467
	1468	U32
	1469	Perl_to_uni_title_lc(pTHX_ U32 c)
	1470	{
	1471	/* XXX returns only the first character XXX -- do not use XXX */
	1472	/* XXX no locale support yet */
	1473	STRLEN len;
	1474	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1475	return (U32)to_uni_title(c, tmpbuf, &len);
	1476	}
	1477
	1478	U32
	1479	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1480	{
	1481	/* XXX returns only the first character -- do not use XXX */
	1482	/* XXX no locale support yet */
	1483	STRLEN len;
	1484	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1485	return (U32)to_uni_lower(c, tmpbuf, &len);
	1486	}
	1487
	1488	static bool
	1489	S_is_utf8_common(pTHX_ const U8 const p, SV *swash,
	1490	const char *const swashname)
	1491	{
	1492	dVAR;
	1493
	1494	PERL_ARGS_ASSERT_IS_UTF8_COMMON;
	1495
	1496	if (!is_utf8_char(p))
	1497	return FALSE;
	1498	if (!*swash)
	1499	*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
	1500	return swash_fetch(*swash, p, TRUE) != 0;
	1501	}
	1502
	1503	bool
	1504	Perl_is_utf8_alnum(pTHX_ const U8 *p)
	1505	{
	1506	dVAR;
	1507
	1508	PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
	1509
	1510	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1511	* descendant of isalnum(3), in other words, it doesn't
	1512	* contain the '_'. --jhi */
	1513	return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
	1514	}
	1515
	1516	bool
	1517	Perl_is_utf8_idfirst(pTHX_ const U8 p) / The naming is historical. */
	1518	{
	1519	dVAR;
	1520
	1521	PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
	1522
	1523	if (*p == '_')
	1524	return TRUE;
	1525	/* is_utf8_idstart would be more logical. */
	1526	return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
	1527	}
	1528
	1529	bool
	1530	Perl_is_utf8_xidfirst(pTHX_ const U8 p) / The naming is historical. */
	1531	{
	1532	dVAR;
	1533
	1534	PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
	1535
	1536	if (*p == '_')
	1537	return TRUE;
	1538	/* is_utf8_idstart would be more logical. */
	1539	return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
	1540	}
	1541
	1542	bool
	1543	Perl__is_utf8__perl_idstart(pTHX_ const U8 *p)
	1544	{
	1545	dVAR;
	1546
	1547	PERL_ARGS_ASSERT__IS_UTF8__PERL_IDSTART;
	1548
	1549	return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart");
	1550	}
	1551
	1552	bool
	1553	Perl_is_utf8_idcont(pTHX_ const U8 *p)
	1554	{
	1555	dVAR;
	1556
	1557	PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
	1558
	1559	return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
	1560	}
	1561
	1562	bool
	1563	Perl_is_utf8_xidcont(pTHX_ const U8 *p)
	1564	{
	1565	dVAR;
	1566
	1567	PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
	1568
	1569	return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
	1570	}
	1571
	1572	bool
	1573	Perl_is_utf8_alpha(pTHX_ const U8 *p)
	1574	{
	1575	dVAR;
	1576
	1577	PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
	1578
	1579	return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
	1580	}
	1581
	1582	bool
	1583	Perl_is_utf8_ascii(pTHX_ const U8 *p)
	1584	{
	1585	dVAR;
	1586
	1587	PERL_ARGS_ASSERT_IS_UTF8_ASCII;
	1588
	1589	/* ASCII characters are the same whether in utf8 or not. So the macro
	1590	* works on both utf8 and non-utf8 representations. */
	1591	return isASCII(*p);
	1592	}
	1593
	1594	bool
	1595	Perl_is_utf8_space(pTHX_ const U8 *p)
	1596	{
	1597	dVAR;
	1598
	1599	PERL_ARGS_ASSERT_IS_UTF8_SPACE;
	1600
	1601	return is_utf8_common(p, &PL_utf8_space, "IsXPerlSpace");
	1602	}
	1603
	1604	bool
	1605	Perl_is_utf8_perl_space(pTHX_ const U8 *p)
	1606	{
	1607	dVAR;
	1608
	1609	PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
	1610
	1611	/* Only true if is an ASCII space-like character, and ASCII is invariant
	1612	* under utf8, so can just use the macro */
	1613	return isSPACE_A(*p);
	1614	}
	1615
	1616	bool
	1617	Perl_is_utf8_perl_word(pTHX_ const U8 *p)
	1618	{
	1619	dVAR;
	1620
	1621	PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
	1622
	1623	/* Only true if is an ASCII word character, and ASCII is invariant
	1624	* under utf8, so can just use the macro */
	1625	return isWORDCHAR_A(*p);
	1626	}
	1627
	1628	bool
	1629	Perl_is_utf8_digit(pTHX_ const U8 *p)
	1630	{
	1631	dVAR;
	1632
	1633	PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
	1634
	1635	return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
	1636	}
	1637
	1638	bool
	1639	Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
	1640	{
	1641	dVAR;
	1642
	1643	PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
	1644
	1645	/* Only true if is an ASCII digit character, and ASCII is invariant
	1646	* under utf8, so can just use the macro */
	1647	return isDIGIT_A(*p);
	1648	}
	1649
	1650	bool
	1651	Perl_is_utf8_upper(pTHX_ const U8 *p)
	1652	{
	1653	dVAR;
	1654
	1655	PERL_ARGS_ASSERT_IS_UTF8_UPPER;
	1656
	1657	return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
	1658	}
	1659
	1660	bool
	1661	Perl_is_utf8_lower(pTHX_ const U8 *p)
	1662	{
	1663	dVAR;
	1664
	1665	PERL_ARGS_ASSERT_IS_UTF8_LOWER;
	1666
	1667	return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
	1668	}
	1669
	1670	bool
	1671	Perl_is_utf8_cntrl(pTHX_ const U8 *p)
	1672	{
	1673	dVAR;
	1674
	1675	PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
	1676
	1677	if (isASCII(*p)) {
	1678	return isCNTRL_A(*p);
	1679	}
	1680
	1681	/* All controls are in Latin1 */
	1682	if (! UTF8_IS_DOWNGRADEABLE_START(*p)) {
	1683	return 0;
	1684	}
	1685	return isCNTRL_L1(TWO_BYTE_UTF8_TO_UNI(p, (p+1)));
	1686	}
	1687
	1688	bool
	1689	Perl_is_utf8_graph(pTHX_ const U8 *p)
	1690	{
	1691	dVAR;
	1692
	1693	PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
	1694
	1695	return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
	1696	}
	1697
	1698	bool
	1699	Perl_is_utf8_print(pTHX_ const U8 *p)
	1700	{
	1701	dVAR;
	1702
	1703	PERL_ARGS_ASSERT_IS_UTF8_PRINT;
	1704
	1705	return is_utf8_common(p, &PL_utf8_print, "IsPrint");
	1706	}
	1707
	1708	bool
	1709	Perl_is_utf8_punct(pTHX_ const U8 *p)
	1710	{
	1711	dVAR;
	1712
	1713	PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
	1714
	1715	return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
	1716	}
	1717
	1718	bool
	1719	Perl_is_utf8_xdigit(pTHX_ const U8 *p)
	1720	{
	1721	dVAR;
	1722
	1723	PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
	1724
	1725	return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
	1726	}
	1727
	1728	bool
	1729	Perl_is_utf8_mark(pTHX_ const U8 *p)
	1730	{
	1731	dVAR;
	1732
	1733	PERL_ARGS_ASSERT_IS_UTF8_MARK;
	1734
	1735	return is_utf8_common(p, &PL_utf8_mark, "IsM");
	1736	}
	1737
	1738	bool
	1739	Perl_is_utf8_X_begin(pTHX_ const U8 *p)
	1740	{
	1741	dVAR;
	1742
	1743	PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
	1744
	1745	return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
	1746	}
	1747
	1748	bool
	1749	Perl_is_utf8_X_extend(pTHX_ const U8 *p)
	1750	{
	1751	dVAR;
	1752
	1753	PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
	1754
	1755	return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
	1756	}
	1757
	1758	bool
	1759	Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
	1760	{
	1761	dVAR;
	1762
	1763	PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
	1764
	1765	return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
	1766	}
	1767
	1768	bool
	1769	Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
	1770	{
	1771	dVAR;
	1772
	1773	PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
	1774
	1775	return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
	1776	}
	1777
	1778	bool
	1779	Perl_is_utf8_X_L(pTHX_ const U8 *p)
	1780	{
	1781	dVAR;
	1782
	1783	PERL_ARGS_ASSERT_IS_UTF8_X_L;
	1784
	1785	return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
	1786	}
	1787
	1788	bool
	1789	Perl_is_utf8_X_LV(pTHX_ const U8 *p)
	1790	{
	1791	dVAR;
	1792
	1793	PERL_ARGS_ASSERT_IS_UTF8_X_LV;
	1794
	1795	return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
	1796	}
	1797
	1798	bool
	1799	Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
	1800	{
	1801	dVAR;
	1802
	1803	PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
	1804
	1805	return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
	1806	}
	1807
	1808	bool
	1809	Perl_is_utf8_X_T(pTHX_ const U8 *p)
	1810	{
	1811	dVAR;
	1812
	1813	PERL_ARGS_ASSERT_IS_UTF8_X_T;
	1814
	1815	return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
	1816	}
	1817
	1818	bool
	1819	Perl_is_utf8_X_V(pTHX_ const U8 *p)
	1820	{
	1821	dVAR;
	1822
	1823	PERL_ARGS_ASSERT_IS_UTF8_X_V;
	1824
	1825	return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
	1826	}
	1827
	1828	bool
	1829	Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
	1830	{
	1831	dVAR;
	1832
	1833	PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
	1834
	1835	return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
	1836	}
	1837
	1838	/*
	1839	=for apidoc to_utf8_case
	1840
	1841	The "p" contains the pointer to the UTF-8 string encoding
	1842	the character that is being converted.
	1843
	1844	The "ustrp" is a pointer to the character buffer to put the
	1845	conversion result to. The "lenp" is a pointer to the length
	1846	of the result.
	1847
	1848	The "swashp" is a pointer to the swash to use.
	1849
	1850	Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
	1851	and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
	1852	but not always, a multicharacter mapping), is tried first.
	1853
	1854	The "special" is a string like "utf8::ToSpecLower", which means the
	1855	hash %utf8::ToSpecLower. The access to the hash is through
	1856	Perl_to_utf8_case().
	1857
	1858	The "normal" is a string like "ToLower" which means the swash
	1859	%utf8::ToLower.
	1860
	1861	=cut */
	1862
	1863	UV
	1864	Perl_to_utf8_case(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp,
	1865	SV *swashp, const char normal, const char *special)
	1866	{
	1867	dVAR;
	1868	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1869	STRLEN len = 0;
	1870	const UV uv0 = utf8_to_uvchr(p, NULL);
	1871	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
	1872	* are necessary in EBCDIC, they are redundant no-ops
	1873	* in ASCII-ish platforms, and hopefully optimized away. */
	1874	const UV uv1 = NATIVE_TO_UNI(uv0);
	1875
	1876	PERL_ARGS_ASSERT_TO_UTF8_CASE;
	1877
	1878	/* Note that swash_fetch() doesn't output warnings for these because it
	1879	* assumes we will */
	1880	if (uv1 >= UNICODE_SURROGATE_FIRST) {
	1881	if (uv1 <= UNICODE_SURROGATE_LAST) {
	1882	if (ckWARN_d(WARN_SURROGATE)) {
	1883	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	1884	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	1885	"Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
	1886	}
	1887	}
	1888	else if (UNICODE_IS_SUPER(uv1)) {
	1889	if (ckWARN_d(WARN_NON_UNICODE)) {
	1890	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	1891	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	1892	"Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
	1893	}
	1894	}
	1895
	1896	/* Note that non-characters are perfectly legal, so no warning should
	1897	* be given */
	1898	}
	1899
	1900	uvuni_to_utf8(tmpbuf, uv1);
	1901
	1902	if (!swashp) / load on-demand */
	1903	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	1904
	1905	if (special) {
	1906	/* It might be "special" (sometimes, but not always,
	1907	* a multicharacter mapping) */
	1908	HV * const hv = get_hv(special, 0);
	1909	SV **svp;
	1910
	1911	if (hv &&
	1912	(svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
	1913	(*svp)) {
	1914	const char *s;
	1915
	1916	s = SvPV_const(*svp, len);
	1917	if (len == 1)
	1918	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
	1919	else {
	1920	#ifdef EBCDIC
	1921	/* If we have EBCDIC we need to remap the characters
	1922	* since any characters in the low 256 are Unicode
	1923	* code points, not EBCDIC. */
	1924	U8 t = (U8)s, tend = t + len, d;
	1925
	1926	d = tmpbuf;
	1927	if (SvUTF8(*svp)) {
	1928	STRLEN tlen = 0;
	1929
	1930	while (t < tend) {
	1931	const UV c = utf8_to_uvchr(t, &tlen);
	1932	if (tlen > 0) {
	1933	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	1934	t += tlen;
	1935	}
	1936	else
	1937	break;
	1938	}
	1939	}
	1940	else {
	1941	while (t < tend) {
	1942	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	1943	t++;
	1944	}
	1945	}
	1946	len = d - tmpbuf;
	1947	Copy(tmpbuf, ustrp, len, U8);
	1948	#else
	1949	Copy(s, ustrp, len, U8);
	1950	#endif
	1951	}
	1952	}
	1953	}
	1954
	1955	if (!len && *swashp) {
	1956	const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
	1957
	1958	if (uv2) {
	1959	/* It was "normal" (a single character mapping). */
	1960	const UV uv3 = UNI_TO_NATIVE(uv2);
	1961	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	1962	}
	1963	}
	1964
	1965	if (!len) /* Neither: just copy. In other words, there was no mapping
	1966	defined, which means that the code point maps to itself */
	1967	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
	1968
	1969	if (lenp)
	1970	*lenp = len;
	1971
	1972	return len ? utf8_to_uvchr(ustrp, 0) : 0;
	1973	}
	1974
	1975	/*
	1976	=for apidoc to_utf8_upper
	1977
	1978	Convert the UTF-8 encoded character at p to its uppercase version and
	1979	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	1980	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1981	the uppercase version may be longer than the original character.
	1982
	1983	The first character of the uppercased version is returned
	1984	(but note, as explained above, that there may be more.)
	1985
	1986	=cut */
	1987
	1988	UV
	1989	Perl_to_utf8_upper(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	1990	{
	1991	dVAR;
	1992
	1993	PERL_ARGS_ASSERT_TO_UTF8_UPPER;
	1994
	1995	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	1996	&PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
	1997	}
	1998
	1999	/*
	2000	=for apidoc to_utf8_title
	2001
	2002	Convert the UTF-8 encoded character at p to its titlecase version and
	2003	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	2004	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	2005	titlecase version may be longer than the original character.
	2006
	2007	The first character of the titlecased version is returned
	2008	(but note, as explained above, that there may be more.)
	2009
	2010	=cut */
	2011
	2012	UV
	2013	Perl_to_utf8_title(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	2014	{
	2015	dVAR;
	2016
	2017	PERL_ARGS_ASSERT_TO_UTF8_TITLE;
	2018
	2019	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	2020	&PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
	2021	}
	2022
	2023	/*
	2024	=for apidoc to_utf8_lower
	2025
	2026	Convert the UTF-8 encoded character at p to its lowercase version and
	2027	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	2028	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	2029	lowercase version may be longer than the original character.
	2030
	2031	The first character of the lowercased version is returned
	2032	(but note, as explained above, that there may be more.)
	2033
	2034	=cut */
	2035
	2036	UV
	2037	Perl_to_utf8_lower(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp)
	2038	{
	2039	dVAR;
	2040
	2041	PERL_ARGS_ASSERT_TO_UTF8_LOWER;
	2042
	2043	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	2044	&PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
	2045	}
	2046
	2047	/*
	2048	=for apidoc to_utf8_fold
	2049
	2050	Convert the UTF-8 encoded character at p to its foldcase version and
	2051	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	2052	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	2053	foldcase version may be longer than the original character (up to
	2054	three characters).
	2055
	2056	The first character of the foldcased version is returned
	2057	(but note, as explained above, that there may be more.)
	2058
	2059	=cut */
	2060
	2061	/* Not currently externally documented is 'flags', which currently is non-zero
	2062	* if full case folds are to be used; otherwise simple folds */
	2063
	2064	UV
	2065	Perl__to_utf8_fold_flags(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp, U8 flags)
	2066	{
	2067	const char *specials = (flags) ? "utf8::ToSpecFold" : NULL;
	2068
	2069	dVAR;
	2070
	2071	PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
	2072
	2073	return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
	2074	&PL_utf8_tofold, "ToFold", specials);
	2075	}
	2076
	2077	/* Note:
	2078	* A "swash" is a swatch hash.
	2079	* A "swatch" is a bit vector generated by utf8.c:S_swash_get().
	2080	* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
	2081	* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
	2082	*/
	2083	SV*
	2084	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
	2085	{
	2086	dVAR;
	2087	SV* retval;
	2088	dSP;
	2089	const size_t pkg_len = strlen(pkg);
	2090	const size_t name_len = strlen(name);
	2091	HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
	2092	SV* errsv_save;
	2093	GV *method;
	2094
	2095	PERL_ARGS_ASSERT_SWASH_INIT;
	2096
	2097	PUSHSTACKi(PERLSI_MAGIC);
	2098	ENTER;
	2099	SAVEHINTS();
	2100	save_re_context();
	2101	if (PL_parser && PL_parser->error_count)
	2102	SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
	2103	method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
	2104	if (!method) { /* demand load utf8 */
	2105	ENTER;
	2106	errsv_save = newSVsv(ERRSV);
	2107	/* It is assumed that callers of this routine are not passing in any
	2108	user derived data. */
	2109	/* Need to do this after save_re_context() as it will set PL_tainted to
	2110	1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
	2111	Even line to create errsv_save can turn on PL_tainted. */
	2112	SAVEBOOL(PL_tainted);
	2113	PL_tainted = 0;
	2114	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	2115	NULL);
	2116	if (!SvTRUE(ERRSV))
	2117	sv_setsv(ERRSV, errsv_save);
	2118	SvREFCNT_dec(errsv_save);
	2119	LEAVE;
	2120	}
	2121	SPAGAIN;
	2122	PUSHMARK(SP);
	2123	EXTEND(SP,5);
	2124	mPUSHp(pkg, pkg_len);
	2125	mPUSHp(name, name_len);
	2126	PUSHs(listsv);
	2127	mPUSHi(minbits);
	2128	mPUSHi(none);
	2129	PUTBACK;
	2130	errsv_save = newSVsv(ERRSV);
	2131	/* If we already have a pointer to the method, no need to use call_method()
	2132	to repeat the lookup. */
	2133	if (method ? call_sv(MUTABLE_SV(method), G_SCALAR)
	2134	: call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR \| G_METHOD))
	2135	retval = newSVsv(*PL_stack_sp--);
	2136	else
	2137	retval = &PL_sv_undef;
	2138	if (!SvTRUE(ERRSV))
	2139	sv_setsv(ERRSV, errsv_save);
	2140	SvREFCNT_dec(errsv_save);
	2141	LEAVE;
	2142	POPSTACK;
	2143	if (IN_PERL_COMPILETIME) {
	2144	CopHINTS_set(PL_curcop, PL_hints);
	2145	}
	2146	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	2147	if (SvPOK(retval))
	2148	Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
	2149	SVfARG(retval));
	2150	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	2151	}
	2152	return retval;
	2153	}
	2154
	2155
	2156	/* This API is wrong for special case conversions since we may need to
	2157	* return several Unicode characters for a single Unicode character
	2158	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	2159	* the lower-level routine, and it is similarly broken for returning
	2160	* multiple values. --jhi
	2161	* For those, you should use to_utf8_case() instead */
/* Now SWASHGET is recast into S_swash_get in this file. */
	2163
	2164	/* Note:
	2165	* Returns the value of property/mapping C<swash> for the first character
	2166	* of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
	2167	* assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
	2168	* assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
	2169	*/
	2170	UV
	2171	Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
	2172	{
	2173	dVAR;
	2174	HV *const hv = MUTABLE_HV(SvRV(swash));
	2175	U32 klen;
	2176	U32 off;
	2177	STRLEN slen;
	2178	STRLEN needents;
	2179	const U8 *tmps = NULL;
	2180	U32 bit;
	2181	SV *swatch;
	2182	U8 tmputf8[2];
	2183	const UV c = NATIVE_TO_ASCII(*ptr);
	2184
	2185	PERL_ARGS_ASSERT_SWASH_FETCH;
	2186
	2187	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	2188	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
	2189	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
	2190	ptr = tmputf8;
	2191	}
	2192	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	2193	* then the "swatch" is a vec() for all the chars which start
	2194	* with 0xAA..0xYY
	2195	* So the key in the hash (klen) is length of encoded char -1
	2196	*/
	2197	klen = UTF8SKIP(ptr) - 1;
	2198	off = ptr[klen];
	2199
	2200	if (klen == 0) {
	2201	/* If char is invariant then swatch is for all the invariant chars
	2202	* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
	2203	*/
	2204	needents = UTF_CONTINUATION_MARK;
	2205	off = NATIVE_TO_UTF(ptr[klen]);
	2206	}
	2207	else {
	2208	/* If char is encoded then swatch is for the prefix */
	2209	needents = (1 << UTF_ACCUMULATION_SHIFT);
	2210	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	2211	if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_NON_UNICODE)) {
	2212	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
	2213
	2214	/* This outputs warnings for binary properties only, assuming that
	2215	* to_utf8_case() will output any for non-binary. Also, surrogates
	2216	* aren't checked for, as that would warn on things like
	2217	* /\p{Gc=Cs}/ */
	2218	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2219	if (SvUV(*bitssvp) == 1) {
	2220	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	2221	"Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", code_point);
	2222	}
	2223	}
	2224	}
	2225
	2226	/*
	2227	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	2228	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	2229	* it's nothing to sniff at.) Pity we usually come through at least
	2230	* two function calls to get here...
	2231	*
	2232	* NB: this code assumes that swatches are never modified, once generated!
	2233	*/
	2234
	2235	if (hv == PL_last_swash_hv &&
	2236	klen == PL_last_swash_klen &&
	2237	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	2238	{
	2239	tmps = PL_last_swash_tmps;
	2240	slen = PL_last_swash_slen;
	2241	}
	2242	else {
	2243	/* Try our second-level swatch cache, kept in a hash. */
	2244	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	2245
	2246	/* If not cached, generate it via swash_get */
	2247	if (!svp \|\| !SvPOK(*svp)
	2248	\|\| !(tmps = (const U8)SvPV_const(svp, slen))) {
	2249	/* We use utf8n_to_uvuni() as we want an index into
	2250	Unicode tables, not a native character number.
	2251	*/
	2252	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
	2253	ckWARN(WARN_UTF8) ?
	2254	0 : UTF8_ALLOW_ANY);
	2255	swatch = swash_get(swash,
	2256	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	2257	(klen) ? (code_point & ~(needents - 1)) : 0,
	2258	needents);
	2259
	2260	if (IN_PERL_COMPILETIME)
	2261	CopHINTS_set(PL_curcop, PL_hints);
	2262
	2263	svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
	2264
	2265	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
	2266	\|\| (slen << 3) < needents)
	2267	Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
	2268	}
	2269
	2270	PL_last_swash_hv = hv;
	2271	assert(klen <= sizeof(PL_last_swash_key));
	2272	PL_last_swash_klen = (U8)klen;
	2273	/* FIXME change interpvar.h? */
	2274	PL_last_swash_tmps = (U8 *) tmps;
	2275	PL_last_swash_slen = slen;
	2276	if (klen)
	2277	Copy(ptr, PL_last_swash_key, klen, U8);
	2278	}
	2279
	2280	switch ((int)((slen << 3) / needents)) {
	2281	case 1:
	2282	bit = 1 << (off & 7);
	2283	off >>= 3;
	2284	return (tmps[off] & bit) != 0;
	2285	case 8:
	2286	return tmps[off];
	2287	case 16:
	2288	off <<= 1;
	2289	return (tmps[off] << 8) + tmps[off + 1] ;
	2290	case 32:
	2291	off <<= 2;
	2292	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	2293	}
	2294	Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
	2295	NORETURN_FUNCTION_END;
	2296	}
	2297
	2298	/* Read a single line of the main body of the swash input text. These are of
	2299	* the form:
	2300	* 0053 0056 0073
	2301	* where each number is hex. The first two numbers form the minimum and
	2302	* maximum of a range, and the third is the value associated with the range.
	2303	* Not all swashes should have a third number
	2304	*
	2305	* On input: l points to the beginning of the line to be examined; it points
	2306	* to somewhere in the string of the whole input text, and is
	2307	* terminated by a \n or the null string terminator.
	2308	* lend points to the null terminator of that string
	2309	* wants_value is non-zero if the swash expects a third number
	2310	* typestr is the name of the swash's mapping, like 'ToLower'
	2311	* On output: min, max, and *val are set to the values read from the line.
	2312	* returns a pointer just beyond the line examined. If there was no
	2313	* valid min number on the line, returns lend+1
	2314	*/
	2315
	2316	STATIC U8*
	2317	S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
	2318	const bool wants_value, const U8* const typestr)
	2319	{
	2320	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
	2321	STRLEN numlen; /* Length of the number */
	2322	I32 flags = PERL_SCAN_SILENT_ILLDIGIT
	2323	\| PERL_SCAN_DISALLOW_PREFIX
	2324	\| PERL_SCAN_SILENT_NON_PORTABLE;
	2325
	2326	/* nl points to the next \n in the scan */
	2327	U8* const nl = (U8*)memchr(l, '\n', lend - l);
	2328
	2329	/* Get the first number on the line: the range minimum */
	2330	numlen = lend - l;
	2331	min = grok_hex((char )l, &numlen, &flags, NULL);
	2332	if (numlen) /* If found a hex number, position past it */
	2333	l += numlen;
	2334	else if (nl) { /* Else, go handle next line, if any */
	2335	return nl + 1; /* 1 is length of "\n" */
	2336	}
	2337	else { /* Else, no next line */
	2338	return lend + 1; /* to LIST's end at which \n is not found */
	2339	}
	2340
	2341	/* The max range value follows, separated by a BLANK */
	2342	if (isBLANK(*l)) {
	2343	++l;
	2344	flags = PERL_SCAN_SILENT_ILLDIGIT
	2345	\| PERL_SCAN_DISALLOW_PREFIX
	2346	\| PERL_SCAN_SILENT_NON_PORTABLE;
	2347	numlen = lend - l;
	2348	max = grok_hex((char )l, &numlen, &flags, NULL);
	2349	if (numlen)
	2350	l += numlen;
	2351	else /* If no value here, it is a single element range */
	2352	max = min;
	2353
	2354	/* Non-binary tables have a third entry: what the first element of the
	2355	* range maps to */
	2356	if (wants_value) {
	2357	if (isBLANK(*l)) {
	2358	++l;
	2359	flags = PERL_SCAN_SILENT_ILLDIGIT
	2360	\| PERL_SCAN_DISALLOW_PREFIX
	2361	\| PERL_SCAN_SILENT_NON_PORTABLE;
	2362	numlen = lend - l;
	2363	val = grok_hex((char )l, &numlen, &flags, NULL);
	2364	if (numlen)
	2365	l += numlen;
	2366	else
	2367	*val = 0;
	2368	}
	2369	else {
	2370	*val = 0;
	2371	if (typeto) {
	2372	Perl_croak(aTHX_ "%s: illegal mapping '%s'",
	2373	typestr, l);
	2374	}
	2375	}
	2376	}
	2377	else
	2378	val = 0; / bits == 1, then any val should be ignored */
	2379	}
	2380	else { /* Nothing following range min, should be single element with no
	2381	mapping expected */
	2382	max = min;
	2383	if (wants_value) {
	2384	*val = 0;
	2385	if (typeto) {
	2386	Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
	2387	}
	2388	}
	2389	else
	2390	val = 0; / bits == 1, then val should be ignored */
	2391	}
	2392
	2393	/* Position to next line if any, or EOF */
	2394	if (nl)
	2395	l = nl + 1;
	2396	else
	2397	l = lend;
	2398
	2399	return l;
	2400	}
	2401
	2402	/* Note:
	2403	* Returns a swatch (a bit vector string) for a code point sequence
	2404	* that starts from the value C<start> and comprises the number C<span>.
	2405	* A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
	2406	* Should be used via swash_fetch, which will cache the swatch in C<swash>.
	2407	*/
	2408	STATIC SV*
	2409	S_swash_get(pTHX_ SV* swash, UV start, UV span)
	2410	{
	2411	SV *swatch;
	2412	U8 l, lend, x, xend, s, send;
	2413	STRLEN lcur, xcur, scur;
	2414	HV *const hv = MUTABLE_HV(SvRV(swash));
	2415
	2416	/* The string containing the main body of the table */
	2417	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	2418
	2419	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2420	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2421	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	2422	SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	2423	SV** const invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	2424	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	2425	const STRLEN bits = SvUV(*bitssvp);
	2426	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2427	const UV none = SvUV(*nonesvp);
	2428	const UV end = start + span;
	2429
	2430	PERL_ARGS_ASSERT_SWASH_GET;
	2431
	2432	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	2433	Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
	2434	(UV)bits);
	2435	}
	2436
	2437	/* create and initialize $swatch */
	2438	scur = octets ? (span * octets) : (span + 7) / 8;
	2439	swatch = newSV(scur);
	2440	SvPOK_on(swatch);
	2441	s = (U8*)SvPVX(swatch);
	2442	if (octets && none) {
	2443	const U8* const e = s + scur;
	2444	while (s < e) {
	2445	if (bits == 8)
	2446	*s++ = (U8)(none & 0xff);
	2447	else if (bits == 16) {
	2448	*s++ = (U8)((none >> 8) & 0xff);
	2449	*s++ = (U8)( none & 0xff);
	2450	}
	2451	else if (bits == 32) {
	2452	*s++ = (U8)((none >> 24) & 0xff);
	2453	*s++ = (U8)((none >> 16) & 0xff);
	2454	*s++ = (U8)((none >> 8) & 0xff);
	2455	*s++ = (U8)( none & 0xff);
	2456	}
	2457	}
	2458	*s = '\0';
	2459	}
	2460	else {
	2461	(void)memzero((U8*)s, scur + 1);
	2462	}
	2463	SvCUR_set(swatch, scur);
	2464	s = (U8*)SvPVX(swatch);
	2465
	2466	/* read $swash->{LIST} */
	2467	l = (U8)SvPV(listsvp, lcur);
	2468	lend = l + lcur;
	2469	while (l < lend) {
	2470	UV min, max, val;
	2471	l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
	2472	cBOOL(octets), typestr);
	2473	if (l > lend) {
	2474	break;
	2475	}
	2476
	2477	/* If looking for something beyond this range, go try the next one */
	2478	if (max < start)
	2479	continue;
	2480
	2481	if (octets) {
	2482	UV key;
	2483	if (min < start) {
	2484	if (!none \|\| val < none) {
	2485	val += start - min;
	2486	}
	2487	min = start;
	2488	}
	2489	for (key = min; key <= max; key++) {
	2490	STRLEN offset;
	2491	if (key >= end)
	2492	goto go_out_list;
	2493	/* offset must be non-negative (start <= min <= key < end) */
	2494	offset = octets * (key - start);
	2495	if (bits == 8)
	2496	s[offset] = (U8)(val & 0xff);
	2497	else if (bits == 16) {
	2498	s[offset ] = (U8)((val >> 8) & 0xff);
	2499	s[offset + 1] = (U8)( val & 0xff);
	2500	}
	2501	else if (bits == 32) {
	2502	s[offset ] = (U8)((val >> 24) & 0xff);
	2503	s[offset + 1] = (U8)((val >> 16) & 0xff);
	2504	s[offset + 2] = (U8)((val >> 8) & 0xff);
	2505	s[offset + 3] = (U8)( val & 0xff);
	2506	}
	2507
	2508	if (!none \|\| val < none)
	2509	++val;
	2510	}
	2511	}
	2512	else { /* bits == 1, then val should be ignored */
	2513	UV key;
	2514	if (min < start)
	2515	min = start;
	2516	for (key = min; key <= max; key++) {
	2517	const STRLEN offset = (STRLEN)(key - start);
	2518	if (key >= end)
	2519	goto go_out_list;
	2520	s[offset >> 3] \|= 1 << (offset & 7);
	2521	}
	2522	}
	2523	} /* while */
	2524	go_out_list:
	2525
	2526	/* Invert if the data says it should be. Assumes that bits == 1 */
	2527	if (invert_it_svp && SvUV(*invert_it_svp)) {
	2528
	2529	/* Unicode properties should come with all bits above PERL_UNICODE_MAX
	2530	* be 0, and their inversion should also be 0, as we don't succeed any
	2531	* Unicode property matches for non-Unicode code points */
	2532	if (start <= PERL_UNICODE_MAX) {
	2533
	2534	/* The code below assumes that we never cross the
	2535	* Unicode/above-Unicode boundary in a range, as otherwise we would
	2536	* have to figure out where to stop flipping the bits. Since this
	2537	* boundary is divisible by a large power of 2, and swatches comes
	2538	* in small powers of 2, this should be a valid assumption */
	2539	assert(start + span - 1 <= PERL_UNICODE_MAX);
	2540
	2541	send = s + scur;
	2542	while (s < send) {
	2543	s = ~(s);
	2544	s++;
	2545	}
	2546	}
	2547	}
	2548
	2549	/* read $swash->{EXTRAS}
	2550	* This code also copied to swash_to_invlist() below */
	2551	x = (U8)SvPV(extssvp, xcur);
	2552	xend = x + xcur;
	2553	while (x < xend) {
	2554	STRLEN namelen;
	2555	U8 *namestr;
	2556	SV** othersvp;
	2557	HV* otherhv;
	2558	STRLEN otherbits;
	2559	SV *otherbitssvp, other;
	2560	U8 s, o, *nl;
	2561	STRLEN slen, olen;
	2562
	2563	const U8 opc = *x++;
	2564	if (opc == '\n')
	2565	continue;
	2566
	2567	nl = (U8*)memchr(x, '\n', xend - x);
	2568
	2569	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	2570	if (nl) {
	2571	x = nl + 1; /* 1 is length of "\n" */
	2572	continue;
	2573	}
	2574	else {
	2575	x = xend; /* to EXTRAS' end at which \n is not found */
	2576	break;
	2577	}
	2578	}
	2579
	2580	namestr = x;
	2581	if (nl) {
	2582	namelen = nl - namestr;
	2583	x = nl + 1;
	2584	}
	2585	else {
	2586	namelen = xend - namestr;
	2587	x = xend;
	2588	}
	2589
	2590	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	2591	otherhv = MUTABLE_HV(SvRV(*othersvp));
	2592	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	2593	otherbits = (STRLEN)SvUV(*otherbitssvp);
	2594	if (bits < otherbits)
	2595	Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
	2596
	2597	/* The "other" swatch must be destroyed after. */
	2598	other = swash_get(*othersvp, start, span);
	2599	o = (U8*)SvPV(other, olen);
	2600
	2601	if (!olen)
	2602	Perl_croak(aTHX_ "panic: swash_get got improper swatch");
	2603
	2604	s = (U8*)SvPV(swatch, slen);
	2605	if (bits == 1 && otherbits == 1) {
	2606	if (slen != olen)
	2607	Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
	2608
	2609	switch (opc) {
	2610	case '+':
	2611	while (slen--)
	2612	s++ \|= o++;
	2613	break;
	2614	case '!':
	2615	while (slen--)
	2616	s++ \|= ~o++;
	2617	break;
	2618	case '-':
	2619	while (slen--)
	2620	s++ &= ~o++;
	2621	break;
	2622	case '&':
	2623	while (slen--)
	2624	s++ &= o++;
	2625	break;
	2626	default:
	2627	break;
	2628	}
	2629	}
	2630	else {
	2631	STRLEN otheroctets = otherbits >> 3;
	2632	STRLEN offset = 0;
	2633	U8* const send = s + slen;
	2634
	2635	while (s < send) {
	2636	UV otherval = 0;
	2637
	2638	if (otherbits == 1) {
	2639	otherval = (o[offset >> 3] >> (offset & 7)) & 1;
	2640	++offset;
	2641	}
	2642	else {
	2643	STRLEN vlen = otheroctets;
	2644	otherval = *o++;
	2645	while (--vlen) {
	2646	otherval <<= 8;
	2647	otherval \|= *o++;
	2648	}
	2649	}
	2650
	2651	if (opc == '+' && otherval)
	2652	NOOP; /* replace with otherval */
	2653	else if (opc == '!' && !otherval)
	2654	otherval = 1;
	2655	else if (opc == '-' && otherval)
	2656	otherval = 0;
	2657	else if (opc == '&' && !otherval)
	2658	otherval = 0;
	2659	else {
	2660	s += octets; /* no replacement */
	2661	continue;
	2662	}
	2663
	2664	if (bits == 8)
	2665	*s++ = (U8)( otherval & 0xff);
	2666	else if (bits == 16) {
	2667	*s++ = (U8)((otherval >> 8) & 0xff);
	2668	*s++ = (U8)( otherval & 0xff);
	2669	}
	2670	else if (bits == 32) {
	2671	*s++ = (U8)((otherval >> 24) & 0xff);
	2672	*s++ = (U8)((otherval >> 16) & 0xff);
	2673	*s++ = (U8)((otherval >> 8) & 0xff);
	2674	*s++ = (U8)( otherval & 0xff);
	2675	}
	2676	}
	2677	}
	2678	sv_free(other); /* through with it! */
	2679	} /* while */
	2680	return swatch;
	2681	}
	2682
	2683	HV*
	2684	Perl__swash_inversion_hash(pTHX_ SV* const swash)
	2685	{
	2686
	2687	/* Subject to change or removal. For use only in one place in regcomp.c.
	2688	* Can't be used on a property that is subject to user override, as it
	2689	* relies on the value of SPECIALS in the swash which would be set by
	2690	* utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
	2691	* for overridden properties
	2692	*
	2693	* Returns a hash which is the inversion and closure of a swash mapping.
	2694	* For example, consider the input lines:
	2695	* 004B 006B
	2696	* 004C 006C
	2697	* 212A 006B
	2698	*
	2699	* The returned hash would have two keys, the utf8 for 006B and the utf8 for
	2700	* 006C. The value for each key is an array. For 006C, the array would
	2701	* have a two elements, the utf8 for itself, and for 004C. For 006B, there
	2702	* would be three elements in its array, the utf8 for 006B, 004B and 212A.
	2703	*
	2704	* Essentially, for any code point, it gives all the code points that map to
	2705	* it, or the list of 'froms' for that point.
	2706	*
	2707	* Currently it ignores any additions or deletions from other swashes,
	2708	* looking at just the main body of the swash, and if there are SPECIALS
	2709	* in the swash, at that hash
	2710	*
	2711	* The specials hash can be extra code points, and most likely consists of
	2712	* maps from single code points to multiple ones (each expressed as a string
	2713	* of utf8 characters). This function currently returns only 1-1 mappings.
	2714	* However consider this possible input in the specials hash:
	2715	* "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074
	2716	* "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074
	2717	*
	2718	* Both FB05 and FB06 map to the same multi-char sequence, which we don't
	2719	* currently handle. But it also means that FB05 and FB06 are equivalent in
	2720	* a 1-1 mapping which we should handle, and this relationship may not be in
	2721	* the main table. Therefore this function examines all the multi-char
	2722	* sequences and adds the 1-1 mappings that come out of that. */
	2723
	2724	U8 l, lend;
	2725	STRLEN lcur;
	2726	HV *const hv = MUTABLE_HV(SvRV(swash));
	2727
	2728	/* The string containing the main body of the table */
	2729	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	2730
	2731	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2732	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2733	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	2734	/SV* const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
	2735	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	2736	const STRLEN bits = SvUV(*bitssvp);
	2737	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2738	const UV none = SvUV(*nonesvp);
	2739	SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
	2740
	2741	HV* ret = newHV();
	2742
	2743	PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
	2744
	2745	/* Must have at least 8 bits to get the mappings */
	2746	if (bits != 8 && bits != 16 && bits != 32) {
	2747	Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
	2748	(UV)bits);
	2749	}
	2750
	2751	if (specials_p) { /* It might be "special" (sometimes, but not always, a
	2752	mapping to more than one character */
	2753
	2754	/* Construct an inverse mapping hash for the specials */
	2755	HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
	2756	HV * specials_inverse = newHV();
	2757	char char_from; / the lhs of the map */
	2758	I32 from_len; /* its byte length */
	2759	char char_to; / the rhs of the map */
	2760	I32 to_len; /* its byte length */
	2761	SV sv_to; / and in a sv */
	2762	AV* from_list; /* list of things that map to each 'to' */
	2763
	2764	hv_iterinit(specials_hv);
	2765
	2766	/* The keys are the characters (in utf8) that map to the corresponding
	2767	* utf8 string value. Iterate through the list creating the inverse
	2768	* list. */
	2769	while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
	2770	SV** listp;
	2771	if (! SvPOK(sv_to)) {
	2772	Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() unexpectedly is not a string");
	2773	}
	2774	/DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", utf8_to_uvchr((U8) char_from, 0), utf8_to_uvchr((U8) SvPVX(sv_to), 0)));/
	2775
	2776	/* Each key in the inverse list is a mapped-to value, and the key's
	2777	* hash value is a list of the strings (each in utf8) that map to
	2778	* it. Those strings are all one character long */
	2779	if ((listp = hv_fetch(specials_inverse,
	2780	SvPVX(sv_to),
	2781	SvCUR(sv_to), 0)))
	2782	{
	2783	from_list = (AV) listp;
	2784	}
	2785	else { /* No entry yet for it: create one */
	2786	from_list = newAV();
	2787	if (! hv_store(specials_inverse,
	2788	SvPVX(sv_to),
	2789	SvCUR(sv_to),
	2790	(SV*) from_list, 0))
	2791	{
	2792	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	2793	}
	2794	}
	2795
	2796	/* Here have the list associated with this 'to' (perhaps newly
	2797	* created and empty). Just add to it. Note that we ASSUME that
	2798	* the input is guaranteed to not have duplications, so we don't
	2799	* check for that. Duplications just slow down execution time. */
	2800	av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
	2801	}
	2802
	2803	/* Here, 'specials_inverse' contains the inverse mapping. Go through
	2804	* it looking for cases like the FB05/FB06 examples above. There would
	2805	* be an entry in the hash like
	2806	* 'st' => [ FB05, FB06 ]
	2807	* In this example we will create two lists that get stored in the
	2808	* returned hash, 'ret':
	2809	* FB05 => [ FB05, FB06 ]
	2810	* FB06 => [ FB05, FB06 ]
	2811	*
	2812	* Note that there is nothing to do if the array only has one element.
	2813	* (In the normal 1-1 case handled below, we don't have to worry about
	2814	* two lists, as everything gets tied to the single list that is
	2815	* generated for the single character 'to'. But here, we are omitting
	2816	* that list, ('st' in the example), so must have multiple lists.) */
	2817	while ((from_list = (AV *) hv_iternextsv(specials_inverse,
	2818	&char_to, &to_len)))
	2819	{
	2820	if (av_len(from_list) > 0) {
	2821	int i;
	2822
	2823	/* We iterate over all combinations of i,j to place each code
	2824	* point on each list */
	2825	for (i = 0; i <= av_len(from_list); i++) {
	2826	int j;
	2827	AV* i_list = newAV();
	2828	SV** entryp = av_fetch(from_list, i, FALSE);
	2829	if (entryp == NULL) {
	2830	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	2831	}
	2832	if (hv_fetch(ret, SvPVX(entryp), SvCUR(entryp), FALSE)) {
	2833	Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
	2834	}
	2835	if (! hv_store(ret, SvPVX(entryp), SvCUR(entryp),
	2836	(SV*) i_list, FALSE))
	2837	{
	2838	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	2839	}
	2840
	2841	/* For debugging: UV u = utf8_to_uvchr((U8) SvPVX(entryp), 0);*/
	2842	for (j = 0; j <= av_len(from_list); j++) {
	2843	entryp = av_fetch(from_list, j, FALSE);
	2844	if (entryp == NULL) {
	2845	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	2846	}
	2847
	2848	/* When i==j this adds itself to the list */
	2849	av_push(i_list, newSVuv(utf8_to_uvchr(
	2850	(U8) SvPVX(entryp), 0)));
	2851	/DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", utf8_to_uvchr((U8) SvPVX(entryp), 0), u));/
	2852	}
	2853	}
	2854	}
	2855	}
	2856	SvREFCNT_dec(specials_inverse); /* done with it */
	2857	} /* End of specials */
	2858
	2859	/* read $swash->{LIST} */
	2860	l = (U8)SvPV(listsvp, lcur);
	2861	lend = l + lcur;
	2862
	2863	/* Go through each input line */
	2864	while (l < lend) {
	2865	UV min, max, val;
	2866	UV inverse;
	2867	l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
	2868	cBOOL(octets), typestr);
	2869	if (l > lend) {
	2870	break;
	2871	}
	2872
	2873	/* Each element in the range is to be inverted */
	2874	for (inverse = min; inverse <= max; inverse++) {
	2875	AV* list;
	2876	SV** listp;
	2877	IV i;
	2878	bool found_key = FALSE;
	2879	bool found_inverse = FALSE;
	2880
	2881	/* The key is the inverse mapping */
	2882	char key[UTF8_MAXBYTES+1];
	2883	char* key_end = (char ) uvuni_to_utf8((U8) key, val);
	2884	STRLEN key_len = key_end - key;
	2885
	2886	/* Get the list for the map */
	2887	if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
	2888	list = (AV) listp;
	2889	}
	2890	else { /* No entry yet for it: create one */
	2891	list = newAV();
	2892	if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
	2893	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	2894	}
	2895	}
	2896
	2897	/* Look through list to see if this inverse mapping already is
	2898	* listed, or if there is a mapping to itself already */
	2899	for (i = 0; i <= av_len(list); i++) {
	2900	SV** entryp = av_fetch(list, i, FALSE);
	2901	SV* entry;
	2902	if (entryp == NULL) {
	2903	Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	2904	}
	2905	entry = *entryp;
	2906	/DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, SvUV(entry)));/
	2907	if (SvUV(entry) == val) {
	2908	found_key = TRUE;
	2909	}
	2910	if (SvUV(entry) == inverse) {
	2911	found_inverse = TRUE;
	2912	}
	2913
	2914	/* No need to continue searching if found everything we are
	2915	* looking for */
	2916	if (found_key && found_inverse) {
	2917	break;
	2918	}
	2919	}
	2920
	2921	/* Make sure there is a mapping to itself on the list */
	2922	if (! found_key) {
	2923	av_push(list, newSVuv(val));
	2924	/DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", val, val));/
	2925	}
	2926
	2927
	2928	/* Simply add the value to the list */
	2929	if (! found_inverse) {
	2930	av_push(list, newSVuv(inverse));
	2931	/DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", inverse, val));/
	2932	}
	2933
	2934	/* swash_get() increments the value of val for each element in the
	2935	* range. That makes more compact tables possible. You can
	2936	* express the capitalization, for example, of all consecutive
	2937	* letters with a single line: 0061\t007A\t0041 This maps 0061 to
	2938	* 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
	2939	* and it's not documented; it appears to be used only in
	2940	* implementing tr//; I copied the semantics from swash_get(), just
	2941	* in case */
	2942	if (!none \|\| val < none) {
	2943	++val;
	2944	}
	2945	}
	2946	}
	2947
	2948	return ret;
	2949	}
	2950
	2951	SV*
	2952	Perl__swash_to_invlist(pTHX_ SV* const swash)
	2953	{
	2954
	2955	/* Subject to change or removal. For use only in one place in regcomp.c */
	2956
	2957	U8 l, lend;
	2958	char *loc;
	2959	STRLEN lcur;
	2960	HV *const hv = MUTABLE_HV(SvRV(swash));
	2961	UV elements = 0; /* Number of elements in the inversion list */
	2962	U8 empty[] = "";
	2963
	2964	/* The string containing the main body of the table */
	2965	SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	2966	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2967	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2968	SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	2969	SV** const invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	2970
	2971	const U8* const typestr = (U8)SvPV_nolen(typesvp);
	2972	const STRLEN bits = SvUV(*bitssvp);
	2973	const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2974	U8 x, xend;
	2975	STRLEN xcur;
	2976
	2977	SV* invlist;
	2978
	2979	PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
	2980
	2981	/* read $swash->{LIST} */
	2982	if (SvPOK(*listsvp)) {
	2983	l = (U8)SvPV(listsvp, lcur);
	2984	}
	2985	else {
	2986	/* LIST legitimately doesn't contain a string during compilation phases
	2987	* of Perl itself, before the Unicode tables are generated. In this
	2988	* case, just fake things up by creating an empty list */
	2989	l = empty;
	2990	lcur = 0;
	2991	}
	2992	loc = (char *) l;
	2993	lend = l + lcur;
	2994
	2995	/* Scan the input to count the number of lines to preallocate array size
	2996	* based on worst possible case, which is each line in the input creates 2
	2997	* elements in the inversion list: 1) the beginning of a range in the list;
	2998	* 2) the beginning of a range not in the list. */
	2999	while ((loc = (strchr(loc, '\n'))) != NULL) {
	3000	elements += 2;
	3001	loc++;
	3002	}
	3003
	3004	/* If the ending is somehow corrupt and isn't a new line, add another
	3005	* element for the final range that isn't in the inversion list */
	3006	if (! (lend == '\n' \|\| (lend == '\0' && *(lend - 1) == '\n'))) {
	3007	elements++;
	3008	}
	3009
	3010	invlist = _new_invlist(elements);
	3011
	3012	/* Now go through the input again, adding each range to the list */
	3013	while (l < lend) {
	3014	UV start, end;
	3015	UV val; /* Not used by this function */
	3016
	3017	l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
	3018	cBOOL(octets), typestr);
	3019
	3020	if (l > lend) {
	3021	break;
	3022	}
	3023
	3024	_append_range_to_invlist(invlist, start, end);
	3025	}
	3026
	3027	/* Invert if the data says it should be */
	3028	if (invert_it_svp && SvUV(*invert_it_svp)) {
	3029	_invlist_invert_prop(invlist);
	3030	}
	3031
	3032	/* This code is copied from swash_get()
	3033	* read $swash->{EXTRAS} */
	3034	x = (U8)SvPV(extssvp, xcur);
	3035	xend = x + xcur;
	3036	while (x < xend) {
	3037	STRLEN namelen;
	3038	U8 *namestr;
	3039	SV** othersvp;
	3040	HV* otherhv;
	3041	STRLEN otherbits;
	3042	SV *otherbitssvp, other;
	3043	U8 *nl;
	3044
	3045	const U8 opc = *x++;
	3046	if (opc == '\n')
	3047	continue;
	3048
	3049	nl = (U8*)memchr(x, '\n', xend - x);
	3050
	3051	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	3052	if (nl) {
	3053	x = nl + 1; /* 1 is length of "\n" */
	3054	continue;
	3055	}
	3056	else {
	3057	x = xend; /* to EXTRAS' end at which \n is not found */
	3058	break;
	3059	}
	3060	}
	3061
	3062	namestr = x;
	3063	if (nl) {
	3064	namelen = nl - namestr;
	3065	x = nl + 1;
	3066	}
	3067	else {
	3068	namelen = xend - namestr;
	3069	x = xend;
	3070	}
	3071
	3072	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	3073	otherhv = MUTABLE_HV(SvRV(*othersvp));
	3074	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	3075	otherbits = (STRLEN)SvUV(*otherbitssvp);
	3076
	3077	if (bits != otherbits \|\| bits != 1) {
	3078	Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean properties");
	3079	}
	3080
	3081	/* The "other" swatch must be destroyed after. */
	3082	other = _swash_to_invlist((SV )othersvp);
	3083
	3084	/* End of code copied from swash_get() */
	3085	switch (opc) {
	3086	case '+':
	3087	_invlist_union(invlist, other, &invlist);
	3088	break;
	3089	case '!':
	3090	_invlist_invert(other);
	3091	_invlist_union(invlist, other, &invlist);
	3092	break;
	3093	case '-':
	3094	_invlist_subtract(invlist, other, &invlist);
	3095	break;
	3096	case '&':
	3097	_invlist_intersection(invlist, other, &invlist);
	3098	break;
	3099	default:
	3100	break;
	3101	}
	3102	sv_free(other); /* through with it! */
	3103	}
	3104
	3105	return invlist;
	3106	}
	3107
	3108	/*
	3109	=for apidoc uvchr_to_utf8
	3110
	3111	Adds the UTF-8 representation of the Native code point C<uv> to the end
	3112	of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
	3113	bytes available. The return value is the pointer to the byte after the
	3114	end of the new character. In other words,
	3115
	3116	d = uvchr_to_utf8(d, uv);
	3117
	3118	is the recommended wide native character-aware way of saying
	3119
	3120	*(d++) = uv;
	3121
	3122	=cut
	3123	*/
	3124
	3125	/* On ASCII machines this is normally a macro but we want a
	3126	real function in case XS code wants it
	3127	*/
	3128	U8 *
	3129	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	3130	{
	3131	PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
	3132
	3133	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
	3134	}
	3135
	3136	U8 *
	3137	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	3138	{
	3139	PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
	3140
	3141	return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
	3142	}
	3143
	3144	/*
	3145	=for apidoc utf8n_to_uvchr
	3146
	3147	Returns the native character value of the first character in the string
	3148	C<s>
	3149	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	3150	length, in bytes, of that character.
	3151
	3152	C<curlen> and C<flags> are the same as for utf8n_to_uvuni().
	3153
	3154	=cut
	3155	*/
	3156	/* On ASCII machines this is normally a macro but we want
	3157	a real function in case XS code wants it
	3158	*/
	3159	UV
	3160	Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen,
	3161	U32 flags)
	3162	{
	3163	const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	3164
	3165	PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
	3166
	3167	return UNI_TO_NATIVE(uv);
	3168	}
	3169
	3170	bool
	3171	Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
	3172	{
	3173	/* May change: warns if surrogates, non-character code points, or
	3174	* non-Unicode code points are in s which has length len. Returns TRUE if
	3175	* none found; FALSE otherwise. The only other validity check is to make
	3176	* sure that this won't exceed the string's length */
	3177
	3178	const U8* const e = s + len;
	3179	bool ok = TRUE;
	3180
	3181	PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
	3182
	3183	while (s < e) {
	3184	if (UTF8SKIP(s) > len) {
	3185	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	3186	"%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
	3187	return FALSE;
	3188	}
	3189	if (*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE) {
	3190	STRLEN char_len;
	3191	if (UTF8_IS_SUPER(s)) {
	3192	if (ckWARN_d(WARN_NON_UNICODE)) {
	3193	UV uv = utf8_to_uvchr(s, &char_len);
	3194	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	3195	"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
	3196	ok = FALSE;
	3197	}
	3198	}
	3199	else if (UTF8_IS_SURROGATE(s)) {
	3200	if (ckWARN_d(WARN_SURROGATE)) {
	3201	UV uv = utf8_to_uvchr(s, &char_len);
	3202	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	3203	"Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
	3204	ok = FALSE;
	3205	}
	3206	}
	3207	else if
	3208	((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
	3209	&& (ckWARN_d(WARN_NONCHAR)))
	3210	{
	3211	UV uv = utf8_to_uvchr(s, &char_len);
	3212	Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
	3213	"Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
	3214	ok = FALSE;
	3215	}
	3216	}
	3217	s += UTF8SKIP(s);
	3218	}
	3219
	3220	return ok;
	3221	}
	3222
	3223	/*
	3224	=for apidoc pv_uni_display
	3225
	3226	Build to the scalar dsv a displayable version of the string spv,
	3227	length len, the displayable version being at most pvlim bytes long
	3228	(if longer, the rest is truncated and "..." will be appended).
	3229
	3230	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	3231	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	3232	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	3233	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	3234	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	3235	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	3236
	3237	The pointer to the PV of the dsv is returned.
	3238
	3239	=cut */
	3240	char *
	3241	Perl_pv_uni_display(pTHX_ SV dsv, const U8 spv, STRLEN len, STRLEN pvlim, UV flags)
	3242	{
	3243	int truncated = 0;
	3244	const char s, e;
	3245
	3246	PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
	3247
	3248	sv_setpvs(dsv, "");
	3249	SvUTF8_off(dsv);
	3250	for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	3251	UV u;
	3252	/* This serves double duty as a flag and a character to print after
	3253	a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	3254	*/
	3255	char ok = 0;
	3256
	3257	if (pvlim && SvCUR(dsv) >= pvlim) {
	3258	truncated++;
	3259	break;
	3260	}
	3261	u = utf8_to_uvchr((U8*)s, 0);
	3262	if (u < 256) {
	3263	const unsigned char c = (unsigned char)u & 0xFF;
	3264	if (flags & UNI_DISPLAY_BACKSLASH) {
	3265	switch (c) {
	3266	case '\n':
	3267	ok = 'n'; break;
	3268	case '\r':
	3269	ok = 'r'; break;
	3270	case '\t':
	3271	ok = 't'; break;
	3272	case '\f':
	3273	ok = 'f'; break;
	3274	case '\a':
	3275	ok = 'a'; break;
	3276	case '\\':
	3277	ok = '\\'; break;
	3278	default: break;
	3279	}
	3280	if (ok) {
	3281	const char string = ok;
	3282	sv_catpvs(dsv, "\\");
	3283	sv_catpvn(dsv, &string, 1);
	3284	}
	3285	}
	3286	/* isPRINT() is the locale-blind version. */
	3287	if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	3288	const char string = c;
	3289	sv_catpvn(dsv, &string, 1);
	3290	ok = 1;
	3291	}
	3292	}
	3293	if (!ok)
	3294	Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	3295	}
	3296	if (truncated)
	3297	sv_catpvs(dsv, "...");
	3298
	3299	return SvPVX(dsv);
	3300	}
	3301
	3302	/*
	3303	=for apidoc sv_uni_display
	3304
	3305	Build to the scalar dsv a displayable version of the scalar sv,
	3306	the displayable version being at most pvlim bytes long
	3307	(if longer, the rest is truncated and "..." will be appended).
	3308
	3309	The flags argument is as in pv_uni_display().
	3310
	3311	The pointer to the PV of the dsv is returned.
	3312
	3313	=cut
	3314	*/
	3315	char *
	3316	Perl_sv_uni_display(pTHX_ SV dsv, SV ssv, STRLEN pvlim, UV flags)
	3317	{
	3318	PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
	3319
	3320	return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
	3321	SvCUR(ssv), pvlim, flags);
	3322	}
	3323
	3324	/*
	3325	=for apidoc foldEQ_utf8
	3326
	3327	Returns true if the leading portions of the strings s1 and s2 (either or both
	3328	of which may be in UTF-8) are the same case-insensitively; false otherwise.
	3329	How far into the strings to compare is determined by other input parameters.
	3330
	3331	If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
	3332	otherwise it is assumed to be in native 8-bit encoding. Correspondingly for u2
	3333	with respect to s2.
	3334
	3335	If the byte length l1 is non-zero, it says how far into s1 to check for fold
	3336	equality. In other words, s1+l1 will be used as a goal to reach. The
	3337	scan will not be considered to be a match unless the goal is reached, and
	3338	scanning won't continue past that goal. Correspondingly for l2 with respect to
	3339	s2.
	3340
	3341	If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
	3342	considered an end pointer beyond which scanning of s1 will not continue under
	3343	any circumstances. This means that if both l1 and pe1 are specified, and pe1
	3344	is less than s1+l1, the match will never be successful because it can never
	3345	get as far as its goal (and in fact is asserted against). Correspondingly for
	3346	pe2 with respect to s2.
	3347
	3348	At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be
	3349	non-zero), and if both do, both have to be
	3350	reached for a successful match. Also, if the fold of a character is multiple
	3351	characters, all of them must be matched (see tr21 reference below for
	3352	'folding').
	3353
	3354	Upon a successful match, if pe1 is non-NULL,
	3355	it will be set to point to the beginning of the I<next> character of s1 beyond
	3356	what was matched. Correspondingly for pe2 and s2.
	3357
	3358	For case-insensitiveness, the "casefolding" of Unicode is used
	3359	instead of upper/lowercasing both the characters, see
	3360	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	3361
	3362	=cut */
	3363
	3364	/* A flags parameter has been added which may change, and hence isn't
	3365	* externally documented. Currently it is:
	3366	* 0 for as-documented above
	3367	* FOLDEQ_UTF8_NOMIX_ASCII meaning that an ASCII character is never
	3368	* allowed to match a non-ASCII one, even if the latter folds to it
	3369	* FOLDEQ_UTF8_LOCALE meaning that locale rules are to be used for code
	3370	* points below 256; unicode rules for above 255; and
	3371	* folds that cross those boundaries are disallowed,
	3372	* like the NOMIX_ASCII option
	3373	* FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
	3374	* routine. This allows that step to be skipped.
	3375	* FOLDEQ_S2_ALREADY_FOLDED Similarly.
	3376	*/
	3377	I32
	3378	Perl_foldEQ_utf8_flags(pTHX_ const char s1, char pe1, register UV l1, bool u1, const char s2, char **pe2, register UV l2, bool u2, U32 flags)
	3379	{
	3380	dVAR;
	3381	register const U8 p1 = (const U8)s1; /* Point to current char */
	3382	register const U8 p2 = (const U8)s2;
	3383	register const U8 g1 = NULL; / goal for s1 */
	3384	register const U8 *g2 = NULL;
	3385	register const U8 e1 = NULL; / Don't scan s1 past this */
	3386	register U8 f1 = NULL; / Point to current folded */
	3387	register const U8 *e2 = NULL;
	3388	register U8 *f2 = NULL;
	3389	STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
	3390	U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	3391	U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	3392	U8 natbuf[2]; /* Holds native 8-bit char converted to utf8;
	3393	these always fit in 2 bytes */
	3394
	3395	PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
	3396
	3397	/* The algorithm requires that input with the flags on the first line of
	3398	* the assert not be pre-folded. */
	3399	assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII \| FOLDEQ_UTF8_LOCALE))
	3400	&& (flags & (FOLDEQ_S1_ALREADY_FOLDED \| FOLDEQ_S2_ALREADY_FOLDED))));
	3401
	3402	if (pe1) {
	3403	e1 = (U8*)pe1;
	3404	}
	3405
	3406	if (l1) {
	3407	g1 = (const U8*)s1 + l1;
	3408	}
	3409
	3410	if (pe2) {
	3411	e2 = (U8*)pe2;
	3412	}
	3413
	3414	if (l2) {
	3415	g2 = (const U8*)s2 + l2;
	3416	}
	3417
	3418	/* Must have at least one goal */
	3419	assert(g1 \|\| g2);
	3420
	3421	if (g1) {
	3422
	3423	/* Will never match if goal is out-of-bounds */
	3424	assert(! e1 \|\| e1 >= g1);
	3425
	3426	/* Here, there isn't an end pointer, or it is beyond the goal. We
	3427	* only go as far as the goal */
	3428	e1 = g1;
	3429	}
	3430	else {
	3431	assert(e1); /* Must have an end for looking at s1 */
	3432	}
	3433
	3434	/* Same for goal for s2 */
	3435	if (g2) {
	3436	assert(! e2 \|\| e2 >= g2);
	3437	e2 = g2;
	3438	}
	3439	else {
	3440	assert(e2);
	3441	}
	3442
	3443	/* If both operands are already folded, we could just do a memEQ on the
	3444	* whole strings at once, but it would be better if the caller realized
	3445	* this and didn't even call us */
	3446
	3447	/* Look through both strings, a character at a time */
	3448	while (p1 < e1 && p2 < e2) {
	3449
	3450	/* If at the beginning of a new character in s1, get its fold to use
	3451	* and the length of the fold. (exception: locale rules just get the
	3452	* character to a single byte) */
	3453	if (n1 == 0) {
	3454	if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
	3455	f1 = (U8 *) p1;
	3456	n1 = UTF8SKIP(f1);
	3457
	3458	/* If in locale matching, we use two sets of rules, depending on if
	3459	* the code point is above or below 255. Here, we test for and
	3460	* handle locale rules */
	3461	}
	3462	else {
	3463	if ((flags & FOLDEQ_UTF8_LOCALE)
	3464	&& (! u1 \|\| UTF8_IS_INVARIANT(*p1)
	3465	\|\| UTF8_IS_DOWNGRADEABLE_START(*p1)))
	3466	{
	3467	/* There is no mixing of code points above and below 255. */
	3468	if (u2 && (! UTF8_IS_INVARIANT(*p2)
	3469	&& ! UTF8_IS_DOWNGRADEABLE_START(*p2)))
	3470	{
	3471	return 0;
	3472	}
	3473
	3474	/* We handle locale rules by converting, if necessary, the
	3475	* code point to a single byte. */
	3476	if (! u1 \|\| UTF8_IS_INVARIANT(*p1)) {
	3477	foldbuf1 = p1;
	3478	}
	3479	else {
	3480	foldbuf1 = TWO_BYTE_UTF8_TO_UNI(p1, *(p1 + 1));
	3481	}
	3482	n1 = 1;
	3483	}
	3484	else if (isASCII(p1)) { / Note, that here won't be
	3485	both ASCII and using locale
	3486	rules */
	3487
	3488	/* If trying to mix non- with ASCII, and not supposed to,
	3489	* fail */
	3490	if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
	3491	return 0;
	3492	}
	3493	n1 = 1;
	3494	foldbuf1 = toLOWER(p1); /* Folds in the ASCII range are
	3495	just lowercased */
	3496	}
	3497	else if (u1) {
	3498	to_utf8_fold(p1, foldbuf1, &n1);
	3499	}
	3500	else { /* Not utf8, convert to it first and then get fold */
	3501	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
	3502	to_utf8_fold(natbuf, foldbuf1, &n1);
	3503	}
	3504	f1 = foldbuf1;
	3505	}
	3506	}
	3507
	3508	if (n2 == 0) { /* Same for s2 */
	3509	if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
	3510	f2 = (U8 *) p2;
	3511	n2 = UTF8SKIP(f2);
	3512	}
	3513	else {
	3514	if ((flags & FOLDEQ_UTF8_LOCALE)
	3515	&& (! u2 \|\| UTF8_IS_INVARIANT(p2) \|\| UTF8_IS_DOWNGRADEABLE_START(p2)))
	3516	{
	3517	/* Here, the next char in s2 is < 256. We've already
	3518	* worked on s1, and if it isn't also < 256, can't match */
	3519	if (u1 && (! UTF8_IS_INVARIANT(*p1)
	3520	&& ! UTF8_IS_DOWNGRADEABLE_START(*p1)))
	3521	{
	3522	return 0;
	3523	}
	3524	if (! u2 \|\| UTF8_IS_INVARIANT(*p2)) {
	3525	foldbuf2 = p2;
	3526	}
	3527	else {
	3528	foldbuf2 = TWO_BYTE_UTF8_TO_UNI(p2, *(p2 + 1));
	3529	}
	3530
	3531	/* Use another function to handle locale rules. We've made
	3532	* sure that both characters to compare are single bytes */
	3533	if (! foldEQ_locale((char ) f1, (char ) foldbuf2, 1)) {
	3534	return 0;
	3535	}
	3536	n1 = n2 = 0;
	3537	}
	3538	else if (isASCII(*p2)) {
	3539	if (flags && ! isASCII(*p1)) {
	3540	return 0;
	3541	}
	3542	n2 = 1;
	3543	foldbuf2 = toLOWER(p2);
	3544	}
	3545	else if (u2) {
	3546	to_utf8_fold(p2, foldbuf2, &n2);
	3547	}
	3548	else {
	3549	uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
	3550	to_utf8_fold(natbuf, foldbuf2, &n2);
	3551	}
	3552	f2 = foldbuf2;
	3553	}
	3554	}
	3555
	3556	/* Here f1 and f2 point to the beginning of the strings to compare.
	3557	* These strings are the folds of the next character from each input
	3558	* string, stored in utf8. */
	3559
	3560	/* While there is more to look for in both folds, see if they
	3561	* continue to match */
	3562	while (n1 && n2) {
	3563	U8 fold_length = UTF8SKIP(f1);
	3564	if (fold_length != UTF8SKIP(f2)
	3565	\|\| (fold_length == 1 && f1 != f2) /* Short circuit memNE
	3566	function call for single
	3567	character */
	3568	\|\| memNE((char)f1, (char)f2, fold_length))
	3569	{
	3570	return 0; /* mismatch */
	3571	}
	3572
	3573	/* Here, they matched, advance past them */
	3574	n1 -= fold_length;
	3575	f1 += fold_length;
	3576	n2 -= fold_length;
	3577	f2 += fold_length;
	3578	}
	3579
	3580	/* When reach the end of any fold, advance the input past it */
	3581	if (n1 == 0) {
	3582	p1 += u1 ? UTF8SKIP(p1) : 1;
	3583	}
	3584	if (n2 == 0) {
	3585	p2 += u2 ? UTF8SKIP(p2) : 1;
	3586	}
	3587	} /* End of loop through both strings */
	3588
	3589	/* A match is defined by each scan that specified an explicit length
	3590	* reaching its final goal, and the other not having matched a partial
	3591	* character (which can happen when the fold of a character is more than one
	3592	* character). */
	3593	if (! ((g1 == 0 \|\| p1 == g1) && (g2 == 0 \|\| p2 == g2)) \|\| n1 \|\| n2) {
	3594	return 0;
	3595	}
	3596
	3597	/* Successful match. Set output pointers */
	3598	if (pe1) {
	3599	pe1 = (char)p1;
	3600	}
	3601	if (pe2) {
	3602	pe2 = (char)p2;
	3603	}
	3604	return 1;
	3605	}
	3606
	3607	/*
	3608	* Local variables:
	3609	* c-indentation-style: bsd
	3610	* c-basic-offset: 4
	3611	* indent-tabs-mode: t
	3612	* End:
	3613	*
	3614	* ex: set ts=8 sts=4 sw=4 noet:
	3615	*/