perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
	4	* by Larry Wall and others
	5	*
	6	* You may distribute under the terms of either the GNU General Public
	7	* License or the Artistic License, as specified in the README file.
	8	*
	9	*/
	10
	11	/*
	12	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	13	* heard of that we don't want to see any closer; and that's the one place
	14	* we're trying to get to! And that's just where we can't get, nohow.'
	15	*
	16	* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
	17	*
	18	* 'Well do I understand your speech,' he answered in the same language;
	19	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	20	* as is the custom in the West, if you wish to be answered?'
	21	* --Gandalf, addressing Théoden's door wardens
	22	*
	23	* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	24	*
	25	* ...the travellers perceived that the floor was paved with stones of many
	26	* hues; branching runes and strange devices intertwined beneath their feet.
	27	*
	28	* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
	29	*/
	30
	31	#include "EXTERN.h"
	32	#define PERL_IN_UTF8_C
	33	#include "perl.h"
	34
	35	#ifndef EBCDIC
	36	/* Separate prototypes needed because in ASCII systems these are
	37	* usually macros but they still are compiled as code, too. */
	38	PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags);
	39	PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
	40	#endif
	41
	42	static const char unees[] =
	43	"Malformed UTF-8 character (unexpected end of string)";
	44
	45	/*
	46	=head1 Unicode Support
	47
	48	This file contains various utility functions for manipulating UTF8-encoded
	49	strings. For the uninitiated, this is a method of representing arbitrary
	50	Unicode characters as a variable number of bytes, in such a way that
	51	characters in the ASCII range are unmodified, and a zero byte never appears
	52	within non-zero characters.
	53
	54	=cut
	55	*/
	56
	57	/*
	58	=for apidoc is_ascii_string
	59
	60	Returns true if the first C<len> bytes of the given string are the same whether
	61	or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
	62	is, if they are invariant. On ASCII-ish machines, only ASCII characters
	63	fit this definition, hence the function's name.
	64
	65	If C<len> is 0, it will be calculated using C<strlen(s)>.
	66
	67	See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
	68
	69	=cut
	70	*/
	71
	72	bool
	73	Perl_is_ascii_string(const U8 *s, STRLEN len)
	74	{
	75	const U8* const send = s + (len ? len : strlen((const char *)s));
	76	const U8* x = s;
	77
	78	PERL_ARGS_ASSERT_IS_ASCII_STRING;
	79
	80	for (; x < send; ++x) {
	81	if (!UTF8_IS_INVARIANT(*x))
	82	break;
	83	}
	84
	85	return x == send;
	86	}
	87
	88	/*
	89	=for apidoc uvuni_to_utf8_flags
	90
	91	Adds the UTF-8 representation of the code point C<uv> to the end
	92	of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
	93	bytes available. The return value is the pointer to the byte after the
	94	end of the new character. In other words,
	95
	96	d = uvuni_to_utf8_flags(d, uv, flags);
	97
	98	or, in most cases,
	99
	100	d = uvuni_to_utf8(d, uv);
	101
	102	(which is equivalent to)
	103
	104	d = uvuni_to_utf8_flags(d, uv, 0);
	105
	106	This is the recommended Unicode-aware way of saying
	107
	108	*(d++) = uv;
	109
	110	This function will convert to UTF-8 (and not warn) even code points that aren't
	111	legal Unicode or are problematic, unless C<flags> contains one or more of the
	112	following flags.
	113	If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
	114	the function will raise a warning, provided UTF8 warnings are enabled. If instead
	115	UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
	116	If both flags are set, the function will both warn and return NULL.
	117
	118	The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
	119	affect how the function handles a Unicode non-character. And, likewise for the
	120	UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, and code points that are
	121	above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
	122	even less portable) can be warned and/or disallowed even if other above-Unicode
	123	code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
	124	flags.
	125
	126	And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
	127	above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
	128	DISALLOW flags.
	129
	130
	131	=cut
	132	*/
	133
	134	U8 *
	135	Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	136	{
	137	PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
	138
	139	if (ckWARN_d(WARN_UTF8)) {
	140	if (UNICODE_IS_SURROGATE(uv)) {
	141	if (flags & UNICODE_WARN_SURROGATE) {
	142	Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
	143	"UTF-16 surrogate U+%04"UVXf, uv);
	144	}
	145	if (flags & UNICODE_DISALLOW_SURROGATE) {
	146	return NULL;
	147	}
	148	}
	149	else if (UNICODE_IS_SUPER(uv)) {
	150	if (flags & UNICODE_WARN_SUPER
	151	\|\| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
	152	{
	153	Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
	154	"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
	155	}
	156	if (flags & UNICODE_DISALLOW_SUPER
	157	\|\| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
	158	{
	159	return NULL;
	160	}
	161	}
	162	else if (UNICODE_IS_NONCHAR(uv)) {
	163	if (flags & UNICODE_WARN_NONCHAR) {
	164	Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
	165	"Unicode non-character U+%04"UVXf" is illegal for open interchange",
	166	uv);
	167	}
	168	if (flags & UNICODE_DISALLOW_NONCHAR) {
	169	return NULL;
	170	}
	171	}
	172	}
	173	if (UNI_IS_INVARIANT(uv)) {
	174	*d++ = (U8)UTF_TO_NATIVE(uv);
	175	return d;
	176	}
	177	#if defined(EBCDIC)
	178	else {
	179	STRLEN len = UNISKIP(uv);
	180	U8 *p = d+len-1;
	181	while (p > d) {
	182	*p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	183	uv >>= UTF_ACCUMULATION_SHIFT;
	184	}
	185	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	186	return d+len;
	187	}
	188	#else /* Non loop style */
	189	if (uv < 0x800) {
	190	*d++ = (U8)(( uv >> 6) \| 0xc0);
	191	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	192	return d;
	193	}
	194	if (uv < 0x10000) {
	195	*d++ = (U8)(( uv >> 12) \| 0xe0);
	196	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	197	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	198	return d;
	199	}
	200	if (uv < 0x200000) {
	201	*d++ = (U8)(( uv >> 18) \| 0xf0);
	202	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	203	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	204	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	205	return d;
	206	}
	207	if (uv < 0x4000000) {
	208	*d++ = (U8)(( uv >> 24) \| 0xf8);
	209	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	210	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	211	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	212	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	213	return d;
	214	}
	215	if (uv < 0x80000000) {
	216	*d++ = (U8)(( uv >> 30) \| 0xfc);
	217	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	218	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	219	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	220	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	221	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	222	return d;
	223	}
	224	#ifdef HAS_QUAD
	225	if (uv < UTF8_QUAD_MAX)
	226	#endif
	227	{
	228	d++ = 0xfe; / Can't match U+FEFF! */
	229	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	230	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	231	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	232	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	233	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	234	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	235	return d;
	236	}
	237	#ifdef HAS_QUAD
	238	{
	239	d++ = 0xff; / Can't match U+FFFE! */
	240	d++ = 0x80; / 6 Reserved bits */
	241	d++ = (U8)(((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	242	*d++ = (U8)(((uv >> 54) & 0x3f) \| 0x80);
	243	*d++ = (U8)(((uv >> 48) & 0x3f) \| 0x80);
	244	*d++ = (U8)(((uv >> 42) & 0x3f) \| 0x80);
	245	*d++ = (U8)(((uv >> 36) & 0x3f) \| 0x80);
	246	*d++ = (U8)(((uv >> 30) & 0x3f) \| 0x80);
	247	*d++ = (U8)(((uv >> 24) & 0x3f) \| 0x80);
	248	*d++ = (U8)(((uv >> 18) & 0x3f) \| 0x80);
	249	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	250	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	251	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	252	return d;
	253	}
	254	#endif
	255	#endif /* Loop style */
	256	}
	257
	258	/*
	259
	260	Tests if some arbitrary number of bytes begins in a valid UTF-8
	261	character. Note that an INVARIANT (i.e. ASCII) character is a valid
	262	UTF-8 character. The actual number of bytes in the UTF-8 character
	263	will be returned if it is valid, otherwise 0.
	264
	265	This is the "slow" version as opposed to the "fast" version which is
	266	the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
	267	difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
	268	or less you should use the IS_UTF8_CHAR(), for lengths of five or more
	269	you should use the _slow(). In practice this means that the _slow()
	270	will be used very rarely, since the maximum Unicode code point (as of
	271	Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
	272	the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
	273	five bytes or more.
	274
	275	=cut */
	276	STATIC STRLEN
	277	S_is_utf8_char_slow(const U8 *s, const STRLEN len)
	278	{
	279	U8 u = *s;
	280	STRLEN slen;
	281	UV uv, ouv;
	282
	283	PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
	284
	285	if (UTF8_IS_INVARIANT(u))
	286	return 1;
	287
	288	if (!UTF8_IS_START(u))
	289	return 0;
	290
	291	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	292	return 0;
	293
	294	slen = len - 1;
	295	s++;
	296	#ifdef EBCDIC
	297	u = NATIVE_TO_UTF(u);
	298	#endif
	299	u &= UTF_START_MASK(len);
	300	uv = u;
	301	ouv = uv;
	302	while (slen--) {
	303	if (!UTF8_IS_CONTINUATION(*s))
	304	return 0;
	305	uv = UTF8_ACCUMULATE(uv, *s);
	306	if (uv < ouv)
	307	return 0;
	308	ouv = uv;
	309	s++;
	310	}
	311
	312	if ((STRLEN)UNISKIP(uv) < len)
	313	return 0;
	314
	315	return len;
	316	}
	317
	318	/*
	319	=for apidoc is_utf8_char
	320
	321	Tests if some arbitrary number of bytes begins in a valid UTF-8
	322	character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
	323	character is a valid UTF-8 character. The actual number of bytes in the UTF-8
	324	character will be returned if it is valid, otherwise 0.
	325
	326	WARNING: use only if you know that C<s> has at least either UTF8_MAXBYTES or
	327	UTF8SKIP(s) bytes.
	328
	329	=cut */
	330	STRLEN
	331	Perl_is_utf8_char(const U8 *s)
	332	{
	333	const STRLEN len = UTF8SKIP(s);
	334
	335	PERL_ARGS_ASSERT_IS_UTF8_CHAR;
	336	#ifdef IS_UTF8_CHAR
	337	if (IS_UTF8_CHAR_FAST(len))
	338	return IS_UTF8_CHAR(s, len) ? len : 0;
	339	#endif /* #ifdef IS_UTF8_CHAR */
	340	return is_utf8_char_slow(s, len);
	341	}
	342
	343
	344	/*
	345	=for apidoc is_utf8_string
	346
	347	Returns true if first C<len> bytes of the given string form a valid
	348	UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
	349	using C<strlen(s)> (which means if you use this option, that C<s> has to have a
	350	terminating NUL byte). Note that all characters being ASCII constitute 'a
	351	valid UTF-8 string'.
	352
	353	See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
	354
	355	=cut
	356	*/
	357
	358	bool
	359	Perl_is_utf8_string(const U8 *s, STRLEN len)
	360	{
	361	const U8* const send = s + (len ? len : strlen((const char *)s));
	362	const U8* x = s;
	363
	364	PERL_ARGS_ASSERT_IS_UTF8_STRING;
	365
	366	while (x < send) {
	367	/* Inline the easy bits of is_utf8_char() here for speed... */
	368	if (UTF8_IS_INVARIANT(*x)) {
	369	x++;
	370	}
	371	else if (!UTF8_IS_START(*x))
	372	return FALSE;
	373	else {
	374	/* ... and call is_utf8_char() only if really needed. */
	375	const STRLEN c = UTF8SKIP(x);
	376	const U8* const next_char_ptr = x + c;
	377
	378	if (next_char_ptr > send) {
	379	return FALSE;
	380	}
	381
	382	if (IS_UTF8_CHAR_FAST(c)) {
	383	if (!IS_UTF8_CHAR(x, c))
	384	return FALSE;
	385	}
	386	else if (! is_utf8_char_slow(x, c)) {
	387	return FALSE;
	388	}
	389	x = next_char_ptr;
	390	}
	391	}
	392
	393	return TRUE;
	394	}
	395
	396	/*
	397	Implemented as a macro in utf8.h
	398
	399	=for apidoc is_utf8_string_loc
	400
	401	Like is_utf8_string() but stores the location of the failure (in the
	402	case of "utf8ness failure") or the location s+len (in the case of
	403	"utf8ness success") in the C<ep>.
	404
	405	See also is_utf8_string_loclen() and is_utf8_string().
	406
	407	=for apidoc is_utf8_string_loclen
	408
	409	Like is_utf8_string() but stores the location of the failure (in the
	410	case of "utf8ness failure") or the location s+len (in the case of
	411	"utf8ness success") in the C<ep>, and the number of UTF-8
	412	encoded characters in the C<el>.
	413
	414	See also is_utf8_string_loc() and is_utf8_string().
	415
	416	=cut
	417	*/
	418
	419	bool
	420	Perl_is_utf8_string_loclen(const U8 s, STRLEN len, const U8 ep, STRLEN el)
	421	{
	422	const U8* const send = s + (len ? len : strlen((const char *)s));
	423	const U8* x = s;
	424	STRLEN c;
	425	STRLEN outlen = 0;
	426
	427	PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
	428
	429	while (x < send) {
	430	const U8* next_char_ptr;
	431
	432	/* Inline the easy bits of is_utf8_char() here for speed... */
	433	if (UTF8_IS_INVARIANT(*x))
	434	next_char_ptr = x + 1;
	435	else if (!UTF8_IS_START(*x))
	436	goto out;
	437	else {
	438	/* ... and call is_utf8_char() only if really needed. */
	439	c = UTF8SKIP(x);
	440	next_char_ptr = c + x;
	441	if (next_char_ptr > send) {
	442	goto out;
	443	}
	444	if (IS_UTF8_CHAR_FAST(c)) {
	445	if (!IS_UTF8_CHAR(x, c))
	446	c = 0;
	447	} else
	448	c = is_utf8_char_slow(x, c);
	449	if (!c)
	450	goto out;
	451	}
	452	x = next_char_ptr;
	453	outlen++;
	454	}
	455
	456	out:
	457	if (el)
	458	*el = outlen;
	459
	460	if (ep)
	461	*ep = x;
	462	return (x == send);
	463	}
	464
	465	/*
	466
	467	=for apidoc utf8n_to_uvuni
	468
	469	Bottom level UTF-8 decode routine.
	470	Returns the code point value of the first character in the string C<s>
	471	which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding and no longer than
	472	C<curlen> bytes; C<retlen> will be set to the length, in bytes, of that
	473	character.
	474
	475	The value of C<flags> determines the behavior when C<s> does not point to a
	476	well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
	477	C<retlen> is set to the expected length of the UTF-8 character in bytes, zero
	478	is returned, and if UTF-8 warnings haven't been lexically disabled, a warning
	479	is raised.
	480
	481	Various ALLOW flags can be set in C<flags> to allow (and not warn on)
	482	individual types of malformations, such as the sequence being overlong (that
	483	is, when there is a shorter sequence that can express the same code point;
	484	overlong sequences are expressly forbidden in the UTF-8 standard due to
	485	potential security issues). Another malformation example is the first byte of
	486	a character not being a legal first byte. See F<utf8.h> for the list of such
	487	flags. Of course, the value returned by this function under such conditions is
	488	not reliable.
	489
	490	The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
	491	flags) malformation is found. If this flag is set, the routine assumes that
	492	the caller will raise a warning, and this function will silently just set
	493	C<retlen> to C<-1> and return zero.
	494
	495	Certain code points are considered problematic. These are Unicode surrogates,
	496	Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
	497	By default these are considered regular code points, but certain situations
	498	warrant special handling for them. If C<flags> contains
	499	UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
	500	malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE,
	501	UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
	502	maximum) can be set to disallow these categories individually.
	503
	504	The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
	505	UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
	506	for their respective categories, but otherwise the code points are considered
	507	valid (not malformations). To get a category to both be treated as a
	508	malformation and raise a warning, specify both the WARN and DISALLOW flags.
	509	(But note that warnings are not raised if lexically disabled nor if
	510	UTF8_CHECK_ONLY is also specified.)
	511
	512	Very large code points (above 0x7FFF_FFFF) are considered more problematic than
	513	the others that are above the Unicode legal maximum. There are several
	514	reasons, one of which is that the original UTF-8 specification never went above
	515	this number (the current 0x10FFFF limit was imposed later). The UTF-8 encoding
	516	on ASCII platforms for these large code points begins with a byte containing
	517	0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to be treated as
	518	malformations, while allowing smaller above-Unicode code points. (Of course
	519	UTF8_DISALLOW_SUPER will treat all above-Unicode code points, including these,
	520	as malformations.) Similarly, UTF8_WARN_FE_FF acts just like the other WARN
	521	flags, but applies just to these code points.
	522
	523	All other code points corresponding to Unicode characters, including private
	524	use and those yet to be assigned, are never considered malformed and never
	525	warn.
	526
	527	Most code should use utf8_to_uvchr() rather than call this directly.
	528
	529	=cut
	530	*/
	531
	532	UV
	533	Perl_utf8n_to_uvuni(pTHX_ const U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	534	{
	535	dVAR;
	536	const U8 * const s0 = s;
	537	UV uv = *s, ouv = 0;
	538	STRLEN len = 1;
	539	bool dowarn = ckWARN_d(WARN_UTF8);
	540	const UV startbyte = *s;
	541	STRLEN expectlen = 0;
	542	U32 warning = 0;
	543	SV* sv = NULL;
	544
	545	PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
	546
	547	/* This list is a superset of the UTF8_ALLOW_XXX. */
	548
	549	#define UTF8_WARN_EMPTY 1
	550	#define UTF8_WARN_CONTINUATION 2
	551	#define UTF8_WARN_NON_CONTINUATION 3
	552	#define UTF8_WARN_SHORT 4
	553	#define UTF8_WARN_OVERFLOW 5
	554	#define UTF8_WARN_LONG 6
	555
	556	if (curlen == 0 &&
	557	!(flags & UTF8_ALLOW_EMPTY)) {
	558	warning = UTF8_WARN_EMPTY;
	559	goto malformed;
	560	}
	561
	562	if (UTF8_IS_INVARIANT(uv)) {
	563	if (retlen)
	564	*retlen = 1;
	565	return (UV) (NATIVE_TO_UTF(*s));
	566	}
	567
	568	if (UTF8_IS_CONTINUATION(uv) &&
	569	!(flags & UTF8_ALLOW_CONTINUATION)) {
	570	warning = UTF8_WARN_CONTINUATION;
	571	goto malformed;
	572	}
	573
	574	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	575	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	576	warning = UTF8_WARN_NON_CONTINUATION;
	577	goto malformed;
	578	}
	579
	580	#ifdef EBCDIC
	581	uv = NATIVE_TO_UTF(uv);
	582	#else
	583	if (uv == 0xfe \|\| uv == 0xff) {
	584	if (flags & (UTF8_WARN_SUPER\|UTF8_WARN_FE_FF)) {
	585	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
	586	flags &= ~UTF8_WARN_SUPER; /* Only warn once on this problem */
	587	}
	588	if (flags & (UTF8_DISALLOW_SUPER\|UTF8_DISALLOW_FE_FF)) {
	589	goto malformed;
	590	}
	591	}
	592	#endif
	593
	594	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	595	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	596	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	597	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	598	#ifdef EBCDIC
	599	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	600	else { len = 7; uv &= 0x01; }
	601	#else
	602	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	603	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	604	else { len = 13; uv = 0; } /* whoa! */
	605	#endif
	606
	607	if (retlen)
	608	*retlen = len;
	609
	610	expectlen = len;
	611
	612	if ((curlen < expectlen) &&
	613	!(flags & UTF8_ALLOW_SHORT)) {
	614	warning = UTF8_WARN_SHORT;
	615	goto malformed;
	616	}
	617
	618	len--;
	619	s++;
	620	ouv = uv; /* ouv is the value from the previous iteration */
	621
	622	while (len--) {
	623	if (!UTF8_IS_CONTINUATION(*s) &&
	624	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	625	s--;
	626	warning = UTF8_WARN_NON_CONTINUATION;
	627	goto malformed;
	628	}
	629	else
	630	uv = UTF8_ACCUMULATE(uv, *s);
	631	if (!(uv > ouv)) { /* If the value didn't grow from the previous
	632	iteration, something is horribly wrong */
	633	/* These cannot be allowed. */
	634	if (uv == ouv) {
	635	if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
	636	warning = UTF8_WARN_LONG;
	637	goto malformed;
	638	}
	639	}
	640	else { /* uv < ouv */
	641	/* This cannot be allowed. */
	642	warning = UTF8_WARN_OVERFLOW;
	643	goto malformed;
	644	}
	645	}
	646	s++;
	647	ouv = uv;
	648	}
	649
	650	if ((expectlen > (STRLEN)UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) {
	651	warning = UTF8_WARN_LONG;
	652	goto malformed;
	653	} else if (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE\|UTF8_WARN_ILLEGAL_INTERCHANGE)) {
	654	if (UNICODE_IS_SURROGATE(uv)) {
	655	if ((flags & (UTF8_WARN_SURROGATE\|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE) {
	656	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
	657	}
	658	if (flags & UTF8_DISALLOW_SURROGATE) {
	659	goto disallowed;
	660	}
	661	}
	662	else if (UNICODE_IS_NONCHAR(uv)) {
	663	if ((flags & (UTF8_WARN_NONCHAR\|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR ) {
	664	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
	665	}
	666	if (flags & UTF8_DISALLOW_NONCHAR) {
	667	goto disallowed;
	668	}
	669	}
	670	else if ((uv > PERL_UNICODE_MAX)) {
	671	if ((flags & (UTF8_WARN_SUPER\|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER) {
	672	sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
	673	}
	674	if (flags & UTF8_DISALLOW_SUPER) {
	675	goto disallowed;
	676	}
	677	}
	678
	679	/* Here, this is not considered a malformed character, so drop through
	680	* to return it */
	681	}
	682
	683	return uv;
	684
	685	disallowed: /* Is disallowed, but otherwise not malformed. 'sv' will have been
	686	set if there is to be a warning. */
	687	if (!sv) {
	688	dowarn = 0;
	689	}
	690
	691	malformed:
	692
	693	if (flags & UTF8_CHECK_ONLY) {
	694	if (retlen)
	695	*retlen = ((STRLEN) -1);
	696	return 0;
	697	}
	698
	699	if (dowarn) {
	700	if (! sv) {
	701	sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
	702	}
	703
	704	switch (warning) {
	705	case 0: /* Intentionally empty. */ break;
	706	case UTF8_WARN_EMPTY:
	707	sv_catpvs(sv, "(empty string)");
	708	break;
	709	case UTF8_WARN_CONTINUATION:
	710	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
	711	break;
	712	case UTF8_WARN_NON_CONTINUATION:
	713	if (s == s0)
	714	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
	715	(UV)s[1], startbyte);
	716	else {
	717	const int len = (int)(s-s0);
	718	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
	719	(UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
	720	}
	721
	722	break;
	723	case UTF8_WARN_SHORT:
	724	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	725	(int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
	726	expectlen = curlen; /* distance for caller to skip */
	727	break;
	728	case UTF8_WARN_OVERFLOW:
	729	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
	730	ouv, *s, startbyte);
	731	break;
	732	case UTF8_WARN_LONG:
	733	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
	734	(int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
	735	break;
	736	default:
	737	sv_catpvs(sv, "(unknown reason)");
	738	break;
	739	}
	740
	741	if (sv) {
	742	const char * const s = SvPVX_const(sv);
	743
	744	if (PL_op)
	745	Perl_warner(aTHX_ packWARN(WARN_UTF8),
	746	"%s in %s", s, OP_DESC(PL_op));
	747	else
	748	Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
	749	}
	750	}
	751
	752	if (retlen)
	753	*retlen = expectlen ? expectlen : len;
	754
	755	return 0;
	756	}
	757
	758	/*
	759	=for apidoc utf8_to_uvchr
	760
	761	Returns the native code point of the first character in the string C<s>
	762	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	763	length, in bytes, of that character.
	764
	765	If C<s> does not point to a well-formed UTF-8 character, zero is
	766	returned and retlen is set, if possible, to -1.
	767
	768	=cut
	769	*/
	770
	771
	772	UV
	773	Perl_utf8_to_uvchr(pTHX_ const U8 s, STRLEN retlen)
	774	{
	775	PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
	776
	777	return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
	778	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	779	}
	780
	781	/*
	782	=for apidoc utf8_to_uvuni
	783
	784	Returns the Unicode code point of the first character in the string C<s>
	785	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	786	length, in bytes, of that character.
	787
	788	This function should only be used when the returned UV is considered
	789	an index into the Unicode semantic tables (e.g. swashes).
	790
	791	If C<s> does not point to a well-formed UTF-8 character, zero is
	792	returned and retlen is set, if possible, to -1.
	793
	794	=cut
	795	*/
	796
	797	UV
	798	Perl_utf8_to_uvuni(pTHX_ const U8 s, STRLEN retlen)
	799	{
	800	PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
	801
	802	/* Call the low level routine asking for checks */
	803	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
	804	ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
	805	}
	806
	807	/*
	808	=for apidoc utf8_length
	809
	810	Return the length of the UTF-8 char encoded string C<s> in characters.
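	811	Stops at C<e>; the byte at C<e> itself is not examined. If C<e E<lt> s> or if the scan would end
	812	up past C<e>, raises a warning in the C<utf8> category (it does not croak) and returns the number of characters that end at or before C<e>.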
	813
	814	=cut
	815	*/
	816
	817	STRLEN
	818	Perl_utf8_length(pTHX_ const U8 s, const U8 e)
	819	{
	820	dVAR;
	821	STRLEN len = 0;
	822
	823	PERL_ARGS_ASSERT_UTF8_LENGTH;
	824
	825	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	826	* the bitops (especially ~) can create illegal UTF-8.
	827	* In other words: in Perl UTF-8 is not just for Unicode. */
	828
	829	if (e < s)
	830	goto warn_and_return;
	831	while (s < e) {
	832	if (!UTF8_IS_INVARIANT(*s))
	833	s += UTF8SKIP(s);
	834	else
	835	s++;
	836	len++;
	837	}
	838
	839	if (e != s) {
	840	len--;
	841	warn_and_return:
	842	if (PL_op)
	843	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	844	"%s in %s", unees, OP_DESC(PL_op));
	845	else
	846	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	847	}
	848
	849	return len;
	850	}
	851
	852	/*
	853	=for apidoc utf8_distance
	854
	855	Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
	856	and C<b>.
	857
	858	WARNING: use only if you know that the pointers point inside the
	859	same UTF-8 buffer.
	860
	861	=cut
	862	*/
	863
	864	IV
	865	Perl_utf8_distance(pTHX_ const U8 a, const U8 b)
	866	{
	867	PERL_ARGS_ASSERT_UTF8_DISTANCE;
	868
	869	return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
	870	}
	871
	872	/*
	873	=for apidoc utf8_hop
	874
	875	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	876	forward or backward.
	877
	878	WARNING: do not use the following unless you know C<off> is within
	879	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	880	on the first byte of character or just after the last byte of a character.
	881
	882	=cut
	883	*/
	884
	885	U8 *
	886	Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
	887	{
	888	PERL_ARGS_ASSERT_UTF8_HOP;
	889
	890	PERL_UNUSED_CONTEXT;
	891	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	892	* the bitops (especially ~) can create illegal UTF-8.
	893	* In other words: in Perl UTF-8 is not just for Unicode. */
	894
	895	if (off >= 0) {
	896	while (off--)
	897	s += UTF8SKIP(s);
	898	}
	899	else {
	900	while (off++) {
	901	s--;
	902	while (UTF8_IS_CONTINUATION(*s))
	903	s--;
	904	}
	905	}
	906	return (U8 *)s;
	907	}
	908
	909	/*
	910	=for apidoc bytes_cmp_utf8
	911
	912	Compares the sequence of characters (stored as octets) in b, blen with the
	913	sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
	914	equal, -1 or -2 if the first string is less than the second string, +1 or +2
	915	if the first string is greater than the second string.
	916
	917	-1 or +1 is returned if the shorter string was identical to the start of the
	918	longer string. -2 or +2 is returned if there was a difference between characters
	919	within the strings.
	920
	921	=cut
	922	*/
	923
	924	int
	925	Perl_bytes_cmp_utf8(pTHX_ const U8 b, STRLEN blen, const U8 u, STRLEN ulen)
	926	{
	927	const U8 *const bend = b + blen;
	928	const U8 *const uend = u + ulen;
	929
	930	PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
	931
	932	PERL_UNUSED_CONTEXT;
	933
	934	while (b < bend && u < uend) {
	935	U8 c = *u++;
	936	if (!UTF8_IS_INVARIANT(c)) {
	937	if (UTF8_IS_DOWNGRADEABLE_START(c)) {
	938	if (u < uend) {
	939	U8 c1 = *u++;
	940	if (UTF8_IS_CONTINUATION(c1)) {
	941	c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, c1));
	942	} else {
	943	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	944	"Malformed UTF-8 character "
	945	"(unexpected non-continuation byte 0x%02x"
	946	", immediately after start byte 0x%02x)"
	947	/* Dear diag.t, it's in the pod. */
	948	"%s%s", c1, c,
	949	PL_op ? " in " : "",
	950	PL_op ? OP_DESC(PL_op) : "");
	951	return -2;
	952	}
	953	} else {
	954	if (PL_op)
	955	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	956	"%s in %s", unees, OP_DESC(PL_op));
	957	else
	958	Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
	959	return -2; /* Really want to return undef :-) */
	960	}
	961	} else {
	962	return -2;
	963	}
	964	}
	965	if (*b != c) {
	966	return *b < c ? -2 : +2;
	967	}
	968	++b;
	969	}
	970
	971	if (b == bend && u == uend)
	972	return 0;
	973
	974	return b < bend ? +1 : -1;
	975	}
	976
	977	/*
	978	=for apidoc utf8_to_bytes
	979
	980	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	981	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	982	updates len to contain the new length.
	983	Returns zero on failure, setting C<len> to -1.
	984
	985	If you need a copy of the string, see C<bytes_from_utf8>.
	986
	987	=cut
	988	*/
	989
	990	U8 *
	991	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	992	{
	993	U8 * const save = s;
	994	U8 * const send = s + *len;
	995	U8 *d;
	996
	997	PERL_ARGS_ASSERT_UTF8_TO_BYTES;
	998
	999	/* ensure valid UTF-8 and chars < 256 before updating string */
	1000	while (s < send) {
	1001	U8 c = *s++;
	1002
	1003	if (!UTF8_IS_INVARIANT(c) &&
	1004	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	1005	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	1006	*len = ((STRLEN) -1);
	1007	return 0;
	1008	}
	1009	}
	1010
	1011	d = s = save;
	1012	while (s < send) {
	1013	STRLEN ulen;
	1014	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	1015	s += ulen;
	1016	}
	1017	*d = '\0';
	1018	*len = d - save;
	1019	return save;
	1020	}
	1021
	1022	/*
	1023	=for apidoc bytes_from_utf8
	1024
	1025	Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
	1026	Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	1027	the newly-created string, and updates C<len> to contain the new
	1028	length. Returns the original string if no conversion occurs, C<len>
	1029	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	1030	0 if C<s> is converted or consisted entirely of characters that are invariant
	1031	in utf8 (i.e., US-ASCII on non-EBCDIC machines).
	1032
	1033	=cut
	1034	*/
	1035
	1036	U8 *
	1037	Perl_bytes_from_utf8(pTHX_ const U8 s, STRLEN len, bool *is_utf8)
	1038	{
	1039	U8 *d;
	1040	const U8 *start = s;
	1041	const U8 *send;
	1042	I32 count = 0;
	1043
	1044	PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
	1045
	1046	PERL_UNUSED_CONTEXT;
	1047	if (!*is_utf8)
	1048	return (U8 *)start;
	1049
	1050	/* ensure valid UTF-8 and chars < 256 before converting string */
	1051	for (send = s + *len; s < send;) {
	1052	U8 c = *s++;
	1053	if (!UTF8_IS_INVARIANT(c)) {
	1054	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	1055	(c = *s++) && UTF8_IS_CONTINUATION(c))
	1056	count++;
	1057	else
	1058	return (U8 *)start;
	1059	}
	1060	}
	1061
	1062	*is_utf8 = FALSE;
	1063
	1064	Newx(d, (*len) - count + 1, U8);
	1065	s = start; start = d;
	1066	while (s < send) {
	1067	U8 c = *s++;
	1068	if (!UTF8_IS_INVARIANT(c)) {
	1069	/* Then it is two-byte encoded */
	1070	c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++));
	1071	}
	1072	*d++ = c;
	1073	}
	1074	*d = '\0';
	1075	*len = d - start;
	1076	return (U8 *)start;
	1077	}
	1078
	1079	/*
	1080	=for apidoc bytes_to_utf8
	1081
	1082	Converts a string C<s> of length C<len> bytes from the native encoding into
	1083	UTF-8.
	1084	Returns a pointer to the newly-created string, and sets C<len> to
	1085	reflect the new length in bytes.
	1086
	1087	A NUL character will be written after the end of the string.
	1088
	1089	If you want to convert to UTF-8 from encodings other than
	1090	the native (Latin1 or EBCDIC),
	1091	see sv_recode_to_utf8().
	1092
	1093	=cut
	1094	*/
	1095
	1096	/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
	1097	likewise need duplication. */
	1098
	1099	U8*
	1100	Perl_bytes_to_utf8(pTHX_ const U8 s, STRLEN len)
	1101	{
	1102	const U8 * const send = s + (*len);
	1103	U8 *d;
	1104	U8 *dst;
	1105
	1106	PERL_ARGS_ASSERT_BYTES_TO_UTF8;
	1107	PERL_UNUSED_CONTEXT;
	1108
	1109	Newx(d, (len) 2 + 1, U8);
	1110	dst = d;
	1111
	1112	while (s < send) {
	1113	const UV uv = NATIVE_TO_ASCII(*s++);
	1114	if (UNI_IS_INVARIANT(uv))
	1115	*d++ = (U8)UTF_TO_NATIVE(uv);
	1116	else {
	1117	*d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
	1118	*d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
	1119	}
	1120	}
	1121	*d = '\0';
	1122	*len = d-dst;
	1123	return dst;
	1124	}
	1125
	1126	/*
	1127	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	1128	*
	1129	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	1130	* We optimize for native, for obvious reasons. */
	1131
	1132	U8*
	1133	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1134	{
	1135	U8* pend;
	1136	U8* dstart = d;
	1137
	1138	PERL_ARGS_ASSERT_UTF16_TO_UTF8;
	1139
	1140	if (bytelen & 1)
	1141	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
	1142
	1143	pend = p + bytelen;
	1144
	1145	while (p < pend) {
	1146	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	1147	p += 2;
	1148	if (uv < 0x80) {
	1149	#ifdef EBCDIC
	1150	*d++ = UNI_TO_NATIVE(uv);
	1151	#else
	1152	*d++ = (U8)uv;
	1153	#endif
	1154	continue;
	1155	}
	1156	if (uv < 0x800) {
	1157	*d++ = (U8)(( uv >> 6) \| 0xc0);
	1158	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1159	continue;
	1160	}
	1161	if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */
	1162	if (p >= pend) {
	1163	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1164	} else {
	1165	UV low = (p[0] << 8) + p[1];
	1166	p += 2;
	1167	if (low < 0xdc00 \|\| low > 0xdfff)
	1168	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1169	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	1170	}
	1171	} else if (uv >= 0xdc00 && uv <= 0xdfff) {
	1172	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	1173	}
	1174	if (uv < 0x10000) {
	1175	*d++ = (U8)(( uv >> 12) \| 0xe0);
	1176	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1177	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1178	continue;
	1179	}
	1180	else {
	1181	*d++ = (U8)(( uv >> 18) \| 0xf0);
	1182	*d++ = (U8)(((uv >> 12) & 0x3f) \| 0x80);
	1183	*d++ = (U8)(((uv >> 6) & 0x3f) \| 0x80);
	1184	*d++ = (U8)(( uv & 0x3f) \| 0x80);
	1185	continue;
	1186	}
	1187	}
	1188	*newlen = d - dstart;
	1189	return d;
	1190	}
	1191
	1192	/* Note: this one is slightly destructive of the source. */
	1193
	1194	U8*
	1195	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	1196	{
	1197	U8* s = (U8*)p;
	1198	U8* const send = s + bytelen;
	1199
	1200	PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
	1201
	1202	if (bytelen & 1)
	1203	Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
	1204	(UV)bytelen);
	1205
	1206	while (s < send) {
	1207	const U8 tmp = s[0];
	1208	s[0] = s[1];
	1209	s[1] = tmp;
	1210	s += 2;
	1211	}
	1212	return utf16_to_utf8(p, d, bytelen, newlen);
	1213	}
	1214
	1215	/* for now these are all defined (inefficiently) in terms of the utf8 versions.
	1216	* Note that the macros in handy.h that call these short-circuit calling them
	1217	* for Latin-1 range inputs */
	1218
	1219	bool
	1220	Perl_is_uni_alnum(pTHX_ UV c)
	1221	{
	1222	U8 tmpbuf[UTF8_MAXBYTES+1];
	1223	uvchr_to_utf8(tmpbuf, c);
	1224	return is_utf8_alnum(tmpbuf);
	1225	}
	1226
	1227	bool
	1228	Perl_is_uni_idfirst(pTHX_ UV c)
	1229	{
	1230	U8 tmpbuf[UTF8_MAXBYTES+1];
	1231	uvchr_to_utf8(tmpbuf, c);
	1232	return is_utf8_idfirst(tmpbuf);
	1233	}
	1234
	1235	bool
	1236	Perl_is_uni_alpha(pTHX_ UV c)
	1237	{
	1238	U8 tmpbuf[UTF8_MAXBYTES+1];
	1239	uvchr_to_utf8(tmpbuf, c);
	1240	return is_utf8_alpha(tmpbuf);
	1241	}
	1242
	1243	bool
	1244	Perl_is_uni_ascii(pTHX_ UV c)
	1245	{
	1246	return isASCII(c);
	1247	}
	1248
	1249	bool
	1250	Perl_is_uni_space(pTHX_ UV c)
	1251	{
	1252	U8 tmpbuf[UTF8_MAXBYTES+1];
	1253	uvchr_to_utf8(tmpbuf, c);
	1254	return is_utf8_space(tmpbuf);
	1255	}
	1256
	1257	bool
	1258	Perl_is_uni_digit(pTHX_ UV c)
	1259	{
	1260	U8 tmpbuf[UTF8_MAXBYTES+1];
	1261	uvchr_to_utf8(tmpbuf, c);
	1262	return is_utf8_digit(tmpbuf);
	1263	}
	1264
	1265	bool
	1266	Perl_is_uni_upper(pTHX_ UV c)
	1267	{
	1268	U8 tmpbuf[UTF8_MAXBYTES+1];
	1269	uvchr_to_utf8(tmpbuf, c);
	1270	return is_utf8_upper(tmpbuf);
	1271	}
	1272
	1273	bool
	1274	Perl_is_uni_lower(pTHX_ UV c)
	1275	{
	1276	U8 tmpbuf[UTF8_MAXBYTES+1];
	1277	uvchr_to_utf8(tmpbuf, c);
	1278	return is_utf8_lower(tmpbuf);
	1279	}
	1280
	1281	bool
	1282	Perl_is_uni_cntrl(pTHX_ UV c)
	1283	{
	1284	return isCNTRL_L1(c);
	1285	}
	1286
	1287	bool
	1288	Perl_is_uni_graph(pTHX_ UV c)
	1289	{
	1290	U8 tmpbuf[UTF8_MAXBYTES+1];
	1291	uvchr_to_utf8(tmpbuf, c);
	1292	return is_utf8_graph(tmpbuf);
	1293	}
	1294
	1295	bool
	1296	Perl_is_uni_print(pTHX_ UV c)
	1297	{
	1298	U8 tmpbuf[UTF8_MAXBYTES+1];
	1299	uvchr_to_utf8(tmpbuf, c);
	1300	return is_utf8_print(tmpbuf);
	1301	}
	1302
	1303	bool
	1304	Perl_is_uni_punct(pTHX_ UV c)
	1305	{
	1306	U8 tmpbuf[UTF8_MAXBYTES+1];
	1307	uvchr_to_utf8(tmpbuf, c);
	1308	return is_utf8_punct(tmpbuf);
	1309	}
	1310
	1311	bool
	1312	Perl_is_uni_xdigit(pTHX_ UV c)
	1313	{
	1314	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1315	uvchr_to_utf8(tmpbuf, c);
	1316	return is_utf8_xdigit(tmpbuf);
	1317	}
	1318
	1319	UV
	1320	Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
	1321	{
	1322	/* We have the latin1-range values compiled into the core, so just use
	1323	* those, converting the result to utf8. The only difference between upper
	1324	* and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
	1325	* either "SS" or "Ss". Which one to use is passed into the routine in
	1326	* 'S_or_s' to avoid a test */
	1327
	1328	UV converted = toUPPER_LATIN1_MOD(c);
	1329
	1330	PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
	1331
	1332	assert(S_or_s == 'S' \|\| S_or_s == 's');
	1333
	1334	if (UNI_IS_INVARIANT(converted)) { /* No difference between the two for
	1335	characters in this range */
	1336	*p = (U8) converted;
	1337	*lenp = 1;
	1338	return converted;
	1339	}
	1340
	1341	/* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
	1342	* which it maps to one of them, so as to only have to have one check for
	1343	* it in the main case */
	1344	if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
	1345	switch (c) {
	1346	case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
	1347	converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
	1348	break;
	1349	case MICRO_SIGN:
	1350	converted = GREEK_CAPITAL_LETTER_MU;
	1351	break;
	1352	case LATIN_SMALL_LETTER_SHARP_S:
	1353	*(p)++ = 'S';
	1354	*p = S_or_s;
	1355	*lenp = 2;
	1356	return 'S';
	1357	default:
	1358	Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
	1359	/* NOTREACHED */
	1360	}
	1361	}
	1362
	1363	*(p)++ = UTF8_TWO_BYTE_HI(converted);
	1364	*p = UTF8_TWO_BYTE_LO(converted);
	1365	*lenp = 2;
	1366
	1367	return converted;
	1368	}
	1369
	1370	/* Call the function to convert a UTF-8 encoded character to the specified case.
	1371	* Note that there may be more than one character in the result.
	1372	* INP is a pointer to the first byte of the input character
	1373	* OUTP will be set to the first byte of the string of changed characters. It
	1374	* needs to have space for UTF8_MAXBYTES_CASE+1 bytes
	1375	* LENP will be set to the length in bytes of the string of changed characters
	1376	*
	1377	* The functions return the ordinal of the first character in the string of OUTP */
	1378	#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUpper")
	1379	#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTitle")
	1380	#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLower")
	1381
	1382	/* This additionally has the input parameter SPECIALS, which if non-zero will
	1383	* cause this to use the SPECIALS hash for folding (meaning get full case
	1384	* folding); otherwise, when zero, this implies a simple case fold */
	1385	#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecFold" : NULL)
	1386
	1387	UV
	1388	Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
	1389	{
	1390	dVAR;
	1391
	1392	/* Convert the Unicode character whose ordinal is c to its uppercase
	1393	* version and store that in UTF-8 in p and its length in bytes in lenp.
	1394	* Note that the p needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	1395	* the changed version may be longer than the original character.
	1396	*
	1397	* The ordinal of the first character of the changed version is returned
	1398	* (but note, as explained above, that there may be more.) */
	1399
	1400	PERL_ARGS_ASSERT_TO_UNI_UPPER;
	1401
	1402	if (c < 256) {
	1403	return _to_upper_title_latin1((U8) c, p, lenp, 'S');
	1404	}
	1405
	1406	uvchr_to_utf8(p, c);
	1407	return CALL_UPPER_CASE(p, p, lenp);
	1408	}
	1409
	1410	UV
	1411	Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
	1412	{
	1413	dVAR;
	1414
	1415	PERL_ARGS_ASSERT_TO_UNI_TITLE;
	1416
	1417	if (c < 256) {
	1418	return _to_upper_title_latin1((U8) c, p, lenp, 's');
	1419	}
	1420
	1421	uvchr_to_utf8(p, c);
	1422	return CALL_TITLE_CASE(p, p, lenp);
	1423	}
	1424
	1425	STATIC U8
	1426	S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
	1427	{
	1428	/* We have the latin1-range values compiled into the core, so just use
	1429	* those, converting the result to utf8. Since the result is always just
	1430	* one character, we allow p to be NULL */
	1431
	1432	U8 converted = toLOWER_LATIN1(c);
	1433
	1434	if (p != NULL) {
	1435	if (UNI_IS_INVARIANT(converted)) {
	1436	*p = converted;
	1437	*lenp = 1;
	1438	}
	1439	else {
	1440	*p = UTF8_TWO_BYTE_HI(converted);
	1441	*(p+1) = UTF8_TWO_BYTE_LO(converted);
	1442	*lenp = 2;
	1443	}
	1444	}
	1445	return converted;
	1446	}
	1447
	1448	UV
	1449	Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
	1450	{
	1451	dVAR;
	1452
	1453	PERL_ARGS_ASSERT_TO_UNI_LOWER;
	1454
	1455	if (c < 256) {
	1456	return to_lower_latin1((U8) c, p, lenp);
	1457	}
	1458
	1459	uvchr_to_utf8(p, c);
	1460	return CALL_LOWER_CASE(p, p, lenp);
	1461	}
	1462
	1463	UV
	1464	Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const bool flags)
	1465	{
	1466	/* Corresponds to to_lower_latin1(), flags is TRUE if to use full case
	1467	* folding */
	1468
	1469	UV converted;
	1470
	1471	PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
	1472
	1473	if (c == MICRO_SIGN) {
	1474	converted = GREEK_SMALL_LETTER_MU;
	1475	}
	1476	else if (flags && c == LATIN_SMALL_LETTER_SHARP_S) {
	1477	*(p)++ = 's';
	1478	*p = 's';
	1479	*lenp = 2;
	1480	return 's';
	1481	}
	1482	else { /* In this range the fold of all other characters is their lower
	1483	case */
	1484	converted = toLOWER_LATIN1(c);
	1485	}
	1486
	1487	if (UNI_IS_INVARIANT(converted)) {
	1488	*p = (U8) converted;
	1489	*lenp = 1;
	1490	}
	1491	else {
	1492	*(p)++ = UTF8_TWO_BYTE_HI(converted);
	1493	*p = UTF8_TWO_BYTE_LO(converted);
	1494	*lenp = 2;
	1495	}
	1496
	1497	return converted;
	1498	}
	1499
	1500	UV
	1501	Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const bool flags)
	1502	{
	1503
	1504	/* Not currently externally documented, and subject to change, <flags> is
	1505	* TRUE iff full folding is to be used */
	1506
	1507	PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
	1508
	1509	if (c < 256) {
	1510	return _to_fold_latin1((U8) c, p, lenp, flags);
	1511	}
	1512
	1513	uvchr_to_utf8(p, c);
	1514	return CALL_FOLD_CASE(p, p, lenp, flags);
	1515	}
	1516
	1517	/* for now these all assume no locale info available for Unicode > 255 */
	1518
	1519	bool
	1520	Perl_is_uni_alnum_lc(pTHX_ UV c)
	1521	{
	1522	return is_uni_alnum(c); /* XXX no locale support yet */
	1523	}
	1524
	1525	bool
	1526	Perl_is_uni_idfirst_lc(pTHX_ UV c)
	1527	{
	1528	return is_uni_idfirst(c); /* XXX no locale support yet */
	1529	}
	1530
	1531	bool
	1532	Perl_is_uni_alpha_lc(pTHX_ UV c)
	1533	{
	1534	return is_uni_alpha(c); /* XXX no locale support yet */
	1535	}
	1536
	1537	bool
	1538	Perl_is_uni_ascii_lc(pTHX_ UV c)
	1539	{
	1540	return is_uni_ascii(c); /* XXX no locale support yet */
	1541	}
	1542
	1543	bool
	1544	Perl_is_uni_space_lc(pTHX_ UV c)
	1545	{
	1546	return is_uni_space(c); /* XXX no locale support yet */
	1547	}
	1548
	1549	bool
	1550	Perl_is_uni_digit_lc(pTHX_ UV c)
	1551	{
	1552	return is_uni_digit(c); /* XXX no locale support yet */
	1553	}
	1554
	1555	bool
	1556	Perl_is_uni_upper_lc(pTHX_ UV c)
	1557	{
	1558	return is_uni_upper(c); /* XXX no locale support yet */
	1559	}
	1560
	1561	bool
	1562	Perl_is_uni_lower_lc(pTHX_ UV c)
	1563	{
	1564	return is_uni_lower(c); /* XXX no locale support yet */
	1565	}
	1566
	1567	bool
	1568	Perl_is_uni_cntrl_lc(pTHX_ UV c)
	1569	{
	1570	return is_uni_cntrl(c); /* XXX no locale support yet */
	1571	}
	1572
	1573	bool
	1574	Perl_is_uni_graph_lc(pTHX_ UV c)
	1575	{
	1576	return is_uni_graph(c); /* XXX no locale support yet */
	1577	}
	1578
	1579	bool
	1580	Perl_is_uni_print_lc(pTHX_ UV c)
	1581	{
	1582	return is_uni_print(c); /* XXX no locale support yet */
	1583	}
	1584
	1585	bool
	1586	Perl_is_uni_punct_lc(pTHX_ UV c)
	1587	{
	1588	return is_uni_punct(c); /* XXX no locale support yet */
	1589	}
	1590
	1591	bool
	1592	Perl_is_uni_xdigit_lc(pTHX_ UV c)
	1593	{
	1594	return is_uni_xdigit(c); /* XXX no locale support yet */
	1595	}
	1596
	1597	U32
	1598	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1599	{
	1600	/* XXX returns only the first character -- do not use XXX */
	1601	/* XXX no locale support yet */
	1602	STRLEN len;
	1603	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1604	return (U32)to_uni_upper(c, tmpbuf, &len);
	1605	}
	1606
	1607	U32
	1608	Perl_to_uni_title_lc(pTHX_ U32 c)
	1609	{
	1610	/* XXX returns only the first character XXX -- do not use XXX */
	1611	/* XXX no locale support yet */
	1612	STRLEN len;
	1613	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1614	return (U32)to_uni_title(c, tmpbuf, &len);
	1615	}
	1616
	1617	U32
	1618	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1619	{
	1620	/* XXX returns only the first character -- do not use XXX */
	1621	/* XXX no locale support yet */
	1622	STRLEN len;
	1623	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	1624	return (U32)to_uni_lower(c, tmpbuf, &len);
	1625	}
	1626
	1627	static bool
	1628	S_is_utf8_common(pTHX_ const U8 const p, SV *swash,
	1629	const char *const swashname)
	1630	{
	1631	dVAR;
	1632
	1633	PERL_ARGS_ASSERT_IS_UTF8_COMMON;
	1634
	1635	if (!is_utf8_char(p))
	1636	return FALSE;
	1637	if (!*swash)
	1638	*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
	1639	return swash_fetch(*swash, p, TRUE) != 0;
	1640	}
	1641
	1642	bool
	1643	Perl_is_utf8_alnum(pTHX_ const U8 *p)
	1644	{
	1645	dVAR;
	1646
	1647	PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
	1648
	1649	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1650	* descendant of isalnum(3), in other words, it doesn't
	1651	* contain the '_'. --jhi */
	1652	return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
	1653	}
	1654
	1655	bool
	1656	Perl_is_utf8_idfirst(pTHX_ const U8 p) / The naming is historical. */
	1657	{
	1658	dVAR;
	1659
	1660	PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
	1661
	1662	if (*p == '_')
	1663	return TRUE;
	1664	/* is_utf8_idstart would be more logical. */
	1665	return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
	1666	}
	1667
	1668	bool
	1669	Perl_is_utf8_xidfirst(pTHX_ const U8 p) / The naming is historical. */
	1670	{
	1671	dVAR;
	1672
	1673	PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
	1674
	1675	if (*p == '_')
	1676	return TRUE;
	1677	/* is_utf8_idstart would be more logical. */
	1678	return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
	1679	}
	1680
	1681	bool
	1682	Perl__is_utf8__perl_idstart(pTHX_ const U8 *p)
	1683	{
	1684	dVAR;
	1685
	1686	PERL_ARGS_ASSERT__IS_UTF8__PERL_IDSTART;
	1687
	1688	return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart");
	1689	}
	1690
	1691	bool
	1692	Perl_is_utf8_idcont(pTHX_ const U8 *p)
	1693	{
	1694	dVAR;
	1695
	1696	PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
	1697
	1698	return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
	1699	}
	1700
	1701	bool
	1702	Perl_is_utf8_xidcont(pTHX_ const U8 *p)
	1703	{
	1704	dVAR;
	1705
	1706	PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
	1707
	1708	return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
	1709	}
	1710
	1711	bool
	1712	Perl_is_utf8_alpha(pTHX_ const U8 *p)
	1713	{
	1714	dVAR;
	1715
	1716	PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
	1717
	1718	return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
	1719	}
	1720
	1721	bool
	1722	Perl_is_utf8_ascii(pTHX_ const U8 *p)
	1723	{
	1724	dVAR;
	1725
	1726	PERL_ARGS_ASSERT_IS_UTF8_ASCII;
	1727
	1728	/* ASCII characters are the same whether in utf8 or not. So the macro
	1729	* works on both utf8 and non-utf8 representations. */
	1730	return isASCII(*p);
	1731	}
	1732
	1733	bool
	1734	Perl_is_utf8_space(pTHX_ const U8 *p)
	1735	{
	1736	dVAR;
	1737
	1738	PERL_ARGS_ASSERT_IS_UTF8_SPACE;
	1739
	1740	return is_utf8_common(p, &PL_utf8_space, "IsXPerlSpace");
	1741	}
	1742
	1743	bool
	1744	Perl_is_utf8_perl_space(pTHX_ const U8 *p)
	1745	{
	1746	dVAR;
	1747
	1748	PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
	1749
	1750	/* Only true if is an ASCII space-like character, and ASCII is invariant
	1751	* under utf8, so can just use the macro */
	1752	return isSPACE_A(*p);
	1753	}
	1754
	1755	bool
	1756	Perl_is_utf8_perl_word(pTHX_ const U8 *p)
	1757	{
	1758	dVAR;
	1759
	1760	PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
	1761
	1762	/* Only true if is an ASCII word character, and ASCII is invariant
	1763	* under utf8, so can just use the macro */
	1764	return isWORDCHAR_A(*p);
	1765	}
	1766
	1767	bool
	1768	Perl_is_utf8_digit(pTHX_ const U8 *p)
	1769	{
	1770	dVAR;
	1771
	1772	PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
	1773
	1774	return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
	1775	}
	1776
	1777	bool
	1778	Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
	1779	{
	1780	dVAR;
	1781
	1782	PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
	1783
	1784	/* Only true if is an ASCII digit character, and ASCII is invariant
	1785	* under utf8, so can just use the macro */
	1786	return isDIGIT_A(*p);
	1787	}
	1788
	1789	bool
	1790	Perl_is_utf8_upper(pTHX_ const U8 *p)
	1791	{
	1792	dVAR;
	1793
	1794	PERL_ARGS_ASSERT_IS_UTF8_UPPER;
	1795
	1796	return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
	1797	}
	1798
	1799	bool
	1800	Perl_is_utf8_lower(pTHX_ const U8 *p)
	1801	{
	1802	dVAR;
	1803
	1804	PERL_ARGS_ASSERT_IS_UTF8_LOWER;
	1805
	1806	return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
	1807	}
	1808
	1809	bool
	1810	Perl_is_utf8_cntrl(pTHX_ const U8 *p)
	1811	{
	1812	dVAR;
	1813
	1814	PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
	1815
	1816	if (isASCII(*p)) {
	1817	return isCNTRL_A(*p);
	1818	}
	1819
	1820	/* All controls are in Latin1 */
	1821	if (! UTF8_IS_DOWNGRADEABLE_START(*p)) {
	1822	return 0;
	1823	}
	1824	return isCNTRL_L1(TWO_BYTE_UTF8_TO_UNI(p, (p+1)));
	1825	}
	1826
	1827	bool
	1828	Perl_is_utf8_graph(pTHX_ const U8 *p)
	1829	{
	1830	dVAR;
	1831
	1832	PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
	1833
	1834	return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
	1835	}
	1836
	1837	bool
	1838	Perl_is_utf8_print(pTHX_ const U8 *p)
	1839	{
	1840	dVAR;
	1841
	1842	PERL_ARGS_ASSERT_IS_UTF8_PRINT;
	1843
	1844	return is_utf8_common(p, &PL_utf8_print, "IsPrint");
	1845	}
	1846
	1847	bool
	1848	Perl_is_utf8_punct(pTHX_ const U8 *p)
	1849	{
	1850	dVAR;
	1851
	1852	PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
	1853
	1854	return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
	1855	}
	1856
	1857	bool
	1858	Perl_is_utf8_xdigit(pTHX_ const U8 *p)
	1859	{
	1860	dVAR;
	1861
	1862	PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
	1863
	1864	return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
	1865	}
	1866
	1867	bool
	1868	Perl_is_utf8_mark(pTHX_ const U8 *p)
	1869	{
	1870	dVAR;
	1871
	1872	PERL_ARGS_ASSERT_IS_UTF8_MARK;
	1873
	1874	return is_utf8_common(p, &PL_utf8_mark, "IsM");
	1875	}
	1876
	1877	bool
	1878	Perl_is_utf8_X_begin(pTHX_ const U8 *p)
	1879	{
	1880	dVAR;
	1881
	1882	PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
	1883
	1884	return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
	1885	}
	1886
	1887	bool
	1888	Perl_is_utf8_X_extend(pTHX_ const U8 *p)
	1889	{
	1890	dVAR;
	1891
	1892	PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
	1893
	1894	return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
	1895	}
	1896
	1897	bool
	1898	Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
	1899	{
	1900	dVAR;
	1901
	1902	PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
	1903
	1904	return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
	1905	}
	1906
	1907	bool
	1908	Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
	1909	{
	1910	dVAR;
	1911
	1912	PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
	1913
	1914	return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
	1915	}
	1916
	1917	bool
	1918	Perl_is_utf8_X_L(pTHX_ const U8 *p)
	1919	{
	1920	dVAR;
	1921
	1922	PERL_ARGS_ASSERT_IS_UTF8_X_L;
	1923
	1924	return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
	1925	}
	1926
	1927	bool
	1928	Perl_is_utf8_X_LV(pTHX_ const U8 *p)
	1929	{
	1930	dVAR;
	1931
	1932	PERL_ARGS_ASSERT_IS_UTF8_X_LV;
	1933
	1934	return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
	1935	}
	1936
	1937	bool
	1938	Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
	1939	{
	1940	dVAR;
	1941
	1942	PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
	1943
	1944	return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
	1945	}
	1946
	1947	bool
	1948	Perl_is_utf8_X_T(pTHX_ const U8 *p)
	1949	{
	1950	dVAR;
	1951
	1952	PERL_ARGS_ASSERT_IS_UTF8_X_T;
	1953
	1954	return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
	1955	}
	1956
	1957	bool
	1958	Perl_is_utf8_X_V(pTHX_ const U8 *p)
	1959	{
	1960	dVAR;
	1961
	1962	PERL_ARGS_ASSERT_IS_UTF8_X_V;
	1963
	1964	return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
	1965	}
	1966
	1967	bool
	1968	Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
	1969	{
	1970	dVAR;
	1971
	1972	PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
	1973
	1974	return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
	1975	}
	1976
	1977	/*
	1978	=for apidoc to_utf8_case
	1979
	1980	The "p" contains the pointer to the UTF-8 string encoding
	1981	the character that is being converted.
	1982
	1983	The "ustrp" is a pointer to the character buffer to put the
	1984	conversion result to. The "lenp" is a pointer to the length
	1985	of the result.
	1986
	1987	The "swashp" is a pointer to the swash to use.
	1988
	1989	Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
	1990	and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
	1991	but not always, a multicharacter mapping), is tried first.
	1992
	1993	The "special" is a string like "utf8::ToSpecLower", which means the
	1994	hash %utf8::ToSpecLower. The access to the hash is through
	1995	Perl_to_utf8_case().
	1996
	1997	The "normal" is a string like "ToLower" which means the swash
	1998	%utf8::ToLower.
	1999
	2000	=cut */
	2001
	2002	UV
	2003	Perl_to_utf8_case(pTHX_ const U8 p, U8 ustrp, STRLEN *lenp,
	2004	SV *swashp, const char normal, const char *special)
	2005	{
	2006	dVAR;
	2007	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
	2008	STRLEN len = 0;
	2009	const UV uv0 = utf8_to_uvchr(p, NULL);
	2010	/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
	2011	* are necessary in EBCDIC, they are redundant no-ops
	2012	* in ASCII-ish platforms, and hopefully optimized away. */
	2013	const UV uv1 = NATIVE_TO_UNI(uv0);
	2014
	2015	PERL_ARGS_ASSERT_TO_UTF8_CASE;
	2016
	2017	/* Note that swash_fetch() doesn't output warnings for these because it
	2018	* assumes we will */
	2019	if (uv1 >= UNICODE_SURROGATE_FIRST) {
	2020	if (uv1 <= UNICODE_SURROGATE_LAST) {
	2021	if (ckWARN_d(WARN_SURROGATE)) {
	2022	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	2023	Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	2024	"Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
	2025	}
	2026	}
	2027	else if (UNICODE_IS_SUPER(uv1)) {
	2028	if (ckWARN_d(WARN_NON_UNICODE)) {
	2029	const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
	2030	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	2031	"Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
	2032	}
	2033	}
	2034
	2035	/* Note that non-characters are perfectly legal, so no warning should
	2036	* be given */
	2037	}
	2038
	2039	uvuni_to_utf8(tmpbuf, uv1);
	2040
	2041	if (!swashp) / load on-demand */
	2042	*swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
	2043
	2044	if (special) {
	2045	/* It might be "special" (sometimes, but not always,
	2046	* a multicharacter mapping) */
	2047	HV * const hv = get_hv(special, 0);
	2048	SV **svp;
	2049
	2050	if (hv &&
	2051	(svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
	2052	(*svp)) {
	2053	const char *s;
	2054
	2055	s = SvPV_const(*svp, len);
	2056	if (len == 1)
	2057	len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI((U8)s)) - ustrp;
	2058	else {
	2059	#ifdef EBCDIC
	2060	/* If we have EBCDIC we need to remap the characters
	2061	* since any characters in the low 256 are Unicode
	2062	* code points, not EBCDIC. */
	2063	U8 t = (U8)s, tend = t + len, d;
	2064
	2065	d = tmpbuf;
	2066	if (SvUTF8(*svp)) {
	2067	STRLEN tlen = 0;
	2068
	2069	while (t < tend) {
	2070	const UV c = utf8_to_uvchr(t, &tlen);
	2071	if (tlen > 0) {
	2072	d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
	2073	t += tlen;
	2074	}
	2075	else
	2076	break;
	2077	}
	2078	}
	2079	else {
	2080	while (t < tend) {
	2081	d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
	2082	t++;
	2083	}
	2084	}
	2085	len = d - tmpbuf;
	2086	Copy(tmpbuf, ustrp, len, U8);
	2087	#else
	2088	Copy(s, ustrp, len, U8);
	2089	#endif
	2090	}
	2091	}
	2092	}
	2093
	2094	if (!len && *swashp) {
	2095	const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
	2096
	2097	if (uv2) {
	2098	/* It was "normal" (a single character mapping). */
	2099	const UV uv3 = UNI_TO_NATIVE(uv2);
	2100	len = uvchr_to_utf8(ustrp, uv3) - ustrp;
	2101	}
	2102	}
	2103
	2104	if (!len) /* Neither: just copy. In other words, there was no mapping
	2105	defined, which means that the code point maps to itself */
	2106	len = uvchr_to_utf8(ustrp, uv0) - ustrp;
	2107
	2108	if (lenp)
	2109	*lenp = len;
	2110
	2111	return len ? utf8_to_uvchr(ustrp, 0) : 0;
	2112	}
	2113
	2114	STATIC UV
	2115	S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
	2116	{
	2117	/* This is called when changing the case of a utf8-encoded character above
	2118	* the Latin1 range, and the operation is in locale. If the result
	2119	* contains a character that crosses the 255/256 boundary, disallow the
	2120	* change, and return the original code point. See L<perlfunc/lc> for why;
	2121	*
	2122	* p points to the original string whose case was changed
	2123	* result the code point of the first character in the changed-case string
	2124	* ustrp points to the changed-case string (<result> represents its first char)
	2125	* lenp points to the length of <ustrp> */
	2126
	2127	UV original; /* To store the first code point of <p> */
	2128
	2129	PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
	2130
	2131	assert(! UTF8_IS_INVARIANT(p) && ! UTF8_IS_DOWNGRADEABLE_START(p));
	2132
	2133	/* We know immediately if the first character in the string crosses the
	2134	* boundary, so can skip */
	2135	if (result > 255) {
	2136
	2137	/* Look at every character in the result; if any cross the
	2138	* boundary, the whole thing is disallowed */
	2139	U8* s = ustrp + UTF8SKIP(ustrp);
	2140	U8* e = ustrp + *lenp;
	2141	while (s < e) {
	2142	if (UTF8_IS_INVARIANT(s) \|\| UTF8_IS_DOWNGRADEABLE_START(s))
	2143	{
	2144	goto bad_crossing;
	2145	}
	2146	s += UTF8SKIP(s);
	2147	}
	2148
	2149	/* Here, no characters crossed, result is ok as-is */
	2150	return result;
	2151	}
	2152
	2153	bad_crossing:
	2154
	2155	/* Failed, have to return the original */
	2156	original = utf8_to_uvchr(p, lenp);
	2157	Copy(p, ustrp, *lenp, char);
	2158	return original;
	2159	}
	2160
	2161	/*
	2162	=for apidoc to_utf8_upper
	2163
	2164	Convert the UTF-8 encoded character at p to its uppercase version and
	2165	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	2166	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
	2167	the uppercase version may be longer than the original character.
	2168
	2169	The first character of the uppercased version is returned
	2170	(but note, as explained above, that there may be more.)
	2171
	2172	=cut */
	2173
	2174	/* Not currently externally documented, and subject to change:
	2175	* <flags> is set iff locale semantics are to be used for code points < 256
	2176	* <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
	2177	* were used in the calculation; otherwise unchanged. */
	2178
	2179	UV
	2180	Perl__to_utf8_upper_flags(pTHX_ const U8 p, U8 ustrp, STRLEN lenp, const bool flags, bool tainted_ptr)
	2181	{
	2182	dVAR;
	2183
	2184	UV result;
	2185
	2186	PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
	2187
	2188	if (UTF8_IS_INVARIANT(*p)) {
	2189	if (flags) {
	2190	result = toUPPER_LC(*p);
	2191	}
	2192	else {
	2193	return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
	2194	}
	2195	}
	2196	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
	2197	if (flags) {
	2198	result = toUPPER_LC(TWO_BYTE_UTF8_TO_UNI(p, (p+1)));
	2199	}
	2200	else {
	2201	return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(p, (p+1)),
	2202	ustrp, lenp, 'S');
	2203	}
	2204	}
	2205	else { /* utf8, ord above 255 */
	2206	result = CALL_UPPER_CASE(p, ustrp, lenp);
	2207
	2208	if (flags) {
	2209	result = check_locale_boundary_crossing(p, result, ustrp, lenp);
	2210	}
	2211	return result;
	2212	}
	2213
	2214	/* Here, used locale rules. Convert back to utf8 */
	2215	if (UTF8_IS_INVARIANT(result)) {
	2216	*ustrp = (U8) result;
	2217	*lenp = 1;
	2218	}
	2219	else {
	2220	*ustrp = UTF8_EIGHT_BIT_HI(result);
	2221	*(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
	2222	*lenp = 2;
	2223	}
	2224
	2225	if (tainted_ptr) {
	2226	*tainted_ptr = TRUE;
	2227	}
	2228	return result;
	2229	}
	2230
	2231	/*
	2232	=for apidoc to_utf8_title
	2233
	2234	Convert the UTF-8 encoded character at p to its titlecase version and
	2235	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	2236	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	2237	titlecase version may be longer than the original character.
	2238
	2239	The first character of the titlecased version is returned
	2240	(but note, as explained above, that there may be more.)
	2241
	2242	=cut */
	2243
	2244	/* Not currently externally documented, and subject to change:
	2245	* <flags> is set iff locale semantics are to be used for code points < 256
	2246	* Since titlecase is not defined in POSIX, uppercase is used instead
	2247	* for these/
	2248	* <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
	2249	* were used in the calculation; otherwise unchanged. */
	2250
	2251	UV
	2252	Perl__to_utf8_title_flags(pTHX_ const U8 p, U8 ustrp, STRLEN lenp, const bool flags, bool tainted_ptr)
	2253	{
	2254	dVAR;
	2255
	2256	UV result;
	2257
	2258	PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
	2259
	2260	if (UTF8_IS_INVARIANT(*p)) {
	2261	if (flags) {
	2262	result = toUPPER_LC(*p);
	2263	}
	2264	else {
	2265	return _to_upper_title_latin1(*p, ustrp, lenp, 's');
	2266	}
	2267	}
	2268	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
	2269	if (flags) {
	2270	result = toUPPER_LC(TWO_BYTE_UTF8_TO_UNI(p, (p+1)));
	2271	}
	2272	else {
	2273	return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(p, (p+1)),
	2274	ustrp, lenp, 's');
	2275	}
	2276	}
	2277	else { /* utf8, ord above 255 */
	2278	result = CALL_TITLE_CASE(p, ustrp, lenp);
	2279
	2280	if (flags) {
	2281	result = check_locale_boundary_crossing(p, result, ustrp, lenp);
	2282	}
	2283	return result;
	2284	}
	2285
	2286	/* Here, used locale rules. Convert back to utf8 */
	2287	if (UTF8_IS_INVARIANT(result)) {
	2288	*ustrp = (U8) result;
	2289	*lenp = 1;
	2290	}
	2291	else {
	2292	*ustrp = UTF8_EIGHT_BIT_HI(result);
	2293	*(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
	2294	*lenp = 2;
	2295	}
	2296
	2297	if (tainted_ptr) {
	2298	*tainted_ptr = TRUE;
	2299	}
	2300	return result;
	2301	}
	2302
	2303	/*
	2304	=for apidoc to_utf8_lower
	2305
	2306	Convert the UTF-8 encoded character at p to its lowercase version and
	2307	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	2308	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	2309	lowercase version may be longer than the original character.
	2310
	2311	The first character of the lowercased version is returned
	2312	(but note, as explained above, that there may be more.)
	2313
	2314	=cut */
	2315
	2316	/* Not currently externally documented, and subject to change:
	2317	* <flags> is set iff locale semantics are to be used for code points < 256
	2318	* <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
	2319	* were used in the calculation; otherwise unchanged. */
	2320
	2321	UV
	2322	Perl__to_utf8_lower_flags(pTHX_ const U8 p, U8 ustrp, STRLEN lenp, const bool flags, bool tainted_ptr)
	2323	{
	2324	UV result;
	2325
	2326	dVAR;
	2327
	2328	PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
	2329
	2330	if (UTF8_IS_INVARIANT(*p)) {
	2331	if (flags) {
	2332	result = toLOWER_LC(*p);
	2333	}
	2334	else {
	2335	return to_lower_latin1(*p, ustrp, lenp);
	2336	}
	2337	}
	2338	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
	2339	if (flags) {
	2340	result = toLOWER_LC(TWO_BYTE_UTF8_TO_UNI(p, (p+1)));
	2341	}
	2342	else {
	2343	return to_lower_latin1(TWO_BYTE_UTF8_TO_UNI(p, (p+1)),
	2344	ustrp, lenp);
	2345	}
	2346	}
	2347	else { /* utf8, ord above 255 */
	2348	result = CALL_LOWER_CASE(p, ustrp, lenp);
	2349
	2350	if (flags) {
	2351	result = check_locale_boundary_crossing(p, result, ustrp, lenp);
	2352	}
	2353
	2354	return result;
	2355	}
	2356
	2357	/* Here, used locale rules. Convert back to utf8 */
	2358	if (UTF8_IS_INVARIANT(result)) {
	2359	*ustrp = (U8) result;
	2360	*lenp = 1;
	2361	}
	2362	else {
	2363	*ustrp = UTF8_EIGHT_BIT_HI(result);
	2364	*(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
	2365	*lenp = 2;
	2366	}
	2367
	2368	if (tainted_ptr) {
	2369	*tainted_ptr = TRUE;
	2370	}
	2371	return result;
	2372	}
	2373
	2374	/*
	2375	=for apidoc to_utf8_fold
	2376
	2377	Convert the UTF-8 encoded character at p to its foldcase version and
	2378	store that in UTF-8 in ustrp and its length in bytes in lenp. Note
	2379	that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
	2380	foldcase version may be longer than the original character (up to
	2381	three characters).
	2382
	2383	The first character of the foldcased version is returned
	2384	(but note, as explained above, that there may be more.)
	2385
	2386	=cut */
	2387
	2388	/* Not currently externally documented, and subject to change,
	2389	* in <flags>
	2390	* bit FOLD_FLAGS_LOCALE is set iff locale semantics are to be used for code
	2391	* points < 256. Since foldcase is not defined in
	2392	* POSIX, lowercase is used instead
	2393	* bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
	2394	* otherwise simple folds
	2395	* <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
	2396	* were used in the calculation; otherwise unchanged. */
	2397
	2398	UV
	2399	Perl__to_utf8_fold_flags(pTHX_ const U8 p, U8 ustrp, STRLEN lenp, U8 flags, bool tainted_ptr)
	2400	{
	2401	dVAR;
	2402
	2403	UV result;
	2404
	2405	PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
	2406
	2407	if (UTF8_IS_INVARIANT(*p)) {
	2408	if (flags & FOLD_FLAGS_LOCALE) {
	2409	result = toLOWER_LC(*p);
	2410	}
	2411	else {
	2412	return _to_fold_latin1(*p, ustrp, lenp,
	2413	cBOOL(flags & FOLD_FLAGS_FULL));
	2414	}
	2415	}
	2416	else if UTF8_IS_DOWNGRADEABLE_START(*p) {
	2417	if (flags & FOLD_FLAGS_LOCALE) {
	2418	result = toLOWER_LC(TWO_BYTE_UTF8_TO_UNI(p, (p+1)));
	2419	}
	2420	else {
	2421	return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(p, (p+1)),
	2422	ustrp, lenp, cBOOL(flags & FOLD_FLAGS_FULL));
	2423	}
	2424	}
	2425	else { /* utf8, ord above 255 */
	2426	result = CALL_FOLD_CASE(p, ustrp, lenp, flags);
	2427
	2428	if ((flags & FOLD_FLAGS_LOCALE)) {
	2429	result = check_locale_boundary_crossing(p, result, ustrp, lenp);
	2430	}
	2431
	2432	return result;
	2433	}
	2434
	2435	/* Here, used locale rules. Convert back to utf8 */
	2436	if (UTF8_IS_INVARIANT(result)) {
	2437	*ustrp = (U8) result;
	2438	*lenp = 1;
	2439	}
	2440	else {
	2441	*ustrp = UTF8_EIGHT_BIT_HI(result);
	2442	*(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
	2443	*lenp = 2;
	2444	}
	2445
	2446	if (tainted_ptr) {
	2447	*tainted_ptr = TRUE;
	2448	}
	2449	return result;
	2450	}
	2451
	2452	/* Note:
	2453	* Returns a "swash" which is a hash described in utf8.c:S_swash_fetch().
	2454	* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
	2455	* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
	2456	*/
	2457
	2458	SV*
	2459	Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
	2460	{
	2461	PERL_ARGS_ASSERT_SWASH_INIT;
	2462
	2463	/* Returns a copy of a swash initiated by the called function. This is the
	2464	* public interface, and returning a copy prevents others from doing
	2465	* mischief on the original */
	2466
	2467	return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, FALSE, NULL, FALSE));
	2468	}
	2469
	2470	SV*
	2471	Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV listsv, I32 minbits, I32 none, bool return_if_undef, SV invlist, bool passed_in_invlist_has_user_defined_property)
	2472	{
	2473	/* Initialize and return a swash, creating it if necessary. It does this
	2474	* by calling utf8_heavy.pl in the general case.
	2475	*
	2476	* This interface should only be used by functions that won't destroy or
	2477	* adversely change the swash, as doing so affects all other uses of the
	2478	* swash in the program; the general public should use 'Perl_swash_init'
	2479	* instead.
	2480	*
	2481	* pkg is the name of the package that <name> should be in.
	2482	* name is the name of the swash to find. Typically it is a Unicode
	2483	* property name, including user-defined ones
	2484	* listsv is a string to initialize the swash with. It must be of the form
	2485	* documented as the subroutine return value in
	2486	* L<perlunicode/User-Defined Character Properties>
	2487	* minbits is the number of bits required to represent each data element.
	2488	* It is '1' for binary properties.
	2489	* none I (khw) do not understand this one, but it is used only in tr///.
	2490	* return_if_undef is TRUE if the routine shouldn't croak if it can't find
	2491	* the requested property
	2492	* invlist is an inversion list to initialize the swash with (or NULL)
	2493	* has_user_defined_property is TRUE if <invlist> has some component that
	2494	* came from a user-defined property
	2495	*
	2496	* Thus there are three possible inputs to find the swash: <name>,
	2497	* <listsv>, and <invlist>. At least one must be specified. The result
	2498	* will be the union of the specified ones, although <listsv>'s various
	2499	* actions can intersect, etc. what <name> gives.
	2500	*
	2501	* <invlist> is only valid for binary properties */
	2502
	2503	dVAR;
	2504	SV* retval = &PL_sv_undef;
	2505
	2506	assert(listsv != &PL_sv_undef \|\| strNE(name, "") \|\| invlist);
	2507	assert(! invlist \|\| minbits == 1);
	2508
	2509	/* If data was passed in to go out to utf8_heavy to find the swash of, do
	2510	* so */
	2511	if (listsv != &PL_sv_undef \|\| strNE(name, "")) {
	2512	dSP;
	2513	const size_t pkg_len = strlen(pkg);
	2514	const size_t name_len = strlen(name);
	2515	HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
	2516	SV* errsv_save;
	2517	GV *method;
	2518
	2519	PERL_ARGS_ASSERT__CORE_SWASH_INIT;
	2520
	2521	PUSHSTACKi(PERLSI_MAGIC);
	2522	ENTER;
	2523	SAVEHINTS();
	2524	save_re_context();
	2525	if (PL_parser && PL_parser->error_count)
	2526	SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
	2527	method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
	2528	if (!method) { /* demand load utf8 */
	2529	ENTER;
	2530	errsv_save = newSVsv(ERRSV);
	2531	/* It is assumed that callers of this routine are not passing in
	2532	* any user derived data. */
	2533	/* Need to do this after save_re_context() as it will set
	2534	* PL_tainted to 1 while saving $1 etc (see the code after getrx:
	2535	* in Perl_magic_get). Even line to create errsv_save can turn on
	2536	* PL_tainted. */
	2537	SAVEBOOL(PL_tainted);
	2538	PL_tainted = 0;
	2539	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
	2540	NULL);
	2541	if (!SvTRUE(ERRSV))
	2542	sv_setsv(ERRSV, errsv_save);
	2543	SvREFCNT_dec(errsv_save);
	2544	LEAVE;
	2545	}
	2546	SPAGAIN;
	2547	PUSHMARK(SP);
	2548	EXTEND(SP,5);
	2549	mPUSHp(pkg, pkg_len);
	2550	mPUSHp(name, name_len);
	2551	PUSHs(listsv);
	2552	mPUSHi(minbits);
	2553	mPUSHi(none);
	2554	PUTBACK;
	2555	errsv_save = newSVsv(ERRSV);
	2556	/* If we already have a pointer to the method, no need to use
	2557	* call_method() to repeat the lookup. */
	2558	if (method ? call_sv(MUTABLE_SV(method), G_SCALAR)
	2559	: call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR \| G_METHOD))
	2560	{
	2561	retval = *PL_stack_sp--;
	2562	SvREFCNT_inc(retval);
	2563	}
	2564	if (!SvTRUE(ERRSV))
	2565	sv_setsv(ERRSV, errsv_save);
	2566	SvREFCNT_dec(errsv_save);
	2567	LEAVE;
	2568	POPSTACK;
	2569	if (IN_PERL_COMPILETIME) {
	2570	CopHINTS_set(PL_curcop, PL_hints);
	2571	}
	2572	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV) {
	2573	if (SvPOK(retval))
	2574
	2575	/* If caller wants to handle missing properties, let them */
	2576	if (return_if_undef) {
	2577	return NULL;
	2578	}
	2579	Perl_croak(aTHX_
	2580	"Can't find Unicode property definition \"%"SVf"\"",
	2581	SVfARG(retval));
	2582	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	2583	}
	2584	} /* End of calling the module to find the swash */
	2585
	2586	/* Make sure there is an inversion list for binary properties */
	2587	if (minbits == 1) {
	2588	SV** swash_invlistsvp = NULL;
	2589	SV* swash_invlist = NULL;
	2590	bool invlist_in_swash_is_valid = FALSE;
	2591	HV* swash_hv = NULL;
	2592
	2593	/* If this operation fetched a swash, get its already existing
	2594	* inversion list or create one for it */
	2595	if (retval != &PL_sv_undef) {
	2596	swash_hv = MUTABLE_HV(SvRV(retval));
	2597
	2598	swash_invlistsvp = hv_fetchs(swash_hv, "INVLIST", FALSE);
	2599	if (swash_invlistsvp) {
	2600	swash_invlist = *swash_invlistsvp;
	2601	invlist_in_swash_is_valid = TRUE;
	2602	}
	2603	else {
	2604	swash_invlist = _swash_to_invlist(retval);
	2605	}
	2606	}
	2607
	2608	/* If an inversion list was passed in, have to include it */
	2609	if (invlist) {
	2610
	2611	/* Any fetched swash will by now have an inversion list in it;
	2612	* otherwise <swash_invlist> will be NULL, indicating that we
	2613	* didn't fetch a swash */
	2614	if (swash_invlist) {
	2615
	2616	/* Add the passed-in inversion list, which invalidates the one
	2617	* already stored in the swash */
	2618	invlist_in_swash_is_valid = FALSE;
	2619	_invlist_union(invlist, swash_invlist, &swash_invlist);
	2620	}
	2621	else {
	2622
	2623	/* Here, there is no swash already. Set up a minimal one */
	2624	swash_hv = newHV();
	2625	retval = newRV_inc(MUTABLE_SV(swash_hv));
	2626	swash_invlist = invlist;
	2627	}
	2628
	2629	if (passed_in_invlist_has_user_defined_property) {
	2630	if (! hv_stores(swash_hv, "USER_DEFINED", newSVuv(1))) {
	2631	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	2632	}
	2633	}
	2634	}
	2635
	2636	/* Here, we have computed the union of all the passed-in data. It may
	2637	* be that there was an inversion list in the swash which didn't get
	2638	* touched; otherwise save the one computed one */
	2639	if (! invlist_in_swash_is_valid) {
	2640	if (! hv_stores(MUTABLE_HV(SvRV(retval)), "INVLIST", swash_invlist))
	2641	{
	2642	Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	2643	}
	2644	}
	2645	}
	2646
	2647	return retval;
	2648	}
	2649
	2650
	2651	/* This API is wrong for special case conversions since we may need to
	2652	* return several Unicode characters for a single Unicode character
	2653	* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
	2654	* the lower-level routine, and it is similarly broken for returning
	2655	* multiple values. --jhi
	2656	* For those, you should use to_utf8_case() instead */
	2657	/* Now SWASHGET is recast as S_swatch_get in this file. */
	2658
	2659	/* Note:
	2660	* Returns the value of property/mapping C<swash> for the first character
	2661	* of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
	2662	* assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
	2663	* assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
	2664	*
	2665	* A "swash" is a hash which contains initially the keys/values set up by
	2666	* SWASHNEW. The purpose is to be able to completely represent a Unicode
	2667	* property for all possible code points. Things are stored in a compact form
	2668	* (see utf8_heavy.pl) so that calculation is required to find the actual
	2669	* property value for a given code point. As code points are looked up, new
	2670	* key/value pairs are added to the hash, so that the calculation doesn't have
	2671	* to ever be re-done. Further, each calculation is done, not just for the
	2672	* desired one, but for a whole block of code points adjacent to that one.
	2673	* For binary properties on ASCII machines, the block is usually for 64 code
	2674	* points, starting with a code point evenly divisible by 64. Thus if the
	2675	* property value for code point 257 is requested, the code goes out and
	2676	* calculates the property values for all 64 code points between 256 and 319,
	2677	* and stores these as a single 64-bit long bit vector, called a "swatch",
	2678	* under the key for code point 256. The key is the UTF-8 encoding for code
	2679	* point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
	2680	* for a code point is 13 bytes, the key will be 12 bytes long. If the value
	2681	* for code point 258 is then requested, this code realizes that it would be
	2682	* stored under the key for 256, and would find that value and extract the
	2683	* relevant bit, offset from 256.
	2684	*
	2685	* Non-binary properties are stored in as many bits as necessary to represent
	2686	* their values (32 currently, though the code is more general than that), not
	2687	* as single bits, but the principle is the same: the value for each key is a
	2688	* vector that encompasses the property values for all code points whose UTF-8
	2689	* representations are represented by the key. That is, for all code points
	2690	* whose UTF-8 representations are length N bytes, and the key is the first N-1
	2691	* bytes of that.
	2692	*/
	2693	UV
	2694	Perl_swash_fetch(pTHX_ SV swash, const U8 ptr, bool do_utf8)
	2695	{
	2696	dVAR;
	2697	HV *const hv = MUTABLE_HV(SvRV(swash));
	2698	U32 klen;
	2699	U32 off;
	2700	STRLEN slen;
	2701	STRLEN needents;
	2702	const U8 *tmps = NULL;
	2703	U32 bit;
	2704	SV *swatch;
	2705	U8 tmputf8[2];
	2706	const UV c = NATIVE_TO_ASCII(*ptr);
	2707
	2708	PERL_ARGS_ASSERT_SWASH_FETCH;
	2709
	2710	/* Convert to utf8 if not already */
	2711	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	2712	tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
	2713	tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
	2714	ptr = tmputf8;
	2715	}
	2716	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	2717	* then the "swatch" is a vec() for all the chars which start
	2718	* with 0xAA..0xYY
	2719	* So the key in the hash (klen) is length of encoded char -1
	2720	*/
	2721	klen = UTF8SKIP(ptr) - 1;
	2722	off = ptr[klen];
	2723
	2724	if (klen == 0) {
	2725	/* If char is invariant then swatch is for all the invariant chars
	2726	* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
	2727	*/
	2728	needents = UTF_CONTINUATION_MARK;
	2729	off = NATIVE_TO_UTF(ptr[klen]);
	2730	}
	2731	else {
	2732	/* If char is encoded then swatch is for the prefix */
	2733	needents = (1 << UTF_ACCUMULATION_SHIFT);
	2734	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	2735	}
	2736
	2737	/*
	2738	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	2739	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	2740	* it's nothing to sniff at.) Pity we usually come through at least
	2741	* two function calls to get here...
	2742	*
	2743	* NB: this code assumes that swatches are never modified, once generated!
	2744	*/
	2745
	2746	if (hv == PL_last_swash_hv &&
	2747	klen == PL_last_swash_klen &&
	2748	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	2749	{
	2750	tmps = PL_last_swash_tmps;
	2751	slen = PL_last_swash_slen;
	2752	}
	2753	else {
	2754	/* Try our second-level swatch cache, kept in a hash. */
	2755	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
	2756
	2757	/* If not cached, generate it via swatch_get */
	2758	if (!svp \|\| !SvPOK(*svp)
	2759	\|\| !(tmps = (const U8)SvPV_const(svp, slen))) {
	2760	/* We use utf8n_to_uvuni() as we want an index into
	2761	Unicode tables, not a native character number.
	2762	*/
	2763	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
	2764	ckWARN(WARN_UTF8) ?
	2765	0 : UTF8_ALLOW_ANY);
	2766	swatch = swatch_get(swash,
	2767	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	2768	(klen) ? (code_point & ~((UV)needents - 1)) : 0,
	2769	needents);
	2770
	2771	if (IN_PERL_COMPILETIME)
	2772	CopHINTS_set(PL_curcop, PL_hints);
	2773
	2774	svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
	2775
	2776	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen))
	2777	\|\| (slen << 3) < needents)
	2778	Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
	2779	"svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
	2780	svp, tmps, (UV)slen, (UV)needents);
	2781	}
	2782
	2783	PL_last_swash_hv = hv;
	2784	assert(klen <= sizeof(PL_last_swash_key));
	2785	PL_last_swash_klen = (U8)klen;
	2786	/* FIXME change interpvar.h? */
	2787	PL_last_swash_tmps = (U8 *) tmps;
	2788	PL_last_swash_slen = slen;
	2789	if (klen)
	2790	Copy(ptr, PL_last_swash_key, klen, U8);
	2791	}
	2792
	2793	if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_NON_UNICODE)) {
	2794	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2795
	2796	/* This outputs warnings for binary properties only, assuming that
	2797	* to_utf8_case() will output any for non-binary. Also, surrogates
	2798	* aren't checked for, as that would warn on things like /\p{Gc=Cs}/ */
	2799
	2800	if (! bitssvp \|\| SvUV(*bitssvp) == 1) {
	2801	/* User-defined properties can silently match above-Unicode */
	2802	SV** const user_defined_svp = hv_fetchs(hv, "USER_DEFINED", FALSE);
	2803	if (! user_defined_svp \|\| ! SvUV(*user_defined_svp)) {
	2804	const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
	2805	Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	2806	"Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", code_point);
	2807	}
	2808	}
	2809	}
	2810
	2811	switch ((int)((slen << 3) / needents)) {
	2812	case 1:
	2813	bit = 1 << (off & 7);
	2814	off >>= 3;
	2815	return (tmps[off] & bit) != 0;
	2816	case 8:
	2817	return tmps[off];
	2818	case 16:
	2819	off <<= 1;
	2820	return (tmps[off] << 8) + tmps[off + 1] ;
	2821	case 32:
	2822	off <<= 2;
	2823	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	2824	}
	2825	Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
	2826	"slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
	2827	NORETURN_FUNCTION_END;
	2828	}
	2829
	2830	/* Read a single line of the main body of the swash input text. These are of
	2831	* the form:
	2832	* 0053 0056 0073
	2833	* where each number is hex. The first two numbers form the minimum and
	2834	* maximum of a range, and the third is the value associated with the range.
	2835	* Not all swashes should have a third number
	2836	*
	2837	* On input: l points to the beginning of the line to be examined; it points
	2838	* to somewhere in the string of the whole input text, and is
	2839	* terminated by a \n or the null string terminator.
	2840	* lend points to the null terminator of that string
	2841	* wants_value is non-zero if the swash expects a third number
	2842	* typestr is the name of the swash's mapping, like 'ToLower'
	2843	* On output: min, max, and *val are set to the values read from the line.
	2844	* returns a pointer just beyond the line examined. If there was no
	2845	* valid min number on the line, returns lend+1
	2846	*/
	2847
	2848	STATIC U8*
	2849	S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
	2850	const bool wants_value, const U8* const typestr)
	2851	{
	2852	const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
	2853	STRLEN numlen; /* Length of the number */
	2854	I32 flags = PERL_SCAN_SILENT_ILLDIGIT
	2855	\| PERL_SCAN_DISALLOW_PREFIX
	2856	\| PERL_SCAN_SILENT_NON_PORTABLE;
	2857
	2858	/* nl points to the next \n in the scan */
	2859	U8* const nl = (U8*)memchr(l, '\n', lend - l);
	2860
	2861	/* Get the first number on the line: the range minimum */
	2862	numlen = lend - l;
	2863	min = grok_hex((char )l, &numlen, &flags, NULL);
	2864	if (numlen) /* If found a hex number, position past it */
	2865	l += numlen;
	2866	else if (nl) { /* Else, go handle next line, if any */
	2867	return nl + 1; /* 1 is length of "\n" */
	2868	}
	2869	else { /* Else, no next line */
	2870	return lend + 1; /* to LIST's end at which \n is not found */
	2871	}
	2872
	2873	/* The max range value follows, separated by a BLANK */
	2874	if (isBLANK(*l)) {
	2875	++l;
	2876	flags = PERL_SCAN_SILENT_ILLDIGIT
	2877	\| PERL_SCAN_DISALLOW_PREFIX
	2878	\| PERL_SCAN_SILENT_NON_PORTABLE;
	2879	numlen = lend - l;
	2880	max = grok_hex((char )l, &numlen, &flags, NULL);
	2881	if (numlen)
	2882	l += numlen;
	2883	else /* If no value here, it is a single element range */
	2884	max = min;
	2885
	2886	/* Non-binary tables have a third entry: what the first element of the
	2887	* range maps to */
	2888	if (wants_value) {
	2889	if (isBLANK(*l)) {
	2890	++l;
	2891	flags = PERL_SCAN_SILENT_ILLDIGIT
	2892	\| PERL_SCAN_DISALLOW_PREFIX
	2893	\| PERL_SCAN_SILENT_NON_PORTABLE;
	2894	numlen = lend - l;
	2895	val = grok_hex((char )l, &numlen, &flags, NULL);
	2896	if (numlen)
	2897	l += numlen;
	2898	else
	2899	*val = 0;
	2900	}
	2901	else {
	2902	*val = 0;
	2903	if (typeto) {
	2904	/* diag_listed_as: To%s: illegal mapping '%s' */
	2905	Perl_croak(aTHX_ "%s: illegal mapping '%s'",
	2906	typestr, l);
	2907	}
	2908	}
	2909	}
	2910	else
	2911	val = 0; / bits == 1, then any val should be ignored */
	2912	}
	2913	else { /* Nothing following range min, should be single element with no
	2914	mapping expected */
	2915	max = min;
	2916	if (wants_value) {
	2917	*val = 0;
	2918	if (typeto) {
	2919	/* diag_listed_as: To%s: illegal mapping '%s' */
	2920	Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
	2921	}
	2922	}
	2923	else
	2924	val = 0; / bits == 1, then val should be ignored */
	2925	}
	2926
	2927	/* Position to next line if any, or EOF */
	2928	if (nl)
	2929	l = nl + 1;
	2930	else
	2931	l = lend;
	2932
	2933	return l;
	2934	}
	2935
	2936	/* Note:
	2937	* Returns a swatch (a bit vector string) for a code point sequence
	2938	* that starts from the value C<start> and comprises the number C<span>.
	2939	* A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
	2940	* Should be used via swash_fetch, which will cache the swatch in C<swash>.
	2941	*/
	2942	STATIC SV*
	2943	S_swatch_get(pTHX_ SV* swash, UV start, UV span)
	2944	{
	2945	SV *swatch;
	2946	U8 l, lend, x, xend, s, send;
	2947	STRLEN lcur, xcur, scur;
	2948	HV *const hv = MUTABLE_HV(SvRV(swash));
	2949	SV** const invlistsvp = hv_fetchs(hv, "INVLIST", FALSE);
	2950
	2951	SV** listsvp = NULL; /* The string containing the main body of the table */
	2952	SV** extssvp = NULL;
	2953	SV** invert_it_svp = NULL;
	2954	U8* typestr = NULL;
	2955	STRLEN bits;
	2956	STRLEN octets; /* if bits == 1, then octets == 0 */
	2957	UV none;
	2958	UV end = start + span;
	2959
	2960	if (invlistsvp == NULL) {
	2961	SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	2962	SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	2963	SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	2964	extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	2965	listsvp = hv_fetchs(hv, "LIST", FALSE);
	2966	invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	2967
	2968	bits = SvUV(*bitssvp);
	2969	none = SvUV(*nonesvp);
	2970	typestr = (U8)SvPV_nolen(typesvp);
	2971	}
	2972	else {
	2973	bits = 1;
	2974	none = 0;
	2975	}
	2976	octets = bits >> 3; /* if bits == 1, then octets == 0 */
	2977
	2978	PERL_ARGS_ASSERT_SWATCH_GET;
	2979
	2980	if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
	2981	Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
	2982	(UV)bits);
	2983	}
	2984
	2985	/* If overflowed, use the max possible */
	2986	if (end < start) {
	2987	end = UV_MAX;
	2988	span = end - start;
	2989	}
	2990
	2991	/* create and initialize $swatch */
	2992	scur = octets ? (span * octets) : (span + 7) / 8;
	2993	swatch = newSV(scur);
	2994	SvPOK_on(swatch);
	2995	s = (U8*)SvPVX(swatch);
	2996	if (octets && none) {
	2997	const U8* const e = s + scur;
	2998	while (s < e) {
	2999	if (bits == 8)
	3000	*s++ = (U8)(none & 0xff);
	3001	else if (bits == 16) {
	3002	*s++ = (U8)((none >> 8) & 0xff);
	3003	*s++ = (U8)( none & 0xff);
	3004	}
	3005	else if (bits == 32) {
	3006	*s++ = (U8)((none >> 24) & 0xff);
	3007	*s++ = (U8)((none >> 16) & 0xff);
	3008	*s++ = (U8)((none >> 8) & 0xff);
	3009	*s++ = (U8)( none & 0xff);
	3010	}
	3011	}
	3012	*s = '\0';
	3013	}
	3014	else {
	3015	(void)memzero((U8*)s, scur + 1);
	3016	}
	3017	SvCUR_set(swatch, scur);
	3018	s = (U8*)SvPVX(swatch);
	3019
	3020	if (invlistsvp) { /* If has an inversion list set up use that */
	3021	_invlist_populate_swatch(*invlistsvp, start, end, s);
	3022	return swatch;
	3023	}
	3024
	3025	/* read $swash->{LIST} */
	3026	l = (U8)SvPV(listsvp, lcur);
	3027	lend = l + lcur;
	3028	while (l < lend) {
	3029	UV min, max, val, upper;
	3030	l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
	3031	cBOOL(octets), typestr);
	3032	if (l > lend) {
	3033	break;
	3034	}
	3035
	3036	/* If looking for something beyond this range, go try the next one */
	3037	if (max < start)
	3038	continue;
	3039
	3040	/* <end> is generally 1 beyond where we want to set things, but at the
	3041	* platform's infinity, where we can't go any higher, we want to
	3042	* include the code point at <end> */
	3043	upper = (max < end)
	3044	? max
	3045	: (max != UV_MAX \|\| end != UV_MAX)
	3046	? end - 1
	3047	: end;
	3048
	3049	if (octets) {
	3050	UV key;
	3051	if (min < start) {
	3052	if (!none \|\| val < none) {
	3053	val += start - min;
	3054	}
	3055	min = start;
	3056	}
	3057	for (key = min; key <= upper; key++) {
	3058	STRLEN offset;
	3059	/* offset must be non-negative (start <= min <= key < end) */
	3060	offset = octets * (key - start);
	3061	if (bits == 8)
	3062	s[offset] = (U8)(val & 0xff);
	3063	else if (bits == 16) {
	3064	s[offset ] = (U8)((val >> 8) & 0xff);
	3065	s[offset + 1] = (U8)( val & 0xff);
	3066	}
	3067	else if (bits == 32) {
	3068	s[offset ] = (U8)((val >> 24) & 0xff);
	3069	s[offset + 1] = (U8)((val >> 16) & 0xff);
	3070	s[offset + 2] = (U8)((val >> 8) & 0xff);
	3071	s[offset + 3] = (U8)( val & 0xff);
	3072	}
	3073
	3074	if (!none \|\| val < none)
	3075	++val;
	3076	}
	3077	}
	3078	else { /* bits == 1, then val should be ignored */
	3079	UV key;
	3080	if (min < start)
	3081	min = start;
	3082
	3083	for (key = min; key <= upper; key++) {
	3084	const STRLEN offset = (STRLEN)(key - start);
	3085	s[offset >> 3] \|= 1 << (offset & 7);
	3086	}
	3087	}
	3088	} /* while */
	3089
	3090	/* Invert if the data says it should be. Assumes that bits == 1 */
	3091	if (invert_it_svp && SvUV(*invert_it_svp)) {
	3092
	3093	/* Unicode properties should come with all bits above PERL_UNICODE_MAX
	3094	* be 0, and their inversion should also be 0, as we don't succeed any
	3095	* Unicode property matches for non-Unicode code points */
	3096	if (start <= PERL_UNICODE_MAX) {
	3097
	3098	/* The code below assumes that we never cross the
	3099	* Unicode/above-Unicode boundary in a range, as otherwise we would
	3100	* have to figure out where to stop flipping the bits. Since this
	3101	* boundary is divisible by a large power of 2, and swatches comes
	3102	* in small powers of 2, this should be a valid assumption */
	3103	assert(start + span - 1 <= PERL_UNICODE_MAX);
	3104
	3105	send = s + scur;
	3106	while (s < send) {
	3107	s = ~(s);
	3108	s++;
	3109	}
	3110	}
	3111	}
	3112
	3113	/* read $swash->{EXTRAS}
	3114	* This code also copied to swash_to_invlist() below */
	3115	x = (U8)SvPV(extssvp, xcur);
	3116	xend = x + xcur;
	3117	while (x < xend) {
	3118	STRLEN namelen;
	3119	U8 *namestr;
	3120	SV** othersvp;
	3121	HV* otherhv;
	3122	STRLEN otherbits;
	3123	SV *otherbitssvp, other;
	3124	U8 s, o, *nl;
	3125	STRLEN slen, olen;
	3126
	3127	const U8 opc = *x++;
	3128	if (opc == '\n')
	3129	continue;
	3130
	3131	nl = (U8*)memchr(x, '\n', xend - x);
	3132
	3133	if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	3134	if (nl) {
	3135	x = nl + 1; /* 1 is length of "\n" */
	3136	continue;
	3137	}
	3138	else {
	3139	x = xend; /* to EXTRAS' end at which \n is not found */
	3140	break;
	3141	}
	3142	}
	3143
	3144	namestr = x;
	3145	if (nl) {
	3146	namelen = nl - namestr;
	3147	x = nl + 1;
	3148	}
	3149	else {
	3150	namelen = xend - namestr;
	3151	x = xend;
	3152	}
	3153
	3154	othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	3155	otherhv = MUTABLE_HV(SvRV(*othersvp));
	3156	otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	3157	otherbits = (STRLEN)SvUV(*otherbitssvp);
	3158	if (bits < otherbits)
	3159	Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
	3160	"bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
	3161
	3162	/* The "other" swatch must be destroyed after. */
	3163	other = swatch_get(*othersvp, start, span);
	3164	o = (U8*)SvPV(other, olen);
	3165
	3166	if (!olen)
	3167	Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
	3168
	3169	s = (U8*)SvPV(swatch, slen);
	3170	if (bits == 1 && otherbits == 1) {
	3171	if (slen != olen)
	3172	Perl_croak(aTHX_ "panic: swatch_get found swatch length "
	3173	"mismatch, slen=%"UVuf", olen=%"UVuf,
	3174	(UV)slen, (UV)olen);
	3175
	3176	switch (opc) {
	3177	case '+':
	3178	while (slen--)
	3179	s++ \|= o++;
	3180	break;
	3181	case '!':
	3182	while (slen--)
	3183	s++ \|= ~o++;
	3184	break;
	3185	case '-':
	3186	while (slen--)
	3187	s++ &= ~o++;
	3188	break;
	3189	case '&':
	3190	while (slen--)
	3191	s++ &= o++;
	3192	break;
	3193	default:
	3194	break;
	3195	}
	3196	}
	3197	else {
	3198	STRLEN otheroctets = otherbits >> 3;
	3199	STRLEN offset = 0;
	3200	U8* const send = s + slen;
	3201
	3202	while (s < send) {
	3203	UV otherval = 0;
	3204
	3205	if (otherbits == 1) {
	3206	otherval = (o[offset >> 3] >> (offset & 7)) & 1;
	3207	++offset;
	3208	}
	3209	else {
	3210	STRLEN vlen = otheroctets;
	3211	otherval = *o++;
	3212	while (--vlen) {
	3213	otherval <<= 8;
	3214	otherval \|= *o++;
	3215	}
	3216	}
	3217
	3218	if (opc == '+' && otherval)
	3219	NOOP; /* replace with otherval */
	3220	else if (opc == '!' && !otherval)
	3221	otherval = 1;
	3222	else if (opc == '-' && otherval)
	3223	otherval = 0;
	3224	else if (opc == '&' && !otherval)
	3225	otherval = 0;
	3226	else {
	3227	s += octets; /* no replacement */
	3228	continue;
	3229	}
	3230
	3231	if (bits == 8)
	3232	*s++ = (U8)( otherval & 0xff);
	3233	else if (bits == 16) {
	3234	*s++ = (U8)((otherval >> 8) & 0xff);
	3235	*s++ = (U8)( otherval & 0xff);
	3236	}
	3237	else if (bits == 32) {
	3238	*s++ = (U8)((otherval >> 24) & 0xff);
	3239	*s++ = (U8)((otherval >> 16) & 0xff);
	3240	*s++ = (U8)((otherval >> 8) & 0xff);
	3241	*s++ = (U8)( otherval & 0xff);
	3242	}
	3243	}
	3244	}
	3245	sv_free(other); /* through with it! */
	3246	} /* while */
	3247	return swatch;
	3248	}
	3249
	3250	HV*
	3251	Perl__swash_inversion_hash(pTHX_ SV* const swash)
	3252	{
	3253
	3254	   /* Subject to change or removal.  For use only in one place in regcomp.c.
	3255	    * Can't be used on a property that is subject to user override, as it
	3256	    * relies on the value of SPECIALS in the swash which would be set by
	3257	    * utf8_heavy.pl to the hash in the non-overridden file, and hence is not set
	3258	    * for overridden properties
	3259	    *
	3260	    * Returns a hash which is the inversion and closure of a swash mapping.
	3261	    * For example, consider the input lines:
	3262	    * 004B 006B
	3263	    * 004C 006C
	3264	    * 212A 006B
	3265	    *
	3266	    * The returned hash would have two keys, the utf8 for 006B and the utf8 for
	3267	    * 006C.  The value for each key is an array.  For 006C, the array would
	3268	    * have two elements, the utf8 for itself, and for 004C.  For 006B, there
	3269	    * would be three elements in its array, the utf8 for 006B, 004B and 212A.
	3270	    *
	3271	    * Essentially, for any code point, it gives all the code points that map to
	3272	    * it, or the list of 'froms' for that point.
	3273	    *
	3274	    * Currently it ignores any additions or deletions from other swashes,
	3275	    * looking at just the main body of the swash, and if there are SPECIALS
	3276	    * in the swash, at that hash
	3277	    *
	3278	    * The specials hash can be extra code points, and most likely consists of
	3279	    * maps from single code points to multiple ones (each expressed as a string
	3280	    * of utf8 characters).  This function currently returns only 1-1 mappings.
	3281	    * However consider this possible input in the specials hash:
	3282	    * "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074
	3283	    * "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074
	3284	    *
	3285	    * Both FB05 and FB06 map to the same multi-char sequence, which we don't
	3286	    * currently handle.  But it also means that FB05 and FB06 are equivalent in
	3287	    * a 1-1 mapping which we should handle, and this relationship may not be in
	3288	    * the main table.  Therefore this function examines all the multi-char
	3289	    * sequences and adds the 1-1 mappings that come out of that. */
	3290
	3291	    U8 *l, *lend;
	3292	    STRLEN lcur;
	3293	    HV *const hv = MUTABLE_HV(SvRV(swash));
	3294
	3295	    /* The string containing the main body of the table */
	3296	    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	3297
	3298	    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	3299	    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	3300	    SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
	3301	    /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
	3302	    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
	3303	    const STRLEN bits = SvUV(*bitssvp);
	3304	    const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	3305	    const UV none = SvUV(*nonesvp);
	3306	    SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
	3307
	3308	    HV* ret = newHV();
	3309
	3310	    PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
	3311
	3312	    /* Must have at least 8 bits to get the mappings */
	3313	    if (bits != 8 && bits != 16 && bits != 32) {
	3314	        Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
	3315	                   (UV)bits);
	3316	    }
	3317
	3318	    if (specials_p) { /* It might be "special" (sometimes, but not always, a
	3319	                         mapping to more than one character */
	3320
	3321	        /* Construct an inverse mapping hash for the specials */
	3322	        HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
	3323	        HV * specials_inverse = newHV();
	3324	        char *char_from; /* the lhs of the map */
	3325	        I32 from_len; /* its byte length */
	3326	        char *char_to; /* the rhs of the map */
	3327	        I32 to_len; /* its byte length */
	3328	        SV *sv_to; /* and in a sv */
	3329	        AV* from_list; /* list of things that map to each 'to' */
	3330
	3331	        hv_iterinit(specials_hv);
	3332
	3333	        /* The keys are the characters (in utf8) that map to the corresponding
	3334	         * utf8 string value.  Iterate through the list creating the inverse
	3335	         * list. */
	3336	        while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
	3337	            SV** listp;
	3338	            if (! SvPOK(sv_to)) {
	3339	                Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
	3340	                           "unexpectedly is not a string, flags=%lu",
	3341	                           (unsigned long)SvFLAGS(sv_to));
	3342	            }
	3343	            /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", utf8_to_uvchr((U8*) char_from, 0), utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
	3344
	3345	            /* Each key in the inverse list is a mapped-to value, and the key's
	3346	             * hash value is a list of the strings (each in utf8) that map to
	3347	             * it.  Those strings are all one character long */
	3348	            if ((listp = hv_fetch(specials_inverse,
	3349	                                  SvPVX(sv_to),
	3350	                                  SvCUR(sv_to), 0)))
	3351	            {
	3352	                from_list = (AV*) *listp;
	3353	            }
	3354	            else { /* No entry yet for it: create one */
	3355	                from_list = newAV();
	3356	                if (! hv_store(specials_inverse,
	3357	                               SvPVX(sv_to),
	3358	                               SvCUR(sv_to),
	3359	                               (SV*) from_list, 0))
	3360	                {
	3361	                    Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	3362	                }
	3363	            }
	3364
	3365	            /* Here have the list associated with this 'to' (perhaps newly
	3366	             * created and empty).  Just add to it.  Note that we ASSUME that
	3367	             * the input is guaranteed to not have duplications, so we don't
	3368	             * check for that.  Duplications just slow down execution time. */
	3369	            av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
	3370	        }
	3371
	3372	        /* Here, 'specials_inverse' contains the inverse mapping.  Go through
	3373	         * it looking for cases like the FB05/FB06 examples above.  There would
	3374	         * be an entry in the hash like
	3375	         * 'st' => [ FB05, FB06 ]
	3376	         * In this example we will create two lists that get stored in the
	3377	         * returned hash, 'ret':
	3378	         * FB05 => [ FB05, FB06 ]
	3379	         * FB06 => [ FB05, FB06 ]
	3380	         *
	3381	         * Note that there is nothing to do if the array only has one element.
	3382	         * (In the normal 1-1 case handled below, we don't have to worry about
	3383	         * two lists, as everything gets tied to the single list that is
	3384	         * generated for the single character 'to'.  But here, we are omitting
	3385	         * that list, ('st' in the example), so must have multiple lists.) */
	3386	        while ((from_list = (AV *) hv_iternextsv(specials_inverse,
	3387	                                                 &char_to, &to_len)))
	3388	        {
	3389	            if (av_len(from_list) > 0) {
	3390	                int i;
	3391
	3392	                /* We iterate over all combinations of i,j to place each code
	3393	                 * point on each list */
	3394	                for (i = 0; i <= av_len(from_list); i++) {
	3395	                    int j;
	3396	                    AV* i_list = newAV();
	3397	                    SV** entryp = av_fetch(from_list, i, FALSE);
	3398	                    if (entryp == NULL) {
	3399	                        Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	3400	                    }
	3401	                    if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
	3402	                        Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
	3403	                    }
	3404	                    if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
	3405	                                   (SV*) i_list, FALSE))
	3406	                    {
	3407	                        Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	3408	                    }
	3409
	3410	                    /* For debugging: UV u = utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
	3411	                    for (j = 0; j <= av_len(from_list); j++) {
	3412	                        entryp = av_fetch(from_list, j, FALSE);
	3413	                        if (entryp == NULL) {
	3414	                            Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	3415	                        }
	3416
	3417	                        /* When i==j this adds itself to the list */
	3418	                        av_push(i_list, newSVuv(utf8_to_uvchr(
	3419	                                        (U8*) SvPVX(*entryp), 0)));
	3420	                        /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
	3421	                    }
	3422	                }
	3423	            }
	3424	        }
	3425	        SvREFCNT_dec(specials_inverse); /* done with it */
	3426	    } /* End of specials */
	3427
	3428	    /* read $swash->{LIST} */
	3429	    l = (U8*)SvPV(*listsvp, lcur);
	3430	    lend = l + lcur;
	3431
	3432	    /* Go through each input line */
	3433	    while (l < lend) {
	3434	        UV min, max, val;
	3435	        UV inverse;
	3436	        l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
	3437	                                   cBOOL(octets), typestr);
	3438	        if (l > lend) {
	3439	            break;
	3440	        }
	3441
	3442	        /* Each element in the range is to be inverted */
	3443	        for (inverse = min; inverse <= max; inverse++) {
	3444	            AV* list;
	3445	            SV** listp;
	3446	            IV i;
	3447	            bool found_key = FALSE;
	3448	            bool found_inverse = FALSE;
	3449
	3450	            /* The key is the inverse mapping */
	3451	            char key[UTF8_MAXBYTES+1];
	3452	            char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
	3453	            STRLEN key_len = key_end - key;
	3454
	3455	            /* Get the list for the map */
	3456	            if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
	3457	                list = (AV*) *listp;
	3458	            }
	3459	            else { /* No entry yet for it: create one */
	3460	                list = newAV();
	3461	                if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
	3462	                    Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
	3463	                }
	3464	            }
	3465
	3466	            /* Look through list to see if this inverse mapping already is
	3467	             * listed, or if there is a mapping to itself already */
	3468	            for (i = 0; i <= av_len(list); i++) {
	3469	                SV** entryp = av_fetch(list, i, FALSE);
	3470	                SV* entry;
	3471	                if (entryp == NULL) {
	3472	                    Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
	3473	                }
	3474	                entry = *entryp;
	3475	                /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, SvUV(entry)));*/
	3476	                if (SvUV(entry) == val) {
	3477	                    found_key = TRUE;
	3478	                }
	3479	                if (SvUV(entry) == inverse) {
	3480	                    found_inverse = TRUE;
	3481	                }
	3482
	3483	                /* No need to continue searching if found everything we are
	3484	                 * looking for */
	3485	                if (found_key && found_inverse) {
	3486	                    break;
	3487	                }
	3488	            }
	3489
	3490	            /* Make sure there is a mapping to itself on the list */
	3491	            if (! found_key) {
	3492	                av_push(list, newSVuv(val));
	3493	                /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", val, val));*/
	3494	            }
	3495
	3496
	3497	            /* Simply add the value to the list */
	3498	            if (! found_inverse) {
	3499	                av_push(list, newSVuv(inverse));
	3500	                /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", inverse, val));*/
	3501	            }
	3502
	3503	            /* swatch_get() increments the value of val for each element in the
	3504	             * range.  That makes more compact tables possible.  You can
	3505	             * express the capitalization, for example, of all consecutive
	3506	             * letters with a single line: 0061\t007A\t0041 This maps 0061 to
	3507	             * 0041, 0062 to 0042, etc.  I (khw) have never understood 'none',
	3508	             * and it's not documented; it appears to be used only in
	3509	             * implementing tr//; I copied the semantics from swatch_get(), just
	3510	             * in case */
	3511	            if (!none || val < none) {
	3512	                ++val;
	3513	            }
	3514	        }
	3515	    }
	3516
	3517	    return ret;
	3518	}
	3519
	3520	SV*
	3521	Perl__swash_to_invlist(pTHX_ SV* const swash)
	3522	{
	3523	   /* Builds and returns an inversion list from the LIST, INVERT_IT and EXTRAS
	3524	    * entries of 'swash'.  Subject to change or removal.  For use only in one place in regcomp.c */
	3525
	3526	    U8 *l, *lend;
	3527	    char *loc;
	3528	    STRLEN lcur;
	3529	    HV *const hv = MUTABLE_HV(SvRV(swash));
	3530	    UV elements = 0; /* Number of elements in the inversion list */
	3531	    U8 empty[] = "";
	3532
	3533	    /* The string containing the main body of the table */
	3534	    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
	3535	    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
	3536	    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
	3537	    SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
	3538	    SV** const invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
	3539
	3540	    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
	3541	    const STRLEN bits = SvUV(*bitssvp);
	3542	    const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
	3543	    U8 *x, *xend;
	3544	    STRLEN xcur;
	3545
	3546	    SV* invlist;
	3547
	3548	    PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
	3549
	3550	    /* read $swash->{LIST} */
	3551	    if (SvPOK(*listsvp)) {
	3552	        l = (U8*)SvPV(*listsvp, lcur);
	3553	    }
	3554	    else {
	3555	        /* LIST legitimately doesn't contain a string during compilation phases
	3556	         * of Perl itself, before the Unicode tables are generated.  In this
	3557	         * case, just fake things up by creating an empty list */
	3558	        l = empty;
	3559	        lcur = 0;
	3560	    }
	3561	    loc = (char *) l;
	3562	    lend = l + lcur;
	3563
	3564	    /* Scan the input to count the number of lines to preallocate array size
	3565	     * based on worst possible case, which is each line in the input creates 2
	3566	     * elements in the inversion list: 1) the beginning of a range in the list;
	3567	     * 2) the beginning of a range not in the list. */
	3568	    while ((loc = (strchr(loc, '\n'))) != NULL) {
	3569	        elements += 2;
	3570	        loc++;
	3571	    }
	3572
	3573	    /* If the ending is somehow corrupt and isn't a new line, add another
	3574	     * element for the final range that isn't in the inversion list */
	3575	    if (! (*lend == '\n'
	3576	           || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
	3577	    {
	3578	        elements++;
	3579	    }
	3580
	3581	    invlist = _new_invlist(elements);
	3582
	3583	    /* Now go through the input again, adding each range to the list */
	3584	    while (l < lend) {
	3585	        UV start, end;
	3586	        UV val; /* Not used by this function */
	3587
	3588	        l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
	3589	                                   cBOOL(octets), typestr);
	3590
	3591	        if (l > lend) {
	3592	            break;
	3593	        }
	3594
	3595	        _append_range_to_invlist(invlist, start, end);
	3596	    }
	3597
	3598	    /* Invert if the data says it should be */
	3599	    if (invert_it_svp && SvUV(*invert_it_svp)) {
	3600	        _invlist_invert_prop(invlist);
	3601	    }
	3602
	3603	    /* This code is copied from swatch_get()
	3604	     * read $swash->{EXTRAS} */
	3605	    x = (U8*)SvPV(*extssvp, xcur);
	3606	    xend = x + xcur;
	3607	    while (x < xend) {
	3608	        STRLEN namelen;
	3609	        U8 *namestr;
	3610	        SV** othersvp;
	3611	        HV* otherhv;
	3612	        STRLEN otherbits;
	3613	        SV **otherbitssvp, *other;
	3614	        U8 *nl;
	3615
	3616	        const U8 opc = *x++;
	3617	        if (opc == '\n')
	3618	            continue;
	3619
	3620	        nl = (U8*)memchr(x, '\n', xend - x);
	3621
	3622	        if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
	3623	            if (nl) {
	3624	                x = nl + 1; /* 1 is length of "\n" */
	3625	                continue;
	3626	            }
	3627	            else {
	3628	                x = xend; /* to EXTRAS' end at which \n is not found */
	3629	                break;
	3630	            }
	3631	        }
	3632
	3633	        namestr = x;
	3634	        if (nl) {
	3635	            namelen = nl - namestr;
	3636	            x = nl + 1;
	3637	        }
	3638	        else {
	3639	            namelen = xend - namestr;
	3640	            x = xend;
	3641	        }
	3642
	3643	        othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
	3644	        otherhv = MUTABLE_HV(SvRV(*othersvp));
	3645	        otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
	3646	        otherbits = (STRLEN)SvUV(*otherbitssvp);
	3647
	3648	        if (bits != otherbits || bits != 1) {
	3649	            Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
	3650	                       "properties, bits=%"UVuf", otherbits=%"UVuf,
	3651	                       (UV)bits, (UV)otherbits);
	3652	        }
	3653
	3654	        /* The "other" swatch must be destroyed after. */
	3655	        other = _swash_to_invlist((SV *)*othersvp);
	3656
	3657	        /* End of code copied from swatch_get() */
	3658	        switch (opc) {
	3659	        case '+':
	3660	            _invlist_union(invlist, other, &invlist);
	3661	            break;
	3662	        case '!':
	3663	            _invlist_invert(other);
	3664	            _invlist_union(invlist, other, &invlist);
	3665	            break;
	3666	        case '-':
	3667	            _invlist_subtract(invlist, other, &invlist);
	3668	            break;
	3669	        case '&':
	3670	            _invlist_intersection(invlist, other, &invlist);
	3671	            break;
	3672	        default:
	3673	            break;
	3674	        }
	3675	        sv_free(other); /* through with it! */
	3676	    }
	3677
	3678	    return invlist;
	3679	}
	3680
	3681	/*
	3682	=for apidoc uvchr_to_utf8
	3683
	3684	Adds the UTF-8 representation of the Native code point C<uv> to the end
	3685	of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
	3686	bytes available. The return value is the pointer to the byte after the
	3687	end of the new character. In other words,
	3688
	3689	d = uvchr_to_utf8(d, uv);
	3690
	3691	is the recommended wide native character-aware way of saying
	3692
	3693	*(d++) = uv;
	3694
	3695	=cut
	3696	*/
	3697
	3698	/* On ASCII machines this is normally a macro but we want a
	3699	real function in case XS code wants it
	3700	*/
	3701	U8 *
	3702	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	3703	{
	3704	    const UV unicode_cp = NATIVE_TO_UNI(uv); /* native -> Unicode code point */
	3705	    PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
	3706	    return Perl_uvuni_to_utf8_flags(aTHX_ d, unicode_cp, 0); /* no flags */
	3707	}
	3708
	3709	U8 *
	3710	Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
	3711	{
	3712	    const UV unicode_cp = NATIVE_TO_UNI(uv); /* native -> Unicode code point */
	3713	    PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
	3714	    return Perl_uvuni_to_utf8_flags(aTHX_ d, unicode_cp, flags); /* 'flags' passed through as given */
	3715	}
	3716
	3717	/*
	3718	=for apidoc utf8n_to_uvchr
	3719
	3720	Returns the native character value of the first character in the string
	3721	C<s>
	3722	which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
	3723	length, in bytes, of that character.
	3724
	3725	length and flags are the same as utf8n_to_uvuni().
	3726
	3727	=cut
	3728	*/
	3729	/* On ASCII machines this is normally a macro but we want
	3730	a real function in case XS code wants it
	3731	*/
	3732	UV
	3733	Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
	3734	                    U32 flags)
	3735	{
	3736	    const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags); /* decode to a Unicode code point */
	3737
	3738	    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
	3739
	3740	    return UNI_TO_NATIVE(uv); /* hand back the native equivalent */
	3741	}
	3742
	3743	bool
	3744	Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
	3745	{
	3746	    /* May change: warns if surrogates, non-character code points, or
	3747	     * non-Unicode code points are in s which has length len bytes.  Returns
	3748	     * TRUE if none found; FALSE otherwise.  The only other validity check is
	3749	     * that no character claims more bytes than remain before s + len */
	3750
	3751	    const U8* const e = s + len;
	3752	    bool ok = TRUE;
	3753
	3754	    PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
	3755
	3756	    while (s < e) {
	3757	        if (UTF8SKIP(s) > (STRLEN)(e - s)) { /* compare with what is left, not the total length */
	3758	            Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
	3759	                             "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
	3760	            return FALSE;
	3761	        }
	3762	        if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) {
	3763	            STRLEN char_len;
	3764	            if (UTF8_IS_SUPER(s)) {
	3765	                if (ckWARN_d(WARN_NON_UNICODE)) {
	3766	                    UV uv = utf8_to_uvchr(s, &char_len);
	3767	                    Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
	3768	                                "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
	3769	                    ok = FALSE;
	3770	                }
	3771	            }
	3772	            else if (UTF8_IS_SURROGATE(s)) {
	3773	                if (ckWARN_d(WARN_SURROGATE)) {
	3774	                    UV uv = utf8_to_uvchr(s, &char_len);
	3775	                    Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
	3776	                                "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
	3777	                    ok = FALSE;
	3778	                }
	3779	            }
	3780	            else if
	3781	                ((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
	3782	                 && (ckWARN_d(WARN_NONCHAR)))
	3783	            {
	3784	                UV uv = utf8_to_uvchr(s, &char_len);
	3785	                Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
	3786	                            "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
	3787	                ok = FALSE;
	3788	            }
	3789	        }
	3790	        s += UTF8SKIP(s);
	3791	    }
	3792
	3793	    return ok;
	3794	}
	3795
	3796	/*
	3797	=for apidoc pv_uni_display
	3798
	3799	Build to the scalar dsv a displayable version of the string spv,
	3800	length len, the displayable version being at most pvlim bytes long
	3801	(if longer, the rest is truncated and "..." will be appended).
	3802
	3803	The flags argument can have UNI_DISPLAY_ISPRINT set to display
	3804	isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
	3805	to display the \\[nrfta\\] as the backslashed versions (like '\n')
	3806	(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
	3807	UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
	3808	UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
	3809
	3810	The pointer to the PV of the dsv is returned.
	3811
	3812	=cut */
	3813	char *
	3814	Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
	3815	{
	3816	    int truncated = 0;
	3817	    const char *s, *e;
	3818
	3819	    PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
	3820
	3821	    sv_setpvs(dsv, "");
	3822	    SvUTF8_off(dsv);
	3823	    for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
	3824	        UV u;
	3825	        /* This serves double duty as a flag and a character to print after
	3826	           a \ when flags & UNI_DISPLAY_BACKSLASH is true.
	3827	        */
	3828	        char ok = 0;
	3829
	3830	        if (pvlim && SvCUR(dsv) >= pvlim) {
	3831	            truncated++;
	3832	            break;
	3833	        }
	3834	        u = utf8_to_uvchr((U8*)s, 0);
	3835	        if (u < 256) {
	3836	            const unsigned char c = (unsigned char)u & 0xFF;
	3837	            if (flags & UNI_DISPLAY_BACKSLASH) {
	3838	                switch (c) {
	3839	                case '\n':
	3840	                    ok = 'n'; break;
	3841	                case '\r':
	3842	                    ok = 'r'; break;
	3843	                case '\t':
	3844	                    ok = 't'; break;
	3845	                case '\f':
	3846	                    ok = 'f'; break;
	3847	                case '\a':
	3848	                    ok = 'a'; break;
	3849	                case '\\':
	3850	                    ok = '\\'; break;
	3851	                default: break;
	3852	                }
	3853	                if (ok) {
	3854	                    const char string = ok;
	3855	                    sv_catpvs(dsv, "\\");
	3856	                    sv_catpvn(dsv, &string, 1);
	3857	                }
	3858	            }
	3859	            /* isPRINT() is the locale-blind version. */
	3860	            if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
	3861	                const char string = c;
	3862	                sv_catpvn(dsv, &string, 1);
	3863	                ok = 1;
	3864	            }
	3865	        }
	3866	        if (!ok) /* neither backslash-escaped nor printable: emit as \x{...} */
	3867	            Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
	3868	    }
	3869	    if (truncated)
	3870	        sv_catpvs(dsv, "...");
	3871
	3872	    return SvPVX(dsv);
	3873	}
	3874
	3875	/*
	3876	=for apidoc sv_uni_display
	3877
	3878	Build to the scalar dsv a displayable version of the scalar sv,
	3879	the displayable version being at most pvlim bytes long
	3880	(if longer, the rest is truncated and "..." will be appended).
	3881
	3882	The flags argument is as in pv_uni_display().
	3883
	3884	The pointer to the PV of the dsv is returned.
	3885
	3886	=cut
	3887	*/
	3888	char *
	3889	Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
	3890	{
	3891	    PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
	3892	    /* Delegate to pv_uni_display() using ssv's string buffer and current length */
	3893	    return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
	3894	                               SvCUR(ssv), pvlim, flags);
	3895	}
	3896
	3897	/*
	3898	=for apidoc foldEQ_utf8
	3899
	3900	Returns true if the leading portions of the strings s1 and s2 (either or both
	3901	of which may be in UTF-8) are the same case-insensitively; false otherwise.
	3902	How far into the strings to compare is determined by other input parameters.
	3903
	3904	If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
	3905	otherwise it is assumed to be in native 8-bit encoding. Correspondingly for u2
	3906	with respect to s2.
	3907
	3908	If the byte length l1 is non-zero, it says how far into s1 to check for fold
	3909	equality. In other words, s1+l1 will be used as a goal to reach. The
	3910	scan will not be considered to be a match unless the goal is reached, and
	3911	scanning won't continue past that goal. Correspondingly for l2 with respect to
	3912	s2.
	3913
	3914	If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
	3915	considered an end pointer beyond which scanning of s1 will not continue under
	3916	any circumstances. This means that if both l1 and pe1 are specified, and pe1
	3917	is less than s1+l1, the match will never be successful because it can never
	3918	get as far as its goal (and in fact is asserted against). Correspondingly for
	3919	pe2 with respect to s2.
	3920
	3921	At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be
	3922	non-zero), and if both do, both have to be
	3923	reached for a successful match. Also, if the fold of a character is multiple
	3924	characters, all of them must be matched (see tr21 reference below for
	3925	'folding').
	3926
	3927	Upon a successful match, if pe1 is non-NULL,
	3928	it will be set to point to the beginning of the I<next> character of s1 beyond
	3929	what was matched. Correspondingly for pe2 and s2.
	3930
	3931	For case-insensitiveness, the "casefolding" of Unicode is used
	3932	instead of upper/lowercasing both the characters, see
	3933	http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
	3934
	3935	=cut */
	3936
	3937	/* A flags parameter has been added which may change, and hence isn't
	3938	* externally documented. Currently it is:
	3939	* 0 for as-documented above
	3940	* FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
	3941	ASCII one, to not match
	3942	* FOLDEQ_UTF8_LOCALE meaning that locale rules are to be used for code
	3943	* points below 256; unicode rules for above 255; and
	3944	* folds that cross those boundaries are disallowed,
	3945	* like the NOMIX_ASCII option
	3946	* FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
	3947	* routine. This allows that step to be skipped.
	3948	* FOLDEQ_S2_ALREADY_FOLDED Similarly.
	3949	*/
	3950	I32
	3951	Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2, U32 flags)
	3952	{
	3953	    dVAR;
	3954	    register const U8 *p1 = (const U8*)s1; /* Point to current char */
	3955	    register const U8 *p2 = (const U8*)s2;
	3956	    register const U8 *g1 = NULL; /* goal for s1 */
	3957	    register const U8 *g2 = NULL;
	3958	    register const U8 *e1 = NULL; /* Don't scan s1 past this */
	3959	    register U8 *f1 = NULL; /* Point to current folded */
	3960	    register const U8 *e2 = NULL;
	3961	    register U8 *f2 = NULL;
	3962	    STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
	3963	    U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
	3964	    U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
	3965
	3966	    PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
	3967
	3968	    /* The algorithm requires that input with the flags on the first line of
	3969	     * the assert not be pre-folded. */
	3970	    assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_UTF8_LOCALE))
	3971	               && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED))));
	3972
	3973	    if (pe1) {
	3974	        e1 = *(U8**)pe1;
	3975	    }
	3976
	3977	    if (l1) {
	3978	        g1 = (const U8*)s1 + l1;
	3979	    }
	3980
	3981	    if (pe2) {
	3982	        e2 = *(U8**)pe2;
	3983	    }
	3984
	3985	    if (l2) {
	3986	        g2 = (const U8*)s2 + l2;
	3987	    }
	3988
	3989	    /* Must have at least one goal */
	3990	    assert(g1 || g2);
	3991
	3992	    if (g1) {
	3993
	3994	        /* Will never match if goal is out-of-bounds */
	3995	        assert(! e1 || e1 >= g1);
	3996
	3997	        /* Here, there isn't an end pointer, or it is beyond the goal.  We
	3998	         * only go as far as the goal */
	3999	        e1 = g1;
	4000	    }
	4001	    else {
	4002	        assert(e1); /* Must have an end for looking at s1 */
	4003	    }
	4004
	4005	    /* Same for goal for s2 */
	4006	    if (g2) {
	4007	        assert(! e2 || e2 >= g2);
	4008	        e2 = g2;
	4009	    }
	4010	    else {
	4011	        assert(e2);
	4012	    }
	4013
	4014	    /* If both operands are already folded, we could just do a memEQ on the
	4015	     * whole strings at once, but it would be better if the caller realized
	4016	     * this and didn't even call us */
	4017
	4018	    /* Look through both strings, a character at a time */
	4019	    while (p1 < e1 && p2 < e2) {
	4020
	4021	        /* If at the beginning of a new character in s1, get its fold to use
	4022	         * and the length of the fold.  (exception: locale rules just get the
	4023	         * character to a single byte) */
	4024	        if (n1 == 0) {
	4025	            if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
	4026	                f1 = (U8 *) p1;
	4027	                n1 = UTF8SKIP(f1);
	4028	            }
	4029
	4030	            else {
	4031	                /* If in locale matching, we use two sets of rules, depending
	4032	                 * on if the code point is above or below 255.  Here, we test
	4033	                 * for and handle locale rules */
	4034	                if ((flags & FOLDEQ_UTF8_LOCALE)
	4035	                    && (! u1 || UTF8_IS_INVARIANT(*p1)
	4036	                        || UTF8_IS_DOWNGRADEABLE_START(*p1)))
	4037	                {
	4038	                    /* There is no mixing of code points above and below 255. */
	4039	                    if (u2 && (! UTF8_IS_INVARIANT(*p2)
	4040	                               && ! UTF8_IS_DOWNGRADEABLE_START(*p2)))
	4041	                    {
	4042	                        return 0;
	4043	                    }
	4044
	4045	                    /* We handle locale rules by converting, if necessary, the
	4046	                     * code point to a single byte. */
	4047	                    if (! u1 || UTF8_IS_INVARIANT(*p1)) {
	4048	                        *foldbuf1 = *p1;
	4049	                    }
	4050	                    else {
	4051	                        *foldbuf1 = TWO_BYTE_UTF8_TO_UNI(*p1, *(p1 + 1));
	4052	                    }
	4053	                    n1 = 1;
	4054	                }
	4055	                else if (isASCII(*p1)) { /* Note, that here won't be both
	4056	                                            ASCII and using locale rules */
	4057
	4058	                    /* If trying to mix non- with ASCII, and not supposed to,
	4059	                     * fail */
	4060	                    if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
	4061	                        return 0;
	4062	                    }
	4063	                    n1 = 1;
	4064	                    *foldbuf1 = toLOWER(*p1); /* Folds in the ASCII range are
	4065	                                                 just lowercased */
	4066	                }
	4067	                else if (u1) {
	4068	                    to_utf8_fold(p1, foldbuf1, &n1);
	4069	                }
	4070	                else { /* Not utf8, get utf8 fold */
	4071	                    to_uni_fold(NATIVE_TO_UNI(*p1), foldbuf1, &n1);
	4072	                }
	4073	                f1 = foldbuf1;
	4074	            }
	4075	        }
	4076
	4077	        if (n2 == 0) { /* Same for s2 */
	4078	            if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
	4079	                f2 = (U8 *) p2;
	4080	                n2 = UTF8SKIP(f2);
	4081	            }
	4082	            else {
	4083	                if ((flags & FOLDEQ_UTF8_LOCALE)
	4084	                    && (! u2 || UTF8_IS_INVARIANT(*p2) || UTF8_IS_DOWNGRADEABLE_START(*p2)))
	4085	                {
	4086	                    /* Here, the next char in s2 is < 256.  We've already
	4087	                     * worked on s1, and if it isn't also < 256, can't match */
	4088	                    if (u1 && (! UTF8_IS_INVARIANT(*p1)
	4089	                               && ! UTF8_IS_DOWNGRADEABLE_START(*p1)))
	4090	                    {
	4091	                        return 0;
	4092	                    }
	4093	                    if (! u2 || UTF8_IS_INVARIANT(*p2)) {
	4094	                        *foldbuf2 = *p2;
	4095	                    }
	4096	                    else {
	4097	                        *foldbuf2 = TWO_BYTE_UTF8_TO_UNI(*p2, *(p2 + 1));
	4098	                    }
	4099
	4100	                    /* Use another function to handle locale rules.  We've made
	4101	                     * sure that both characters to compare are single bytes */
	4102	                    if (! foldEQ_locale((char *) f1, (char *) foldbuf2, 1)) {
	4103	                        return 0;
	4104	                    }
	4105	                    n1 = n2 = 0;
	4106	                }
	4107	                else if (isASCII(*p2)) {
	4108	                    if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
	4109	                        return 0;
	4110	                    }
	4111	                    n2 = 1;
	4112	                    *foldbuf2 = toLOWER(*p2);
	4113	                }
	4114	                else if (u2) {
	4115	                    to_utf8_fold(p2, foldbuf2, &n2);
	4116	                }
	4117	                else {
	4118	                    to_uni_fold(NATIVE_TO_UNI(*p2), foldbuf2, &n2);
	4119	                }
	4120	                f2 = foldbuf2;
	4121	            }
	4122	        }
	4123
	4124	        /* Here f1 and f2 point to the beginning of the strings to compare.
	4125	         * These strings are the folds of the next character from each input
	4126	         * string, stored in utf8. */
	4127
	4128	        /* While there is more to look for in both folds, see if they
	4129	         * continue to match */
	4130	        while (n1 && n2) {
	4131	            U8 fold_length = UTF8SKIP(f1);
	4132	            if (fold_length != UTF8SKIP(f2)
	4133	                || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
	4134	                                                       function call for single
	4135	                                                       byte */
	4136	                || memNE((char*)f1, (char*)f2, fold_length))
	4137	            {
	4138	                return 0; /* mismatch */
	4139	            }
	4140
	4141	            /* Here, they matched, advance past them */
	4142	            n1 -= fold_length;
	4143	            f1 += fold_length;
	4144	            n2 -= fold_length;
	4145	            f2 += fold_length;
	4146	        }
	4147
	4148	        /* When reach the end of any fold, advance the input past it */
	4149	        if (n1 == 0) {
	4150	            p1 += u1 ? UTF8SKIP(p1) : 1;
	4151	        }
	4152	        if (n2 == 0) {
	4153	            p2 += u2 ? UTF8SKIP(p2) : 1;
	4154	        }
	4155	    } /* End of loop through both strings */
	4156
	4157	    /* A match is defined by each scan that specified an explicit length
	4158	     * reaching its final goal, and the other not having matched a partial
	4159	     * character (which can happen when the fold of a character is more than one
	4160	     * character). */
	4161	    if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
	4162	        return 0;
	4163	    }
	4164
	4165	    /* Successful match.  Set output pointers */
	4166	    if (pe1) {
	4167	        *pe1 = (char*)p1;
	4168	    }
	4169	    if (pe2) {
	4170	        *pe2 = (char*)p2;
	4171	    }
	4172	    return 1;
	4173	}
	4174
	4175	/*
	4176	* Local variables:
	4177	* c-indentation-style: bsd
	4178	* c-basic-offset: 4
	4179	* indent-tabs-mode: t
	4180	* End:
	4181	*
	4182	* ex: set ts=8 sts=4 sw=4 noet:
	4183	*/