perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (c) 1998-2001, Larry Wall
	4	*
	5	* You may distribute under the terms of either the GNU General Public
	6	* License or the Artistic License, as specified in the README file.
	7	*
	8	*/
	9
	10	/*
	11	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	12	* heard of that we don't want to see any closer; and that's the one place
	13	* we're trying to get to! And that's just where we can't get, nohow.'
	14	*
	15	* 'Well do I understand your speech,' he answered in the same language;
	16	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	17	* as is the custom in the West, if you wish to be answered?'
	18	*
	19	* ...the travellers perceived that the floor was paved with stones of many
	20	* hues; branching runes and strange devices intertwined beneath their feet.
	21	*/
	22
	23	#include "EXTERN.h"
	24	#define PERL_IN_UTF8_C
	25	#include "perl.h"
	26
	27	/* Unicode support */
	28
	29	/*
	30	=for apidoc A\|U8 \|uvuni_to_utf8\|U8 d\|UV uv
	31
	32	Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
	33	of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
	34	bytes available. The return value is the pointer to the byte after the
	35	end of the new character. In other words,
	36
	37	d = uvuni_to_utf8(d, uv);
	38
	39	is the recommended Unicode-aware way of saying
	40
	41	*(d++) = uv;
	42
	43	=cut
	44	*/
	45
	46	U8 *
	47	Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
	48	{
	49	if (UNI_IS_INVARIANT(uv)) {
	50	*d++ = UTF_TO_NATIVE(uv);
	51	return d;
	52	}
	53	#if defined(EBCDIC) \|\| 1 /* always for testing */
	54	else {
	55	STRLEN len = UNISKIP(uv);
	56	U8 *p = d+len-1;
	57	while (p > d) {
	58	*p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) \| UTF_CONTINUATION_MARK);
	59	uv >>= UTF_ACCUMULATION_SHIFT;
	60	}
	61	*p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) \| UTF_START_MARK(len));
	62	return d+len;
	63	}
	64	#else /* Non loop style */
	65	if (uv < 0x800) {
	66	*d++ = (( uv >> 6) \| 0xc0);
	67	*d++ = (( uv & 0x3f) \| 0x80);
	68	return d;
	69	}
	70	if (uv < 0x10000) {
	71	*d++ = (( uv >> 12) \| 0xe0);
	72	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	73	*d++ = (( uv & 0x3f) \| 0x80);
	74	return d;
	75	}
	76	if (uv < 0x200000) {
	77	*d++ = (( uv >> 18) \| 0xf0);
	78	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	79	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	80	*d++ = (( uv & 0x3f) \| 0x80);
	81	return d;
	82	}
	83	if (uv < 0x4000000) {
	84	*d++ = (( uv >> 24) \| 0xf8);
	85	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	86	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	87	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	88	*d++ = (( uv & 0x3f) \| 0x80);
	89	return d;
	90	}
	91	if (uv < 0x80000000) {
	92	*d++ = (( uv >> 30) \| 0xfc);
	93	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
	94	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	95	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	96	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	97	*d++ = (( uv & 0x3f) \| 0x80);
	98	return d;
	99	}
	100	#ifdef HAS_QUAD
	101	if (uv < UTF8_QUAD_MAX)
	102	#endif
	103	{
	104	d++ = 0xfe; / Can't match U+FEFF! */
	105	*d++ = (((uv >> 30) & 0x3f) \| 0x80);
	106	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
	107	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	108	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	109	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	110	*d++ = (( uv & 0x3f) \| 0x80);
	111	return d;
	112	}
	113	#ifdef HAS_QUAD
	114	{
	115	d++ = 0xff; / Can't match U+FFFE! */
	116	d++ = 0x80; / 6 Reserved bits */
	117	d++ = (((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	118	*d++ = (((uv >> 54) & 0x3f) \| 0x80);
	119	*d++ = (((uv >> 48) & 0x3f) \| 0x80);
	120	*d++ = (((uv >> 42) & 0x3f) \| 0x80);
	121	*d++ = (((uv >> 36) & 0x3f) \| 0x80);
	122	*d++ = (((uv >> 30) & 0x3f) \| 0x80);
	123	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
	124	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	125	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	126	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	127	*d++ = (( uv & 0x3f) \| 0x80);
	128	return d;
	129	}
	130	#endif
	131	#endif /* Loop style */
	132	}
	133
	134
	135
	136	/*
	137	=for apidoc A\|STRLEN\|is_utf8_char\|U8 *s
	138
	139	Tests if some arbitrary number of bytes begins in a valid UTF-8
	140	character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
	141	The actual number of bytes in the UTF-8 character will be returned if
	142	it is valid, otherwise 0.
	143
	144	=cut
	145	*/
	146	STRLEN
	147	Perl_is_utf8_char(pTHX_ U8 *s)
	148	{
	149	U8 u = *s;
	150	STRLEN slen, len;
	151	UV uv, ouv;
	152
	153	if (UTF8_IS_INVARIANT(u))
	154	return 1;
	155
	156	if (!UTF8_IS_START(u))
	157	return 0;
	158
	159	len = UTF8SKIP(s);
	160
	161	if (len < 2 \|\| !UTF8_IS_CONTINUATION(s[1]))
	162	return 0;
	163
	164	slen = len - 1;
	165	s++;
	166	u &= UTF_START_MASK(len);
	167	uv = u;
	168	ouv = uv;
	169	while (slen--) {
	170	if (!UTF8_IS_CONTINUATION(*s))
	171	return 0;
	172	uv = UTF8_ACCUMULATE(uv, *s);
	173	if (uv < ouv)
	174	return 0;
	175	ouv = uv;
	176	s++;
	177	}
	178
	179	if (UNISKIP(uv) < len)
	180	return 0;
	181
	182	return len;
	183	}
	184
	185	/*
	186	=for apidoc A\|bool\|is_utf8_string\|U8 *s\|STRLEN len
	187
	188	Returns true if first C<len> bytes of the given string form a valid UTF8
	189	string, false otherwise. Note that 'a valid UTF8 string' does not mean
	190	'a string that contains UTF8' because a valid ASCII string is a valid
	191	UTF8 string.
	192
	193	=cut
	194	*/
	195
	196	bool
	197	Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
	198	{
	199	U8* x = s;
	200	U8* send;
	201	STRLEN c;
	202
	203	if (!len)
	204	len = strlen((char *)s);
	205	send = s + len;
	206
	207	while (x < send) {
	208	c = is_utf8_char(x);
	209	if (!c)
	210	return FALSE;
	211	x += c;
	212	}
	213	if (x != send)
	214	return FALSE;
	215
	216	return TRUE;
	217	}
	218
	219	/*
	220	=for apidoc A\|UV\|utf8n_to_uvuni\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
	221
	222	Bottom level UTF-8 decode routine.
	223	Returns the unicode code point value of the first character in the string C<s>
	224	which is assumed to be in UTF8 encoding and no longer than C<curlen>;
	225	C<retlen> will be set to the length, in bytes, of that character.
	226
	227	If C<s> does not point to a well-formed UTF8 character, the behaviour
	228	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
	229	it is assumed that the caller will raise a warning, and this function
	230	will silently just set C<retlen> to C<-1> and return zero. If the
	231	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
	232	malformations will be given, C<retlen> will be set to the expected
	233	length of the UTF-8 character in bytes, and zero will be returned.
	234
	235	The C<flags> can also contain various flags to allow deviations from
	236	the strict UTF-8 encoding (see F<utf8.h>).
	237
	238	Most code should use utf8_to_uvchr() rather than call this directly.
	239
	240	=cut
	241	*/
	242
	243	UV
	244	Perl_utf8n_to_uvuni(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	245	{
	246	UV uv = *s, ouv = 0;
	247	STRLEN len = 1;
	248	bool dowarn = ckWARN_d(WARN_UTF8);
	249	STRLEN expectlen = 0;
	250	U32 warning = 0;
	251
	252	/* This list is a superset of the UTF8_ALLOW_XXX. */
	253
	254	#define UTF8_WARN_EMPTY 1
	255	#define UTF8_WARN_CONTINUATION 2
	256	#define UTF8_WARN_NON_CONTINUATION 3
	257	#define UTF8_WARN_FE_FF 4
	258	#define UTF8_WARN_SHORT 5
	259	#define UTF8_WARN_OVERFLOW 6
	260	#define UTF8_WARN_SURROGATE 7
	261	#define UTF8_WARN_BOM 8
	262	#define UTF8_WARN_LONG 9
	263	#define UTF8_WARN_FFFF 10
	264
	265	if (curlen == 0 &&
	266	!(flags & UTF8_ALLOW_EMPTY)) {
	267	warning = UTF8_WARN_EMPTY;
	268	goto malformed;
	269	}
	270
	271	if (UTF8_IS_INVARIANT(uv)) {
	272	if (retlen)
	273	*retlen = 1;
	274	return (UV) (NATIVE_TO_UTF(*s));
	275	}
	276
	277	if (UTF8_IS_CONTINUATION(uv) &&
	278	!(flags & UTF8_ALLOW_CONTINUATION)) {
	279	warning = UTF8_WARN_CONTINUATION;
	280	goto malformed;
	281	}
	282
	283	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	284	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	285	warning = UTF8_WARN_NON_CONTINUATION;
	286	goto malformed;
	287	}
	288
	289	#ifdef EBCDIC
	290	uv = NATIVE_TO_UTF(uv);
	291	#else
	292	if ((uv == 0xfe \|\| uv == 0xff) &&
	293	!(flags & UTF8_ALLOW_FE_FF)) {
	294	warning = UTF8_WARN_FE_FF;
	295	goto malformed;
	296	}
	297	#endif
	298
	299	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	300	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	301	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	302	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	303	#ifdef EBCDIC
	304	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	305	else { len = 7; uv &= 0x01; }
	306	#else
	307	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	308	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	309	else { len = 13; uv = 0; } /* whoa! */
	310	#endif
	311
	312	if (retlen)
	313	*retlen = len;
	314
	315	expectlen = len;
	316
	317	if ((curlen < expectlen) &&
	318	!(flags & UTF8_ALLOW_SHORT)) {
	319	warning = UTF8_WARN_SHORT;
	320	goto malformed;
	321	}
	322
	323	len--;
	324	s++;
	325	ouv = uv;
	326
	327	while (len--) {
	328	if (!UTF8_IS_CONTINUATION(*s) &&
	329	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	330	s--;
	331	warning = UTF8_WARN_NON_CONTINUATION;
	332	goto malformed;
	333	}
	334	else
	335	uv = UTF8_ACCUMULATE(uv, *s);
	336	if (!(uv > ouv)) {
	337	/* These cannot be allowed. */
	338	if (uv == ouv) {
	339	if (!(flags & UTF8_ALLOW_LONG)) {
	340	warning = UTF8_WARN_LONG;
	341	goto malformed;
	342	}
	343	}
	344	else { /* uv < ouv */
	345	/* This cannot be allowed. */
	346	warning = UTF8_WARN_OVERFLOW;
	347	goto malformed;
	348	}
	349	}
	350	s++;
	351	ouv = uv;
	352	}
	353
	354	if (UNICODE_IS_SURROGATE(uv) &&
	355	!(flags & UTF8_ALLOW_SURROGATE)) {
	356	warning = UTF8_WARN_SURROGATE;
	357	goto malformed;
	358	} else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
	359	!(flags & UTF8_ALLOW_BOM)) {
	360	warning = UTF8_WARN_BOM;
	361	goto malformed;
	362	} else if ((expectlen > UNISKIP(uv)) &&
	363	!(flags & UTF8_ALLOW_LONG)) {
	364	warning = UTF8_WARN_LONG;
	365	goto malformed;
	366	} else if (UNICODE_IS_ILLEGAL(uv) &&
	367	!(flags & UTF8_ALLOW_FFFF)) {
	368	warning = UTF8_WARN_FFFF;
	369	goto malformed;
	370	}
	371
	372	return uv;
	373
	374	malformed:
	375
	376	if (flags & UTF8_CHECK_ONLY) {
	377	if (retlen)
	378	*retlen = -1;
	379	return 0;
	380	}
	381
	382	if (dowarn) {
	383	SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
	384
	385	switch (warning) {
	386	case 0: /* Intentionally empty. */ break;
	387	case UTF8_WARN_EMPTY:
	388	Perl_sv_catpvf(aTHX_ sv, "(empty string)");
	389	break;
	390	case UTF8_WARN_CONTINUATION:
	391	Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
	392	break;
	393	case UTF8_WARN_NON_CONTINUATION:
	394	Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
	395	(UV)s[1], uv);
	396	break;
	397	case UTF8_WARN_FE_FF:
	398	Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
	399	break;
	400	case UTF8_WARN_SHORT:
	401	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
	402	curlen, curlen == 1 ? "" : "s", expectlen);
	403	break;
	404	case UTF8_WARN_OVERFLOW:
	405	Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
	406	ouv, *s);
	407	break;
	408	case UTF8_WARN_SURROGATE:
	409	Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
	410	break;
	411	case UTF8_WARN_BOM:
	412	Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
	413	break;
	414	case UTF8_WARN_LONG:
	415	Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
	416	expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
	417	break;
	418	case UTF8_WARN_FFFF:
	419	Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
	420	break;
	421	default:
	422	Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
	423	break;
	424	}
	425
	426	if (warning) {
	427	char *s = SvPVX(sv);
	428
	429	if (PL_op)
	430	Perl_warner(aTHX_ WARN_UTF8,
	431	"%s in %s", s, PL_op_desc[PL_op->op_type]);
	432	else
	433	Perl_warner(aTHX_ WARN_UTF8, "%s", s);
	434	}
	435	}
	436
	437	if (retlen)
	438	*retlen = expectlen ? expectlen : len;
	439
	440	return 0;
	441	}
	442
	443	/*
	444	=for apidoc A\|UV\|utf8_to_uvchr\|U8 s\|STRLEN retlen
	445
	446	Returns the native character value of the first character in the string C<s>
	447	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
	448	length, in bytes, of that character.
	449
	450	If C<s> does not point to a well-formed UTF8 character, zero is
	451	returned and retlen is set, if possible, to -1.
	452
	453	=cut
	454	*/
	455
	456	UV
	457	Perl_utf8_to_uvchr(pTHX_ U8 s, STRLEN retlen)
	458	{
	459	return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
	460	}
	461
	462	/*
	463	=for apidoc A\|UV\|utf8_to_uvuni\|U8 s\|STRLEN retlen
	464
	465	Returns the Unicode code point of the first character in the string C<s>
	466	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
	467	length, in bytes, of that character.
	468
	469	This function should only be used when returned UV is considered
	470	an index into the Unicode semantic tables (e.g. swashes).
	471
	472	If C<s> does not point to a well-formed UTF8 character, zero is
	473	returned and retlen is set, if possible, to -1.
	474
	475	=cut
	476	*/
	477
	478	UV
	479	Perl_utf8_to_uvuni(pTHX_ U8 s, STRLEN retlen)
	480	{
	481	/* Call the low level routine asking for checks */
	482	return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
	483	}
	484
	485	/*
	486	=for apidoc A\|STRLEN\|utf8_length\|U8 s\|U8 e
	487
	488	Return the length of the UTF-8 char encoded string C<s> in characters.
	489	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	490	up past C<e>, croaks.
	491
	492	=cut
	493	*/
	494
	495	STRLEN
	496	Perl_utf8_length(pTHX_ U8 s, U8 e)
	497	{
	498	STRLEN len = 0;
	499
	500	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	501	* the bitops (especially ~) can create illegal UTF-8.
	502	* In other words: in Perl UTF-8 is not just for Unicode. */
	503
	504	if (e < s)
	505	Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
	506	while (s < e) {
	507	U8 t = UTF8SKIP(s);
	508
	509	if (e - s < t)
	510	Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
	511	s += t;
	512	len++;
	513	}
	514
	515	return len;
	516	}
	517
	518	/*
	519	=for apidoc A\|IV\|utf8_distance\|U8 a\|U8 b
	520
	521	Returns the number of UTF8 characters between the UTF-8 pointers C<a>
	522	and C<b>.
	523
	524	WARNING: use only if you know that the pointers point inside the
	525	same UTF-8 buffer.
	526
	527	=cut
	528	*/
	529
	530	IV
	531	Perl_utf8_distance(pTHX_ U8 a, U8 b)
	532	{
	533	IV off = 0;
	534
	535	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	536	* the bitops (especially ~) can create illegal UTF-8.
	537	* In other words: in Perl UTF-8 is not just for Unicode. */
	538
	539	if (a < b) {
	540	while (a < b) {
	541	U8 c = UTF8SKIP(a);
	542
	543	if (b - a < c)
	544	Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
	545	a += c;
	546	off--;
	547	}
	548	}
	549	else {
	550	while (b < a) {
	551	U8 c = UTF8SKIP(b);
	552
	553	if (a - b < c)
	554	Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
	555	b += c;
	556	off++;
	557	}
	558	}
	559
	560	return off;
	561	}
	562
	563	/*
	564	=for apidoc A\|U8 \|utf8_hop\|U8 s\|I32 off
	565
	566	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	567	forward or backward.
	568
	569	WARNING: do not use the following unless you know C<off> is within
	570	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	571	on the first byte of character or just after the last byte of a character.
	572
	573	=cut
	574	*/
	575
	576	U8 *
	577	Perl_utf8_hop(pTHX_ U8 *s, I32 off)
	578	{
	579	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	580	* the bitops (especially ~) can create illegal UTF-8.
	581	* In other words: in Perl UTF-8 is not just for Unicode. */
	582
	583	if (off >= 0) {
	584	while (off--)
	585	s += UTF8SKIP(s);
	586	}
	587	else {
	588	while (off++) {
	589	s--;
	590	while (UTF8_IS_CONTINUATION(*s))
	591	s--;
	592	}
	593	}
	594	return s;
	595	}
	596
	597	/*
	598	=for apidoc A\|U8 \|utf8_to_bytes\|U8 s\|STRLEN *len
	599
	600	Converts a string C<s> of length C<len> from UTF8 into byte encoding.
	601	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	602	updates len to contain the new length.
	603	Returns zero on failure, setting C<len> to -1.
	604
	605	=cut
	606	*/
	607
	608	U8 *
	609	Perl_utf8_to_bytes(pTHX_ U8 s, STRLEN len)
	610	{
	611	U8 *send;
	612	U8 *d;
	613	U8 *save = s;
	614
	615	/* ensure valid UTF8 and chars < 256 before updating string */
	616	for (send = s + *len; s < send; ) {
	617	U8 c = *s++;
	618
	619	if (!UTF8_IS_INVARIANT(c) &&
	620	(!UTF8_IS_DOWNGRADEABLE_START(c) \|\| (s >= send)
	621	\|\| !(c = *s++) \|\| !UTF8_IS_CONTINUATION(c))) {
	622	*len = -1;
	623	return 0;
	624	}
	625	}
	626
	627	d = s = save;
	628	while (s < send) {
	629	STRLEN ulen;
	630	*d++ = (U8)utf8_to_uvchr(s, &ulen);
	631	s += ulen;
	632	}
	633	*d = '\0';
	634	*len = d - save;
	635	return save;
	636	}
	637
	638	/*
	639	=for apidoc A\|U8 \|bytes_from_utf8\|U8 s\|STRLEN len\|bool is_utf8
	640
	641	Converts a string C<s> of length C<len> from UTF8 into byte encoding.
	642	Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
	643	the newly-created string, and updates C<len> to contain the new
	644	length. Returns the original string if no conversion occurs, C<len>
	645	is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
	646	0 if C<s> is converted or contains all 7bit characters.
	647
	648	=cut
	649	*/
	650
	651	U8 *
	652	Perl_bytes_from_utf8(pTHX_ U8 s, STRLEN len, bool *is_utf8)
	653	{
	654	U8 *d;
	655	U8 *start = s;
	656	U8 *send;
	657	I32 count = 0;
	658
	659	if (!*is_utf8)
	660	return start;
	661
	662	/* ensure valid UTF8 and chars < 256 before converting string */
	663	for (send = s + *len; s < send;) {
	664	U8 c = *s++;
	665	if (!UTF8_IS_INVARIANT(c)) {
	666	if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
	667	(c = *s++) && UTF8_IS_CONTINUATION(c))
	668	count++;
	669	else
	670	return start;
	671	}
	672	}
	673
	674	*is_utf8 = 0;
	675
	676	Newz(801, d, (*len) - count + 1, U8);
	677	s = start; start = d;
	678	while (s < send) {
	679	U8 c = *s++;
	680	if (!UTF8_IS_INVARIANT(c)) {
	681	/* Then it is two-byte encoded */
	682	c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
	683	c = ASCII_TO_NATIVE(c);
	684	}
	685	*d++ = c;
	686	}
	687	*d = '\0';
	688	*len = d - start;
	689	return start;
	690	}
	691
	692	/*
	693	=for apidoc A\|U8 \|bytes_to_utf8\|U8 s\|STRLEN *len
	694
	695	Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
	696	Returns a pointer to the newly-created string, and sets C<len> to
	697	reflect the new length.
	698
	699	=cut
	700	*/
	701
	702	U8*
	703	Perl_bytes_to_utf8(pTHX_ U8 s, STRLEN len)
	704	{
	705	U8 *send;
	706	U8 *d;
	707	U8 *dst;
	708	send = s + (*len);
	709
	710	Newz(801, d, (len) 2 + 1, U8);
	711	dst = d;
	712
	713	while (s < send) {
	714	UV uv = NATIVE_TO_ASCII(*s++);
	715	if (UNI_IS_INVARIANT(uv))
	716	*d++ = UTF_TO_NATIVE(uv);
	717	else {
	718	*d++ = UTF8_EIGHT_BIT_HI(uv);
	719	*d++ = UTF8_EIGHT_BIT_LO(uv);
	720	}
	721	}
	722	*d = '\0';
	723	*len = d-dst;
	724	return dst;
	725	}
	726
	727	/*
	728	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	729	*
	730	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	731	* We optimize for native, for obvious reasons. */
	732
	733	U8*
	734	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	735	{
	736	U8* pend;
	737	U8* dstart = d;
	738
	739	if (bytelen & 1)
	740	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
	741
	742	pend = p + bytelen;
	743
	744	while (p < pend) {
	745	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	746	p += 2;
	747	if (uv < 0x80) {
	748	*d++ = uv;
	749	continue;
	750	}
	751	if (uv < 0x800) {
	752	*d++ = (( uv >> 6) \| 0xc0);
	753	*d++ = (( uv & 0x3f) \| 0x80);
	754	continue;
	755	}
	756	if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
	757	UV low = *p++;
	758	if (low < 0xdc00 \|\| low >= 0xdfff)
	759	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	760	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	761	}
	762	if (uv < 0x10000) {
	763	*d++ = (( uv >> 12) \| 0xe0);
	764	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	765	*d++ = (( uv & 0x3f) \| 0x80);
	766	continue;
	767	}
	768	else {
	769	*d++ = (( uv >> 18) \| 0xf0);
	770	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	771	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	772	*d++ = (( uv & 0x3f) \| 0x80);
	773	continue;
	774	}
	775	}
	776	*newlen = d - dstart;
	777	return d;
	778	}
	779
	780	/* Note: this one is slightly destructive of the source. */
	781
	782	U8*
	783	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	784	{
	785	U8* s = (U8*)p;
	786	U8* send = s + bytelen;
	787	while (s < send) {
	788	U8 tmp = s[0];
	789	s[0] = s[1];
	790	s[1] = tmp;
	791	s += 2;
	792	}
	793	return utf16_to_utf8(p, d, bytelen, newlen);
	794	}
	795
	796	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	797
	798	bool
	799	Perl_is_uni_alnum(pTHX_ U32 c)
	800	{
	801	U8 tmpbuf[UTF8_MAXLEN+1];
	802	uvchr_to_utf8(tmpbuf, (UV)c);
	803	return is_utf8_alnum(tmpbuf);
	804	}
	805
	806	bool
	807	Perl_is_uni_alnumc(pTHX_ U32 c)
	808	{
	809	U8 tmpbuf[UTF8_MAXLEN+1];
	810	uvchr_to_utf8(tmpbuf, (UV)c);
	811	return is_utf8_alnumc(tmpbuf);
	812	}
	813
	814	bool
	815	Perl_is_uni_idfirst(pTHX_ U32 c)
	816	{
	817	U8 tmpbuf[UTF8_MAXLEN+1];
	818	uvchr_to_utf8(tmpbuf, (UV)c);
	819	return is_utf8_idfirst(tmpbuf);
	820	}
	821
	822	bool
	823	Perl_is_uni_alpha(pTHX_ U32 c)
	824	{
	825	U8 tmpbuf[UTF8_MAXLEN+1];
	826	uvchr_to_utf8(tmpbuf, (UV)c);
	827	return is_utf8_alpha(tmpbuf);
	828	}
	829
	830	bool
	831	Perl_is_uni_ascii(pTHX_ U32 c)
	832	{
	833	U8 tmpbuf[UTF8_MAXLEN+1];
	834	uvchr_to_utf8(tmpbuf, (UV)c);
	835	return is_utf8_ascii(tmpbuf);
	836	}
	837
	838	bool
	839	Perl_is_uni_space(pTHX_ U32 c)
	840	{
	841	U8 tmpbuf[UTF8_MAXLEN+1];
	842	uvchr_to_utf8(tmpbuf, (UV)c);
	843	return is_utf8_space(tmpbuf);
	844	}
	845
	846	bool
	847	Perl_is_uni_digit(pTHX_ U32 c)
	848	{
	849	U8 tmpbuf[UTF8_MAXLEN+1];
	850	uvchr_to_utf8(tmpbuf, (UV)c);
	851	return is_utf8_digit(tmpbuf);
	852	}
	853
	854	bool
	855	Perl_is_uni_upper(pTHX_ U32 c)
	856	{
	857	U8 tmpbuf[UTF8_MAXLEN+1];
	858	uvchr_to_utf8(tmpbuf, (UV)c);
	859	return is_utf8_upper(tmpbuf);
	860	}
	861
	862	bool
	863	Perl_is_uni_lower(pTHX_ U32 c)
	864	{
	865	U8 tmpbuf[UTF8_MAXLEN+1];
	866	uvchr_to_utf8(tmpbuf, (UV)c);
	867	return is_utf8_lower(tmpbuf);
	868	}
	869
	870	bool
	871	Perl_is_uni_cntrl(pTHX_ U32 c)
	872	{
	873	U8 tmpbuf[UTF8_MAXLEN+1];
	874	uvchr_to_utf8(tmpbuf, (UV)c);
	875	return is_utf8_cntrl(tmpbuf);
	876	}
	877
	878	bool
	879	Perl_is_uni_graph(pTHX_ U32 c)
	880	{
	881	U8 tmpbuf[UTF8_MAXLEN+1];
	882	uvchr_to_utf8(tmpbuf, (UV)c);
	883	return is_utf8_graph(tmpbuf);
	884	}
	885
	886	bool
	887	Perl_is_uni_print(pTHX_ U32 c)
	888	{
	889	U8 tmpbuf[UTF8_MAXLEN+1];
	890	uvchr_to_utf8(tmpbuf, (UV)c);
	891	return is_utf8_print(tmpbuf);
	892	}
	893
	894	bool
	895	Perl_is_uni_punct(pTHX_ U32 c)
	896	{
	897	U8 tmpbuf[UTF8_MAXLEN+1];
	898	uvchr_to_utf8(tmpbuf, (UV)c);
	899	return is_utf8_punct(tmpbuf);
	900	}
	901
	902	bool
	903	Perl_is_uni_xdigit(pTHX_ U32 c)
	904	{
	905	U8 tmpbuf[UTF8_MAXLEN+1];
	906	uvchr_to_utf8(tmpbuf, (UV)c);
	907	return is_utf8_xdigit(tmpbuf);
	908	}
	909
	910	U32
	911	Perl_to_uni_upper(pTHX_ U32 c)
	912	{
	913	U8 tmpbuf[UTF8_MAXLEN+1];
	914	uvchr_to_utf8(tmpbuf, (UV)c);
	915	return to_utf8_upper(tmpbuf);
	916	}
	917
	918	U32
	919	Perl_to_uni_title(pTHX_ U32 c)
	920	{
	921	U8 tmpbuf[UTF8_MAXLEN+1];
	922	uvchr_to_utf8(tmpbuf, (UV)c);
	923	return to_utf8_title(tmpbuf);
	924	}
	925
	926	U32
	927	Perl_to_uni_lower(pTHX_ U32 c)
	928	{
	929	U8 tmpbuf[UTF8_MAXLEN+1];
	930	uvchr_to_utf8(tmpbuf, (UV)c);
	931	return to_utf8_lower(tmpbuf);
	932	}
	933
	934	/* for now these all assume no locale info available for Unicode > 255 */
	935
	936	bool
	937	Perl_is_uni_alnum_lc(pTHX_ U32 c)
	938	{
	939	return is_uni_alnum(c); /* XXX no locale support yet */
	940	}
	941
	942	bool
	943	Perl_is_uni_alnumc_lc(pTHX_ U32 c)
	944	{
	945	return is_uni_alnumc(c); /* XXX no locale support yet */
	946	}
	947
	948	bool
	949	Perl_is_uni_idfirst_lc(pTHX_ U32 c)
	950	{
	951	return is_uni_idfirst(c); /* XXX no locale support yet */
	952	}
	953
	954	bool
	955	Perl_is_uni_alpha_lc(pTHX_ U32 c)
	956	{
	957	return is_uni_alpha(c); /* XXX no locale support yet */
	958	}
	959
	960	bool
	961	Perl_is_uni_ascii_lc(pTHX_ U32 c)
	962	{
	963	return is_uni_ascii(c); /* XXX no locale support yet */
	964	}
	965
	966	bool
	967	Perl_is_uni_space_lc(pTHX_ U32 c)
	968	{
	969	return is_uni_space(c); /* XXX no locale support yet */
	970	}
	971
	972	bool
	973	Perl_is_uni_digit_lc(pTHX_ U32 c)
	974	{
	975	return is_uni_digit(c); /* XXX no locale support yet */
	976	}
	977
	978	bool
	979	Perl_is_uni_upper_lc(pTHX_ U32 c)
	980	{
	981	return is_uni_upper(c); /* XXX no locale support yet */
	982	}
	983
	984	bool
	985	Perl_is_uni_lower_lc(pTHX_ U32 c)
	986	{
	987	return is_uni_lower(c); /* XXX no locale support yet */
	988	}
	989
	990	bool
	991	Perl_is_uni_cntrl_lc(pTHX_ U32 c)
	992	{
	993	return is_uni_cntrl(c); /* XXX no locale support yet */
	994	}
	995
	996	bool
	997	Perl_is_uni_graph_lc(pTHX_ U32 c)
	998	{
	999	return is_uni_graph(c); /* XXX no locale support yet */
	1000	}
	1001
	1002	bool
	1003	Perl_is_uni_print_lc(pTHX_ U32 c)
	1004	{
	1005	return is_uni_print(c); /* XXX no locale support yet */
	1006	}
	1007
	1008	bool
	1009	Perl_is_uni_punct_lc(pTHX_ U32 c)
	1010	{
	1011	return is_uni_punct(c); /* XXX no locale support yet */
	1012	}
	1013
	1014	bool
	1015	Perl_is_uni_xdigit_lc(pTHX_ U32 c)
	1016	{
	1017	return is_uni_xdigit(c); /* XXX no locale support yet */
	1018	}
	1019
	1020	U32
	1021	Perl_to_uni_upper_lc(pTHX_ U32 c)
	1022	{
	1023	return to_uni_upper(c); /* XXX no locale support yet */
	1024	}
	1025
	1026	U32
	1027	Perl_to_uni_title_lc(pTHX_ U32 c)
	1028	{
	1029	return to_uni_title(c); /* XXX no locale support yet */
	1030	}
	1031
	1032	U32
	1033	Perl_to_uni_lower_lc(pTHX_ U32 c)
	1034	{
	1035	return to_uni_lower(c); /* XXX no locale support yet */
	1036	}
	1037
	1038	bool
	1039	Perl_is_utf8_alnum(pTHX_ U8 *p)
	1040	{
	1041	if (!is_utf8_char(p))
	1042	return FALSE;
	1043	if (!PL_utf8_alnum)
	1044	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	1045	* descendant of isalnum(3), in other words, it doesn't
	1046	* contain the '_'. --jhi */
	1047	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
	1048	return swash_fetch(PL_utf8_alnum, p, TRUE);
	1049	/* return p == '_' \|\| is_utf8_alpha(p) \|\| is_utf8_digit(p); /
	1050	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	1051	if (!PL_utf8_alnum)
	1052	PL_utf8_alnum = swash_init("utf8", "",
	1053	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	1054	return swash_fetch(PL_utf8_alnum, p, TRUE);
	1055	#endif
	1056	}
	1057
	1058	bool
	1059	Perl_is_utf8_alnumc(pTHX_ U8 *p)
	1060	{
	1061	if (!is_utf8_char(p))
	1062	return FALSE;
	1063	if (!PL_utf8_alnum)
	1064	PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
	1065	return swash_fetch(PL_utf8_alnum, p, TRUE);
	1066	/* return is_utf8_alpha(p) \|\| is_utf8_digit(p); */
	1067	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	1068	if (!PL_utf8_alnum)
	1069	PL_utf8_alnum = swash_init("utf8", "",
	1070	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	1071	return swash_fetch(PL_utf8_alnum, p, TRUE);
	1072	#endif
	1073	}
	1074
	1075	bool
	1076	Perl_is_utf8_idfirst(pTHX_ U8 *p)
	1077	{
	1078	return *p == '_' \|\| is_utf8_alpha(p);
	1079	}
	1080
	1081	bool
	1082	Perl_is_utf8_alpha(pTHX_ U8 *p)
	1083	{
	1084	if (!is_utf8_char(p))
	1085	return FALSE;
	1086	if (!PL_utf8_alpha)
	1087	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
	1088	return swash_fetch(PL_utf8_alpha, p, TRUE);
	1089	}
	1090
	1091	bool
	1092	Perl_is_utf8_ascii(pTHX_ U8 *p)
	1093	{
	1094	if (!is_utf8_char(p))
	1095	return FALSE;
	1096	if (!PL_utf8_ascii)
	1097	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
	1098	return swash_fetch(PL_utf8_ascii, p, TRUE);
	1099	}
	1100
	1101	bool
	1102	Perl_is_utf8_space(pTHX_ U8 *p)
	1103	{
	1104	if (!is_utf8_char(p))
	1105	return FALSE;
	1106	if (!PL_utf8_space)
	1107	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
	1108	return swash_fetch(PL_utf8_space, p, TRUE);
	1109	}
	1110
	1111	bool
	1112	Perl_is_utf8_digit(pTHX_ U8 *p)
	1113	{
	1114	if (!is_utf8_char(p))
	1115	return FALSE;
	1116	if (!PL_utf8_digit)
	1117	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
	1118	return swash_fetch(PL_utf8_digit, p, TRUE);
	1119	}
	1120
	1121	bool
	1122	Perl_is_utf8_upper(pTHX_ U8 *p)
	1123	{
	1124	if (!is_utf8_char(p))
	1125	return FALSE;
	1126	if (!PL_utf8_upper)
	1127	PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
	1128	return swash_fetch(PL_utf8_upper, p, TRUE);
	1129	}
	1130
	1131	bool
	1132	Perl_is_utf8_lower(pTHX_ U8 *p)
	1133	{
	1134	if (!is_utf8_char(p))
	1135	return FALSE;
	1136	if (!PL_utf8_lower)
	1137	PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
	1138	return swash_fetch(PL_utf8_lower, p, TRUE);
	1139	}
	1140
	1141	bool
	1142	Perl_is_utf8_cntrl(pTHX_ U8 *p)
	1143	{
	1144	if (!is_utf8_char(p))
	1145	return FALSE;
	1146	if (!PL_utf8_cntrl)
	1147	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
	1148	return swash_fetch(PL_utf8_cntrl, p, TRUE);
	1149	}
	1150
	1151	bool
	1152	Perl_is_utf8_graph(pTHX_ U8 *p)
	1153	{
	1154	if (!is_utf8_char(p))
	1155	return FALSE;
	1156	if (!PL_utf8_graph)
	1157	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
	1158	return swash_fetch(PL_utf8_graph, p, TRUE);
	1159	}
	1160
	1161	bool
	1162	Perl_is_utf8_print(pTHX_ U8 *p)
	1163	{
	1164	if (!is_utf8_char(p))
	1165	return FALSE;
	1166	if (!PL_utf8_print)
	1167	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
	1168	return swash_fetch(PL_utf8_print, p, TRUE);
	1169	}
	1170
	1171	bool
	1172	Perl_is_utf8_punct(pTHX_ U8 *p)
	1173	{
	1174	if (!is_utf8_char(p))
	1175	return FALSE;
	1176	if (!PL_utf8_punct)
	1177	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
	1178	return swash_fetch(PL_utf8_punct, p, TRUE);
	1179	}
	1180
	1181	bool
	1182	Perl_is_utf8_xdigit(pTHX_ U8 *p)
	1183	{
	1184	if (!is_utf8_char(p))
	1185	return FALSE;
	1186	if (!PL_utf8_xdigit)
	1187	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
	1188	return swash_fetch(PL_utf8_xdigit, p, TRUE);
	1189	}
	1190
	1191	bool
	1192	Perl_is_utf8_mark(pTHX_ U8 *p)
	1193	{
	1194	if (!is_utf8_char(p))
	1195	return FALSE;
	1196	if (!PL_utf8_mark)
	1197	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
	1198	return swash_fetch(PL_utf8_mark, p, TRUE);
	1199	}
	1200
	1201	UV
	1202	Perl_to_utf8_upper(pTHX_ U8 *p)
	1203	{
	1204	UV uv;
	1205
	1206	if (!PL_utf8_toupper)
	1207	PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
	1208	uv = swash_fetch(PL_utf8_toupper, p, TRUE);
	1209	return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
	1210	}
	1211
	1212	UV
	1213	Perl_to_utf8_title(pTHX_ U8 *p)
	1214	{
	1215	UV uv;
	1216
	1217	if (!PL_utf8_totitle)
	1218	PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
	1219	uv = swash_fetch(PL_utf8_totitle, p, TRUE);
	1220	return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
	1221	}
	1222
	1223	UV
	1224	Perl_to_utf8_lower(pTHX_ U8 *p)
	1225	{
	1226	UV uv;
	1227
	1228	if (!PL_utf8_tolower)
	1229	PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
	1230	uv = swash_fetch(PL_utf8_tolower, p, TRUE);
	1231	return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
	1232	}
	1233
	1234	/* a "swash" is a swatch hash */
	1235
	1236	SV*
	1237	Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
	1238	{
	1239	SV* retval;
	1240	SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
	1241	dSP;
	1242	HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
	1243	SV* errsv_save;
	1244
	1245	if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
	1246	ENTER;
	1247	errsv_save = newSVsv(ERRSV);
	1248	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
	1249	if (!SvTRUE(ERRSV))
	1250	sv_setsv(ERRSV, errsv_save);
	1251	SvREFCNT_dec(errsv_save);
	1252	LEAVE;
	1253	}
	1254	SPAGAIN;
	1255	PUSHSTACKi(PERLSI_MAGIC);
	1256	PUSHMARK(SP);
	1257	EXTEND(SP,5);
	1258	PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
	1259	PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
	1260	PUSHs(listsv);
	1261	PUSHs(sv_2mortal(newSViv(minbits)));
	1262	PUSHs(sv_2mortal(newSViv(none)));
	1263	PUTBACK;
	1264	ENTER;
	1265	SAVEI32(PL_hints);
	1266	PL_hints = 0;
	1267	save_re_context();
	1268	if (PL_curcop == &PL_compiling)
	1269	/* XXX ought to be handled by lex_start */
	1270	sv_setpv(tokenbufsv, PL_tokenbuf);
	1271	errsv_save = newSVsv(ERRSV);
	1272	if (call_method("SWASHNEW", G_SCALAR))
	1273	retval = newSVsv(*PL_stack_sp--);
	1274	else
	1275	retval = &PL_sv_undef;
	1276	if (!SvTRUE(ERRSV))
	1277	sv_setsv(ERRSV, errsv_save);
	1278	SvREFCNT_dec(errsv_save);
	1279	LEAVE;
	1280	POPSTACK;
	1281	if (PL_curcop == &PL_compiling) {
	1282	STRLEN len;
	1283	char* pv = SvPV(tokenbufsv, len);
	1284
	1285	Copy(pv, PL_tokenbuf, len+1, char);
	1286	PL_curcop->op_private = PL_hints;
	1287	}
	1288	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV)
	1289	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	1290	return retval;
	1291	}
	1292
	1293	UV
	1294	Perl_swash_fetch(pTHX_ SV sv, U8 ptr, bool do_utf8)
	1295	{
	1296	HV* hv = (HV*)SvRV(sv);
	1297	U32 klen;
	1298	U32 off;
	1299	STRLEN slen;
	1300	STRLEN needents;
	1301	U8 *tmps;
	1302	U32 bit;
	1303	SV *retval;
	1304	U8 tmputf8[2];
	1305	UV c = NATIVE_TO_ASCII(*ptr);
	1306
	1307	if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
	1308	tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
	1309	tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
	1310	ptr = tmputf8;
	1311	}
	1312	/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
	1313	* then the "swatch" is a vec() for al the chars which start
	1314	* with 0xAA..0xYY
	1315	* So the key in the hash (klen) is length of encoded char -1
	1316	*/
	1317	klen = UTF8SKIP(ptr) - 1;
	1318	off = ptr[klen];
	1319
	1320	if (klen == 0)
	1321	{
	1322	/* If char in invariant then swatch is for all the invariant chars
	1323	* In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
	1324	*/
	1325	needents = UTF_CONTINUATION_MARK;
	1326	off = NATIVE_TO_UTF(ptr[klen]);
	1327	}
	1328	else
	1329	{
	1330	/* If char is encoded then swatch is for the prefix */
	1331	needents = (1 << UTF_ACCUMULATION_SHIFT);
	1332	off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
	1333	}
	1334
	1335	/*
	1336	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	1337	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	1338	* it's nothing to sniff at.) Pity we usually come through at least
	1339	* two function calls to get here...
	1340	*
	1341	* NB: this code assumes that swatches are never modified, once generated!
	1342	*/
	1343
	1344	if (hv == PL_last_swash_hv &&
	1345	klen == PL_last_swash_klen &&
	1346	(!klen \|\| memEQ((char )ptr, (char )PL_last_swash_key, klen)) )
	1347	{
	1348	tmps = PL_last_swash_tmps;
	1349	slen = PL_last_swash_slen;
	1350	}
	1351	else {
	1352	/* Try our second-level swatch cache, kept in a hash. */
	1353	SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
	1354
	1355	/* If not cached, generate it via utf8::SWASHGET */
	1356	if (!svp \|\| !SvPOK(svp) \|\| !(tmps = (U8)SvPV(*svp, slen))) {
	1357	dSP;
	1358	/* We use utf8n_to_uvuni() as we want an index into
	1359	Unicode tables, not a native character number.
	1360	*/
	1361	UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
	1362	SV *errsv_save;
	1363	ENTER;
	1364	SAVETMPS;
	1365	save_re_context();
	1366	PUSHSTACKi(PERLSI_MAGIC);
	1367	PUSHMARK(SP);
	1368	EXTEND(SP,3);
	1369	PUSHs((SV*)sv);
	1370	/* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
	1371	PUSHs(sv_2mortal(newSViv((klen) ?
	1372	(code_point & ~(needents - 1)) : 0)));
	1373	PUSHs(sv_2mortal(newSViv(needents)));
	1374	PUTBACK;
	1375	errsv_save = newSVsv(ERRSV);
	1376	if (call_method("SWASHGET", G_SCALAR))
	1377	retval = newSVsv(*PL_stack_sp--);
	1378	else
	1379	retval = &PL_sv_undef;
	1380	if (!SvTRUE(ERRSV))
	1381	sv_setsv(ERRSV, errsv_save);
	1382	SvREFCNT_dec(errsv_save);
	1383	POPSTACK;
	1384	FREETMPS;
	1385	LEAVE;
	1386	if (PL_curcop == &PL_compiling)
	1387	PL_curcop->op_private = PL_hints;
	1388
	1389	svp = hv_store(hv, (char*)ptr, klen, retval, 0);
	1390
	1391	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen)) \|\| (slen << 3) < needents)
	1392	Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
	1393	}
	1394
	1395	PL_last_swash_hv = hv;
	1396	PL_last_swash_klen = klen;
	1397	PL_last_swash_tmps = tmps;
	1398	PL_last_swash_slen = slen;
	1399	if (klen)
	1400	Copy(ptr, PL_last_swash_key, klen, U8);
	1401	}
	1402
	1403	switch ((int)((slen << 3) / needents)) {
	1404	case 1:
	1405	bit = 1 << (off & 7);
	1406	off >>= 3;
	1407	return (tmps[off] & bit) != 0;
	1408	case 8:
	1409	return tmps[off];
	1410	case 16:
	1411	off <<= 1;
	1412	return (tmps[off] << 8) + tmps[off + 1] ;
	1413	case 32:
	1414	off <<= 2;
	1415	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	1416	}
	1417	Perl_croak(aTHX_ "panic: swash_fetch");
	1418	return 0;
	1419	}
	1420
	1421
	1422	/*
	1423	=for apidoc A\|U8 \|uvchr_to_utf8\|U8 d\|UV uv
	1424
	1425	Adds the UTF8 representation of the Native codepoint C<uv> to the end
	1426	of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
	1427	bytes available. The return value is the pointer to the byte after the
	1428	end of the new character. In other words,
	1429
	1430	d = uvchr_to_utf8(d, uv);
	1431
	1432	is the recommended wide native character-aware way of saying
	1433
	1434	*(d++) = uv;
	1435
	1436	=cut
	1437	*/
	1438
	1439	/* On ASCII machines this is normally a macro but we want a
	1440	real function in case XS code wants it
	1441	*/
	1442	#undef Perl_uvchr_to_utf8
	1443	U8 *
	1444	Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
	1445	{
	1446	return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
	1447	}
	1448
	1449
	1450	/*
	1451	=for apidoc A\|UV\|utf8n_to_uvchr\|U8 s\|STRLEN curlen\|STRLEN retlen\|U32 flags
	1452
	1453	Returns the native character value of the first character in the string C<s>
	1454	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
	1455	length, in bytes, of that character.
	1456
	1457	Allows length and flags to be passed to low level routine.
	1458
	1459	=cut
	1460	*/
	1461	/* On ASCII machines this is normally a macro but we want a
	1462	real function in case XS code wants it
	1463	*/
	1464	#undef Perl_utf8n_to_uvchr
	1465	UV
	1466	Perl_utf8n_to_uvchr(pTHX_ U8 s, STRLEN curlen, STRLEN retlen, U32 flags)
	1467	{
	1468	UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
	1469	return UNI_TO_NATIVE(uv);
	1470	}
	1471
	1472