perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* utf8.c
	2	*
	3	* Copyright (c) 1998-2000, Larry Wall
	4	*
	5	* You may distribute under the terms of either the GNU General Public
	6	* License or the Artistic License, as specified in the README file.
	7	*
	8	*/
	9
	10	/*
	11	* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
	12	* heard of that we don't want to see any closer; and that's the one place
	13	* we're trying to get to! And that's just where we can't get, nohow.'
	14	*
	15	* 'Well do I understand your speech,' he answered in the same language;
	16	* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
	17	* as is the custom in the West, if you wish to be answered?'
	18	*
	19	* ...the travellers perceived that the floor was paved with stones of many
	20	* hues; branching runes and strange devices intertwined beneath their feet.
	21	*/
	22
	23	#include "EXTERN.h"
	24	#define PERL_IN_UTF8_C
	25	#include "perl.h"
	26
	27	/* Unicode support */
	28
	29	U8 *
	30	Perl_uv_to_utf8(pTHX_ U8 d, UV uv) / the d must be UTF8_MAXLEN+1 deep */
	31	{
	32	if (uv < 0x80) {
	33	*d++ = uv;
	34	*d = 0;
	35	return d;
	36	}
	37	if (uv < 0x800) {
	38	*d++ = (( uv >> 6) \| 0xc0);
	39	*d++ = (( uv & 0x3f) \| 0x80);
	40	*d = 0;
	41	return d;
	42	}
	43	if (uv < 0x10000) {
	44	*d++ = (( uv >> 12) \| 0xe0);
	45	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	46	*d++ = (( uv & 0x3f) \| 0x80);
	47	*d = 0;
	48	return d;
	49	}
	50	if (uv < 0x200000) {
	51	*d++ = (( uv >> 18) \| 0xf0);
	52	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	53	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	54	*d++ = (( uv & 0x3f) \| 0x80);
	55	*d = 0;
	56	return d;
	57	}
	58	if (uv < 0x4000000) {
	59	*d++ = (( uv >> 24) \| 0xf8);
	60	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	61	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	62	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	63	*d++ = (( uv & 0x3f) \| 0x80);
	64	*d = 0;
	65	return d;
	66	}
	67	if (uv < 0x80000000) {
	68	*d++ = (( uv >> 30) \| 0xfc);
	69	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
	70	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	71	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	72	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	73	*d++ = (( uv & 0x3f) \| 0x80);
	74	*d = 0;
	75	return d;
	76	}
	77	#ifdef HAS_QUAD
	78	if (uv < UTF8_QUAD_MAX)
	79	#endif
	80	{
	81	d++ = 0xfe; / Can't match U+FEFF! */
	82	*d++ = (((uv >> 30) & 0x3f) \| 0x80);
	83	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
	84	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	85	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	86	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	87	*d++ = (( uv & 0x3f) \| 0x80);
	88	*d = 0;
	89	return d;
	90	}
	91	#ifdef HAS_QUAD
	92	{
	93	d++ = 0xff; / Can't match U+FFFE! */
	94	d++ = 0x80; / 6 Reserved bits */
	95	d++ = (((uv >> 60) & 0x0f) \| 0x80); / 2 Reserved bits */
	96	*d++ = (((uv >> 54) & 0x3f) \| 0x80);
	97	*d++ = (((uv >> 48) & 0x3f) \| 0x80);
	98	*d++ = (((uv >> 42) & 0x3f) \| 0x80);
	99	*d++ = (((uv >> 36) & 0x3f) \| 0x80);
	100	*d++ = (((uv >> 30) & 0x3f) \| 0x80);
	101	*d++ = (((uv >> 24) & 0x3f) \| 0x80);
	102	*d++ = (((uv >> 18) & 0x3f) \| 0x80);
	103	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	104	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	105	*d++ = (( uv & 0x3f) \| 0x80);
	106	*d = 0;
	107	return d;
	108	}
	109	#endif
	110	}
	111
	112	/* Tests if some arbitrary number of bytes begins in a valid UTF-8 character.
	113	* The actual number of bytes in the UTF-8 character will be returned if it
	114	* is valid, otherwise 0. */
	115	STRLEN
	116	Perl_is_utf8_char(pTHX_ U8 *s)
	117	{
	118	U8 u = *s;
	119	STRLEN slen, len;
	120	UV uv, ouv;
	121
	122	if (u <= 0x7f)
	123	return 1;
	124
	125	if (u >= 0x80 && u <= 0xbf)
	126	return 0;
	127
	128	len = UTF8SKIP(s);
	129
	130	if (len < 2 \|\| (u >= 0xc0 && u <= 0xfd && s[1] < 0x80))
	131	return 0;
	132
	133	slen = len - 1;
	134	s++;
	135	uv = u;
	136	ouv = uv;
	137	while (slen--) {
	138	if ((*s & 0xc0) != 0x80)
	139	return 0;
	140	uv = UTF8_ACCUMULATE(uv, *s);
	141	if (uv < ouv)
	142	return 0;
	143	ouv = uv;
	144	s++;
	145	}
	146
	147	if (UNISKIP(uv) < len)
	148	return 0;
	149
	150	return len;
	151	}
	152
	153	/*
	154	=for apidoc Am\|is_utf8_string\|U8 *s\|STRLEN len
	155
	156	Returns true if first C<len> bytes of the given string form valid a UTF8
	157	string, false otherwise.
	158
	159	=cut
	160	*/
	161
	162	bool
	163	Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
	164	{
	165	U8* x = s;
	166	U8* send = s + len;
	167	STRLEN c;
	168
	169	while (x < send) {
	170	c = is_utf8_char(x);
	171	if (!c)
	172	return FALSE;
	173	x += c;
	174	if (x > send)
	175	return FALSE;
	176	}
	177
	178	return TRUE;
	179	}
	180
	181	/*
	182	=for apidoc Am\|U8* s\|utf8_to_uv\|STRLEN curlen\|STRLEN *retlen\|U32 flags
	183
	184	Returns the character value of the first character in the string C<s>
	185	which is assumed to be in UTF8 encoding and no longer than C<curlen>;
	186	C<retlen> will be set to the length, in bytes, of that character.
	187
	188	If C<s> does not point to a well-formed UTF8 character, the behaviour
	189	is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
	190	it is assumed that the caller will raise a warning, and this function
	191	will silently just set C<retlen> to C<-1> and return zero. If the
	192	C<flags> does not contain UTF8_CHECK_ONLY, warnings about
	193	malformations will be given, C<retlen> will be set to the expected
	194	length of the UTF-8 character in bytes, and zero will be returned.
	195
	196	The C<flags> can also contain various flags to allow deviations from
	197	the strict UTF-8 encoding (see F<utf8.h>).
	198
	199	=cut */
	200
	201	UV
	202	Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
	203	{
	204	UV uv = *s, ouv;
	205	STRLEN len = 1;
	206	#ifdef EBCDIC
	207	bool dowarn = 0;
	208	#else
	209	bool dowarn = ckWARN_d(WARN_UTF8);
	210	#endif
	211	STRLEN expectlen = 0;
	212
	213	if (curlen == 0) {
	214	if (dowarn)
	215	Perl_warner(aTHX_ WARN_UTF8,
	216	"Malformed UTF-8 character (an empty string)");
	217	goto malformed;
	218	}
	219
	220	if (UTF8_IS_ASCII(uv)) {
	221	if (retlen)
	222	*retlen = 1;
	223	return *s;
	224	}
	225
	226	if (UTF8_IS_CONTINUATION(uv) &&
	227	!(flags & UTF8_ALLOW_CONTINUATION)) {
	228	if (dowarn)
	229	Perl_warner(aTHX_ WARN_UTF8,
	230	"Malformed UTF-8 character (unexpected continuation byte 0x%02"UVxf")",
	231	uv);
	232	goto malformed;
	233	}
	234
	235	if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
	236	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	237	if (dowarn)
	238	Perl_warner(aTHX_ WARN_UTF8,
	239	"Malformed UTF-8 character (unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
	240	(UV)s[1], uv);
	241	goto malformed;
	242	}
	243
	244	if ((uv == 0xfe \|\| uv == 0xff) &&
	245	!(flags & UTF8_ALLOW_FE_FF)) {
	246	if (dowarn)
	247	Perl_warner(aTHX_ WARN_UTF8,
	248	"Malformed UTF-8 character (byte 0x%02"UVxf")",
	249	uv);
	250	goto malformed;
	251	}
	252
	253	if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
	254	else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
	255	else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
	256	else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
	257	else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
	258	else if (!(uv & 0x01)) { len = 7; uv = 0; }
	259	else { len = 13; uv = 0; } /* whoa! */
	260
	261	if (retlen)
	262	*retlen = len;
	263
	264	expectlen = len;
	265
	266	if ((curlen < expectlen) &&
	267	!(flags & UTF8_ALLOW_SHORT)) {
	268	if (dowarn)
	269	Perl_warner(aTHX_ WARN_UTF8,
	270	"Malformed UTF-8 character (%d byte%s, need %d)",
	271	curlen, curlen == 1 ? "" : "s", expectlen);
	272	goto malformed;
	273	}
	274
	275	len--;
	276	s++;
	277	ouv = uv;
	278
	279	while (len--) {
	280	if (!UTF8_IS_CONTINUATION(*s) &&
	281	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
	282	if (dowarn)
	283	Perl_warner(aTHX_ WARN_UTF8,
	284	"Malformed UTF-8 character (unexpected non-continuation byte 0x%02x)",
	285	*s);
	286	goto malformed;
	287	}
	288	else
	289	uv = UTF8_ACCUMULATE(uv, *s);
	290	if (uv < ouv) {
	291	/* This cannot be allowed. */
	292	if (dowarn)
	293	Perl_warner(aTHX_ WARN_UTF8,
	294	"Malformed UTF-8 character (overflow at 0x%"UVxf", byte 0x%02x)",
	295	ouv, *s);
	296	goto malformed;
	297	}
	298	s++;
	299	ouv = uv;
	300	}
	301
	302	if (UNICODE_IS_SURROGATE(uv) &&
	303	!(flags & UTF8_ALLOW_SURROGATE)) {
	304	if (dowarn)
	305	Perl_warner(aTHX_ WARN_UTF8,
	306	"Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")",
	307	uv);
	308	goto malformed;
	309	} else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
	310	!(flags & UTF8_ALLOW_BOM)) {
	311	if (dowarn)
	312	Perl_warner(aTHX_ WARN_UTF8,
	313	"Malformed UTF-8 character (byte order mark 0x%04"UVxf")",
	314	uv);
	315	goto malformed;
	316	} else if ((expectlen > UNISKIP(uv)) &&
	317	!(flags & UTF8_ALLOW_LONG)) {
	318	if (dowarn)
	319	Perl_warner(aTHX_ WARN_UTF8,
	320	"Malformed UTF-8 character (%d byte%s, need %d)",
	321	expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
	322	goto malformed;
	323	} else if (UNICODE_IS_ILLEGAL(uv) &&
	324	!(flags & UTF8_ALLOW_FFFF)) {
	325	if (dowarn)
	326	Perl_warner(aTHX_ WARN_UTF8,
	327	"Malformed UTF-8 character (character 0x%04"UVxf")",
	328	uv);
	329	goto malformed;
	330	}
	331
	332	return uv;
	333
	334	malformed:
	335
	336	if (flags & UTF8_CHECK_ONLY) {
	337	if (retlen)
	338	*retlen = -1;
	339	return 0;
	340	}
	341
	342	if (retlen)
	343	*retlen = expectlen ? expectlen : len;
	344
	345	return 0;
	346	}
	347
	348	/*
	349	=for apidoc Am\|U8* s\|utf8_to_uv_simple\|STRLEN *retlen
	350
	351	Returns the character value of the first character in the string C<s>
	352	which is assumed to be in UTF8 encoding; C<retlen> will be set to the
	353	length, in bytes, of that character.
	354
	355	If C<s> does not point to a well-formed UTF8 character, zero is
	356	returned and retlen is set, if possible, to -1.
	357
	358	=cut
	359	*/
	360
	361	UV
	362	Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen)
	363	{
	364	return Perl_utf8_to_uv(aTHX_ s, UTF8_MAXLEN, retlen, 0);
	365	}
	366
	367	/*
	368	=for apidoc Am\|STRLEN\|utf8_length\|U8* s\|U8 *e
	369
	370	Return the length of the UTF-8 char encoded string C<s> in characters.
	371	Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
	372	up past C<e>, croaks.
	373
	374	=cut
	375	*/
	376
	377	STRLEN
	378	Perl_utf8_length(pTHX_ U8* s, U8* e)
	379	{
	380	STRLEN len = 0;
	381
	382	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	383	* the bitops (especially ~) can create illegal UTF-8.
	384	* In other words: in Perl UTF-8 is not just for Unicode. */
	385
	386	if (e < s)
	387	Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
	388	while (s < e) {
	389	U8 t = UTF8SKIP(s);
	390
	391	if (e - s < t)
	392	Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
	393	s += t;
	394	len++;
	395	}
	396
	397	return len;
	398	}
	399
	400	/*
	401	=for apidoc Am\|IV\|utf8_distance\|U8 a\|U8 b
	402
	403	Returns the number of UTF8 characters between the UTF-8 pointers C<a>
	404	and C<b>.
	405
	406	WARNING: use only if you know that the pointers point inside the
	407	same UTF-8 buffer.
	408
	409	=cut */
	410
	411	IV
	412	Perl_utf8_distance(pTHX_ U8 a, U8 b)
	413	{
	414	IV off = 0;
	415
	416	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
	417	* the bitops (especially ~) can create illegal UTF-8.
	418	* In other words: in Perl UTF-8 is not just for Unicode. */
	419
	420	if (a < b) {
	421	while (a < b) {
	422	U8 c = UTF8SKIP(a);
	423
	424	if (b - a < c)
	425	Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
	426	a += c;
	427	off--;
	428	}
	429	}
	430	else {
	431	while (b < a) {
	432	U8 c = UTF8SKIP(b);
	433
	434	if (a - b < c)
	435	Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
	436	b += c;
	437	off++;
	438	}
	439	}
	440
	441	return off;
	442	}
	443
	444	/*
	445	=for apidoc Am\|U8\|utf8_hop\|U8 s\|I32 off
	446
	447	Return the UTF-8 pointer C<s> displaced by C<off> characters, either
	448	forward or backward.
	449
	450	WARNING: do not use the following unless you know C<off> is within
	451	the UTF-8 data pointed to by C<s> and that on entry C<s> is aligned
	452	on the first byte of character or just after the last byte of a character.
	453
	454	=cut */
	455
	456	U8 *
	457	Perl_utf8_hop(pTHX_ U8 *s, I32 off)
	458	{
	459	/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
	460	* the bitops (especially ~) can create illegal UTF-8.
	461	* In other words: in Perl UTF-8 is not just for Unicode. */
	462
	463	if (off >= 0) {
	464	while (off--)
	465	s += UTF8SKIP(s);
	466	}
	467	else {
	468	while (off++) {
	469	s--;
	470	while (UTF8_IS_CONTINUATION(*s))
	471	s--;
	472	}
	473	}
	474	return s;
	475	}
	476
	477	/*
	478	=for apidoc Am\|U8 \|utf8_to_bytes\|U8 s\|STRLEN *len
	479
	480	Converts a string C<s> of length C<len> from UTF8 into byte encoding.
	481	Unlike C<bytes_to_utf8>, this over-writes the original string, and
	482	updates len to contain the new length.
	483	Returns zero on failure, setting C<len> to -1.
	484
	485	=cut
	486	*/
	487
	488	U8 *
	489	Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
	490	{
	491	U8 *send;
	492	U8 *d;
	493	U8 *save = s;
	494
	495	/* ensure valid UTF8 and chars < 256 before updating string */
	496	for (send = s + *len; s < send; ) {
	497	U8 c = *s++;
	498
	499	if (c >= 0x80 &&
	500	((s >= send) \|\|
	501	((*s++ & 0xc0) != 0x80) \|\| ((c & 0xfe) != 0xc2))) {
	502	*len = -1;
	503	return 0;
	504	}
	505	}
	506
	507	d = s = save;
	508	while (s < send) {
	509	STRLEN ulen;
	510	*d++ = (U8)utf8_to_uv_simple(s, &ulen);
	511	s += ulen;
	512	}
	513	*d = '\0';
	514	*len = d - save;
	515	return save;
	516	}
	517
	518	/*
	519	=for apidoc Am\|U8 \|bytes_to_utf8\|U8 s\|STRLEN *len
	520
	521	Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
	522	Returns a pointer to the newly-created string, and sets C<len> to
	523	reflect the new length.
	524
	525	=cut
	526	*/
	527
	528	U8*
	529	Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
	530	{
	531	U8 *send;
	532	U8 *d;
	533	U8 *dst;
	534	send = s + (*len);
	535
	536	Newz(801, d, (len) 2 + 1, U8);
	537	dst = d;
	538
	539	while (s < send) {
	540	if (*s < 0x80)
	541	d++ = s++;
	542	else {
	543	UV uv = *s++;
	544	*d++ = (( uv >> 6) \| 0xc0);
	545	*d++ = (( uv & 0x3f) \| 0x80);
	546	}
	547	}
	548	*d = '\0';
	549	*len = d-dst;
	550	return dst;
	551	}
	552
	553	/*
	554	* Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
	555	*
	556	* Destination must be pre-extended to 3/2 source. Do not use in-place.
	557	* We optimize for native, for obvious reasons. */
	558
	559	U8*
	560	Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	561	{
	562	U8* pend;
	563	U8* dstart = d;
	564
	565	if (bytelen & 1)
	566	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
	567
	568	pend = p + bytelen;
	569
	570	while (p < pend) {
	571	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
	572	p += 2;
	573	if (uv < 0x80) {
	574	*d++ = uv;
	575	continue;
	576	}
	577	if (uv < 0x800) {
	578	*d++ = (( uv >> 6) \| 0xc0);
	579	*d++ = (( uv & 0x3f) \| 0x80);
	580	continue;
	581	}
	582	if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
	583	UV low = *p++;
	584	if (low < 0xdc00 \|\| low >= 0xdfff)
	585	Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
	586	uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
	587	}
	588	if (uv < 0x10000) {
	589	*d++ = (( uv >> 12) \| 0xe0);
	590	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	591	*d++ = (( uv & 0x3f) \| 0x80);
	592	continue;
	593	}
	594	else {
	595	*d++ = (( uv >> 18) \| 0xf0);
	596	*d++ = (((uv >> 12) & 0x3f) \| 0x80);
	597	*d++ = (((uv >> 6) & 0x3f) \| 0x80);
	598	*d++ = (( uv & 0x3f) \| 0x80);
	599	continue;
	600	}
	601	}
	602	*newlen = d - dstart;
	603	return d;
	604	}
	605
	606	/* Note: this one is slightly destructive of the source. */
	607
	608	U8*
	609	Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
	610	{
	611	U8* s = (U8*)p;
	612	U8* send = s + bytelen;
	613	while (s < send) {
	614	U8 tmp = s[0];
	615	s[0] = s[1];
	616	s[1] = tmp;
	617	s += 2;
	618	}
	619	return utf16_to_utf8(p, d, bytelen, newlen);
	620	}
	621
	622	/* for now these are all defined (inefficiently) in terms of the utf8 versions */
	623
	624	bool
	625	Perl_is_uni_alnum(pTHX_ U32 c)
	626	{
	627	U8 tmpbuf[UTF8_MAXLEN+1];
	628	uv_to_utf8(tmpbuf, (UV)c);
	629	return is_utf8_alnum(tmpbuf);
	630	}
	631
	632	bool
	633	Perl_is_uni_alnumc(pTHX_ U32 c)
	634	{
	635	U8 tmpbuf[UTF8_MAXLEN+1];
	636	uv_to_utf8(tmpbuf, (UV)c);
	637	return is_utf8_alnumc(tmpbuf);
	638	}
	639
	640	bool
	641	Perl_is_uni_idfirst(pTHX_ U32 c)
	642	{
	643	U8 tmpbuf[UTF8_MAXLEN+1];
	644	uv_to_utf8(tmpbuf, (UV)c);
	645	return is_utf8_idfirst(tmpbuf);
	646	}
	647
	648	bool
	649	Perl_is_uni_alpha(pTHX_ U32 c)
	650	{
	651	U8 tmpbuf[UTF8_MAXLEN+1];
	652	uv_to_utf8(tmpbuf, (UV)c);
	653	return is_utf8_alpha(tmpbuf);
	654	}
	655
	656	bool
	657	Perl_is_uni_ascii(pTHX_ U32 c)
	658	{
	659	U8 tmpbuf[UTF8_MAXLEN+1];
	660	uv_to_utf8(tmpbuf, (UV)c);
	661	return is_utf8_ascii(tmpbuf);
	662	}
	663
	664	bool
	665	Perl_is_uni_space(pTHX_ U32 c)
	666	{
	667	U8 tmpbuf[UTF8_MAXLEN+1];
	668	uv_to_utf8(tmpbuf, (UV)c);
	669	return is_utf8_space(tmpbuf);
	670	}
	671
	672	bool
	673	Perl_is_uni_digit(pTHX_ U32 c)
	674	{
	675	U8 tmpbuf[UTF8_MAXLEN+1];
	676	uv_to_utf8(tmpbuf, (UV)c);
	677	return is_utf8_digit(tmpbuf);
	678	}
	679
	680	bool
	681	Perl_is_uni_upper(pTHX_ U32 c)
	682	{
	683	U8 tmpbuf[UTF8_MAXLEN+1];
	684	uv_to_utf8(tmpbuf, (UV)c);
	685	return is_utf8_upper(tmpbuf);
	686	}
	687
	688	bool
	689	Perl_is_uni_lower(pTHX_ U32 c)
	690	{
	691	U8 tmpbuf[UTF8_MAXLEN+1];
	692	uv_to_utf8(tmpbuf, (UV)c);
	693	return is_utf8_lower(tmpbuf);
	694	}
	695
	696	bool
	697	Perl_is_uni_cntrl(pTHX_ U32 c)
	698	{
	699	U8 tmpbuf[UTF8_MAXLEN+1];
	700	uv_to_utf8(tmpbuf, (UV)c);
	701	return is_utf8_cntrl(tmpbuf);
	702	}
	703
	704	bool
	705	Perl_is_uni_graph(pTHX_ U32 c)
	706	{
	707	U8 tmpbuf[UTF8_MAXLEN+1];
	708	uv_to_utf8(tmpbuf, (UV)c);
	709	return is_utf8_graph(tmpbuf);
	710	}
	711
	712	bool
	713	Perl_is_uni_print(pTHX_ U32 c)
	714	{
	715	U8 tmpbuf[UTF8_MAXLEN+1];
	716	uv_to_utf8(tmpbuf, (UV)c);
	717	return is_utf8_print(tmpbuf);
	718	}
	719
	720	bool
	721	Perl_is_uni_punct(pTHX_ U32 c)
	722	{
	723	U8 tmpbuf[UTF8_MAXLEN+1];
	724	uv_to_utf8(tmpbuf, (UV)c);
	725	return is_utf8_punct(tmpbuf);
	726	}
	727
	728	bool
	729	Perl_is_uni_xdigit(pTHX_ U32 c)
	730	{
	731	U8 tmpbuf[UTF8_MAXLEN+1];
	732	uv_to_utf8(tmpbuf, (UV)c);
	733	return is_utf8_xdigit(tmpbuf);
	734	}
	735
	736	U32
	737	Perl_to_uni_upper(pTHX_ U32 c)
	738	{
	739	U8 tmpbuf[UTF8_MAXLEN+1];
	740	uv_to_utf8(tmpbuf, (UV)c);
	741	return to_utf8_upper(tmpbuf);
	742	}
	743
	744	U32
	745	Perl_to_uni_title(pTHX_ U32 c)
	746	{
	747	U8 tmpbuf[UTF8_MAXLEN+1];
	748	uv_to_utf8(tmpbuf, (UV)c);
	749	return to_utf8_title(tmpbuf);
	750	}
	751
	752	U32
	753	Perl_to_uni_lower(pTHX_ U32 c)
	754	{
	755	U8 tmpbuf[UTF8_MAXLEN+1];
	756	uv_to_utf8(tmpbuf, (UV)c);
	757	return to_utf8_lower(tmpbuf);
	758	}
	759
	760	/* for now these all assume no locale info available for Unicode > 255 */
	761
	762	bool
	763	Perl_is_uni_alnum_lc(pTHX_ U32 c)
	764	{
	765	return is_uni_alnum(c); /* XXX no locale support yet */
	766	}
	767
	768	bool
	769	Perl_is_uni_alnumc_lc(pTHX_ U32 c)
	770	{
	771	return is_uni_alnumc(c); /* XXX no locale support yet */
	772	}
	773
	774	bool
	775	Perl_is_uni_idfirst_lc(pTHX_ U32 c)
	776	{
	777	return is_uni_idfirst(c); /* XXX no locale support yet */
	778	}
	779
	780	bool
	781	Perl_is_uni_alpha_lc(pTHX_ U32 c)
	782	{
	783	return is_uni_alpha(c); /* XXX no locale support yet */
	784	}
	785
	786	bool
	787	Perl_is_uni_ascii_lc(pTHX_ U32 c)
	788	{
	789	return is_uni_ascii(c); /* XXX no locale support yet */
	790	}
	791
	792	bool
	793	Perl_is_uni_space_lc(pTHX_ U32 c)
	794	{
	795	return is_uni_space(c); /* XXX no locale support yet */
	796	}
	797
	798	bool
	799	Perl_is_uni_digit_lc(pTHX_ U32 c)
	800	{
	801	return is_uni_digit(c); /* XXX no locale support yet */
	802	}
	803
	804	bool
	805	Perl_is_uni_upper_lc(pTHX_ U32 c)
	806	{
	807	return is_uni_upper(c); /* XXX no locale support yet */
	808	}
	809
	810	bool
	811	Perl_is_uni_lower_lc(pTHX_ U32 c)
	812	{
	813	return is_uni_lower(c); /* XXX no locale support yet */
	814	}
	815
	816	bool
	817	Perl_is_uni_cntrl_lc(pTHX_ U32 c)
	818	{
	819	return is_uni_cntrl(c); /* XXX no locale support yet */
	820	}
	821
	822	bool
	823	Perl_is_uni_graph_lc(pTHX_ U32 c)
	824	{
	825	return is_uni_graph(c); /* XXX no locale support yet */
	826	}
	827
	828	bool
	829	Perl_is_uni_print_lc(pTHX_ U32 c)
	830	{
	831	return is_uni_print(c); /* XXX no locale support yet */
	832	}
	833
	834	bool
	835	Perl_is_uni_punct_lc(pTHX_ U32 c)
	836	{
	837	return is_uni_punct(c); /* XXX no locale support yet */
	838	}
	839
	840	bool
	841	Perl_is_uni_xdigit_lc(pTHX_ U32 c)
	842	{
	843	return is_uni_xdigit(c); /* XXX no locale support yet */
	844	}
	845
	846	U32
	847	Perl_to_uni_upper_lc(pTHX_ U32 c)
	848	{
	849	return to_uni_upper(c); /* XXX no locale support yet */
	850	}
	851
	852	U32
	853	Perl_to_uni_title_lc(pTHX_ U32 c)
	854	{
	855	return to_uni_title(c); /* XXX no locale support yet */
	856	}
	857
	858	U32
	859	Perl_to_uni_lower_lc(pTHX_ U32 c)
	860	{
	861	return to_uni_lower(c); /* XXX no locale support yet */
	862	}
	863
	864	bool
	865	Perl_is_utf8_alnum(pTHX_ U8 *p)
	866	{
	867	if (!is_utf8_char(p))
	868	return FALSE;
	869	if (!PL_utf8_alnum)
	870	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
	871	* descendant of isalnum(3), in other words, it doesn't
	872	* contain the '_'. --jhi */
	873	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
	874	return swash_fetch(PL_utf8_alnum, p);
	875	/* return p == '_' \|\| is_utf8_alpha(p) \|\| is_utf8_digit(p); /
	876	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	877	if (!PL_utf8_alnum)
	878	PL_utf8_alnum = swash_init("utf8", "",
	879	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	880	return swash_fetch(PL_utf8_alnum, p);
	881	#endif
	882	}
	883
	884	bool
	885	Perl_is_utf8_alnumc(pTHX_ U8 *p)
	886	{
	887	if (!is_utf8_char(p))
	888	return FALSE;
	889	if (!PL_utf8_alnum)
	890	PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
	891	return swash_fetch(PL_utf8_alnum, p);
	892	/* return is_utf8_alpha(p) \|\| is_utf8_digit(p); */
	893	#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
	894	if (!PL_utf8_alnum)
	895	PL_utf8_alnum = swash_init("utf8", "",
	896	sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
	897	return swash_fetch(PL_utf8_alnum, p);
	898	#endif
	899	}
	900
	901	bool
	902	Perl_is_utf8_idfirst(pTHX_ U8 *p)
	903	{
	904	return *p == '_' \|\| is_utf8_alpha(p);
	905	}
	906
	907	bool
	908	Perl_is_utf8_alpha(pTHX_ U8 *p)
	909	{
	910	if (!is_utf8_char(p))
	911	return FALSE;
	912	if (!PL_utf8_alpha)
	913	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
	914	return swash_fetch(PL_utf8_alpha, p);
	915	}
	916
	917	bool
	918	Perl_is_utf8_ascii(pTHX_ U8 *p)
	919	{
	920	if (!is_utf8_char(p))
	921	return FALSE;
	922	if (!PL_utf8_ascii)
	923	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
	924	return swash_fetch(PL_utf8_ascii, p);
	925	}
	926
	927	bool
	928	Perl_is_utf8_space(pTHX_ U8 *p)
	929	{
	930	if (!is_utf8_char(p))
	931	return FALSE;
	932	if (!PL_utf8_space)
	933	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
	934	return swash_fetch(PL_utf8_space, p);
	935	}
	936
	937	bool
	938	Perl_is_utf8_digit(pTHX_ U8 *p)
	939	{
	940	if (!is_utf8_char(p))
	941	return FALSE;
	942	if (!PL_utf8_digit)
	943	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
	944	return swash_fetch(PL_utf8_digit, p);
	945	}
	946
	947	bool
	948	Perl_is_utf8_upper(pTHX_ U8 *p)
	949	{
	950	if (!is_utf8_char(p))
	951	return FALSE;
	952	if (!PL_utf8_upper)
	953	PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
	954	return swash_fetch(PL_utf8_upper, p);
	955	}
	956
	957	bool
	958	Perl_is_utf8_lower(pTHX_ U8 *p)
	959	{
	960	if (!is_utf8_char(p))
	961	return FALSE;
	962	if (!PL_utf8_lower)
	963	PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
	964	return swash_fetch(PL_utf8_lower, p);
	965	}
	966
	967	bool
	968	Perl_is_utf8_cntrl(pTHX_ U8 *p)
	969	{
	970	if (!is_utf8_char(p))
	971	return FALSE;
	972	if (!PL_utf8_cntrl)
	973	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
	974	return swash_fetch(PL_utf8_cntrl, p);
	975	}
	976
	977	bool
	978	Perl_is_utf8_graph(pTHX_ U8 *p)
	979	{
	980	if (!is_utf8_char(p))
	981	return FALSE;
	982	if (!PL_utf8_graph)
	983	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
	984	return swash_fetch(PL_utf8_graph, p);
	985	}
	986
	987	bool
	988	Perl_is_utf8_print(pTHX_ U8 *p)
	989	{
	990	if (!is_utf8_char(p))
	991	return FALSE;
	992	if (!PL_utf8_print)
	993	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
	994	return swash_fetch(PL_utf8_print, p);
	995	}
	996
	997	bool
	998	Perl_is_utf8_punct(pTHX_ U8 *p)
	999	{
	1000	if (!is_utf8_char(p))
	1001	return FALSE;
	1002	if (!PL_utf8_punct)
	1003	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
	1004	return swash_fetch(PL_utf8_punct, p);
	1005	}
	1006
	1007	bool
	1008	Perl_is_utf8_xdigit(pTHX_ U8 *p)
	1009	{
	1010	if (!is_utf8_char(p))
	1011	return FALSE;
	1012	if (!PL_utf8_xdigit)
	1013	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
	1014	return swash_fetch(PL_utf8_xdigit, p);
	1015	}
	1016
	1017	bool
	1018	Perl_is_utf8_mark(pTHX_ U8 *p)
	1019	{
	1020	if (!is_utf8_char(p))
	1021	return FALSE;
	1022	if (!PL_utf8_mark)
	1023	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
	1024	return swash_fetch(PL_utf8_mark, p);
	1025	}
	1026
	1027	UV
	1028	Perl_to_utf8_upper(pTHX_ U8 *p)
	1029	{
	1030	UV uv;
	1031
	1032	if (!PL_utf8_toupper)
	1033	PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
	1034	uv = swash_fetch(PL_utf8_toupper, p);
	1035	return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
	1036	}
	1037
	1038	UV
	1039	Perl_to_utf8_title(pTHX_ U8 *p)
	1040	{
	1041	UV uv;
	1042
	1043	if (!PL_utf8_totitle)
	1044	PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
	1045	uv = swash_fetch(PL_utf8_totitle, p);
	1046	return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
	1047	}
	1048
	1049	UV
	1050	Perl_to_utf8_lower(pTHX_ U8 *p)
	1051	{
	1052	UV uv;
	1053
	1054	if (!PL_utf8_tolower)
	1055	PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
	1056	uv = swash_fetch(PL_utf8_tolower, p);
	1057	return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0);
	1058	}
	1059
	1060	/* a "swash" is a swatch hash */
	1061
	1062	SV*
	1063	Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
	1064	{
	1065	SV* retval;
	1066	char tmpbuf[256];
	1067	dSP;
	1068
	1069	if (!gv_stashpv(pkg, 0)) { /* demand load utf8 */
	1070	ENTER;
	1071	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
	1072	LEAVE;
	1073	}
	1074	SPAGAIN;
	1075	PUSHSTACKi(PERLSI_MAGIC);
	1076	PUSHMARK(SP);
	1077	EXTEND(SP,5);
	1078	PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
	1079	PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
	1080	PUSHs(listsv);
	1081	PUSHs(sv_2mortal(newSViv(minbits)));
	1082	PUSHs(sv_2mortal(newSViv(none)));
	1083	PUTBACK;
	1084	ENTER;
	1085	SAVEI32(PL_hints);
	1086	PL_hints = 0;
	1087	save_re_context();
	1088	if (PL_curcop == &PL_compiling) /* XXX ought to be handled by lex_start */
	1089	strncpy(tmpbuf, PL_tokenbuf, sizeof tmpbuf);
	1090	if (call_method("SWASHNEW", G_SCALAR))
	1091	retval = newSVsv(*PL_stack_sp--);
	1092	else
	1093	retval = &PL_sv_undef;
	1094	LEAVE;
	1095	POPSTACK;
	1096	if (PL_curcop == &PL_compiling) {
	1097	strncpy(PL_tokenbuf, tmpbuf, sizeof tmpbuf);
	1098	PL_curcop->op_private = PL_hints;
	1099	}
	1100	if (!SvROK(retval) \|\| SvTYPE(SvRV(retval)) != SVt_PVHV)
	1101	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
	1102	return retval;
	1103	}
	1104
	1105	UV
	1106	Perl_swash_fetch(pTHX_ SV sv, U8 ptr)
	1107	{
	1108	HV* hv = (HV*)SvRV(sv);
	1109	U32 klen = UTF8SKIP(ptr) - 1;
	1110	U32 off = ptr[klen] & 127; /* NB: 64 bit always 0 when len > 1 */
	1111	STRLEN slen;
	1112	STRLEN needents = (klen ? 64 : 128);
	1113	U8 *tmps;
	1114	U32 bit;
	1115	SV *retval;
	1116
	1117	/*
	1118	* This single-entry cache saves about 1/3 of the utf8 overhead in test
	1119	* suite. (That is, only 7-8% overall over just a hash cache. Still,
	1120	* it's nothing to sniff at.) Pity we usually come through at least
	1121	* two function calls to get here...
	1122	*
	1123	* NB: this code assumes that swatches are never modified, once generated!
	1124	*/
	1125
	1126	if (hv == PL_last_swash_hv &&
	1127	klen == PL_last_swash_klen &&
	1128	(!klen \|\| memEQ((char )ptr,(char )PL_last_swash_key,klen)) )
	1129	{
	1130	tmps = PL_last_swash_tmps;
	1131	slen = PL_last_swash_slen;
	1132	}
	1133	else {
	1134	/* Try our second-level swatch cache, kept in a hash. */
	1135	SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
	1136
	1137	/* If not cached, generate it via utf8::SWASHGET */
	1138	if (!svp \|\| !SvPOK(svp) \|\| !(tmps = (U8)SvPV(*svp, slen))) {
	1139	dSP;
	1140	ENTER;
	1141	SAVETMPS;
	1142	save_re_context();
	1143	PUSHSTACKi(PERLSI_MAGIC);
	1144	PUSHMARK(SP);
	1145	EXTEND(SP,3);
	1146	PUSHs((SV*)sv);
	1147	PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, UTF8_MAXLEN, 0, 0) & ~(needents - 1))));
	1148	PUSHs(sv_2mortal(newSViv(needents)));
	1149	PUTBACK;
	1150	if (call_method("SWASHGET", G_SCALAR))
	1151	retval = newSVsv(*PL_stack_sp--);
	1152	else
	1153	retval = &PL_sv_undef;
	1154	POPSTACK;
	1155	FREETMPS;
	1156	LEAVE;
	1157	if (PL_curcop == &PL_compiling)
	1158	PL_curcop->op_private = PL_hints;
	1159
	1160	svp = hv_store(hv, (char*)ptr, klen, retval, 0);
	1161
	1162	if (!svp \|\| !(tmps = (U8)SvPV(svp, slen)) \|\| slen < 8)
	1163	Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
	1164	}
	1165
	1166	PL_last_swash_hv = hv;
	1167	PL_last_swash_klen = klen;
	1168	PL_last_swash_tmps = tmps;
	1169	PL_last_swash_slen = slen;
	1170	if (klen)
	1171	Copy(ptr, PL_last_swash_key, klen, U8);
	1172	}
	1173
	1174	switch ((int)((slen << 3) / needents)) {
	1175	case 1:
	1176	bit = 1 << (off & 7);
	1177	off >>= 3;
	1178	return (tmps[off] & bit) != 0;
	1179	case 8:
	1180	return tmps[off];
	1181	case 16:
	1182	off <<= 1;
	1183	return (tmps[off] << 8) + tmps[off + 1] ;
	1184	case 32:
	1185	off <<= 2;
	1186	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
	1187	}
	1188	Perl_croak(aTHX_ "panic: swash_fetch");
	1189	return 0;
	1190	}