This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Prefer !UTF8_IS_INVARIANT() over UTF8_IS_CONTINUED() when that
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
bc89e66f 3 * Copyright (c) 1998-2001, Larry Wall
a0ed51b3
LW
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3
LW
25#include "perl.h"
26
27/* Unicode support */
28
eebe1485 29/*
9041c2e3 30=for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
eebe1485
SC
31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
9041c2e3 35end of the new character. In other words,
eebe1485 36
9041c2e3 37 d = uvuni_to_utf8(d, uv);
eebe1485
SC
38
39is the recommended Unicode-aware way of saying
40
41 *(d++) = uv;
42
43=cut
44*/
45
dfe13c55 46U8 *
9041c2e3 47Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
a0ed51b3 48{
1d72bdf6 49 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
50 *d++ = uv;
51 return d;
52 }
1d72bdf6
NIS
53#if defined(EBCDIC) || 1 /* always for testing */
54 else {
55 STRLEN len = UNISKIP(uv);
56 U8 *p = d+len-1;
57 while (p > d) {
58 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
59 uv >>= UTF_ACCUMULATION_SHIFT;
60 }
61 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
62 return d+len;
63 }
64#else /* Non loop style */
a0ed51b3
LW
65 if (uv < 0x800) {
66 *d++ = (( uv >> 6) | 0xc0);
67 *d++ = (( uv & 0x3f) | 0x80);
68 return d;
69 }
70 if (uv < 0x10000) {
71 *d++ = (( uv >> 12) | 0xe0);
72 *d++ = (((uv >> 6) & 0x3f) | 0x80);
73 *d++ = (( uv & 0x3f) | 0x80);
74 return d;
75 }
76 if (uv < 0x200000) {
77 *d++ = (( uv >> 18) | 0xf0);
78 *d++ = (((uv >> 12) & 0x3f) | 0x80);
79 *d++ = (((uv >> 6) & 0x3f) | 0x80);
80 *d++ = (( uv & 0x3f) | 0x80);
81 return d;
82 }
83 if (uv < 0x4000000) {
84 *d++ = (( uv >> 24) | 0xf8);
85 *d++ = (((uv >> 18) & 0x3f) | 0x80);
86 *d++ = (((uv >> 12) & 0x3f) | 0x80);
87 *d++ = (((uv >> 6) & 0x3f) | 0x80);
88 *d++ = (( uv & 0x3f) | 0x80);
89 return d;
90 }
91 if (uv < 0x80000000) {
92 *d++ = (( uv >> 30) | 0xfc);
93 *d++ = (((uv >> 24) & 0x3f) | 0x80);
94 *d++ = (((uv >> 18) & 0x3f) | 0x80);
95 *d++ = (((uv >> 12) & 0x3f) | 0x80);
96 *d++ = (((uv >> 6) & 0x3f) | 0x80);
97 *d++ = (( uv & 0x3f) | 0x80);
98 return d;
99 }
6b8eaf93 100#ifdef HAS_QUAD
d7578b48 101 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
102#endif
103 {
104 *d++ = 0xfe; /* Can't match U+FEFF! */
105 *d++ = (((uv >> 30) & 0x3f) | 0x80);
106 *d++ = (((uv >> 24) & 0x3f) | 0x80);
107 *d++ = (((uv >> 18) & 0x3f) | 0x80);
108 *d++ = (((uv >> 12) & 0x3f) | 0x80);
109 *d++ = (((uv >> 6) & 0x3f) | 0x80);
110 *d++ = (( uv & 0x3f) | 0x80);
111 return d;
112 }
6b8eaf93 113#ifdef HAS_QUAD
a0ed51b3
LW
114 {
115 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b
GS
116 *d++ = 0x80; /* 6 Reserved bits */
117 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
118 *d++ = (((uv >> 54) & 0x3f) | 0x80);
119 *d++ = (((uv >> 48) & 0x3f) | 0x80);
120 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3
LW
121 *d++ = (((uv >> 36) & 0x3f) | 0x80);
122 *d++ = (((uv >> 30) & 0x3f) | 0x80);
123 *d++ = (((uv >> 24) & 0x3f) | 0x80);
124 *d++ = (((uv >> 18) & 0x3f) | 0x80);
125 *d++ = (((uv >> 12) & 0x3f) | 0x80);
126 *d++ = (((uv >> 6) & 0x3f) | 0x80);
127 *d++ = (( uv & 0x3f) | 0x80);
128 return d;
129 }
130#endif
1d72bdf6 131#endif /* Loop style */
a0ed51b3
LW
132}
133
eebe1485 134/*
9041c2e3
NIS
135=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
136
137Adds the UTF8 representation of the Native codepoint C<uv> to the end
138of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
139bytes available. The return value is the pointer to the byte after the
140end of the new character. In other words,
141
142 d = uvchr_to_utf8(d, uv);
143
144is the recommended wide native character-aware way of saying
145
146 *(d++) = uv;
147
148=cut
149*/
150
151U8 *
152Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
153{
154 if (uv < 0x100)
155 uv = NATIVE_TO_ASCII(uv);
156 return Perl_uvuni_to_utf8(aTHX_ d, uv);
157}
158
159
160/*
eebe1485
SC
161=for apidoc A|STRLEN|is_utf8_char|U8 *s
162
5da9da9e 163Tests if some arbitrary number of bytes begins in a valid UTF-8
1d72bdf6 164character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
5da9da9e
JH
165The actual number of bytes in the UTF-8 character will be returned if
166it is valid, otherwise 0.
9041c2e3 167
5da9da9e 168=cut */
067a85ef 169STRLEN
386d01d6
GS
170Perl_is_utf8_char(pTHX_ U8 *s)
171{
172 U8 u = *s;
067a85ef
A
173 STRLEN slen, len;
174 UV uv, ouv;
386d01d6 175
1d72bdf6 176 if (UTF8_IS_INVARIANT(u))
386d01d6
GS
177 return 1;
178
60006e79 179 if (!UTF8_IS_START(u))
386d01d6
GS
180 return 0;
181
9f07fdcd 182 len = UTF8SKIP(s);
386d01d6 183
60006e79 184 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef
A
185 return 0;
186
386d01d6
GS
187 slen = len - 1;
188 s++;
1d72bdf6
NIS
189 /* The initial value is dubious */
190 uv = u;
067a85ef 191 ouv = uv;
386d01d6 192 while (slen--) {
60006e79 193 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 194 return 0;
8850bf83 195 uv = UTF8_ACCUMULATE(uv, *s);
067a85ef
A
196 if (uv < ouv)
197 return 0;
198 ouv = uv;
386d01d6
GS
199 s++;
200 }
067a85ef 201
5bbb0b5a 202 if (UNISKIP(uv) < len)
067a85ef
A
203 return 0;
204
386d01d6
GS
205 return len;
206}
207
6662521e 208/*
eebe1485 209=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 210
5da9da9e
JH
211Returns true if first C<len> bytes of the given string form a valid UTF8
212string, false otherwise. Note that 'a valid UTF8 string' does not mean
213'a string that contains UTF8' because a valid ASCII string is a valid
214UTF8 string.
6662521e
GS
215
216=cut
217*/
218
8e84507e 219bool
6662521e
GS
220Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
221{
067a85ef 222 U8* x = s;
1aa99e6b 223 U8* send;
067a85ef
A
224 STRLEN c;
225
1aa99e6b 226 if (!len)
6cd5fe39 227 len = strlen((char *)s);
1aa99e6b
IH
228 send = s + len;
229
6662521e
GS
230 while (x < send) {
231 c = is_utf8_char(x);
067a85ef
A
232 if (!c)
233 return FALSE;
6662521e 234 x += c;
6662521e 235 }
60006e79
JH
236 if (x != send)
237 return FALSE;
067a85ef
A
238
239 return TRUE;
6662521e
GS
240}
241
67e989fb 242/*
9041c2e3 243=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 244
9041c2e3
NIS
245Bottom level UTF-8 decode routine.
246Returns the unicode code point value of the first character in the string C<s>
ba210ebe 247which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 248C<retlen> will be set to the length, in bytes, of that character.
67e989fb
JH
249
250If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880
JH
251is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
252it is assumed that the caller will raise a warning, and this function
28d3d195
JH
253will silently just set C<retlen> to C<-1> and return zero. If the
254C<flags> does not contain UTF8_CHECK_ONLY, warnings about
255malformations will be given, C<retlen> will be set to the expected
256length of the UTF-8 character in bytes, and zero will be returned.
257
258The C<flags> can also contain various flags to allow deviations from
259the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 260
9041c2e3
NIS
261Most code should use utf8_to_uvchr() rather than call this directly.
262
dcad2880 263=cut */
67e989fb 264
a0ed51b3 265UV
9041c2e3 266Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
a0ed51b3 267{
ba210ebe
JH
268 UV uv = *s, ouv;
269 STRLEN len = 1;
270 bool dowarn = ckWARN_d(WARN_UTF8);
271 STRLEN expectlen = 0;
a0dbb045
JH
272 U32 warning = 0;
273
274/* This list is a superset of the UTF8_ALLOW_XXX. */
275
276#define UTF8_WARN_EMPTY 1
277#define UTF8_WARN_CONTINUATION 2
278#define UTF8_WARN_NON_CONTINUATION 3
279#define UTF8_WARN_FE_FF 4
280#define UTF8_WARN_SHORT 5
281#define UTF8_WARN_OVERFLOW 6
282#define UTF8_WARN_SURROGATE 7
283#define UTF8_WARN_BOM 8
284#define UTF8_WARN_LONG 9
285#define UTF8_WARN_FFFF 10
286
287 if (curlen == 0 &&
288 !(flags & UTF8_ALLOW_EMPTY)) {
289 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
290 goto malformed;
291 }
292
1d72bdf6 293 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
294 if (retlen)
295 *retlen = 1;
9041c2e3 296 return (UV) (*s);
a0ed51b3 297 }
67e989fb 298
421a8bf2 299 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 300 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 301 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
302 goto malformed;
303 }
304
421a8bf2 305 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 306 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 307 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
308 goto malformed;
309 }
9041c2e3 310
1d72bdf6 311#ifdef EBCDIC
75383841 312 uv = NATIVE_TO_UTF(uv);
1d72bdf6 313#else
fcc8fcf6
JH
314 if ((uv == 0xfe || uv == 0xff) &&
315 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 316 warning = UTF8_WARN_FE_FF;
ba210ebe 317 goto malformed;
a0ed51b3 318 }
1d72bdf6
NIS
319#endif
320
ba210ebe
JH
321 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
322 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
323 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
324 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
325#ifdef EBCDIC
326 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
327 else { len = 7; uv &= 0x01; }
328#else
ba210ebe
JH
329 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
330 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
331 else { len = 13; uv = 0; } /* whoa! */
332#endif
333
a0ed51b3
LW
334 if (retlen)
335 *retlen = len;
9041c2e3 336
ba210ebe
JH
337 expectlen = len;
338
fcc8fcf6
JH
339 if ((curlen < expectlen) &&
340 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 341 warning = UTF8_WARN_SHORT;
ba210ebe
JH
342 goto malformed;
343 }
344
345 len--;
a0ed51b3 346 s++;
ba210ebe
JH
347 ouv = uv;
348
a0ed51b3 349 while (len--) {
421a8bf2
JH
350 if (!UTF8_IS_CONTINUATION(*s) &&
351 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
352 s--;
353 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 354 goto malformed;
a0ed51b3
LW
355 }
356 else
8850bf83 357 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
358 if (!(uv > ouv)) {
359 /* These cannot be allowed. */
360 if (uv == ouv) {
361 if (!(flags & UTF8_ALLOW_LONG)) {
362 warning = UTF8_WARN_LONG;
363 goto malformed;
364 }
365 }
366 else { /* uv < ouv */
367 /* This cannot be allowed. */
368 warning = UTF8_WARN_OVERFLOW;
369 goto malformed;
370 }
ba210ebe
JH
371 }
372 s++;
373 ouv = uv;
374 }
375
421a8bf2 376 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 377 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 378 warning = UTF8_WARN_SURROGATE;
ba210ebe 379 goto malformed;
421a8bf2 380 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 381 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 382 warning = UTF8_WARN_BOM;
ba210ebe 383 goto malformed;
fcc8fcf6
JH
384 } else if ((expectlen > UNISKIP(uv)) &&
385 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 386 warning = UTF8_WARN_LONG;
ba210ebe 387 goto malformed;
421a8bf2 388 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 389 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 390 warning = UTF8_WARN_FFFF;
a9917092 391 goto malformed;
a0ed51b3 392 }
ba210ebe 393
a0ed51b3 394 return uv;
ba210ebe
JH
395
396malformed:
397
fcc8fcf6 398 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 399 if (retlen)
cc366d4b 400 *retlen = -1;
ba210ebe
JH
401 return 0;
402 }
403
a0dbb045
JH
404 if (dowarn) {
405 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
406
407 switch (warning) {
408 case 0: /* Intentionally empty. */ break;
409 case UTF8_WARN_EMPTY:
410 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
411 break;
412 case UTF8_WARN_CONTINUATION:
413 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
414 break;
415 case UTF8_WARN_NON_CONTINUATION:
416 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
417 (UV)s[1], uv);
418 break;
419 case UTF8_WARN_FE_FF:
420 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
421 break;
422 case UTF8_WARN_SHORT:
423 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
424 curlen, curlen == 1 ? "" : "s", expectlen);
425 break;
426 case UTF8_WARN_OVERFLOW:
427 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
428 ouv, *s);
429 break;
430 case UTF8_WARN_SURROGATE:
431 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
432 break;
433 case UTF8_WARN_BOM:
434 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
435 break;
436 case UTF8_WARN_LONG:
437 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
438 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
439 break;
440 case UTF8_WARN_FFFF:
441 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
442 break;
443 default:
444 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
445 break;
446 }
447
448 if (warning) {
449 char *s = SvPVX(sv);
450
451 if (PL_op)
452 Perl_warner(aTHX_ WARN_UTF8,
453 "%s in %s", s, PL_op_desc[PL_op->op_type]);
454 else
455 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
456 }
457 }
458
ba210ebe 459 if (retlen)
28d3d195 460 *retlen = expectlen ? expectlen : len;
ba210ebe 461
28d3d195 462 return 0;
a0ed51b3
LW
463}
464
8e84507e 465/*
9041c2e3 466=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
8e84507e 467
9041c2e3 468Returns the native character value of the first character in the string C<s>
8e84507e 469which is assumed to be in UTF8 encoding; C<retlen> will be set to the
7df053ec 470length, in bytes, of that character.
8e84507e 471
9041c2e3
NIS
472Allows length and flags to be passed to low level routine.
473
474=cut
475*/
476
477UV
478Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
479{
480 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
481 if (uv < 0x100)
482 return (UV) ASCII_TO_NATIVE(uv);
483 return uv;
484}
485
486/*
487=for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
488
489Returns the native character value of the first character in the string C<s>
490which is assumed to be in UTF8 encoding; C<retlen> will be set to the
491length, in bytes, of that character.
492
493If C<s> does not point to a well-formed UTF8 character, zero is
494returned and retlen is set, if possible, to -1.
495
496=cut
497*/
498
499UV
500Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
501{
502 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
503}
504
505/*
506=for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
507
508Returns the Unicode code point of the first character in the string C<s>
509which is assumed to be in UTF8 encoding; C<retlen> will be set to the
510length, in bytes, of that character.
511
512This function should only be used when returned UV is considered
513an index into the Unicode semantic tables (e.g. swashes).
514
ba210ebe
JH
515If C<s> does not point to a well-formed UTF8 character, zero is
516returned and retlen is set, if possible, to -1.
8e84507e
NIS
517
518=cut
519*/
520
521UV
9041c2e3 522Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
8e84507e 523{
9041c2e3
NIS
524 /* Call the low level routine asking for checks */
525 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e
NIS
526}
527
b76347f2 528/*
eebe1485 529=for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
b76347f2
JH
530
531Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
532Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
533up past C<e>, croaks.
b76347f2
JH
534
535=cut
536*/
537
538STRLEN
539Perl_utf8_length(pTHX_ U8* s, U8* e)
540{
541 STRLEN len = 0;
542
8850bf83
JH
543 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
544 * the bitops (especially ~) can create illegal UTF-8.
545 * In other words: in Perl UTF-8 is not just for Unicode. */
546
b76347f2 547 if (e < s)
02eb7b47 548 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
b76347f2 549 while (s < e) {
02eb7b47 550 U8 t = UTF8SKIP(s);
b76347f2
JH
551
552 if (e - s < t)
02eb7b47 553 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
b76347f2
JH
554 s += t;
555 len++;
556 }
557
558 return len;
559}
560
b06226ff 561/*
eebe1485 562=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff
JH
563
564Returns the number of UTF8 characters between the UTF-8 pointers C<a>
565and C<b>.
566
567WARNING: use only if you *know* that the pointers point inside the
568same UTF-8 buffer.
569
570=cut */
a0ed51b3 571
02eb7b47 572IV
864dbfa3 573Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 574{
02eb7b47
JH
575 IV off = 0;
576
8850bf83
JH
577 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
578 * the bitops (especially ~) can create illegal UTF-8.
579 * In other words: in Perl UTF-8 is not just for Unicode. */
580
a0ed51b3
LW
581 if (a < b) {
582 while (a < b) {
02eb7b47
JH
583 U8 c = UTF8SKIP(a);
584
585 if (b - a < c)
586 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
587 a += c;
a0ed51b3
LW
588 off--;
589 }
590 }
591 else {
592 while (b < a) {
02eb7b47
JH
593 U8 c = UTF8SKIP(b);
594
595 if (a - b < c)
596 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
597 b += c;
a0ed51b3
LW
598 off++;
599 }
600 }
02eb7b47 601
a0ed51b3
LW
602 return off;
603}
604
b06226ff 605/*
eebe1485 606=for apidoc A|U8*|utf8_hop|U8 *s|I32 off
b06226ff 607
8850bf83
JH
608Return the UTF-8 pointer C<s> displaced by C<off> characters, either
609forward or backward.
b06226ff
JH
610
611WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
612the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
613on the first byte of character or just after the last byte of a character.
b06226ff
JH
614
615=cut */
a0ed51b3
LW
616
617U8 *
864dbfa3 618Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 619{
8850bf83
JH
620 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
621 * the bitops (especially ~) can create illegal UTF-8.
622 * In other words: in Perl UTF-8 is not just for Unicode. */
623
a0ed51b3
LW
624 if (off >= 0) {
625 while (off--)
626 s += UTF8SKIP(s);
627 }
628 else {
629 while (off++) {
630 s--;
8850bf83
JH
631 while (UTF8_IS_CONTINUATION(*s))
632 s--;
a0ed51b3
LW
633 }
634 }
635 return s;
636}
637
6940069f 638/*
eebe1485 639=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 640
246fae53
MG
641Converts a string C<s> of length C<len> from UTF8 into byte encoding.
642Unlike C<bytes_to_utf8>, this over-writes the original string, and
643updates len to contain the new length.
67e989fb 644Returns zero on failure, setting C<len> to -1.
6940069f
GS
645
646=cut
647*/
648
649U8 *
246fae53 650Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
6940069f 651{
6940069f
GS
652 U8 *send;
653 U8 *d;
dcad2880 654 U8 *save = s;
246fae53
MG
655
656 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880
JH
657 for (send = s + *len; s < send; ) {
658 U8 c = *s++;
659
1d72bdf6
NIS
660 if (!UTF8_IS_INVARIANT(c) &&
661 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
662 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
663 *len = -1;
664 return 0;
665 }
246fae53 666 }
dcad2880
JH
667
668 d = s = save;
6940069f 669 while (s < send) {
ed646e6e 670 STRLEN ulen;
9041c2e3 671 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 672 s += ulen;
6940069f
GS
673 }
674 *d = '\0';
246fae53 675 *len = d - save;
6940069f
GS
676 return save;
677}
678
679/*
f9a63242
JH
680=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
681
682Converts a string C<s> of length C<len> from UTF8 into byte encoding.
683Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
684the newly-created string, and updates C<len> to contain the new
685length. Returns the original string if no conversion occurs, C<len>
686is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6870 if C<s> is converted or contains all 7bit characters.
f9a63242
JH
688
689=cut */
690
691U8 *
692Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
693{
f9a63242
JH
694 U8 *d;
695 U8 *start = s;
db42d148 696 U8 *send;
f9a63242
JH
697 I32 count = 0;
698
699 if (!*is_utf8)
700 return start;
701
ef9edfd0 702 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242
JH
703 for (send = s + *len; s < send;) {
704 U8 c = *s++;
1d72bdf6 705 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
706 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
707 (c = *s++) && UTF8_IS_CONTINUATION(c))
708 count++;
709 else
f9a63242 710 return start;
db42d148 711 }
f9a63242
JH
712 }
713
714 *is_utf8 = 0;
715
db42d148
NIS
716#ifndef EBCDIC
717 /* Can use as-is if no high chars */
f9a63242
JH
718 if (!count)
719 return start;
db42d148 720#endif
f9a63242
JH
721
722 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 723 s = start; start = d;
f9a63242
JH
724 while (s < send) {
725 U8 c = *s++;
1d72bdf6 726 if (!UTF8_IS_INVARIANT(c))
db42d148
NIS
727 c = UTF8_ACCUMULATE(c, *s++);
728 *d++ = ASCII_TO_NATIVE(c);
f9a63242
JH
729 }
730 *d = '\0';
731 *len = d - start;
732 return start;
733}
734
735/*
eebe1485 736=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f
GS
737
738Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e
GS
739Returns a pointer to the newly-created string, and sets C<len> to
740reflect the new length.
6940069f 741
497711e7 742=cut
6940069f
GS
743*/
744
745U8*
6662521e 746Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
6940069f 747{
6940069f
GS
748 U8 *send;
749 U8 *d;
750 U8 *dst;
6662521e 751 send = s + (*len);
6940069f 752
6662521e 753 Newz(801, d, (*len) * 2 + 1, U8);
6940069f
GS
754 dst = d;
755
756 while (s < send) {
db42d148 757 UV uv = NATIVE_TO_ASCII(*s++);
1d72bdf6 758 if (UTF8_IS_INVARIANT(uv))
db42d148 759 *d++ = uv;
6940069f 760 else {
90f44359
JH
761 *d++ = UTF8_EIGHT_BIT_HI(uv);
762 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
763 }
764 }
765 *d = '\0';
6662521e 766 *len = d-dst;
6940069f
GS
767 return dst;
768}
769
a0ed51b3 770/*
dea0fc0b 771 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
772 *
773 * Destination must be pre-extended to 3/2 source. Do not use in-place.
774 * We optimize for native, for obvious reasons. */
775
776U8*
dea0fc0b 777Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 778{
dea0fc0b
JH
779 U8* pend;
780 U8* dstart = d;
781
782 if (bytelen & 1)
a7867d0a 783 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b
JH
784
785 pend = p + bytelen;
786
a0ed51b3 787 while (p < pend) {
dea0fc0b
JH
788 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
789 p += 2;
a0ed51b3
LW
790 if (uv < 0x80) {
791 *d++ = uv;
792 continue;
793 }
794 if (uv < 0x800) {
795 *d++ = (( uv >> 6) | 0xc0);
796 *d++ = (( uv & 0x3f) | 0x80);
797 continue;
798 }
799 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b
JH
800 UV low = *p++;
801 if (low < 0xdc00 || low >= 0xdfff)
802 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
803 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
804 }
805 if (uv < 0x10000) {
806 *d++ = (( uv >> 12) | 0xe0);
807 *d++ = (((uv >> 6) & 0x3f) | 0x80);
808 *d++ = (( uv & 0x3f) | 0x80);
809 continue;
810 }
811 else {
812 *d++ = (( uv >> 18) | 0xf0);
813 *d++ = (((uv >> 12) & 0x3f) | 0x80);
814 *d++ = (((uv >> 6) & 0x3f) | 0x80);
815 *d++ = (( uv & 0x3f) | 0x80);
816 continue;
817 }
818 }
dea0fc0b 819 *newlen = d - dstart;
a0ed51b3
LW
820 return d;
821}
822
823/* Note: this one is slightly destructive of the source. */
824
825U8*
dea0fc0b 826Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
827{
828 U8* s = (U8*)p;
829 U8* send = s + bytelen;
830 while (s < send) {
831 U8 tmp = s[0];
832 s[0] = s[1];
833 s[1] = tmp;
834 s += 2;
835 }
dea0fc0b 836 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
837}
838
839/* for now these are all defined (inefficiently) in terms of the utf8 versions */
840
841bool
864dbfa3 842Perl_is_uni_alnum(pTHX_ U32 c)
a0ed51b3 843{
ad391ad9 844 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 845 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
846 return is_utf8_alnum(tmpbuf);
847}
848
849bool
b8c5462f
JH
850Perl_is_uni_alnumc(pTHX_ U32 c)
851{
ad391ad9 852 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 853 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
854 return is_utf8_alnumc(tmpbuf);
855}
856
857bool
864dbfa3 858Perl_is_uni_idfirst(pTHX_ U32 c)
a0ed51b3 859{
ad391ad9 860 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 861 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
862 return is_utf8_idfirst(tmpbuf);
863}
864
865bool
864dbfa3 866Perl_is_uni_alpha(pTHX_ U32 c)
a0ed51b3 867{
ad391ad9 868 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 869 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
870 return is_utf8_alpha(tmpbuf);
871}
872
873bool
4d61ec05
GS
874Perl_is_uni_ascii(pTHX_ U32 c)
875{
ad391ad9 876 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 877 uvuni_to_utf8(tmpbuf, (UV)c);
4d61ec05
GS
878 return is_utf8_ascii(tmpbuf);
879}
880
881bool
864dbfa3 882Perl_is_uni_space(pTHX_ U32 c)
a0ed51b3 883{
ad391ad9 884 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 885 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
886 return is_utf8_space(tmpbuf);
887}
888
889bool
864dbfa3 890Perl_is_uni_digit(pTHX_ U32 c)
a0ed51b3 891{
ad391ad9 892 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 893 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
894 return is_utf8_digit(tmpbuf);
895}
896
897bool
864dbfa3 898Perl_is_uni_upper(pTHX_ U32 c)
a0ed51b3 899{
ad391ad9 900 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 901 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
902 return is_utf8_upper(tmpbuf);
903}
904
905bool
864dbfa3 906Perl_is_uni_lower(pTHX_ U32 c)
a0ed51b3 907{
ad391ad9 908 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 909 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
910 return is_utf8_lower(tmpbuf);
911}
912
913bool
b8c5462f
JH
914Perl_is_uni_cntrl(pTHX_ U32 c)
915{
ad391ad9 916 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 917 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
918 return is_utf8_cntrl(tmpbuf);
919}
920
921bool
922Perl_is_uni_graph(pTHX_ U32 c)
923{
ad391ad9 924 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 925 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
926 return is_utf8_graph(tmpbuf);
927}
928
929bool
864dbfa3 930Perl_is_uni_print(pTHX_ U32 c)
a0ed51b3 931{
ad391ad9 932 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 933 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
934 return is_utf8_print(tmpbuf);
935}
936
b8c5462f 937bool
f248d071 938Perl_is_uni_punct(pTHX_ U32 c)
b8c5462f 939{
ad391ad9 940 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 941 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
942 return is_utf8_punct(tmpbuf);
943}
944
4d61ec05
GS
945bool
946Perl_is_uni_xdigit(pTHX_ U32 c)
947{
ad391ad9 948 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 949 uvuni_to_utf8(tmpbuf, (UV)c);
4d61ec05
GS
950 return is_utf8_xdigit(tmpbuf);
951}
952
a0ed51b3 953U32
864dbfa3 954Perl_to_uni_upper(pTHX_ U32 c)
a0ed51b3 955{
ad391ad9 956 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 957 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
958 return to_utf8_upper(tmpbuf);
959}
960
961U32
864dbfa3 962Perl_to_uni_title(pTHX_ U32 c)
a0ed51b3 963{
ad391ad9 964 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 965 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
966 return to_utf8_title(tmpbuf);
967}
968
969U32
864dbfa3 970Perl_to_uni_lower(pTHX_ U32 c)
a0ed51b3 971{
ad391ad9 972 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 973 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
974 return to_utf8_lower(tmpbuf);
975}
976
977/* for now these all assume no locale info available for Unicode > 255 */
978
979bool
864dbfa3 980Perl_is_uni_alnum_lc(pTHX_ U32 c)
a0ed51b3
LW
981{
982 return is_uni_alnum(c); /* XXX no locale support yet */
983}
984
985bool
b8c5462f
JH
986Perl_is_uni_alnumc_lc(pTHX_ U32 c)
987{
988 return is_uni_alnumc(c); /* XXX no locale support yet */
989}
990
991bool
864dbfa3 992Perl_is_uni_idfirst_lc(pTHX_ U32 c)
a0ed51b3
LW
993{
994 return is_uni_idfirst(c); /* XXX no locale support yet */
995}
996
997bool
864dbfa3 998Perl_is_uni_alpha_lc(pTHX_ U32 c)
a0ed51b3
LW
999{
1000 return is_uni_alpha(c); /* XXX no locale support yet */
1001}
1002
1003bool
4d61ec05
GS
1004Perl_is_uni_ascii_lc(pTHX_ U32 c)
1005{
1006 return is_uni_ascii(c); /* XXX no locale support yet */
1007}
1008
1009bool
864dbfa3 1010Perl_is_uni_space_lc(pTHX_ U32 c)
a0ed51b3
LW
1011{
1012 return is_uni_space(c); /* XXX no locale support yet */
1013}
1014
1015bool
864dbfa3 1016Perl_is_uni_digit_lc(pTHX_ U32 c)
a0ed51b3
LW
1017{
1018 return is_uni_digit(c); /* XXX no locale support yet */
1019}
1020
1021bool
864dbfa3 1022Perl_is_uni_upper_lc(pTHX_ U32 c)
a0ed51b3
LW
1023{
1024 return is_uni_upper(c); /* XXX no locale support yet */
1025}
1026
1027bool
864dbfa3 1028Perl_is_uni_lower_lc(pTHX_ U32 c)
a0ed51b3
LW
1029{
1030 return is_uni_lower(c); /* XXX no locale support yet */
1031}
1032
1033bool
b8c5462f
JH
1034Perl_is_uni_cntrl_lc(pTHX_ U32 c)
1035{
1036 return is_uni_cntrl(c); /* XXX no locale support yet */
1037}
1038
1039bool
1040Perl_is_uni_graph_lc(pTHX_ U32 c)
1041{
1042 return is_uni_graph(c); /* XXX no locale support yet */
1043}
1044
1045bool
864dbfa3 1046Perl_is_uni_print_lc(pTHX_ U32 c)
a0ed51b3
LW
1047{
1048 return is_uni_print(c); /* XXX no locale support yet */
1049}
1050
b8c5462f
JH
1051bool
1052Perl_is_uni_punct_lc(pTHX_ U32 c)
1053{
1054 return is_uni_punct(c); /* XXX no locale support yet */
1055}
1056
4d61ec05
GS
1057bool
1058Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1059{
1060 return is_uni_xdigit(c); /* XXX no locale support yet */
1061}
1062
a0ed51b3 1063U32
864dbfa3 1064Perl_to_uni_upper_lc(pTHX_ U32 c)
a0ed51b3
LW
1065{
1066 return to_uni_upper(c); /* XXX no locale support yet */
1067}
1068
1069U32
864dbfa3 1070Perl_to_uni_title_lc(pTHX_ U32 c)
a0ed51b3
LW
1071{
1072 return to_uni_title(c); /* XXX no locale support yet */
1073}
1074
1075U32
864dbfa3 1076Perl_to_uni_lower_lc(pTHX_ U32 c)
a0ed51b3
LW
1077{
1078 return to_uni_lower(c); /* XXX no locale support yet */
1079}
1080
a0ed51b3 1081bool
864dbfa3 1082Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1083{
386d01d6
GS
1084 if (!is_utf8_char(p))
1085 return FALSE;
a0ed51b3 1086 if (!PL_utf8_alnum)
289d4f09
ML
1087 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1088 * descendant of isalnum(3), in other words, it doesn't
1089 * contain the '_'. --jhi */
1090 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1091 return swash_fetch(PL_utf8_alnum, p);
1092/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1093#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1094 if (!PL_utf8_alnum)
1095 PL_utf8_alnum = swash_init("utf8", "",
1096 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1097 return swash_fetch(PL_utf8_alnum, p);
1098#endif
1099}
1100
1101bool
b8c5462f
JH
1102Perl_is_utf8_alnumc(pTHX_ U8 *p)
1103{
386d01d6
GS
1104 if (!is_utf8_char(p))
1105 return FALSE;
b8c5462f
JH
1106 if (!PL_utf8_alnum)
1107 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1108 return swash_fetch(PL_utf8_alnum, p);
1109/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1110#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1111 if (!PL_utf8_alnum)
1112 PL_utf8_alnum = swash_init("utf8", "",
1113 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1114 return swash_fetch(PL_utf8_alnum, p);
1115#endif
1116}
1117
1118bool
864dbfa3 1119Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3
LW
1120{
1121 return *p == '_' || is_utf8_alpha(p);
1122}
1123
1124bool
864dbfa3 1125Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1126{
386d01d6
GS
1127 if (!is_utf8_char(p))
1128 return FALSE;
a0ed51b3 1129 if (!PL_utf8_alpha)
e24b16f9 1130 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1131 return swash_fetch(PL_utf8_alpha, p);
1132}
1133
1134bool
b8c5462f
JH
1135Perl_is_utf8_ascii(pTHX_ U8 *p)
1136{
386d01d6
GS
1137 if (!is_utf8_char(p))
1138 return FALSE;
b8c5462f
JH
1139 if (!PL_utf8_ascii)
1140 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1141 return swash_fetch(PL_utf8_ascii, p);
1142}
1143
1144bool
864dbfa3 1145Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1146{
386d01d6
GS
1147 if (!is_utf8_char(p))
1148 return FALSE;
a0ed51b3 1149 if (!PL_utf8_space)
3bec3564 1150 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1151 return swash_fetch(PL_utf8_space, p);
1152}
1153
1154bool
864dbfa3 1155Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1156{
386d01d6
GS
1157 if (!is_utf8_char(p))
1158 return FALSE;
a0ed51b3 1159 if (!PL_utf8_digit)
e24b16f9 1160 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1161 return swash_fetch(PL_utf8_digit, p);
1162}
1163
1164bool
864dbfa3 1165Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1166{
386d01d6
GS
1167 if (!is_utf8_char(p))
1168 return FALSE;
a0ed51b3 1169 if (!PL_utf8_upper)
e24b16f9 1170 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1171 return swash_fetch(PL_utf8_upper, p);
1172}
1173
1174bool
864dbfa3 1175Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1176{
386d01d6
GS
1177 if (!is_utf8_char(p))
1178 return FALSE;
a0ed51b3 1179 if (!PL_utf8_lower)
e24b16f9 1180 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1181 return swash_fetch(PL_utf8_lower, p);
1182}
1183
1184bool
b8c5462f
JH
1185Perl_is_utf8_cntrl(pTHX_ U8 *p)
1186{
386d01d6
GS
1187 if (!is_utf8_char(p))
1188 return FALSE;
b8c5462f
JH
1189 if (!PL_utf8_cntrl)
1190 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1191 return swash_fetch(PL_utf8_cntrl, p);
1192}
1193
1194bool
1195Perl_is_utf8_graph(pTHX_ U8 *p)
1196{
386d01d6
GS
1197 if (!is_utf8_char(p))
1198 return FALSE;
b8c5462f
JH
1199 if (!PL_utf8_graph)
1200 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1201 return swash_fetch(PL_utf8_graph, p);
1202}
1203
1204bool
864dbfa3 1205Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1206{
386d01d6
GS
1207 if (!is_utf8_char(p))
1208 return FALSE;
a0ed51b3 1209 if (!PL_utf8_print)
e24b16f9 1210 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1211 return swash_fetch(PL_utf8_print, p);
1212}
1213
1214bool
b8c5462f
JH
1215Perl_is_utf8_punct(pTHX_ U8 *p)
1216{
386d01d6
GS
1217 if (!is_utf8_char(p))
1218 return FALSE;
b8c5462f
JH
1219 if (!PL_utf8_punct)
1220 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1221 return swash_fetch(PL_utf8_punct, p);
1222}
1223
1224bool
1225Perl_is_utf8_xdigit(pTHX_ U8 *p)
1226{
386d01d6
GS
1227 if (!is_utf8_char(p))
1228 return FALSE;
b8c5462f
JH
1229 if (!PL_utf8_xdigit)
1230 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1231 return swash_fetch(PL_utf8_xdigit, p);
1232}
1233
1234bool
864dbfa3 1235Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1236{
386d01d6
GS
1237 if (!is_utf8_char(p))
1238 return FALSE;
a0ed51b3 1239 if (!PL_utf8_mark)
e24b16f9 1240 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1241 return swash_fetch(PL_utf8_mark, p);
1242}
1243
2104c8d9 1244UV
864dbfa3 1245Perl_to_utf8_upper(pTHX_ U8 *p)
a0ed51b3
LW
1246{
1247 UV uv;
1248
1249 if (!PL_utf8_toupper)
e24b16f9 1250 PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
a0ed51b3 1251 uv = swash_fetch(PL_utf8_toupper, p);
9041c2e3 1252 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3
LW
1253}
1254
2104c8d9 1255UV
864dbfa3 1256Perl_to_utf8_title(pTHX_ U8 *p)
a0ed51b3
LW
1257{
1258 UV uv;
1259
1260 if (!PL_utf8_totitle)
e24b16f9 1261 PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
a0ed51b3 1262 uv = swash_fetch(PL_utf8_totitle, p);
9041c2e3 1263 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3
LW
1264}
1265
2104c8d9 1266UV
864dbfa3 1267Perl_to_utf8_lower(pTHX_ U8 *p)
a0ed51b3
LW
1268{
1269 UV uv;
1270
1271 if (!PL_utf8_tolower)
e24b16f9 1272 PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
a0ed51b3 1273 uv = swash_fetch(PL_utf8_tolower, p);
9041c2e3 1274 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3
LW
1275}
1276
1277/* a "swash" is a swatch hash */
1278
1279SV*
864dbfa3 1280Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3
LW
1281{
1282 SV* retval;
bf1fed83 1283 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1284 dSP;
1b026014 1285 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
ce3b816e 1286
1b026014 1287 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e
GS
1288 ENTER;
1289 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1290 LEAVE;
1291 }
1292 SPAGAIN;
a0ed51b3
LW
1293 PUSHSTACKi(PERLSI_MAGIC);
1294 PUSHMARK(SP);
1295 EXTEND(SP,5);
1296 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1297 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1298 PUSHs(listsv);
1299 PUSHs(sv_2mortal(newSViv(minbits)));
1300 PUSHs(sv_2mortal(newSViv(none)));
1301 PUTBACK;
1302 ENTER;
1303 SAVEI32(PL_hints);
1304 PL_hints = 0;
1305 save_re_context();
bf1fed83
JH
1306 if (PL_curcop == &PL_compiling)
1307 /* XXX ought to be handled by lex_start */
1308 sv_setpv(tokenbufsv, PL_tokenbuf);
864dbfa3 1309 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1310 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1311 else
e24b16f9 1312 retval = &PL_sv_undef;
a0ed51b3
LW
1313 LEAVE;
1314 POPSTACK;
e24b16f9 1315 if (PL_curcop == &PL_compiling) {
bf1fed83
JH
1316 STRLEN len;
1317 char* pv = SvPV(tokenbufsv, len);
1318
1319 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1320 PL_curcop->op_private = PL_hints;
a0ed51b3
LW
1321 }
1322 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 1323 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3
LW
1324 return retval;
1325}
1326
1327UV
864dbfa3 1328Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
a0ed51b3
LW
1329{
1330 HV* hv = (HV*)SvRV(sv);
1331 U32 klen = UTF8SKIP(ptr) - 1;
1332 U32 off = ptr[klen] & 127; /* NB: 64 bit always 0 when len > 1 */
1333 STRLEN slen;
1334 STRLEN needents = (klen ? 64 : 128);
dfe13c55 1335 U8 *tmps;
a0ed51b3
LW
1336 U32 bit;
1337 SV *retval;
1338
1339 /*
1340 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1341 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1342 * it's nothing to sniff at.) Pity we usually come through at least
1343 * two function calls to get here...
1344 *
1345 * NB: this code assumes that swatches are never modified, once generated!
1346 */
1347
1348 if (hv == PL_last_swash_hv &&
1349 klen == PL_last_swash_klen &&
12ae5dfc 1350 (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
a0ed51b3
LW
1351 {
1352 tmps = PL_last_swash_tmps;
1353 slen = PL_last_swash_slen;
1354 }
1355 else {
1356 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1357 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3
LW
1358
1359 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1360 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3
LW
1361 dSP;
1362 ENTER;
1363 SAVETMPS;
1364 save_re_context();
1365 PUSHSTACKi(PERLSI_MAGIC);
1366 PUSHMARK(SP);
1367 EXTEND(SP,3);
1368 PUSHs((SV*)sv);
9041c2e3
NIS
1369 /* We call utf8_to_uni as we want and index into Unicode tables,
1370 not a native character number.
1371 */
1372 PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
a0ed51b3
LW
1373 PUSHs(sv_2mortal(newSViv(needents)));
1374 PUTBACK;
864dbfa3 1375 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1376 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1377 else
e24b16f9 1378 retval = &PL_sv_undef;
a0ed51b3
LW
1379 POPSTACK;
1380 FREETMPS;
1381 LEAVE;
e24b16f9
GS
1382 if (PL_curcop == &PL_compiling)
1383 PL_curcop->op_private = PL_hints;
a0ed51b3 1384
dfe13c55 1385 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1386
dfe13c55 1387 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
cea2e8a9 1388 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1389 }
1390
1391 PL_last_swash_hv = hv;
1392 PL_last_swash_klen = klen;
1393 PL_last_swash_tmps = tmps;
1394 PL_last_swash_slen = slen;
1395 if (klen)
1396 Copy(ptr, PL_last_swash_key, klen, U8);
1397 }
1398
9faf8d75 1399 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1400 case 1:
1401 bit = 1 << (off & 7);
1402 off >>= 3;
1403 return (tmps[off] & bit) != 0;
1404 case 8:
1405 return tmps[off];
1406 case 16:
1407 off <<= 1;
1408 return (tmps[off] << 8) + tmps[off + 1] ;
1409 case 32:
1410 off <<= 2;
1411 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1412 }
cea2e8a9 1413 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1414 return 0;
1415}