This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate changes #9259,9260 from maintperl into mainline.
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
bc89e66f 3 * Copyright (c) 1998-2001, Larry Wall
a0ed51b3
LW
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3
LW
25#include "perl.h"
26
27/* Unicode support */
28
eebe1485 29/*
9041c2e3 30=for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
eebe1485
SC
31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
9041c2e3 35end of the new character. In other words,
eebe1485 36
9041c2e3 37 d = uvuni_to_utf8(d, uv);
eebe1485
SC
38
39is the recommended Unicode-aware way of saying
40
41 *(d++) = uv;
42
43=cut
44*/
45
dfe13c55 46U8 *
9041c2e3 47Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
a0ed51b3 48{
c4d5f83a
NIS
49 if (UNI_IS_INVARIANT(uv)) {
50 *d++ = UTF_TO_NATIVE(uv);
a0ed51b3
LW
51 return d;
52 }
1d72bdf6
NIS
53#if defined(EBCDIC) || 1 /* always for testing */
54 else {
55 STRLEN len = UNISKIP(uv);
56 U8 *p = d+len-1;
57 while (p > d) {
58 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
59 uv >>= UTF_ACCUMULATION_SHIFT;
60 }
61 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
62 return d+len;
63 }
64#else /* Non loop style */
a0ed51b3
LW
65 if (uv < 0x800) {
66 *d++ = (( uv >> 6) | 0xc0);
67 *d++ = (( uv & 0x3f) | 0x80);
68 return d;
69 }
70 if (uv < 0x10000) {
71 *d++ = (( uv >> 12) | 0xe0);
72 *d++ = (((uv >> 6) & 0x3f) | 0x80);
73 *d++ = (( uv & 0x3f) | 0x80);
74 return d;
75 }
76 if (uv < 0x200000) {
77 *d++ = (( uv >> 18) | 0xf0);
78 *d++ = (((uv >> 12) & 0x3f) | 0x80);
79 *d++ = (((uv >> 6) & 0x3f) | 0x80);
80 *d++ = (( uv & 0x3f) | 0x80);
81 return d;
82 }
83 if (uv < 0x4000000) {
84 *d++ = (( uv >> 24) | 0xf8);
85 *d++ = (((uv >> 18) & 0x3f) | 0x80);
86 *d++ = (((uv >> 12) & 0x3f) | 0x80);
87 *d++ = (((uv >> 6) & 0x3f) | 0x80);
88 *d++ = (( uv & 0x3f) | 0x80);
89 return d;
90 }
91 if (uv < 0x80000000) {
92 *d++ = (( uv >> 30) | 0xfc);
93 *d++ = (((uv >> 24) & 0x3f) | 0x80);
94 *d++ = (((uv >> 18) & 0x3f) | 0x80);
95 *d++ = (((uv >> 12) & 0x3f) | 0x80);
96 *d++ = (((uv >> 6) & 0x3f) | 0x80);
97 *d++ = (( uv & 0x3f) | 0x80);
98 return d;
99 }
6b8eaf93 100#ifdef HAS_QUAD
d7578b48 101 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
102#endif
103 {
104 *d++ = 0xfe; /* Can't match U+FEFF! */
105 *d++ = (((uv >> 30) & 0x3f) | 0x80);
106 *d++ = (((uv >> 24) & 0x3f) | 0x80);
107 *d++ = (((uv >> 18) & 0x3f) | 0x80);
108 *d++ = (((uv >> 12) & 0x3f) | 0x80);
109 *d++ = (((uv >> 6) & 0x3f) | 0x80);
110 *d++ = (( uv & 0x3f) | 0x80);
111 return d;
112 }
6b8eaf93 113#ifdef HAS_QUAD
a0ed51b3
LW
114 {
115 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b
GS
116 *d++ = 0x80; /* 6 Reserved bits */
117 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
118 *d++ = (((uv >> 54) & 0x3f) | 0x80);
119 *d++ = (((uv >> 48) & 0x3f) | 0x80);
120 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3
LW
121 *d++ = (((uv >> 36) & 0x3f) | 0x80);
122 *d++ = (((uv >> 30) & 0x3f) | 0x80);
123 *d++ = (((uv >> 24) & 0x3f) | 0x80);
124 *d++ = (((uv >> 18) & 0x3f) | 0x80);
125 *d++ = (((uv >> 12) & 0x3f) | 0x80);
126 *d++ = (((uv >> 6) & 0x3f) | 0x80);
127 *d++ = (( uv & 0x3f) | 0x80);
128 return d;
129 }
130#endif
1d72bdf6 131#endif /* Loop style */
a0ed51b3
LW
132}
133
eebe1485 134/*
9041c2e3
NIS
135=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
136
137Adds the UTF8 representation of the Native codepoint C<uv> to the end
138of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
139bytes available. The return value is the pointer to the byte after the
140end of the new character. In other words,
141
142 d = uvchr_to_utf8(d, uv);
143
144is the recommended wide native character-aware way of saying
145
146 *(d++) = uv;
147
148=cut
149*/
150
151U8 *
152Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
153{
c4d5f83a 154 return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
9041c2e3
NIS
155}
156
157
158/*
eebe1485
SC
159=for apidoc A|STRLEN|is_utf8_char|U8 *s
160
5da9da9e 161Tests if some arbitrary number of bytes begins in a valid UTF-8
1d72bdf6 162character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
5da9da9e
JH
163The actual number of bytes in the UTF-8 character will be returned if
164it is valid, otherwise 0.
9041c2e3 165
5da9da9e 166=cut */
067a85ef 167STRLEN
386d01d6
GS
168Perl_is_utf8_char(pTHX_ U8 *s)
169{
170 U8 u = *s;
067a85ef
A
171 STRLEN slen, len;
172 UV uv, ouv;
386d01d6 173
1d72bdf6 174 if (UTF8_IS_INVARIANT(u))
386d01d6
GS
175 return 1;
176
60006e79 177 if (!UTF8_IS_START(u))
386d01d6
GS
178 return 0;
179
9f07fdcd 180 len = UTF8SKIP(s);
386d01d6 181
60006e79 182 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef
A
183 return 0;
184
386d01d6
GS
185 slen = len - 1;
186 s++;
1d72bdf6
NIS
187 /* The initial value is dubious */
188 uv = u;
067a85ef 189 ouv = uv;
386d01d6 190 while (slen--) {
60006e79 191 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 192 return 0;
8850bf83 193 uv = UTF8_ACCUMULATE(uv, *s);
067a85ef
A
194 if (uv < ouv)
195 return 0;
196 ouv = uv;
386d01d6
GS
197 s++;
198 }
067a85ef 199
5bbb0b5a 200 if (UNISKIP(uv) < len)
067a85ef
A
201 return 0;
202
386d01d6
GS
203 return len;
204}
205
6662521e 206/*
eebe1485 207=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 208
5da9da9e
JH
209Returns true if first C<len> bytes of the given string form a valid UTF8
210string, false otherwise. Note that 'a valid UTF8 string' does not mean
211'a string that contains UTF8' because a valid ASCII string is a valid
212UTF8 string.
6662521e
GS
213
214=cut
215*/
216
8e84507e 217bool
6662521e
GS
218Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
219{
067a85ef 220 U8* x = s;
1aa99e6b 221 U8* send;
067a85ef
A
222 STRLEN c;
223
1aa99e6b 224 if (!len)
6cd5fe39 225 len = strlen((char *)s);
1aa99e6b
IH
226 send = s + len;
227
6662521e
GS
228 while (x < send) {
229 c = is_utf8_char(x);
067a85ef
A
230 if (!c)
231 return FALSE;
6662521e 232 x += c;
6662521e 233 }
60006e79
JH
234 if (x != send)
235 return FALSE;
067a85ef
A
236
237 return TRUE;
6662521e
GS
238}
239
67e989fb 240/*
9041c2e3 241=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 242
9041c2e3
NIS
243Bottom level UTF-8 decode routine.
244Returns the unicode code point value of the first character in the string C<s>
ba210ebe 245which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 246C<retlen> will be set to the length, in bytes, of that character.
67e989fb
JH
247
248If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880
JH
249is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
250it is assumed that the caller will raise a warning, and this function
28d3d195
JH
251will silently just set C<retlen> to C<-1> and return zero. If the
252C<flags> does not contain UTF8_CHECK_ONLY, warnings about
253malformations will be given, C<retlen> will be set to the expected
254length of the UTF-8 character in bytes, and zero will be returned.
255
256The C<flags> can also contain various flags to allow deviations from
257the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 258
9041c2e3
NIS
259Most code should use utf8_to_uvchr() rather than call this directly.
260
dcad2880 261=cut */
67e989fb 262
a0ed51b3 263UV
9041c2e3 264Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
a0ed51b3 265{
ba210ebe
JH
266 UV uv = *s, ouv;
267 STRLEN len = 1;
268 bool dowarn = ckWARN_d(WARN_UTF8);
269 STRLEN expectlen = 0;
a0dbb045
JH
270 U32 warning = 0;
271
272/* This list is a superset of the UTF8_ALLOW_XXX. */
273
274#define UTF8_WARN_EMPTY 1
275#define UTF8_WARN_CONTINUATION 2
276#define UTF8_WARN_NON_CONTINUATION 3
277#define UTF8_WARN_FE_FF 4
278#define UTF8_WARN_SHORT 5
279#define UTF8_WARN_OVERFLOW 6
280#define UTF8_WARN_SURROGATE 7
281#define UTF8_WARN_BOM 8
282#define UTF8_WARN_LONG 9
283#define UTF8_WARN_FFFF 10
284
285 if (curlen == 0 &&
286 !(flags & UTF8_ALLOW_EMPTY)) {
287 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
288 goto malformed;
289 }
290
1d72bdf6 291 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
292 if (retlen)
293 *retlen = 1;
c4d5f83a 294 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 295 }
67e989fb 296
421a8bf2 297 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 298 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 299 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
300 goto malformed;
301 }
302
421a8bf2 303 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 304 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 305 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
306 goto malformed;
307 }
9041c2e3 308
1d72bdf6 309#ifdef EBCDIC
75383841 310 uv = NATIVE_TO_UTF(uv);
1d72bdf6 311#else
fcc8fcf6
JH
312 if ((uv == 0xfe || uv == 0xff) &&
313 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 314 warning = UTF8_WARN_FE_FF;
ba210ebe 315 goto malformed;
a0ed51b3 316 }
1d72bdf6
NIS
317#endif
318
ba210ebe
JH
319 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
320 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
321 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
322 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
323#ifdef EBCDIC
324 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
325 else { len = 7; uv &= 0x01; }
326#else
ba210ebe
JH
327 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
328 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
329 else { len = 13; uv = 0; } /* whoa! */
330#endif
331
a0ed51b3
LW
332 if (retlen)
333 *retlen = len;
9041c2e3 334
ba210ebe
JH
335 expectlen = len;
336
fcc8fcf6
JH
337 if ((curlen < expectlen) &&
338 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 339 warning = UTF8_WARN_SHORT;
ba210ebe
JH
340 goto malformed;
341 }
342
343 len--;
a0ed51b3 344 s++;
ba210ebe
JH
345 ouv = uv;
346
a0ed51b3 347 while (len--) {
421a8bf2
JH
348 if (!UTF8_IS_CONTINUATION(*s) &&
349 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
350 s--;
351 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 352 goto malformed;
a0ed51b3
LW
353 }
354 else
8850bf83 355 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
356 if (!(uv > ouv)) {
357 /* These cannot be allowed. */
358 if (uv == ouv) {
359 if (!(flags & UTF8_ALLOW_LONG)) {
360 warning = UTF8_WARN_LONG;
361 goto malformed;
362 }
363 }
364 else { /* uv < ouv */
365 /* This cannot be allowed. */
366 warning = UTF8_WARN_OVERFLOW;
367 goto malformed;
368 }
ba210ebe
JH
369 }
370 s++;
371 ouv = uv;
372 }
373
421a8bf2 374 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 375 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 376 warning = UTF8_WARN_SURROGATE;
ba210ebe 377 goto malformed;
421a8bf2 378 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 379 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 380 warning = UTF8_WARN_BOM;
ba210ebe 381 goto malformed;
fcc8fcf6
JH
382 } else if ((expectlen > UNISKIP(uv)) &&
383 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 384 warning = UTF8_WARN_LONG;
ba210ebe 385 goto malformed;
421a8bf2 386 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 387 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 388 warning = UTF8_WARN_FFFF;
a9917092 389 goto malformed;
a0ed51b3 390 }
ba210ebe 391
a0ed51b3 392 return uv;
ba210ebe
JH
393
394malformed:
395
fcc8fcf6 396 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 397 if (retlen)
cc366d4b 398 *retlen = -1;
ba210ebe
JH
399 return 0;
400 }
401
a0dbb045
JH
402 if (dowarn) {
403 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
404
405 switch (warning) {
406 case 0: /* Intentionally empty. */ break;
407 case UTF8_WARN_EMPTY:
408 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
409 break;
410 case UTF8_WARN_CONTINUATION:
411 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
412 break;
413 case UTF8_WARN_NON_CONTINUATION:
414 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
415 (UV)s[1], uv);
416 break;
417 case UTF8_WARN_FE_FF:
418 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
419 break;
420 case UTF8_WARN_SHORT:
421 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
422 curlen, curlen == 1 ? "" : "s", expectlen);
423 break;
424 case UTF8_WARN_OVERFLOW:
425 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
426 ouv, *s);
427 break;
428 case UTF8_WARN_SURROGATE:
429 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
430 break;
431 case UTF8_WARN_BOM:
432 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
433 break;
434 case UTF8_WARN_LONG:
435 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
436 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
437 break;
438 case UTF8_WARN_FFFF:
439 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
440 break;
441 default:
442 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
443 break;
444 }
445
446 if (warning) {
447 char *s = SvPVX(sv);
448
449 if (PL_op)
450 Perl_warner(aTHX_ WARN_UTF8,
451 "%s in %s", s, PL_op_desc[PL_op->op_type]);
452 else
453 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
454 }
455 }
456
ba210ebe 457 if (retlen)
28d3d195 458 *retlen = expectlen ? expectlen : len;
ba210ebe 459
28d3d195 460 return 0;
a0ed51b3
LW
461}
462
8e84507e 463/*
9041c2e3 464=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
8e84507e 465
9041c2e3 466Returns the native character value of the first character in the string C<s>
8e84507e 467which is assumed to be in UTF8 encoding; C<retlen> will be set to the
7df053ec 468length, in bytes, of that character.
8e84507e 469
9041c2e3
NIS
470Allows length and flags to be passed to low level routine.
471
472=cut
473*/
474
475UV
476Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
477{
478 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
c4d5f83a 479 return UNI_TO_NATIVE(uv);
9041c2e3
NIS
480}
481
482/*
483=for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
484
485Returns the native character value of the first character in the string C<s>
486which is assumed to be in UTF8 encoding; C<retlen> will be set to the
487length, in bytes, of that character.
488
489If C<s> does not point to a well-formed UTF8 character, zero is
490returned and retlen is set, if possible, to -1.
491
492=cut
493*/
494
495UV
496Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
497{
498 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
499}
500
501/*
502=for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
503
504Returns the Unicode code point of the first character in the string C<s>
505which is assumed to be in UTF8 encoding; C<retlen> will be set to the
506length, in bytes, of that character.
507
508This function should only be used when returned UV is considered
509an index into the Unicode semantic tables (e.g. swashes).
510
ba210ebe
JH
511If C<s> does not point to a well-formed UTF8 character, zero is
512returned and retlen is set, if possible, to -1.
8e84507e
NIS
513
514=cut
515*/
516
517UV
9041c2e3 518Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
8e84507e 519{
9041c2e3
NIS
520 /* Call the low level routine asking for checks */
521 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e
NIS
522}
523
b76347f2 524/*
eebe1485 525=for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
b76347f2
JH
526
527Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
528Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
529up past C<e>, croaks.
b76347f2
JH
530
531=cut
532*/
533
534STRLEN
535Perl_utf8_length(pTHX_ U8* s, U8* e)
536{
537 STRLEN len = 0;
538
8850bf83
JH
539 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
540 * the bitops (especially ~) can create illegal UTF-8.
541 * In other words: in Perl UTF-8 is not just for Unicode. */
542
b76347f2 543 if (e < s)
02eb7b47 544 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
b76347f2 545 while (s < e) {
02eb7b47 546 U8 t = UTF8SKIP(s);
b76347f2
JH
547
548 if (e - s < t)
c4d5f83a 549 Perl_croak(aTHX_ "panic: utf8_length: s=%p (%02X) e=%p l=%d - unaligned end",s,*s,e,t);
b76347f2
JH
550 s += t;
551 len++;
552 }
553
554 return len;
555}
556
b06226ff 557/*
eebe1485 558=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff
JH
559
560Returns the number of UTF8 characters between the UTF-8 pointers C<a>
561and C<b>.
562
563WARNING: use only if you *know* that the pointers point inside the
564same UTF-8 buffer.
565
566=cut */
a0ed51b3 567
02eb7b47 568IV
864dbfa3 569Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 570{
02eb7b47
JH
571 IV off = 0;
572
8850bf83
JH
573 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
574 * the bitops (especially ~) can create illegal UTF-8.
575 * In other words: in Perl UTF-8 is not just for Unicode. */
576
a0ed51b3
LW
577 if (a < b) {
578 while (a < b) {
02eb7b47
JH
579 U8 c = UTF8SKIP(a);
580
581 if (b - a < c)
582 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
583 a += c;
a0ed51b3
LW
584 off--;
585 }
586 }
587 else {
588 while (b < a) {
02eb7b47
JH
589 U8 c = UTF8SKIP(b);
590
591 if (a - b < c)
592 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
593 b += c;
a0ed51b3
LW
594 off++;
595 }
596 }
02eb7b47 597
a0ed51b3
LW
598 return off;
599}
600
b06226ff 601/*
eebe1485 602=for apidoc A|U8*|utf8_hop|U8 *s|I32 off
b06226ff 603
8850bf83
JH
604Return the UTF-8 pointer C<s> displaced by C<off> characters, either
605forward or backward.
b06226ff
JH
606
607WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
608the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
609on the first byte of character or just after the last byte of a character.
b06226ff
JH
610
611=cut */
a0ed51b3
LW
612
613U8 *
864dbfa3 614Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 615{
8850bf83
JH
616 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
617 * the bitops (especially ~) can create illegal UTF-8.
618 * In other words: in Perl UTF-8 is not just for Unicode. */
619
a0ed51b3
LW
620 if (off >= 0) {
621 while (off--)
622 s += UTF8SKIP(s);
623 }
624 else {
625 while (off++) {
626 s--;
8850bf83
JH
627 while (UTF8_IS_CONTINUATION(*s))
628 s--;
a0ed51b3
LW
629 }
630 }
631 return s;
632}
633
6940069f 634/*
eebe1485 635=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 636
246fae53
MG
637Converts a string C<s> of length C<len> from UTF8 into byte encoding.
638Unlike C<bytes_to_utf8>, this over-writes the original string, and
639updates len to contain the new length.
67e989fb 640Returns zero on failure, setting C<len> to -1.
6940069f
GS
641
642=cut
643*/
644
645U8 *
246fae53 646Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
6940069f 647{
6940069f
GS
648 U8 *send;
649 U8 *d;
dcad2880 650 U8 *save = s;
246fae53
MG
651
652 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880
JH
653 for (send = s + *len; s < send; ) {
654 U8 c = *s++;
655
1d72bdf6
NIS
656 if (!UTF8_IS_INVARIANT(c) &&
657 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
658 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
659 *len = -1;
660 return 0;
661 }
246fae53 662 }
dcad2880
JH
663
664 d = s = save;
6940069f 665 while (s < send) {
ed646e6e 666 STRLEN ulen;
9041c2e3 667 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 668 s += ulen;
6940069f
GS
669 }
670 *d = '\0';
246fae53 671 *len = d - save;
6940069f
GS
672 return save;
673}
674
675/*
f9a63242
JH
676=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
677
678Converts a string C<s> of length C<len> from UTF8 into byte encoding.
679Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
680the newly-created string, and updates C<len> to contain the new
681length. Returns the original string if no conversion occurs, C<len>
682is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6830 if C<s> is converted or contains all 7bit characters.
f9a63242
JH
684
685=cut */
686
687U8 *
688Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
689{
f9a63242
JH
690 U8 *d;
691 U8 *start = s;
db42d148 692 U8 *send;
f9a63242
JH
693 I32 count = 0;
694
695 if (!*is_utf8)
696 return start;
697
ef9edfd0 698 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242
JH
699 for (send = s + *len; s < send;) {
700 U8 c = *s++;
1d72bdf6 701 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
702 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
703 (c = *s++) && UTF8_IS_CONTINUATION(c))
704 count++;
705 else
f9a63242 706 return start;
db42d148 707 }
f9a63242
JH
708 }
709
710 *is_utf8 = 0;
711
f9a63242 712 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 713 s = start; start = d;
f9a63242
JH
714 while (s < send) {
715 U8 c = *s++;
c4d5f83a
NIS
716 if (!UTF8_IS_INVARIANT(c)) {
717 /* Then it is two-byte encoded */
718 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
719 c = ASCII_TO_NATIVE(c);
720 }
721 *d++ = c;
f9a63242
JH
722 }
723 *d = '\0';
724 *len = d - start;
725 return start;
726}
727
728/*
eebe1485 729=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f
GS
730
731Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e
GS
732Returns a pointer to the newly-created string, and sets C<len> to
733reflect the new length.
6940069f 734
497711e7 735=cut
6940069f
GS
736*/
737
738U8*
6662521e 739Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
6940069f 740{
6940069f
GS
741 U8 *send;
742 U8 *d;
743 U8 *dst;
6662521e 744 send = s + (*len);
6940069f 745
6662521e 746 Newz(801, d, (*len) * 2 + 1, U8);
6940069f
GS
747 dst = d;
748
749 while (s < send) {
db42d148 750 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a
NIS
751 if (UNI_IS_INVARIANT(uv))
752 *d++ = UTF_TO_NATIVE(uv);
6940069f 753 else {
90f44359
JH
754 *d++ = UTF8_EIGHT_BIT_HI(uv);
755 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
756 }
757 }
758 *d = '\0';
6662521e 759 *len = d-dst;
6940069f
GS
760 return dst;
761}
762
a0ed51b3 763/*
dea0fc0b 764 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
765 *
766 * Destination must be pre-extended to 3/2 source. Do not use in-place.
767 * We optimize for native, for obvious reasons. */
768
769U8*
dea0fc0b 770Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 771{
dea0fc0b
JH
772 U8* pend;
773 U8* dstart = d;
774
775 if (bytelen & 1)
a7867d0a 776 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b
JH
777
778 pend = p + bytelen;
779
a0ed51b3 780 while (p < pend) {
dea0fc0b
JH
781 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
782 p += 2;
a0ed51b3
LW
783 if (uv < 0x80) {
784 *d++ = uv;
785 continue;
786 }
787 if (uv < 0x800) {
788 *d++ = (( uv >> 6) | 0xc0);
789 *d++ = (( uv & 0x3f) | 0x80);
790 continue;
791 }
792 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b
JH
793 UV low = *p++;
794 if (low < 0xdc00 || low >= 0xdfff)
795 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
796 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
797 }
798 if (uv < 0x10000) {
799 *d++ = (( uv >> 12) | 0xe0);
800 *d++ = (((uv >> 6) & 0x3f) | 0x80);
801 *d++ = (( uv & 0x3f) | 0x80);
802 continue;
803 }
804 else {
805 *d++ = (( uv >> 18) | 0xf0);
806 *d++ = (((uv >> 12) & 0x3f) | 0x80);
807 *d++ = (((uv >> 6) & 0x3f) | 0x80);
808 *d++ = (( uv & 0x3f) | 0x80);
809 continue;
810 }
811 }
dea0fc0b 812 *newlen = d - dstart;
a0ed51b3
LW
813 return d;
814}
815
816/* Note: this one is slightly destructive of the source. */
817
818U8*
dea0fc0b 819Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
820{
821 U8* s = (U8*)p;
822 U8* send = s + bytelen;
823 while (s < send) {
824 U8 tmp = s[0];
825 s[0] = s[1];
826 s[1] = tmp;
827 s += 2;
828 }
dea0fc0b 829 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
830}
831
832/* for now these are all defined (inefficiently) in terms of the utf8 versions */
833
834bool
864dbfa3 835Perl_is_uni_alnum(pTHX_ U32 c)
a0ed51b3 836{
ad391ad9 837 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 838 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
839 return is_utf8_alnum(tmpbuf);
840}
841
842bool
b8c5462f
JH
843Perl_is_uni_alnumc(pTHX_ U32 c)
844{
ad391ad9 845 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 846 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
847 return is_utf8_alnumc(tmpbuf);
848}
849
850bool
864dbfa3 851Perl_is_uni_idfirst(pTHX_ U32 c)
a0ed51b3 852{
ad391ad9 853 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 854 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
855 return is_utf8_idfirst(tmpbuf);
856}
857
858bool
864dbfa3 859Perl_is_uni_alpha(pTHX_ U32 c)
a0ed51b3 860{
ad391ad9 861 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 862 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
863 return is_utf8_alpha(tmpbuf);
864}
865
866bool
4d61ec05
GS
867Perl_is_uni_ascii(pTHX_ U32 c)
868{
ad391ad9 869 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 870 uvuni_to_utf8(tmpbuf, (UV)c);
4d61ec05
GS
871 return is_utf8_ascii(tmpbuf);
872}
873
874bool
864dbfa3 875Perl_is_uni_space(pTHX_ U32 c)
a0ed51b3 876{
ad391ad9 877 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 878 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
879 return is_utf8_space(tmpbuf);
880}
881
882bool
864dbfa3 883Perl_is_uni_digit(pTHX_ U32 c)
a0ed51b3 884{
ad391ad9 885 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 886 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
887 return is_utf8_digit(tmpbuf);
888}
889
890bool
864dbfa3 891Perl_is_uni_upper(pTHX_ U32 c)
a0ed51b3 892{
ad391ad9 893 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 894 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
895 return is_utf8_upper(tmpbuf);
896}
897
898bool
864dbfa3 899Perl_is_uni_lower(pTHX_ U32 c)
a0ed51b3 900{
ad391ad9 901 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 902 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
903 return is_utf8_lower(tmpbuf);
904}
905
906bool
b8c5462f
JH
907Perl_is_uni_cntrl(pTHX_ U32 c)
908{
ad391ad9 909 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 910 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
911 return is_utf8_cntrl(tmpbuf);
912}
913
914bool
915Perl_is_uni_graph(pTHX_ U32 c)
916{
ad391ad9 917 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 918 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
919 return is_utf8_graph(tmpbuf);
920}
921
922bool
864dbfa3 923Perl_is_uni_print(pTHX_ U32 c)
a0ed51b3 924{
ad391ad9 925 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 926 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
927 return is_utf8_print(tmpbuf);
928}
929
b8c5462f 930bool
f248d071 931Perl_is_uni_punct(pTHX_ U32 c)
b8c5462f 932{
ad391ad9 933 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 934 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
935 return is_utf8_punct(tmpbuf);
936}
937
4d61ec05
GS
938bool
939Perl_is_uni_xdigit(pTHX_ U32 c)
940{
ad391ad9 941 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 942 uvuni_to_utf8(tmpbuf, (UV)c);
4d61ec05
GS
943 return is_utf8_xdigit(tmpbuf);
944}
945
a0ed51b3 946U32
864dbfa3 947Perl_to_uni_upper(pTHX_ U32 c)
a0ed51b3 948{
ad391ad9 949 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 950 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
951 return to_utf8_upper(tmpbuf);
952}
953
954U32
864dbfa3 955Perl_to_uni_title(pTHX_ U32 c)
a0ed51b3 956{
ad391ad9 957 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 958 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
959 return to_utf8_title(tmpbuf);
960}
961
962U32
864dbfa3 963Perl_to_uni_lower(pTHX_ U32 c)
a0ed51b3 964{
ad391ad9 965 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 966 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
967 return to_utf8_lower(tmpbuf);
968}
969
970/* for now these all assume no locale info available for Unicode > 255 */
971
972bool
864dbfa3 973Perl_is_uni_alnum_lc(pTHX_ U32 c)
a0ed51b3
LW
974{
975 return is_uni_alnum(c); /* XXX no locale support yet */
976}
977
978bool
b8c5462f
JH
979Perl_is_uni_alnumc_lc(pTHX_ U32 c)
980{
981 return is_uni_alnumc(c); /* XXX no locale support yet */
982}
983
984bool
864dbfa3 985Perl_is_uni_idfirst_lc(pTHX_ U32 c)
a0ed51b3
LW
986{
987 return is_uni_idfirst(c); /* XXX no locale support yet */
988}
989
990bool
864dbfa3 991Perl_is_uni_alpha_lc(pTHX_ U32 c)
a0ed51b3
LW
992{
993 return is_uni_alpha(c); /* XXX no locale support yet */
994}
995
996bool
4d61ec05
GS
997Perl_is_uni_ascii_lc(pTHX_ U32 c)
998{
999 return is_uni_ascii(c); /* XXX no locale support yet */
1000}
1001
1002bool
864dbfa3 1003Perl_is_uni_space_lc(pTHX_ U32 c)
a0ed51b3
LW
1004{
1005 return is_uni_space(c); /* XXX no locale support yet */
1006}
1007
1008bool
864dbfa3 1009Perl_is_uni_digit_lc(pTHX_ U32 c)
a0ed51b3
LW
1010{
1011 return is_uni_digit(c); /* XXX no locale support yet */
1012}
1013
1014bool
864dbfa3 1015Perl_is_uni_upper_lc(pTHX_ U32 c)
a0ed51b3
LW
1016{
1017 return is_uni_upper(c); /* XXX no locale support yet */
1018}
1019
1020bool
864dbfa3 1021Perl_is_uni_lower_lc(pTHX_ U32 c)
a0ed51b3
LW
1022{
1023 return is_uni_lower(c); /* XXX no locale support yet */
1024}
1025
1026bool
b8c5462f
JH
1027Perl_is_uni_cntrl_lc(pTHX_ U32 c)
1028{
1029 return is_uni_cntrl(c); /* XXX no locale support yet */
1030}
1031
1032bool
1033Perl_is_uni_graph_lc(pTHX_ U32 c)
1034{
1035 return is_uni_graph(c); /* XXX no locale support yet */
1036}
1037
1038bool
864dbfa3 1039Perl_is_uni_print_lc(pTHX_ U32 c)
a0ed51b3
LW
1040{
1041 return is_uni_print(c); /* XXX no locale support yet */
1042}
1043
b8c5462f
JH
1044bool
1045Perl_is_uni_punct_lc(pTHX_ U32 c)
1046{
1047 return is_uni_punct(c); /* XXX no locale support yet */
1048}
1049
4d61ec05
GS
1050bool
1051Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1052{
1053 return is_uni_xdigit(c); /* XXX no locale support yet */
1054}
1055
a0ed51b3 1056U32
864dbfa3 1057Perl_to_uni_upper_lc(pTHX_ U32 c)
a0ed51b3
LW
1058{
1059 return to_uni_upper(c); /* XXX no locale support yet */
1060}
1061
1062U32
864dbfa3 1063Perl_to_uni_title_lc(pTHX_ U32 c)
a0ed51b3
LW
1064{
1065 return to_uni_title(c); /* XXX no locale support yet */
1066}
1067
1068U32
864dbfa3 1069Perl_to_uni_lower_lc(pTHX_ U32 c)
a0ed51b3
LW
1070{
1071 return to_uni_lower(c); /* XXX no locale support yet */
1072}
1073
a0ed51b3 1074bool
864dbfa3 1075Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1076{
386d01d6
GS
1077 if (!is_utf8_char(p))
1078 return FALSE;
a0ed51b3 1079 if (!PL_utf8_alnum)
289d4f09
ML
1080 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1081 * descendant of isalnum(3), in other words, it doesn't
1082 * contain the '_'. --jhi */
1083 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1084 return swash_fetch(PL_utf8_alnum, p);
1085/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1086#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1087 if (!PL_utf8_alnum)
1088 PL_utf8_alnum = swash_init("utf8", "",
1089 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1090 return swash_fetch(PL_utf8_alnum, p);
1091#endif
1092}
1093
1094bool
b8c5462f
JH
1095Perl_is_utf8_alnumc(pTHX_ U8 *p)
1096{
386d01d6
GS
1097 if (!is_utf8_char(p))
1098 return FALSE;
b8c5462f
JH
1099 if (!PL_utf8_alnum)
1100 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1101 return swash_fetch(PL_utf8_alnum, p);
1102/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1103#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1104 if (!PL_utf8_alnum)
1105 PL_utf8_alnum = swash_init("utf8", "",
1106 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1107 return swash_fetch(PL_utf8_alnum, p);
1108#endif
1109}
1110
1111bool
864dbfa3 1112Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3
LW
1113{
1114 return *p == '_' || is_utf8_alpha(p);
1115}
1116
1117bool
864dbfa3 1118Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1119{
386d01d6
GS
1120 if (!is_utf8_char(p))
1121 return FALSE;
a0ed51b3 1122 if (!PL_utf8_alpha)
e24b16f9 1123 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1124 return swash_fetch(PL_utf8_alpha, p);
1125}
1126
1127bool
b8c5462f
JH
1128Perl_is_utf8_ascii(pTHX_ U8 *p)
1129{
386d01d6
GS
1130 if (!is_utf8_char(p))
1131 return FALSE;
b8c5462f
JH
1132 if (!PL_utf8_ascii)
1133 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1134 return swash_fetch(PL_utf8_ascii, p);
1135}
1136
1137bool
864dbfa3 1138Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1139{
386d01d6
GS
1140 if (!is_utf8_char(p))
1141 return FALSE;
a0ed51b3 1142 if (!PL_utf8_space)
3bec3564 1143 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1144 return swash_fetch(PL_utf8_space, p);
1145}
1146
1147bool
864dbfa3 1148Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1149{
386d01d6
GS
1150 if (!is_utf8_char(p))
1151 return FALSE;
a0ed51b3 1152 if (!PL_utf8_digit)
e24b16f9 1153 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1154 return swash_fetch(PL_utf8_digit, p);
1155}
1156
1157bool
864dbfa3 1158Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1159{
386d01d6
GS
1160 if (!is_utf8_char(p))
1161 return FALSE;
a0ed51b3 1162 if (!PL_utf8_upper)
e24b16f9 1163 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1164 return swash_fetch(PL_utf8_upper, p);
1165}
1166
1167bool
864dbfa3 1168Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1169{
386d01d6
GS
1170 if (!is_utf8_char(p))
1171 return FALSE;
a0ed51b3 1172 if (!PL_utf8_lower)
e24b16f9 1173 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1174 return swash_fetch(PL_utf8_lower, p);
1175}
1176
1177bool
b8c5462f
JH
1178Perl_is_utf8_cntrl(pTHX_ U8 *p)
1179{
386d01d6
GS
1180 if (!is_utf8_char(p))
1181 return FALSE;
b8c5462f
JH
1182 if (!PL_utf8_cntrl)
1183 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1184 return swash_fetch(PL_utf8_cntrl, p);
1185}
1186
1187bool
1188Perl_is_utf8_graph(pTHX_ U8 *p)
1189{
386d01d6
GS
1190 if (!is_utf8_char(p))
1191 return FALSE;
b8c5462f
JH
1192 if (!PL_utf8_graph)
1193 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1194 return swash_fetch(PL_utf8_graph, p);
1195}
1196
1197bool
864dbfa3 1198Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1199{
386d01d6
GS
1200 if (!is_utf8_char(p))
1201 return FALSE;
a0ed51b3 1202 if (!PL_utf8_print)
e24b16f9 1203 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1204 return swash_fetch(PL_utf8_print, p);
1205}
1206
1207bool
b8c5462f
JH
1208Perl_is_utf8_punct(pTHX_ U8 *p)
1209{
386d01d6
GS
1210 if (!is_utf8_char(p))
1211 return FALSE;
b8c5462f
JH
1212 if (!PL_utf8_punct)
1213 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1214 return swash_fetch(PL_utf8_punct, p);
1215}
1216
1217bool
1218Perl_is_utf8_xdigit(pTHX_ U8 *p)
1219{
386d01d6
GS
1220 if (!is_utf8_char(p))
1221 return FALSE;
b8c5462f
JH
1222 if (!PL_utf8_xdigit)
1223 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1224 return swash_fetch(PL_utf8_xdigit, p);
1225}
1226
1227bool
864dbfa3 1228Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1229{
386d01d6
GS
1230 if (!is_utf8_char(p))
1231 return FALSE;
a0ed51b3 1232 if (!PL_utf8_mark)
e24b16f9 1233 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1234 return swash_fetch(PL_utf8_mark, p);
1235}
1236
2104c8d9 1237UV
864dbfa3 1238Perl_to_utf8_upper(pTHX_ U8 *p)
a0ed51b3
LW
1239{
1240 UV uv;
1241
1242 if (!PL_utf8_toupper)
e24b16f9 1243 PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
a0ed51b3 1244 uv = swash_fetch(PL_utf8_toupper, p);
9041c2e3 1245 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3
LW
1246}
1247
2104c8d9 1248UV
864dbfa3 1249Perl_to_utf8_title(pTHX_ U8 *p)
a0ed51b3
LW
1250{
1251 UV uv;
1252
1253 if (!PL_utf8_totitle)
e24b16f9 1254 PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
a0ed51b3 1255 uv = swash_fetch(PL_utf8_totitle, p);
9041c2e3 1256 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3
LW
1257}
1258
2104c8d9 1259UV
864dbfa3 1260Perl_to_utf8_lower(pTHX_ U8 *p)
a0ed51b3
LW
1261{
1262 UV uv;
1263
1264 if (!PL_utf8_tolower)
e24b16f9 1265 PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
a0ed51b3 1266 uv = swash_fetch(PL_utf8_tolower, p);
9041c2e3 1267 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3
LW
1268}
1269
1270/* a "swash" is a swatch hash */
1271
1272SV*
864dbfa3 1273Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3
LW
1274{
1275 SV* retval;
bf1fed83 1276 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1277 dSP;
1b026014 1278 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
ce3b816e 1279
1b026014 1280 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e
GS
1281 ENTER;
1282 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1283 LEAVE;
1284 }
1285 SPAGAIN;
a0ed51b3
LW
1286 PUSHSTACKi(PERLSI_MAGIC);
1287 PUSHMARK(SP);
1288 EXTEND(SP,5);
1289 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1290 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1291 PUSHs(listsv);
1292 PUSHs(sv_2mortal(newSViv(minbits)));
1293 PUSHs(sv_2mortal(newSViv(none)));
1294 PUTBACK;
1295 ENTER;
1296 SAVEI32(PL_hints);
1297 PL_hints = 0;
1298 save_re_context();
bf1fed83
JH
1299 if (PL_curcop == &PL_compiling)
1300 /* XXX ought to be handled by lex_start */
1301 sv_setpv(tokenbufsv, PL_tokenbuf);
864dbfa3 1302 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1303 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1304 else
e24b16f9 1305 retval = &PL_sv_undef;
a0ed51b3
LW
1306 LEAVE;
1307 POPSTACK;
e24b16f9 1308 if (PL_curcop == &PL_compiling) {
bf1fed83
JH
1309 STRLEN len;
1310 char* pv = SvPV(tokenbufsv, len);
1311
1312 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1313 PL_curcop->op_private = PL_hints;
a0ed51b3
LW
1314 }
1315 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 1316 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3
LW
1317 return retval;
1318}
1319
1320UV
864dbfa3 1321Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
a0ed51b3
LW
1322{
1323 HV* hv = (HV*)SvRV(sv);
1324 U32 klen = UTF8SKIP(ptr) - 1;
1325 U32 off = ptr[klen] & 127; /* NB: 64 bit always 0 when len > 1 */
1326 STRLEN slen;
1327 STRLEN needents = (klen ? 64 : 128);
dfe13c55 1328 U8 *tmps;
a0ed51b3
LW
1329 U32 bit;
1330 SV *retval;
1331
1332 /*
1333 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1334 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1335 * it's nothing to sniff at.) Pity we usually come through at least
1336 * two function calls to get here...
1337 *
1338 * NB: this code assumes that swatches are never modified, once generated!
1339 */
1340
1341 if (hv == PL_last_swash_hv &&
1342 klen == PL_last_swash_klen &&
12ae5dfc 1343 (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
a0ed51b3
LW
1344 {
1345 tmps = PL_last_swash_tmps;
1346 slen = PL_last_swash_slen;
1347 }
1348 else {
1349 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1350 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3
LW
1351
1352 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1353 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3
LW
1354 dSP;
1355 ENTER;
1356 SAVETMPS;
1357 save_re_context();
1358 PUSHSTACKi(PERLSI_MAGIC);
1359 PUSHMARK(SP);
1360 EXTEND(SP,3);
1361 PUSHs((SV*)sv);
9041c2e3
NIS
1362 /* We call utf8_to_uni as we want and index into Unicode tables,
1363 not a native character number.
1364 */
1365 PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
a0ed51b3
LW
1366 PUSHs(sv_2mortal(newSViv(needents)));
1367 PUTBACK;
864dbfa3 1368 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1369 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1370 else
e24b16f9 1371 retval = &PL_sv_undef;
a0ed51b3
LW
1372 POPSTACK;
1373 FREETMPS;
1374 LEAVE;
e24b16f9
GS
1375 if (PL_curcop == &PL_compiling)
1376 PL_curcop->op_private = PL_hints;
a0ed51b3 1377
dfe13c55 1378 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1379
dfe13c55 1380 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
cea2e8a9 1381 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1382 }
1383
1384 PL_last_swash_hv = hv;
1385 PL_last_swash_klen = klen;
1386 PL_last_swash_tmps = tmps;
1387 PL_last_swash_slen = slen;
1388 if (klen)
1389 Copy(ptr, PL_last_swash_key, klen, U8);
1390 }
1391
9faf8d75 1392 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1393 case 1:
1394 bit = 1 << (off & 7);
1395 off >>= 3;
1396 return (tmps[off] & bit) != 0;
1397 case 8:
1398 return tmps[off];
1399 case 16:
1400 off <<= 1;
1401 return (tmps[off] << 8) + tmps[off + 1] ;
1402 case 32:
1403 off <<= 2;
1404 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1405 }
cea2e8a9 1406 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1407 return 0;
1408}