This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Minor naming change UTF8_IS_ASCII => UTF8_IS_INVARIANT
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
bc89e66f 3 * Copyright (c) 1998-2001, Larry Wall
a0ed51b3
LW
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3
LW
25#include "perl.h"
26
27/* Unicode support */
28
eebe1485 29/*
9041c2e3 30=for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
eebe1485
SC
31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
9041c2e3 35end of the new character. In other words,
eebe1485 36
9041c2e3 37 d = uvuni_to_utf8(d, uv);
eebe1485
SC
38
39is the recommended Unicode-aware way of saying
40
41 *(d++) = uv;
42
43=cut
44*/
45
dfe13c55 46U8 *
9041c2e3 47Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
a0ed51b3
LW
48{
49 if (uv < 0x80) {
50 *d++ = uv;
51 return d;
52 }
53 if (uv < 0x800) {
54 *d++ = (( uv >> 6) | 0xc0);
55 *d++ = (( uv & 0x3f) | 0x80);
56 return d;
57 }
58 if (uv < 0x10000) {
59 *d++ = (( uv >> 12) | 0xe0);
60 *d++ = (((uv >> 6) & 0x3f) | 0x80);
61 *d++ = (( uv & 0x3f) | 0x80);
62 return d;
63 }
64 if (uv < 0x200000) {
65 *d++ = (( uv >> 18) | 0xf0);
66 *d++ = (((uv >> 12) & 0x3f) | 0x80);
67 *d++ = (((uv >> 6) & 0x3f) | 0x80);
68 *d++ = (( uv & 0x3f) | 0x80);
69 return d;
70 }
71 if (uv < 0x4000000) {
72 *d++ = (( uv >> 24) | 0xf8);
73 *d++ = (((uv >> 18) & 0x3f) | 0x80);
74 *d++ = (((uv >> 12) & 0x3f) | 0x80);
75 *d++ = (((uv >> 6) & 0x3f) | 0x80);
76 *d++ = (( uv & 0x3f) | 0x80);
77 return d;
78 }
79 if (uv < 0x80000000) {
80 *d++ = (( uv >> 30) | 0xfc);
81 *d++ = (((uv >> 24) & 0x3f) | 0x80);
82 *d++ = (((uv >> 18) & 0x3f) | 0x80);
83 *d++ = (((uv >> 12) & 0x3f) | 0x80);
84 *d++ = (((uv >> 6) & 0x3f) | 0x80);
85 *d++ = (( uv & 0x3f) | 0x80);
86 return d;
87 }
6b8eaf93 88#ifdef HAS_QUAD
d7578b48 89 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
90#endif
91 {
92 *d++ = 0xfe; /* Can't match U+FEFF! */
93 *d++ = (((uv >> 30) & 0x3f) | 0x80);
94 *d++ = (((uv >> 24) & 0x3f) | 0x80);
95 *d++ = (((uv >> 18) & 0x3f) | 0x80);
96 *d++ = (((uv >> 12) & 0x3f) | 0x80);
97 *d++ = (((uv >> 6) & 0x3f) | 0x80);
98 *d++ = (( uv & 0x3f) | 0x80);
99 return d;
100 }
6b8eaf93 101#ifdef HAS_QUAD
a0ed51b3
LW
102 {
103 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b
GS
104 *d++ = 0x80; /* 6 Reserved bits */
105 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
106 *d++ = (((uv >> 54) & 0x3f) | 0x80);
107 *d++ = (((uv >> 48) & 0x3f) | 0x80);
108 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3
LW
109 *d++ = (((uv >> 36) & 0x3f) | 0x80);
110 *d++ = (((uv >> 30) & 0x3f) | 0x80);
111 *d++ = (((uv >> 24) & 0x3f) | 0x80);
112 *d++ = (((uv >> 18) & 0x3f) | 0x80);
113 *d++ = (((uv >> 12) & 0x3f) | 0x80);
114 *d++ = (((uv >> 6) & 0x3f) | 0x80);
115 *d++ = (( uv & 0x3f) | 0x80);
116 return d;
117 }
118#endif
119}
120
eebe1485 121/*
9041c2e3
NIS
122=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
123
124Adds the UTF8 representation of the Native codepoint C<uv> to the end
125of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
126bytes available. The return value is the pointer to the byte after the
127end of the new character. In other words,
128
129 d = uvchr_to_utf8(d, uv);
130
131is the recommended wide native character-aware way of saying
132
133 *(d++) = uv;
134
135=cut
136*/
137
138U8 *
139Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
140{
141 if (uv < 0x100)
142 uv = NATIVE_TO_ASCII(uv);
143 return Perl_uvuni_to_utf8(aTHX_ d, uv);
144}
145
146
147/*
eebe1485
SC
148=for apidoc A|STRLEN|is_utf8_char|U8 *s
149
5da9da9e
JH
150Tests if some arbitrary number of bytes begins in a valid UTF-8
151character. Note that an ASCII character is a valid UTF-8 character.
152The actual number of bytes in the UTF-8 character will be returned if
153it is valid, otherwise 0.
9041c2e3 154
5da9da9e 155=cut */
067a85ef 156STRLEN
386d01d6
GS
157Perl_is_utf8_char(pTHX_ U8 *s)
158{
159 U8 u = *s;
067a85ef
A
160 STRLEN slen, len;
161 UV uv, ouv;
386d01d6 162
60006e79 163 if (UTF8_IS_ASCII(u))
386d01d6
GS
164 return 1;
165
60006e79 166 if (!UTF8_IS_START(u))
386d01d6
GS
167 return 0;
168
9f07fdcd 169 len = UTF8SKIP(s);
386d01d6 170
60006e79 171 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef
A
172 return 0;
173
386d01d6
GS
174 slen = len - 1;
175 s++;
067a85ef
A
176 uv = u;
177 ouv = uv;
386d01d6 178 while (slen--) {
60006e79 179 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 180 return 0;
8850bf83 181 uv = UTF8_ACCUMULATE(uv, *s);
067a85ef
A
182 if (uv < ouv)
183 return 0;
184 ouv = uv;
386d01d6
GS
185 s++;
186 }
067a85ef 187
5bbb0b5a 188 if (UNISKIP(uv) < len)
067a85ef
A
189 return 0;
190
386d01d6
GS
191 return len;
192}
193
6662521e 194/*
eebe1485 195=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 196
5da9da9e
JH
197Returns true if first C<len> bytes of the given string form a valid UTF8
198string, false otherwise. Note that 'a valid UTF8 string' does not mean
199'a string that contains UTF8' because a valid ASCII string is a valid
200UTF8 string.
6662521e
GS
201
202=cut
203*/
204
8e84507e 205bool
6662521e
GS
206Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
207{
067a85ef 208 U8* x = s;
1aa99e6b 209 U8* send;
067a85ef
A
210 STRLEN c;
211
1aa99e6b 212 if (!len)
6cd5fe39 213 len = strlen((char *)s);
1aa99e6b
IH
214 send = s + len;
215
6662521e
GS
216 while (x < send) {
217 c = is_utf8_char(x);
067a85ef
A
218 if (!c)
219 return FALSE;
6662521e 220 x += c;
6662521e 221 }
60006e79
JH
222 if (x != send)
223 return FALSE;
067a85ef
A
224
225 return TRUE;
6662521e
GS
226}
227
67e989fb 228/*
9041c2e3 229=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 230
9041c2e3
NIS
231Bottom level UTF-8 decode routine.
232Returns the unicode code point value of the first character in the string C<s>
ba210ebe 233which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 234C<retlen> will be set to the length, in bytes, of that character.
67e989fb
JH
235
236If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880
JH
237is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
238it is assumed that the caller will raise a warning, and this function
28d3d195
JH
239will silently just set C<retlen> to C<-1> and return zero. If the
240C<flags> does not contain UTF8_CHECK_ONLY, warnings about
241malformations will be given, C<retlen> will be set to the expected
242length of the UTF-8 character in bytes, and zero will be returned.
243
244The C<flags> can also contain various flags to allow deviations from
245the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 246
9041c2e3
NIS
247Most code should use utf8_to_uvchr() rather than call this directly.
248
dcad2880 249=cut */
67e989fb 250
a0ed51b3 251UV
9041c2e3 252Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
a0ed51b3 253{
ba210ebe
JH
254 UV uv = *s, ouv;
255 STRLEN len = 1;
256 bool dowarn = ckWARN_d(WARN_UTF8);
257 STRLEN expectlen = 0;
a0dbb045
JH
258 U32 warning = 0;
259
260/* This list is a superset of the UTF8_ALLOW_XXX. */
261
262#define UTF8_WARN_EMPTY 1
263#define UTF8_WARN_CONTINUATION 2
264#define UTF8_WARN_NON_CONTINUATION 3
265#define UTF8_WARN_FE_FF 4
266#define UTF8_WARN_SHORT 5
267#define UTF8_WARN_OVERFLOW 6
268#define UTF8_WARN_SURROGATE 7
269#define UTF8_WARN_BOM 8
270#define UTF8_WARN_LONG 9
271#define UTF8_WARN_FFFF 10
272
273 if (curlen == 0 &&
274 !(flags & UTF8_ALLOW_EMPTY)) {
275 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
276 goto malformed;
277 }
278
421a8bf2 279 if (UTF8_IS_ASCII(uv)) {
a0ed51b3
LW
280 if (retlen)
281 *retlen = 1;
9041c2e3 282 return (UV) (*s);
a0ed51b3 283 }
67e989fb 284
421a8bf2 285 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 286 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 287 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
288 goto malformed;
289 }
290
421a8bf2 291 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 292 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 293 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
294 goto malformed;
295 }
9041c2e3 296
fcc8fcf6
JH
297 if ((uv == 0xfe || uv == 0xff) &&
298 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 299 warning = UTF8_WARN_FE_FF;
ba210ebe 300 goto malformed;
a0ed51b3 301 }
fcc8fcf6 302
ba210ebe
JH
303 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
304 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
305 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
306 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
307 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
308 else if (!(uv & 0x01)) { len = 7; uv = 0; }
3c77ea2b 309 else { len = 13; uv = 0; } /* whoa! */
fcc8fcf6 310
a0ed51b3
LW
311 if (retlen)
312 *retlen = len;
9041c2e3 313
ba210ebe
JH
314 expectlen = len;
315
fcc8fcf6
JH
316 if ((curlen < expectlen) &&
317 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 318 warning = UTF8_WARN_SHORT;
ba210ebe
JH
319 goto malformed;
320 }
321
322 len--;
a0ed51b3 323 s++;
ba210ebe
JH
324 ouv = uv;
325
a0ed51b3 326 while (len--) {
421a8bf2
JH
327 if (!UTF8_IS_CONTINUATION(*s) &&
328 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
329 s--;
330 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 331 goto malformed;
a0ed51b3
LW
332 }
333 else
8850bf83 334 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
335 if (!(uv > ouv)) {
336 /* These cannot be allowed. */
337 if (uv == ouv) {
338 if (!(flags & UTF8_ALLOW_LONG)) {
339 warning = UTF8_WARN_LONG;
340 goto malformed;
341 }
342 }
343 else { /* uv < ouv */
344 /* This cannot be allowed. */
345 warning = UTF8_WARN_OVERFLOW;
346 goto malformed;
347 }
ba210ebe
JH
348 }
349 s++;
350 ouv = uv;
351 }
352
421a8bf2 353 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 354 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 355 warning = UTF8_WARN_SURROGATE;
ba210ebe 356 goto malformed;
421a8bf2 357 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 358 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 359 warning = UTF8_WARN_BOM;
ba210ebe 360 goto malformed;
fcc8fcf6
JH
361 } else if ((expectlen > UNISKIP(uv)) &&
362 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 363 warning = UTF8_WARN_LONG;
ba210ebe 364 goto malformed;
421a8bf2 365 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 366 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 367 warning = UTF8_WARN_FFFF;
a9917092 368 goto malformed;
a0ed51b3 369 }
ba210ebe 370
a0ed51b3 371 return uv;
ba210ebe
JH
372
373malformed:
374
fcc8fcf6 375 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 376 if (retlen)
cc366d4b 377 *retlen = -1;
ba210ebe
JH
378 return 0;
379 }
380
a0dbb045
JH
381 if (dowarn) {
382 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
383
384 switch (warning) {
385 case 0: /* Intentionally empty. */ break;
386 case UTF8_WARN_EMPTY:
387 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
388 break;
389 case UTF8_WARN_CONTINUATION:
390 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
391 break;
392 case UTF8_WARN_NON_CONTINUATION:
393 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
394 (UV)s[1], uv);
395 break;
396 case UTF8_WARN_FE_FF:
397 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
398 break;
399 case UTF8_WARN_SHORT:
400 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
401 curlen, curlen == 1 ? "" : "s", expectlen);
402 break;
403 case UTF8_WARN_OVERFLOW:
404 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
405 ouv, *s);
406 break;
407 case UTF8_WARN_SURROGATE:
408 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
409 break;
410 case UTF8_WARN_BOM:
411 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
412 break;
413 case UTF8_WARN_LONG:
414 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
415 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
416 break;
417 case UTF8_WARN_FFFF:
418 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
419 break;
420 default:
421 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
422 break;
423 }
424
425 if (warning) {
426 char *s = SvPVX(sv);
427
428 if (PL_op)
429 Perl_warner(aTHX_ WARN_UTF8,
430 "%s in %s", s, PL_op_desc[PL_op->op_type]);
431 else
432 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
433 }
434 }
435
ba210ebe 436 if (retlen)
28d3d195 437 *retlen = expectlen ? expectlen : len;
ba210ebe 438
28d3d195 439 return 0;
a0ed51b3
LW
440}
441
8e84507e 442/*
9041c2e3 443=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
8e84507e 444
9041c2e3 445Returns the native character value of the first character in the string C<s>
8e84507e 446which is assumed to be in UTF8 encoding; C<retlen> will be set to the
7df053ec 447length, in bytes, of that character.
8e84507e 448
9041c2e3
NIS
449Allows length and flags to be passed to low level routine.
450
451=cut
452*/
453
454UV
455Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
456{
457 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
458 if (uv < 0x100)
459 return (UV) ASCII_TO_NATIVE(uv);
460 return uv;
461}
462
463/*
464=for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
465
466Returns the native character value of the first character in the string C<s>
467which is assumed to be in UTF8 encoding; C<retlen> will be set to the
468length, in bytes, of that character.
469
470If C<s> does not point to a well-formed UTF8 character, zero is
471returned and retlen is set, if possible, to -1.
472
473=cut
474*/
475
476UV
477Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
478{
479 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
480}
481
482/*
483=for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
484
485Returns the Unicode code point of the first character in the string C<s>
486which is assumed to be in UTF8 encoding; C<retlen> will be set to the
487length, in bytes, of that character.
488
489This function should only be used when returned UV is considered
490an index into the Unicode semantic tables (e.g. swashes).
491
ba210ebe
JH
492If C<s> does not point to a well-formed UTF8 character, zero is
493returned and retlen is set, if possible, to -1.
8e84507e
NIS
494
495=cut
496*/
497
498UV
9041c2e3 499Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
8e84507e 500{
9041c2e3
NIS
501 /* Call the low level routine asking for checks */
502 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e
NIS
503}
504
b76347f2 505/*
eebe1485 506=for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
b76347f2
JH
507
508Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
509Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
510up past C<e>, croaks.
b76347f2
JH
511
512=cut
513*/
514
515STRLEN
516Perl_utf8_length(pTHX_ U8* s, U8* e)
517{
518 STRLEN len = 0;
519
8850bf83
JH
520 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
521 * the bitops (especially ~) can create illegal UTF-8.
522 * In other words: in Perl UTF-8 is not just for Unicode. */
523
b76347f2 524 if (e < s)
02eb7b47 525 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
b76347f2 526 while (s < e) {
02eb7b47 527 U8 t = UTF8SKIP(s);
b76347f2
JH
528
529 if (e - s < t)
02eb7b47 530 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
b76347f2
JH
531 s += t;
532 len++;
533 }
534
535 return len;
536}
537
b06226ff 538/*
eebe1485 539=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff
JH
540
541Returns the number of UTF8 characters between the UTF-8 pointers C<a>
542and C<b>.
543
544WARNING: use only if you *know* that the pointers point inside the
545same UTF-8 buffer.
546
547=cut */
a0ed51b3 548
02eb7b47 549IV
864dbfa3 550Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 551{
02eb7b47
JH
552 IV off = 0;
553
8850bf83
JH
554 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
555 * the bitops (especially ~) can create illegal UTF-8.
556 * In other words: in Perl UTF-8 is not just for Unicode. */
557
a0ed51b3
LW
558 if (a < b) {
559 while (a < b) {
02eb7b47
JH
560 U8 c = UTF8SKIP(a);
561
562 if (b - a < c)
563 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
564 a += c;
a0ed51b3
LW
565 off--;
566 }
567 }
568 else {
569 while (b < a) {
02eb7b47
JH
570 U8 c = UTF8SKIP(b);
571
572 if (a - b < c)
573 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
574 b += c;
a0ed51b3
LW
575 off++;
576 }
577 }
02eb7b47 578
a0ed51b3
LW
579 return off;
580}
581
b06226ff 582/*
eebe1485 583=for apidoc A|U8*|utf8_hop|U8 *s|I32 off
b06226ff 584
8850bf83
JH
585Return the UTF-8 pointer C<s> displaced by C<off> characters, either
586forward or backward.
b06226ff
JH
587
588WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
589the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
590on the first byte of character or just after the last byte of a character.
b06226ff
JH
591
592=cut */
a0ed51b3
LW
593
594U8 *
864dbfa3 595Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 596{
8850bf83
JH
597 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
598 * the bitops (especially ~) can create illegal UTF-8.
599 * In other words: in Perl UTF-8 is not just for Unicode. */
600
a0ed51b3
LW
601 if (off >= 0) {
602 while (off--)
603 s += UTF8SKIP(s);
604 }
605 else {
606 while (off++) {
607 s--;
8850bf83
JH
608 while (UTF8_IS_CONTINUATION(*s))
609 s--;
a0ed51b3
LW
610 }
611 }
612 return s;
613}
614
6940069f 615/*
eebe1485 616=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 617
246fae53
MG
618Converts a string C<s> of length C<len> from UTF8 into byte encoding.
619Unlike C<bytes_to_utf8>, this over-writes the original string, and
620updates len to contain the new length.
67e989fb 621Returns zero on failure, setting C<len> to -1.
6940069f
GS
622
623=cut
624*/
625
626U8 *
246fae53 627Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
6940069f 628{
6940069f
GS
629 U8 *send;
630 U8 *d;
dcad2880 631 U8 *save = s;
246fae53
MG
632
633 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880
JH
634 for (send = s + *len; s < send; ) {
635 U8 c = *s++;
636
9f9ab905 637 if (c >= 0x80 &&
dcad2880
JH
638 ((s >= send) ||
639 ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
640 *len = -1;
641 return 0;
642 }
246fae53 643 }
dcad2880
JH
644
645 d = s = save;
6940069f 646 while (s < send) {
ed646e6e 647 STRLEN ulen;
9041c2e3 648 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 649 s += ulen;
6940069f
GS
650 }
651 *d = '\0';
246fae53 652 *len = d - save;
6940069f
GS
653 return save;
654}
655
656/*
f9a63242
JH
657=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
658
659Converts a string C<s> of length C<len> from UTF8 into byte encoding.
660Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
661the newly-created string, and updates C<len> to contain the new
662length. Returns the original string if no conversion occurs, C<len>
663is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6640 if C<s> is converted or contains all 7bit characters.
f9a63242
JH
665
666=cut */
667
668U8 *
669Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
670{
f9a63242
JH
671 U8 *d;
672 U8 *start = s;
db42d148 673 U8 *send;
f9a63242
JH
674 I32 count = 0;
675
676 if (!*is_utf8)
677 return start;
678
ef9edfd0 679 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242
JH
680 for (send = s + *len; s < send;) {
681 U8 c = *s++;
db42d148
NIS
682 if (!UTF8_IS_ASCII(c)) {
683 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
684 (c = *s++) && UTF8_IS_CONTINUATION(c))
685 count++;
686 else
f9a63242 687 return start;
db42d148 688 }
f9a63242
JH
689 }
690
691 *is_utf8 = 0;
692
db42d148
NIS
693#ifndef EBCDIC
694 /* Can use as-is if no high chars */
f9a63242
JH
695 if (!count)
696 return start;
db42d148 697#endif
f9a63242
JH
698
699 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 700 s = start; start = d;
f9a63242
JH
701 while (s < send) {
702 U8 c = *s++;
db42d148
NIS
703 if (!UTF8_IS_ASCII(c))
704 c = UTF8_ACCUMULATE(c, *s++);
705 *d++ = ASCII_TO_NATIVE(c);
f9a63242
JH
706 }
707 *d = '\0';
708 *len = d - start;
709 return start;
710}
711
712/*
eebe1485 713=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f
GS
714
715Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e
GS
716Returns a pointer to the newly-created string, and sets C<len> to
717reflect the new length.
6940069f 718
497711e7 719=cut
6940069f
GS
720*/
721
722U8*
6662521e 723Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
6940069f 724{
6940069f
GS
725 U8 *send;
726 U8 *d;
727 U8 *dst;
6662521e 728 send = s + (*len);
6940069f 729
6662521e 730 Newz(801, d, (*len) * 2 + 1, U8);
6940069f
GS
731 dst = d;
732
733 while (s < send) {
db42d148
NIS
734 UV uv = NATIVE_TO_ASCII(*s++);
735 if (UTF8_IS_ASCII(uv))
736 *d++ = uv;
6940069f 737 else {
90f44359
JH
738 *d++ = UTF8_EIGHT_BIT_HI(uv);
739 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
740 }
741 }
742 *d = '\0';
6662521e 743 *len = d-dst;
6940069f
GS
744 return dst;
745}
746
a0ed51b3 747/*
dea0fc0b 748 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
749 *
750 * Destination must be pre-extended to 3/2 source. Do not use in-place.
751 * We optimize for native, for obvious reasons. */
752
753U8*
dea0fc0b 754Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 755{
dea0fc0b
JH
756 U8* pend;
757 U8* dstart = d;
758
759 if (bytelen & 1)
a7867d0a 760 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b
JH
761
762 pend = p + bytelen;
763
a0ed51b3 764 while (p < pend) {
dea0fc0b
JH
765 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
766 p += 2;
a0ed51b3
LW
767 if (uv < 0x80) {
768 *d++ = uv;
769 continue;
770 }
771 if (uv < 0x800) {
772 *d++ = (( uv >> 6) | 0xc0);
773 *d++ = (( uv & 0x3f) | 0x80);
774 continue;
775 }
776 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b
JH
777 UV low = *p++;
778 if (low < 0xdc00 || low >= 0xdfff)
779 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
780 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
781 }
782 if (uv < 0x10000) {
783 *d++ = (( uv >> 12) | 0xe0);
784 *d++ = (((uv >> 6) & 0x3f) | 0x80);
785 *d++ = (( uv & 0x3f) | 0x80);
786 continue;
787 }
788 else {
789 *d++ = (( uv >> 18) | 0xf0);
790 *d++ = (((uv >> 12) & 0x3f) | 0x80);
791 *d++ = (((uv >> 6) & 0x3f) | 0x80);
792 *d++ = (( uv & 0x3f) | 0x80);
793 continue;
794 }
795 }
dea0fc0b 796 *newlen = d - dstart;
a0ed51b3
LW
797 return d;
798}
799
800/* Note: this one is slightly destructive of the source. */
801
802U8*
dea0fc0b 803Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
804{
805 U8* s = (U8*)p;
806 U8* send = s + bytelen;
807 while (s < send) {
808 U8 tmp = s[0];
809 s[0] = s[1];
810 s[1] = tmp;
811 s += 2;
812 }
dea0fc0b 813 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
814}
815
816/* for now these are all defined (inefficiently) in terms of the utf8 versions */
817
818bool
864dbfa3 819Perl_is_uni_alnum(pTHX_ U32 c)
a0ed51b3 820{
ad391ad9 821 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 822 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
823 return is_utf8_alnum(tmpbuf);
824}
825
826bool
b8c5462f
JH
827Perl_is_uni_alnumc(pTHX_ U32 c)
828{
ad391ad9 829 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 830 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
831 return is_utf8_alnumc(tmpbuf);
832}
833
834bool
864dbfa3 835Perl_is_uni_idfirst(pTHX_ U32 c)
a0ed51b3 836{
ad391ad9 837 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 838 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
839 return is_utf8_idfirst(tmpbuf);
840}
841
842bool
864dbfa3 843Perl_is_uni_alpha(pTHX_ U32 c)
a0ed51b3 844{
ad391ad9 845 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 846 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
847 return is_utf8_alpha(tmpbuf);
848}
849
850bool
4d61ec05
GS
851Perl_is_uni_ascii(pTHX_ U32 c)
852{
ad391ad9 853 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 854 uvuni_to_utf8(tmpbuf, (UV)c);
4d61ec05
GS
855 return is_utf8_ascii(tmpbuf);
856}
857
858bool
864dbfa3 859Perl_is_uni_space(pTHX_ U32 c)
a0ed51b3 860{
ad391ad9 861 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 862 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
863 return is_utf8_space(tmpbuf);
864}
865
866bool
864dbfa3 867Perl_is_uni_digit(pTHX_ U32 c)
a0ed51b3 868{
ad391ad9 869 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 870 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
871 return is_utf8_digit(tmpbuf);
872}
873
874bool
864dbfa3 875Perl_is_uni_upper(pTHX_ U32 c)
a0ed51b3 876{
ad391ad9 877 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 878 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
879 return is_utf8_upper(tmpbuf);
880}
881
882bool
864dbfa3 883Perl_is_uni_lower(pTHX_ U32 c)
a0ed51b3 884{
ad391ad9 885 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 886 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
887 return is_utf8_lower(tmpbuf);
888}
889
890bool
b8c5462f
JH
891Perl_is_uni_cntrl(pTHX_ U32 c)
892{
ad391ad9 893 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 894 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
895 return is_utf8_cntrl(tmpbuf);
896}
897
898bool
899Perl_is_uni_graph(pTHX_ U32 c)
900{
ad391ad9 901 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 902 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
903 return is_utf8_graph(tmpbuf);
904}
905
906bool
864dbfa3 907Perl_is_uni_print(pTHX_ U32 c)
a0ed51b3 908{
ad391ad9 909 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 910 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
911 return is_utf8_print(tmpbuf);
912}
913
b8c5462f 914bool
f248d071 915Perl_is_uni_punct(pTHX_ U32 c)
b8c5462f 916{
ad391ad9 917 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 918 uvuni_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
919 return is_utf8_punct(tmpbuf);
920}
921
4d61ec05
GS
922bool
923Perl_is_uni_xdigit(pTHX_ U32 c)
924{
ad391ad9 925 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 926 uvuni_to_utf8(tmpbuf, (UV)c);
4d61ec05
GS
927 return is_utf8_xdigit(tmpbuf);
928}
929
a0ed51b3 930U32
864dbfa3 931Perl_to_uni_upper(pTHX_ U32 c)
a0ed51b3 932{
ad391ad9 933 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 934 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
935 return to_utf8_upper(tmpbuf);
936}
937
938U32
864dbfa3 939Perl_to_uni_title(pTHX_ U32 c)
a0ed51b3 940{
ad391ad9 941 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 942 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
943 return to_utf8_title(tmpbuf);
944}
945
946U32
864dbfa3 947Perl_to_uni_lower(pTHX_ U32 c)
a0ed51b3 948{
ad391ad9 949 U8 tmpbuf[UTF8_MAXLEN+1];
9041c2e3 950 uvuni_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
951 return to_utf8_lower(tmpbuf);
952}
953
954/* for now these all assume no locale info available for Unicode > 255 */
955
956bool
864dbfa3 957Perl_is_uni_alnum_lc(pTHX_ U32 c)
a0ed51b3
LW
958{
959 return is_uni_alnum(c); /* XXX no locale support yet */
960}
961
962bool
b8c5462f
JH
963Perl_is_uni_alnumc_lc(pTHX_ U32 c)
964{
965 return is_uni_alnumc(c); /* XXX no locale support yet */
966}
967
968bool
864dbfa3 969Perl_is_uni_idfirst_lc(pTHX_ U32 c)
a0ed51b3
LW
970{
971 return is_uni_idfirst(c); /* XXX no locale support yet */
972}
973
974bool
864dbfa3 975Perl_is_uni_alpha_lc(pTHX_ U32 c)
a0ed51b3
LW
976{
977 return is_uni_alpha(c); /* XXX no locale support yet */
978}
979
980bool
4d61ec05
GS
981Perl_is_uni_ascii_lc(pTHX_ U32 c)
982{
983 return is_uni_ascii(c); /* XXX no locale support yet */
984}
985
986bool
864dbfa3 987Perl_is_uni_space_lc(pTHX_ U32 c)
a0ed51b3
LW
988{
989 return is_uni_space(c); /* XXX no locale support yet */
990}
991
992bool
864dbfa3 993Perl_is_uni_digit_lc(pTHX_ U32 c)
a0ed51b3
LW
994{
995 return is_uni_digit(c); /* XXX no locale support yet */
996}
997
998bool
864dbfa3 999Perl_is_uni_upper_lc(pTHX_ U32 c)
a0ed51b3
LW
1000{
1001 return is_uni_upper(c); /* XXX no locale support yet */
1002}
1003
1004bool
864dbfa3 1005Perl_is_uni_lower_lc(pTHX_ U32 c)
a0ed51b3
LW
1006{
1007 return is_uni_lower(c); /* XXX no locale support yet */
1008}
1009
1010bool
b8c5462f
JH
1011Perl_is_uni_cntrl_lc(pTHX_ U32 c)
1012{
1013 return is_uni_cntrl(c); /* XXX no locale support yet */
1014}
1015
1016bool
1017Perl_is_uni_graph_lc(pTHX_ U32 c)
1018{
1019 return is_uni_graph(c); /* XXX no locale support yet */
1020}
1021
1022bool
864dbfa3 1023Perl_is_uni_print_lc(pTHX_ U32 c)
a0ed51b3
LW
1024{
1025 return is_uni_print(c); /* XXX no locale support yet */
1026}
1027
b8c5462f
JH
1028bool
1029Perl_is_uni_punct_lc(pTHX_ U32 c)
1030{
1031 return is_uni_punct(c); /* XXX no locale support yet */
1032}
1033
4d61ec05
GS
1034bool
1035Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1036{
1037 return is_uni_xdigit(c); /* XXX no locale support yet */
1038}
1039
a0ed51b3 1040U32
864dbfa3 1041Perl_to_uni_upper_lc(pTHX_ U32 c)
a0ed51b3
LW
1042{
1043 return to_uni_upper(c); /* XXX no locale support yet */
1044}
1045
1046U32
864dbfa3 1047Perl_to_uni_title_lc(pTHX_ U32 c)
a0ed51b3
LW
1048{
1049 return to_uni_title(c); /* XXX no locale support yet */
1050}
1051
1052U32
864dbfa3 1053Perl_to_uni_lower_lc(pTHX_ U32 c)
a0ed51b3
LW
1054{
1055 return to_uni_lower(c); /* XXX no locale support yet */
1056}
1057
a0ed51b3 1058bool
864dbfa3 1059Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1060{
386d01d6
GS
1061 if (!is_utf8_char(p))
1062 return FALSE;
a0ed51b3 1063 if (!PL_utf8_alnum)
289d4f09
ML
1064 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1065 * descendant of isalnum(3), in other words, it doesn't
1066 * contain the '_'. --jhi */
1067 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1068 return swash_fetch(PL_utf8_alnum, p);
1069/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1070#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1071 if (!PL_utf8_alnum)
1072 PL_utf8_alnum = swash_init("utf8", "",
1073 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1074 return swash_fetch(PL_utf8_alnum, p);
1075#endif
1076}
1077
1078bool
b8c5462f
JH
1079Perl_is_utf8_alnumc(pTHX_ U8 *p)
1080{
386d01d6
GS
1081 if (!is_utf8_char(p))
1082 return FALSE;
b8c5462f
JH
1083 if (!PL_utf8_alnum)
1084 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1085 return swash_fetch(PL_utf8_alnum, p);
1086/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1087#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1088 if (!PL_utf8_alnum)
1089 PL_utf8_alnum = swash_init("utf8", "",
1090 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1091 return swash_fetch(PL_utf8_alnum, p);
1092#endif
1093}
1094
1095bool
864dbfa3 1096Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3
LW
1097{
1098 return *p == '_' || is_utf8_alpha(p);
1099}
1100
1101bool
864dbfa3 1102Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1103{
386d01d6
GS
1104 if (!is_utf8_char(p))
1105 return FALSE;
a0ed51b3 1106 if (!PL_utf8_alpha)
e24b16f9 1107 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1108 return swash_fetch(PL_utf8_alpha, p);
1109}
1110
1111bool
b8c5462f
JH
1112Perl_is_utf8_ascii(pTHX_ U8 *p)
1113{
386d01d6
GS
1114 if (!is_utf8_char(p))
1115 return FALSE;
b8c5462f
JH
1116 if (!PL_utf8_ascii)
1117 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1118 return swash_fetch(PL_utf8_ascii, p);
1119}
1120
1121bool
864dbfa3 1122Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1123{
386d01d6
GS
1124 if (!is_utf8_char(p))
1125 return FALSE;
a0ed51b3 1126 if (!PL_utf8_space)
3bec3564 1127 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1128 return swash_fetch(PL_utf8_space, p);
1129}
1130
1131bool
864dbfa3 1132Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1133{
386d01d6
GS
1134 if (!is_utf8_char(p))
1135 return FALSE;
a0ed51b3 1136 if (!PL_utf8_digit)
e24b16f9 1137 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1138 return swash_fetch(PL_utf8_digit, p);
1139}
1140
1141bool
864dbfa3 1142Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1143{
386d01d6
GS
1144 if (!is_utf8_char(p))
1145 return FALSE;
a0ed51b3 1146 if (!PL_utf8_upper)
e24b16f9 1147 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1148 return swash_fetch(PL_utf8_upper, p);
1149}
1150
1151bool
864dbfa3 1152Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1153{
386d01d6
GS
1154 if (!is_utf8_char(p))
1155 return FALSE;
a0ed51b3 1156 if (!PL_utf8_lower)
e24b16f9 1157 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1158 return swash_fetch(PL_utf8_lower, p);
1159}
1160
1161bool
b8c5462f
JH
1162Perl_is_utf8_cntrl(pTHX_ U8 *p)
1163{
386d01d6
GS
1164 if (!is_utf8_char(p))
1165 return FALSE;
b8c5462f
JH
1166 if (!PL_utf8_cntrl)
1167 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1168 return swash_fetch(PL_utf8_cntrl, p);
1169}
1170
1171bool
1172Perl_is_utf8_graph(pTHX_ U8 *p)
1173{
386d01d6
GS
1174 if (!is_utf8_char(p))
1175 return FALSE;
b8c5462f
JH
1176 if (!PL_utf8_graph)
1177 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1178 return swash_fetch(PL_utf8_graph, p);
1179}
1180
1181bool
864dbfa3 1182Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1183{
386d01d6
GS
1184 if (!is_utf8_char(p))
1185 return FALSE;
a0ed51b3 1186 if (!PL_utf8_print)
e24b16f9 1187 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1188 return swash_fetch(PL_utf8_print, p);
1189}
1190
1191bool
b8c5462f
JH
1192Perl_is_utf8_punct(pTHX_ U8 *p)
1193{
386d01d6
GS
1194 if (!is_utf8_char(p))
1195 return FALSE;
b8c5462f
JH
1196 if (!PL_utf8_punct)
1197 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1198 return swash_fetch(PL_utf8_punct, p);
1199}
1200
1201bool
1202Perl_is_utf8_xdigit(pTHX_ U8 *p)
1203{
386d01d6
GS
1204 if (!is_utf8_char(p))
1205 return FALSE;
b8c5462f
JH
1206 if (!PL_utf8_xdigit)
1207 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1208 return swash_fetch(PL_utf8_xdigit, p);
1209}
1210
1211bool
864dbfa3 1212Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1213{
386d01d6
GS
1214 if (!is_utf8_char(p))
1215 return FALSE;
a0ed51b3 1216 if (!PL_utf8_mark)
e24b16f9 1217 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1218 return swash_fetch(PL_utf8_mark, p);
1219}
1220
2104c8d9 1221UV
864dbfa3 1222Perl_to_utf8_upper(pTHX_ U8 *p)
a0ed51b3
LW
1223{
1224 UV uv;
1225
1226 if (!PL_utf8_toupper)
e24b16f9 1227 PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
a0ed51b3 1228 uv = swash_fetch(PL_utf8_toupper, p);
9041c2e3 1229 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3
LW
1230}
1231
2104c8d9 1232UV
864dbfa3 1233Perl_to_utf8_title(pTHX_ U8 *p)
a0ed51b3
LW
1234{
1235 UV uv;
1236
1237 if (!PL_utf8_totitle)
e24b16f9 1238 PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
a0ed51b3 1239 uv = swash_fetch(PL_utf8_totitle, p);
9041c2e3 1240 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3
LW
1241}
1242
2104c8d9 1243UV
864dbfa3 1244Perl_to_utf8_lower(pTHX_ U8 *p)
a0ed51b3
LW
1245{
1246 UV uv;
1247
1248 if (!PL_utf8_tolower)
e24b16f9 1249 PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
a0ed51b3 1250 uv = swash_fetch(PL_utf8_tolower, p);
9041c2e3 1251 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3
LW
1252}
1253
1254/* a "swash" is a swatch hash */
1255
1256SV*
864dbfa3 1257Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3
LW
1258{
1259 SV* retval;
bf1fed83 1260 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1261 dSP;
ce3b816e
GS
1262
1263 if (!gv_stashpv(pkg, 0)) { /* demand load utf8 */
1264 ENTER;
1265 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1266 LEAVE;
1267 }
1268 SPAGAIN;
a0ed51b3
LW
1269 PUSHSTACKi(PERLSI_MAGIC);
1270 PUSHMARK(SP);
1271 EXTEND(SP,5);
1272 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1273 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1274 PUSHs(listsv);
1275 PUSHs(sv_2mortal(newSViv(minbits)));
1276 PUSHs(sv_2mortal(newSViv(none)));
1277 PUTBACK;
1278 ENTER;
1279 SAVEI32(PL_hints);
1280 PL_hints = 0;
1281 save_re_context();
bf1fed83
JH
1282 if (PL_curcop == &PL_compiling)
1283 /* XXX ought to be handled by lex_start */
1284 sv_setpv(tokenbufsv, PL_tokenbuf);
864dbfa3 1285 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1286 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1287 else
e24b16f9 1288 retval = &PL_sv_undef;
a0ed51b3
LW
1289 LEAVE;
1290 POPSTACK;
e24b16f9 1291 if (PL_curcop == &PL_compiling) {
bf1fed83
JH
1292 STRLEN len;
1293 char* pv = SvPV(tokenbufsv, len);
1294
1295 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1296 PL_curcop->op_private = PL_hints;
a0ed51b3
LW
1297 }
1298 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 1299 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3
LW
1300 return retval;
1301}
1302
1303UV
864dbfa3 1304Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
a0ed51b3
LW
1305{
1306 HV* hv = (HV*)SvRV(sv);
1307 U32 klen = UTF8SKIP(ptr) - 1;
1308 U32 off = ptr[klen] & 127; /* NB: 64 bit always 0 when len > 1 */
1309 STRLEN slen;
1310 STRLEN needents = (klen ? 64 : 128);
dfe13c55 1311 U8 *tmps;
a0ed51b3
LW
1312 U32 bit;
1313 SV *retval;
1314
1315 /*
1316 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1317 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1318 * it's nothing to sniff at.) Pity we usually come through at least
1319 * two function calls to get here...
1320 *
1321 * NB: this code assumes that swatches are never modified, once generated!
1322 */
1323
1324 if (hv == PL_last_swash_hv &&
1325 klen == PL_last_swash_klen &&
12ae5dfc 1326 (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
a0ed51b3
LW
1327 {
1328 tmps = PL_last_swash_tmps;
1329 slen = PL_last_swash_slen;
1330 }
1331 else {
1332 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1333 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3
LW
1334
1335 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1336 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3
LW
1337 dSP;
1338 ENTER;
1339 SAVETMPS;
1340 save_re_context();
1341 PUSHSTACKi(PERLSI_MAGIC);
1342 PUSHMARK(SP);
1343 EXTEND(SP,3);
1344 PUSHs((SV*)sv);
9041c2e3
NIS
1345 /* We call utf8_to_uni as we want and index into Unicode tables,
1346 not a native character number.
1347 */
1348 PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
a0ed51b3
LW
1349 PUSHs(sv_2mortal(newSViv(needents)));
1350 PUTBACK;
864dbfa3 1351 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1352 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1353 else
e24b16f9 1354 retval = &PL_sv_undef;
a0ed51b3
LW
1355 POPSTACK;
1356 FREETMPS;
1357 LEAVE;
e24b16f9
GS
1358 if (PL_curcop == &PL_compiling)
1359 PL_curcop->op_private = PL_hints;
a0ed51b3 1360
dfe13c55 1361 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1362
dfe13c55 1363 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
cea2e8a9 1364 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1365 }
1366
1367 PL_last_swash_hv = hv;
1368 PL_last_swash_klen = klen;
1369 PL_last_swash_tmps = tmps;
1370 PL_last_swash_slen = slen;
1371 if (klen)
1372 Copy(ptr, PL_last_swash_key, klen, U8);
1373 }
1374
9faf8d75 1375 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1376 case 1:
1377 bit = 1 << (off & 7);
1378 off >>= 3;
1379 return (tmps[off] & bit) != 0;
1380 case 8:
1381 return tmps[off];
1382 case 16:
1383 off <<= 1;
1384 return (tmps[off] << 8) + tmps[off + 1] ;
1385 case 32:
1386 off <<= 2;
1387 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1388 }
cea2e8a9 1389 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1390 return 0;
1391}