This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: Net::Ping patch, adds stream protocol
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
bc89e66f 3 * Copyright (c) 1998-2001, Larry Wall
a0ed51b3
LW
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3
LW
25#include "perl.h"
26
27/* Unicode support */
28
eebe1485 29/*
9041c2e3 30=for apidoc A|U8*|uvuni_to_utf8|U8 *d|UV uv
eebe1485
SC
31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
9041c2e3 35end of the new character. In other words,
eebe1485 36
9041c2e3 37 d = uvuni_to_utf8(d, uv);
eebe1485
SC
38
39is the recommended Unicode-aware way of saying
40
41 *(d++) = uv;
42
43=cut
44*/
45
dfe13c55 46U8 *
9041c2e3 47Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
a0ed51b3 48{
c4d5f83a
NIS
49 if (UNI_IS_INVARIANT(uv)) {
50 *d++ = UTF_TO_NATIVE(uv);
a0ed51b3
LW
51 return d;
52 }
1d72bdf6
NIS
53#if defined(EBCDIC) || 1 /* always for testing */
54 else {
55 STRLEN len = UNISKIP(uv);
56 U8 *p = d+len-1;
57 while (p > d) {
58 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
59 uv >>= UTF_ACCUMULATION_SHIFT;
60 }
61 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
62 return d+len;
63 }
64#else /* Non loop style */
a0ed51b3
LW
65 if (uv < 0x800) {
66 *d++ = (( uv >> 6) | 0xc0);
67 *d++ = (( uv & 0x3f) | 0x80);
68 return d;
69 }
70 if (uv < 0x10000) {
71 *d++ = (( uv >> 12) | 0xe0);
72 *d++ = (((uv >> 6) & 0x3f) | 0x80);
73 *d++ = (( uv & 0x3f) | 0x80);
74 return d;
75 }
76 if (uv < 0x200000) {
77 *d++ = (( uv >> 18) | 0xf0);
78 *d++ = (((uv >> 12) & 0x3f) | 0x80);
79 *d++ = (((uv >> 6) & 0x3f) | 0x80);
80 *d++ = (( uv & 0x3f) | 0x80);
81 return d;
82 }
83 if (uv < 0x4000000) {
84 *d++ = (( uv >> 24) | 0xf8);
85 *d++ = (((uv >> 18) & 0x3f) | 0x80);
86 *d++ = (((uv >> 12) & 0x3f) | 0x80);
87 *d++ = (((uv >> 6) & 0x3f) | 0x80);
88 *d++ = (( uv & 0x3f) | 0x80);
89 return d;
90 }
91 if (uv < 0x80000000) {
92 *d++ = (( uv >> 30) | 0xfc);
93 *d++ = (((uv >> 24) & 0x3f) | 0x80);
94 *d++ = (((uv >> 18) & 0x3f) | 0x80);
95 *d++ = (((uv >> 12) & 0x3f) | 0x80);
96 *d++ = (((uv >> 6) & 0x3f) | 0x80);
97 *d++ = (( uv & 0x3f) | 0x80);
98 return d;
99 }
6b8eaf93 100#ifdef HAS_QUAD
d7578b48 101 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
102#endif
103 {
104 *d++ = 0xfe; /* Can't match U+FEFF! */
105 *d++ = (((uv >> 30) & 0x3f) | 0x80);
106 *d++ = (((uv >> 24) & 0x3f) | 0x80);
107 *d++ = (((uv >> 18) & 0x3f) | 0x80);
108 *d++ = (((uv >> 12) & 0x3f) | 0x80);
109 *d++ = (((uv >> 6) & 0x3f) | 0x80);
110 *d++ = (( uv & 0x3f) | 0x80);
111 return d;
112 }
6b8eaf93 113#ifdef HAS_QUAD
a0ed51b3
LW
114 {
115 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b
GS
116 *d++ = 0x80; /* 6 Reserved bits */
117 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
118 *d++ = (((uv >> 54) & 0x3f) | 0x80);
119 *d++ = (((uv >> 48) & 0x3f) | 0x80);
120 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3
LW
121 *d++ = (((uv >> 36) & 0x3f) | 0x80);
122 *d++ = (((uv >> 30) & 0x3f) | 0x80);
123 *d++ = (((uv >> 24) & 0x3f) | 0x80);
124 *d++ = (((uv >> 18) & 0x3f) | 0x80);
125 *d++ = (((uv >> 12) & 0x3f) | 0x80);
126 *d++ = (((uv >> 6) & 0x3f) | 0x80);
127 *d++ = (( uv & 0x3f) | 0x80);
128 return d;
129 }
130#endif
1d72bdf6 131#endif /* Loop style */
a0ed51b3
LW
132}
133
9041c2e3
NIS
134
135
136/*
eebe1485
SC
137=for apidoc A|STRLEN|is_utf8_char|U8 *s
138
5da9da9e 139Tests if some arbitrary number of bytes begins in a valid UTF-8
1d72bdf6 140character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
5da9da9e
JH
141The actual number of bytes in the UTF-8 character will be returned if
142it is valid, otherwise 0.
9041c2e3 143
5da9da9e 144=cut */
067a85ef 145STRLEN
386d01d6
GS
146Perl_is_utf8_char(pTHX_ U8 *s)
147{
148 U8 u = *s;
067a85ef
A
149 STRLEN slen, len;
150 UV uv, ouv;
386d01d6 151
1d72bdf6 152 if (UTF8_IS_INVARIANT(u))
386d01d6
GS
153 return 1;
154
60006e79 155 if (!UTF8_IS_START(u))
386d01d6
GS
156 return 0;
157
9f07fdcd 158 len = UTF8SKIP(s);
386d01d6 159
60006e79 160 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef
A
161 return 0;
162
386d01d6
GS
163 slen = len - 1;
164 s++;
1d72bdf6
NIS
165 /* The initial value is dubious */
166 uv = u;
067a85ef 167 ouv = uv;
386d01d6 168 while (slen--) {
60006e79 169 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 170 return 0;
8850bf83 171 uv = UTF8_ACCUMULATE(uv, *s);
067a85ef
A
172 if (uv < ouv)
173 return 0;
174 ouv = uv;
386d01d6
GS
175 s++;
176 }
067a85ef 177
5bbb0b5a 178 if (UNISKIP(uv) < len)
067a85ef
A
179 return 0;
180
386d01d6
GS
181 return len;
182}
183
6662521e 184/*
eebe1485 185=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 186
5da9da9e
JH
187Returns true if first C<len> bytes of the given string form a valid UTF8
188string, false otherwise. Note that 'a valid UTF8 string' does not mean
189'a string that contains UTF8' because a valid ASCII string is a valid
190UTF8 string.
6662521e
GS
191
192=cut
193*/
194
8e84507e 195bool
6662521e
GS
196Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
197{
067a85ef 198 U8* x = s;
1aa99e6b 199 U8* send;
067a85ef
A
200 STRLEN c;
201
1aa99e6b 202 if (!len)
6cd5fe39 203 len = strlen((char *)s);
1aa99e6b
IH
204 send = s + len;
205
6662521e
GS
206 while (x < send) {
207 c = is_utf8_char(x);
067a85ef
A
208 if (!c)
209 return FALSE;
6662521e 210 x += c;
6662521e 211 }
60006e79
JH
212 if (x != send)
213 return FALSE;
067a85ef
A
214
215 return TRUE;
6662521e
GS
216}
217
67e989fb 218/*
9041c2e3 219=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 220
9041c2e3
NIS
221Bottom level UTF-8 decode routine.
222Returns the unicode code point value of the first character in the string C<s>
ba210ebe 223which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 224C<retlen> will be set to the length, in bytes, of that character.
67e989fb
JH
225
226If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880
JH
227is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
228it is assumed that the caller will raise a warning, and this function
28d3d195
JH
229will silently just set C<retlen> to C<-1> and return zero. If the
230C<flags> does not contain UTF8_CHECK_ONLY, warnings about
231malformations will be given, C<retlen> will be set to the expected
232length of the UTF-8 character in bytes, and zero will be returned.
233
234The C<flags> can also contain various flags to allow deviations from
235the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 236
9041c2e3
NIS
237Most code should use utf8_to_uvchr() rather than call this directly.
238
dcad2880 239=cut */
67e989fb 240
a0ed51b3 241UV
9041c2e3 242Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
a0ed51b3 243{
ba210ebe
JH
244 UV uv = *s, ouv;
245 STRLEN len = 1;
246 bool dowarn = ckWARN_d(WARN_UTF8);
247 STRLEN expectlen = 0;
a0dbb045
JH
248 U32 warning = 0;
249
250/* This list is a superset of the UTF8_ALLOW_XXX. */
251
252#define UTF8_WARN_EMPTY 1
253#define UTF8_WARN_CONTINUATION 2
254#define UTF8_WARN_NON_CONTINUATION 3
255#define UTF8_WARN_FE_FF 4
256#define UTF8_WARN_SHORT 5
257#define UTF8_WARN_OVERFLOW 6
258#define UTF8_WARN_SURROGATE 7
259#define UTF8_WARN_BOM 8
260#define UTF8_WARN_LONG 9
261#define UTF8_WARN_FFFF 10
262
263 if (curlen == 0 &&
264 !(flags & UTF8_ALLOW_EMPTY)) {
265 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
266 goto malformed;
267 }
268
1d72bdf6 269 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
270 if (retlen)
271 *retlen = 1;
c4d5f83a 272 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 273 }
67e989fb 274
421a8bf2 275 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 276 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 277 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
278 goto malformed;
279 }
280
421a8bf2 281 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 282 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 283 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
284 goto malformed;
285 }
9041c2e3 286
1d72bdf6 287#ifdef EBCDIC
75383841 288 uv = NATIVE_TO_UTF(uv);
1d72bdf6 289#else
fcc8fcf6
JH
290 if ((uv == 0xfe || uv == 0xff) &&
291 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 292 warning = UTF8_WARN_FE_FF;
ba210ebe 293 goto malformed;
a0ed51b3 294 }
1d72bdf6
NIS
295#endif
296
ba210ebe
JH
297 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
298 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
299 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
300 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
301#ifdef EBCDIC
302 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
303 else { len = 7; uv &= 0x01; }
304#else
ba210ebe
JH
305 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
306 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
307 else { len = 13; uv = 0; } /* whoa! */
308#endif
309
a0ed51b3
LW
310 if (retlen)
311 *retlen = len;
9041c2e3 312
ba210ebe
JH
313 expectlen = len;
314
fcc8fcf6
JH
315 if ((curlen < expectlen) &&
316 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 317 warning = UTF8_WARN_SHORT;
ba210ebe
JH
318 goto malformed;
319 }
320
321 len--;
a0ed51b3 322 s++;
ba210ebe
JH
323 ouv = uv;
324
a0ed51b3 325 while (len--) {
421a8bf2
JH
326 if (!UTF8_IS_CONTINUATION(*s) &&
327 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
328 s--;
329 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 330 goto malformed;
a0ed51b3
LW
331 }
332 else
8850bf83 333 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
334 if (!(uv > ouv)) {
335 /* These cannot be allowed. */
336 if (uv == ouv) {
337 if (!(flags & UTF8_ALLOW_LONG)) {
338 warning = UTF8_WARN_LONG;
339 goto malformed;
340 }
341 }
342 else { /* uv < ouv */
343 /* This cannot be allowed. */
344 warning = UTF8_WARN_OVERFLOW;
345 goto malformed;
346 }
ba210ebe
JH
347 }
348 s++;
349 ouv = uv;
350 }
351
421a8bf2 352 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 353 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 354 warning = UTF8_WARN_SURROGATE;
ba210ebe 355 goto malformed;
421a8bf2 356 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 357 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 358 warning = UTF8_WARN_BOM;
ba210ebe 359 goto malformed;
fcc8fcf6
JH
360 } else if ((expectlen > UNISKIP(uv)) &&
361 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 362 warning = UTF8_WARN_LONG;
ba210ebe 363 goto malformed;
421a8bf2 364 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 365 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 366 warning = UTF8_WARN_FFFF;
a9917092 367 goto malformed;
a0ed51b3 368 }
ba210ebe 369
a0ed51b3 370 return uv;
ba210ebe
JH
371
372malformed:
373
fcc8fcf6 374 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 375 if (retlen)
cc366d4b 376 *retlen = -1;
ba210ebe
JH
377 return 0;
378 }
379
a0dbb045
JH
380 if (dowarn) {
381 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
382
383 switch (warning) {
384 case 0: /* Intentionally empty. */ break;
385 case UTF8_WARN_EMPTY:
386 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
387 break;
388 case UTF8_WARN_CONTINUATION:
389 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
390 break;
391 case UTF8_WARN_NON_CONTINUATION:
392 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
393 (UV)s[1], uv);
394 break;
395 case UTF8_WARN_FE_FF:
396 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
397 break;
398 case UTF8_WARN_SHORT:
399 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
400 curlen, curlen == 1 ? "" : "s", expectlen);
401 break;
402 case UTF8_WARN_OVERFLOW:
403 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
404 ouv, *s);
405 break;
406 case UTF8_WARN_SURROGATE:
407 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
408 break;
409 case UTF8_WARN_BOM:
410 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
411 break;
412 case UTF8_WARN_LONG:
413 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
414 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
415 break;
416 case UTF8_WARN_FFFF:
417 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
418 break;
419 default:
420 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
421 break;
422 }
423
424 if (warning) {
425 char *s = SvPVX(sv);
426
427 if (PL_op)
428 Perl_warner(aTHX_ WARN_UTF8,
429 "%s in %s", s, PL_op_desc[PL_op->op_type]);
430 else
431 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
432 }
433 }
434
ba210ebe 435 if (retlen)
28d3d195 436 *retlen = expectlen ? expectlen : len;
ba210ebe 437
28d3d195 438 return 0;
a0ed51b3
LW
439}
440
8e84507e 441/*
9041c2e3
NIS
442=for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
443
444Returns the native character value of the first character in the string C<s>
445which is assumed to be in UTF8 encoding; C<retlen> will be set to the
446length, in bytes, of that character.
447
448If C<s> does not point to a well-formed UTF8 character, zero is
449returned and retlen is set, if possible, to -1.
450
451=cut
452*/
453
454UV
455Perl_utf8_to_uvchr(pTHX_ U8* s, STRLEN* retlen)
456{
457 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
458}
459
460/*
461=for apidoc A|U8* s|utf8_to_uvuni|STRLEN *retlen
462
463Returns the Unicode code point of the first character in the string C<s>
464which is assumed to be in UTF8 encoding; C<retlen> will be set to the
465length, in bytes, of that character.
466
467This function should only be used when returned UV is considered
468an index into the Unicode semantic tables (e.g. swashes).
469
ba210ebe
JH
470If C<s> does not point to a well-formed UTF8 character, zero is
471returned and retlen is set, if possible, to -1.
8e84507e
NIS
472
473=cut
474*/
475
476UV
9041c2e3 477Perl_utf8_to_uvuni(pTHX_ U8* s, STRLEN* retlen)
8e84507e 478{
9041c2e3
NIS
479 /* Call the low level routine asking for checks */
480 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e
NIS
481}
482
b76347f2 483/*
eebe1485 484=for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
b76347f2
JH
485
486Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
487Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
488up past C<e>, croaks.
b76347f2
JH
489
490=cut
491*/
492
493STRLEN
494Perl_utf8_length(pTHX_ U8* s, U8* e)
495{
496 STRLEN len = 0;
497
8850bf83
JH
498 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
499 * the bitops (especially ~) can create illegal UTF-8.
500 * In other words: in Perl UTF-8 is not just for Unicode. */
501
b76347f2 502 if (e < s)
02eb7b47 503 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
b76347f2 504 while (s < e) {
02eb7b47 505 U8 t = UTF8SKIP(s);
b76347f2
JH
506
507 if (e - s < t)
c4d5f83a 508 Perl_croak(aTHX_ "panic: utf8_length: s=%p (%02X) e=%p l=%d - unaligned end",s,*s,e,t);
b76347f2
JH
509 s += t;
510 len++;
511 }
512
513 return len;
514}
515
b06226ff 516/*
eebe1485 517=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff
JH
518
519Returns the number of UTF8 characters between the UTF-8 pointers C<a>
520and C<b>.
521
522WARNING: use only if you *know* that the pointers point inside the
523same UTF-8 buffer.
524
525=cut */
a0ed51b3 526
02eb7b47 527IV
864dbfa3 528Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 529{
02eb7b47
JH
530 IV off = 0;
531
8850bf83
JH
532 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
533 * the bitops (especially ~) can create illegal UTF-8.
534 * In other words: in Perl UTF-8 is not just for Unicode. */
535
a0ed51b3
LW
536 if (a < b) {
537 while (a < b) {
02eb7b47
JH
538 U8 c = UTF8SKIP(a);
539
540 if (b - a < c)
541 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
542 a += c;
a0ed51b3
LW
543 off--;
544 }
545 }
546 else {
547 while (b < a) {
02eb7b47
JH
548 U8 c = UTF8SKIP(b);
549
550 if (a - b < c)
551 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
552 b += c;
a0ed51b3
LW
553 off++;
554 }
555 }
02eb7b47 556
a0ed51b3
LW
557 return off;
558}
559
b06226ff 560/*
eebe1485 561=for apidoc A|U8*|utf8_hop|U8 *s|I32 off
b06226ff 562
8850bf83
JH
563Return the UTF-8 pointer C<s> displaced by C<off> characters, either
564forward or backward.
b06226ff
JH
565
566WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
567the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
568on the first byte of character or just after the last byte of a character.
b06226ff
JH
569
570=cut */
a0ed51b3
LW
571
572U8 *
864dbfa3 573Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 574{
8850bf83
JH
575 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
576 * the bitops (especially ~) can create illegal UTF-8.
577 * In other words: in Perl UTF-8 is not just for Unicode. */
578
a0ed51b3
LW
579 if (off >= 0) {
580 while (off--)
581 s += UTF8SKIP(s);
582 }
583 else {
584 while (off++) {
585 s--;
8850bf83
JH
586 while (UTF8_IS_CONTINUATION(*s))
587 s--;
a0ed51b3
LW
588 }
589 }
590 return s;
591}
592
6940069f 593/*
eebe1485 594=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 595
246fae53
MG
596Converts a string C<s> of length C<len> from UTF8 into byte encoding.
597Unlike C<bytes_to_utf8>, this over-writes the original string, and
598updates len to contain the new length.
67e989fb 599Returns zero on failure, setting C<len> to -1.
6940069f
GS
600
601=cut
602*/
603
604U8 *
246fae53 605Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
6940069f 606{
6940069f
GS
607 U8 *send;
608 U8 *d;
dcad2880 609 U8 *save = s;
246fae53
MG
610
611 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880
JH
612 for (send = s + *len; s < send; ) {
613 U8 c = *s++;
614
1d72bdf6
NIS
615 if (!UTF8_IS_INVARIANT(c) &&
616 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
617 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
618 *len = -1;
619 return 0;
620 }
246fae53 621 }
dcad2880
JH
622
623 d = s = save;
6940069f 624 while (s < send) {
ed646e6e 625 STRLEN ulen;
9041c2e3 626 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 627 s += ulen;
6940069f
GS
628 }
629 *d = '\0';
246fae53 630 *len = d - save;
6940069f
GS
631 return save;
632}
633
634/*
f9a63242
JH
635=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
636
637Converts a string C<s> of length C<len> from UTF8 into byte encoding.
638Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
639the newly-created string, and updates C<len> to contain the new
640length. Returns the original string if no conversion occurs, C<len>
641is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6420 if C<s> is converted or contains all 7bit characters.
f9a63242
JH
643
644=cut */
645
646U8 *
647Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
648{
f9a63242
JH
649 U8 *d;
650 U8 *start = s;
db42d148 651 U8 *send;
f9a63242
JH
652 I32 count = 0;
653
654 if (!*is_utf8)
655 return start;
656
ef9edfd0 657 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242
JH
658 for (send = s + *len; s < send;) {
659 U8 c = *s++;
1d72bdf6 660 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
661 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
662 (c = *s++) && UTF8_IS_CONTINUATION(c))
663 count++;
664 else
f9a63242 665 return start;
db42d148 666 }
f9a63242
JH
667 }
668
669 *is_utf8 = 0;
670
f9a63242 671 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 672 s = start; start = d;
f9a63242
JH
673 while (s < send) {
674 U8 c = *s++;
c4d5f83a
NIS
675 if (!UTF8_IS_INVARIANT(c)) {
676 /* Then it is two-byte encoded */
677 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
678 c = ASCII_TO_NATIVE(c);
679 }
680 *d++ = c;
f9a63242
JH
681 }
682 *d = '\0';
683 *len = d - start;
684 return start;
685}
686
687/*
eebe1485 688=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f
GS
689
690Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e
GS
691Returns a pointer to the newly-created string, and sets C<len> to
692reflect the new length.
6940069f 693
497711e7 694=cut
6940069f
GS
695*/
696
697U8*
6662521e 698Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
6940069f 699{
6940069f
GS
700 U8 *send;
701 U8 *d;
702 U8 *dst;
6662521e 703 send = s + (*len);
6940069f 704
6662521e 705 Newz(801, d, (*len) * 2 + 1, U8);
6940069f
GS
706 dst = d;
707
708 while (s < send) {
db42d148 709 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a
NIS
710 if (UNI_IS_INVARIANT(uv))
711 *d++ = UTF_TO_NATIVE(uv);
6940069f 712 else {
90f44359
JH
713 *d++ = UTF8_EIGHT_BIT_HI(uv);
714 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
715 }
716 }
717 *d = '\0';
6662521e 718 *len = d-dst;
6940069f
GS
719 return dst;
720}
721
a0ed51b3 722/*
dea0fc0b 723 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
724 *
725 * Destination must be pre-extended to 3/2 source. Do not use in-place.
726 * We optimize for native, for obvious reasons. */
727
728U8*
dea0fc0b 729Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 730{
dea0fc0b
JH
731 U8* pend;
732 U8* dstart = d;
733
734 if (bytelen & 1)
a7867d0a 735 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b
JH
736
737 pend = p + bytelen;
738
a0ed51b3 739 while (p < pend) {
dea0fc0b
JH
740 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
741 p += 2;
a0ed51b3
LW
742 if (uv < 0x80) {
743 *d++ = uv;
744 continue;
745 }
746 if (uv < 0x800) {
747 *d++ = (( uv >> 6) | 0xc0);
748 *d++ = (( uv & 0x3f) | 0x80);
749 continue;
750 }
751 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b
JH
752 UV low = *p++;
753 if (low < 0xdc00 || low >= 0xdfff)
754 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
755 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
756 }
757 if (uv < 0x10000) {
758 *d++ = (( uv >> 12) | 0xe0);
759 *d++ = (((uv >> 6) & 0x3f) | 0x80);
760 *d++ = (( uv & 0x3f) | 0x80);
761 continue;
762 }
763 else {
764 *d++ = (( uv >> 18) | 0xf0);
765 *d++ = (((uv >> 12) & 0x3f) | 0x80);
766 *d++ = (((uv >> 6) & 0x3f) | 0x80);
767 *d++ = (( uv & 0x3f) | 0x80);
768 continue;
769 }
770 }
dea0fc0b 771 *newlen = d - dstart;
a0ed51b3
LW
772 return d;
773}
774
775/* Note: this one is slightly destructive of the source. */
776
777U8*
dea0fc0b 778Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
779{
780 U8* s = (U8*)p;
781 U8* send = s + bytelen;
782 while (s < send) {
783 U8 tmp = s[0];
784 s[0] = s[1];
785 s[1] = tmp;
786 s += 2;
787 }
dea0fc0b 788 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
789}
790
791/* for now these are all defined (inefficiently) in terms of the utf8 versions */
792
793bool
864dbfa3 794Perl_is_uni_alnum(pTHX_ U32 c)
a0ed51b3 795{
ad391ad9 796 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 797 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
798 return is_utf8_alnum(tmpbuf);
799}
800
801bool
b8c5462f
JH
802Perl_is_uni_alnumc(pTHX_ U32 c)
803{
ad391ad9 804 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 805 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
806 return is_utf8_alnumc(tmpbuf);
807}
808
809bool
864dbfa3 810Perl_is_uni_idfirst(pTHX_ U32 c)
a0ed51b3 811{
ad391ad9 812 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 813 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
814 return is_utf8_idfirst(tmpbuf);
815}
816
817bool
864dbfa3 818Perl_is_uni_alpha(pTHX_ U32 c)
a0ed51b3 819{
ad391ad9 820 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 821 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
822 return is_utf8_alpha(tmpbuf);
823}
824
825bool
4d61ec05
GS
826Perl_is_uni_ascii(pTHX_ U32 c)
827{
ad391ad9 828 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 829 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05
GS
830 return is_utf8_ascii(tmpbuf);
831}
832
833bool
864dbfa3 834Perl_is_uni_space(pTHX_ U32 c)
a0ed51b3 835{
ad391ad9 836 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 837 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
838 return is_utf8_space(tmpbuf);
839}
840
841bool
864dbfa3 842Perl_is_uni_digit(pTHX_ U32 c)
a0ed51b3 843{
ad391ad9 844 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 845 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
846 return is_utf8_digit(tmpbuf);
847}
848
849bool
864dbfa3 850Perl_is_uni_upper(pTHX_ U32 c)
a0ed51b3 851{
ad391ad9 852 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 853 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
854 return is_utf8_upper(tmpbuf);
855}
856
857bool
864dbfa3 858Perl_is_uni_lower(pTHX_ U32 c)
a0ed51b3 859{
ad391ad9 860 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 861 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
862 return is_utf8_lower(tmpbuf);
863}
864
865bool
b8c5462f
JH
866Perl_is_uni_cntrl(pTHX_ U32 c)
867{
ad391ad9 868 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 869 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
870 return is_utf8_cntrl(tmpbuf);
871}
872
873bool
874Perl_is_uni_graph(pTHX_ U32 c)
875{
ad391ad9 876 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 877 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
878 return is_utf8_graph(tmpbuf);
879}
880
881bool
864dbfa3 882Perl_is_uni_print(pTHX_ U32 c)
a0ed51b3 883{
ad391ad9 884 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 885 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
886 return is_utf8_print(tmpbuf);
887}
888
b8c5462f 889bool
f248d071 890Perl_is_uni_punct(pTHX_ U32 c)
b8c5462f 891{
ad391ad9 892 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 893 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
894 return is_utf8_punct(tmpbuf);
895}
896
4d61ec05
GS
897bool
898Perl_is_uni_xdigit(pTHX_ U32 c)
899{
ad391ad9 900 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 901 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05
GS
902 return is_utf8_xdigit(tmpbuf);
903}
904
a0ed51b3 905U32
864dbfa3 906Perl_to_uni_upper(pTHX_ U32 c)
a0ed51b3 907{
ad391ad9 908 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 909 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
910 return to_utf8_upper(tmpbuf);
911}
912
913U32
864dbfa3 914Perl_to_uni_title(pTHX_ U32 c)
a0ed51b3 915{
ad391ad9 916 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 917 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
918 return to_utf8_title(tmpbuf);
919}
920
921U32
864dbfa3 922Perl_to_uni_lower(pTHX_ U32 c)
a0ed51b3 923{
ad391ad9 924 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 925 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
926 return to_utf8_lower(tmpbuf);
927}
928
929/* for now these all assume no locale info available for Unicode > 255 */
930
931bool
864dbfa3 932Perl_is_uni_alnum_lc(pTHX_ U32 c)
a0ed51b3
LW
933{
934 return is_uni_alnum(c); /* XXX no locale support yet */
935}
936
937bool
b8c5462f
JH
938Perl_is_uni_alnumc_lc(pTHX_ U32 c)
939{
940 return is_uni_alnumc(c); /* XXX no locale support yet */
941}
942
943bool
864dbfa3 944Perl_is_uni_idfirst_lc(pTHX_ U32 c)
a0ed51b3
LW
945{
946 return is_uni_idfirst(c); /* XXX no locale support yet */
947}
948
949bool
864dbfa3 950Perl_is_uni_alpha_lc(pTHX_ U32 c)
a0ed51b3
LW
951{
952 return is_uni_alpha(c); /* XXX no locale support yet */
953}
954
955bool
4d61ec05
GS
956Perl_is_uni_ascii_lc(pTHX_ U32 c)
957{
958 return is_uni_ascii(c); /* XXX no locale support yet */
959}
960
961bool
864dbfa3 962Perl_is_uni_space_lc(pTHX_ U32 c)
a0ed51b3
LW
963{
964 return is_uni_space(c); /* XXX no locale support yet */
965}
966
967bool
864dbfa3 968Perl_is_uni_digit_lc(pTHX_ U32 c)
a0ed51b3
LW
969{
970 return is_uni_digit(c); /* XXX no locale support yet */
971}
972
973bool
864dbfa3 974Perl_is_uni_upper_lc(pTHX_ U32 c)
a0ed51b3
LW
975{
976 return is_uni_upper(c); /* XXX no locale support yet */
977}
978
979bool
864dbfa3 980Perl_is_uni_lower_lc(pTHX_ U32 c)
a0ed51b3
LW
981{
982 return is_uni_lower(c); /* XXX no locale support yet */
983}
984
985bool
b8c5462f
JH
986Perl_is_uni_cntrl_lc(pTHX_ U32 c)
987{
988 return is_uni_cntrl(c); /* XXX no locale support yet */
989}
990
991bool
992Perl_is_uni_graph_lc(pTHX_ U32 c)
993{
994 return is_uni_graph(c); /* XXX no locale support yet */
995}
996
997bool
864dbfa3 998Perl_is_uni_print_lc(pTHX_ U32 c)
a0ed51b3
LW
999{
1000 return is_uni_print(c); /* XXX no locale support yet */
1001}
1002
b8c5462f
JH
1003bool
1004Perl_is_uni_punct_lc(pTHX_ U32 c)
1005{
1006 return is_uni_punct(c); /* XXX no locale support yet */
1007}
1008
4d61ec05
GS
1009bool
1010Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1011{
1012 return is_uni_xdigit(c); /* XXX no locale support yet */
1013}
1014
a0ed51b3 1015U32
864dbfa3 1016Perl_to_uni_upper_lc(pTHX_ U32 c)
a0ed51b3
LW
1017{
1018 return to_uni_upper(c); /* XXX no locale support yet */
1019}
1020
1021U32
864dbfa3 1022Perl_to_uni_title_lc(pTHX_ U32 c)
a0ed51b3
LW
1023{
1024 return to_uni_title(c); /* XXX no locale support yet */
1025}
1026
1027U32
864dbfa3 1028Perl_to_uni_lower_lc(pTHX_ U32 c)
a0ed51b3
LW
1029{
1030 return to_uni_lower(c); /* XXX no locale support yet */
1031}
1032
a0ed51b3 1033bool
864dbfa3 1034Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1035{
386d01d6
GS
1036 if (!is_utf8_char(p))
1037 return FALSE;
a0ed51b3 1038 if (!PL_utf8_alnum)
289d4f09
ML
1039 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1040 * descendant of isalnum(3), in other words, it doesn't
1041 * contain the '_'. --jhi */
1042 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1043 return swash_fetch(PL_utf8_alnum, p);
1044/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1045#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1046 if (!PL_utf8_alnum)
1047 PL_utf8_alnum = swash_init("utf8", "",
1048 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1049 return swash_fetch(PL_utf8_alnum, p);
1050#endif
1051}
1052
1053bool
b8c5462f
JH
1054Perl_is_utf8_alnumc(pTHX_ U8 *p)
1055{
386d01d6
GS
1056 if (!is_utf8_char(p))
1057 return FALSE;
b8c5462f
JH
1058 if (!PL_utf8_alnum)
1059 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1060 return swash_fetch(PL_utf8_alnum, p);
1061/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1062#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1063 if (!PL_utf8_alnum)
1064 PL_utf8_alnum = swash_init("utf8", "",
1065 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1066 return swash_fetch(PL_utf8_alnum, p);
1067#endif
1068}
1069
1070bool
864dbfa3 1071Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3
LW
1072{
1073 return *p == '_' || is_utf8_alpha(p);
1074}
1075
1076bool
864dbfa3 1077Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1078{
386d01d6
GS
1079 if (!is_utf8_char(p))
1080 return FALSE;
a0ed51b3 1081 if (!PL_utf8_alpha)
e24b16f9 1082 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1083 return swash_fetch(PL_utf8_alpha, p);
1084}
1085
1086bool
b8c5462f
JH
1087Perl_is_utf8_ascii(pTHX_ U8 *p)
1088{
386d01d6
GS
1089 if (!is_utf8_char(p))
1090 return FALSE;
b8c5462f
JH
1091 if (!PL_utf8_ascii)
1092 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1093 return swash_fetch(PL_utf8_ascii, p);
1094}
1095
1096bool
864dbfa3 1097Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1098{
386d01d6
GS
1099 if (!is_utf8_char(p))
1100 return FALSE;
a0ed51b3 1101 if (!PL_utf8_space)
3bec3564 1102 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1103 return swash_fetch(PL_utf8_space, p);
1104}
1105
1106bool
864dbfa3 1107Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1108{
386d01d6
GS
1109 if (!is_utf8_char(p))
1110 return FALSE;
a0ed51b3 1111 if (!PL_utf8_digit)
e24b16f9 1112 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1113 return swash_fetch(PL_utf8_digit, p);
1114}
1115
1116bool
864dbfa3 1117Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1118{
386d01d6
GS
1119 if (!is_utf8_char(p))
1120 return FALSE;
a0ed51b3 1121 if (!PL_utf8_upper)
e24b16f9 1122 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1123 return swash_fetch(PL_utf8_upper, p);
1124}
1125
1126bool
864dbfa3 1127Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1128{
386d01d6
GS
1129 if (!is_utf8_char(p))
1130 return FALSE;
a0ed51b3 1131 if (!PL_utf8_lower)
e24b16f9 1132 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1133 return swash_fetch(PL_utf8_lower, p);
1134}
1135
1136bool
b8c5462f
JH
1137Perl_is_utf8_cntrl(pTHX_ U8 *p)
1138{
386d01d6
GS
1139 if (!is_utf8_char(p))
1140 return FALSE;
b8c5462f
JH
1141 if (!PL_utf8_cntrl)
1142 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1143 return swash_fetch(PL_utf8_cntrl, p);
1144}
1145
1146bool
1147Perl_is_utf8_graph(pTHX_ U8 *p)
1148{
386d01d6
GS
1149 if (!is_utf8_char(p))
1150 return FALSE;
b8c5462f
JH
1151 if (!PL_utf8_graph)
1152 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1153 return swash_fetch(PL_utf8_graph, p);
1154}
1155
1156bool
864dbfa3 1157Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1158{
386d01d6
GS
1159 if (!is_utf8_char(p))
1160 return FALSE;
a0ed51b3 1161 if (!PL_utf8_print)
e24b16f9 1162 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1163 return swash_fetch(PL_utf8_print, p);
1164}
1165
1166bool
b8c5462f
JH
1167Perl_is_utf8_punct(pTHX_ U8 *p)
1168{
386d01d6
GS
1169 if (!is_utf8_char(p))
1170 return FALSE;
b8c5462f
JH
1171 if (!PL_utf8_punct)
1172 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1173 return swash_fetch(PL_utf8_punct, p);
1174}
1175
1176bool
1177Perl_is_utf8_xdigit(pTHX_ U8 *p)
1178{
386d01d6
GS
1179 if (!is_utf8_char(p))
1180 return FALSE;
b8c5462f
JH
1181 if (!PL_utf8_xdigit)
1182 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1183 return swash_fetch(PL_utf8_xdigit, p);
1184}
1185
1186bool
864dbfa3 1187Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1188{
386d01d6
GS
1189 if (!is_utf8_char(p))
1190 return FALSE;
a0ed51b3 1191 if (!PL_utf8_mark)
e24b16f9 1192 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
a0ed51b3
LW
1193 return swash_fetch(PL_utf8_mark, p);
1194}
1195
2104c8d9 1196UV
864dbfa3 1197Perl_to_utf8_upper(pTHX_ U8 *p)
a0ed51b3
LW
1198{
1199 UV uv;
1200
1201 if (!PL_utf8_toupper)
e24b16f9 1202 PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
a0ed51b3 1203 uv = swash_fetch(PL_utf8_toupper, p);
9041c2e3 1204 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3
LW
1205}
1206
2104c8d9 1207UV
864dbfa3 1208Perl_to_utf8_title(pTHX_ U8 *p)
a0ed51b3
LW
1209{
1210 UV uv;
1211
1212 if (!PL_utf8_totitle)
e24b16f9 1213 PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
a0ed51b3 1214 uv = swash_fetch(PL_utf8_totitle, p);
9041c2e3 1215 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3
LW
1216}
1217
2104c8d9 1218UV
864dbfa3 1219Perl_to_utf8_lower(pTHX_ U8 *p)
a0ed51b3
LW
1220{
1221 UV uv;
1222
1223 if (!PL_utf8_tolower)
e24b16f9 1224 PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
a0ed51b3 1225 uv = swash_fetch(PL_utf8_tolower, p);
9041c2e3 1226 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
a0ed51b3
LW
1227}
1228
1229/* a "swash" is a swatch hash */
1230
1231SV*
864dbfa3 1232Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3
LW
1233{
1234 SV* retval;
bf1fed83 1235 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1236 dSP;
1b026014 1237 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
ce3b816e 1238
1b026014 1239 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e
GS
1240 ENTER;
1241 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1242 LEAVE;
1243 }
1244 SPAGAIN;
a0ed51b3
LW
1245 PUSHSTACKi(PERLSI_MAGIC);
1246 PUSHMARK(SP);
1247 EXTEND(SP,5);
1248 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1249 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1250 PUSHs(listsv);
1251 PUSHs(sv_2mortal(newSViv(minbits)));
1252 PUSHs(sv_2mortal(newSViv(none)));
1253 PUTBACK;
1254 ENTER;
1255 SAVEI32(PL_hints);
1256 PL_hints = 0;
1257 save_re_context();
bf1fed83
JH
1258 if (PL_curcop == &PL_compiling)
1259 /* XXX ought to be handled by lex_start */
1260 sv_setpv(tokenbufsv, PL_tokenbuf);
864dbfa3 1261 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1262 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1263 else
e24b16f9 1264 retval = &PL_sv_undef;
a0ed51b3
LW
1265 LEAVE;
1266 POPSTACK;
e24b16f9 1267 if (PL_curcop == &PL_compiling) {
bf1fed83
JH
1268 STRLEN len;
1269 char* pv = SvPV(tokenbufsv, len);
1270
1271 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1272 PL_curcop->op_private = PL_hints;
a0ed51b3
LW
1273 }
1274 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 1275 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3
LW
1276 return retval;
1277}
1278
1279UV
864dbfa3 1280Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
a0ed51b3
LW
1281{
1282 HV* hv = (HV*)SvRV(sv);
7d85a32c
JH
1283 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1284 then the "swatch" is a vec() for al the chars which start
1285 with 0xAA..0xYY
1286 So the key in the hash is length of encoded char -1
1287 */
a0ed51b3 1288 U32 klen = UTF8SKIP(ptr) - 1;
7d85a32c 1289 U32 off = ptr[klen];
a0ed51b3 1290 STRLEN slen;
7d85a32c 1291 STRLEN needents;
dfe13c55 1292 U8 *tmps;
a0ed51b3
LW
1293 U32 bit;
1294 SV *retval;
1295
7d85a32c
JH
1296 if (klen == 0)
1297 {
1298 /* If char in invariant then swatch is for all the invariant chars
1299 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1300 */
1301 needents = UTF_CONTINUATION_MARK;
1302 off = NATIVE_TO_UTF(ptr[klen]);
1303 }
1304 else
1305 {
1306 /* If char is encoded then swatch is for the prefix */
1307 needents = (1 << UTF_ACCUMULATION_SHIFT);
1308 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1309 }
1310
a0ed51b3
LW
1311 /*
1312 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1313 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1314 * it's nothing to sniff at.) Pity we usually come through at least
1315 * two function calls to get here...
1316 *
1317 * NB: this code assumes that swatches are never modified, once generated!
1318 */
1319
1320 if (hv == PL_last_swash_hv &&
1321 klen == PL_last_swash_klen &&
12ae5dfc 1322 (!klen || memEQ((char *)ptr,(char *)PL_last_swash_key,klen)) )
a0ed51b3
LW
1323 {
1324 tmps = PL_last_swash_tmps;
1325 slen = PL_last_swash_slen;
1326 }
1327 else {
1328 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1329 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3
LW
1330
1331 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1332 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1333 dSP;
2b9d42f0
NIS
1334 /* We use utf8n_to_uvuni() as we want an index into
1335 Unicode tables, not a native character number.
1336 */
1337 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
a0ed51b3
LW
1338 ENTER;
1339 SAVETMPS;
1340 save_re_context();
1341 PUSHSTACKi(PERLSI_MAGIC);
1342 PUSHMARK(SP);
1343 EXTEND(SP,3);
1344 PUSHs((SV*)sv);
ffbc6a93
JH
1345 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1346 PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0)));
a0ed51b3
LW
1347 PUSHs(sv_2mortal(newSViv(needents)));
1348 PUTBACK;
864dbfa3 1349 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1350 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1351 else
e24b16f9 1352 retval = &PL_sv_undef;
a0ed51b3
LW
1353 POPSTACK;
1354 FREETMPS;
1355 LEAVE;
e24b16f9
GS
1356 if (PL_curcop == &PL_compiling)
1357 PL_curcop->op_private = PL_hints;
a0ed51b3 1358
dfe13c55 1359 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1360
7d85a32c 1361 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1362 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1363 }
1364
1365 PL_last_swash_hv = hv;
1366 PL_last_swash_klen = klen;
1367 PL_last_swash_tmps = tmps;
1368 PL_last_swash_slen = slen;
1369 if (klen)
1370 Copy(ptr, PL_last_swash_key, klen, U8);
1371 }
1372
9faf8d75 1373 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1374 case 1:
1375 bit = 1 << (off & 7);
1376 off >>= 3;
1377 return (tmps[off] & bit) != 0;
1378 case 8:
1379 return tmps[off];
1380 case 16:
1381 off <<= 1;
1382 return (tmps[off] << 8) + tmps[off + 1] ;
1383 case 32:
1384 off <<= 2;
1385 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1386 }
cea2e8a9 1387 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1388 return 0;
1389}
2b9d42f0
NIS
1390
1391
1392/*
1393=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
1394
1395Adds the UTF8 representation of the Native codepoint C<uv> to the end
1396of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1397bytes available. The return value is the pointer to the byte after the
1398end of the new character. In other words,
1399
1400 d = uvchr_to_utf8(d, uv);
1401
1402is the recommended wide native character-aware way of saying
1403
1404 *(d++) = uv;
1405
1406=cut
1407*/
1408
1409/* On ASCII machines this is normally a macro but we want a
1410 real function in case XS code wants it
1411*/
1412#undef Perl_uvchr_to_utf8
1413U8 *
1414Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1415{
1416 return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1417}
1418
1419
1420/*
1421=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
1422
1423Returns the native character value of the first character in the string C<s>
1424which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1425length, in bytes, of that character.
1426
1427Allows length and flags to be passed to low level routine.
1428
1429=cut
1430*/
1431/* On ASCII machines this is normally a macro but we want a
1432 real function in case XS code wants it
1433*/
1434#undef Perl_utf8n_to_uvchr
1435UV
1436Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
1437{
1438 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1439 return UNI_TO_NATIVE(uv);
1440}
1441
1442