This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: DRAFT perlpacktut.pod v0.0
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
bc89e66f 3 * Copyright (c) 1998-2001, Larry Wall
a0ed51b3
LW
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3
LW
25#include "perl.h"
26
27/* Unicode support */
28
eebe1485 29/*
37607a96 30=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
eebe1485
SC
31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
9041c2e3 35end of the new character. In other words,
eebe1485 36
9041c2e3 37 d = uvuni_to_utf8(d, uv);
eebe1485
SC
38
39is the recommended Unicode-aware way of saying
40
41 *(d++) = uv;
42
43=cut
44*/
45
dfe13c55 46U8 *
9041c2e3 47Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
a0ed51b3 48{
c4d5f83a
NIS
49 if (UNI_IS_INVARIANT(uv)) {
50 *d++ = UTF_TO_NATIVE(uv);
a0ed51b3
LW
51 return d;
52 }
1d72bdf6
NIS
53#if defined(EBCDIC) || 1 /* always for testing */
54 else {
55 STRLEN len = UNISKIP(uv);
56 U8 *p = d+len-1;
57 while (p > d) {
58 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
59 uv >>= UTF_ACCUMULATION_SHIFT;
60 }
61 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
62 return d+len;
63 }
64#else /* Non loop style */
a0ed51b3
LW
65 if (uv < 0x800) {
66 *d++ = (( uv >> 6) | 0xc0);
67 *d++ = (( uv & 0x3f) | 0x80);
68 return d;
69 }
70 if (uv < 0x10000) {
71 *d++ = (( uv >> 12) | 0xe0);
72 *d++ = (((uv >> 6) & 0x3f) | 0x80);
73 *d++ = (( uv & 0x3f) | 0x80);
74 return d;
75 }
76 if (uv < 0x200000) {
77 *d++ = (( uv >> 18) | 0xf0);
78 *d++ = (((uv >> 12) & 0x3f) | 0x80);
79 *d++ = (((uv >> 6) & 0x3f) | 0x80);
80 *d++ = (( uv & 0x3f) | 0x80);
81 return d;
82 }
83 if (uv < 0x4000000) {
84 *d++ = (( uv >> 24) | 0xf8);
85 *d++ = (((uv >> 18) & 0x3f) | 0x80);
86 *d++ = (((uv >> 12) & 0x3f) | 0x80);
87 *d++ = (((uv >> 6) & 0x3f) | 0x80);
88 *d++ = (( uv & 0x3f) | 0x80);
89 return d;
90 }
91 if (uv < 0x80000000) {
92 *d++ = (( uv >> 30) | 0xfc);
93 *d++ = (((uv >> 24) & 0x3f) | 0x80);
94 *d++ = (((uv >> 18) & 0x3f) | 0x80);
95 *d++ = (((uv >> 12) & 0x3f) | 0x80);
96 *d++ = (((uv >> 6) & 0x3f) | 0x80);
97 *d++ = (( uv & 0x3f) | 0x80);
98 return d;
99 }
6b8eaf93 100#ifdef HAS_QUAD
d7578b48 101 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
102#endif
103 {
104 *d++ = 0xfe; /* Can't match U+FEFF! */
105 *d++ = (((uv >> 30) & 0x3f) | 0x80);
106 *d++ = (((uv >> 24) & 0x3f) | 0x80);
107 *d++ = (((uv >> 18) & 0x3f) | 0x80);
108 *d++ = (((uv >> 12) & 0x3f) | 0x80);
109 *d++ = (((uv >> 6) & 0x3f) | 0x80);
110 *d++ = (( uv & 0x3f) | 0x80);
111 return d;
112 }
6b8eaf93 113#ifdef HAS_QUAD
a0ed51b3
LW
114 {
115 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b
GS
116 *d++ = 0x80; /* 6 Reserved bits */
117 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
118 *d++ = (((uv >> 54) & 0x3f) | 0x80);
119 *d++ = (((uv >> 48) & 0x3f) | 0x80);
120 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3
LW
121 *d++ = (((uv >> 36) & 0x3f) | 0x80);
122 *d++ = (((uv >> 30) & 0x3f) | 0x80);
123 *d++ = (((uv >> 24) & 0x3f) | 0x80);
124 *d++ = (((uv >> 18) & 0x3f) | 0x80);
125 *d++ = (((uv >> 12) & 0x3f) | 0x80);
126 *d++ = (((uv >> 6) & 0x3f) | 0x80);
127 *d++ = (( uv & 0x3f) | 0x80);
128 return d;
129 }
130#endif
1d72bdf6 131#endif /* Loop style */
a0ed51b3
LW
132}
133
9041c2e3
NIS
134
135
136/*
eebe1485
SC
137=for apidoc A|STRLEN|is_utf8_char|U8 *s
138
5da9da9e 139Tests if some arbitrary number of bytes begins in a valid UTF-8
1d72bdf6 140character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
5da9da9e
JH
141The actual number of bytes in the UTF-8 character will be returned if
142it is valid, otherwise 0.
9041c2e3 143
37607a96
PK
144=cut
145*/
067a85ef 146STRLEN
386d01d6
GS
147Perl_is_utf8_char(pTHX_ U8 *s)
148{
149 U8 u = *s;
067a85ef
A
150 STRLEN slen, len;
151 UV uv, ouv;
386d01d6 152
1d72bdf6 153 if (UTF8_IS_INVARIANT(u))
386d01d6
GS
154 return 1;
155
60006e79 156 if (!UTF8_IS_START(u))
386d01d6
GS
157 return 0;
158
9f07fdcd 159 len = UTF8SKIP(s);
386d01d6 160
60006e79 161 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef
A
162 return 0;
163
386d01d6
GS
164 slen = len - 1;
165 s++;
6eb6869e 166 u &= UTF_START_MASK(len);
1d72bdf6 167 uv = u;
067a85ef 168 ouv = uv;
386d01d6 169 while (slen--) {
60006e79 170 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 171 return 0;
8850bf83 172 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 173 if (uv < ouv)
067a85ef
A
174 return 0;
175 ouv = uv;
386d01d6
GS
176 s++;
177 }
067a85ef 178
5bbb0b5a 179 if (UNISKIP(uv) < len)
067a85ef
A
180 return 0;
181
386d01d6
GS
182 return len;
183}
184
6662521e 185/*
eebe1485 186=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 187
5da9da9e
JH
188Returns true if first C<len> bytes of the given string form a valid UTF8
189string, false otherwise. Note that 'a valid UTF8 string' does not mean
190'a string that contains UTF8' because a valid ASCII string is a valid
191UTF8 string.
6662521e
GS
192
193=cut
194*/
195
8e84507e 196bool
6662521e
GS
197Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
198{
067a85ef 199 U8* x = s;
1aa99e6b 200 U8* send;
067a85ef
A
201 STRLEN c;
202
1aa99e6b 203 if (!len)
6cd5fe39 204 len = strlen((char *)s);
1aa99e6b
IH
205 send = s + len;
206
6662521e
GS
207 while (x < send) {
208 c = is_utf8_char(x);
067a85ef
A
209 if (!c)
210 return FALSE;
6662521e 211 x += c;
6662521e 212 }
60006e79
JH
213 if (x != send)
214 return FALSE;
067a85ef
A
215
216 return TRUE;
6662521e
GS
217}
218
67e989fb 219/*
9041c2e3 220=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 221
9041c2e3
NIS
222Bottom level UTF-8 decode routine.
223Returns the unicode code point value of the first character in the string C<s>
ba210ebe 224which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 225C<retlen> will be set to the length, in bytes, of that character.
67e989fb
JH
226
227If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880
JH
228is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
229it is assumed that the caller will raise a warning, and this function
28d3d195
JH
230will silently just set C<retlen> to C<-1> and return zero. If the
231C<flags> does not contain UTF8_CHECK_ONLY, warnings about
232malformations will be given, C<retlen> will be set to the expected
233length of the UTF-8 character in bytes, and zero will be returned.
234
235The C<flags> can also contain various flags to allow deviations from
236the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 237
9041c2e3
NIS
238Most code should use utf8_to_uvchr() rather than call this directly.
239
37607a96
PK
240=cut
241*/
67e989fb 242
a0ed51b3 243UV
37607a96 244Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 245{
9c5ffd7c 246 UV uv = *s, ouv = 0;
ba210ebe
JH
247 STRLEN len = 1;
248 bool dowarn = ckWARN_d(WARN_UTF8);
249 STRLEN expectlen = 0;
a0dbb045
JH
250 U32 warning = 0;
251
252/* This list is a superset of the UTF8_ALLOW_XXX. */
253
254#define UTF8_WARN_EMPTY 1
255#define UTF8_WARN_CONTINUATION 2
256#define UTF8_WARN_NON_CONTINUATION 3
257#define UTF8_WARN_FE_FF 4
258#define UTF8_WARN_SHORT 5
259#define UTF8_WARN_OVERFLOW 6
260#define UTF8_WARN_SURROGATE 7
261#define UTF8_WARN_BOM 8
262#define UTF8_WARN_LONG 9
263#define UTF8_WARN_FFFF 10
264
265 if (curlen == 0 &&
266 !(flags & UTF8_ALLOW_EMPTY)) {
267 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
268 goto malformed;
269 }
270
1d72bdf6 271 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
272 if (retlen)
273 *retlen = 1;
c4d5f83a 274 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 275 }
67e989fb 276
421a8bf2 277 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 278 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 279 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
280 goto malformed;
281 }
282
421a8bf2 283 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 284 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 285 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
286 goto malformed;
287 }
9041c2e3 288
1d72bdf6 289#ifdef EBCDIC
75383841 290 uv = NATIVE_TO_UTF(uv);
1d72bdf6 291#else
fcc8fcf6
JH
292 if ((uv == 0xfe || uv == 0xff) &&
293 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 294 warning = UTF8_WARN_FE_FF;
ba210ebe 295 goto malformed;
a0ed51b3 296 }
1d72bdf6
NIS
297#endif
298
ba210ebe
JH
299 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
300 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
301 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
302 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
303#ifdef EBCDIC
304 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
305 else { len = 7; uv &= 0x01; }
306#else
ba210ebe
JH
307 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
308 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
309 else { len = 13; uv = 0; } /* whoa! */
310#endif
311
a0ed51b3
LW
312 if (retlen)
313 *retlen = len;
9041c2e3 314
ba210ebe
JH
315 expectlen = len;
316
fcc8fcf6
JH
317 if ((curlen < expectlen) &&
318 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 319 warning = UTF8_WARN_SHORT;
ba210ebe
JH
320 goto malformed;
321 }
322
323 len--;
a0ed51b3 324 s++;
ba210ebe
JH
325 ouv = uv;
326
a0ed51b3 327 while (len--) {
421a8bf2
JH
328 if (!UTF8_IS_CONTINUATION(*s) &&
329 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
330 s--;
331 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 332 goto malformed;
a0ed51b3
LW
333 }
334 else
8850bf83 335 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
336 if (!(uv > ouv)) {
337 /* These cannot be allowed. */
338 if (uv == ouv) {
339 if (!(flags & UTF8_ALLOW_LONG)) {
340 warning = UTF8_WARN_LONG;
341 goto malformed;
342 }
343 }
344 else { /* uv < ouv */
345 /* This cannot be allowed. */
346 warning = UTF8_WARN_OVERFLOW;
347 goto malformed;
348 }
ba210ebe
JH
349 }
350 s++;
351 ouv = uv;
352 }
353
421a8bf2 354 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 355 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 356 warning = UTF8_WARN_SURROGATE;
ba210ebe 357 goto malformed;
421a8bf2 358 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 359 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 360 warning = UTF8_WARN_BOM;
ba210ebe 361 goto malformed;
fcc8fcf6
JH
362 } else if ((expectlen > UNISKIP(uv)) &&
363 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 364 warning = UTF8_WARN_LONG;
ba210ebe 365 goto malformed;
421a8bf2 366 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 367 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 368 warning = UTF8_WARN_FFFF;
a9917092 369 goto malformed;
a0ed51b3 370 }
ba210ebe 371
a0ed51b3 372 return uv;
ba210ebe
JH
373
374malformed:
375
fcc8fcf6 376 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 377 if (retlen)
cc366d4b 378 *retlen = -1;
ba210ebe
JH
379 return 0;
380 }
381
a0dbb045
JH
382 if (dowarn) {
383 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
384
385 switch (warning) {
386 case 0: /* Intentionally empty. */ break;
387 case UTF8_WARN_EMPTY:
388 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
389 break;
390 case UTF8_WARN_CONTINUATION:
391 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
392 break;
393 case UTF8_WARN_NON_CONTINUATION:
394 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
395 (UV)s[1], uv);
396 break;
397 case UTF8_WARN_FE_FF:
398 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
399 break;
400 case UTF8_WARN_SHORT:
401 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
402 curlen, curlen == 1 ? "" : "s", expectlen);
403 break;
404 case UTF8_WARN_OVERFLOW:
405 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
406 ouv, *s);
407 break;
408 case UTF8_WARN_SURROGATE:
409 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
410 break;
411 case UTF8_WARN_BOM:
412 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
413 break;
414 case UTF8_WARN_LONG:
415 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
416 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
417 break;
418 case UTF8_WARN_FFFF:
419 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
420 break;
421 default:
422 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
423 break;
424 }
425
426 if (warning) {
427 char *s = SvPVX(sv);
428
429 if (PL_op)
430 Perl_warner(aTHX_ WARN_UTF8,
53e06cf0 431 "%s in %s", s, OP_DESC(PL_op));
a0dbb045
JH
432 else
433 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
434 }
435 }
436
ba210ebe 437 if (retlen)
28d3d195 438 *retlen = expectlen ? expectlen : len;
ba210ebe 439
28d3d195 440 return 0;
a0ed51b3
LW
441}
442
8e84507e 443/*
37607a96 444=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3
NIS
445
446Returns the native character value of the first character in the string C<s>
447which is assumed to be in UTF8 encoding; C<retlen> will be set to the
448length, in bytes, of that character.
449
450If C<s> does not point to a well-formed UTF8 character, zero is
451returned and retlen is set, if possible, to -1.
452
453=cut
454*/
455
456UV
37607a96 457Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3
NIS
458{
459 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
460}
461
462/*
37607a96 463=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3
NIS
464
465Returns the Unicode code point of the first character in the string C<s>
466which is assumed to be in UTF8 encoding; C<retlen> will be set to the
467length, in bytes, of that character.
468
469This function should only be used when returned UV is considered
470an index into the Unicode semantic tables (e.g. swashes).
471
ba210ebe
JH
472If C<s> does not point to a well-formed UTF8 character, zero is
473returned and retlen is set, if possible, to -1.
8e84507e
NIS
474
475=cut
476*/
477
478UV
37607a96 479Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 480{
9041c2e3
NIS
481 /* Call the low level routine asking for checks */
482 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e
NIS
483}
484
b76347f2 485/*
37607a96 486=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2
JH
487
488Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
489Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
490up past C<e>, croaks.
b76347f2
JH
491
492=cut
493*/
494
495STRLEN
37607a96 496Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2
JH
497{
498 STRLEN len = 0;
499
8850bf83
JH
500 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
501 * the bitops (especially ~) can create illegal UTF-8.
502 * In other words: in Perl UTF-8 is not just for Unicode. */
503
b76347f2 504 if (e < s)
02eb7b47 505 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
b76347f2 506 while (s < e) {
02eb7b47 507 U8 t = UTF8SKIP(s);
b76347f2
JH
508
509 if (e - s < t)
0064a8a9 510 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
b76347f2
JH
511 s += t;
512 len++;
513 }
514
515 return len;
516}
517
b06226ff 518/*
eebe1485 519=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff
JH
520
521Returns the number of UTF8 characters between the UTF-8 pointers C<a>
522and C<b>.
523
524WARNING: use only if you *know* that the pointers point inside the
525same UTF-8 buffer.
526
37607a96
PK
527=cut
528*/
a0ed51b3 529
02eb7b47 530IV
864dbfa3 531Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 532{
02eb7b47
JH
533 IV off = 0;
534
8850bf83
JH
535 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
536 * the bitops (especially ~) can create illegal UTF-8.
537 * In other words: in Perl UTF-8 is not just for Unicode. */
538
a0ed51b3
LW
539 if (a < b) {
540 while (a < b) {
02eb7b47
JH
541 U8 c = UTF8SKIP(a);
542
543 if (b - a < c)
544 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
545 a += c;
a0ed51b3
LW
546 off--;
547 }
548 }
549 else {
550 while (b < a) {
02eb7b47
JH
551 U8 c = UTF8SKIP(b);
552
553 if (a - b < c)
554 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
555 b += c;
a0ed51b3
LW
556 off++;
557 }
558 }
02eb7b47 559
a0ed51b3
LW
560 return off;
561}
562
b06226ff 563/*
37607a96 564=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 565
8850bf83
JH
566Return the UTF-8 pointer C<s> displaced by C<off> characters, either
567forward or backward.
b06226ff
JH
568
569WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
570the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
571on the first byte of character or just after the last byte of a character.
b06226ff 572
37607a96
PK
573=cut
574*/
a0ed51b3
LW
575
576U8 *
864dbfa3 577Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 578{
8850bf83
JH
579 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
580 * the bitops (especially ~) can create illegal UTF-8.
581 * In other words: in Perl UTF-8 is not just for Unicode. */
582
a0ed51b3
LW
583 if (off >= 0) {
584 while (off--)
585 s += UTF8SKIP(s);
586 }
587 else {
588 while (off++) {
589 s--;
8850bf83
JH
590 while (UTF8_IS_CONTINUATION(*s))
591 s--;
a0ed51b3
LW
592 }
593 }
594 return s;
595}
596
6940069f 597/*
eebe1485 598=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 599
246fae53
MG
600Converts a string C<s> of length C<len> from UTF8 into byte encoding.
601Unlike C<bytes_to_utf8>, this over-writes the original string, and
602updates len to contain the new length.
67e989fb 603Returns zero on failure, setting C<len> to -1.
6940069f
GS
604
605=cut
606*/
607
608U8 *
37607a96 609Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 610{
6940069f
GS
611 U8 *send;
612 U8 *d;
dcad2880 613 U8 *save = s;
246fae53
MG
614
615 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880
JH
616 for (send = s + *len; s < send; ) {
617 U8 c = *s++;
618
1d72bdf6
NIS
619 if (!UTF8_IS_INVARIANT(c) &&
620 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
621 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
622 *len = -1;
623 return 0;
624 }
246fae53 625 }
dcad2880
JH
626
627 d = s = save;
6940069f 628 while (s < send) {
ed646e6e 629 STRLEN ulen;
9041c2e3 630 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 631 s += ulen;
6940069f
GS
632 }
633 *d = '\0';
246fae53 634 *len = d - save;
6940069f
GS
635 return save;
636}
637
638/*
f9a63242
JH
639=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
640
641Converts a string C<s> of length C<len> from UTF8 into byte encoding.
642Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
643the newly-created string, and updates C<len> to contain the new
644length. Returns the original string if no conversion occurs, C<len>
645is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6460 if C<s> is converted or contains all 7bit characters.
f9a63242 647
37607a96
PK
648=cut
649*/
f9a63242
JH
650
651U8 *
37607a96 652Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 653{
f9a63242
JH
654 U8 *d;
655 U8 *start = s;
db42d148 656 U8 *send;
f9a63242
JH
657 I32 count = 0;
658
659 if (!*is_utf8)
660 return start;
661
ef9edfd0 662 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242
JH
663 for (send = s + *len; s < send;) {
664 U8 c = *s++;
1d72bdf6 665 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
666 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
667 (c = *s++) && UTF8_IS_CONTINUATION(c))
668 count++;
669 else
f9a63242 670 return start;
db42d148 671 }
f9a63242
JH
672 }
673
674 *is_utf8 = 0;
675
f9a63242 676 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 677 s = start; start = d;
f9a63242
JH
678 while (s < send) {
679 U8 c = *s++;
c4d5f83a
NIS
680 if (!UTF8_IS_INVARIANT(c)) {
681 /* Then it is two-byte encoded */
682 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
683 c = ASCII_TO_NATIVE(c);
684 }
685 *d++ = c;
f9a63242
JH
686 }
687 *d = '\0';
688 *len = d - start;
689 return start;
690}
691
692/*
eebe1485 693=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f
GS
694
695Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e
GS
696Returns a pointer to the newly-created string, and sets C<len> to
697reflect the new length.
6940069f 698
497711e7 699=cut
6940069f
GS
700*/
701
702U8*
37607a96 703Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 704{
6940069f
GS
705 U8 *send;
706 U8 *d;
707 U8 *dst;
6662521e 708 send = s + (*len);
6940069f 709
6662521e 710 Newz(801, d, (*len) * 2 + 1, U8);
6940069f
GS
711 dst = d;
712
713 while (s < send) {
db42d148 714 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a
NIS
715 if (UNI_IS_INVARIANT(uv))
716 *d++ = UTF_TO_NATIVE(uv);
6940069f 717 else {
90f44359
JH
718 *d++ = UTF8_EIGHT_BIT_HI(uv);
719 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
720 }
721 }
722 *d = '\0';
6662521e 723 *len = d-dst;
6940069f
GS
724 return dst;
725}
726
a0ed51b3 727/*
dea0fc0b 728 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
729 *
730 * Destination must be pre-extended to 3/2 source. Do not use in-place.
731 * We optimize for native, for obvious reasons. */
732
733U8*
dea0fc0b 734Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 735{
dea0fc0b
JH
736 U8* pend;
737 U8* dstart = d;
738
739 if (bytelen & 1)
a7867d0a 740 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b
JH
741
742 pend = p + bytelen;
743
a0ed51b3 744 while (p < pend) {
dea0fc0b
JH
745 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
746 p += 2;
a0ed51b3
LW
747 if (uv < 0x80) {
748 *d++ = uv;
749 continue;
750 }
751 if (uv < 0x800) {
752 *d++ = (( uv >> 6) | 0xc0);
753 *d++ = (( uv & 0x3f) | 0x80);
754 continue;
755 }
756 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b
JH
757 UV low = *p++;
758 if (low < 0xdc00 || low >= 0xdfff)
759 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
760 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
761 }
762 if (uv < 0x10000) {
763 *d++ = (( uv >> 12) | 0xe0);
764 *d++ = (((uv >> 6) & 0x3f) | 0x80);
765 *d++ = (( uv & 0x3f) | 0x80);
766 continue;
767 }
768 else {
769 *d++ = (( uv >> 18) | 0xf0);
770 *d++ = (((uv >> 12) & 0x3f) | 0x80);
771 *d++ = (((uv >> 6) & 0x3f) | 0x80);
772 *d++ = (( uv & 0x3f) | 0x80);
773 continue;
774 }
775 }
dea0fc0b 776 *newlen = d - dstart;
a0ed51b3
LW
777 return d;
778}
779
780/* Note: this one is slightly destructive of the source. */
781
782U8*
dea0fc0b 783Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
784{
785 U8* s = (U8*)p;
786 U8* send = s + bytelen;
787 while (s < send) {
788 U8 tmp = s[0];
789 s[0] = s[1];
790 s[1] = tmp;
791 s += 2;
792 }
dea0fc0b 793 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
794}
795
796/* for now these are all defined (inefficiently) in terms of the utf8 versions */
797
798bool
864dbfa3 799Perl_is_uni_alnum(pTHX_ U32 c)
a0ed51b3 800{
ad391ad9 801 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 802 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
803 return is_utf8_alnum(tmpbuf);
804}
805
806bool
b8c5462f
JH
807Perl_is_uni_alnumc(pTHX_ U32 c)
808{
ad391ad9 809 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 810 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
811 return is_utf8_alnumc(tmpbuf);
812}
813
814bool
864dbfa3 815Perl_is_uni_idfirst(pTHX_ U32 c)
a0ed51b3 816{
ad391ad9 817 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 818 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
819 return is_utf8_idfirst(tmpbuf);
820}
821
822bool
864dbfa3 823Perl_is_uni_alpha(pTHX_ U32 c)
a0ed51b3 824{
ad391ad9 825 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 826 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
827 return is_utf8_alpha(tmpbuf);
828}
829
830bool
4d61ec05
GS
831Perl_is_uni_ascii(pTHX_ U32 c)
832{
ad391ad9 833 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 834 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05
GS
835 return is_utf8_ascii(tmpbuf);
836}
837
838bool
864dbfa3 839Perl_is_uni_space(pTHX_ U32 c)
a0ed51b3 840{
ad391ad9 841 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 842 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
843 return is_utf8_space(tmpbuf);
844}
845
846bool
864dbfa3 847Perl_is_uni_digit(pTHX_ U32 c)
a0ed51b3 848{
ad391ad9 849 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 850 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
851 return is_utf8_digit(tmpbuf);
852}
853
854bool
864dbfa3 855Perl_is_uni_upper(pTHX_ U32 c)
a0ed51b3 856{
ad391ad9 857 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 858 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
859 return is_utf8_upper(tmpbuf);
860}
861
862bool
864dbfa3 863Perl_is_uni_lower(pTHX_ U32 c)
a0ed51b3 864{
ad391ad9 865 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 866 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
867 return is_utf8_lower(tmpbuf);
868}
869
870bool
b8c5462f
JH
871Perl_is_uni_cntrl(pTHX_ U32 c)
872{
ad391ad9 873 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 874 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
875 return is_utf8_cntrl(tmpbuf);
876}
877
878bool
879Perl_is_uni_graph(pTHX_ U32 c)
880{
ad391ad9 881 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 882 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
883 return is_utf8_graph(tmpbuf);
884}
885
886bool
864dbfa3 887Perl_is_uni_print(pTHX_ U32 c)
a0ed51b3 888{
ad391ad9 889 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 890 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
891 return is_utf8_print(tmpbuf);
892}
893
b8c5462f 894bool
f248d071 895Perl_is_uni_punct(pTHX_ U32 c)
b8c5462f 896{
ad391ad9 897 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 898 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
899 return is_utf8_punct(tmpbuf);
900}
901
4d61ec05
GS
902bool
903Perl_is_uni_xdigit(pTHX_ U32 c)
904{
a2a2844f 905 U8 tmpbuf[UTF8_MAXLEN*2+1];
2b9d42f0 906 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05
GS
907 return is_utf8_xdigit(tmpbuf);
908}
909
a0ed51b3 910U32
a2a2844f 911Perl_to_uni_upper(pTHX_ U32 c, U8* p, STRLEN *lenp)
a0ed51b3 912{
a2a2844f 913 U8 tmpbuf[UTF8_MAXLEN*2+1];
2b9d42f0 914 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 915 return to_utf8_upper(tmpbuf, p, lenp);
a0ed51b3
LW
916}
917
918U32
a2a2844f 919Perl_to_uni_title(pTHX_ U32 c, U8* p, STRLEN *lenp)
a0ed51b3 920{
a2a2844f 921 U8 tmpbuf[UTF8_MAXLEN*2+1];
2b9d42f0 922 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 923 return to_utf8_title(tmpbuf, p, lenp);
a0ed51b3
LW
924}
925
926U32
a2a2844f 927Perl_to_uni_lower(pTHX_ U32 c, U8* p, STRLEN *lenp)
a0ed51b3 928{
ad391ad9 929 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 930 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 931 return to_utf8_lower(tmpbuf, p, lenp);
a0ed51b3
LW
932}
933
934/* for now these all assume no locale info available for Unicode > 255 */
935
936bool
864dbfa3 937Perl_is_uni_alnum_lc(pTHX_ U32 c)
a0ed51b3
LW
938{
939 return is_uni_alnum(c); /* XXX no locale support yet */
940}
941
942bool
b8c5462f
JH
943Perl_is_uni_alnumc_lc(pTHX_ U32 c)
944{
945 return is_uni_alnumc(c); /* XXX no locale support yet */
946}
947
948bool
864dbfa3 949Perl_is_uni_idfirst_lc(pTHX_ U32 c)
a0ed51b3
LW
950{
951 return is_uni_idfirst(c); /* XXX no locale support yet */
952}
953
954bool
864dbfa3 955Perl_is_uni_alpha_lc(pTHX_ U32 c)
a0ed51b3
LW
956{
957 return is_uni_alpha(c); /* XXX no locale support yet */
958}
959
960bool
4d61ec05
GS
961Perl_is_uni_ascii_lc(pTHX_ U32 c)
962{
963 return is_uni_ascii(c); /* XXX no locale support yet */
964}
965
966bool
864dbfa3 967Perl_is_uni_space_lc(pTHX_ U32 c)
a0ed51b3
LW
968{
969 return is_uni_space(c); /* XXX no locale support yet */
970}
971
972bool
864dbfa3 973Perl_is_uni_digit_lc(pTHX_ U32 c)
a0ed51b3
LW
974{
975 return is_uni_digit(c); /* XXX no locale support yet */
976}
977
978bool
864dbfa3 979Perl_is_uni_upper_lc(pTHX_ U32 c)
a0ed51b3
LW
980{
981 return is_uni_upper(c); /* XXX no locale support yet */
982}
983
984bool
864dbfa3 985Perl_is_uni_lower_lc(pTHX_ U32 c)
a0ed51b3
LW
986{
987 return is_uni_lower(c); /* XXX no locale support yet */
988}
989
990bool
b8c5462f
JH
991Perl_is_uni_cntrl_lc(pTHX_ U32 c)
992{
993 return is_uni_cntrl(c); /* XXX no locale support yet */
994}
995
996bool
997Perl_is_uni_graph_lc(pTHX_ U32 c)
998{
999 return is_uni_graph(c); /* XXX no locale support yet */
1000}
1001
1002bool
864dbfa3 1003Perl_is_uni_print_lc(pTHX_ U32 c)
a0ed51b3
LW
1004{
1005 return is_uni_print(c); /* XXX no locale support yet */
1006}
1007
b8c5462f
JH
1008bool
1009Perl_is_uni_punct_lc(pTHX_ U32 c)
1010{
1011 return is_uni_punct(c); /* XXX no locale support yet */
1012}
1013
4d61ec05
GS
1014bool
1015Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1016{
1017 return is_uni_xdigit(c); /* XXX no locale support yet */
1018}
1019
a0ed51b3 1020bool
864dbfa3 1021Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1022{
386d01d6
GS
1023 if (!is_utf8_char(p))
1024 return FALSE;
a0ed51b3 1025 if (!PL_utf8_alnum)
289d4f09
ML
1026 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1027 * descendant of isalnum(3), in other words, it doesn't
1028 * contain the '_'. --jhi */
1029 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
3568d838 1030 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3
LW
1031/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1032#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1033 if (!PL_utf8_alnum)
1034 PL_utf8_alnum = swash_init("utf8", "",
1035 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1036 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3
LW
1037#endif
1038}
1039
1040bool
b8c5462f
JH
1041Perl_is_utf8_alnumc(pTHX_ U8 *p)
1042{
386d01d6
GS
1043 if (!is_utf8_char(p))
1044 return FALSE;
b8c5462f
JH
1045 if (!PL_utf8_alnum)
1046 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
3568d838 1047 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f
JH
1048/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1049#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1050 if (!PL_utf8_alnum)
1051 PL_utf8_alnum = swash_init("utf8", "",
1052 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1053 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f
JH
1054#endif
1055}
1056
1057bool
864dbfa3 1058Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3
LW
1059{
1060 return *p == '_' || is_utf8_alpha(p);
1061}
1062
1063bool
864dbfa3 1064Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1065{
386d01d6
GS
1066 if (!is_utf8_char(p))
1067 return FALSE;
a0ed51b3 1068 if (!PL_utf8_alpha)
e24b16f9 1069 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
3568d838 1070 return swash_fetch(PL_utf8_alpha, p, TRUE);
a0ed51b3
LW
1071}
1072
1073bool
b8c5462f
JH
1074Perl_is_utf8_ascii(pTHX_ U8 *p)
1075{
386d01d6
GS
1076 if (!is_utf8_char(p))
1077 return FALSE;
b8c5462f
JH
1078 if (!PL_utf8_ascii)
1079 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
3568d838 1080 return swash_fetch(PL_utf8_ascii, p, TRUE);
b8c5462f
JH
1081}
1082
1083bool
864dbfa3 1084Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1085{
386d01d6
GS
1086 if (!is_utf8_char(p))
1087 return FALSE;
a0ed51b3 1088 if (!PL_utf8_space)
3bec3564 1089 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
3568d838 1090 return swash_fetch(PL_utf8_space, p, TRUE);
a0ed51b3
LW
1091}
1092
1093bool
864dbfa3 1094Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1095{
386d01d6
GS
1096 if (!is_utf8_char(p))
1097 return FALSE;
a0ed51b3 1098 if (!PL_utf8_digit)
e24b16f9 1099 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
3568d838 1100 return swash_fetch(PL_utf8_digit, p, TRUE);
a0ed51b3
LW
1101}
1102
1103bool
864dbfa3 1104Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1105{
386d01d6
GS
1106 if (!is_utf8_char(p))
1107 return FALSE;
a0ed51b3 1108 if (!PL_utf8_upper)
e24b16f9 1109 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
3568d838 1110 return swash_fetch(PL_utf8_upper, p, TRUE);
a0ed51b3
LW
1111}
1112
1113bool
864dbfa3 1114Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1115{
386d01d6
GS
1116 if (!is_utf8_char(p))
1117 return FALSE;
a0ed51b3 1118 if (!PL_utf8_lower)
e24b16f9 1119 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
3568d838 1120 return swash_fetch(PL_utf8_lower, p, TRUE);
a0ed51b3
LW
1121}
1122
1123bool
b8c5462f
JH
1124Perl_is_utf8_cntrl(pTHX_ U8 *p)
1125{
386d01d6
GS
1126 if (!is_utf8_char(p))
1127 return FALSE;
b8c5462f
JH
1128 if (!PL_utf8_cntrl)
1129 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
3568d838 1130 return swash_fetch(PL_utf8_cntrl, p, TRUE);
b8c5462f
JH
1131}
1132
1133bool
1134Perl_is_utf8_graph(pTHX_ U8 *p)
1135{
386d01d6
GS
1136 if (!is_utf8_char(p))
1137 return FALSE;
b8c5462f
JH
1138 if (!PL_utf8_graph)
1139 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
3568d838 1140 return swash_fetch(PL_utf8_graph, p, TRUE);
b8c5462f
JH
1141}
1142
1143bool
864dbfa3 1144Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1145{
386d01d6
GS
1146 if (!is_utf8_char(p))
1147 return FALSE;
a0ed51b3 1148 if (!PL_utf8_print)
e24b16f9 1149 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
3568d838 1150 return swash_fetch(PL_utf8_print, p, TRUE);
a0ed51b3
LW
1151}
1152
1153bool
b8c5462f
JH
1154Perl_is_utf8_punct(pTHX_ U8 *p)
1155{
386d01d6
GS
1156 if (!is_utf8_char(p))
1157 return FALSE;
b8c5462f
JH
1158 if (!PL_utf8_punct)
1159 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
3568d838 1160 return swash_fetch(PL_utf8_punct, p, TRUE);
b8c5462f
JH
1161}
1162
1163bool
1164Perl_is_utf8_xdigit(pTHX_ U8 *p)
1165{
386d01d6
GS
1166 if (!is_utf8_char(p))
1167 return FALSE;
b8c5462f
JH
1168 if (!PL_utf8_xdigit)
1169 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
3568d838 1170 return swash_fetch(PL_utf8_xdigit, p, TRUE);
b8c5462f
JH
1171}
1172
1173bool
864dbfa3 1174Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1175{
386d01d6
GS
1176 if (!is_utf8_char(p))
1177 return FALSE;
a0ed51b3 1178 if (!PL_utf8_mark)
e24b16f9 1179 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
3568d838 1180 return swash_fetch(PL_utf8_mark, p, TRUE);
a0ed51b3
LW
1181}
1182
6b5c0936
JH
1183/*
1184=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1185
1186The "p" contains the pointer to the UTF-8 string encoding
1187the character that is being converted.
1188
1189The "ustrp" is a pointer to the character buffer to put the
1190conversion result to. The "lenp" is a pointer to the length
1191of the result.
1192
1193The "swash" is a pointer to the swash to use.
1194
1195The "normal" is a string like "ToLower" which means the swash
1196$utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1197and loaded by SWASHGET, using lib/utf8_heavy.pl.
1198
1199The "special" is a string like "utf8::ToSpecLower", which means
d1be9408 1200the hash %utf8::ToSpecLower, which is stored in the same file,
6b5c0936
JH
1201lib/unicore/To/Lower.pl, and also loaded by SWASHGET. The access
1202to the hash is by Perl_to_utf8_case().
1203
1204=cut
1205 */
1206
2104c8d9 1207UV
983ffd37 1208Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
a0ed51b3
LW
1209{
1210 UV uv;
1211
983ffd37
JH
1212 if (!*swashp)
1213 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1214 uv = swash_fetch(*swashp, p, TRUE);
1215 if (uv)
1216 uv = UNI_TO_NATIVE(uv);
1217 else {
1218 HV *hv;
1219 SV *keysv;
1220 HE *he;
1221
1222 uv = utf8_to_uvchr(p, 0);
4d7ead49
JH
1223 if (uv <= 0xff)
1224 uv = NATIVE_TO_UTF(uv);
983ffd37
JH
1225
1226 if ((hv = get_hv(special, FALSE)) &&
6d47b937 1227 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
983ffd37
JH
1228 (he = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1229 SV *val = HeVAL(he);
1230 char *s = SvPV(val, *lenp);
1231 U8 c = *(U8*)s;
1232 if (*lenp > 1 || UNI_IS_INVARIANT(c))
1233 Copy(s, ustrp, *lenp, U8);
1234 else {
4d7ead49 1235 c = UTF_TO_NATIVE(c);
983ffd37
JH
1236 /* something in the 0x80..0xFF range */
1237 ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1238 ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1239 *lenp = 2;
1240 }
1241 return 0;
1242 }
1243 }
a2a2844f
JH
1244 *lenp = UNISKIP(uv);
1245 uvuni_to_utf8(ustrp, uv);
1246 return uv;
a0ed51b3
LW
1247}
1248
2104c8d9 1249UV
983ffd37 1250Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1251{
983ffd37 1252 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1253 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1254}
a0ed51b3 1255
983ffd37
JH
1256UV
1257Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1258{
1259 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1260 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1261}
1262
2104c8d9 1263UV
a2a2844f 1264Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1265{
983ffd37 1266 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1267 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1268}
1269
1270UV
1271Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1272{
1273 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1274 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1275}
1276
1277/* a "swash" is a swatch hash */
1278
1279SV*
864dbfa3 1280Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3
LW
1281{
1282 SV* retval;
bf1fed83 1283 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1284 dSP;
1b026014 1285 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
f8be5cf0 1286 SV* errsv_save;
ce3b816e 1287
1b026014 1288 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1289 ENTER;
f8be5cf0 1290 errsv_save = newSVsv(ERRSV);
ce3b816e 1291 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
f8be5cf0
JH
1292 if (!SvTRUE(ERRSV))
1293 sv_setsv(ERRSV, errsv_save);
1294 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1295 LEAVE;
1296 }
1297 SPAGAIN;
a0ed51b3
LW
1298 PUSHSTACKi(PERLSI_MAGIC);
1299 PUSHMARK(SP);
1300 EXTEND(SP,5);
1301 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1302 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1303 PUSHs(listsv);
1304 PUSHs(sv_2mortal(newSViv(minbits)));
1305 PUSHs(sv_2mortal(newSViv(none)));
1306 PUTBACK;
1307 ENTER;
1308 SAVEI32(PL_hints);
1309 PL_hints = 0;
1310 save_re_context();
bf1fed83
JH
1311 if (PL_curcop == &PL_compiling)
1312 /* XXX ought to be handled by lex_start */
1313 sv_setpv(tokenbufsv, PL_tokenbuf);
f8be5cf0 1314 errsv_save = newSVsv(ERRSV);
864dbfa3 1315 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1316 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1317 else
e24b16f9 1318 retval = &PL_sv_undef;
f8be5cf0
JH
1319 if (!SvTRUE(ERRSV))
1320 sv_setsv(ERRSV, errsv_save);
1321 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1322 LEAVE;
1323 POPSTACK;
e24b16f9 1324 if (PL_curcop == &PL_compiling) {
bf1fed83
JH
1325 STRLEN len;
1326 char* pv = SvPV(tokenbufsv, len);
1327
1328 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1329 PL_curcop->op_private = PL_hints;
a0ed51b3
LW
1330 }
1331 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 1332 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3
LW
1333 return retval;
1334}
1335
035d37be
JH
1336
1337/* This API is wrong for special case conversions since we may need to
1338 * return several Unicode characters for a single Unicode character
1339 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1340 * the lower-level routine, and it is similarly broken for returning
1341 * multiple values. --jhi */
a0ed51b3 1342UV
3568d838 1343Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3
LW
1344{
1345 HV* hv = (HV*)SvRV(sv);
3568d838
JH
1346 U32 klen;
1347 U32 off;
a0ed51b3 1348 STRLEN slen;
7d85a32c 1349 STRLEN needents;
4ea42e7f 1350 U8 *tmps = NULL;
a0ed51b3
LW
1351 U32 bit;
1352 SV *retval;
3568d838
JH
1353 U8 tmputf8[2];
1354 UV c = NATIVE_TO_ASCII(*ptr);
1355
1356 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1357 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1358 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1359 ptr = tmputf8;
1360 }
1361 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1362 * then the "swatch" is a vec() for al the chars which start
1363 * with 0xAA..0xYY
1364 * So the key in the hash (klen) is length of encoded char -1
1365 */
1366 klen = UTF8SKIP(ptr) - 1;
1367 off = ptr[klen];
a0ed51b3 1368
7d85a32c
JH
1369 if (klen == 0)
1370 {
1371 /* If char in invariant then swatch is for all the invariant chars
1372 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1373 */
1374 needents = UTF_CONTINUATION_MARK;
1375 off = NATIVE_TO_UTF(ptr[klen]);
1376 }
1377 else
1378 {
1379 /* If char is encoded then swatch is for the prefix */
1380 needents = (1 << UTF_ACCUMULATION_SHIFT);
1381 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1382 }
1383
a0ed51b3
LW
1384 /*
1385 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1386 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1387 * it's nothing to sniff at.) Pity we usually come through at least
1388 * two function calls to get here...
1389 *
1390 * NB: this code assumes that swatches are never modified, once generated!
1391 */
1392
3568d838 1393 if (hv == PL_last_swash_hv &&
a0ed51b3 1394 klen == PL_last_swash_klen &&
3568d838 1395 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1396 {
1397 tmps = PL_last_swash_tmps;
1398 slen = PL_last_swash_slen;
1399 }
1400 else {
1401 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1402 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3
LW
1403
1404 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1405 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1406 dSP;
2b9d42f0
NIS
1407 /* We use utf8n_to_uvuni() as we want an index into
1408 Unicode tables, not a native character number.
1409 */
1410 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
f8be5cf0 1411 SV *errsv_save;
a0ed51b3
LW
1412 ENTER;
1413 SAVETMPS;
1414 save_re_context();
1415 PUSHSTACKi(PERLSI_MAGIC);
1416 PUSHMARK(SP);
1417 EXTEND(SP,3);
1418 PUSHs((SV*)sv);
ffbc6a93 1419 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838
JH
1420 PUSHs(sv_2mortal(newSViv((klen) ?
1421 (code_point & ~(needents - 1)) : 0)));
a0ed51b3
LW
1422 PUSHs(sv_2mortal(newSViv(needents)));
1423 PUTBACK;
f8be5cf0 1424 errsv_save = newSVsv(ERRSV);
864dbfa3 1425 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1426 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1427 else
e24b16f9 1428 retval = &PL_sv_undef;
f8be5cf0
JH
1429 if (!SvTRUE(ERRSV))
1430 sv_setsv(ERRSV, errsv_save);
1431 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1432 POPSTACK;
1433 FREETMPS;
1434 LEAVE;
e24b16f9
GS
1435 if (PL_curcop == &PL_compiling)
1436 PL_curcop->op_private = PL_hints;
a0ed51b3 1437
dfe13c55 1438 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1439
7d85a32c 1440 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1441 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1442 }
1443
1444 PL_last_swash_hv = hv;
1445 PL_last_swash_klen = klen;
1446 PL_last_swash_tmps = tmps;
1447 PL_last_swash_slen = slen;
1448 if (klen)
1449 Copy(ptr, PL_last_swash_key, klen, U8);
1450 }
1451
9faf8d75 1452 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1453 case 1:
1454 bit = 1 << (off & 7);
1455 off >>= 3;
1456 return (tmps[off] & bit) != 0;
1457 case 8:
1458 return tmps[off];
1459 case 16:
1460 off <<= 1;
1461 return (tmps[off] << 8) + tmps[off + 1] ;
1462 case 32:
1463 off <<= 2;
1464 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1465 }
cea2e8a9 1466 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1467 return 0;
1468}
2b9d42f0
NIS
1469
1470
1471/*
37607a96 1472=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0
NIS
1473
1474Adds the UTF8 representation of the Native codepoint C<uv> to the end
1475of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1476bytes available. The return value is the pointer to the byte after the
1477end of the new character. In other words,
1478
1479 d = uvchr_to_utf8(d, uv);
1480
1481is the recommended wide native character-aware way of saying
1482
1483 *(d++) = uv;
1484
1485=cut
1486*/
1487
1488/* On ASCII machines this is normally a macro but we want a
1489 real function in case XS code wants it
1490*/
1491#undef Perl_uvchr_to_utf8
1492U8 *
1493Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1494{
1495 return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1496}
1497
1498
1499/*
37607a96 1500=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0
NIS
1501
1502Returns the native character value of the first character in the string C<s>
1503which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1504length, in bytes, of that character.
1505
1506Allows length and flags to be passed to low level routine.
1507
1508=cut
1509*/
1510/* On ASCII machines this is normally a macro but we want a
1511 real function in case XS code wants it
1512*/
1513#undef Perl_utf8n_to_uvchr
1514UV
37607a96 1515Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0
NIS
1516{
1517 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1518 return UNI_TO_NATIVE(uv);
1519}
1520
e6b2e755
JH
1521char *
1522Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1523{
1524 int truncated = 0;
1525 char *s, *e;
1526
1527 sv_setpvn(dsv, "", 0);
1528 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1529 UV u;
1530 if (pvlim && SvCUR(dsv) >= pvlim) {
1531 truncated++;
1532 break;
1533 }
1534 u = utf8_to_uvchr((U8*)s, 0);
1535 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1536 }
1537 if (truncated)
1538 sv_catpvn(dsv, "...", 3);
1539
1540 return SvPVX(dsv);
1541}
2b9d42f0 1542
e6b2e755
JH
1543char *
1544Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1545{
701a277b
JH
1546 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1547 pvlim, flags);
1548}
1549
1550I32
1551Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, const char *s2, bool u2, register I32 len)
1552{
1553 register U8 *a = (U8*)s1;
1554 register U8 *b = (U8*)s2;
1555 STRLEN la, lb;
1556 UV ca, cb;
1557 STRLEN ulen1, ulen2;
1558 U8 tmpbuf1[UTF8_MAXLEN*3+1];
1559 U8 tmpbuf2[UTF8_MAXLEN*3+1];
1560
1561 while (len) {
1562 if (u1)
1563 ca = utf8_to_uvchr((U8*)a, &la);
1564 else {
1565 ca = *a;
1566 la = 1;
1567 }
1568 if (u2)
1569 cb = utf8_to_uvchr((U8*)b, &lb);
1570 else {
1571 cb = *b;
1572 lb = 1;
1573 }
1574 if (ca != cb) {
1575 if (u1)
1576 to_uni_lower(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
1577 else
1578 ulen1 = 1;
1579 if (u2)
1580 to_uni_lower(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
1581 else
1582 ulen2 = 1;
1583 if (ulen1 != ulen2
1584 || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
1585 || memNE(tmpbuf1, tmpbuf2, ulen1))
1586 return 1;
1587 }
1588 a += la;
1589 b += lb;
1590 }
1591 return 0;
e6b2e755 1592}
701a277b 1593