This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [PATCH] Re: [PATCH t/op/pack.t] Cleanup
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
bc89e66f 3 * Copyright (c) 1998-2001, Larry Wall
a0ed51b3
LW
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3
LW
25#include "perl.h"
26
27/* Unicode support */
28
eebe1485 29/*
37607a96 30=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
eebe1485
SC
31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
9041c2e3 35end of the new character. In other words,
eebe1485 36
9041c2e3 37 d = uvuni_to_utf8(d, uv);
eebe1485
SC
38
39is the recommended Unicode-aware way of saying
40
41 *(d++) = uv;
42
43=cut
44*/
45
dfe13c55 46U8 *
9041c2e3 47Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
a0ed51b3 48{
c4d5f83a
NIS
49 if (UNI_IS_INVARIANT(uv)) {
50 *d++ = UTF_TO_NATIVE(uv);
a0ed51b3
LW
51 return d;
52 }
2d331972 53#if defined(EBCDIC)
1d72bdf6
NIS
54 else {
55 STRLEN len = UNISKIP(uv);
56 U8 *p = d+len-1;
57 while (p > d) {
58 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
59 uv >>= UTF_ACCUMULATION_SHIFT;
60 }
61 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
62 return d+len;
63 }
64#else /* Non loop style */
a0ed51b3
LW
65 if (uv < 0x800) {
66 *d++ = (( uv >> 6) | 0xc0);
67 *d++ = (( uv & 0x3f) | 0x80);
68 return d;
69 }
70 if (uv < 0x10000) {
71 *d++ = (( uv >> 12) | 0xe0);
72 *d++ = (((uv >> 6) & 0x3f) | 0x80);
73 *d++ = (( uv & 0x3f) | 0x80);
74 return d;
75 }
76 if (uv < 0x200000) {
77 *d++ = (( uv >> 18) | 0xf0);
78 *d++ = (((uv >> 12) & 0x3f) | 0x80);
79 *d++ = (((uv >> 6) & 0x3f) | 0x80);
80 *d++ = (( uv & 0x3f) | 0x80);
81 return d;
82 }
83 if (uv < 0x4000000) {
84 *d++ = (( uv >> 24) | 0xf8);
85 *d++ = (((uv >> 18) & 0x3f) | 0x80);
86 *d++ = (((uv >> 12) & 0x3f) | 0x80);
87 *d++ = (((uv >> 6) & 0x3f) | 0x80);
88 *d++ = (( uv & 0x3f) | 0x80);
89 return d;
90 }
91 if (uv < 0x80000000) {
92 *d++ = (( uv >> 30) | 0xfc);
93 *d++ = (((uv >> 24) & 0x3f) | 0x80);
94 *d++ = (((uv >> 18) & 0x3f) | 0x80);
95 *d++ = (((uv >> 12) & 0x3f) | 0x80);
96 *d++ = (((uv >> 6) & 0x3f) | 0x80);
97 *d++ = (( uv & 0x3f) | 0x80);
98 return d;
99 }
6b8eaf93 100#ifdef HAS_QUAD
d7578b48 101 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
102#endif
103 {
104 *d++ = 0xfe; /* Can't match U+FEFF! */
105 *d++ = (((uv >> 30) & 0x3f) | 0x80);
106 *d++ = (((uv >> 24) & 0x3f) | 0x80);
107 *d++ = (((uv >> 18) & 0x3f) | 0x80);
108 *d++ = (((uv >> 12) & 0x3f) | 0x80);
109 *d++ = (((uv >> 6) & 0x3f) | 0x80);
110 *d++ = (( uv & 0x3f) | 0x80);
111 return d;
112 }
6b8eaf93 113#ifdef HAS_QUAD
a0ed51b3
LW
114 {
115 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b
GS
116 *d++ = 0x80; /* 6 Reserved bits */
117 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
118 *d++ = (((uv >> 54) & 0x3f) | 0x80);
119 *d++ = (((uv >> 48) & 0x3f) | 0x80);
120 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3
LW
121 *d++ = (((uv >> 36) & 0x3f) | 0x80);
122 *d++ = (((uv >> 30) & 0x3f) | 0x80);
123 *d++ = (((uv >> 24) & 0x3f) | 0x80);
124 *d++ = (((uv >> 18) & 0x3f) | 0x80);
125 *d++ = (((uv >> 12) & 0x3f) | 0x80);
126 *d++ = (((uv >> 6) & 0x3f) | 0x80);
127 *d++ = (( uv & 0x3f) | 0x80);
128 return d;
129 }
130#endif
1d72bdf6 131#endif /* Loop style */
a0ed51b3
LW
132}
133
9041c2e3
NIS
134
135
136/*
eebe1485
SC
137=for apidoc A|STRLEN|is_utf8_char|U8 *s
138
5da9da9e 139Tests if some arbitrary number of bytes begins in a valid UTF-8
1d72bdf6 140character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
5da9da9e
JH
141The actual number of bytes in the UTF-8 character will be returned if
142it is valid, otherwise 0.
9041c2e3 143
37607a96
PK
144=cut
145*/
067a85ef 146STRLEN
386d01d6
GS
147Perl_is_utf8_char(pTHX_ U8 *s)
148{
149 U8 u = *s;
067a85ef
A
150 STRLEN slen, len;
151 UV uv, ouv;
386d01d6 152
1d72bdf6 153 if (UTF8_IS_INVARIANT(u))
386d01d6
GS
154 return 1;
155
60006e79 156 if (!UTF8_IS_START(u))
386d01d6
GS
157 return 0;
158
9f07fdcd 159 len = UTF8SKIP(s);
386d01d6 160
60006e79 161 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef
A
162 return 0;
163
386d01d6
GS
164 slen = len - 1;
165 s++;
6eb6869e 166 u &= UTF_START_MASK(len);
1d72bdf6 167 uv = u;
067a85ef 168 ouv = uv;
386d01d6 169 while (slen--) {
60006e79 170 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 171 return 0;
8850bf83 172 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 173 if (uv < ouv)
067a85ef
A
174 return 0;
175 ouv = uv;
386d01d6
GS
176 s++;
177 }
067a85ef 178
5bbb0b5a 179 if (UNISKIP(uv) < len)
067a85ef
A
180 return 0;
181
386d01d6
GS
182 return len;
183}
184
6662521e 185/*
eebe1485 186=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 187
5da9da9e
JH
188Returns true if first C<len> bytes of the given string form a valid UTF8
189string, false otherwise. Note that 'a valid UTF8 string' does not mean
190'a string that contains UTF8' because a valid ASCII string is a valid
191UTF8 string.
6662521e
GS
192
193=cut
194*/
195
8e84507e 196bool
6662521e
GS
197Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
198{
067a85ef 199 U8* x = s;
1aa99e6b 200 U8* send;
067a85ef
A
201 STRLEN c;
202
1aa99e6b 203 if (!len)
6cd5fe39 204 len = strlen((char *)s);
1aa99e6b
IH
205 send = s + len;
206
6662521e
GS
207 while (x < send) {
208 c = is_utf8_char(x);
067a85ef
A
209 if (!c)
210 return FALSE;
6662521e 211 x += c;
6662521e 212 }
60006e79
JH
213 if (x != send)
214 return FALSE;
067a85ef
A
215
216 return TRUE;
6662521e
GS
217}
218
67e989fb 219/*
9041c2e3 220=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 221
9041c2e3
NIS
222Bottom level UTF-8 decode routine.
223Returns the unicode code point value of the first character in the string C<s>
ba210ebe 224which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 225C<retlen> will be set to the length, in bytes, of that character.
67e989fb
JH
226
227If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880
JH
228is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
229it is assumed that the caller will raise a warning, and this function
28d3d195
JH
230will silently just set C<retlen> to C<-1> and return zero. If the
231C<flags> does not contain UTF8_CHECK_ONLY, warnings about
232malformations will be given, C<retlen> will be set to the expected
233length of the UTF-8 character in bytes, and zero will be returned.
234
235The C<flags> can also contain various flags to allow deviations from
236the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 237
9041c2e3
NIS
238Most code should use utf8_to_uvchr() rather than call this directly.
239
37607a96
PK
240=cut
241*/
67e989fb 242
a0ed51b3 243UV
37607a96 244Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 245{
9c5ffd7c 246 UV uv = *s, ouv = 0;
ba210ebe
JH
247 STRLEN len = 1;
248 bool dowarn = ckWARN_d(WARN_UTF8);
249 STRLEN expectlen = 0;
a0dbb045
JH
250 U32 warning = 0;
251
252/* This list is a superset of the UTF8_ALLOW_XXX. */
253
254#define UTF8_WARN_EMPTY 1
255#define UTF8_WARN_CONTINUATION 2
256#define UTF8_WARN_NON_CONTINUATION 3
257#define UTF8_WARN_FE_FF 4
258#define UTF8_WARN_SHORT 5
259#define UTF8_WARN_OVERFLOW 6
260#define UTF8_WARN_SURROGATE 7
261#define UTF8_WARN_BOM 8
262#define UTF8_WARN_LONG 9
263#define UTF8_WARN_FFFF 10
264
265 if (curlen == 0 &&
266 !(flags & UTF8_ALLOW_EMPTY)) {
267 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
268 goto malformed;
269 }
270
1d72bdf6 271 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
272 if (retlen)
273 *retlen = 1;
c4d5f83a 274 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 275 }
67e989fb 276
421a8bf2 277 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 278 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 279 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
280 goto malformed;
281 }
282
421a8bf2 283 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 284 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 285 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
286 goto malformed;
287 }
9041c2e3 288
1d72bdf6 289#ifdef EBCDIC
75383841 290 uv = NATIVE_TO_UTF(uv);
1d72bdf6 291#else
fcc8fcf6
JH
292 if ((uv == 0xfe || uv == 0xff) &&
293 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 294 warning = UTF8_WARN_FE_FF;
ba210ebe 295 goto malformed;
a0ed51b3 296 }
1d72bdf6
NIS
297#endif
298
ba210ebe
JH
299 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
300 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
301 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
302 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
303#ifdef EBCDIC
304 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
305 else { len = 7; uv &= 0x01; }
306#else
ba210ebe
JH
307 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
308 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
309 else { len = 13; uv = 0; } /* whoa! */
310#endif
311
a0ed51b3
LW
312 if (retlen)
313 *retlen = len;
9041c2e3 314
ba210ebe
JH
315 expectlen = len;
316
fcc8fcf6
JH
317 if ((curlen < expectlen) &&
318 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 319 warning = UTF8_WARN_SHORT;
ba210ebe
JH
320 goto malformed;
321 }
322
323 len--;
a0ed51b3 324 s++;
ba210ebe
JH
325 ouv = uv;
326
a0ed51b3 327 while (len--) {
421a8bf2
JH
328 if (!UTF8_IS_CONTINUATION(*s) &&
329 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
330 s--;
331 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 332 goto malformed;
a0ed51b3
LW
333 }
334 else
8850bf83 335 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
336 if (!(uv > ouv)) {
337 /* These cannot be allowed. */
338 if (uv == ouv) {
339 if (!(flags & UTF8_ALLOW_LONG)) {
340 warning = UTF8_WARN_LONG;
341 goto malformed;
342 }
343 }
344 else { /* uv < ouv */
345 /* This cannot be allowed. */
346 warning = UTF8_WARN_OVERFLOW;
347 goto malformed;
348 }
ba210ebe
JH
349 }
350 s++;
351 ouv = uv;
352 }
353
421a8bf2 354 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 355 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 356 warning = UTF8_WARN_SURROGATE;
ba210ebe 357 goto malformed;
421a8bf2 358 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 359 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 360 warning = UTF8_WARN_BOM;
ba210ebe 361 goto malformed;
fcc8fcf6
JH
362 } else if ((expectlen > UNISKIP(uv)) &&
363 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 364 warning = UTF8_WARN_LONG;
ba210ebe 365 goto malformed;
421a8bf2 366 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 367 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 368 warning = UTF8_WARN_FFFF;
a9917092 369 goto malformed;
a0ed51b3 370 }
ba210ebe 371
a0ed51b3 372 return uv;
ba210ebe
JH
373
374malformed:
375
fcc8fcf6 376 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 377 if (retlen)
cc366d4b 378 *retlen = -1;
ba210ebe
JH
379 return 0;
380 }
381
a0dbb045
JH
382 if (dowarn) {
383 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
384
385 switch (warning) {
386 case 0: /* Intentionally empty. */ break;
387 case UTF8_WARN_EMPTY:
388 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
389 break;
390 case UTF8_WARN_CONTINUATION:
391 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
392 break;
393 case UTF8_WARN_NON_CONTINUATION:
394 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
395 (UV)s[1], uv);
396 break;
397 case UTF8_WARN_FE_FF:
398 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
399 break;
400 case UTF8_WARN_SHORT:
401 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
402 curlen, curlen == 1 ? "" : "s", expectlen);
b31f83c2 403 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
404 break;
405 case UTF8_WARN_OVERFLOW:
406 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
407 ouv, *s);
408 break;
409 case UTF8_WARN_SURROGATE:
410 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
411 break;
412 case UTF8_WARN_BOM:
413 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
414 break;
415 case UTF8_WARN_LONG:
416 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
417 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
418 break;
419 case UTF8_WARN_FFFF:
420 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
421 break;
422 default:
423 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
424 break;
425 }
426
427 if (warning) {
428 char *s = SvPVX(sv);
429
430 if (PL_op)
431 Perl_warner(aTHX_ WARN_UTF8,
53e06cf0 432 "%s in %s", s, OP_DESC(PL_op));
a0dbb045
JH
433 else
434 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
435 }
436 }
437
ba210ebe 438 if (retlen)
28d3d195 439 *retlen = expectlen ? expectlen : len;
ba210ebe 440
28d3d195 441 return 0;
a0ed51b3
LW
442}
443
8e84507e 444/*
37607a96 445=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3
NIS
446
447Returns the native character value of the first character in the string C<s>
448which is assumed to be in UTF8 encoding; C<retlen> will be set to the
449length, in bytes, of that character.
450
451If C<s> does not point to a well-formed UTF8 character, zero is
452returned and retlen is set, if possible, to -1.
453
454=cut
455*/
456
457UV
37607a96 458Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3
NIS
459{
460 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
461}
462
463/*
37607a96 464=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3
NIS
465
466Returns the Unicode code point of the first character in the string C<s>
467which is assumed to be in UTF8 encoding; C<retlen> will be set to the
468length, in bytes, of that character.
469
470This function should only be used when returned UV is considered
471an index into the Unicode semantic tables (e.g. swashes).
472
ba210ebe
JH
473If C<s> does not point to a well-formed UTF8 character, zero is
474returned and retlen is set, if possible, to -1.
8e84507e
NIS
475
476=cut
477*/
478
479UV
37607a96 480Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 481{
9041c2e3
NIS
482 /* Call the low level routine asking for checks */
483 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e
NIS
484}
485
b76347f2 486/*
37607a96 487=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2
JH
488
489Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
490Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
491up past C<e>, croaks.
b76347f2
JH
492
493=cut
494*/
495
496STRLEN
37607a96 497Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2
JH
498{
499 STRLEN len = 0;
500
8850bf83
JH
501 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
502 * the bitops (especially ~) can create illegal UTF-8.
503 * In other words: in Perl UTF-8 is not just for Unicode. */
504
b76347f2 505 if (e < s)
02eb7b47 506 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
b76347f2 507 while (s < e) {
02eb7b47 508 U8 t = UTF8SKIP(s);
b76347f2
JH
509
510 if (e - s < t)
0064a8a9 511 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
b76347f2
JH
512 s += t;
513 len++;
514 }
515
516 return len;
517}
518
b06226ff 519/*
eebe1485 520=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff
JH
521
522Returns the number of UTF8 characters between the UTF-8 pointers C<a>
523and C<b>.
524
525WARNING: use only if you *know* that the pointers point inside the
526same UTF-8 buffer.
527
37607a96
PK
528=cut
529*/
a0ed51b3 530
02eb7b47 531IV
864dbfa3 532Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 533{
02eb7b47
JH
534 IV off = 0;
535
8850bf83
JH
536 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
537 * the bitops (especially ~) can create illegal UTF-8.
538 * In other words: in Perl UTF-8 is not just for Unicode. */
539
a0ed51b3
LW
540 if (a < b) {
541 while (a < b) {
02eb7b47
JH
542 U8 c = UTF8SKIP(a);
543
544 if (b - a < c)
545 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
546 a += c;
a0ed51b3
LW
547 off--;
548 }
549 }
550 else {
551 while (b < a) {
02eb7b47
JH
552 U8 c = UTF8SKIP(b);
553
554 if (a - b < c)
555 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
556 b += c;
a0ed51b3
LW
557 off++;
558 }
559 }
02eb7b47 560
a0ed51b3
LW
561 return off;
562}
563
b06226ff 564/*
37607a96 565=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 566
8850bf83
JH
567Return the UTF-8 pointer C<s> displaced by C<off> characters, either
568forward or backward.
b06226ff
JH
569
570WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
571the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
572on the first byte of character or just after the last byte of a character.
b06226ff 573
37607a96
PK
574=cut
575*/
a0ed51b3
LW
576
577U8 *
864dbfa3 578Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 579{
8850bf83
JH
580 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
581 * the bitops (especially ~) can create illegal UTF-8.
582 * In other words: in Perl UTF-8 is not just for Unicode. */
583
a0ed51b3
LW
584 if (off >= 0) {
585 while (off--)
586 s += UTF8SKIP(s);
587 }
588 else {
589 while (off++) {
590 s--;
8850bf83
JH
591 while (UTF8_IS_CONTINUATION(*s))
592 s--;
a0ed51b3
LW
593 }
594 }
595 return s;
596}
597
6940069f 598/*
eebe1485 599=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 600
246fae53
MG
601Converts a string C<s> of length C<len> from UTF8 into byte encoding.
602Unlike C<bytes_to_utf8>, this over-writes the original string, and
603updates len to contain the new length.
67e989fb 604Returns zero on failure, setting C<len> to -1.
6940069f
GS
605
606=cut
607*/
608
609U8 *
37607a96 610Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 611{
6940069f
GS
612 U8 *send;
613 U8 *d;
dcad2880 614 U8 *save = s;
246fae53
MG
615
616 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880
JH
617 for (send = s + *len; s < send; ) {
618 U8 c = *s++;
619
1d72bdf6
NIS
620 if (!UTF8_IS_INVARIANT(c) &&
621 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
622 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
623 *len = -1;
624 return 0;
625 }
246fae53 626 }
dcad2880
JH
627
628 d = s = save;
6940069f 629 while (s < send) {
ed646e6e 630 STRLEN ulen;
9041c2e3 631 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 632 s += ulen;
6940069f
GS
633 }
634 *d = '\0';
246fae53 635 *len = d - save;
6940069f
GS
636 return save;
637}
638
639/*
f9a63242
JH
640=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
641
642Converts a string C<s> of length C<len> from UTF8 into byte encoding.
643Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
644the newly-created string, and updates C<len> to contain the new
645length. Returns the original string if no conversion occurs, C<len>
646is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6470 if C<s> is converted or contains all 7bit characters.
f9a63242 648
37607a96
PK
649=cut
650*/
f9a63242
JH
651
652U8 *
37607a96 653Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 654{
f9a63242
JH
655 U8 *d;
656 U8 *start = s;
db42d148 657 U8 *send;
f9a63242
JH
658 I32 count = 0;
659
660 if (!*is_utf8)
661 return start;
662
ef9edfd0 663 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242
JH
664 for (send = s + *len; s < send;) {
665 U8 c = *s++;
1d72bdf6 666 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
667 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
668 (c = *s++) && UTF8_IS_CONTINUATION(c))
669 count++;
670 else
f9a63242 671 return start;
db42d148 672 }
f9a63242
JH
673 }
674
675 *is_utf8 = 0;
676
f9a63242 677 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 678 s = start; start = d;
f9a63242
JH
679 while (s < send) {
680 U8 c = *s++;
c4d5f83a
NIS
681 if (!UTF8_IS_INVARIANT(c)) {
682 /* Then it is two-byte encoded */
683 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
684 c = ASCII_TO_NATIVE(c);
685 }
686 *d++ = c;
f9a63242
JH
687 }
688 *d = '\0';
689 *len = d - start;
690 return start;
691}
692
693/*
eebe1485 694=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f
GS
695
696Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e
GS
697Returns a pointer to the newly-created string, and sets C<len> to
698reflect the new length.
6940069f 699
497711e7 700=cut
6940069f
GS
701*/
702
703U8*
37607a96 704Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 705{
6940069f
GS
706 U8 *send;
707 U8 *d;
708 U8 *dst;
6662521e 709 send = s + (*len);
6940069f 710
6662521e 711 Newz(801, d, (*len) * 2 + 1, U8);
6940069f
GS
712 dst = d;
713
714 while (s < send) {
db42d148 715 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a
NIS
716 if (UNI_IS_INVARIANT(uv))
717 *d++ = UTF_TO_NATIVE(uv);
6940069f 718 else {
90f44359
JH
719 *d++ = UTF8_EIGHT_BIT_HI(uv);
720 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
721 }
722 }
723 *d = '\0';
6662521e 724 *len = d-dst;
6940069f
GS
725 return dst;
726}
727
a0ed51b3 728/*
dea0fc0b 729 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
730 *
731 * Destination must be pre-extended to 3/2 source. Do not use in-place.
732 * We optimize for native, for obvious reasons. */
733
734U8*
dea0fc0b 735Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 736{
dea0fc0b
JH
737 U8* pend;
738 U8* dstart = d;
739
740 if (bytelen & 1)
a7867d0a 741 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b
JH
742
743 pend = p + bytelen;
744
a0ed51b3 745 while (p < pend) {
dea0fc0b
JH
746 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
747 p += 2;
a0ed51b3
LW
748 if (uv < 0x80) {
749 *d++ = uv;
750 continue;
751 }
752 if (uv < 0x800) {
753 *d++ = (( uv >> 6) | 0xc0);
754 *d++ = (( uv & 0x3f) | 0x80);
755 continue;
756 }
757 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b
JH
758 UV low = *p++;
759 if (low < 0xdc00 || low >= 0xdfff)
760 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
761 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
762 }
763 if (uv < 0x10000) {
764 *d++ = (( uv >> 12) | 0xe0);
765 *d++ = (((uv >> 6) & 0x3f) | 0x80);
766 *d++ = (( uv & 0x3f) | 0x80);
767 continue;
768 }
769 else {
770 *d++ = (( uv >> 18) | 0xf0);
771 *d++ = (((uv >> 12) & 0x3f) | 0x80);
772 *d++ = (((uv >> 6) & 0x3f) | 0x80);
773 *d++ = (( uv & 0x3f) | 0x80);
774 continue;
775 }
776 }
dea0fc0b 777 *newlen = d - dstart;
a0ed51b3
LW
778 return d;
779}
780
781/* Note: this one is slightly destructive of the source. */
782
783U8*
dea0fc0b 784Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
785{
786 U8* s = (U8*)p;
787 U8* send = s + bytelen;
788 while (s < send) {
789 U8 tmp = s[0];
790 s[0] = s[1];
791 s[1] = tmp;
792 s += 2;
793 }
dea0fc0b 794 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
795}
796
797/* for now these are all defined (inefficiently) in terms of the utf8 versions */
798
799bool
84afefe6 800Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 801{
ad391ad9 802 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 803 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
804 return is_utf8_alnum(tmpbuf);
805}
806
807bool
84afefe6 808Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 809{
ad391ad9 810 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 811 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
812 return is_utf8_alnumc(tmpbuf);
813}
814
815bool
84afefe6 816Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 817{
ad391ad9 818 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 819 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
820 return is_utf8_idfirst(tmpbuf);
821}
822
823bool
84afefe6 824Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 825{
ad391ad9 826 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 827 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
828 return is_utf8_alpha(tmpbuf);
829}
830
831bool
84afefe6 832Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 833{
ad391ad9 834 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 835 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05
GS
836 return is_utf8_ascii(tmpbuf);
837}
838
839bool
84afefe6 840Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 841{
ad391ad9 842 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 843 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
844 return is_utf8_space(tmpbuf);
845}
846
847bool
84afefe6 848Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 849{
ad391ad9 850 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 851 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
852 return is_utf8_digit(tmpbuf);
853}
854
855bool
84afefe6 856Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 857{
ad391ad9 858 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 859 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
860 return is_utf8_upper(tmpbuf);
861}
862
863bool
84afefe6 864Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 865{
ad391ad9 866 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 867 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
868 return is_utf8_lower(tmpbuf);
869}
870
871bool
84afefe6 872Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 873{
ad391ad9 874 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 875 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
876 return is_utf8_cntrl(tmpbuf);
877}
878
879bool
84afefe6 880Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 881{
ad391ad9 882 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 883 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
884 return is_utf8_graph(tmpbuf);
885}
886
887bool
84afefe6 888Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 889{
ad391ad9 890 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 891 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
892 return is_utf8_print(tmpbuf);
893}
894
b8c5462f 895bool
84afefe6 896Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 897{
ad391ad9 898 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 899 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
900 return is_utf8_punct(tmpbuf);
901}
902
4d61ec05 903bool
84afefe6 904Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 905{
a2a2844f 906 U8 tmpbuf[UTF8_MAXLEN*2+1];
2b9d42f0 907 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05
GS
908 return is_utf8_xdigit(tmpbuf);
909}
910
84afefe6
JH
911UV
912Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 913{
a2a2844f 914 U8 tmpbuf[UTF8_MAXLEN*2+1];
2b9d42f0 915 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 916 return to_utf8_upper(tmpbuf, p, lenp);
a0ed51b3
LW
917}
918
84afefe6
JH
919UV
920Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 921{
a2a2844f 922 U8 tmpbuf[UTF8_MAXLEN*2+1];
2b9d42f0 923 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 924 return to_utf8_title(tmpbuf, p, lenp);
a0ed51b3
LW
925}
926
84afefe6
JH
927UV
928Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 929{
ad391ad9 930 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 931 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 932 return to_utf8_lower(tmpbuf, p, lenp);
a0ed51b3
LW
933}
934
84afefe6
JH
935UV
936Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
937{
938 U8 tmpbuf[UTF8_MAXLEN+1];
939 uvchr_to_utf8(tmpbuf, (UV)c);
940 return to_utf8_fold(tmpbuf, p, lenp);
941}
942
a0ed51b3
LW
943/* for now these all assume no locale info available for Unicode > 255 */
944
945bool
84afefe6 946Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
947{
948 return is_uni_alnum(c); /* XXX no locale support yet */
949}
950
951bool
84afefe6 952Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
953{
954 return is_uni_alnumc(c); /* XXX no locale support yet */
955}
956
957bool
84afefe6 958Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
959{
960 return is_uni_idfirst(c); /* XXX no locale support yet */
961}
962
963bool
84afefe6 964Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
965{
966 return is_uni_alpha(c); /* XXX no locale support yet */
967}
968
969bool
84afefe6 970Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
971{
972 return is_uni_ascii(c); /* XXX no locale support yet */
973}
974
975bool
84afefe6 976Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
977{
978 return is_uni_space(c); /* XXX no locale support yet */
979}
980
981bool
84afefe6 982Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
983{
984 return is_uni_digit(c); /* XXX no locale support yet */
985}
986
987bool
84afefe6 988Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
989{
990 return is_uni_upper(c); /* XXX no locale support yet */
991}
992
993bool
84afefe6 994Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
995{
996 return is_uni_lower(c); /* XXX no locale support yet */
997}
998
999bool
84afefe6 1000Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1001{
1002 return is_uni_cntrl(c); /* XXX no locale support yet */
1003}
1004
1005bool
84afefe6 1006Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1007{
1008 return is_uni_graph(c); /* XXX no locale support yet */
1009}
1010
1011bool
84afefe6 1012Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1013{
1014 return is_uni_print(c); /* XXX no locale support yet */
1015}
1016
b8c5462f 1017bool
84afefe6 1018Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1019{
1020 return is_uni_punct(c); /* XXX no locale support yet */
1021}
1022
4d61ec05 1023bool
84afefe6 1024Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1025{
1026 return is_uni_xdigit(c); /* XXX no locale support yet */
1027}
1028
a0ed51b3 1029bool
864dbfa3 1030Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1031{
386d01d6
GS
1032 if (!is_utf8_char(p))
1033 return FALSE;
a0ed51b3 1034 if (!PL_utf8_alnum)
289d4f09
ML
1035 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1036 * descendant of isalnum(3), in other words, it doesn't
1037 * contain the '_'. --jhi */
1038 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
3568d838 1039 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3
LW
1040/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1041#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1042 if (!PL_utf8_alnum)
1043 PL_utf8_alnum = swash_init("utf8", "",
1044 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1045 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3
LW
1046#endif
1047}
1048
1049bool
b8c5462f
JH
1050Perl_is_utf8_alnumc(pTHX_ U8 *p)
1051{
386d01d6
GS
1052 if (!is_utf8_char(p))
1053 return FALSE;
b8c5462f
JH
1054 if (!PL_utf8_alnum)
1055 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
3568d838 1056 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f
JH
1057/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1058#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1059 if (!PL_utf8_alnum)
1060 PL_utf8_alnum = swash_init("utf8", "",
1061 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1062 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f
JH
1063#endif
1064}
1065
1066bool
864dbfa3 1067Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3
LW
1068{
1069 return *p == '_' || is_utf8_alpha(p);
1070}
1071
1072bool
864dbfa3 1073Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1074{
386d01d6
GS
1075 if (!is_utf8_char(p))
1076 return FALSE;
a0ed51b3 1077 if (!PL_utf8_alpha)
e24b16f9 1078 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
3568d838 1079 return swash_fetch(PL_utf8_alpha, p, TRUE);
a0ed51b3
LW
1080}
1081
1082bool
b8c5462f
JH
1083Perl_is_utf8_ascii(pTHX_ U8 *p)
1084{
386d01d6
GS
1085 if (!is_utf8_char(p))
1086 return FALSE;
b8c5462f
JH
1087 if (!PL_utf8_ascii)
1088 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
3568d838 1089 return swash_fetch(PL_utf8_ascii, p, TRUE);
b8c5462f
JH
1090}
1091
1092bool
864dbfa3 1093Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1094{
386d01d6
GS
1095 if (!is_utf8_char(p))
1096 return FALSE;
a0ed51b3 1097 if (!PL_utf8_space)
3bec3564 1098 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
3568d838 1099 return swash_fetch(PL_utf8_space, p, TRUE);
a0ed51b3
LW
1100}
1101
1102bool
864dbfa3 1103Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1104{
386d01d6
GS
1105 if (!is_utf8_char(p))
1106 return FALSE;
a0ed51b3 1107 if (!PL_utf8_digit)
e24b16f9 1108 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
3568d838 1109 return swash_fetch(PL_utf8_digit, p, TRUE);
a0ed51b3
LW
1110}
1111
1112bool
864dbfa3 1113Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1114{
386d01d6
GS
1115 if (!is_utf8_char(p))
1116 return FALSE;
a0ed51b3 1117 if (!PL_utf8_upper)
e24b16f9 1118 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
3568d838 1119 return swash_fetch(PL_utf8_upper, p, TRUE);
a0ed51b3
LW
1120}
1121
1122bool
864dbfa3 1123Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1124{
386d01d6
GS
1125 if (!is_utf8_char(p))
1126 return FALSE;
a0ed51b3 1127 if (!PL_utf8_lower)
e24b16f9 1128 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
3568d838 1129 return swash_fetch(PL_utf8_lower, p, TRUE);
a0ed51b3
LW
1130}
1131
1132bool
b8c5462f
JH
1133Perl_is_utf8_cntrl(pTHX_ U8 *p)
1134{
386d01d6
GS
1135 if (!is_utf8_char(p))
1136 return FALSE;
b8c5462f
JH
1137 if (!PL_utf8_cntrl)
1138 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
3568d838 1139 return swash_fetch(PL_utf8_cntrl, p, TRUE);
b8c5462f
JH
1140}
1141
1142bool
1143Perl_is_utf8_graph(pTHX_ U8 *p)
1144{
386d01d6
GS
1145 if (!is_utf8_char(p))
1146 return FALSE;
b8c5462f
JH
1147 if (!PL_utf8_graph)
1148 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
3568d838 1149 return swash_fetch(PL_utf8_graph, p, TRUE);
b8c5462f
JH
1150}
1151
1152bool
864dbfa3 1153Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1154{
386d01d6
GS
1155 if (!is_utf8_char(p))
1156 return FALSE;
a0ed51b3 1157 if (!PL_utf8_print)
e24b16f9 1158 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
3568d838 1159 return swash_fetch(PL_utf8_print, p, TRUE);
a0ed51b3
LW
1160}
1161
1162bool
b8c5462f
JH
1163Perl_is_utf8_punct(pTHX_ U8 *p)
1164{
386d01d6
GS
1165 if (!is_utf8_char(p))
1166 return FALSE;
b8c5462f
JH
1167 if (!PL_utf8_punct)
1168 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
3568d838 1169 return swash_fetch(PL_utf8_punct, p, TRUE);
b8c5462f
JH
1170}
1171
1172bool
1173Perl_is_utf8_xdigit(pTHX_ U8 *p)
1174{
386d01d6
GS
1175 if (!is_utf8_char(p))
1176 return FALSE;
b8c5462f
JH
1177 if (!PL_utf8_xdigit)
1178 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
3568d838 1179 return swash_fetch(PL_utf8_xdigit, p, TRUE);
b8c5462f
JH
1180}
1181
1182bool
864dbfa3 1183Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1184{
386d01d6
GS
1185 if (!is_utf8_char(p))
1186 return FALSE;
a0ed51b3 1187 if (!PL_utf8_mark)
e24b16f9 1188 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
3568d838 1189 return swash_fetch(PL_utf8_mark, p, TRUE);
a0ed51b3
LW
1190}
1191
6b5c0936
JH
1192/*
1193=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1194
1195The "p" contains the pointer to the UTF-8 string encoding
1196the character that is being converted.
1197
1198The "ustrp" is a pointer to the character buffer to put the
1199conversion result to. The "lenp" is a pointer to the length
1200of the result.
1201
1202The "swash" is a pointer to the swash to use.
1203
1204The "normal" is a string like "ToLower" which means the swash
1205$utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1206and loaded by SWASHGET, using lib/utf8_heavy.pl.
1207
1208The "special" is a string like "utf8::ToSpecLower", which means
d1be9408 1209the hash %utf8::ToSpecLower, which is stored in the same file,
6b5c0936
JH
1210lib/unicore/To/Lower.pl, and also loaded by SWASHGET. The access
1211to the hash is by Perl_to_utf8_case().
1212
1213=cut
1214 */
1215
2104c8d9 1216UV
983ffd37 1217Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
a0ed51b3
LW
1218{
1219 UV uv;
1220
983ffd37
JH
1221 if (!*swashp)
1222 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1223 uv = swash_fetch(*swashp, p, TRUE);
1224 if (uv)
1225 uv = UNI_TO_NATIVE(uv);
1226 else {
1227 HV *hv;
1228 SV *keysv;
1229 HE *he;
1230
1231 uv = utf8_to_uvchr(p, 0);
1232
1233 if ((hv = get_hv(special, FALSE)) &&
6d47b937 1234 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
983ffd37
JH
1235 (he = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1236 SV *val = HeVAL(he);
1237 char *s = SvPV(val, *lenp);
1238 U8 c = *(U8*)s;
1239 if (*lenp > 1 || UNI_IS_INVARIANT(c))
1240 Copy(s, ustrp, *lenp, U8);
1241 else {
1242 /* something in the 0x80..0xFF range */
1243 ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1244 ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1245 *lenp = 2;
1246 }
1247 return 0;
1248 }
1249 }
a2a2844f
JH
1250 *lenp = UNISKIP(uv);
1251 uvuni_to_utf8(ustrp, uv);
1252 return uv;
a0ed51b3
LW
1253}
1254
2104c8d9 1255UV
983ffd37 1256Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1257{
983ffd37 1258 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1259 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1260}
a0ed51b3 1261
983ffd37
JH
1262UV
1263Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1264{
1265 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1266 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1267}
1268
2104c8d9 1269UV
a2a2844f 1270Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1271{
983ffd37 1272 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1273 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1274}
1275
1276UV
1277Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1278{
1279 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1280 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1281}
1282
1283/* a "swash" is a swatch hash */
1284
1285SV*
864dbfa3 1286Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3
LW
1287{
1288 SV* retval;
bf1fed83 1289 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1290 dSP;
1b026014 1291 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
f8be5cf0 1292 SV* errsv_save;
ce3b816e 1293
1b026014 1294 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1295 ENTER;
f8be5cf0 1296 errsv_save = newSVsv(ERRSV);
ce3b816e 1297 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
f8be5cf0
JH
1298 if (!SvTRUE(ERRSV))
1299 sv_setsv(ERRSV, errsv_save);
1300 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1301 LEAVE;
1302 }
1303 SPAGAIN;
a0ed51b3
LW
1304 PUSHSTACKi(PERLSI_MAGIC);
1305 PUSHMARK(SP);
1306 EXTEND(SP,5);
1307 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1308 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1309 PUSHs(listsv);
1310 PUSHs(sv_2mortal(newSViv(minbits)));
1311 PUSHs(sv_2mortal(newSViv(none)));
1312 PUTBACK;
1313 ENTER;
1314 SAVEI32(PL_hints);
1315 PL_hints = 0;
1316 save_re_context();
bf1fed83
JH
1317 if (PL_curcop == &PL_compiling)
1318 /* XXX ought to be handled by lex_start */
1319 sv_setpv(tokenbufsv, PL_tokenbuf);
f8be5cf0 1320 errsv_save = newSVsv(ERRSV);
864dbfa3 1321 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1322 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1323 else
e24b16f9 1324 retval = &PL_sv_undef;
f8be5cf0
JH
1325 if (!SvTRUE(ERRSV))
1326 sv_setsv(ERRSV, errsv_save);
1327 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1328 LEAVE;
1329 POPSTACK;
e24b16f9 1330 if (PL_curcop == &PL_compiling) {
bf1fed83
JH
1331 STRLEN len;
1332 char* pv = SvPV(tokenbufsv, len);
1333
1334 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1335 PL_curcop->op_private = PL_hints;
a0ed51b3
LW
1336 }
1337 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 1338 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3
LW
1339 return retval;
1340}
1341
035d37be
JH
1342
1343/* This API is wrong for special case conversions since we may need to
1344 * return several Unicode characters for a single Unicode character
1345 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1346 * the lower-level routine, and it is similarly broken for returning
1347 * multiple values. --jhi */
a0ed51b3 1348UV
3568d838 1349Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3
LW
1350{
1351 HV* hv = (HV*)SvRV(sv);
3568d838
JH
1352 U32 klen;
1353 U32 off;
a0ed51b3 1354 STRLEN slen;
7d85a32c 1355 STRLEN needents;
4ea42e7f 1356 U8 *tmps = NULL;
a0ed51b3
LW
1357 U32 bit;
1358 SV *retval;
3568d838
JH
1359 U8 tmputf8[2];
1360 UV c = NATIVE_TO_ASCII(*ptr);
1361
1362 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1363 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1364 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1365 ptr = tmputf8;
1366 }
1367 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1368 * then the "swatch" is a vec() for al the chars which start
1369 * with 0xAA..0xYY
1370 * So the key in the hash (klen) is length of encoded char -1
1371 */
1372 klen = UTF8SKIP(ptr) - 1;
1373 off = ptr[klen];
a0ed51b3 1374
7d85a32c
JH
1375 if (klen == 0)
1376 {
1377 /* If char in invariant then swatch is for all the invariant chars
1378 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1379 */
1380 needents = UTF_CONTINUATION_MARK;
1381 off = NATIVE_TO_UTF(ptr[klen]);
1382 }
1383 else
1384 {
1385 /* If char is encoded then swatch is for the prefix */
1386 needents = (1 << UTF_ACCUMULATION_SHIFT);
1387 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1388 }
1389
a0ed51b3
LW
1390 /*
1391 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1392 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1393 * it's nothing to sniff at.) Pity we usually come through at least
1394 * two function calls to get here...
1395 *
1396 * NB: this code assumes that swatches are never modified, once generated!
1397 */
1398
3568d838 1399 if (hv == PL_last_swash_hv &&
a0ed51b3 1400 klen == PL_last_swash_klen &&
3568d838 1401 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1402 {
1403 tmps = PL_last_swash_tmps;
1404 slen = PL_last_swash_slen;
1405 }
1406 else {
1407 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1408 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3
LW
1409
1410 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1411 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1412 dSP;
2b9d42f0
NIS
1413 /* We use utf8n_to_uvuni() as we want an index into
1414 Unicode tables, not a native character number.
1415 */
1416 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
f8be5cf0 1417 SV *errsv_save;
a0ed51b3
LW
1418 ENTER;
1419 SAVETMPS;
1420 save_re_context();
1421 PUSHSTACKi(PERLSI_MAGIC);
1422 PUSHMARK(SP);
1423 EXTEND(SP,3);
1424 PUSHs((SV*)sv);
ffbc6a93 1425 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838
JH
1426 PUSHs(sv_2mortal(newSViv((klen) ?
1427 (code_point & ~(needents - 1)) : 0)));
a0ed51b3
LW
1428 PUSHs(sv_2mortal(newSViv(needents)));
1429 PUTBACK;
f8be5cf0 1430 errsv_save = newSVsv(ERRSV);
864dbfa3 1431 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1432 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1433 else
e24b16f9 1434 retval = &PL_sv_undef;
f8be5cf0
JH
1435 if (!SvTRUE(ERRSV))
1436 sv_setsv(ERRSV, errsv_save);
1437 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1438 POPSTACK;
1439 FREETMPS;
1440 LEAVE;
e24b16f9
GS
1441 if (PL_curcop == &PL_compiling)
1442 PL_curcop->op_private = PL_hints;
a0ed51b3 1443
dfe13c55 1444 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1445
7d85a32c 1446 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1447 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1448 }
1449
1450 PL_last_swash_hv = hv;
1451 PL_last_swash_klen = klen;
1452 PL_last_swash_tmps = tmps;
1453 PL_last_swash_slen = slen;
1454 if (klen)
1455 Copy(ptr, PL_last_swash_key, klen, U8);
1456 }
1457
9faf8d75 1458 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1459 case 1:
1460 bit = 1 << (off & 7);
1461 off >>= 3;
1462 return (tmps[off] & bit) != 0;
1463 case 8:
1464 return tmps[off];
1465 case 16:
1466 off <<= 1;
1467 return (tmps[off] << 8) + tmps[off + 1] ;
1468 case 32:
1469 off <<= 2;
1470 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1471 }
cea2e8a9 1472 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1473 return 0;
1474}
2b9d42f0
NIS
1475
1476
1477/*
37607a96 1478=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0
NIS
1479
1480Adds the UTF8 representation of the Native codepoint C<uv> to the end
1481of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1482bytes available. The return value is the pointer to the byte after the
1483end of the new character. In other words,
1484
1485 d = uvchr_to_utf8(d, uv);
1486
1487is the recommended wide native character-aware way of saying
1488
1489 *(d++) = uv;
1490
1491=cut
1492*/
1493
1494/* On ASCII machines this is normally a macro but we want a
1495 real function in case XS code wants it
1496*/
1497#undef Perl_uvchr_to_utf8
1498U8 *
1499Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1500{
1501 return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1502}
1503
1504
1505/*
37607a96 1506=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0
NIS
1507
1508Returns the native character value of the first character in the string C<s>
1509which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1510length, in bytes, of that character.
1511
1512Allows length and flags to be passed to low level routine.
1513
1514=cut
1515*/
1516/* On ASCII machines this is normally a macro but we want a
1517 real function in case XS code wants it
1518*/
1519#undef Perl_utf8n_to_uvchr
1520UV
37607a96 1521Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0
NIS
1522{
1523 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1524 return UNI_TO_NATIVE(uv);
1525}
1526
e6b2e755
JH
1527char *
1528Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1529{
1530 int truncated = 0;
1531 char *s, *e;
1532
1533 sv_setpvn(dsv, "", 0);
1534 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1535 UV u;
1536 if (pvlim && SvCUR(dsv) >= pvlim) {
1537 truncated++;
1538 break;
1539 }
1540 u = utf8_to_uvchr((U8*)s, 0);
1541 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1542 }
1543 if (truncated)
1544 sv_catpvn(dsv, "...", 3);
1545
1546 return SvPVX(dsv);
1547}
2b9d42f0 1548
e6b2e755
JH
1549char *
1550Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1551{
701a277b
JH
1552 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1553 pvlim, flags);
1554}
1555
1556I32
1557Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, const char *s2, bool u2, register I32 len)
1558{
1559 register U8 *a = (U8*)s1;
1560 register U8 *b = (U8*)s2;
1561 STRLEN la, lb;
1562 UV ca, cb;
1563 STRLEN ulen1, ulen2;
1564 U8 tmpbuf1[UTF8_MAXLEN*3+1];
1565 U8 tmpbuf2[UTF8_MAXLEN*3+1];
1566
1567 while (len) {
1568 if (u1)
1569 ca = utf8_to_uvchr((U8*)a, &la);
1570 else {
1571 ca = *a;
1572 la = 1;
1573 }
1574 if (u2)
1575 cb = utf8_to_uvchr((U8*)b, &lb);
1576 else {
1577 cb = *b;
1578 lb = 1;
1579 }
1580 if (ca != cb) {
1581 if (u1)
596717ce 1582 to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
701a277b
JH
1583 else
1584 ulen1 = 1;
1585 if (u2)
596717ce 1586 to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
701a277b
JH
1587 else
1588 ulen2 = 1;
1589 if (ulen1 != ulen2
1590 || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
726d7ae4 1591 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
701a277b
JH
1592 return 1;
1593 }
1594 a += la;
1595 b += lb;
1596 }
1597 return 0;
e6b2e755 1598}
701a277b 1599