This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
HP-UX nm seems to repeat itself.
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
be3c0a43 3 * Copyright (c) 1998-2002, Larry Wall
a0ed51b3
LW
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3
LW
25#include "perl.h"
26
ccfc67b7
JH
27/*
28=head1 Unicode Support
a0ed51b3 29
b851fbc1 30=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485
SC
31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
9041c2e3 35end of the new character. In other words,
eebe1485 36
b851fbc1
JH
37 d = uvuni_to_utf8_flags(d, uv, flags);
38
39or, in most cases,
40
9041c2e3 41 d = uvuni_to_utf8(d, uv);
eebe1485 42
b851fbc1
JH
43(which is equivalent to)
44
45 d = uvuni_to_utf8_flags(d, uv, 0);
46
eebe1485
SC
47is the recommended Unicode-aware way of saying
48
49 *(d++) = uv;
50
51=cut
52*/
53
dfe13c55 54U8 *
b851fbc1 55Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 56{
62961d2e 57 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
58 if (UNICODE_IS_SURROGATE(uv) &&
59 !(flags & UNICODE_ALLOW_SURROGATE))
507b9800 60 Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
61 else if (
62 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
63 !(flags & UNICODE_ALLOW_FDD0))
64 ||
65 ((uv & 0xFFFF) == 0xFFFE &&
66 !(flags & UNICODE_ALLOW_FFFE))
67 ||
68 ((uv & 0xFFFF) == 0xFFFF &&
69 !(flags & UNICODE_ALLOW_FFFF))) &&
70 /* UNICODE_ALLOW_SUPER includes
71 * FFFEs and FFFFs beyond 0x10FFFF. */
72 ((uv <= PERL_UNICODE_MAX) ||
73 !(flags & UNICODE_ALLOW_SUPER))
74 )
507b9800
JH
75 Perl_warner(aTHX_ WARN_UTF8,
76 "Unicode character 0x%04"UVxf" is illegal", uv);
77 }
c4d5f83a
NIS
78 if (UNI_IS_INVARIANT(uv)) {
79 *d++ = UTF_TO_NATIVE(uv);
a0ed51b3
LW
80 return d;
81 }
2d331972 82#if defined(EBCDIC)
1d72bdf6
NIS
83 else {
84 STRLEN len = UNISKIP(uv);
85 U8 *p = d+len-1;
86 while (p > d) {
87 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
88 uv >>= UTF_ACCUMULATION_SHIFT;
89 }
90 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
91 return d+len;
92 }
93#else /* Non loop style */
a0ed51b3
LW
94 if (uv < 0x800) {
95 *d++ = (( uv >> 6) | 0xc0);
96 *d++ = (( uv & 0x3f) | 0x80);
97 return d;
98 }
99 if (uv < 0x10000) {
100 *d++ = (( uv >> 12) | 0xe0);
101 *d++ = (((uv >> 6) & 0x3f) | 0x80);
102 *d++ = (( uv & 0x3f) | 0x80);
103 return d;
104 }
105 if (uv < 0x200000) {
106 *d++ = (( uv >> 18) | 0xf0);
107 *d++ = (((uv >> 12) & 0x3f) | 0x80);
108 *d++ = (((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (( uv & 0x3f) | 0x80);
110 return d;
111 }
112 if (uv < 0x4000000) {
113 *d++ = (( uv >> 24) | 0xf8);
114 *d++ = (((uv >> 18) & 0x3f) | 0x80);
115 *d++ = (((uv >> 12) & 0x3f) | 0x80);
116 *d++ = (((uv >> 6) & 0x3f) | 0x80);
117 *d++ = (( uv & 0x3f) | 0x80);
118 return d;
119 }
120 if (uv < 0x80000000) {
121 *d++ = (( uv >> 30) | 0xfc);
122 *d++ = (((uv >> 24) & 0x3f) | 0x80);
123 *d++ = (((uv >> 18) & 0x3f) | 0x80);
124 *d++ = (((uv >> 12) & 0x3f) | 0x80);
125 *d++ = (((uv >> 6) & 0x3f) | 0x80);
126 *d++ = (( uv & 0x3f) | 0x80);
127 return d;
128 }
6b8eaf93 129#ifdef HAS_QUAD
d7578b48 130 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
131#endif
132 {
133 *d++ = 0xfe; /* Can't match U+FEFF! */
134 *d++ = (((uv >> 30) & 0x3f) | 0x80);
135 *d++ = (((uv >> 24) & 0x3f) | 0x80);
136 *d++ = (((uv >> 18) & 0x3f) | 0x80);
137 *d++ = (((uv >> 12) & 0x3f) | 0x80);
138 *d++ = (((uv >> 6) & 0x3f) | 0x80);
139 *d++ = (( uv & 0x3f) | 0x80);
140 return d;
141 }
6b8eaf93 142#ifdef HAS_QUAD
a0ed51b3
LW
143 {
144 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b
GS
145 *d++ = 0x80; /* 6 Reserved bits */
146 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
147 *d++ = (((uv >> 54) & 0x3f) | 0x80);
148 *d++ = (((uv >> 48) & 0x3f) | 0x80);
149 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3
LW
150 *d++ = (((uv >> 36) & 0x3f) | 0x80);
151 *d++ = (((uv >> 30) & 0x3f) | 0x80);
152 *d++ = (((uv >> 24) & 0x3f) | 0x80);
153 *d++ = (((uv >> 18) & 0x3f) | 0x80);
154 *d++ = (((uv >> 12) & 0x3f) | 0x80);
155 *d++ = (((uv >> 6) & 0x3f) | 0x80);
156 *d++ = (( uv & 0x3f) | 0x80);
157 return d;
158 }
159#endif
1d72bdf6 160#endif /* Loop style */
a0ed51b3 161}
b851fbc1
JH
162
163U8 *
164Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
165{
166 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
167}
9041c2e3
NIS
168
169
170/*
eebe1485
SC
171=for apidoc A|STRLEN|is_utf8_char|U8 *s
172
5da9da9e 173Tests if some arbitrary number of bytes begins in a valid UTF-8
1d72bdf6 174character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
5da9da9e
JH
175The actual number of bytes in the UTF-8 character will be returned if
176it is valid, otherwise 0.
9041c2e3 177
37607a96
PK
178=cut
179*/
067a85ef 180STRLEN
386d01d6
GS
181Perl_is_utf8_char(pTHX_ U8 *s)
182{
183 U8 u = *s;
067a85ef
A
184 STRLEN slen, len;
185 UV uv, ouv;
386d01d6 186
1d72bdf6 187 if (UTF8_IS_INVARIANT(u))
386d01d6
GS
188 return 1;
189
60006e79 190 if (!UTF8_IS_START(u))
386d01d6
GS
191 return 0;
192
9f07fdcd 193 len = UTF8SKIP(s);
386d01d6 194
60006e79 195 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef
A
196 return 0;
197
386d01d6
GS
198 slen = len - 1;
199 s++;
6eb6869e 200 u &= UTF_START_MASK(len);
1d72bdf6 201 uv = u;
067a85ef 202 ouv = uv;
386d01d6 203 while (slen--) {
60006e79 204 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 205 return 0;
8850bf83 206 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 207 if (uv < ouv)
067a85ef
A
208 return 0;
209 ouv = uv;
386d01d6
GS
210 s++;
211 }
067a85ef 212
5bbb0b5a 213 if (UNISKIP(uv) < len)
067a85ef
A
214 return 0;
215
386d01d6
GS
216 return len;
217}
218
6662521e 219/*
eebe1485 220=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 221
5da9da9e
JH
222Returns true if first C<len> bytes of the given string form a valid UTF8
223string, false otherwise. Note that 'a valid UTF8 string' does not mean
224'a string that contains UTF8' because a valid ASCII string is a valid
225UTF8 string.
6662521e
GS
226
227=cut
228*/
229
8e84507e 230bool
6662521e
GS
231Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
232{
067a85ef 233 U8* x = s;
1aa99e6b 234 U8* send;
067a85ef
A
235 STRLEN c;
236
1aa99e6b 237 if (!len)
6cd5fe39 238 len = strlen((char *)s);
1aa99e6b
IH
239 send = s + len;
240
6662521e
GS
241 while (x < send) {
242 c = is_utf8_char(x);
067a85ef
A
243 if (!c)
244 return FALSE;
6662521e 245 x += c;
6662521e 246 }
60006e79
JH
247 if (x != send)
248 return FALSE;
067a85ef
A
249
250 return TRUE;
6662521e
GS
251}
252
67e989fb 253/*
9041c2e3 254=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 255
9041c2e3
NIS
256Bottom level UTF-8 decode routine.
257Returns the unicode code point value of the first character in the string C<s>
ba210ebe 258which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 259C<retlen> will be set to the length, in bytes, of that character.
67e989fb
JH
260
261If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880
JH
262is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
263it is assumed that the caller will raise a warning, and this function
28d3d195
JH
264will silently just set C<retlen> to C<-1> and return zero. If the
265C<flags> does not contain UTF8_CHECK_ONLY, warnings about
266malformations will be given, C<retlen> will be set to the expected
267length of the UTF-8 character in bytes, and zero will be returned.
268
269The C<flags> can also contain various flags to allow deviations from
270the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 271
9041c2e3
NIS
272Most code should use utf8_to_uvchr() rather than call this directly.
273
37607a96
PK
274=cut
275*/
67e989fb 276
a0ed51b3 277UV
37607a96 278Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 279{
097fb8e2 280 U8 *s0 = s;
9c5ffd7c 281 UV uv = *s, ouv = 0;
ba210ebe
JH
282 STRLEN len = 1;
283 bool dowarn = ckWARN_d(WARN_UTF8);
4f849cb6 284 UV startbyte = *s;
ba210ebe 285 STRLEN expectlen = 0;
a0dbb045
JH
286 U32 warning = 0;
287
288/* This list is a superset of the UTF8_ALLOW_XXX. */
289
290#define UTF8_WARN_EMPTY 1
291#define UTF8_WARN_CONTINUATION 2
292#define UTF8_WARN_NON_CONTINUATION 3
293#define UTF8_WARN_FE_FF 4
294#define UTF8_WARN_SHORT 5
295#define UTF8_WARN_OVERFLOW 6
296#define UTF8_WARN_SURROGATE 7
297#define UTF8_WARN_BOM 8
298#define UTF8_WARN_LONG 9
299#define UTF8_WARN_FFFF 10
300
301 if (curlen == 0 &&
302 !(flags & UTF8_ALLOW_EMPTY)) {
303 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
304 goto malformed;
305 }
306
1d72bdf6 307 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
308 if (retlen)
309 *retlen = 1;
c4d5f83a 310 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 311 }
67e989fb 312
421a8bf2 313 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 314 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 315 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
316 goto malformed;
317 }
318
421a8bf2 319 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 320 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 321 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
322 goto malformed;
323 }
9041c2e3 324
1d72bdf6 325#ifdef EBCDIC
75383841 326 uv = NATIVE_TO_UTF(uv);
1d72bdf6 327#else
fcc8fcf6
JH
328 if ((uv == 0xfe || uv == 0xff) &&
329 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 330 warning = UTF8_WARN_FE_FF;
ba210ebe 331 goto malformed;
a0ed51b3 332 }
1d72bdf6
NIS
333#endif
334
ba210ebe
JH
335 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
336 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
337 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
338 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
339#ifdef EBCDIC
340 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
341 else { len = 7; uv &= 0x01; }
342#else
ba210ebe
JH
343 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
344 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
345 else { len = 13; uv = 0; } /* whoa! */
346#endif
347
a0ed51b3
LW
348 if (retlen)
349 *retlen = len;
9041c2e3 350
ba210ebe
JH
351 expectlen = len;
352
fcc8fcf6
JH
353 if ((curlen < expectlen) &&
354 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 355 warning = UTF8_WARN_SHORT;
ba210ebe
JH
356 goto malformed;
357 }
358
359 len--;
a0ed51b3 360 s++;
ba210ebe
JH
361 ouv = uv;
362
a0ed51b3 363 while (len--) {
421a8bf2
JH
364 if (!UTF8_IS_CONTINUATION(*s) &&
365 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
366 s--;
367 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 368 goto malformed;
a0ed51b3
LW
369 }
370 else
8850bf83 371 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
372 if (!(uv > ouv)) {
373 /* These cannot be allowed. */
374 if (uv == ouv) {
375 if (!(flags & UTF8_ALLOW_LONG)) {
376 warning = UTF8_WARN_LONG;
377 goto malformed;
378 }
379 }
380 else { /* uv < ouv */
381 /* This cannot be allowed. */
382 warning = UTF8_WARN_OVERFLOW;
383 goto malformed;
384 }
ba210ebe
JH
385 }
386 s++;
387 ouv = uv;
388 }
389
421a8bf2 390 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 391 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 392 warning = UTF8_WARN_SURROGATE;
ba210ebe 393 goto malformed;
421a8bf2 394 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 395 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 396 warning = UTF8_WARN_BOM;
ba210ebe 397 goto malformed;
fcc8fcf6
JH
398 } else if ((expectlen > UNISKIP(uv)) &&
399 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 400 warning = UTF8_WARN_LONG;
ba210ebe 401 goto malformed;
421a8bf2 402 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 403 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 404 warning = UTF8_WARN_FFFF;
a9917092 405 goto malformed;
a0ed51b3 406 }
ba210ebe 407
a0ed51b3 408 return uv;
ba210ebe
JH
409
410malformed:
411
fcc8fcf6 412 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 413 if (retlen)
cc366d4b 414 *retlen = -1;
ba210ebe
JH
415 return 0;
416 }
417
a0dbb045
JH
418 if (dowarn) {
419 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
420
421 switch (warning) {
422 case 0: /* Intentionally empty. */ break;
423 case UTF8_WARN_EMPTY:
424 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
425 break;
426 case UTF8_WARN_CONTINUATION:
097fb8e2 427 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
428 break;
429 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
430 if (s == s0)
431 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
432 (UV)s[1], startbyte);
433 else
434 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
435 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
436
a0dbb045
JH
437 break;
438 case UTF8_WARN_FE_FF:
439 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
440 break;
441 case UTF8_WARN_SHORT:
097fb8e2
JH
442 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
443 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
b31f83c2 444 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
445 break;
446 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
447 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
448 ouv, *s, startbyte);
a0dbb045
JH
449 break;
450 case UTF8_WARN_SURROGATE:
451 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
452 break;
453 case UTF8_WARN_BOM:
454 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
455 break;
456 case UTF8_WARN_LONG:
097fb8e2
JH
457 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
458 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
459 break;
460 case UTF8_WARN_FFFF:
461 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
462 break;
463 default:
464 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
465 break;
466 }
467
468 if (warning) {
469 char *s = SvPVX(sv);
470
471 if (PL_op)
472 Perl_warner(aTHX_ WARN_UTF8,
53e06cf0 473 "%s in %s", s, OP_DESC(PL_op));
a0dbb045
JH
474 else
475 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
476 }
477 }
478
ba210ebe 479 if (retlen)
28d3d195 480 *retlen = expectlen ? expectlen : len;
ba210ebe 481
28d3d195 482 return 0;
a0ed51b3
LW
483}
484
8e84507e 485/*
37607a96 486=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3
NIS
487
488Returns the native character value of the first character in the string C<s>
489which is assumed to be in UTF8 encoding; C<retlen> will be set to the
490length, in bytes, of that character.
491
492If C<s> does not point to a well-formed UTF8 character, zero is
493returned and retlen is set, if possible, to -1.
494
495=cut
496*/
497
498UV
37607a96 499Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3
NIS
500{
501 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
502}
503
504/*
37607a96 505=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3
NIS
506
507Returns the Unicode code point of the first character in the string C<s>
508which is assumed to be in UTF8 encoding; C<retlen> will be set to the
509length, in bytes, of that character.
510
511This function should only be used when returned UV is considered
512an index into the Unicode semantic tables (e.g. swashes).
513
ba210ebe
JH
514If C<s> does not point to a well-formed UTF8 character, zero is
515returned and retlen is set, if possible, to -1.
8e84507e
NIS
516
517=cut
518*/
519
520UV
37607a96 521Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 522{
9041c2e3
NIS
523 /* Call the low level routine asking for checks */
524 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e
NIS
525}
526
b76347f2 527/*
37607a96 528=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2
JH
529
530Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
531Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
532up past C<e>, croaks.
b76347f2
JH
533
534=cut
535*/
536
537STRLEN
37607a96 538Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2
JH
539{
540 STRLEN len = 0;
541
8850bf83
JH
542 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
543 * the bitops (especially ~) can create illegal UTF-8.
544 * In other words: in Perl UTF-8 is not just for Unicode. */
545
b76347f2 546 if (e < s)
02eb7b47 547 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
b76347f2 548 while (s < e) {
02eb7b47 549 U8 t = UTF8SKIP(s);
b76347f2
JH
550
551 if (e - s < t)
0064a8a9 552 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
b76347f2
JH
553 s += t;
554 len++;
555 }
556
557 return len;
558}
559
b06226ff 560/*
eebe1485 561=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff
JH
562
563Returns the number of UTF8 characters between the UTF-8 pointers C<a>
564and C<b>.
565
566WARNING: use only if you *know* that the pointers point inside the
567same UTF-8 buffer.
568
37607a96
PK
569=cut
570*/
a0ed51b3 571
02eb7b47 572IV
864dbfa3 573Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 574{
02eb7b47
JH
575 IV off = 0;
576
8850bf83
JH
577 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
578 * the bitops (especially ~) can create illegal UTF-8.
579 * In other words: in Perl UTF-8 is not just for Unicode. */
580
a0ed51b3
LW
581 if (a < b) {
582 while (a < b) {
02eb7b47
JH
583 U8 c = UTF8SKIP(a);
584
585 if (b - a < c)
586 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
587 a += c;
a0ed51b3
LW
588 off--;
589 }
590 }
591 else {
592 while (b < a) {
02eb7b47
JH
593 U8 c = UTF8SKIP(b);
594
595 if (a - b < c)
596 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
597 b += c;
a0ed51b3
LW
598 off++;
599 }
600 }
02eb7b47 601
a0ed51b3
LW
602 return off;
603}
604
b06226ff 605/*
37607a96 606=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 607
8850bf83
JH
608Return the UTF-8 pointer C<s> displaced by C<off> characters, either
609forward or backward.
b06226ff
JH
610
611WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
612the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
613on the first byte of character or just after the last byte of a character.
b06226ff 614
37607a96
PK
615=cut
616*/
a0ed51b3
LW
617
618U8 *
864dbfa3 619Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 620{
8850bf83
JH
621 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
622 * the bitops (especially ~) can create illegal UTF-8.
623 * In other words: in Perl UTF-8 is not just for Unicode. */
624
a0ed51b3
LW
625 if (off >= 0) {
626 while (off--)
627 s += UTF8SKIP(s);
628 }
629 else {
630 while (off++) {
631 s--;
8850bf83
JH
632 while (UTF8_IS_CONTINUATION(*s))
633 s--;
a0ed51b3
LW
634 }
635 }
636 return s;
637}
638
6940069f 639/*
eebe1485 640=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 641
246fae53
MG
642Converts a string C<s> of length C<len> from UTF8 into byte encoding.
643Unlike C<bytes_to_utf8>, this over-writes the original string, and
644updates len to contain the new length.
67e989fb 645Returns zero on failure, setting C<len> to -1.
6940069f
GS
646
647=cut
648*/
649
650U8 *
37607a96 651Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 652{
6940069f
GS
653 U8 *send;
654 U8 *d;
dcad2880 655 U8 *save = s;
246fae53
MG
656
657 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880
JH
658 for (send = s + *len; s < send; ) {
659 U8 c = *s++;
660
1d72bdf6
NIS
661 if (!UTF8_IS_INVARIANT(c) &&
662 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
663 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
664 *len = -1;
665 return 0;
666 }
246fae53 667 }
dcad2880
JH
668
669 d = s = save;
6940069f 670 while (s < send) {
ed646e6e 671 STRLEN ulen;
9041c2e3 672 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 673 s += ulen;
6940069f
GS
674 }
675 *d = '\0';
246fae53 676 *len = d - save;
6940069f
GS
677 return save;
678}
679
680/*
f9a63242
JH
681=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
682
683Converts a string C<s> of length C<len> from UTF8 into byte encoding.
684Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
685the newly-created string, and updates C<len> to contain the new
686length. Returns the original string if no conversion occurs, C<len>
687is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6880 if C<s> is converted or contains all 7bit characters.
f9a63242 689
37607a96
PK
690=cut
691*/
f9a63242
JH
692
693U8 *
37607a96 694Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 695{
f9a63242
JH
696 U8 *d;
697 U8 *start = s;
db42d148 698 U8 *send;
f9a63242
JH
699 I32 count = 0;
700
701 if (!*is_utf8)
702 return start;
703
ef9edfd0 704 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242
JH
705 for (send = s + *len; s < send;) {
706 U8 c = *s++;
1d72bdf6 707 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
708 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
709 (c = *s++) && UTF8_IS_CONTINUATION(c))
710 count++;
711 else
f9a63242 712 return start;
db42d148 713 }
f9a63242
JH
714 }
715
716 *is_utf8 = 0;
717
f9a63242 718 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 719 s = start; start = d;
f9a63242
JH
720 while (s < send) {
721 U8 c = *s++;
c4d5f83a
NIS
722 if (!UTF8_IS_INVARIANT(c)) {
723 /* Then it is two-byte encoded */
724 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
725 c = ASCII_TO_NATIVE(c);
726 }
727 *d++ = c;
f9a63242
JH
728 }
729 *d = '\0';
730 *len = d - start;
731 return start;
732}
733
734/*
eebe1485 735=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f
GS
736
737Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e
GS
738Returns a pointer to the newly-created string, and sets C<len> to
739reflect the new length.
6940069f 740
497711e7 741=cut
6940069f
GS
742*/
743
744U8*
37607a96 745Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 746{
6940069f
GS
747 U8 *send;
748 U8 *d;
749 U8 *dst;
6662521e 750 send = s + (*len);
6940069f 751
6662521e 752 Newz(801, d, (*len) * 2 + 1, U8);
6940069f
GS
753 dst = d;
754
755 while (s < send) {
db42d148 756 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a
NIS
757 if (UNI_IS_INVARIANT(uv))
758 *d++ = UTF_TO_NATIVE(uv);
6940069f 759 else {
90f44359
JH
760 *d++ = UTF8_EIGHT_BIT_HI(uv);
761 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
762 }
763 }
764 *d = '\0';
6662521e 765 *len = d-dst;
6940069f
GS
766 return dst;
767}
768
a0ed51b3 769/*
dea0fc0b 770 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
771 *
772 * Destination must be pre-extended to 3/2 source. Do not use in-place.
773 * We optimize for native, for obvious reasons. */
774
775U8*
dea0fc0b 776Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 777{
dea0fc0b
JH
778 U8* pend;
779 U8* dstart = d;
780
781 if (bytelen & 1)
a7867d0a 782 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b
JH
783
784 pend = p + bytelen;
785
a0ed51b3 786 while (p < pend) {
dea0fc0b
JH
787 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
788 p += 2;
a0ed51b3
LW
789 if (uv < 0x80) {
790 *d++ = uv;
791 continue;
792 }
793 if (uv < 0x800) {
794 *d++ = (( uv >> 6) | 0xc0);
795 *d++ = (( uv & 0x3f) | 0x80);
796 continue;
797 }
798 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b
JH
799 UV low = *p++;
800 if (low < 0xdc00 || low >= 0xdfff)
801 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
802 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
803 }
804 if (uv < 0x10000) {
805 *d++ = (( uv >> 12) | 0xe0);
806 *d++ = (((uv >> 6) & 0x3f) | 0x80);
807 *d++ = (( uv & 0x3f) | 0x80);
808 continue;
809 }
810 else {
811 *d++ = (( uv >> 18) | 0xf0);
812 *d++ = (((uv >> 12) & 0x3f) | 0x80);
813 *d++ = (((uv >> 6) & 0x3f) | 0x80);
814 *d++ = (( uv & 0x3f) | 0x80);
815 continue;
816 }
817 }
dea0fc0b 818 *newlen = d - dstart;
a0ed51b3
LW
819 return d;
820}
821
822/* Note: this one is slightly destructive of the source. */
823
824U8*
dea0fc0b 825Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
826{
827 U8* s = (U8*)p;
828 U8* send = s + bytelen;
829 while (s < send) {
830 U8 tmp = s[0];
831 s[0] = s[1];
832 s[1] = tmp;
833 s += 2;
834 }
dea0fc0b 835 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
836}
837
838/* for now these are all defined (inefficiently) in terms of the utf8 versions */
839
840bool
84afefe6 841Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 842{
ad391ad9 843 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 844 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
845 return is_utf8_alnum(tmpbuf);
846}
847
848bool
84afefe6 849Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 850{
ad391ad9 851 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 852 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
853 return is_utf8_alnumc(tmpbuf);
854}
855
856bool
84afefe6 857Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 858{
ad391ad9 859 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 860 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
861 return is_utf8_idfirst(tmpbuf);
862}
863
864bool
84afefe6 865Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 866{
ad391ad9 867 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 868 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
869 return is_utf8_alpha(tmpbuf);
870}
871
872bool
84afefe6 873Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 874{
ad391ad9 875 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 876 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
877 return is_utf8_ascii(tmpbuf);
878}
879
880bool
84afefe6 881Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 882{
ad391ad9 883 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 884 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
885 return is_utf8_space(tmpbuf);
886}
887
888bool
84afefe6 889Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 890{
ad391ad9 891 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 892 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
893 return is_utf8_digit(tmpbuf);
894}
895
896bool
84afefe6 897Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 898{
ad391ad9 899 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 900 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
901 return is_utf8_upper(tmpbuf);
902}
903
904bool
84afefe6 905Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 906{
ad391ad9 907 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 908 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
909 return is_utf8_lower(tmpbuf);
910}
911
912bool
84afefe6 913Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 914{
ad391ad9 915 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 916 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
917 return is_utf8_cntrl(tmpbuf);
918}
919
920bool
84afefe6 921Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 922{
ad391ad9 923 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 924 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
925 return is_utf8_graph(tmpbuf);
926}
927
928bool
84afefe6 929Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 930{
ad391ad9 931 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 932 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
933 return is_utf8_print(tmpbuf);
934}
935
b8c5462f 936bool
84afefe6 937Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 938{
ad391ad9 939 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 940 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
941 return is_utf8_punct(tmpbuf);
942}
943
4d61ec05 944bool
84afefe6 945Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 946{
e7ae6809 947 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
230880c1 948 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
949 return is_utf8_xdigit(tmpbuf);
950}
951
84afefe6
JH
952UV
953Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 954{
0ebc6274
JH
955 uvchr_to_utf8(p, c);
956 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
957}
958
84afefe6
JH
959UV
960Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 961{
0ebc6274
JH
962 uvchr_to_utf8(p, c);
963 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
964}
965
84afefe6
JH
966UV
967Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 968{
0ebc6274
JH
969 uvchr_to_utf8(p, c);
970 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
971}
972
84afefe6
JH
973UV
974Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
975{
0ebc6274
JH
976 uvchr_to_utf8(p, c);
977 return to_utf8_fold(p, p, lenp);
84afefe6
JH
978}
979
a0ed51b3
LW
980/* for now these all assume no locale info available for Unicode > 255 */
981
982bool
84afefe6 983Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
984{
985 return is_uni_alnum(c); /* XXX no locale support yet */
986}
987
988bool
84afefe6 989Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
990{
991 return is_uni_alnumc(c); /* XXX no locale support yet */
992}
993
994bool
84afefe6 995Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
996{
997 return is_uni_idfirst(c); /* XXX no locale support yet */
998}
999
1000bool
84afefe6 1001Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1002{
1003 return is_uni_alpha(c); /* XXX no locale support yet */
1004}
1005
1006bool
84afefe6 1007Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1008{
1009 return is_uni_ascii(c); /* XXX no locale support yet */
1010}
1011
1012bool
84afefe6 1013Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1014{
1015 return is_uni_space(c); /* XXX no locale support yet */
1016}
1017
1018bool
84afefe6 1019Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1020{
1021 return is_uni_digit(c); /* XXX no locale support yet */
1022}
1023
1024bool
84afefe6 1025Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1026{
1027 return is_uni_upper(c); /* XXX no locale support yet */
1028}
1029
1030bool
84afefe6 1031Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1032{
1033 return is_uni_lower(c); /* XXX no locale support yet */
1034}
1035
1036bool
84afefe6 1037Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1038{
1039 return is_uni_cntrl(c); /* XXX no locale support yet */
1040}
1041
1042bool
84afefe6 1043Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1044{
1045 return is_uni_graph(c); /* XXX no locale support yet */
1046}
1047
1048bool
84afefe6 1049Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1050{
1051 return is_uni_print(c); /* XXX no locale support yet */
1052}
1053
b8c5462f 1054bool
84afefe6 1055Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1056{
1057 return is_uni_punct(c); /* XXX no locale support yet */
1058}
1059
4d61ec05 1060bool
84afefe6 1061Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1062{
1063 return is_uni_xdigit(c); /* XXX no locale support yet */
1064}
1065
b7ac61fa
JH
1066U32
1067Perl_to_uni_upper_lc(pTHX_ U32 c)
1068{
ee099d14
JH
1069 /* XXX returns only the first character -- do not use XXX */
1070 /* XXX no locale support yet */
1071 STRLEN len;
1072 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1073 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1074}
1075
1076U32
1077Perl_to_uni_title_lc(pTHX_ U32 c)
1078{
ee099d14
JH
1079 /* XXX returns only the first character XXX -- do not use XXX */
1080 /* XXX no locale support yet */
1081 STRLEN len;
1082 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1083 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1084}
1085
1086U32
1087Perl_to_uni_lower_lc(pTHX_ U32 c)
1088{
ee099d14
JH
1089 /* XXX returns only the first character -- do not use XXX */
1090 /* XXX no locale support yet */
1091 STRLEN len;
1092 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1093 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1094}
1095
a0ed51b3 1096bool
864dbfa3 1097Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1098{
386d01d6
GS
1099 if (!is_utf8_char(p))
1100 return FALSE;
a0ed51b3 1101 if (!PL_utf8_alnum)
289d4f09
ML
1102 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1103 * descendant of isalnum(3), in other words, it doesn't
1104 * contain the '_'. --jhi */
1105 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
3568d838 1106 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3
LW
1107/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1108#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1109 if (!PL_utf8_alnum)
1110 PL_utf8_alnum = swash_init("utf8", "",
1111 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1112 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3
LW
1113#endif
1114}
1115
1116bool
b8c5462f
JH
1117Perl_is_utf8_alnumc(pTHX_ U8 *p)
1118{
386d01d6
GS
1119 if (!is_utf8_char(p))
1120 return FALSE;
b8c5462f
JH
1121 if (!PL_utf8_alnum)
1122 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
3568d838 1123 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f
JH
1124/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1125#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1126 if (!PL_utf8_alnum)
1127 PL_utf8_alnum = swash_init("utf8", "",
1128 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1129 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f
JH
1130#endif
1131}
1132
1133bool
864dbfa3 1134Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3
LW
1135{
1136 return *p == '_' || is_utf8_alpha(p);
1137}
1138
1139bool
864dbfa3 1140Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1141{
386d01d6
GS
1142 if (!is_utf8_char(p))
1143 return FALSE;
a0ed51b3 1144 if (!PL_utf8_alpha)
e24b16f9 1145 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
3568d838 1146 return swash_fetch(PL_utf8_alpha, p, TRUE);
a0ed51b3
LW
1147}
1148
1149bool
b8c5462f
JH
1150Perl_is_utf8_ascii(pTHX_ U8 *p)
1151{
386d01d6
GS
1152 if (!is_utf8_char(p))
1153 return FALSE;
b8c5462f
JH
1154 if (!PL_utf8_ascii)
1155 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
3568d838 1156 return swash_fetch(PL_utf8_ascii, p, TRUE);
b8c5462f
JH
1157}
1158
1159bool
864dbfa3 1160Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1161{
386d01d6
GS
1162 if (!is_utf8_char(p))
1163 return FALSE;
a0ed51b3 1164 if (!PL_utf8_space)
3bec3564 1165 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
3568d838 1166 return swash_fetch(PL_utf8_space, p, TRUE);
a0ed51b3
LW
1167}
1168
1169bool
864dbfa3 1170Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1171{
386d01d6
GS
1172 if (!is_utf8_char(p))
1173 return FALSE;
a0ed51b3 1174 if (!PL_utf8_digit)
e24b16f9 1175 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
3568d838 1176 return swash_fetch(PL_utf8_digit, p, TRUE);
a0ed51b3
LW
1177}
1178
1179bool
864dbfa3 1180Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1181{
386d01d6
GS
1182 if (!is_utf8_char(p))
1183 return FALSE;
a0ed51b3 1184 if (!PL_utf8_upper)
e24b16f9 1185 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
3568d838 1186 return swash_fetch(PL_utf8_upper, p, TRUE);
a0ed51b3
LW
1187}
1188
1189bool
864dbfa3 1190Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1191{
386d01d6
GS
1192 if (!is_utf8_char(p))
1193 return FALSE;
a0ed51b3 1194 if (!PL_utf8_lower)
e24b16f9 1195 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
3568d838 1196 return swash_fetch(PL_utf8_lower, p, TRUE);
a0ed51b3
LW
1197}
1198
1199bool
b8c5462f
JH
1200Perl_is_utf8_cntrl(pTHX_ U8 *p)
1201{
386d01d6
GS
1202 if (!is_utf8_char(p))
1203 return FALSE;
b8c5462f
JH
1204 if (!PL_utf8_cntrl)
1205 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
3568d838 1206 return swash_fetch(PL_utf8_cntrl, p, TRUE);
b8c5462f
JH
1207}
1208
1209bool
1210Perl_is_utf8_graph(pTHX_ U8 *p)
1211{
386d01d6
GS
1212 if (!is_utf8_char(p))
1213 return FALSE;
b8c5462f
JH
1214 if (!PL_utf8_graph)
1215 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
3568d838 1216 return swash_fetch(PL_utf8_graph, p, TRUE);
b8c5462f
JH
1217}
1218
1219bool
864dbfa3 1220Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1221{
386d01d6
GS
1222 if (!is_utf8_char(p))
1223 return FALSE;
a0ed51b3 1224 if (!PL_utf8_print)
e24b16f9 1225 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
3568d838 1226 return swash_fetch(PL_utf8_print, p, TRUE);
a0ed51b3
LW
1227}
1228
1229bool
b8c5462f
JH
1230Perl_is_utf8_punct(pTHX_ U8 *p)
1231{
386d01d6
GS
1232 if (!is_utf8_char(p))
1233 return FALSE;
b8c5462f
JH
1234 if (!PL_utf8_punct)
1235 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
3568d838 1236 return swash_fetch(PL_utf8_punct, p, TRUE);
b8c5462f
JH
1237}
1238
1239bool
1240Perl_is_utf8_xdigit(pTHX_ U8 *p)
1241{
386d01d6
GS
1242 if (!is_utf8_char(p))
1243 return FALSE;
b8c5462f
JH
1244 if (!PL_utf8_xdigit)
1245 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
3568d838 1246 return swash_fetch(PL_utf8_xdigit, p, TRUE);
b8c5462f
JH
1247}
1248
1249bool
864dbfa3 1250Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1251{
386d01d6
GS
1252 if (!is_utf8_char(p))
1253 return FALSE;
a0ed51b3 1254 if (!PL_utf8_mark)
e24b16f9 1255 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
3568d838 1256 return swash_fetch(PL_utf8_mark, p, TRUE);
a0ed51b3
LW
1257}
1258
6b5c0936
JH
1259/*
1260=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1261
1262The "p" contains the pointer to the UTF-8 string encoding
1263the character that is being converted.
1264
1265The "ustrp" is a pointer to the character buffer to put the
1266conversion result to. The "lenp" is a pointer to the length
1267of the result.
1268
0134edef 1269The "swashp" is a pointer to the swash to use.
6b5c0936 1270
0134edef
JH
1271Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1272and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1273but not always, a multicharacter mapping), is tried first.
6b5c0936 1274
0134edef
JH
1275The "special" is a string like "utf8::ToSpecLower", which means the
1276hash %utf8::ToSpecLower. The access to the hash is through
1277Perl_to_utf8_case().
6b5c0936 1278
0134edef
JH
1279The "normal" is a string like "ToLower" which means the swash
1280%utf8::ToLower.
1281
1282=cut */
6b5c0936 1283
2104c8d9 1284UV
a6872d42 1285Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
a0ed51b3 1286{
0134edef 1287 UV uv0, uv1;
2f9475ad 1288 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
0134edef 1289 STRLEN len = 0;
a0ed51b3 1290
1feea2c7
JH
1291 uv0 = utf8_to_uvchr(p, 0);
1292 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1293 * are necessary in EBCDIC, they are redundant no-ops
1294 * in ASCII-ish platforms, and hopefully optimized away. */
1295 uv1 = NATIVE_TO_UNI(uv0);
1296 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1297
1298 if (!*swashp) /* load on-demand */
1299 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1300
1301 if (special) {
1302 /* It might be "special" (sometimes, but not always,
2a37f04d 1303 * a multicharacter mapping) */
983ffd37
JH
1304 HV *hv;
1305 SV *keysv;
1306 HE *he;
2a37f04d 1307 SV *val;
2f9475ad 1308
983ffd37 1309 if ((hv = get_hv(special, FALSE)) &&
1feea2c7 1310 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
2a37f04d
JH
1311 (he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1312 (val = HeVAL(he))) {
47654450 1313 char *s;
47654450 1314
2a37f04d 1315 s = SvPV(val, len);
47654450
JH
1316 if (len == 1)
1317 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1318 else {
2f9475ad
JH
1319#ifdef EBCDIC
1320 /* If we have EBCDIC we need to remap the characters
1321 * since any characters in the low 256 are Unicode
1322 * code points, not EBCDIC. */
7cda7a3d 1323 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1324
1325 d = tmpbuf;
1326 if (SvUTF8(val)) {
1327 STRLEN tlen = 0;
1328
1329 while (t < tend) {
1330 UV c = utf8_to_uvchr(t, &tlen);
1331 if (tlen > 0) {
1332 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1333 t += tlen;
1334 }
1335 else
1336 break;
1337 }
1338 }
1339 else {
36fec512
JH
1340 while (t < tend) {
1341 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1342 t++;
1343 }
2f9475ad
JH
1344 }
1345 len = d - tmpbuf;
1346 Copy(tmpbuf, ustrp, len, U8);
1347#else
d2dcd0fb 1348 Copy(s, ustrp, len, U8);
2f9475ad 1349#endif
29e98929 1350 }
983ffd37 1351 }
0134edef
JH
1352 }
1353
1354 if (!len && *swashp) {
1355 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1356
1357 if (uv2) {
1358 /* It was "normal" (a single character mapping). */
1359 UV uv3 = UNI_TO_NATIVE(uv2);
1360
e9101d72 1361 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1362 }
1363 }
1feea2c7 1364
0134edef
JH
1365 if (!len) /* Neither: just copy. */
1366 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1367
2a37f04d
JH
1368 if (lenp)
1369 *lenp = len;
1370
0134edef 1371 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1372}
1373
d3e79532
JH
1374/*
1375=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1376
1377Convert the UTF-8 encoded character at p to its uppercase version and
1378store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1379that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1380uppercase version may be longer than the original character (up to two
1381characters).
1382
1383The first character of the uppercased version is returned
1384(but note, as explained above, that there may be more.)
1385
1386=cut */
1387
2104c8d9 1388UV
983ffd37 1389Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1390{
983ffd37 1391 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1392 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1393}
a0ed51b3 1394
d3e79532
JH
1395/*
1396=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1397
1398Convert the UTF-8 encoded character at p to its titlecase version and
1399store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1400that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1401titlecase version may be longer than the original character (up to two
1402characters).
1403
1404The first character of the titlecased version is returned
1405(but note, as explained above, that there may be more.)
1406
1407=cut */
1408
983ffd37
JH
1409UV
1410Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1411{
1412 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1413 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1414}
1415
d3e79532
JH
1416/*
1417=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1418
1419Convert the UTF-8 encoded character at p to its lowercase version and
1420store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1421that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1422lowercase version may be longer than the original character (up to two
1423characters).
1424
1425The first character of the lowercased version is returned
1426(but note, as explained above, that there may be more.)
1427
1428=cut */
1429
2104c8d9 1430UV
a2a2844f 1431Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1432{
983ffd37 1433 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1434 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1435}
1436
d3e79532
JH
1437/*
1438=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1439
1440Convert the UTF-8 encoded character at p to its foldcase version and
1441store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1442that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1443foldcase version may be longer than the original character (up to
1444three characters).
1445
1446The first character of the foldcased version is returned
1447(but note, as explained above, that there may be more.)
1448
1449=cut */
1450
b4e400f9
JH
1451UV
1452Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1453{
1454 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1455 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1456}
1457
1458/* a "swash" is a swatch hash */
1459
1460SV*
864dbfa3 1461Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3
LW
1462{
1463 SV* retval;
bf1fed83 1464 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1465 dSP;
1b026014 1466 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
f8be5cf0 1467 SV* errsv_save;
ce3b816e 1468
1b026014 1469 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1470 ENTER;
f8be5cf0 1471 errsv_save = newSVsv(ERRSV);
ce3b816e 1472 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
f8be5cf0
JH
1473 if (!SvTRUE(ERRSV))
1474 sv_setsv(ERRSV, errsv_save);
1475 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1476 LEAVE;
1477 }
1478 SPAGAIN;
a0ed51b3
LW
1479 PUSHSTACKi(PERLSI_MAGIC);
1480 PUSHMARK(SP);
1481 EXTEND(SP,5);
1482 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1483 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1484 PUSHs(listsv);
1485 PUSHs(sv_2mortal(newSViv(minbits)));
1486 PUSHs(sv_2mortal(newSViv(none)));
1487 PUTBACK;
1488 ENTER;
1489 SAVEI32(PL_hints);
1490 PL_hints = 0;
1491 save_re_context();
bf1fed83
JH
1492 if (PL_curcop == &PL_compiling)
1493 /* XXX ought to be handled by lex_start */
1494 sv_setpv(tokenbufsv, PL_tokenbuf);
f8be5cf0 1495 errsv_save = newSVsv(ERRSV);
864dbfa3 1496 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1497 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1498 else
e24b16f9 1499 retval = &PL_sv_undef;
f8be5cf0
JH
1500 if (!SvTRUE(ERRSV))
1501 sv_setsv(ERRSV, errsv_save);
1502 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1503 LEAVE;
1504 POPSTACK;
e24b16f9 1505 if (PL_curcop == &PL_compiling) {
bf1fed83
JH
1506 STRLEN len;
1507 char* pv = SvPV(tokenbufsv, len);
1508
1509 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1510 PL_curcop->op_private = PL_hints;
a0ed51b3
LW
1511 }
1512 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 1513 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3
LW
1514 return retval;
1515}
1516
035d37be
JH
1517
1518/* This API is wrong for special case conversions since we may need to
1519 * return several Unicode characters for a single Unicode character
1520 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1521 * the lower-level routine, and it is similarly broken for returning
1522 * multiple values. --jhi */
a0ed51b3 1523UV
3568d838 1524Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3
LW
1525{
1526 HV* hv = (HV*)SvRV(sv);
3568d838
JH
1527 U32 klen;
1528 U32 off;
a0ed51b3 1529 STRLEN slen;
7d85a32c 1530 STRLEN needents;
4ea42e7f 1531 U8 *tmps = NULL;
a0ed51b3
LW
1532 U32 bit;
1533 SV *retval;
3568d838
JH
1534 U8 tmputf8[2];
1535 UV c = NATIVE_TO_ASCII(*ptr);
1536
1537 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1538 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1539 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1540 ptr = tmputf8;
1541 }
1542 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1543 * then the "swatch" is a vec() for al the chars which start
1544 * with 0xAA..0xYY
1545 * So the key in the hash (klen) is length of encoded char -1
1546 */
1547 klen = UTF8SKIP(ptr) - 1;
1548 off = ptr[klen];
a0ed51b3 1549
7d85a32c
JH
1550 if (klen == 0)
1551 {
1552 /* If char in invariant then swatch is for all the invariant chars
1553 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1554 */
1555 needents = UTF_CONTINUATION_MARK;
1556 off = NATIVE_TO_UTF(ptr[klen]);
1557 }
1558 else
1559 {
1560 /* If char is encoded then swatch is for the prefix */
1561 needents = (1 << UTF_ACCUMULATION_SHIFT);
1562 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1563 }
1564
a0ed51b3
LW
1565 /*
1566 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1567 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1568 * it's nothing to sniff at.) Pity we usually come through at least
1569 * two function calls to get here...
1570 *
1571 * NB: this code assumes that swatches are never modified, once generated!
1572 */
1573
3568d838 1574 if (hv == PL_last_swash_hv &&
a0ed51b3 1575 klen == PL_last_swash_klen &&
3568d838 1576 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1577 {
1578 tmps = PL_last_swash_tmps;
1579 slen = PL_last_swash_slen;
1580 }
1581 else {
1582 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1583 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3
LW
1584
1585 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1586 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1587 dSP;
2b9d42f0
NIS
1588 /* We use utf8n_to_uvuni() as we want an index into
1589 Unicode tables, not a native character number.
1590 */
1591 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
f8be5cf0 1592 SV *errsv_save;
a0ed51b3
LW
1593 ENTER;
1594 SAVETMPS;
1595 save_re_context();
1596 PUSHSTACKi(PERLSI_MAGIC);
1597 PUSHMARK(SP);
1598 EXTEND(SP,3);
1599 PUSHs((SV*)sv);
ffbc6a93 1600 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838
JH
1601 PUSHs(sv_2mortal(newSViv((klen) ?
1602 (code_point & ~(needents - 1)) : 0)));
a0ed51b3
LW
1603 PUSHs(sv_2mortal(newSViv(needents)));
1604 PUTBACK;
f8be5cf0 1605 errsv_save = newSVsv(ERRSV);
864dbfa3 1606 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1607 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1608 else
e24b16f9 1609 retval = &PL_sv_undef;
f8be5cf0
JH
1610 if (!SvTRUE(ERRSV))
1611 sv_setsv(ERRSV, errsv_save);
1612 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1613 POPSTACK;
1614 FREETMPS;
1615 LEAVE;
e24b16f9
GS
1616 if (PL_curcop == &PL_compiling)
1617 PL_curcop->op_private = PL_hints;
a0ed51b3 1618
dfe13c55 1619 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1620
7d85a32c 1621 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1622 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1623 }
1624
1625 PL_last_swash_hv = hv;
1626 PL_last_swash_klen = klen;
1627 PL_last_swash_tmps = tmps;
1628 PL_last_swash_slen = slen;
1629 if (klen)
1630 Copy(ptr, PL_last_swash_key, klen, U8);
1631 }
1632
9faf8d75 1633 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1634 case 1:
1635 bit = 1 << (off & 7);
1636 off >>= 3;
1637 return (tmps[off] & bit) != 0;
1638 case 8:
1639 return tmps[off];
1640 case 16:
1641 off <<= 1;
1642 return (tmps[off] << 8) + tmps[off + 1] ;
1643 case 32:
1644 off <<= 2;
1645 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1646 }
cea2e8a9 1647 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1648 return 0;
1649}
2b9d42f0
NIS
1650
1651
1652/*
37607a96 1653=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0
NIS
1654
1655Adds the UTF8 representation of the Native codepoint C<uv> to the end
1656of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1657bytes available. The return value is the pointer to the byte after the
1658end of the new character. In other words,
1659
1660 d = uvchr_to_utf8(d, uv);
1661
1662is the recommended wide native character-aware way of saying
1663
1664 *(d++) = uv;
1665
1666=cut
1667*/
1668
1669/* On ASCII machines this is normally a macro but we want a
1670 real function in case XS code wants it
1671*/
1672#undef Perl_uvchr_to_utf8
1673U8 *
1674Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1675{
b851fbc1 1676 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0
NIS
1677}
1678
b851fbc1
JH
1679U8 *
1680Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1681{
1682 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1683}
2b9d42f0
NIS
1684
1685/*
37607a96 1686=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0
NIS
1687
1688Returns the native character value of the first character in the string C<s>
1689which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1690length, in bytes, of that character.
1691
1692Allows length and flags to be passed to low level routine.
1693
1694=cut
1695*/
0a2ef054
JH
1696/* On ASCII machines this is normally a macro but we want
1697 a real function in case XS code wants it
2b9d42f0
NIS
1698*/
1699#undef Perl_utf8n_to_uvchr
1700UV
37607a96 1701Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0
NIS
1702{
1703 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1704 return UNI_TO_NATIVE(uv);
1705}
1706
d2cc3551
JH
1707/*
1708=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1709
1710Build to the scalar dsv a displayable version of the string spv,
1711length len, the displayable version being at most pvlim bytes long
1712(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1713
9e55ce06 1714The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1715isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
1716to display the \\[nrfta\\] as the backslashed versions (like '\n')
1717(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1718UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1719UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1720
d2cc3551
JH
1721The pointer to the PV of the dsv is returned.
1722
1723=cut */
e6b2e755
JH
1724char *
1725Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1726{
1727 int truncated = 0;
1728 char *s, *e;
1729
1730 sv_setpvn(dsv, "", 0);
1731 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1732 UV u;
c728cb41
JH
1733 bool ok = FALSE;
1734
e6b2e755
JH
1735 if (pvlim && SvCUR(dsv) >= pvlim) {
1736 truncated++;
1737 break;
1738 }
1739 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1740 if (u < 256) {
c728cb41
JH
1741 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1742 switch (u & 0xFF) {
1743 case '\n':
1744 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1745 case '\r':
1746 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1747 case '\t':
1748 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1749 case '\f':
1750 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1751 case '\a':
1752 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1753 case '\\':
1754 Perl_sv_catpvf(aTHX_ dsv, "\\" ); ok = TRUE; break;
1755 default: break;
1756 }
1757 }
00e86452
JH
1758 /* isPRINT() is the locale-blind version. */
1759 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
2c4547fe 1760 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
0a2ef054
JH
1761 ok = TRUE;
1762 }
c728cb41
JH
1763 }
1764 if (!ok)
9e55ce06 1765 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
1766 }
1767 if (truncated)
1768 sv_catpvn(dsv, "...", 3);
1769
1770 return SvPVX(dsv);
1771}
2b9d42f0 1772
d2cc3551
JH
1773/*
1774=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1775
1776Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1777the displayable version being at most pvlim bytes long
d2cc3551 1778(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
1779
1780The flags argument is as in pv_uni_display().
1781
d2cc3551
JH
1782The pointer to the PV of the dsv is returned.
1783
1784=cut */
e6b2e755
JH
1785char *
1786Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1787{
701a277b
JH
1788 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1789 pvlim, flags);
1790}
1791
d2cc3551 1792/*
d07ddd77 1793=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
1794
1795Return true if the strings s1 and s2 differ case-insensitively, false
1796if not (if they are equal case-insensitively). If u1 is true, the
1797string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
1798the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1799are false, the respective string is assumed to be in native 8-bit
1800encoding.
1801
1802If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1803in there (they will point at the beginning of the I<next> character).
1804If the pointers behind pe1 or pe2 are non-NULL, they are the end
1805pointers beyond which scanning will not continue under any
1806circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1807s2+l2 will be used as goal end pointers that will also stop the scan,
1808and which qualify towards defining a successful match: all the scans
1809that define an explicit length must reach their goal pointers for
1810a match to succeed).
d2cc3551
JH
1811
1812For case-insensitiveness, the "casefolding" of Unicode is used
1813instead of upper/lowercasing both the characters, see
1814http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1815
1816=cut */
701a277b 1817I32
d07ddd77 1818Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1819{
5469e704
JH
1820 register U8 *p1 = (U8*)s1;
1821 register U8 *p2 = (U8*)s2;
d07ddd77
JH
1822 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1823 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1824 STRLEN n1 = 0, n2 = 0;
ffce6cc2
JH
1825 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1826 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
d7f013c8
JH
1827 U8 natbuf[1+1];
1828 STRLEN foldlen1, foldlen2;
d07ddd77 1829 bool match;
332ddc25 1830
d07ddd77
JH
1831 if (pe1)
1832 e1 = *(U8**)pe1;
1833 if (e1 == 0 || (l1 && l1 < e1 - (U8*)s1))
1834 f1 = (U8*)s1 + l1;
1835 if (pe2)
1836 e2 = *(U8**)pe2;
1837 if (e2 == 0 || (l2 && l2 < e2 - (U8*)s2))
1838 f2 = (U8*)s2 + l2;
1839
1840 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1841 return 1; /* mismatch; possible infinite loop or false positive */
1842
a6872d42
JH
1843 if (!u1 || !u2)
1844 natbuf[1] = 0; /* Need to terminate the buffer. */
1845
d07ddd77
JH
1846 while ((e1 == 0 || p1 < e1) &&
1847 (f1 == 0 || p1 < f1) &&
1848 (e2 == 0 || p2 < e2) &&
1849 (f2 == 0 || p2 < f2)) {
1850 if (n1 == 0) {
d7f013c8
JH
1851 if (u1)
1852 to_utf8_fold(p1, foldbuf1, &foldlen1);
1853 else {
f5cee151 1854 natbuf[0] = *p1;
d7f013c8
JH
1855 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1856 }
1857 q1 = foldbuf1;
d07ddd77 1858 n1 = foldlen1;
332ddc25 1859 }
d07ddd77 1860 if (n2 == 0) {
d7f013c8
JH
1861 if (u2)
1862 to_utf8_fold(p2, foldbuf2, &foldlen2);
1863 else {
f5cee151 1864 natbuf[0] = *p2;
d7f013c8
JH
1865 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1866 }
1867 q2 = foldbuf2;
d07ddd77 1868 n2 = foldlen2;
332ddc25 1869 }
d07ddd77
JH
1870 while (n1 && n2) {
1871 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1872 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1873 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 1874 return 1; /* mismatch */
d07ddd77 1875 n1 -= UTF8SKIP(q1);
d7f013c8 1876 q1 += UTF8SKIP(q1);
d07ddd77 1877 n2 -= UTF8SKIP(q2);
d7f013c8 1878 q2 += UTF8SKIP(q2);
701a277b 1879 }
d07ddd77 1880 if (n1 == 0)
d7f013c8 1881 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 1882 if (n2 == 0)
d7f013c8
JH
1883 p2 += u2 ? UTF8SKIP(p2) : 1;
1884
d2cc3551 1885 }
5469e704 1886
d07ddd77
JH
1887 /* A match is defined by all the scans that specified
1888 * an explicit length reaching their final goals. */
1889 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
1890
1891 if (match) {
d07ddd77
JH
1892 if (pe1)
1893 *pe1 = (char*)p1;
1894 if (pe2)
1895 *pe2 = (char*)p2;
5469e704
JH
1896 }
1897
1898 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 1899}
701a277b 1900