This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Sterner admonishment on -t.
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
bc89e66f 3 * Copyright (c) 1998-2001, Larry Wall
a0ed51b3
LW
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3
LW
25#include "perl.h"
26
27/* Unicode support */
28
eebe1485 29/*
37607a96 30=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
eebe1485
SC
31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
9041c2e3 35end of the new character. In other words,
eebe1485 36
9041c2e3 37 d = uvuni_to_utf8(d, uv);
eebe1485
SC
38
39is the recommended Unicode-aware way of saying
40
41 *(d++) = uv;
42
43=cut
44*/
45
dfe13c55 46U8 *
9041c2e3 47Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
a0ed51b3 48{
507b9800
JH
49 if (ckWARN_d(WARN_UTF8)) {
50 if (UNICODE_IS_SURROGATE(uv))
51 Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
52 else if ((uv >= 0xFDD0 && uv <= 0xFDEF) ||
53 (uv == 0xFFFE || uv == 0xFFFF))
54 Perl_warner(aTHX_ WARN_UTF8,
55 "Unicode character 0x%04"UVxf" is illegal", uv);
56 }
c4d5f83a
NIS
57 if (UNI_IS_INVARIANT(uv)) {
58 *d++ = UTF_TO_NATIVE(uv);
a0ed51b3
LW
59 return d;
60 }
2d331972 61#if defined(EBCDIC)
1d72bdf6
NIS
62 else {
63 STRLEN len = UNISKIP(uv);
64 U8 *p = d+len-1;
65 while (p > d) {
66 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
67 uv >>= UTF_ACCUMULATION_SHIFT;
68 }
69 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
70 return d+len;
71 }
72#else /* Non loop style */
a0ed51b3
LW
73 if (uv < 0x800) {
74 *d++ = (( uv >> 6) | 0xc0);
75 *d++ = (( uv & 0x3f) | 0x80);
76 return d;
77 }
78 if (uv < 0x10000) {
79 *d++ = (( uv >> 12) | 0xe0);
80 *d++ = (((uv >> 6) & 0x3f) | 0x80);
81 *d++ = (( uv & 0x3f) | 0x80);
82 return d;
83 }
84 if (uv < 0x200000) {
85 *d++ = (( uv >> 18) | 0xf0);
86 *d++ = (((uv >> 12) & 0x3f) | 0x80);
87 *d++ = (((uv >> 6) & 0x3f) | 0x80);
88 *d++ = (( uv & 0x3f) | 0x80);
89 return d;
90 }
91 if (uv < 0x4000000) {
92 *d++ = (( uv >> 24) | 0xf8);
93 *d++ = (((uv >> 18) & 0x3f) | 0x80);
94 *d++ = (((uv >> 12) & 0x3f) | 0x80);
95 *d++ = (((uv >> 6) & 0x3f) | 0x80);
96 *d++ = (( uv & 0x3f) | 0x80);
97 return d;
98 }
99 if (uv < 0x80000000) {
100 *d++ = (( uv >> 30) | 0xfc);
101 *d++ = (((uv >> 24) & 0x3f) | 0x80);
102 *d++ = (((uv >> 18) & 0x3f) | 0x80);
103 *d++ = (((uv >> 12) & 0x3f) | 0x80);
104 *d++ = (((uv >> 6) & 0x3f) | 0x80);
105 *d++ = (( uv & 0x3f) | 0x80);
106 return d;
107 }
6b8eaf93 108#ifdef HAS_QUAD
d7578b48 109 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
110#endif
111 {
112 *d++ = 0xfe; /* Can't match U+FEFF! */
113 *d++ = (((uv >> 30) & 0x3f) | 0x80);
114 *d++ = (((uv >> 24) & 0x3f) | 0x80);
115 *d++ = (((uv >> 18) & 0x3f) | 0x80);
116 *d++ = (((uv >> 12) & 0x3f) | 0x80);
117 *d++ = (((uv >> 6) & 0x3f) | 0x80);
118 *d++ = (( uv & 0x3f) | 0x80);
119 return d;
120 }
6b8eaf93 121#ifdef HAS_QUAD
a0ed51b3
LW
122 {
123 *d++ = 0xff; /* Can't match U+FFFE! */
3c77ea2b
GS
124 *d++ = 0x80; /* 6 Reserved bits */
125 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
126 *d++ = (((uv >> 54) & 0x3f) | 0x80);
127 *d++ = (((uv >> 48) & 0x3f) | 0x80);
128 *d++ = (((uv >> 42) & 0x3f) | 0x80);
a0ed51b3
LW
129 *d++ = (((uv >> 36) & 0x3f) | 0x80);
130 *d++ = (((uv >> 30) & 0x3f) | 0x80);
131 *d++ = (((uv >> 24) & 0x3f) | 0x80);
132 *d++ = (((uv >> 18) & 0x3f) | 0x80);
133 *d++ = (((uv >> 12) & 0x3f) | 0x80);
134 *d++ = (((uv >> 6) & 0x3f) | 0x80);
135 *d++ = (( uv & 0x3f) | 0x80);
136 return d;
137 }
138#endif
1d72bdf6 139#endif /* Loop style */
a0ed51b3
LW
140}
141
9041c2e3
NIS
142
143
144/*
eebe1485
SC
145=for apidoc A|STRLEN|is_utf8_char|U8 *s
146
5da9da9e 147Tests if some arbitrary number of bytes begins in a valid UTF-8
1d72bdf6 148character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
5da9da9e
JH
149The actual number of bytes in the UTF-8 character will be returned if
150it is valid, otherwise 0.
9041c2e3 151
37607a96
PK
152=cut
153*/
067a85ef 154STRLEN
386d01d6
GS
155Perl_is_utf8_char(pTHX_ U8 *s)
156{
157 U8 u = *s;
067a85ef
A
158 STRLEN slen, len;
159 UV uv, ouv;
386d01d6 160
1d72bdf6 161 if (UTF8_IS_INVARIANT(u))
386d01d6
GS
162 return 1;
163
60006e79 164 if (!UTF8_IS_START(u))
386d01d6
GS
165 return 0;
166
9f07fdcd 167 len = UTF8SKIP(s);
386d01d6 168
60006e79 169 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef
A
170 return 0;
171
386d01d6
GS
172 slen = len - 1;
173 s++;
6eb6869e 174 u &= UTF_START_MASK(len);
1d72bdf6 175 uv = u;
067a85ef 176 ouv = uv;
386d01d6 177 while (slen--) {
60006e79 178 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 179 return 0;
8850bf83 180 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 181 if (uv < ouv)
067a85ef
A
182 return 0;
183 ouv = uv;
386d01d6
GS
184 s++;
185 }
067a85ef 186
5bbb0b5a 187 if (UNISKIP(uv) < len)
067a85ef
A
188 return 0;
189
386d01d6
GS
190 return len;
191}
192
6662521e 193/*
eebe1485 194=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 195
5da9da9e
JH
196Returns true if first C<len> bytes of the given string form a valid UTF8
197string, false otherwise. Note that 'a valid UTF8 string' does not mean
198'a string that contains UTF8' because a valid ASCII string is a valid
199UTF8 string.
6662521e
GS
200
201=cut
202*/
203
8e84507e 204bool
6662521e
GS
205Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
206{
067a85ef 207 U8* x = s;
1aa99e6b 208 U8* send;
067a85ef
A
209 STRLEN c;
210
1aa99e6b 211 if (!len)
6cd5fe39 212 len = strlen((char *)s);
1aa99e6b
IH
213 send = s + len;
214
6662521e
GS
215 while (x < send) {
216 c = is_utf8_char(x);
067a85ef
A
217 if (!c)
218 return FALSE;
6662521e 219 x += c;
6662521e 220 }
60006e79
JH
221 if (x != send)
222 return FALSE;
067a85ef
A
223
224 return TRUE;
6662521e
GS
225}
226
67e989fb 227/*
9041c2e3 228=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 229
9041c2e3
NIS
230Bottom level UTF-8 decode routine.
231Returns the unicode code point value of the first character in the string C<s>
ba210ebe 232which is assumed to be in UTF8 encoding and no longer than C<curlen>;
7df053ec 233C<retlen> will be set to the length, in bytes, of that character.
67e989fb
JH
234
235If C<s> does not point to a well-formed UTF8 character, the behaviour
dcad2880
JH
236is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
237it is assumed that the caller will raise a warning, and this function
28d3d195
JH
238will silently just set C<retlen> to C<-1> and return zero. If the
239C<flags> does not contain UTF8_CHECK_ONLY, warnings about
240malformations will be given, C<retlen> will be set to the expected
241length of the UTF-8 character in bytes, and zero will be returned.
242
243The C<flags> can also contain various flags to allow deviations from
244the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 245
9041c2e3
NIS
246Most code should use utf8_to_uvchr() rather than call this directly.
247
37607a96
PK
248=cut
249*/
67e989fb 250
a0ed51b3 251UV
37607a96 252Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 253{
097fb8e2 254 U8 *s0 = s;
9c5ffd7c 255 UV uv = *s, ouv = 0;
ba210ebe
JH
256 STRLEN len = 1;
257 bool dowarn = ckWARN_d(WARN_UTF8);
4f849cb6 258 UV startbyte = *s;
ba210ebe 259 STRLEN expectlen = 0;
a0dbb045
JH
260 U32 warning = 0;
261
262/* This list is a superset of the UTF8_ALLOW_XXX. */
263
264#define UTF8_WARN_EMPTY 1
265#define UTF8_WARN_CONTINUATION 2
266#define UTF8_WARN_NON_CONTINUATION 3
267#define UTF8_WARN_FE_FF 4
268#define UTF8_WARN_SHORT 5
269#define UTF8_WARN_OVERFLOW 6
270#define UTF8_WARN_SURROGATE 7
271#define UTF8_WARN_BOM 8
272#define UTF8_WARN_LONG 9
273#define UTF8_WARN_FFFF 10
274
275 if (curlen == 0 &&
276 !(flags & UTF8_ALLOW_EMPTY)) {
277 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
278 goto malformed;
279 }
280
1d72bdf6 281 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
282 if (retlen)
283 *retlen = 1;
c4d5f83a 284 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 285 }
67e989fb 286
421a8bf2 287 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 288 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 289 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
290 goto malformed;
291 }
292
421a8bf2 293 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 294 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 295 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
296 goto malformed;
297 }
9041c2e3 298
1d72bdf6 299#ifdef EBCDIC
75383841 300 uv = NATIVE_TO_UTF(uv);
1d72bdf6 301#else
fcc8fcf6
JH
302 if ((uv == 0xfe || uv == 0xff) &&
303 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 304 warning = UTF8_WARN_FE_FF;
ba210ebe 305 goto malformed;
a0ed51b3 306 }
1d72bdf6
NIS
307#endif
308
ba210ebe
JH
309 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
310 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
311 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
312 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
313#ifdef EBCDIC
314 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
315 else { len = 7; uv &= 0x01; }
316#else
ba210ebe
JH
317 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
318 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
319 else { len = 13; uv = 0; } /* whoa! */
320#endif
321
a0ed51b3
LW
322 if (retlen)
323 *retlen = len;
9041c2e3 324
ba210ebe
JH
325 expectlen = len;
326
fcc8fcf6
JH
327 if ((curlen < expectlen) &&
328 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 329 warning = UTF8_WARN_SHORT;
ba210ebe
JH
330 goto malformed;
331 }
332
333 len--;
a0ed51b3 334 s++;
ba210ebe
JH
335 ouv = uv;
336
a0ed51b3 337 while (len--) {
421a8bf2
JH
338 if (!UTF8_IS_CONTINUATION(*s) &&
339 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
340 s--;
341 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 342 goto malformed;
a0ed51b3
LW
343 }
344 else
8850bf83 345 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
346 if (!(uv > ouv)) {
347 /* These cannot be allowed. */
348 if (uv == ouv) {
349 if (!(flags & UTF8_ALLOW_LONG)) {
350 warning = UTF8_WARN_LONG;
351 goto malformed;
352 }
353 }
354 else { /* uv < ouv */
355 /* This cannot be allowed. */
356 warning = UTF8_WARN_OVERFLOW;
357 goto malformed;
358 }
ba210ebe
JH
359 }
360 s++;
361 ouv = uv;
362 }
363
421a8bf2 364 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 365 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 366 warning = UTF8_WARN_SURROGATE;
ba210ebe 367 goto malformed;
421a8bf2 368 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
fcc8fcf6 369 !(flags & UTF8_ALLOW_BOM)) {
a0dbb045 370 warning = UTF8_WARN_BOM;
ba210ebe 371 goto malformed;
fcc8fcf6
JH
372 } else if ((expectlen > UNISKIP(uv)) &&
373 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 374 warning = UTF8_WARN_LONG;
ba210ebe 375 goto malformed;
421a8bf2 376 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 377 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 378 warning = UTF8_WARN_FFFF;
a9917092 379 goto malformed;
a0ed51b3 380 }
ba210ebe 381
a0ed51b3 382 return uv;
ba210ebe
JH
383
384malformed:
385
fcc8fcf6 386 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 387 if (retlen)
cc366d4b 388 *retlen = -1;
ba210ebe
JH
389 return 0;
390 }
391
a0dbb045
JH
392 if (dowarn) {
393 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
394
395 switch (warning) {
396 case 0: /* Intentionally empty. */ break;
397 case UTF8_WARN_EMPTY:
398 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
399 break;
400 case UTF8_WARN_CONTINUATION:
097fb8e2 401 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
402 break;
403 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
404 if (s == s0)
405 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
406 (UV)s[1], startbyte);
407 else
408 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
409 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
410
a0dbb045
JH
411 break;
412 case UTF8_WARN_FE_FF:
413 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
414 break;
415 case UTF8_WARN_SHORT:
097fb8e2
JH
416 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
417 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
b31f83c2 418 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
419 break;
420 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
421 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
422 ouv, *s, startbyte);
a0dbb045
JH
423 break;
424 case UTF8_WARN_SURROGATE:
425 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
426 break;
427 case UTF8_WARN_BOM:
428 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
429 break;
430 case UTF8_WARN_LONG:
097fb8e2
JH
431 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
432 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
433 break;
434 case UTF8_WARN_FFFF:
435 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
436 break;
437 default:
438 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
439 break;
440 }
441
442 if (warning) {
443 char *s = SvPVX(sv);
444
445 if (PL_op)
446 Perl_warner(aTHX_ WARN_UTF8,
53e06cf0 447 "%s in %s", s, OP_DESC(PL_op));
a0dbb045
JH
448 else
449 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
450 }
451 }
452
ba210ebe 453 if (retlen)
28d3d195 454 *retlen = expectlen ? expectlen : len;
ba210ebe 455
28d3d195 456 return 0;
a0ed51b3
LW
457}
458
8e84507e 459/*
37607a96 460=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3
NIS
461
462Returns the native character value of the first character in the string C<s>
463which is assumed to be in UTF8 encoding; C<retlen> will be set to the
464length, in bytes, of that character.
465
466If C<s> does not point to a well-formed UTF8 character, zero is
467returned and retlen is set, if possible, to -1.
468
469=cut
470*/
471
472UV
37607a96 473Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3
NIS
474{
475 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
476}
477
478/*
37607a96 479=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3
NIS
480
481Returns the Unicode code point of the first character in the string C<s>
482which is assumed to be in UTF8 encoding; C<retlen> will be set to the
483length, in bytes, of that character.
484
485This function should only be used when returned UV is considered
486an index into the Unicode semantic tables (e.g. swashes).
487
ba210ebe
JH
488If C<s> does not point to a well-formed UTF8 character, zero is
489returned and retlen is set, if possible, to -1.
8e84507e
NIS
490
491=cut
492*/
493
494UV
37607a96 495Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 496{
9041c2e3
NIS
497 /* Call the low level routine asking for checks */
498 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
8e84507e
NIS
499}
500
b76347f2 501/*
37607a96 502=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2
JH
503
504Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
505Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
506up past C<e>, croaks.
b76347f2
JH
507
508=cut
509*/
510
511STRLEN
37607a96 512Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2
JH
513{
514 STRLEN len = 0;
515
8850bf83
JH
516 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
517 * the bitops (especially ~) can create illegal UTF-8.
518 * In other words: in Perl UTF-8 is not just for Unicode. */
519
b76347f2 520 if (e < s)
02eb7b47 521 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
b76347f2 522 while (s < e) {
02eb7b47 523 U8 t = UTF8SKIP(s);
b76347f2
JH
524
525 if (e - s < t)
0064a8a9 526 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
b76347f2
JH
527 s += t;
528 len++;
529 }
530
531 return len;
532}
533
b06226ff 534/*
eebe1485 535=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff
JH
536
537Returns the number of UTF8 characters between the UTF-8 pointers C<a>
538and C<b>.
539
540WARNING: use only if you *know* that the pointers point inside the
541same UTF-8 buffer.
542
37607a96
PK
543=cut
544*/
a0ed51b3 545
02eb7b47 546IV
864dbfa3 547Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 548{
02eb7b47
JH
549 IV off = 0;
550
8850bf83
JH
551 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
552 * the bitops (especially ~) can create illegal UTF-8.
553 * In other words: in Perl UTF-8 is not just for Unicode. */
554
a0ed51b3
LW
555 if (a < b) {
556 while (a < b) {
02eb7b47
JH
557 U8 c = UTF8SKIP(a);
558
559 if (b - a < c)
560 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
561 a += c;
a0ed51b3
LW
562 off--;
563 }
564 }
565 else {
566 while (b < a) {
02eb7b47
JH
567 U8 c = UTF8SKIP(b);
568
569 if (a - b < c)
570 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
571 b += c;
a0ed51b3
LW
572 off++;
573 }
574 }
02eb7b47 575
a0ed51b3
LW
576 return off;
577}
578
b06226ff 579/*
37607a96 580=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 581
8850bf83
JH
582Return the UTF-8 pointer C<s> displaced by C<off> characters, either
583forward or backward.
b06226ff
JH
584
585WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
586the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
587on the first byte of character or just after the last byte of a character.
b06226ff 588
37607a96
PK
589=cut
590*/
a0ed51b3
LW
591
592U8 *
864dbfa3 593Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 594{
8850bf83
JH
595 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
596 * the bitops (especially ~) can create illegal UTF-8.
597 * In other words: in Perl UTF-8 is not just for Unicode. */
598
a0ed51b3
LW
599 if (off >= 0) {
600 while (off--)
601 s += UTF8SKIP(s);
602 }
603 else {
604 while (off++) {
605 s--;
8850bf83
JH
606 while (UTF8_IS_CONTINUATION(*s))
607 s--;
a0ed51b3
LW
608 }
609 }
610 return s;
611}
612
6940069f 613/*
eebe1485 614=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 615
246fae53
MG
616Converts a string C<s> of length C<len> from UTF8 into byte encoding.
617Unlike C<bytes_to_utf8>, this over-writes the original string, and
618updates len to contain the new length.
67e989fb 619Returns zero on failure, setting C<len> to -1.
6940069f
GS
620
621=cut
622*/
623
624U8 *
37607a96 625Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 626{
6940069f
GS
627 U8 *send;
628 U8 *d;
dcad2880 629 U8 *save = s;
246fae53
MG
630
631 /* ensure valid UTF8 and chars < 256 before updating string */
dcad2880
JH
632 for (send = s + *len; s < send; ) {
633 U8 c = *s++;
634
1d72bdf6
NIS
635 if (!UTF8_IS_INVARIANT(c) &&
636 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
637 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
638 *len = -1;
639 return 0;
640 }
246fae53 641 }
dcad2880
JH
642
643 d = s = save;
6940069f 644 while (s < send) {
ed646e6e 645 STRLEN ulen;
9041c2e3 646 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 647 s += ulen;
6940069f
GS
648 }
649 *d = '\0';
246fae53 650 *len = d - save;
6940069f
GS
651 return save;
652}
653
654/*
f9a63242
JH
655=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
656
657Converts a string C<s> of length C<len> from UTF8 into byte encoding.
658Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
659the newly-created string, and updates C<len> to contain the new
660length. Returns the original string if no conversion occurs, C<len>
661is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6620 if C<s> is converted or contains all 7bit characters.
f9a63242 663
37607a96
PK
664=cut
665*/
f9a63242
JH
666
667U8 *
37607a96 668Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 669{
f9a63242
JH
670 U8 *d;
671 U8 *start = s;
db42d148 672 U8 *send;
f9a63242
JH
673 I32 count = 0;
674
675 if (!*is_utf8)
676 return start;
677
ef9edfd0 678 /* ensure valid UTF8 and chars < 256 before converting string */
f9a63242
JH
679 for (send = s + *len; s < send;) {
680 U8 c = *s++;
1d72bdf6 681 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
682 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
683 (c = *s++) && UTF8_IS_CONTINUATION(c))
684 count++;
685 else
f9a63242 686 return start;
db42d148 687 }
f9a63242
JH
688 }
689
690 *is_utf8 = 0;
691
f9a63242 692 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 693 s = start; start = d;
f9a63242
JH
694 while (s < send) {
695 U8 c = *s++;
c4d5f83a
NIS
696 if (!UTF8_IS_INVARIANT(c)) {
697 /* Then it is two-byte encoded */
698 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
699 c = ASCII_TO_NATIVE(c);
700 }
701 *d++ = c;
f9a63242
JH
702 }
703 *d = '\0';
704 *len = d - start;
705 return start;
706}
707
708/*
eebe1485 709=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f
GS
710
711Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
6662521e
GS
712Returns a pointer to the newly-created string, and sets C<len> to
713reflect the new length.
6940069f 714
497711e7 715=cut
6940069f
GS
716*/
717
718U8*
37607a96 719Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 720{
6940069f
GS
721 U8 *send;
722 U8 *d;
723 U8 *dst;
6662521e 724 send = s + (*len);
6940069f 725
6662521e 726 Newz(801, d, (*len) * 2 + 1, U8);
6940069f
GS
727 dst = d;
728
729 while (s < send) {
db42d148 730 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a
NIS
731 if (UNI_IS_INVARIANT(uv))
732 *d++ = UTF_TO_NATIVE(uv);
6940069f 733 else {
90f44359
JH
734 *d++ = UTF8_EIGHT_BIT_HI(uv);
735 *d++ = UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
736 }
737 }
738 *d = '\0';
6662521e 739 *len = d-dst;
6940069f
GS
740 return dst;
741}
742
a0ed51b3 743/*
dea0fc0b 744 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
745 *
746 * Destination must be pre-extended to 3/2 source. Do not use in-place.
747 * We optimize for native, for obvious reasons. */
748
749U8*
dea0fc0b 750Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 751{
dea0fc0b
JH
752 U8* pend;
753 U8* dstart = d;
754
755 if (bytelen & 1)
a7867d0a 756 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b
JH
757
758 pend = p + bytelen;
759
a0ed51b3 760 while (p < pend) {
dea0fc0b
JH
761 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
762 p += 2;
a0ed51b3
LW
763 if (uv < 0x80) {
764 *d++ = uv;
765 continue;
766 }
767 if (uv < 0x800) {
768 *d++ = (( uv >> 6) | 0xc0);
769 *d++ = (( uv & 0x3f) | 0x80);
770 continue;
771 }
772 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
dea0fc0b
JH
773 UV low = *p++;
774 if (low < 0xdc00 || low >= 0xdfff)
775 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
776 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
777 }
778 if (uv < 0x10000) {
779 *d++ = (( uv >> 12) | 0xe0);
780 *d++ = (((uv >> 6) & 0x3f) | 0x80);
781 *d++ = (( uv & 0x3f) | 0x80);
782 continue;
783 }
784 else {
785 *d++ = (( uv >> 18) | 0xf0);
786 *d++ = (((uv >> 12) & 0x3f) | 0x80);
787 *d++ = (((uv >> 6) & 0x3f) | 0x80);
788 *d++ = (( uv & 0x3f) | 0x80);
789 continue;
790 }
791 }
dea0fc0b 792 *newlen = d - dstart;
a0ed51b3
LW
793 return d;
794}
795
796/* Note: this one is slightly destructive of the source. */
797
798U8*
dea0fc0b 799Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
800{
801 U8* s = (U8*)p;
802 U8* send = s + bytelen;
803 while (s < send) {
804 U8 tmp = s[0];
805 s[0] = s[1];
806 s[1] = tmp;
807 s += 2;
808 }
dea0fc0b 809 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
810}
811
812/* for now these are all defined (inefficiently) in terms of the utf8 versions */
813
814bool
84afefe6 815Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 816{
ad391ad9 817 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 818 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
819 return is_utf8_alnum(tmpbuf);
820}
821
822bool
84afefe6 823Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 824{
ad391ad9 825 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 826 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
827 return is_utf8_alnumc(tmpbuf);
828}
829
830bool
84afefe6 831Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 832{
ad391ad9 833 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 834 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
835 return is_utf8_idfirst(tmpbuf);
836}
837
838bool
84afefe6 839Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 840{
ad391ad9 841 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 842 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
843 return is_utf8_alpha(tmpbuf);
844}
845
846bool
84afefe6 847Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 848{
ad391ad9 849 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 850 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05
GS
851 return is_utf8_ascii(tmpbuf);
852}
853
854bool
84afefe6 855Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 856{
ad391ad9 857 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 858 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
859 return is_utf8_space(tmpbuf);
860}
861
862bool
84afefe6 863Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 864{
ad391ad9 865 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 866 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
867 return is_utf8_digit(tmpbuf);
868}
869
870bool
84afefe6 871Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 872{
ad391ad9 873 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 874 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
875 return is_utf8_upper(tmpbuf);
876}
877
878bool
84afefe6 879Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 880{
ad391ad9 881 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 882 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
883 return is_utf8_lower(tmpbuf);
884}
885
886bool
84afefe6 887Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 888{
ad391ad9 889 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 890 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
891 return is_utf8_cntrl(tmpbuf);
892}
893
894bool
84afefe6 895Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 896{
ad391ad9 897 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 898 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
899 return is_utf8_graph(tmpbuf);
900}
901
902bool
84afefe6 903Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 904{
ad391ad9 905 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 906 uvchr_to_utf8(tmpbuf, (UV)c);
a0ed51b3
LW
907 return is_utf8_print(tmpbuf);
908}
909
b8c5462f 910bool
84afefe6 911Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 912{
ad391ad9 913 U8 tmpbuf[UTF8_MAXLEN+1];
2b9d42f0 914 uvchr_to_utf8(tmpbuf, (UV)c);
b8c5462f
JH
915 return is_utf8_punct(tmpbuf);
916}
917
4d61ec05 918bool
84afefe6 919Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 920{
e7ae6809 921 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 922 uvchr_to_utf8(tmpbuf, (UV)c);
4d61ec05
GS
923 return is_utf8_xdigit(tmpbuf);
924}
925
84afefe6
JH
926UV
927Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 928{
e7ae6809 929 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 930 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 931 return to_utf8_upper(tmpbuf, p, lenp);
a0ed51b3
LW
932}
933
84afefe6
JH
934UV
935Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 936{
e7ae6809 937 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 938 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 939 return to_utf8_title(tmpbuf, p, lenp);
a0ed51b3
LW
940}
941
84afefe6
JH
942UV
943Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 944{
d90824e8 945 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
2b9d42f0 946 uvchr_to_utf8(tmpbuf, (UV)c);
a2a2844f 947 return to_utf8_lower(tmpbuf, p, lenp);
a0ed51b3
LW
948}
949
84afefe6
JH
950UV
951Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
952{
d90824e8 953 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
84afefe6
JH
954 uvchr_to_utf8(tmpbuf, (UV)c);
955 return to_utf8_fold(tmpbuf, p, lenp);
956}
957
a0ed51b3
LW
958/* for now these all assume no locale info available for Unicode > 255 */
959
960bool
84afefe6 961Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
962{
963 return is_uni_alnum(c); /* XXX no locale support yet */
964}
965
966bool
84afefe6 967Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
968{
969 return is_uni_alnumc(c); /* XXX no locale support yet */
970}
971
972bool
84afefe6 973Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
974{
975 return is_uni_idfirst(c); /* XXX no locale support yet */
976}
977
978bool
84afefe6 979Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
980{
981 return is_uni_alpha(c); /* XXX no locale support yet */
982}
983
984bool
84afefe6 985Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
986{
987 return is_uni_ascii(c); /* XXX no locale support yet */
988}
989
990bool
84afefe6 991Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
992{
993 return is_uni_space(c); /* XXX no locale support yet */
994}
995
996bool
84afefe6 997Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
998{
999 return is_uni_digit(c); /* XXX no locale support yet */
1000}
1001
1002bool
84afefe6 1003Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1004{
1005 return is_uni_upper(c); /* XXX no locale support yet */
1006}
1007
1008bool
84afefe6 1009Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1010{
1011 return is_uni_lower(c); /* XXX no locale support yet */
1012}
1013
1014bool
84afefe6 1015Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1016{
1017 return is_uni_cntrl(c); /* XXX no locale support yet */
1018}
1019
1020bool
84afefe6 1021Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1022{
1023 return is_uni_graph(c); /* XXX no locale support yet */
1024}
1025
1026bool
84afefe6 1027Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1028{
1029 return is_uni_print(c); /* XXX no locale support yet */
1030}
1031
b8c5462f 1032bool
84afefe6 1033Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1034{
1035 return is_uni_punct(c); /* XXX no locale support yet */
1036}
1037
4d61ec05 1038bool
84afefe6 1039Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1040{
1041 return is_uni_xdigit(c); /* XXX no locale support yet */
1042}
1043
b7ac61fa
JH
1044U32
1045Perl_to_uni_upper_lc(pTHX_ U32 c)
1046{
ee099d14
JH
1047 /* XXX returns only the first character -- do not use XXX */
1048 /* XXX no locale support yet */
1049 STRLEN len;
1050 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1051 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1052}
1053
1054U32
1055Perl_to_uni_title_lc(pTHX_ U32 c)
1056{
ee099d14
JH
1057 /* XXX returns only the first character XXX -- do not use XXX */
1058 /* XXX no locale support yet */
1059 STRLEN len;
1060 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1061 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1062}
1063
1064U32
1065Perl_to_uni_lower_lc(pTHX_ U32 c)
1066{
ee099d14
JH
1067 /* XXX returns only the first character -- do not use XXX */
1068 /* XXX no locale support yet */
1069 STRLEN len;
1070 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1071 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1072}
1073
a0ed51b3 1074bool
864dbfa3 1075Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1076{
386d01d6
GS
1077 if (!is_utf8_char(p))
1078 return FALSE;
a0ed51b3 1079 if (!PL_utf8_alnum)
289d4f09
ML
1080 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1081 * descendant of isalnum(3), in other words, it doesn't
1082 * contain the '_'. --jhi */
1083 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
3568d838 1084 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3
LW
1085/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1086#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1087 if (!PL_utf8_alnum)
1088 PL_utf8_alnum = swash_init("utf8", "",
1089 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1090 return swash_fetch(PL_utf8_alnum, p, TRUE);
a0ed51b3
LW
1091#endif
1092}
1093
1094bool
b8c5462f
JH
1095Perl_is_utf8_alnumc(pTHX_ U8 *p)
1096{
386d01d6
GS
1097 if (!is_utf8_char(p))
1098 return FALSE;
b8c5462f
JH
1099 if (!PL_utf8_alnum)
1100 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
3568d838 1101 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f
JH
1102/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1103#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1104 if (!PL_utf8_alnum)
1105 PL_utf8_alnum = swash_init("utf8", "",
1106 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
3568d838 1107 return swash_fetch(PL_utf8_alnum, p, TRUE);
b8c5462f
JH
1108#endif
1109}
1110
1111bool
864dbfa3 1112Perl_is_utf8_idfirst(pTHX_ U8 *p)
a0ed51b3
LW
1113{
1114 return *p == '_' || is_utf8_alpha(p);
1115}
1116
1117bool
864dbfa3 1118Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1119{
386d01d6
GS
1120 if (!is_utf8_char(p))
1121 return FALSE;
a0ed51b3 1122 if (!PL_utf8_alpha)
e24b16f9 1123 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
3568d838 1124 return swash_fetch(PL_utf8_alpha, p, TRUE);
a0ed51b3
LW
1125}
1126
1127bool
b8c5462f
JH
1128Perl_is_utf8_ascii(pTHX_ U8 *p)
1129{
386d01d6
GS
1130 if (!is_utf8_char(p))
1131 return FALSE;
b8c5462f
JH
1132 if (!PL_utf8_ascii)
1133 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
3568d838 1134 return swash_fetch(PL_utf8_ascii, p, TRUE);
b8c5462f
JH
1135}
1136
1137bool
864dbfa3 1138Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1139{
386d01d6
GS
1140 if (!is_utf8_char(p))
1141 return FALSE;
a0ed51b3 1142 if (!PL_utf8_space)
3bec3564 1143 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
3568d838 1144 return swash_fetch(PL_utf8_space, p, TRUE);
a0ed51b3
LW
1145}
1146
1147bool
864dbfa3 1148Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1149{
386d01d6
GS
1150 if (!is_utf8_char(p))
1151 return FALSE;
a0ed51b3 1152 if (!PL_utf8_digit)
e24b16f9 1153 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
3568d838 1154 return swash_fetch(PL_utf8_digit, p, TRUE);
a0ed51b3
LW
1155}
1156
1157bool
864dbfa3 1158Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1159{
386d01d6
GS
1160 if (!is_utf8_char(p))
1161 return FALSE;
a0ed51b3 1162 if (!PL_utf8_upper)
e24b16f9 1163 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
3568d838 1164 return swash_fetch(PL_utf8_upper, p, TRUE);
a0ed51b3
LW
1165}
1166
1167bool
864dbfa3 1168Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1169{
386d01d6
GS
1170 if (!is_utf8_char(p))
1171 return FALSE;
a0ed51b3 1172 if (!PL_utf8_lower)
e24b16f9 1173 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
3568d838 1174 return swash_fetch(PL_utf8_lower, p, TRUE);
a0ed51b3
LW
1175}
1176
1177bool
b8c5462f
JH
1178Perl_is_utf8_cntrl(pTHX_ U8 *p)
1179{
386d01d6
GS
1180 if (!is_utf8_char(p))
1181 return FALSE;
b8c5462f
JH
1182 if (!PL_utf8_cntrl)
1183 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
3568d838 1184 return swash_fetch(PL_utf8_cntrl, p, TRUE);
b8c5462f
JH
1185}
1186
1187bool
1188Perl_is_utf8_graph(pTHX_ U8 *p)
1189{
386d01d6
GS
1190 if (!is_utf8_char(p))
1191 return FALSE;
b8c5462f
JH
1192 if (!PL_utf8_graph)
1193 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
3568d838 1194 return swash_fetch(PL_utf8_graph, p, TRUE);
b8c5462f
JH
1195}
1196
1197bool
864dbfa3 1198Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1199{
386d01d6
GS
1200 if (!is_utf8_char(p))
1201 return FALSE;
a0ed51b3 1202 if (!PL_utf8_print)
e24b16f9 1203 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
3568d838 1204 return swash_fetch(PL_utf8_print, p, TRUE);
a0ed51b3
LW
1205}
1206
1207bool
b8c5462f
JH
1208Perl_is_utf8_punct(pTHX_ U8 *p)
1209{
386d01d6
GS
1210 if (!is_utf8_char(p))
1211 return FALSE;
b8c5462f
JH
1212 if (!PL_utf8_punct)
1213 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
3568d838 1214 return swash_fetch(PL_utf8_punct, p, TRUE);
b8c5462f
JH
1215}
1216
1217bool
1218Perl_is_utf8_xdigit(pTHX_ U8 *p)
1219{
386d01d6
GS
1220 if (!is_utf8_char(p))
1221 return FALSE;
b8c5462f
JH
1222 if (!PL_utf8_xdigit)
1223 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
3568d838 1224 return swash_fetch(PL_utf8_xdigit, p, TRUE);
b8c5462f
JH
1225}
1226
1227bool
864dbfa3 1228Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1229{
386d01d6
GS
1230 if (!is_utf8_char(p))
1231 return FALSE;
a0ed51b3 1232 if (!PL_utf8_mark)
e24b16f9 1233 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
3568d838 1234 return swash_fetch(PL_utf8_mark, p, TRUE);
a0ed51b3
LW
1235}
1236
6b5c0936
JH
1237/*
1238=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1239
1240The "p" contains the pointer to the UTF-8 string encoding
1241the character that is being converted.
1242
1243The "ustrp" is a pointer to the character buffer to put the
1244conversion result to. The "lenp" is a pointer to the length
1245of the result.
1246
1247The "swash" is a pointer to the swash to use.
1248
1249The "normal" is a string like "ToLower" which means the swash
1250$utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1251and loaded by SWASHGET, using lib/utf8_heavy.pl.
1252
1253The "special" is a string like "utf8::ToSpecLower", which means
d1be9408 1254the hash %utf8::ToSpecLower, which is stored in the same file,
6b5c0936
JH
1255lib/unicore/To/Lower.pl, and also loaded by SWASHGET. The access
1256to the hash is by Perl_to_utf8_case().
1257
1258=cut
1259 */
1260
2104c8d9 1261UV
983ffd37 1262Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp,char *normal, char *special)
a0ed51b3
LW
1263{
1264 UV uv;
1265
983ffd37
JH
1266 if (!*swashp)
1267 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1268 uv = swash_fetch(*swashp, p, TRUE);
1269 if (uv)
1270 uv = UNI_TO_NATIVE(uv);
1271 else {
1272 HV *hv;
1273 SV *keysv;
1274 HE *he;
1275
1276 uv = utf8_to_uvchr(p, 0);
1277
1278 if ((hv = get_hv(special, FALSE)) &&
6d47b937 1279 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
983ffd37
JH
1280 (he = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1281 SV *val = HeVAL(he);
1282 char *s = SvPV(val, *lenp);
1283 U8 c = *(U8*)s;
1284 if (*lenp > 1 || UNI_IS_INVARIANT(c))
1285 Copy(s, ustrp, *lenp, U8);
1286 else {
1287 /* something in the 0x80..0xFF range */
1288 ustrp[0] = UTF8_EIGHT_BIT_HI(c);
1289 ustrp[1] = UTF8_EIGHT_BIT_LO(c);
1290 *lenp = 2;
1291 }
1292 return 0;
1293 }
1294 }
a58d912c 1295 if (lenp)
d90824e8 1296 *lenp = UNISKIP(uv);
a2a2844f
JH
1297 uvuni_to_utf8(ustrp, uv);
1298 return uv;
a0ed51b3
LW
1299}
1300
2104c8d9 1301UV
983ffd37 1302Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1303{
983ffd37 1304 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1305 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1306}
a0ed51b3 1307
983ffd37
JH
1308UV
1309Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1310{
1311 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1312 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1313}
1314
2104c8d9 1315UV
a2a2844f 1316Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1317{
983ffd37 1318 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1319 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1320}
1321
1322UV
1323Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1324{
1325 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1326 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1327}
1328
1329/* a "swash" is a swatch hash */
1330
1331SV*
864dbfa3 1332Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3
LW
1333{
1334 SV* retval;
bf1fed83 1335 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1336 dSP;
1b026014 1337 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
f8be5cf0 1338 SV* errsv_save;
ce3b816e 1339
1b026014 1340 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1341 ENTER;
f8be5cf0 1342 errsv_save = newSVsv(ERRSV);
ce3b816e 1343 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
f8be5cf0
JH
1344 if (!SvTRUE(ERRSV))
1345 sv_setsv(ERRSV, errsv_save);
1346 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1347 LEAVE;
1348 }
1349 SPAGAIN;
a0ed51b3
LW
1350 PUSHSTACKi(PERLSI_MAGIC);
1351 PUSHMARK(SP);
1352 EXTEND(SP,5);
1353 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1354 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1355 PUSHs(listsv);
1356 PUSHs(sv_2mortal(newSViv(minbits)));
1357 PUSHs(sv_2mortal(newSViv(none)));
1358 PUTBACK;
1359 ENTER;
1360 SAVEI32(PL_hints);
1361 PL_hints = 0;
1362 save_re_context();
bf1fed83
JH
1363 if (PL_curcop == &PL_compiling)
1364 /* XXX ought to be handled by lex_start */
1365 sv_setpv(tokenbufsv, PL_tokenbuf);
f8be5cf0 1366 errsv_save = newSVsv(ERRSV);
864dbfa3 1367 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1368 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1369 else
e24b16f9 1370 retval = &PL_sv_undef;
f8be5cf0
JH
1371 if (!SvTRUE(ERRSV))
1372 sv_setsv(ERRSV, errsv_save);
1373 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1374 LEAVE;
1375 POPSTACK;
e24b16f9 1376 if (PL_curcop == &PL_compiling) {
bf1fed83
JH
1377 STRLEN len;
1378 char* pv = SvPV(tokenbufsv, len);
1379
1380 Copy(pv, PL_tokenbuf, len+1, char);
e24b16f9 1381 PL_curcop->op_private = PL_hints;
a0ed51b3
LW
1382 }
1383 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
cea2e8a9 1384 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
a0ed51b3
LW
1385 return retval;
1386}
1387
035d37be
JH
1388
1389/* This API is wrong for special case conversions since we may need to
1390 * return several Unicode characters for a single Unicode character
1391 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1392 * the lower-level routine, and it is similarly broken for returning
1393 * multiple values. --jhi */
a0ed51b3 1394UV
3568d838 1395Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3
LW
1396{
1397 HV* hv = (HV*)SvRV(sv);
3568d838
JH
1398 U32 klen;
1399 U32 off;
a0ed51b3 1400 STRLEN slen;
7d85a32c 1401 STRLEN needents;
4ea42e7f 1402 U8 *tmps = NULL;
a0ed51b3
LW
1403 U32 bit;
1404 SV *retval;
3568d838
JH
1405 U8 tmputf8[2];
1406 UV c = NATIVE_TO_ASCII(*ptr);
1407
1408 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1409 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1410 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1411 ptr = tmputf8;
1412 }
1413 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1414 * then the "swatch" is a vec() for al the chars which start
1415 * with 0xAA..0xYY
1416 * So the key in the hash (klen) is length of encoded char -1
1417 */
1418 klen = UTF8SKIP(ptr) - 1;
1419 off = ptr[klen];
a0ed51b3 1420
7d85a32c
JH
1421 if (klen == 0)
1422 {
1423 /* If char in invariant then swatch is for all the invariant chars
1424 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1425 */
1426 needents = UTF_CONTINUATION_MARK;
1427 off = NATIVE_TO_UTF(ptr[klen]);
1428 }
1429 else
1430 {
1431 /* If char is encoded then swatch is for the prefix */
1432 needents = (1 << UTF_ACCUMULATION_SHIFT);
1433 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1434 }
1435
a0ed51b3
LW
1436 /*
1437 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1438 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1439 * it's nothing to sniff at.) Pity we usually come through at least
1440 * two function calls to get here...
1441 *
1442 * NB: this code assumes that swatches are never modified, once generated!
1443 */
1444
3568d838 1445 if (hv == PL_last_swash_hv &&
a0ed51b3 1446 klen == PL_last_swash_klen &&
3568d838 1447 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1448 {
1449 tmps = PL_last_swash_tmps;
1450 slen = PL_last_swash_slen;
1451 }
1452 else {
1453 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1454 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3
LW
1455
1456 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1457 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1458 dSP;
2b9d42f0
NIS
1459 /* We use utf8n_to_uvuni() as we want an index into
1460 Unicode tables, not a native character number.
1461 */
1462 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
f8be5cf0 1463 SV *errsv_save;
a0ed51b3
LW
1464 ENTER;
1465 SAVETMPS;
1466 save_re_context();
1467 PUSHSTACKi(PERLSI_MAGIC);
1468 PUSHMARK(SP);
1469 EXTEND(SP,3);
1470 PUSHs((SV*)sv);
ffbc6a93 1471 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838
JH
1472 PUSHs(sv_2mortal(newSViv((klen) ?
1473 (code_point & ~(needents - 1)) : 0)));
a0ed51b3
LW
1474 PUSHs(sv_2mortal(newSViv(needents)));
1475 PUTBACK;
f8be5cf0 1476 errsv_save = newSVsv(ERRSV);
864dbfa3 1477 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1478 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1479 else
e24b16f9 1480 retval = &PL_sv_undef;
f8be5cf0
JH
1481 if (!SvTRUE(ERRSV))
1482 sv_setsv(ERRSV, errsv_save);
1483 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1484 POPSTACK;
1485 FREETMPS;
1486 LEAVE;
e24b16f9
GS
1487 if (PL_curcop == &PL_compiling)
1488 PL_curcop->op_private = PL_hints;
a0ed51b3 1489
dfe13c55 1490 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1491
7d85a32c 1492 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1493 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1494 }
1495
1496 PL_last_swash_hv = hv;
1497 PL_last_swash_klen = klen;
1498 PL_last_swash_tmps = tmps;
1499 PL_last_swash_slen = slen;
1500 if (klen)
1501 Copy(ptr, PL_last_swash_key, klen, U8);
1502 }
1503
9faf8d75 1504 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1505 case 1:
1506 bit = 1 << (off & 7);
1507 off >>= 3;
1508 return (tmps[off] & bit) != 0;
1509 case 8:
1510 return tmps[off];
1511 case 16:
1512 off <<= 1;
1513 return (tmps[off] << 8) + tmps[off + 1] ;
1514 case 32:
1515 off <<= 2;
1516 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1517 }
cea2e8a9 1518 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1519 return 0;
1520}
2b9d42f0
NIS
1521
1522
1523/*
37607a96 1524=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0
NIS
1525
1526Adds the UTF8 representation of the Native codepoint C<uv> to the end
1527of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1528bytes available. The return value is the pointer to the byte after the
1529end of the new character. In other words,
1530
1531 d = uvchr_to_utf8(d, uv);
1532
1533is the recommended wide native character-aware way of saying
1534
1535 *(d++) = uv;
1536
1537=cut
1538*/
1539
1540/* On ASCII machines this is normally a macro but we want a
1541 real function in case XS code wants it
1542*/
1543#undef Perl_uvchr_to_utf8
1544U8 *
1545Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1546{
1547 return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1548}
1549
1550
1551/*
37607a96 1552=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0
NIS
1553
1554Returns the native character value of the first character in the string C<s>
1555which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1556length, in bytes, of that character.
1557
1558Allows length and flags to be passed to low level routine.
1559
1560=cut
1561*/
1562/* On ASCII machines this is normally a macro but we want a
1563 real function in case XS code wants it
1564*/
1565#undef Perl_utf8n_to_uvchr
1566UV
37607a96 1567Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0
NIS
1568{
1569 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1570 return UNI_TO_NATIVE(uv);
1571}
1572
d2cc3551
JH
1573/*
1574=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1575
1576Build to the scalar dsv a displayable version of the string spv,
1577length len, the displayable version being at most pvlim bytes long
1578(if longer, the rest is truncated and "..." will be appended).
1579The flags argument is currently unused but available for future extensions.
1580The pointer to the PV of the dsv is returned.
1581
1582=cut */
e6b2e755
JH
1583char *
1584Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1585{
1586 int truncated = 0;
1587 char *s, *e;
1588
1589 sv_setpvn(dsv, "", 0);
1590 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1591 UV u;
1592 if (pvlim && SvCUR(dsv) >= pvlim) {
1593 truncated++;
1594 break;
1595 }
1596 u = utf8_to_uvchr((U8*)s, 0);
1597 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1598 }
1599 if (truncated)
1600 sv_catpvn(dsv, "...", 3);
1601
1602 return SvPVX(dsv);
1603}
2b9d42f0 1604
d2cc3551
JH
1605/*
1606=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1607
1608Build to the scalar dsv a displayable version of the scalar sv,
1609he displayable version being at most pvlim bytes long
1610(if longer, the rest is truncated and "..." will be appended).
1611The flags argument is currently unused but available for future extensions.
1612The pointer to the PV of the dsv is returned.
1613
1614=cut */
e6b2e755
JH
1615char *
1616Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1617{
701a277b
JH
1618 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1619 pvlim, flags);
1620}
1621
d2cc3551
JH
1622/*
1623=for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
1624
1625Return true if the strings s1 and s2 differ case-insensitively, false
1626if not (if they are equal case-insensitively). If u1 is true, the
1627string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
332ddc25 1628the string s2 is assumed to be in UTF-8-encoded Unicode.
d2cc3551
JH
1629
1630For case-insensitiveness, the "casefolding" of Unicode is used
1631instead of upper/lowercasing both the characters, see
1632http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1633
1634=cut */
701a277b 1635I32
332ddc25
JH
1636Perl_ibcmp_utf8(pTHX_ const char *s1, bool u1, register I32 len1, const char *s2, bool u2, register I32 len2)
1637{
1638 register U8 *a = (U8*)s1;
1639 register U8 *b = (U8*)s2;
1640 register U8 *ae = b + len1;
1641 register U8 *be = b + len2;
1642 STRLEN la, lb;
1643 UV ca, cb;
1644 STRLEN ulen1, ulen2;
e7ae6809
JH
1645 U8 tmpbuf1[UTF8_MAXLEN_FOLD+1];
1646 U8 tmpbuf2[UTF8_MAXLEN_FOLD+1];
332ddc25
JH
1647
1648 while (a < ae && b < be) {
1649 if (u1) {
1650 if (a + UTF8SKIP(a) > ae)
1651 break;
1652 ca = utf8_to_uvchr((U8*)a, &la);
1653 } else {
1654 ca = *a;
1655 la = 1;
1656 }
1657 if (u2) {
1658 if (b + UTF8SKIP(b) > be)
1659 break;
1660 cb = utf8_to_uvchr((U8*)b, &lb);
1661 } else {
1662 cb = *b;
1663 lb = 1;
1664 }
1665 if (ca != cb) {
701a277b 1666 if (u1)
332ddc25
JH
1667 to_uni_fold(NATIVE_TO_UNI(ca), tmpbuf1, &ulen1);
1668 else
1669 ulen1 = 1;
701a277b 1670 if (u2)
332ddc25
JH
1671 to_uni_fold(NATIVE_TO_UNI(cb), tmpbuf2, &ulen2);
1672 else
1673 ulen2 = 1;
1674 if (ulen1 != ulen2
1675 || (ulen1 == 1 && PL_fold[ca] != PL_fold[cb])
1676 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
1677 return 1; /* mismatch */
701a277b 1678 }
332ddc25
JH
1679 a += la;
1680 b += lb;
d2cc3551 1681 }
332ddc25 1682 return a == ae && b == be ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 1683}
701a277b 1684