This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[PAUSE] CPAN Upload: A/AN/ANDK/CPAN-1.80.tar.gz
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
af3babe4
NC
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
4 * others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3
LW
26#include "perl.h"
27
27da23d5
JH
28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7
JH
31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29
DM
34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1
JH
47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1
JH
53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485
SC
57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6
NIS
90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463
GS
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
104 return d;
105 }
106 if (uv < 0x10000) {
eb160463
GS
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
110 return d;
111 }
112 if (uv < 0x200000) {
eb160463
GS
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463
GS
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
138#endif
139 {
eb160463
GS
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463
GS
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
9041c2e3 169
646ca15d
JH
170/*
171
172Tests if some arbitrary number of bytes begins in a valid UTF-8
173character. Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character. The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
176
177This is the "slow" version as opposed to the "fast" version which is
178the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
179difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
180or less you should use the IS_UTF8_CHAR(), for lengths of five or more
181you should use the _slow(). In practice this means that the _slow()
182will be used very rarely, since the maximum Unicode code point (as of
183Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
184the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
185five bytes or more.
186
187=cut */
c053b435 188STATIC STRLEN
646ca15d
JH
189S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
190{
191 U8 u = *s;
192 STRLEN slen;
193 UV uv, ouv;
194
195 if (UTF8_IS_INVARIANT(u))
196 return 1;
197
198 if (!UTF8_IS_START(u))
199 return 0;
200
201 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
202 return 0;
203
204 slen = len - 1;
205 s++;
77263263
ST
206#ifdef EBCDIC
207 u = NATIVE_TO_UTF(u);
208#endif
646ca15d
JH
209 u &= UTF_START_MASK(len);
210 uv = u;
211 ouv = uv;
212 while (slen--) {
213 if (!UTF8_IS_CONTINUATION(*s))
214 return 0;
215 uv = UTF8_ACCUMULATE(uv, *s);
216 if (uv < ouv)
217 return 0;
218 ouv = uv;
219 s++;
220 }
221
222 if ((STRLEN)UNISKIP(uv) < len)
223 return 0;
224
225 return len;
226}
9041c2e3
NIS
227
228/*
7fc63493 229=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 230
5da9da9e 231Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
232character. Note that an INVARIANT (i.e. ASCII) character is a valid
233UTF-8 character. The actual number of bytes in the UTF-8 character
234will be returned if it is valid, otherwise 0.
9041c2e3 235
82686b01 236=cut */
067a85ef 237STRLEN
7fc63493 238Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 239{
44f8325f 240 const STRLEN len = UTF8SKIP(s);
3b0fc154 241#ifdef IS_UTF8_CHAR
768c67ee 242 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
243 return IS_UTF8_CHAR(s, len) ? len : 0;
244#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 245 return is_utf8_char_slow(s, len);
386d01d6
GS
246}
247
6662521e 248/*
7fc63493 249=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 250
c9ada85f 251Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
252UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
253not mean 'a string that contains code points above 0x7F encoded in UTF-8'
254because a valid ASCII string is a valid UTF-8 string.
6662521e 255
768c67ee
JH
256See also is_utf8_string_loclen() and is_utf8_string_loc().
257
6662521e
GS
258=cut
259*/
260
8e84507e 261bool
7fc63493 262Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 263{
7fc63493
AL
264 const U8* x = s;
265 const U8* send;
067a85ef 266
44f8325f 267 if (!len)
e1ec3a88 268 len = strlen((const char *)s);
1aa99e6b
IH
269 send = s + len;
270
6662521e 271 while (x < send) {
a3b680e6 272 STRLEN c;
1acdb0da
JH
273 /* Inline the easy bits of is_utf8_char() here for speed... */
274 if (UTF8_IS_INVARIANT(*x))
275 c = 1;
276 else if (!UTF8_IS_START(*x))
768c67ee 277 goto out;
1acdb0da
JH
278 else {
279 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
280#ifdef IS_UTF8_CHAR
281 c = UTF8SKIP(x);
768c67ee
JH
282 if (IS_UTF8_CHAR_FAST(c)) {
283 if (!IS_UTF8_CHAR(x, c))
284 goto out;
285 } else if (!is_utf8_char_slow(x, c))
286 goto out;
646ca15d
JH
287#else
288 c = is_utf8_char(x);
289#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 290 if (!c)
768c67ee 291 goto out;
1acdb0da 292 }
6662521e 293 x += c;
6662521e 294 }
768c67ee
JH
295
296 out:
60006e79
JH
297 if (x != send)
298 return FALSE;
067a85ef
A
299
300 return TRUE;
6662521e
GS
301}
302
67e989fb 303/*
814fafa7
NC
304Implemented as a macro in utf8.h
305
306=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
307
308Like is_utf8_string() but stores the location of the failure (in the
309case of "utf8ness failure") or the location s+len (in the case of
310"utf8ness success") in the C<ep>.
311
312See also is_utf8_string_loclen() and is_utf8_string().
313
768c67ee 314=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 315
e3e4599f 316Like is_utf8_string() but stores the location of the failure (in the
768c67ee
JH
317case of "utf8ness failure") or the location s+len (in the case of
318"utf8ness success") in the C<ep>, and the number of UTF-8
319encoded characters in the C<el>.
320
321See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
322
323=cut
324*/
325
326bool
768c67ee 327Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 328{
7fc63493
AL
329 const U8* x = s;
330 const U8* send;
81cd54e3
JH
331 STRLEN c;
332
44f8325f 333 if (!len)
e1ec3a88 334 len = strlen((const char *)s);
81cd54e3 335 send = s + len;
768c67ee
JH
336 if (el)
337 *el = 0;
81cd54e3
JH
338
339 while (x < send) {
340 /* Inline the easy bits of is_utf8_char() here for speed... */
341 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
342 c = 1;
343 else if (!UTF8_IS_START(*x))
344 goto out;
81cd54e3 345 else {
768c67ee
JH
346 /* ... and call is_utf8_char() only if really needed. */
347#ifdef IS_UTF8_CHAR
348 c = UTF8SKIP(x);
349 if (IS_UTF8_CHAR_FAST(c)) {
350 if (!IS_UTF8_CHAR(x, c))
351 c = 0;
352 } else
353 c = is_utf8_char_slow(x, c);
354#else
355 c = is_utf8_char(x);
356#endif /* #ifdef IS_UTF8_CHAR */
357 if (!c)
358 goto out;
81cd54e3 359 }
768c67ee
JH
360 x += c;
361 if (el)
362 (*el)++;
81cd54e3 363 }
768c67ee
JH
364
365 out:
366 if (ep)
367 *ep = x;
368 if (x != send)
81cd54e3 369 return FALSE;
81cd54e3
JH
370
371 return TRUE;
372}
373
374/*
768c67ee 375
7fc63493 376=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 377
9041c2e3
NIS
378Bottom level UTF-8 decode routine.
379Returns the unicode code point value of the first character in the string C<s>
1e54db1a 380which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 381C<retlen> will be set to the length, in bytes, of that character.
67e989fb 382
1e54db1a 383If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
384is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
385it is assumed that the caller will raise a warning, and this function
28d3d195
JH
386will silently just set C<retlen> to C<-1> and return zero. If the
387C<flags> does not contain UTF8_CHECK_ONLY, warnings about
388malformations will be given, C<retlen> will be set to the expected
389length of the UTF-8 character in bytes, and zero will be returned.
390
391The C<flags> can also contain various flags to allow deviations from
392the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 393
9041c2e3
NIS
394Most code should use utf8_to_uvchr() rather than call this directly.
395
37607a96
PK
396=cut
397*/
67e989fb 398
a0ed51b3 399UV
7fc63493 400Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 401{
7fc63493 402 const U8 *s0 = s;
9c5ffd7c 403 UV uv = *s, ouv = 0;
ba210ebe 404 STRLEN len = 1;
7fc63493
AL
405 const bool dowarn = ckWARN_d(WARN_UTF8);
406 const UV startbyte = *s;
ba210ebe 407 STRLEN expectlen = 0;
a0dbb045
JH
408 U32 warning = 0;
409
410/* This list is a superset of the UTF8_ALLOW_XXX. */
411
412#define UTF8_WARN_EMPTY 1
413#define UTF8_WARN_CONTINUATION 2
414#define UTF8_WARN_NON_CONTINUATION 3
415#define UTF8_WARN_FE_FF 4
416#define UTF8_WARN_SHORT 5
417#define UTF8_WARN_OVERFLOW 6
418#define UTF8_WARN_SURROGATE 7
c867b360
JH
419#define UTF8_WARN_LONG 8
420#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
421
422 if (curlen == 0 &&
423 !(flags & UTF8_ALLOW_EMPTY)) {
424 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
425 goto malformed;
426 }
427
1d72bdf6 428 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
429 if (retlen)
430 *retlen = 1;
c4d5f83a 431 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 432 }
67e989fb 433
421a8bf2 434 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 435 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 436 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
437 goto malformed;
438 }
439
421a8bf2 440 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 441 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 442 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
443 goto malformed;
444 }
9041c2e3 445
1d72bdf6 446#ifdef EBCDIC
75383841 447 uv = NATIVE_TO_UTF(uv);
1d72bdf6 448#else
fcc8fcf6
JH
449 if ((uv == 0xfe || uv == 0xff) &&
450 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 451 warning = UTF8_WARN_FE_FF;
ba210ebe 452 goto malformed;
a0ed51b3 453 }
1d72bdf6
NIS
454#endif
455
ba210ebe
JH
456 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
457 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
458 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
459 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
460#ifdef EBCDIC
461 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
462 else { len = 7; uv &= 0x01; }
463#else
ba210ebe
JH
464 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
465 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
466 else { len = 13; uv = 0; } /* whoa! */
467#endif
468
a0ed51b3
LW
469 if (retlen)
470 *retlen = len;
9041c2e3 471
ba210ebe
JH
472 expectlen = len;
473
fcc8fcf6
JH
474 if ((curlen < expectlen) &&
475 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 476 warning = UTF8_WARN_SHORT;
ba210ebe
JH
477 goto malformed;
478 }
479
480 len--;
a0ed51b3 481 s++;
ba210ebe
JH
482 ouv = uv;
483
a0ed51b3 484 while (len--) {
421a8bf2
JH
485 if (!UTF8_IS_CONTINUATION(*s) &&
486 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
487 s--;
488 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 489 goto malformed;
a0ed51b3
LW
490 }
491 else
8850bf83 492 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
493 if (!(uv > ouv)) {
494 /* These cannot be allowed. */
495 if (uv == ouv) {
75dbc644 496 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
497 warning = UTF8_WARN_LONG;
498 goto malformed;
499 }
500 }
501 else { /* uv < ouv */
502 /* This cannot be allowed. */
503 warning = UTF8_WARN_OVERFLOW;
504 goto malformed;
505 }
ba210ebe
JH
506 }
507 s++;
508 ouv = uv;
509 }
510
421a8bf2 511 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 512 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 513 warning = UTF8_WARN_SURROGATE;
ba210ebe 514 goto malformed;
eb160463 515 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 516 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 517 warning = UTF8_WARN_LONG;
ba210ebe 518 goto malformed;
421a8bf2 519 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 520 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 521 warning = UTF8_WARN_FFFF;
a9917092 522 goto malformed;
a0ed51b3 523 }
ba210ebe 524
a0ed51b3 525 return uv;
ba210ebe
JH
526
527malformed:
528
fcc8fcf6 529 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 530 if (retlen)
cc366d4b 531 *retlen = -1;
ba210ebe
JH
532 return 0;
533 }
534
a0dbb045 535 if (dowarn) {
44f8325f 536 SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
a0dbb045
JH
537
538 switch (warning) {
539 case 0: /* Intentionally empty. */ break;
540 case UTF8_WARN_EMPTY:
54667de8 541 Perl_sv_catpv(aTHX_ sv, "(empty string)");
a0dbb045
JH
542 break;
543 case UTF8_WARN_CONTINUATION:
097fb8e2 544 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
545 break;
546 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
547 if (s == s0)
548 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
549 (UV)s[1], startbyte);
551405c4
AL
550 else {
551 const int len = (int)(s-s0);
097fb8e2 552 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4
AL
553 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
554 }
555
a0dbb045
JH
556 break;
557 case UTF8_WARN_FE_FF:
558 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
559 break;
560 case UTF8_WARN_SHORT:
097fb8e2 561 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 562 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 563 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
564 break;
565 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
566 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
567 ouv, *s, startbyte);
a0dbb045
JH
568 break;
569 case UTF8_WARN_SURROGATE:
570 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
571 break;
a0dbb045 572 case UTF8_WARN_LONG:
097fb8e2 573 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 574 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
575 break;
576 case UTF8_WARN_FFFF:
577 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
578 break;
579 default:
54667de8 580 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
a0dbb045
JH
581 break;
582 }
583
584 if (warning) {
44f8325f 585 const char * const s = SvPVX_const(sv);
a0dbb045
JH
586
587 if (PL_op)
9014280d 588 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 589 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 590 else
9014280d 591 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
592 }
593 }
594
ba210ebe 595 if (retlen)
28d3d195 596 *retlen = expectlen ? expectlen : len;
ba210ebe 597
28d3d195 598 return 0;
a0ed51b3
LW
599}
600
8e84507e 601/*
7fc63493 602=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3
NIS
603
604Returns the native character value of the first character in the string C<s>
1e54db1a 605which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
606length, in bytes, of that character.
607
1e54db1a 608If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
609returned and retlen is set, if possible, to -1.
610
611=cut
612*/
613
614UV
7fc63493 615Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 616{
1754c1a1
NC
617 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
618 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
619}
620
621/*
7fc63493 622=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3
NIS
623
624Returns the Unicode code point of the first character in the string C<s>
1e54db1a 625which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
626length, in bytes, of that character.
627
628This function should only be used when returned UV is considered
629an index into the Unicode semantic tables (e.g. swashes).
630
1e54db1a 631If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 632returned and retlen is set, if possible, to -1.
8e84507e
NIS
633
634=cut
635*/
636
637UV
7fc63493 638Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 639{
9041c2e3 640 /* Call the low level routine asking for checks */
89ebb4a3 641 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 642 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
643}
644
b76347f2 645/*
35a4481c 646=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2
JH
647
648Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
649Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
650up past C<e>, croaks.
b76347f2
JH
651
652=cut
653*/
654
655STRLEN
35a4481c 656Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
657{
658 STRLEN len = 0;
659
8850bf83
JH
660 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
661 * the bitops (especially ~) can create illegal UTF-8.
662 * In other words: in Perl UTF-8 is not just for Unicode. */
663
a3b680e6
AL
664 if (e < s)
665 goto warn_and_return;
b76347f2 666 while (s < e) {
4373e329 667 const U8 t = UTF8SKIP(s);
901b21bf 668 if (e - s < t) {
a3b680e6 669 warn_and_return:
901b21bf
JH
670 if (ckWARN_d(WARN_UTF8)) {
671 if (PL_op)
672 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 673 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
674 else
675 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
676 }
677 return len;
678 }
b76347f2
JH
679 s += t;
680 len++;
681 }
682
683 return len;
684}
685
b06226ff 686/*
35a4481c 687=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 688
1e54db1a 689Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
690and C<b>.
691
692WARNING: use only if you *know* that the pointers point inside the
693same UTF-8 buffer.
694
37607a96
PK
695=cut
696*/
a0ed51b3 697
02eb7b47 698IV
35a4481c 699Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 700{
02eb7b47
JH
701 IV off = 0;
702
8850bf83
JH
703 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
704 * the bitops (especially ~) can create illegal UTF-8.
705 * In other words: in Perl UTF-8 is not just for Unicode. */
706
a0ed51b3
LW
707 if (a < b) {
708 while (a < b) {
35a4481c 709 const U8 c = UTF8SKIP(a);
a3b680e6
AL
710 if (b - a < c)
711 goto warn_and_return;
02eb7b47 712 a += c;
a0ed51b3
LW
713 off--;
714 }
715 }
716 else {
717 while (b < a) {
4373e329 718 const U8 c = UTF8SKIP(b);
02eb7b47 719
901b21bf 720 if (a - b < c) {
a3b680e6 721 warn_and_return:
901b21bf
JH
722 if (ckWARN_d(WARN_UTF8)) {
723 if (PL_op)
724 Perl_warner(aTHX_ packWARN(WARN_UTF8),
725 "%s in %s", unees, OP_DESC(PL_op));
726 else
727 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
728 }
729 return off;
730 }
02eb7b47 731 b += c;
a0ed51b3
LW
732 off++;
733 }
734 }
02eb7b47 735
a0ed51b3
LW
736 return off;
737}
738
b06226ff 739/*
37607a96 740=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 741
8850bf83
JH
742Return the UTF-8 pointer C<s> displaced by C<off> characters, either
743forward or backward.
b06226ff
JH
744
745WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
746the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
747on the first byte of character or just after the last byte of a character.
b06226ff 748
37607a96
PK
749=cut
750*/
a0ed51b3
LW
751
752U8 *
4373e329 753Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 754{
8850bf83
JH
755 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
756 * the bitops (especially ~) can create illegal UTF-8.
757 * In other words: in Perl UTF-8 is not just for Unicode. */
758
a0ed51b3
LW
759 if (off >= 0) {
760 while (off--)
761 s += UTF8SKIP(s);
762 }
763 else {
764 while (off++) {
765 s--;
8850bf83
JH
766 while (UTF8_IS_CONTINUATION(*s))
767 s--;
a0ed51b3
LW
768 }
769 }
4373e329 770 return (U8 *)s;
a0ed51b3
LW
771}
772
6940069f 773/*
eebe1485 774=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 775
1e54db1a 776Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
777Unlike C<bytes_to_utf8>, this over-writes the original string, and
778updates len to contain the new length.
67e989fb 779Returns zero on failure, setting C<len> to -1.
6940069f
GS
780
781=cut
782*/
783
784U8 *
37607a96 785Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 786{
6940069f
GS
787 U8 *send;
788 U8 *d;
dcad2880 789 U8 *save = s;
246fae53 790
1e54db1a 791 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880
JH
792 for (send = s + *len; s < send; ) {
793 U8 c = *s++;
794
1d72bdf6
NIS
795 if (!UTF8_IS_INVARIANT(c) &&
796 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
797 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
798 *len = -1;
799 return 0;
800 }
246fae53 801 }
dcad2880
JH
802
803 d = s = save;
6940069f 804 while (s < send) {
ed646e6e 805 STRLEN ulen;
9041c2e3 806 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 807 s += ulen;
6940069f
GS
808 }
809 *d = '\0';
246fae53 810 *len = d - save;
6940069f
GS
811 return save;
812}
813
814/*
e1ec3a88 815=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 816
1e54db1a 817Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 818Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
819the newly-created string, and updates C<len> to contain the new
820length. Returns the original string if no conversion occurs, C<len>
821is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8220 if C<s> is converted or contains all 7bit characters.
f9a63242 823
37607a96
PK
824=cut
825*/
f9a63242
JH
826
827U8 *
e1ec3a88 828Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 829{
f9a63242 830 U8 *d;
e1ec3a88
AL
831 const U8 *start = s;
832 const U8 *send;
f9a63242
JH
833 I32 count = 0;
834
835 if (!*is_utf8)
73d840c0 836 return (U8 *)start;
f9a63242 837
1e54db1a 838 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 839 for (send = s + *len; s < send;) {
e1ec3a88 840 U8 c = *s++;
1d72bdf6 841 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
842 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
843 (c = *s++) && UTF8_IS_CONTINUATION(c))
844 count++;
845 else
73d840c0 846 return (U8 *)start;
db42d148 847 }
f9a63242
JH
848 }
849
850 *is_utf8 = 0;
851
a02a5408 852 Newxz(d, (*len) - count + 1, U8);
ef9edfd0 853 s = start; start = d;
f9a63242
JH
854 while (s < send) {
855 U8 c = *s++;
c4d5f83a
NIS
856 if (!UTF8_IS_INVARIANT(c)) {
857 /* Then it is two-byte encoded */
858 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
859 c = ASCII_TO_NATIVE(c);
860 }
861 *d++ = c;
f9a63242
JH
862 }
863 *d = '\0';
864 *len = d - start;
73d840c0 865 return (U8 *)start;
f9a63242
JH
866}
867
868/*
35a4481c 869=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 870
1e54db1a 871Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
872Returns a pointer to the newly-created string, and sets C<len> to
873reflect the new length.
6940069f 874
1e54db1a 875If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
876see sv_recode_to_utf8().
877
497711e7 878=cut
6940069f
GS
879*/
880
881U8*
35a4481c 882Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 883{
35a4481c 884 const U8 * const send = s + (*len);
6940069f
GS
885 U8 *d;
886 U8 *dst;
6940069f 887
a02a5408 888 Newxz(d, (*len) * 2 + 1, U8);
6940069f
GS
889 dst = d;
890
891 while (s < send) {
35a4481c 892 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 893 if (UNI_IS_INVARIANT(uv))
eb160463 894 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 895 else {
eb160463
GS
896 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
897 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
898 }
899 }
900 *d = '\0';
6662521e 901 *len = d-dst;
6940069f
GS
902 return dst;
903}
904
a0ed51b3 905/*
dea0fc0b 906 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
907 *
908 * Destination must be pre-extended to 3/2 source. Do not use in-place.
909 * We optimize for native, for obvious reasons. */
910
911U8*
dea0fc0b 912Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 913{
dea0fc0b
JH
914 U8* pend;
915 U8* dstart = d;
916
1de9afcd
RGS
917 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
918 d[0] = 0;
919 *newlen = 1;
920 return d;
921 }
922
dea0fc0b 923 if (bytelen & 1)
014ead4b 924 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b
JH
925
926 pend = p + bytelen;
927
a0ed51b3 928 while (p < pend) {
dea0fc0b
JH
929 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
930 p += 2;
a0ed51b3 931 if (uv < 0x80) {
eb160463 932 *d++ = (U8)uv;
a0ed51b3
LW
933 continue;
934 }
935 if (uv < 0x800) {
eb160463
GS
936 *d++ = (U8)(( uv >> 6) | 0xc0);
937 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
938 continue;
939 }
940 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
DT
941 UV low = (p[0] << 8) + p[1];
942 p += 2;
dea0fc0b
JH
943 if (low < 0xdc00 || low >= 0xdfff)
944 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
945 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
946 }
947 if (uv < 0x10000) {
eb160463
GS
948 *d++ = (U8)(( uv >> 12) | 0xe0);
949 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
950 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
951 continue;
952 }
953 else {
eb160463
GS
954 *d++ = (U8)(( uv >> 18) | 0xf0);
955 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
956 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
957 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
958 continue;
959 }
960 }
dea0fc0b 961 *newlen = d - dstart;
a0ed51b3
LW
962 return d;
963}
964
965/* Note: this one is slightly destructive of the source. */
966
967U8*
dea0fc0b 968Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
969{
970 U8* s = (U8*)p;
971 U8* send = s + bytelen;
972 while (s < send) {
973 U8 tmp = s[0];
974 s[0] = s[1];
975 s[1] = tmp;
976 s += 2;
977 }
dea0fc0b 978 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
979}
980
981/* for now these are all defined (inefficiently) in terms of the utf8 versions */
982
983bool
84afefe6 984Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 985{
89ebb4a3 986 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 987 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
988 return is_utf8_alnum(tmpbuf);
989}
990
991bool
84afefe6 992Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 993{
89ebb4a3 994 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 995 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
996 return is_utf8_alnumc(tmpbuf);
997}
998
999bool
84afefe6 1000Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1001{
89ebb4a3 1002 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1003 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1004 return is_utf8_idfirst(tmpbuf);
1005}
1006
1007bool
84afefe6 1008Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1009{
89ebb4a3 1010 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1011 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1012 return is_utf8_alpha(tmpbuf);
1013}
1014
1015bool
84afefe6 1016Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1017{
89ebb4a3 1018 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1019 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1020 return is_utf8_ascii(tmpbuf);
1021}
1022
1023bool
84afefe6 1024Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1025{
89ebb4a3 1026 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1027 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1028 return is_utf8_space(tmpbuf);
1029}
1030
1031bool
84afefe6 1032Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1033{
89ebb4a3 1034 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1035 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1036 return is_utf8_digit(tmpbuf);
1037}
1038
1039bool
84afefe6 1040Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1041{
89ebb4a3 1042 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1043 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1044 return is_utf8_upper(tmpbuf);
1045}
1046
1047bool
84afefe6 1048Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1049{
89ebb4a3 1050 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1051 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1052 return is_utf8_lower(tmpbuf);
1053}
1054
1055bool
84afefe6 1056Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1057{
89ebb4a3 1058 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1059 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1060 return is_utf8_cntrl(tmpbuf);
1061}
1062
1063bool
84afefe6 1064Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1065{
89ebb4a3 1066 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1067 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1068 return is_utf8_graph(tmpbuf);
1069}
1070
1071bool
84afefe6 1072Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1073{
89ebb4a3 1074 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1075 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1076 return is_utf8_print(tmpbuf);
1077}
1078
b8c5462f 1079bool
84afefe6 1080Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1081{
89ebb4a3 1082 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1083 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1084 return is_utf8_punct(tmpbuf);
1085}
1086
4d61ec05 1087bool
84afefe6 1088Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1089{
89ebb4a3 1090 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1091 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1092 return is_utf8_xdigit(tmpbuf);
1093}
1094
84afefe6
JH
1095UV
1096Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1097{
0ebc6274
JH
1098 uvchr_to_utf8(p, c);
1099 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1100}
1101
84afefe6
JH
1102UV
1103Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1104{
0ebc6274
JH
1105 uvchr_to_utf8(p, c);
1106 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1107}
1108
84afefe6
JH
1109UV
1110Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1111{
0ebc6274
JH
1112 uvchr_to_utf8(p, c);
1113 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1114}
1115
84afefe6
JH
1116UV
1117Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1118{
0ebc6274
JH
1119 uvchr_to_utf8(p, c);
1120 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1121}
1122
a0ed51b3
LW
1123/* for now these all assume no locale info available for Unicode > 255 */
1124
1125bool
84afefe6 1126Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1127{
1128 return is_uni_alnum(c); /* XXX no locale support yet */
1129}
1130
1131bool
84afefe6 1132Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1133{
1134 return is_uni_alnumc(c); /* XXX no locale support yet */
1135}
1136
1137bool
84afefe6 1138Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1139{
1140 return is_uni_idfirst(c); /* XXX no locale support yet */
1141}
1142
1143bool
84afefe6 1144Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1145{
1146 return is_uni_alpha(c); /* XXX no locale support yet */
1147}
1148
1149bool
84afefe6 1150Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1151{
1152 return is_uni_ascii(c); /* XXX no locale support yet */
1153}
1154
1155bool
84afefe6 1156Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1157{
1158 return is_uni_space(c); /* XXX no locale support yet */
1159}
1160
1161bool
84afefe6 1162Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1163{
1164 return is_uni_digit(c); /* XXX no locale support yet */
1165}
1166
1167bool
84afefe6 1168Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1169{
1170 return is_uni_upper(c); /* XXX no locale support yet */
1171}
1172
1173bool
84afefe6 1174Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1175{
1176 return is_uni_lower(c); /* XXX no locale support yet */
1177}
1178
1179bool
84afefe6 1180Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1181{
1182 return is_uni_cntrl(c); /* XXX no locale support yet */
1183}
1184
1185bool
84afefe6 1186Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1187{
1188 return is_uni_graph(c); /* XXX no locale support yet */
1189}
1190
1191bool
84afefe6 1192Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1193{
1194 return is_uni_print(c); /* XXX no locale support yet */
1195}
1196
b8c5462f 1197bool
84afefe6 1198Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1199{
1200 return is_uni_punct(c); /* XXX no locale support yet */
1201}
1202
4d61ec05 1203bool
84afefe6 1204Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1205{
1206 return is_uni_xdigit(c); /* XXX no locale support yet */
1207}
1208
b7ac61fa
JH
1209U32
1210Perl_to_uni_upper_lc(pTHX_ U32 c)
1211{
ee099d14
JH
1212 /* XXX returns only the first character -- do not use XXX */
1213 /* XXX no locale support yet */
1214 STRLEN len;
89ebb4a3 1215 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1216 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1217}
1218
1219U32
1220Perl_to_uni_title_lc(pTHX_ U32 c)
1221{
ee099d14
JH
1222 /* XXX returns only the first character XXX -- do not use XXX */
1223 /* XXX no locale support yet */
1224 STRLEN len;
89ebb4a3 1225 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1226 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1227}
1228
1229U32
1230Perl_to_uni_lower_lc(pTHX_ U32 c)
1231{
ee099d14
JH
1232 /* XXX returns only the first character -- do not use XXX */
1233 /* XXX no locale support yet */
1234 STRLEN len;
89ebb4a3 1235 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1236 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1237}
1238
7452cf6a 1239static bool
5141f98e 1240S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
1241 const char *const swashname)
1242{
1243 if (!is_utf8_char(p))
1244 return FALSE;
1245 if (!*swash)
1246 *swash = swash_init("utf8", swashname, &PL_sv_undef, 0, 0);
1247 return swash_fetch(*swash, p, TRUE) != 0;
1248}
1249
1250bool
7fc63493 1251Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1252{
671c33bf
NC
1253 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1254 * descendant of isalnum(3), in other words, it doesn't
1255 * contain the '_'. --jhi */
1256 return S_is_utf8_common(aTHX_ p, &PL_utf8_alnum, "IsWord");
a0ed51b3
LW
1257}
1258
1259bool
7fc63493 1260Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1261{
671c33bf 1262 return S_is_utf8_common(aTHX_ p, &PL_utf8_alnumc, "IsAlnumC");
b8c5462f
JH
1263}
1264
1265bool
7fc63493 1266Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1267{
82686b01
JH
1268 if (*p == '_')
1269 return TRUE;
bde6a22d
NC
1270 /* is_utf8_idstart would be more logical. */
1271 return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1272}
1273
1274bool
7fc63493 1275Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01
JH
1276{
1277 if (*p == '_')
1278 return TRUE;
bde6a22d 1279 return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1280}
1281
1282bool
7fc63493 1283Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1284{
bde6a22d 1285 return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1286}
1287
1288bool
7fc63493 1289Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1290{
bde6a22d 1291 return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii");
b8c5462f
JH
1292}
1293
1294bool
7fc63493 1295Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1296{
bde6a22d 1297 return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3
LW
1298}
1299
1300bool
7fc63493 1301Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1302{
bde6a22d 1303 return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1304}
1305
1306bool
7fc63493 1307Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1308{
bde6a22d 1309 return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1310}
1311
1312bool
7fc63493 1313Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1314{
bde6a22d 1315 return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1316}
1317
1318bool
7fc63493 1319Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1320{
bde6a22d 1321 return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f
JH
1322}
1323
1324bool
7fc63493 1325Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1326{
bde6a22d 1327 return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1328}
1329
1330bool
7fc63493 1331Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1332{
bde6a22d 1333 return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1334}
1335
1336bool
7fc63493 1337Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1338{
bde6a22d 1339 return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1340}
1341
1342bool
7fc63493 1343Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1344{
bde6a22d 1345 return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f
JH
1346}
1347
1348bool
7fc63493 1349Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1350{
bde6a22d 1351 return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1352}
1353
6b5c0936
JH
1354/*
1355=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1356
1357The "p" contains the pointer to the UTF-8 string encoding
1358the character that is being converted.
1359
1360The "ustrp" is a pointer to the character buffer to put the
1361conversion result to. The "lenp" is a pointer to the length
1362of the result.
1363
0134edef 1364The "swashp" is a pointer to the swash to use.
6b5c0936 1365
0134edef 1366Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
8fe4d5b2 1367and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 1368but not always, a multicharacter mapping), is tried first.
6b5c0936 1369
0134edef
JH
1370The "special" is a string like "utf8::ToSpecLower", which means the
1371hash %utf8::ToSpecLower. The access to the hash is through
1372Perl_to_utf8_case().
6b5c0936 1373
0134edef
JH
1374The "normal" is a string like "ToLower" which means the swash
1375%utf8::ToLower.
1376
1377=cut */
6b5c0936 1378
2104c8d9 1379UV
9a957fbc
AL
1380Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1381 SV **swashp, const char *normal, const char *special)
a0ed51b3 1382{
89ebb4a3 1383 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1384 STRLEN len = 0;
a0ed51b3 1385
aec46f14 1386 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1387 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1388 * are necessary in EBCDIC, they are redundant no-ops
1389 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1390 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1391 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1392
1393 if (!*swashp) /* load on-demand */
1394 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1395
b08cf34e
JH
1396 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1397 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1398 /* It might be "special" (sometimes, but not always,
2a37f04d 1399 * a multicharacter mapping) */
983ffd37 1400 HV *hv;
b08cf34e
JH
1401 SV **svp;
1402
1403 if ((hv = get_hv(special, FALSE)) &&
1404 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1405 (*svp)) {
cfd0369c 1406 const char *s;
47654450 1407
cfd0369c 1408 s = SvPV_const(*svp, len);
47654450
JH
1409 if (len == 1)
1410 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1411 else {
2f9475ad
JH
1412#ifdef EBCDIC
1413 /* If we have EBCDIC we need to remap the characters
1414 * since any characters in the low 256 are Unicode
1415 * code points, not EBCDIC. */
7cda7a3d 1416 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1417
1418 d = tmpbuf;
b08cf34e 1419 if (SvUTF8(*svp)) {
2f9475ad
JH
1420 STRLEN tlen = 0;
1421
1422 while (t < tend) {
1423 UV c = utf8_to_uvchr(t, &tlen);
1424 if (tlen > 0) {
1425 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1426 t += tlen;
1427 }
1428 else
1429 break;
1430 }
1431 }
1432 else {
36fec512
JH
1433 while (t < tend) {
1434 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1435 t++;
1436 }
2f9475ad
JH
1437 }
1438 len = d - tmpbuf;
1439 Copy(tmpbuf, ustrp, len, U8);
1440#else
d2dcd0fb 1441 Copy(s, ustrp, len, U8);
2f9475ad 1442#endif
29e98929 1443 }
983ffd37 1444 }
0134edef
JH
1445 }
1446
1447 if (!len && *swashp) {
1448 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1449
1450 if (uv2) {
1451 /* It was "normal" (a single character mapping). */
1452 UV uv3 = UNI_TO_NATIVE(uv2);
1453
e9101d72 1454 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1455 }
1456 }
1feea2c7 1457
0134edef
JH
1458 if (!len) /* Neither: just copy. */
1459 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1460
2a37f04d
JH
1461 if (lenp)
1462 *lenp = len;
1463
0134edef 1464 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1465}
1466
d3e79532 1467/*
7fc63493 1468=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1469
1470Convert the UTF-8 encoded character at p to its uppercase version and
1471store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1472that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1473the uppercase version may be longer than the original character.
d3e79532
JH
1474
1475The first character of the uppercased version is returned
1476(but note, as explained above, that there may be more.)
1477
1478=cut */
1479
2104c8d9 1480UV
7fc63493 1481Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1482{
983ffd37 1483 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1484 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1485}
a0ed51b3 1486
d3e79532 1487/*
7fc63493 1488=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1489
1490Convert the UTF-8 encoded character at p to its titlecase version and
1491store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1492that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1493titlecase version may be longer than the original character.
d3e79532
JH
1494
1495The first character of the titlecased version is returned
1496(but note, as explained above, that there may be more.)
1497
1498=cut */
1499
983ffd37 1500UV
7fc63493 1501Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37
JH
1502{
1503 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1504 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1505}
1506
d3e79532 1507/*
7fc63493 1508=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1509
1510Convert the UTF-8 encoded character at p to its lowercase version and
1511store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1512that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1513lowercase version may be longer than the original character.
d3e79532
JH
1514
1515The first character of the lowercased version is returned
1516(but note, as explained above, that there may be more.)
1517
1518=cut */
1519
2104c8d9 1520UV
7fc63493 1521Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1522{
983ffd37 1523 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1524 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1525}
1526
d3e79532 1527/*
7fc63493 1528=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1529
1530Convert the UTF-8 encoded character at p to its foldcase version and
1531store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1532that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1533foldcase version may be longer than the original character (up to
1534three characters).
1535
1536The first character of the foldcased version is returned
1537(but note, as explained above, that there may be more.)
1538
1539=cut */
1540
b4e400f9 1541UV
7fc63493 1542Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9
JH
1543{
1544 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1545 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1546}
1547
1548/* a "swash" is a swatch hash */
1549
1550SV*
7fc63493 1551Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1552{
27da23d5 1553 dVAR;
a0ed51b3 1554 SV* retval;
9a957fbc 1555 SV* const tokenbufsv = sv_newmortal();
8e84507e 1556 dSP;
7fc63493
AL
1557 const size_t pkg_len = strlen(pkg);
1558 const size_t name_len = strlen(name);
aec46f14 1559 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1560 SV* errsv_save;
ce3b816e 1561
96ca9f55
DM
1562 PUSHSTACKi(PERLSI_MAGIC);
1563 ENTER;
1564 SAVEI32(PL_hints);
1565 PL_hints = 0;
1566 save_re_context();
1b026014 1567 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1568 ENTER;
f8be5cf0 1569 errsv_save = newSVsv(ERRSV);
71bed85a
NC
1570 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1571 Nullsv);
f8be5cf0
JH
1572 if (!SvTRUE(ERRSV))
1573 sv_setsv(ERRSV, errsv_save);
1574 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1575 LEAVE;
1576 }
1577 SPAGAIN;
a0ed51b3
LW
1578 PUSHMARK(SP);
1579 EXTEND(SP,5);
71bed85a
NC
1580 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1581 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1582 PUSHs(listsv);
1583 PUSHs(sv_2mortal(newSViv(minbits)));
1584 PUSHs(sv_2mortal(newSViv(none)));
1585 PUTBACK;
923e4eb5 1586 if (IN_PERL_COMPILETIME) {
bf1fed83 1587 /* XXX ought to be handled by lex_start */
82686b01 1588 SAVEI32(PL_in_my);
2b4bd638 1589 PL_in_my = 0;
bf1fed83 1590 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1591 }
f8be5cf0 1592 errsv_save = newSVsv(ERRSV);
864dbfa3 1593 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1594 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1595 else
e24b16f9 1596 retval = &PL_sv_undef;
f8be5cf0
JH
1597 if (!SvTRUE(ERRSV))
1598 sv_setsv(ERRSV, errsv_save);
1599 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1600 LEAVE;
1601 POPSTACK;
923e4eb5 1602 if (IN_PERL_COMPILETIME) {
bf1fed83 1603 STRLEN len;
aec46f14 1604 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83
JH
1605
1606 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1607 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1608 }
bc45ce41
JH
1609 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1610 if (SvPOK(retval))
35c1215d
NC
1611 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1612 retval);
cea2e8a9 1613 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1614 }
a0ed51b3
LW
1615 return retval;
1616}
1617
035d37be
JH
1618
1619/* This API is wrong for special case conversions since we may need to
1620 * return several Unicode characters for a single Unicode character
1621 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1622 * the lower-level routine, and it is similarly broken for returning
1623 * multiple values. --jhi */
979f2922 1624/* Now SWASHGET is recasted into S_swash_get in this file. */
a0ed51b3 1625UV
7fc63493 1626Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
a0ed51b3 1627{
27da23d5 1628 dVAR;
aec46f14 1629 HV* const hv = (HV*)SvRV(sv);
3568d838
JH
1630 U32 klen;
1631 U32 off;
a0ed51b3 1632 STRLEN slen;
7d85a32c 1633 STRLEN needents;
cfd0369c 1634 const U8 *tmps = NULL;
a0ed51b3 1635 U32 bit;
979f2922 1636 SV *swatch;
3568d838
JH
1637 U8 tmputf8[2];
1638 UV c = NATIVE_TO_ASCII(*ptr);
1639
1640 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
979f2922
ST
1641 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1642 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1643 ptr = tmputf8;
3568d838
JH
1644 }
1645 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1646 * then the "swatch" is a vec() for al the chars which start
1647 * with 0xAA..0xYY
1648 * So the key in the hash (klen) is length of encoded char -1
1649 */
1650 klen = UTF8SKIP(ptr) - 1;
1651 off = ptr[klen];
a0ed51b3 1652
979f2922 1653 if (klen == 0) {
7d85a32c 1654 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1655 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1656 */
979f2922
ST
1657 needents = UTF_CONTINUATION_MARK;
1658 off = NATIVE_TO_UTF(ptr[klen]);
1659 }
1660 else {
7d85a32c 1661 /* If char is encoded then swatch is for the prefix */
979f2922
ST
1662 needents = (1 << UTF_ACCUMULATION_SHIFT);
1663 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1664 }
7d85a32c 1665
a0ed51b3
LW
1666 /*
1667 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1668 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1669 * it's nothing to sniff at.) Pity we usually come through at least
1670 * two function calls to get here...
1671 *
1672 * NB: this code assumes that swatches are never modified, once generated!
1673 */
1674
3568d838 1675 if (hv == PL_last_swash_hv &&
a0ed51b3 1676 klen == PL_last_swash_klen &&
27da23d5 1677 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1678 {
1679 tmps = PL_last_swash_tmps;
1680 slen = PL_last_swash_slen;
1681 }
1682 else {
1683 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1684 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1685
979f2922
ST
1686 /* If not cached, generate it via swash_get */
1687 if (!svp || !SvPOK(*svp)
1688 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0
NIS
1689 /* We use utf8n_to_uvuni() as we want an index into
1690 Unicode tables, not a native character number.
1691 */
aec46f14 1692 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1693 ckWARN(WARN_UTF8) ?
1694 0 : UTF8_ALLOW_ANY);
979f2922
ST
1695 swatch = swash_get(sv,
1696 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1697 (klen) ? (code_point & ~(needents - 1)) : 0,
1698 needents);
1699
923e4eb5 1700 if (IN_PERL_COMPILETIME)
eb160463 1701 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1702
979f2922 1703 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 1704
979f2922
ST
1705 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1706 || (slen << 3) < needents)
1707 Perl_croak(aTHX_ "The swatch does not have proper length");
a0ed51b3
LW
1708 }
1709
1710 PL_last_swash_hv = hv;
1711 PL_last_swash_klen = klen;
cfd0369c
NC
1712 /* FIXME change interpvar.h? */
1713 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1714 PL_last_swash_slen = slen;
1715 if (klen)
1716 Copy(ptr, PL_last_swash_key, klen, U8);
1717 }
1718
9faf8d75 1719 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1720 case 1:
1721 bit = 1 << (off & 7);
1722 off >>= 3;
1723 return (tmps[off] & bit) != 0;
1724 case 8:
1725 return tmps[off];
1726 case 16:
1727 off <<= 1;
1728 return (tmps[off] << 8) + tmps[off + 1] ;
1729 case 32:
1730 off <<= 2;
1731 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1732 }
cea2e8a9 1733 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1734 return 0;
1735}
2b9d42f0 1736
979f2922
ST
1737/* Note:
1738 * Returns a swatch (a bit vector string) for a code point sequence
1739 * that starts from the value C<start> and comprises the number C<span>.
1740 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1741 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1742 */
1743STATIC SV*
1744S_swash_get(pTHX_ SV* swash, UV start, UV span)
1745{
1746 SV *swatch;
1747 U8 *l, *lend, *x, *xend, *s, *nl;
1748 STRLEN lcur, xcur, scur;
1749
1750 HV* const hv = (HV*)SvRV(swash);
1751 SV** listsvp = hv_fetch(hv, "LIST", 4, FALSE);
1752 SV** typesvp = hv_fetch(hv, "TYPE", 4, FALSE);
1753 SV** bitssvp = hv_fetch(hv, "BITS", 4, FALSE);
1754 SV** nonesvp = hv_fetch(hv, "NONE", 4, FALSE);
1755 SV** extssvp = hv_fetch(hv, "EXTRAS", 6, FALSE);
1756 U8* typestr = (U8*)SvPV_nolen(*typesvp);
1757 int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1758 STRLEN bits = SvUV(*bitssvp);
1759 STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1760 UV none = SvUV(*nonesvp);
1761 UV end = start + span;
1762
1763 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1764 Perl_croak(aTHX_ "swash_get: unknown bits %"UVuf, (UV) bits);
1765 }
1766
1767 /* create and initialize $swatch */
1768 swatch = newSVpvn("",0);
1769 scur = octets ? (span * octets) : (span + 7) / 8;
1770 SvGROW(swatch, scur + 1);
1771 s = (U8*)SvPVX(swatch);
1772 if (octets && none) {
1773 const U8* e = s + scur;
1774 while (s < e) {
1775 if (bits == 8)
1776 *s++ = (U8)(none & 0xff);
1777 else if (bits == 16) {
1778 *s++ = (U8)((none >> 8) & 0xff);
1779 *s++ = (U8)( none & 0xff);
1780 }
1781 else if (bits == 32) {
1782 *s++ = (U8)((none >> 24) & 0xff);
1783 *s++ = (U8)((none >> 16) & 0xff);
1784 *s++ = (U8)((none >> 8) & 0xff);
1785 *s++ = (U8)( none & 0xff);
1786 }
1787 }
1788 *s = '\0';
1789 }
1790 else {
1791 (void)memzero((U8*)s, scur + 1);
1792 }
1793 SvCUR_set(swatch, scur);
1794 s = (U8*)SvPVX(swatch);
1795
1796 /* read $swash->{LIST} */
1797 l = (U8*)SvPV(*listsvp, lcur);
1798 lend = l + lcur;
1799 while (l < lend) {
1800 UV min, max, val, key;
1801 STRLEN numlen;
1802 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1803
1804 nl = (U8*)memchr(l, '\n', lend - l);
1805
1806 numlen = lend - l;
1807 min = grok_hex((char *)l, &numlen, &flags, NULL);
1808 if (numlen)
1809 l += numlen;
1810 else if (nl) {
1811 l = nl + 1; /* 1 is length of "\n" */
1812 continue;
1813 }
1814 else {
1815 l = lend; /* to LIST's end at which \n is not found */
1816 break;
1817 }
1818
1819 if (isBLANK(*l)) {
1820 ++l;
1821 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1822 numlen = lend - l;
1823 max = grok_hex((char *)l, &numlen, &flags, NULL);
1824 if (numlen)
1825 l += numlen;
1826 else
1827 max = min;
1828
1829 if (octets) {
1830 if (isBLANK(*l)) {
1831 ++l;
1832 flags = PERL_SCAN_SILENT_ILLDIGIT |
1833 PERL_SCAN_DISALLOW_PREFIX;
1834 numlen = lend - l;
1835 val = grok_hex((char *)l, &numlen, &flags, NULL);
1836 if (numlen)
1837 l += numlen;
1838 else
1839 val = 0;
1840 }
1841 else {
1842 val = 0;
1843 if (typeto) {
1844 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1845 typestr, l);
1846 }
1847 }
1848 }
1849 }
1850 else {
1851 max = min;
1852 if (octets) {
1853 val = 0;
1854 if (typeto) {
1855 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1856 }
1857 }
1858 }
1859
1860 if (nl)
1861 l = nl + 1;
1862 else
1863 l = lend;
1864
1865 if (max < start)
1866 continue;
1867
1868 if (octets) {
1869 if (min < start) {
1870 if (!none || val < none) {
1871 val += start - min;
1872 }
1873 min = start;
1874 }
1875 for (key = min; key <= max; key++) {
1876 STRLEN offset;
1877 if (key >= end)
1878 goto go_out_list;
1879 /* offset must be non-negative (start <= min <= key < end) */
1880 offset = octets * (key - start);
1881 if (bits == 8)
1882 s[offset] = (U8)(val & 0xff);
1883 else if (bits == 16) {
1884 s[offset ] = (U8)((val >> 8) & 0xff);
1885 s[offset + 1] = (U8)( val & 0xff);
1886 }
1887 else if (bits == 32) {
1888 s[offset ] = (U8)((val >> 24) & 0xff);
1889 s[offset + 1] = (U8)((val >> 16) & 0xff);
1890 s[offset + 2] = (U8)((val >> 8) & 0xff);
1891 s[offset + 3] = (U8)( val & 0xff);
1892 }
1893
1894 if (!none || val < none)
1895 ++val;
1896 }
1897 }
1898 else {
1899 if (min < start)
1900 min = start;
1901 for (key = min; key <= max; key++) {
1902 STRLEN offset = (STRLEN)(key - start);
1903 if (key >= end)
1904 goto go_out_list;
1905 s[offset >> 3] |= 1 << (offset & 7);
1906 }
1907 }
1908 } /* while */
1909 go_out_list:
1910
1911 /* read $swash->{EXTRAS} */
1912 x = (U8*)SvPV(*extssvp, xcur);
1913 xend = x + xcur;
1914 while (x < xend) {
1915 STRLEN namelen;
1916 U8 *namestr;
1917 SV** othersvp;
1918 HV* otherhv;
1919 STRLEN otherbits;
1920 SV **otherbitssvp, *other;
1921 U8 *s, *o;
1922 STRLEN slen, olen;
1923
1924 U8 opc = *x++;
1925 if (opc == '\n')
1926 continue;
1927
1928 nl = (U8*)memchr(x, '\n', xend - x);
1929
1930 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1931 if (nl) {
1932 x = nl + 1; /* 1 is length of "\n" */
1933 continue;
1934 }
1935 else {
1936 x = xend; /* to EXTRAS' end at which \n is not found */
1937 break;
1938 }
1939 }
1940
1941 namestr = x;
1942 if (nl) {
1943 namelen = nl - namestr;
1944 x = nl + 1;
1945 }
1946 else {
1947 namelen = xend - namestr;
1948 x = xend;
1949 }
1950
1951 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1952 if (*othersvp && SvROK(*othersvp) &&
1953 SvTYPE(SvRV(*othersvp))==SVt_PVHV)
1954 otherhv = (HV*)SvRV(*othersvp);
1955 else
1956 Perl_croak(aTHX_ "otherhv is not a hash reference");
1957
1958 otherbitssvp = hv_fetch(otherhv, "BITS", 4, FALSE);
1959 otherbits = (STRLEN)SvUV(*otherbitssvp);
1960 if (bits < otherbits)
1961 Perl_croak(aTHX_ "swash_get: swatch size mismatch");
1962
1963 /* The "other" swatch must be destroyed after. */
1964 other = swash_get(*othersvp, start, span);
1965 o = (U8*)SvPV(other, olen);
1966
1967 if (!olen)
1968 Perl_croak(aTHX_ "swash_get didn't return valid swatch for other");
1969
1970 s = (U8*)SvPV(swatch, slen);
1971 if (bits == 1 && otherbits == 1) {
1972 if (slen != olen)
1973 Perl_croak(aTHX_ "swash_get: swatch length mismatch");
1974
1975 switch (opc) {
1976 case '+':
1977 while (slen--)
1978 *s++ |= *o++;
1979 break;
1980 case '!':
1981 while (slen--)
1982 *s++ |= ~*o++;
1983 break;
1984 case '-':
1985 while (slen--)
1986 *s++ &= ~*o++;
1987 break;
1988 case '&':
1989 while (slen--)
1990 *s++ &= *o++;
1991 break;
1992 default:
1993 break;
1994 }
1995 }
1996 else { /* bits >= 8 */
1997 /* XXX: but weirdly otherval is treated as boolean */
1998 STRLEN otheroctets = otherbits >> 3;
1999 STRLEN offset = 0;
2000 U8* send = s + slen;
2001
2002 while (s < send) {
2003 UV otherval = 0;
2004
2005 if (otherbits == 1) {
2006 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2007 ++offset;
2008 }
2009 else {
2010 STRLEN vlen = otheroctets;
2011 otherval = *o++;
2012 while (--vlen) {
2013 otherval <<= 8;
2014 otherval |= *o++;
2015 }
2016 }
2017
2018 if (opc == '+' && otherval)
2019 otherval = 1;
2020 else if (opc == '!' && !otherval)
2021 otherval = 1;
2022 else if (opc == '-' && otherval)
2023 otherval = 0;
2024 else if (opc == '&' && !otherval)
2025 otherval = 0;
2026 else {
2027 s += octets; /* not modify orig swatch */
2028 continue;
2029 }
2030
2031 if (bits == 8)
2032 *s++ = (U8)( otherval & 0xff);
2033 else if (bits == 16) {
2034 *s++ = (U8)((otherval >> 8) & 0xff);
2035 *s++ = (U8)( otherval & 0xff);
2036 }
2037 else if (bits == 32) {
2038 *s++ = (U8)((otherval >> 24) & 0xff);
2039 *s++ = (U8)((otherval >> 16) & 0xff);
2040 *s++ = (U8)((otherval >> 8) & 0xff);
2041 *s++ = (U8)( otherval & 0xff);
2042 }
2043 }
2044 }
2045 sv_free(other); /* through with it! */
2046 } /* while */
2047 return swatch;
2048}
2049
0f830e0b
NC
2050/*
2051=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2052
2053Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2054of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2055bytes available. The return value is the pointer to the byte after the
2056end of the new character. In other words,
2057
2058 d = uvchr_to_utf8(d, uv);
2059
2060is the recommended wide native character-aware way of saying
2061
2062 *(d++) = uv;
2063
2064=cut
2065*/
2066
2067/* On ASCII machines this is normally a macro but we want a
2068 real function in case XS code wants it
2069*/
0f830e0b
NC
2070U8 *
2071Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2072{
2073 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2074}
2075
b851fbc1
JH
2076U8 *
2077Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2078{
2079 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2080}
2b9d42f0
NIS
2081
2082/*
0f830e0b
NC
2083=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2084flags
2085
2086Returns the native character value of the first character in the string
2087C<s>
2088which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2089length, in bytes, of that character.
2090
2091Allows length and flags to be passed to low level routine.
2092
2093=cut
2094*/
2095/* On ASCII machines this is normally a macro but we want
2096 a real function in case XS code wants it
2097*/
0f830e0b
NC
2098UV
2099Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2100U32 flags)
2101{
2102 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2103 return UNI_TO_NATIVE(uv);
2104}
2105
2106/*
d2cc3551
JH
2107=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2108
2109Build to the scalar dsv a displayable version of the string spv,
2110length len, the displayable version being at most pvlim bytes long
2111(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2112
9e55ce06 2113The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 2114isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
2115to display the \\[nrfta\\] as the backslashed versions (like '\n')
2116(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2117UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2118UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2119
d2cc3551
JH
2120The pointer to the PV of the dsv is returned.
2121
2122=cut */
e6b2e755 2123char *
e1ec3a88 2124Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
2125{
2126 int truncated = 0;
e1ec3a88 2127 const char *s, *e;
e6b2e755
JH
2128
2129 sv_setpvn(dsv, "", 0);
e1ec3a88 2130 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 2131 UV u;
a49f32c6
NC
2132 /* This serves double duty as a flag and a character to print after
2133 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2134 */
2135 char ok = 0;
c728cb41 2136
e6b2e755
JH
2137 if (pvlim && SvCUR(dsv) >= pvlim) {
2138 truncated++;
2139 break;
2140 }
2141 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 2142 if (u < 256) {
a3b680e6 2143 const unsigned char c = (unsigned char)u & 0xFF;
c728cb41 2144 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
a49f32c6 2145 switch (c) {
c728cb41 2146 case '\n':
a49f32c6 2147 ok = 'n'; break;
c728cb41 2148 case '\r':
a49f32c6 2149 ok = 'r'; break;
c728cb41 2150 case '\t':
a49f32c6 2151 ok = 't'; break;
c728cb41 2152 case '\f':
a49f32c6 2153 ok = 'f'; break;
c728cb41 2154 case '\a':
a49f32c6 2155 ok = 'a'; break;
c728cb41 2156 case '\\':
a49f32c6 2157 ok = '\\'; break;
c728cb41
JH
2158 default: break;
2159 }
a49f32c6
NC
2160 if (ok) {
2161 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2162 }
c728cb41 2163 }
00e86452 2164 /* isPRINT() is the locale-blind version. */
a49f32c6
NC
2165 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2166 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2167 ok = 1;
0a2ef054 2168 }
c728cb41
JH
2169 }
2170 if (!ok)
9e55ce06 2171 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
2172 }
2173 if (truncated)
2174 sv_catpvn(dsv, "...", 3);
2175
2176 return SvPVX(dsv);
2177}
2b9d42f0 2178
d2cc3551
JH
2179/*
2180=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2181
2182Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 2183the displayable version being at most pvlim bytes long
d2cc3551 2184(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
2185
2186The flags argument is as in pv_uni_display().
2187
d2cc3551
JH
2188The pointer to the PV of the dsv is returned.
2189
2190=cut */
e6b2e755
JH
2191char *
2192Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2193{
cfd0369c
NC
2194 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2195 SvCUR(ssv), pvlim, flags);
701a277b
JH
2196}
2197
d2cc3551 2198/*
d07ddd77 2199=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
2200
2201Return true if the strings s1 and s2 differ case-insensitively, false
2202if not (if they are equal case-insensitively). If u1 is true, the
2203string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
2204the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2205are false, the respective string is assumed to be in native 8-bit
2206encoding.
2207
2208If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2209in there (they will point at the beginning of the I<next> character).
2210If the pointers behind pe1 or pe2 are non-NULL, they are the end
2211pointers beyond which scanning will not continue under any
4cdaeff7 2212circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
2213s2+l2 will be used as goal end pointers that will also stop the scan,
2214and which qualify towards defining a successful match: all the scans
2215that define an explicit length must reach their goal pointers for
2216a match to succeed).
d2cc3551
JH
2217
2218For case-insensitiveness, the "casefolding" of Unicode is used
2219instead of upper/lowercasing both the characters, see
2220http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2221
2222=cut */
701a277b 2223I32
d07ddd77 2224Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2225{
e1ec3a88
AL
2226 register const U8 *p1 = (const U8*)s1;
2227 register const U8 *p2 = (const U8*)s2;
2228 register const U8 *f1 = 0, *f2 = 0;
2229 register U8 *e1 = 0, *q1 = 0;
2230 register U8 *e2 = 0, *q2 = 0;
d07ddd77 2231 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
2232 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2233 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2234 U8 natbuf[1+1];
2235 STRLEN foldlen1, foldlen2;
d07ddd77 2236 bool match;
332ddc25 2237
d07ddd77
JH
2238 if (pe1)
2239 e1 = *(U8**)pe1;
e1ec3a88
AL
2240 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2241 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2242 if (pe2)
2243 e2 = *(U8**)pe2;
e1ec3a88
AL
2244 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2245 f2 = (const U8*)s2 + l2;
d07ddd77
JH
2246
2247 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2248 return 1; /* mismatch; possible infinite loop or false positive */
2249
a6872d42
JH
2250 if (!u1 || !u2)
2251 natbuf[1] = 0; /* Need to terminate the buffer. */
2252
d07ddd77
JH
2253 while ((e1 == 0 || p1 < e1) &&
2254 (f1 == 0 || p1 < f1) &&
2255 (e2 == 0 || p2 < e2) &&
2256 (f2 == 0 || p2 < f2)) {
2257 if (n1 == 0) {
d7f013c8
JH
2258 if (u1)
2259 to_utf8_fold(p1, foldbuf1, &foldlen1);
2260 else {
809e8e66 2261 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2262 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2263 }
2264 q1 = foldbuf1;
d07ddd77 2265 n1 = foldlen1;
332ddc25 2266 }
d07ddd77 2267 if (n2 == 0) {
d7f013c8
JH
2268 if (u2)
2269 to_utf8_fold(p2, foldbuf2, &foldlen2);
2270 else {
809e8e66 2271 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2272 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2273 }
2274 q2 = foldbuf2;
d07ddd77 2275 n2 = foldlen2;
332ddc25 2276 }
d07ddd77
JH
2277 while (n1 && n2) {
2278 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2279 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2280 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2281 return 1; /* mismatch */
d07ddd77 2282 n1 -= UTF8SKIP(q1);
d7f013c8 2283 q1 += UTF8SKIP(q1);
d07ddd77 2284 n2 -= UTF8SKIP(q2);
d7f013c8 2285 q2 += UTF8SKIP(q2);
701a277b 2286 }
d07ddd77 2287 if (n1 == 0)
d7f013c8 2288 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2289 if (n2 == 0)
d7f013c8
JH
2290 p2 += u2 ? UTF8SKIP(p2) : 1;
2291
d2cc3551 2292 }
5469e704 2293
d07ddd77
JH
2294 /* A match is defined by all the scans that specified
2295 * an explicit length reaching their final goals. */
2296 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2297
2298 if (match) {
d07ddd77
JH
2299 if (pe1)
2300 *pe1 = (char*)p1;
2301 if (pe2)
2302 *pe2 = (char*)p2;
5469e704
JH
2303 }
2304
2305 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2306}
701a277b 2307
a49f32c6
NC
2308/*
2309 * Local variables:
2310 * c-indentation-style: bsd
2311 * c-basic-offset: 4
2312 * indent-tabs-mode: t
2313 * End:
2314 *
37442d52
RGS
2315 * ex: set ts=8 sts=4 sw=4 noet:
2316 */