This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Perl_is_utf8_* share a lot of common code. Pull that out into a new
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
af3babe4
NC
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
4 * others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3
LW
26#include "perl.h"
27
27da23d5
JH
28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7
JH
31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29
DM
34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1
JH
47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1
JH
53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485
SC
57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6
NIS
90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463
GS
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
104 return d;
105 }
106 if (uv < 0x10000) {
eb160463
GS
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
110 return d;
111 }
112 if (uv < 0x200000) {
eb160463
GS
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463
GS
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
138#endif
139 {
eb160463
GS
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463
GS
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
9041c2e3 169
646ca15d
JH
170/*
171
172Tests if some arbitrary number of bytes begins in a valid UTF-8
173character. Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character. The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
176
177This is the "slow" version as opposed to the "fast" version which is
178the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
179difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
180or less you should use the IS_UTF8_CHAR(), for lengths of five or more
181you should use the _slow(). In practice this means that the _slow()
182will be used very rarely, since the maximum Unicode code point (as of
183Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
184the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
185five bytes or more.
186
187=cut */
c053b435 188STATIC STRLEN
646ca15d
JH
189S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
190{
191 U8 u = *s;
192 STRLEN slen;
193 UV uv, ouv;
194
195 if (UTF8_IS_INVARIANT(u))
196 return 1;
197
198 if (!UTF8_IS_START(u))
199 return 0;
200
201 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
202 return 0;
203
204 slen = len - 1;
205 s++;
77263263
TS
206#ifdef EBCDIC
207 u = NATIVE_TO_UTF(u);
208#endif
646ca15d
JH
209 u &= UTF_START_MASK(len);
210 uv = u;
211 ouv = uv;
212 while (slen--) {
213 if (!UTF8_IS_CONTINUATION(*s))
214 return 0;
215 uv = UTF8_ACCUMULATE(uv, *s);
216 if (uv < ouv)
217 return 0;
218 ouv = uv;
219 s++;
220 }
221
222 if ((STRLEN)UNISKIP(uv) < len)
223 return 0;
224
225 return len;
226}
9041c2e3
NIS
227
228/*
7fc63493 229=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 230
5da9da9e 231Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
232character. Note that an INVARIANT (i.e. ASCII) character is a valid
233UTF-8 character. The actual number of bytes in the UTF-8 character
234will be returned if it is valid, otherwise 0.
9041c2e3 235
82686b01 236=cut */
067a85ef 237STRLEN
7fc63493 238Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 239{
44f8325f 240 const STRLEN len = UTF8SKIP(s);
3b0fc154 241#ifdef IS_UTF8_CHAR
768c67ee 242 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
243 return IS_UTF8_CHAR(s, len) ? len : 0;
244#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 245 return is_utf8_char_slow(s, len);
386d01d6
GS
246}
247
6662521e 248/*
7fc63493 249=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 250
c9ada85f 251Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
252UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
253not mean 'a string that contains code points above 0x7F encoded in UTF-8'
254because a valid ASCII string is a valid UTF-8 string.
6662521e 255
768c67ee
JH
256See also is_utf8_string_loclen() and is_utf8_string_loc().
257
6662521e
GS
258=cut
259*/
260
8e84507e 261bool
7fc63493 262Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 263{
7fc63493
AL
264 const U8* x = s;
265 const U8* send;
067a85ef 266
44f8325f 267 if (!len)
e1ec3a88 268 len = strlen((const char *)s);
1aa99e6b
IH
269 send = s + len;
270
6662521e 271 while (x < send) {
a3b680e6 272 STRLEN c;
1acdb0da
JH
273 /* Inline the easy bits of is_utf8_char() here for speed... */
274 if (UTF8_IS_INVARIANT(*x))
275 c = 1;
276 else if (!UTF8_IS_START(*x))
768c67ee 277 goto out;
1acdb0da
JH
278 else {
279 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
280#ifdef IS_UTF8_CHAR
281 c = UTF8SKIP(x);
768c67ee
JH
282 if (IS_UTF8_CHAR_FAST(c)) {
283 if (!IS_UTF8_CHAR(x, c))
284 goto out;
285 } else if (!is_utf8_char_slow(x, c))
286 goto out;
646ca15d
JH
287#else
288 c = is_utf8_char(x);
289#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 290 if (!c)
768c67ee 291 goto out;
1acdb0da 292 }
6662521e 293 x += c;
6662521e 294 }
768c67ee
JH
295
296 out:
60006e79
JH
297 if (x != send)
298 return FALSE;
067a85ef
A
299
300 return TRUE;
6662521e
GS
301}
302
67e989fb 303/*
814fafa7
NC
304Implemented as a macro in utf8.h
305
306=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
307
308Like is_utf8_string() but stores the location of the failure (in the
309case of "utf8ness failure") or the location s+len (in the case of
310"utf8ness success") in the C<ep>.
311
312See also is_utf8_string_loclen() and is_utf8_string().
313
768c67ee 314=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 315
e3e4599f 316Like is_utf8_string() but stores the location of the failure (in the
768c67ee
JH
317case of "utf8ness failure") or the location s+len (in the case of
318"utf8ness success") in the C<ep>, and the number of UTF-8
319encoded characters in the C<el>.
320
321See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
322
323=cut
324*/
325
326bool
768c67ee 327Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 328{
7fc63493
AL
329 const U8* x = s;
330 const U8* send;
81cd54e3
JH
331 STRLEN c;
332
44f8325f 333 if (!len)
e1ec3a88 334 len = strlen((const char *)s);
81cd54e3 335 send = s + len;
768c67ee
JH
336 if (el)
337 *el = 0;
81cd54e3
JH
338
339 while (x < send) {
340 /* Inline the easy bits of is_utf8_char() here for speed... */
341 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
342 c = 1;
343 else if (!UTF8_IS_START(*x))
344 goto out;
81cd54e3 345 else {
768c67ee
JH
346 /* ... and call is_utf8_char() only if really needed. */
347#ifdef IS_UTF8_CHAR
348 c = UTF8SKIP(x);
349 if (IS_UTF8_CHAR_FAST(c)) {
350 if (!IS_UTF8_CHAR(x, c))
351 c = 0;
352 } else
353 c = is_utf8_char_slow(x, c);
354#else
355 c = is_utf8_char(x);
356#endif /* #ifdef IS_UTF8_CHAR */
357 if (!c)
358 goto out;
81cd54e3 359 }
768c67ee
JH
360 x += c;
361 if (el)
362 (*el)++;
81cd54e3 363 }
768c67ee
JH
364
365 out:
366 if (ep)
367 *ep = x;
368 if (x != send)
81cd54e3 369 return FALSE;
81cd54e3
JH
370
371 return TRUE;
372}
373
374/*
768c67ee 375
7fc63493 376=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 377
9041c2e3
NIS
378Bottom level UTF-8 decode routine.
379Returns the unicode code point value of the first character in the string C<s>
1e54db1a 380which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 381C<retlen> will be set to the length, in bytes, of that character.
67e989fb 382
1e54db1a 383If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
384is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
385it is assumed that the caller will raise a warning, and this function
28d3d195
JH
386will silently just set C<retlen> to C<-1> and return zero. If the
387C<flags> does not contain UTF8_CHECK_ONLY, warnings about
388malformations will be given, C<retlen> will be set to the expected
389length of the UTF-8 character in bytes, and zero will be returned.
390
391The C<flags> can also contain various flags to allow deviations from
392the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 393
9041c2e3
NIS
394Most code should use utf8_to_uvchr() rather than call this directly.
395
37607a96
PK
396=cut
397*/
67e989fb 398
a0ed51b3 399UV
7fc63493 400Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 401{
7fc63493 402 const U8 *s0 = s;
9c5ffd7c 403 UV uv = *s, ouv = 0;
ba210ebe 404 STRLEN len = 1;
7fc63493
AL
405 const bool dowarn = ckWARN_d(WARN_UTF8);
406 const UV startbyte = *s;
ba210ebe 407 STRLEN expectlen = 0;
a0dbb045
JH
408 U32 warning = 0;
409
410/* This list is a superset of the UTF8_ALLOW_XXX. */
411
412#define UTF8_WARN_EMPTY 1
413#define UTF8_WARN_CONTINUATION 2
414#define UTF8_WARN_NON_CONTINUATION 3
415#define UTF8_WARN_FE_FF 4
416#define UTF8_WARN_SHORT 5
417#define UTF8_WARN_OVERFLOW 6
418#define UTF8_WARN_SURROGATE 7
c867b360
JH
419#define UTF8_WARN_LONG 8
420#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
421
422 if (curlen == 0 &&
423 !(flags & UTF8_ALLOW_EMPTY)) {
424 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
425 goto malformed;
426 }
427
1d72bdf6 428 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
429 if (retlen)
430 *retlen = 1;
c4d5f83a 431 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 432 }
67e989fb 433
421a8bf2 434 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 435 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 436 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
437 goto malformed;
438 }
439
421a8bf2 440 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 441 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 442 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
443 goto malformed;
444 }
9041c2e3 445
1d72bdf6 446#ifdef EBCDIC
75383841 447 uv = NATIVE_TO_UTF(uv);
1d72bdf6 448#else
fcc8fcf6
JH
449 if ((uv == 0xfe || uv == 0xff) &&
450 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 451 warning = UTF8_WARN_FE_FF;
ba210ebe 452 goto malformed;
a0ed51b3 453 }
1d72bdf6
NIS
454#endif
455
ba210ebe
JH
456 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
457 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
458 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
459 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
460#ifdef EBCDIC
461 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
462 else { len = 7; uv &= 0x01; }
463#else
ba210ebe
JH
464 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
465 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
466 else { len = 13; uv = 0; } /* whoa! */
467#endif
468
a0ed51b3
LW
469 if (retlen)
470 *retlen = len;
9041c2e3 471
ba210ebe
JH
472 expectlen = len;
473
fcc8fcf6
JH
474 if ((curlen < expectlen) &&
475 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 476 warning = UTF8_WARN_SHORT;
ba210ebe
JH
477 goto malformed;
478 }
479
480 len--;
a0ed51b3 481 s++;
ba210ebe
JH
482 ouv = uv;
483
a0ed51b3 484 while (len--) {
421a8bf2
JH
485 if (!UTF8_IS_CONTINUATION(*s) &&
486 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
487 s--;
488 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 489 goto malformed;
a0ed51b3
LW
490 }
491 else
8850bf83 492 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
493 if (!(uv > ouv)) {
494 /* These cannot be allowed. */
495 if (uv == ouv) {
75dbc644 496 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
497 warning = UTF8_WARN_LONG;
498 goto malformed;
499 }
500 }
501 else { /* uv < ouv */
502 /* This cannot be allowed. */
503 warning = UTF8_WARN_OVERFLOW;
504 goto malformed;
505 }
ba210ebe
JH
506 }
507 s++;
508 ouv = uv;
509 }
510
421a8bf2 511 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 512 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 513 warning = UTF8_WARN_SURROGATE;
ba210ebe 514 goto malformed;
eb160463 515 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 516 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 517 warning = UTF8_WARN_LONG;
ba210ebe 518 goto malformed;
421a8bf2 519 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 520 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 521 warning = UTF8_WARN_FFFF;
a9917092 522 goto malformed;
a0ed51b3 523 }
ba210ebe 524
a0ed51b3 525 return uv;
ba210ebe
JH
526
527malformed:
528
fcc8fcf6 529 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 530 if (retlen)
cc366d4b 531 *retlen = -1;
ba210ebe
JH
532 return 0;
533 }
534
a0dbb045 535 if (dowarn) {
44f8325f 536 SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
a0dbb045
JH
537
538 switch (warning) {
539 case 0: /* Intentionally empty. */ break;
540 case UTF8_WARN_EMPTY:
54667de8 541 Perl_sv_catpv(aTHX_ sv, "(empty string)");
a0dbb045
JH
542 break;
543 case UTF8_WARN_CONTINUATION:
097fb8e2 544 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
545 break;
546 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
547 if (s == s0)
548 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
549 (UV)s[1], startbyte);
551405c4
AL
550 else {
551 const int len = (int)(s-s0);
097fb8e2 552 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4
AL
553 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
554 }
555
a0dbb045
JH
556 break;
557 case UTF8_WARN_FE_FF:
558 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
559 break;
560 case UTF8_WARN_SHORT:
097fb8e2 561 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 562 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 563 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
564 break;
565 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
566 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
567 ouv, *s, startbyte);
a0dbb045
JH
568 break;
569 case UTF8_WARN_SURROGATE:
570 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
571 break;
a0dbb045 572 case UTF8_WARN_LONG:
097fb8e2 573 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 574 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
575 break;
576 case UTF8_WARN_FFFF:
577 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
578 break;
579 default:
54667de8 580 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
a0dbb045
JH
581 break;
582 }
583
584 if (warning) {
44f8325f 585 const char * const s = SvPVX_const(sv);
a0dbb045
JH
586
587 if (PL_op)
9014280d 588 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 589 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 590 else
9014280d 591 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
592 }
593 }
594
ba210ebe 595 if (retlen)
28d3d195 596 *retlen = expectlen ? expectlen : len;
ba210ebe 597
28d3d195 598 return 0;
a0ed51b3
LW
599}
600
8e84507e 601/*
7fc63493 602=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3
NIS
603
604Returns the native character value of the first character in the string C<s>
1e54db1a 605which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
606length, in bytes, of that character.
607
1e54db1a 608If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
609returned and retlen is set, if possible, to -1.
610
611=cut
612*/
613
614UV
7fc63493 615Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 616{
89ebb4a3 617 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 618 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
619}
620
621/*
7fc63493 622=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3
NIS
623
624Returns the Unicode code point of the first character in the string C<s>
1e54db1a 625which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
626length, in bytes, of that character.
627
628This function should only be used when returned UV is considered
629an index into the Unicode semantic tables (e.g. swashes).
630
1e54db1a 631If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 632returned and retlen is set, if possible, to -1.
8e84507e
NIS
633
634=cut
635*/
636
637UV
7fc63493 638Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 639{
9041c2e3 640 /* Call the low level routine asking for checks */
89ebb4a3 641 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 642 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
643}
644
b76347f2 645/*
35a4481c 646=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2
JH
647
648Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
649Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
650up past C<e>, croaks.
b76347f2
JH
651
652=cut
653*/
654
655STRLEN
35a4481c 656Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
657{
658 STRLEN len = 0;
659
8850bf83
JH
660 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
661 * the bitops (especially ~) can create illegal UTF-8.
662 * In other words: in Perl UTF-8 is not just for Unicode. */
663
a3b680e6
AL
664 if (e < s)
665 goto warn_and_return;
b76347f2 666 while (s < e) {
4373e329 667 const U8 t = UTF8SKIP(s);
901b21bf 668 if (e - s < t) {
a3b680e6 669 warn_and_return:
901b21bf
JH
670 if (ckWARN_d(WARN_UTF8)) {
671 if (PL_op)
672 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 673 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
674 else
675 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
676 }
677 return len;
678 }
b76347f2
JH
679 s += t;
680 len++;
681 }
682
683 return len;
684}
685
b06226ff 686/*
35a4481c 687=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 688
1e54db1a 689Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
690and C<b>.
691
692WARNING: use only if you *know* that the pointers point inside the
693same UTF-8 buffer.
694
37607a96
PK
695=cut
696*/
a0ed51b3 697
02eb7b47 698IV
35a4481c 699Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 700{
02eb7b47
JH
701 IV off = 0;
702
8850bf83
JH
703 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
704 * the bitops (especially ~) can create illegal UTF-8.
705 * In other words: in Perl UTF-8 is not just for Unicode. */
706
a0ed51b3
LW
707 if (a < b) {
708 while (a < b) {
35a4481c 709 const U8 c = UTF8SKIP(a);
a3b680e6
AL
710 if (b - a < c)
711 goto warn_and_return;
02eb7b47 712 a += c;
a0ed51b3
LW
713 off--;
714 }
715 }
716 else {
717 while (b < a) {
4373e329 718 const U8 c = UTF8SKIP(b);
02eb7b47 719
901b21bf 720 if (a - b < c) {
a3b680e6 721 warn_and_return:
901b21bf
JH
722 if (ckWARN_d(WARN_UTF8)) {
723 if (PL_op)
724 Perl_warner(aTHX_ packWARN(WARN_UTF8),
725 "%s in %s", unees, OP_DESC(PL_op));
726 else
727 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
728 }
729 return off;
730 }
02eb7b47 731 b += c;
a0ed51b3
LW
732 off++;
733 }
734 }
02eb7b47 735
a0ed51b3
LW
736 return off;
737}
738
b06226ff 739/*
37607a96 740=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 741
8850bf83
JH
742Return the UTF-8 pointer C<s> displaced by C<off> characters, either
743forward or backward.
b06226ff
JH
744
745WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
746the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
747on the first byte of character or just after the last byte of a character.
b06226ff 748
37607a96
PK
749=cut
750*/
a0ed51b3
LW
751
752U8 *
4373e329 753Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 754{
8850bf83
JH
755 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
756 * the bitops (especially ~) can create illegal UTF-8.
757 * In other words: in Perl UTF-8 is not just for Unicode. */
758
a0ed51b3
LW
759 if (off >= 0) {
760 while (off--)
761 s += UTF8SKIP(s);
762 }
763 else {
764 while (off++) {
765 s--;
8850bf83
JH
766 while (UTF8_IS_CONTINUATION(*s))
767 s--;
a0ed51b3
LW
768 }
769 }
4373e329 770 return (U8 *)s;
a0ed51b3
LW
771}
772
6940069f 773/*
eebe1485 774=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 775
1e54db1a 776Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
777Unlike C<bytes_to_utf8>, this over-writes the original string, and
778updates len to contain the new length.
67e989fb 779Returns zero on failure, setting C<len> to -1.
6940069f
GS
780
781=cut
782*/
783
784U8 *
37607a96 785Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 786{
6940069f
GS
787 U8 *send;
788 U8 *d;
dcad2880 789 U8 *save = s;
246fae53 790
1e54db1a 791 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880
JH
792 for (send = s + *len; s < send; ) {
793 U8 c = *s++;
794
1d72bdf6
NIS
795 if (!UTF8_IS_INVARIANT(c) &&
796 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
797 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
798 *len = -1;
799 return 0;
800 }
246fae53 801 }
dcad2880
JH
802
803 d = s = save;
6940069f 804 while (s < send) {
ed646e6e 805 STRLEN ulen;
9041c2e3 806 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 807 s += ulen;
6940069f
GS
808 }
809 *d = '\0';
246fae53 810 *len = d - save;
6940069f
GS
811 return save;
812}
813
814/*
e1ec3a88 815=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 816
1e54db1a 817Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 818Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
819the newly-created string, and updates C<len> to contain the new
820length. Returns the original string if no conversion occurs, C<len>
821is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8220 if C<s> is converted or contains all 7bit characters.
f9a63242 823
37607a96
PK
824=cut
825*/
f9a63242
JH
826
827U8 *
e1ec3a88 828Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 829{
f9a63242 830 U8 *d;
e1ec3a88
AL
831 const U8 *start = s;
832 const U8 *send;
f9a63242
JH
833 I32 count = 0;
834
835 if (!*is_utf8)
73d840c0 836 return (U8 *)start;
f9a63242 837
1e54db1a 838 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 839 for (send = s + *len; s < send;) {
e1ec3a88 840 U8 c = *s++;
1d72bdf6 841 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
842 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
843 (c = *s++) && UTF8_IS_CONTINUATION(c))
844 count++;
845 else
73d840c0 846 return (U8 *)start;
db42d148 847 }
f9a63242
JH
848 }
849
850 *is_utf8 = 0;
851
a02a5408 852 Newxz(d, (*len) - count + 1, U8);
ef9edfd0 853 s = start; start = d;
f9a63242
JH
854 while (s < send) {
855 U8 c = *s++;
c4d5f83a
NIS
856 if (!UTF8_IS_INVARIANT(c)) {
857 /* Then it is two-byte encoded */
858 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
859 c = ASCII_TO_NATIVE(c);
860 }
861 *d++ = c;
f9a63242
JH
862 }
863 *d = '\0';
864 *len = d - start;
73d840c0 865 return (U8 *)start;
f9a63242
JH
866}
867
868/*
35a4481c 869=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 870
1e54db1a 871Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
872Returns a pointer to the newly-created string, and sets C<len> to
873reflect the new length.
6940069f 874
1e54db1a 875If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
876see sv_recode_to_utf8().
877
497711e7 878=cut
6940069f
GS
879*/
880
881U8*
35a4481c 882Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 883{
35a4481c 884 const U8 * const send = s + (*len);
6940069f
GS
885 U8 *d;
886 U8 *dst;
6940069f 887
a02a5408 888 Newxz(d, (*len) * 2 + 1, U8);
6940069f
GS
889 dst = d;
890
891 while (s < send) {
35a4481c 892 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 893 if (UNI_IS_INVARIANT(uv))
eb160463 894 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 895 else {
eb160463
GS
896 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
897 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
898 }
899 }
900 *d = '\0';
6662521e 901 *len = d-dst;
6940069f
GS
902 return dst;
903}
904
a0ed51b3 905/*
dea0fc0b 906 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
907 *
908 * Destination must be pre-extended to 3/2 source. Do not use in-place.
909 * We optimize for native, for obvious reasons. */
910
911U8*
dea0fc0b 912Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 913{
dea0fc0b
JH
914 U8* pend;
915 U8* dstart = d;
916
1de9afcd
RGS
917 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
918 d[0] = 0;
919 *newlen = 1;
920 return d;
921 }
922
dea0fc0b 923 if (bytelen & 1)
014ead4b 924 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b
JH
925
926 pend = p + bytelen;
927
a0ed51b3 928 while (p < pend) {
dea0fc0b
JH
929 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
930 p += 2;
a0ed51b3 931 if (uv < 0x80) {
eb160463 932 *d++ = (U8)uv;
a0ed51b3
LW
933 continue;
934 }
935 if (uv < 0x800) {
eb160463
GS
936 *d++ = (U8)(( uv >> 6) | 0xc0);
937 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
938 continue;
939 }
940 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
TD
941 UV low = (p[0] << 8) + p[1];
942 p += 2;
dea0fc0b
JH
943 if (low < 0xdc00 || low >= 0xdfff)
944 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
945 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
946 }
947 if (uv < 0x10000) {
eb160463
GS
948 *d++ = (U8)(( uv >> 12) | 0xe0);
949 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
950 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
951 continue;
952 }
953 else {
eb160463
GS
954 *d++ = (U8)(( uv >> 18) | 0xf0);
955 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
956 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
957 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
958 continue;
959 }
960 }
dea0fc0b 961 *newlen = d - dstart;
a0ed51b3
LW
962 return d;
963}
964
965/* Note: this one is slightly destructive of the source. */
966
967U8*
dea0fc0b 968Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
969{
970 U8* s = (U8*)p;
971 U8* send = s + bytelen;
972 while (s < send) {
973 U8 tmp = s[0];
974 s[0] = s[1];
975 s[1] = tmp;
976 s += 2;
977 }
dea0fc0b 978 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
979}
980
981/* for now these are all defined (inefficiently) in terms of the utf8 versions */
982
983bool
84afefe6 984Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 985{
89ebb4a3 986 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 987 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
988 return is_utf8_alnum(tmpbuf);
989}
990
991bool
84afefe6 992Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 993{
89ebb4a3 994 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 995 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
996 return is_utf8_alnumc(tmpbuf);
997}
998
999bool
84afefe6 1000Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1001{
89ebb4a3 1002 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1003 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1004 return is_utf8_idfirst(tmpbuf);
1005}
1006
1007bool
84afefe6 1008Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1009{
89ebb4a3 1010 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1011 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1012 return is_utf8_alpha(tmpbuf);
1013}
1014
1015bool
84afefe6 1016Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1017{
89ebb4a3 1018 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1019 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1020 return is_utf8_ascii(tmpbuf);
1021}
1022
1023bool
84afefe6 1024Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1025{
89ebb4a3 1026 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1027 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1028 return is_utf8_space(tmpbuf);
1029}
1030
1031bool
84afefe6 1032Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1033{
89ebb4a3 1034 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1035 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1036 return is_utf8_digit(tmpbuf);
1037}
1038
1039bool
84afefe6 1040Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1041{
89ebb4a3 1042 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1043 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1044 return is_utf8_upper(tmpbuf);
1045}
1046
1047bool
84afefe6 1048Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1049{
89ebb4a3 1050 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1051 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1052 return is_utf8_lower(tmpbuf);
1053}
1054
1055bool
84afefe6 1056Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1057{
89ebb4a3 1058 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1059 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1060 return is_utf8_cntrl(tmpbuf);
1061}
1062
1063bool
84afefe6 1064Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1065{
89ebb4a3 1066 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1067 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1068 return is_utf8_graph(tmpbuf);
1069}
1070
1071bool
84afefe6 1072Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1073{
89ebb4a3 1074 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1075 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1076 return is_utf8_print(tmpbuf);
1077}
1078
b8c5462f 1079bool
84afefe6 1080Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1081{
89ebb4a3 1082 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1083 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1084 return is_utf8_punct(tmpbuf);
1085}
1086
4d61ec05 1087bool
84afefe6 1088Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1089{
89ebb4a3 1090 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1091 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1092 return is_utf8_xdigit(tmpbuf);
1093}
1094
84afefe6
JH
1095UV
1096Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1097{
0ebc6274
JH
1098 uvchr_to_utf8(p, c);
1099 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1100}
1101
84afefe6
JH
1102UV
1103Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1104{
0ebc6274
JH
1105 uvchr_to_utf8(p, c);
1106 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1107}
1108
84afefe6
JH
1109UV
1110Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1111{
0ebc6274
JH
1112 uvchr_to_utf8(p, c);
1113 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1114}
1115
84afefe6
JH
1116UV
1117Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1118{
0ebc6274
JH
1119 uvchr_to_utf8(p, c);
1120 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1121}
1122
a0ed51b3
LW
1123/* for now these all assume no locale info available for Unicode > 255 */
1124
1125bool
84afefe6 1126Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1127{
1128 return is_uni_alnum(c); /* XXX no locale support yet */
1129}
1130
1131bool
84afefe6 1132Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1133{
1134 return is_uni_alnumc(c); /* XXX no locale support yet */
1135}
1136
1137bool
84afefe6 1138Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1139{
1140 return is_uni_idfirst(c); /* XXX no locale support yet */
1141}
1142
1143bool
84afefe6 1144Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1145{
1146 return is_uni_alpha(c); /* XXX no locale support yet */
1147}
1148
1149bool
84afefe6 1150Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1151{
1152 return is_uni_ascii(c); /* XXX no locale support yet */
1153}
1154
1155bool
84afefe6 1156Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1157{
1158 return is_uni_space(c); /* XXX no locale support yet */
1159}
1160
1161bool
84afefe6 1162Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1163{
1164 return is_uni_digit(c); /* XXX no locale support yet */
1165}
1166
1167bool
84afefe6 1168Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1169{
1170 return is_uni_upper(c); /* XXX no locale support yet */
1171}
1172
1173bool
84afefe6 1174Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1175{
1176 return is_uni_lower(c); /* XXX no locale support yet */
1177}
1178
1179bool
84afefe6 1180Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1181{
1182 return is_uni_cntrl(c); /* XXX no locale support yet */
1183}
1184
1185bool
84afefe6 1186Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1187{
1188 return is_uni_graph(c); /* XXX no locale support yet */
1189}
1190
1191bool
84afefe6 1192Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1193{
1194 return is_uni_print(c); /* XXX no locale support yet */
1195}
1196
b8c5462f 1197bool
84afefe6 1198Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1199{
1200 return is_uni_punct(c); /* XXX no locale support yet */
1201}
1202
4d61ec05 1203bool
84afefe6 1204Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1205{
1206 return is_uni_xdigit(c); /* XXX no locale support yet */
1207}
1208
b7ac61fa
JH
1209U32
1210Perl_to_uni_upper_lc(pTHX_ U32 c)
1211{
ee099d14
JH
1212 /* XXX returns only the first character -- do not use XXX */
1213 /* XXX no locale support yet */
1214 STRLEN len;
89ebb4a3 1215 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1216 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1217}
1218
1219U32
1220Perl_to_uni_title_lc(pTHX_ U32 c)
1221{
ee099d14
JH
1222 /* XXX returns only the first character XXX -- do not use XXX */
1223 /* XXX no locale support yet */
1224 STRLEN len;
89ebb4a3 1225 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1226 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1227}
1228
1229U32
1230Perl_to_uni_lower_lc(pTHX_ U32 c)
1231{
ee099d14
JH
1232 /* XXX returns only the first character -- do not use XXX */
1233 /* XXX no locale support yet */
1234 STRLEN len;
89ebb4a3 1235 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1236 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1237}
1238
a0ed51b3 1239bool
bde6a22d
NC
1240S_is_utf8_common(pTHX_ const U8 const *p, SV **swash,
1241 const char *const swashname)
1242{
1243 if (!is_utf8_char(p))
1244 return FALSE;
1245 if (!*swash)
1246 *swash = swash_init("utf8", swashname, &PL_sv_undef, 0, 0);
1247 return swash_fetch(*swash, p, TRUE) != 0;
1248}
1249
1250bool
7fc63493 1251Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1252{
386d01d6
GS
1253 if (!is_utf8_char(p))
1254 return FALSE;
a0ed51b3 1255 if (!PL_utf8_alnum)
289d4f09
ML
1256 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1257 * descendant of isalnum(3), in other words, it doesn't
1258 * contain the '_'. --jhi */
1259 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
eb160463 1260 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1261/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1262#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1263 if (!PL_utf8_alnum)
1264 PL_utf8_alnum = swash_init("utf8", "",
1265 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1266 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1267#endif
1268}
1269
1270bool
7fc63493 1271Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1272{
386d01d6
GS
1273 if (!is_utf8_char(p))
1274 return FALSE;
b8c5462f
JH
1275 if (!PL_utf8_alnum)
1276 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
eb160463 1277 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1278/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1279#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1280 if (!PL_utf8_alnum)
1281 PL_utf8_alnum = swash_init("utf8", "",
1282 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1283 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1284#endif
1285}
1286
1287bool
7fc63493 1288Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1289{
82686b01
JH
1290 if (*p == '_')
1291 return TRUE;
bde6a22d
NC
1292 /* is_utf8_idstart would be more logical. */
1293 return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1294}
1295
1296bool
7fc63493 1297Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01
JH
1298{
1299 if (*p == '_')
1300 return TRUE;
bde6a22d 1301 return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1302}
1303
1304bool
7fc63493 1305Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1306{
bde6a22d 1307 return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1308}
1309
1310bool
7fc63493 1311Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1312{
bde6a22d 1313 return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii");
b8c5462f
JH
1314}
1315
1316bool
7fc63493 1317Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1318{
bde6a22d 1319 return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3
LW
1320}
1321
1322bool
7fc63493 1323Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1324{
bde6a22d 1325 return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1326}
1327
1328bool
7fc63493 1329Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1330{
bde6a22d 1331 return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1332}
1333
1334bool
7fc63493 1335Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1336{
bde6a22d 1337 return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1338}
1339
1340bool
7fc63493 1341Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1342{
bde6a22d 1343 return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f
JH
1344}
1345
1346bool
7fc63493 1347Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1348{
bde6a22d 1349 return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1350}
1351
1352bool
7fc63493 1353Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1354{
bde6a22d 1355 return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1356}
1357
1358bool
7fc63493 1359Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1360{
bde6a22d 1361 return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1362}
1363
1364bool
7fc63493 1365Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1366{
bde6a22d 1367 return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f
JH
1368}
1369
1370bool
7fc63493 1371Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1372{
bde6a22d 1373 return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1374}
1375
6b5c0936
JH
1376/*
1377=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1378
1379The "p" contains the pointer to the UTF-8 string encoding
1380the character that is being converted.
1381
1382The "ustrp" is a pointer to the character buffer to put the
1383conversion result to. The "lenp" is a pointer to the length
1384of the result.
1385
0134edef 1386The "swashp" is a pointer to the swash to use.
6b5c0936 1387
0134edef
JH
1388Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1389and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1390but not always, a multicharacter mapping), is tried first.
6b5c0936 1391
0134edef
JH
1392The "special" is a string like "utf8::ToSpecLower", which means the
1393hash %utf8::ToSpecLower. The access to the hash is through
1394Perl_to_utf8_case().
6b5c0936 1395
0134edef
JH
1396The "normal" is a string like "ToLower" which means the swash
1397%utf8::ToLower.
1398
1399=cut */
6b5c0936 1400
2104c8d9 1401UV
9a957fbc
AL
1402Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1403 SV **swashp, const char *normal, const char *special)
a0ed51b3 1404{
89ebb4a3 1405 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1406 STRLEN len = 0;
a0ed51b3 1407
aec46f14 1408 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1409 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1410 * are necessary in EBCDIC, they are redundant no-ops
1411 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1412 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1413 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1414
1415 if (!*swashp) /* load on-demand */
1416 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1417
b08cf34e
JH
1418 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1419 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1420 /* It might be "special" (sometimes, but not always,
2a37f04d 1421 * a multicharacter mapping) */
983ffd37 1422 HV *hv;
b08cf34e
JH
1423 SV **svp;
1424
1425 if ((hv = get_hv(special, FALSE)) &&
1426 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1427 (*svp)) {
cfd0369c 1428 const char *s;
47654450 1429
cfd0369c 1430 s = SvPV_const(*svp, len);
47654450
JH
1431 if (len == 1)
1432 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1433 else {
2f9475ad
JH
1434#ifdef EBCDIC
1435 /* If we have EBCDIC we need to remap the characters
1436 * since any characters in the low 256 are Unicode
1437 * code points, not EBCDIC. */
7cda7a3d 1438 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1439
1440 d = tmpbuf;
b08cf34e 1441 if (SvUTF8(*svp)) {
2f9475ad
JH
1442 STRLEN tlen = 0;
1443
1444 while (t < tend) {
1445 UV c = utf8_to_uvchr(t, &tlen);
1446 if (tlen > 0) {
1447 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1448 t += tlen;
1449 }
1450 else
1451 break;
1452 }
1453 }
1454 else {
36fec512
JH
1455 while (t < tend) {
1456 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1457 t++;
1458 }
2f9475ad
JH
1459 }
1460 len = d - tmpbuf;
1461 Copy(tmpbuf, ustrp, len, U8);
1462#else
d2dcd0fb 1463 Copy(s, ustrp, len, U8);
2f9475ad 1464#endif
29e98929 1465 }
983ffd37 1466 }
0134edef
JH
1467 }
1468
1469 if (!len && *swashp) {
1470 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1471
1472 if (uv2) {
1473 /* It was "normal" (a single character mapping). */
1474 UV uv3 = UNI_TO_NATIVE(uv2);
1475
e9101d72 1476 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1477 }
1478 }
1feea2c7 1479
0134edef
JH
1480 if (!len) /* Neither: just copy. */
1481 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1482
2a37f04d
JH
1483 if (lenp)
1484 *lenp = len;
1485
0134edef 1486 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1487}
1488
d3e79532 1489/*
7fc63493 1490=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1491
1492Convert the UTF-8 encoded character at p to its uppercase version and
1493store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1494that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1495the uppercase version may be longer than the original character.
d3e79532
JH
1496
1497The first character of the uppercased version is returned
1498(but note, as explained above, that there may be more.)
1499
1500=cut */
1501
2104c8d9 1502UV
7fc63493 1503Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1504{
983ffd37 1505 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1506 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1507}
a0ed51b3 1508
d3e79532 1509/*
7fc63493 1510=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1511
1512Convert the UTF-8 encoded character at p to its titlecase version and
1513store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1514that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1515titlecase version may be longer than the original character.
d3e79532
JH
1516
1517The first character of the titlecased version is returned
1518(but note, as explained above, that there may be more.)
1519
1520=cut */
1521
983ffd37 1522UV
7fc63493 1523Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37
JH
1524{
1525 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1526 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1527}
1528
d3e79532 1529/*
7fc63493 1530=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1531
1532Convert the UTF-8 encoded character at p to its lowercase version and
1533store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1534that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1535lowercase version may be longer than the original character.
d3e79532
JH
1536
1537The first character of the lowercased version is returned
1538(but note, as explained above, that there may be more.)
1539
1540=cut */
1541
2104c8d9 1542UV
7fc63493 1543Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1544{
983ffd37 1545 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1546 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1547}
1548
d3e79532 1549/*
7fc63493 1550=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1551
1552Convert the UTF-8 encoded character at p to its foldcase version and
1553store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1554that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1555foldcase version may be longer than the original character (up to
1556three characters).
1557
1558The first character of the foldcased version is returned
1559(but note, as explained above, that there may be more.)
1560
1561=cut */
1562
b4e400f9 1563UV
7fc63493 1564Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9
JH
1565{
1566 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1567 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1568}
1569
1570/* a "swash" is a swatch hash */
1571
1572SV*
7fc63493 1573Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1574{
27da23d5 1575 dVAR;
a0ed51b3 1576 SV* retval;
9a957fbc 1577 SV* const tokenbufsv = sv_newmortal();
8e84507e 1578 dSP;
7fc63493
AL
1579 const size_t pkg_len = strlen(pkg);
1580 const size_t name_len = strlen(name);
aec46f14 1581 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1582 SV* errsv_save;
ce3b816e 1583
96ca9f55
DM
1584 PUSHSTACKi(PERLSI_MAGIC);
1585 ENTER;
1586 SAVEI32(PL_hints);
1587 PL_hints = 0;
1588 save_re_context();
1b026014 1589 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1590 ENTER;
f8be5cf0 1591 errsv_save = newSVsv(ERRSV);
71bed85a
NC
1592 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1593 Nullsv);
f8be5cf0
JH
1594 if (!SvTRUE(ERRSV))
1595 sv_setsv(ERRSV, errsv_save);
1596 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1597 LEAVE;
1598 }
1599 SPAGAIN;
a0ed51b3
LW
1600 PUSHMARK(SP);
1601 EXTEND(SP,5);
71bed85a
NC
1602 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1603 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1604 PUSHs(listsv);
1605 PUSHs(sv_2mortal(newSViv(minbits)));
1606 PUSHs(sv_2mortal(newSViv(none)));
1607 PUTBACK;
923e4eb5 1608 if (IN_PERL_COMPILETIME) {
bf1fed83 1609 /* XXX ought to be handled by lex_start */
82686b01 1610 SAVEI32(PL_in_my);
2b4bd638 1611 PL_in_my = 0;
bf1fed83 1612 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1613 }
f8be5cf0 1614 errsv_save = newSVsv(ERRSV);
864dbfa3 1615 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1616 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1617 else
e24b16f9 1618 retval = &PL_sv_undef;
f8be5cf0
JH
1619 if (!SvTRUE(ERRSV))
1620 sv_setsv(ERRSV, errsv_save);
1621 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1622 LEAVE;
1623 POPSTACK;
923e4eb5 1624 if (IN_PERL_COMPILETIME) {
bf1fed83 1625 STRLEN len;
aec46f14 1626 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83
JH
1627
1628 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1629 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1630 }
bc45ce41
JH
1631 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1632 if (SvPOK(retval))
35c1215d
NC
1633 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1634 retval);
cea2e8a9 1635 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1636 }
a0ed51b3
LW
1637 return retval;
1638}
1639
035d37be
JH
1640
1641/* This API is wrong for special case conversions since we may need to
1642 * return several Unicode characters for a single Unicode character
1643 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1644 * the lower-level routine, and it is similarly broken for returning
1645 * multiple values. --jhi */
a0ed51b3 1646UV
7fc63493 1647Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
a0ed51b3 1648{
27da23d5 1649 dVAR;
aec46f14 1650 HV* const hv = (HV*)SvRV(sv);
3568d838
JH
1651 U32 klen;
1652 U32 off;
a0ed51b3 1653 STRLEN slen;
7d85a32c 1654 STRLEN needents;
cfd0369c 1655 const U8 *tmps = NULL;
a0ed51b3
LW
1656 U32 bit;
1657 SV *retval;
3568d838
JH
1658 U8 tmputf8[2];
1659 UV c = NATIVE_TO_ASCII(*ptr);
1660
1661 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463
GS
1662 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1663 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838
JH
1664 ptr = tmputf8;
1665 }
1666 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1667 * then the "swatch" is a vec() for al the chars which start
1668 * with 0xAA..0xYY
1669 * So the key in the hash (klen) is length of encoded char -1
1670 */
1671 klen = UTF8SKIP(ptr) - 1;
1672 off = ptr[klen];
a0ed51b3 1673
7d85a32c
JH
1674 if (klen == 0)
1675 {
1676 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1677 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c
JH
1678 */
1679 needents = UTF_CONTINUATION_MARK;
1680 off = NATIVE_TO_UTF(ptr[klen]);
1681 }
1682 else
1683 {
1684 /* If char is encoded then swatch is for the prefix */
1685 needents = (1 << UTF_ACCUMULATION_SHIFT);
1686 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1687 }
1688
a0ed51b3
LW
1689 /*
1690 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1691 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1692 * it's nothing to sniff at.) Pity we usually come through at least
1693 * two function calls to get here...
1694 *
1695 * NB: this code assumes that swatches are never modified, once generated!
1696 */
1697
3568d838 1698 if (hv == PL_last_swash_hv &&
a0ed51b3 1699 klen == PL_last_swash_klen &&
27da23d5 1700 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1701 {
1702 tmps = PL_last_swash_tmps;
1703 slen = PL_last_swash_slen;
1704 }
1705 else {
1706 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1707 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3
LW
1708
1709 /* If not cached, generate it via utf8::SWASHGET */
cfd0369c 1710 if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
a0ed51b3 1711 dSP;
2b9d42f0
NIS
1712 /* We use utf8n_to_uvuni() as we want an index into
1713 Unicode tables, not a native character number.
1714 */
aec46f14 1715 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1716 ckWARN(WARN_UTF8) ?
1717 0 : UTF8_ALLOW_ANY);
f8be5cf0 1718 SV *errsv_save;
a0ed51b3
LW
1719 ENTER;
1720 SAVETMPS;
1721 save_re_context();
1722 PUSHSTACKi(PERLSI_MAGIC);
1723 PUSHMARK(SP);
1724 EXTEND(SP,3);
1725 PUSHs((SV*)sv);
ffbc6a93 1726 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838
JH
1727 PUSHs(sv_2mortal(newSViv((klen) ?
1728 (code_point & ~(needents - 1)) : 0)));
a0ed51b3
LW
1729 PUSHs(sv_2mortal(newSViv(needents)));
1730 PUTBACK;
f8be5cf0 1731 errsv_save = newSVsv(ERRSV);
864dbfa3 1732 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1733 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1734 else
e24b16f9 1735 retval = &PL_sv_undef;
f8be5cf0
JH
1736 if (!SvTRUE(ERRSV))
1737 sv_setsv(ERRSV, errsv_save);
1738 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1739 POPSTACK;
1740 FREETMPS;
1741 LEAVE;
923e4eb5 1742 if (IN_PERL_COMPILETIME)
eb160463 1743 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1744
e1ec3a88 1745 svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
a0ed51b3 1746
7d85a32c 1747 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1748 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1749 }
1750
1751 PL_last_swash_hv = hv;
1752 PL_last_swash_klen = klen;
cfd0369c
NC
1753 /* FIXME change interpvar.h? */
1754 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1755 PL_last_swash_slen = slen;
1756 if (klen)
1757 Copy(ptr, PL_last_swash_key, klen, U8);
1758 }
1759
9faf8d75 1760 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1761 case 1:
1762 bit = 1 << (off & 7);
1763 off >>= 3;
1764 return (tmps[off] & bit) != 0;
1765 case 8:
1766 return tmps[off];
1767 case 16:
1768 off <<= 1;
1769 return (tmps[off] << 8) + tmps[off + 1] ;
1770 case 32:
1771 off <<= 2;
1772 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1773 }
cea2e8a9 1774 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1775 return 0;
1776}
2b9d42f0 1777
b851fbc1
JH
1778U8 *
1779Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1780{
1781 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1782}
2b9d42f0
NIS
1783
1784/*
d2cc3551
JH
1785=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1786
1787Build to the scalar dsv a displayable version of the string spv,
1788length len, the displayable version being at most pvlim bytes long
1789(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1790
9e55ce06 1791The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1792isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
1793to display the \\[nrfta\\] as the backslashed versions (like '\n')
1794(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1795UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1796UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1797
d2cc3551
JH
1798The pointer to the PV of the dsv is returned.
1799
1800=cut */
e6b2e755 1801char *
e1ec3a88 1802Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
1803{
1804 int truncated = 0;
e1ec3a88 1805 const char *s, *e;
e6b2e755
JH
1806
1807 sv_setpvn(dsv, "", 0);
e1ec3a88 1808 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 1809 UV u;
a49f32c6
NC
1810 /* This serves double duty as a flag and a character to print after
1811 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1812 */
1813 char ok = 0;
c728cb41 1814
e6b2e755
JH
1815 if (pvlim && SvCUR(dsv) >= pvlim) {
1816 truncated++;
1817 break;
1818 }
1819 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1820 if (u < 256) {
a3b680e6 1821 const unsigned char c = (unsigned char)u & 0xFF;
c728cb41 1822 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
a49f32c6 1823 switch (c) {
c728cb41 1824 case '\n':
a49f32c6 1825 ok = 'n'; break;
c728cb41 1826 case '\r':
a49f32c6 1827 ok = 'r'; break;
c728cb41 1828 case '\t':
a49f32c6 1829 ok = 't'; break;
c728cb41 1830 case '\f':
a49f32c6 1831 ok = 'f'; break;
c728cb41 1832 case '\a':
a49f32c6 1833 ok = 'a'; break;
c728cb41 1834 case '\\':
a49f32c6 1835 ok = '\\'; break;
c728cb41
JH
1836 default: break;
1837 }
a49f32c6
NC
1838 if (ok) {
1839 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1840 }
c728cb41 1841 }
00e86452 1842 /* isPRINT() is the locale-blind version. */
a49f32c6
NC
1843 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1844 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1845 ok = 1;
0a2ef054 1846 }
c728cb41
JH
1847 }
1848 if (!ok)
9e55ce06 1849 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
1850 }
1851 if (truncated)
1852 sv_catpvn(dsv, "...", 3);
1853
1854 return SvPVX(dsv);
1855}
2b9d42f0 1856
d2cc3551
JH
1857/*
1858=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1859
1860Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1861the displayable version being at most pvlim bytes long
d2cc3551 1862(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
1863
1864The flags argument is as in pv_uni_display().
1865
d2cc3551
JH
1866The pointer to the PV of the dsv is returned.
1867
1868=cut */
e6b2e755
JH
1869char *
1870Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1871{
cfd0369c
NC
1872 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
1873 SvCUR(ssv), pvlim, flags);
701a277b
JH
1874}
1875
d2cc3551 1876/*
d07ddd77 1877=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
1878
1879Return true if the strings s1 and s2 differ case-insensitively, false
1880if not (if they are equal case-insensitively). If u1 is true, the
1881string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
1882the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1883are false, the respective string is assumed to be in native 8-bit
1884encoding.
1885
1886If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1887in there (they will point at the beginning of the I<next> character).
1888If the pointers behind pe1 or pe2 are non-NULL, they are the end
1889pointers beyond which scanning will not continue under any
4cdaeff7 1890circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
1891s2+l2 will be used as goal end pointers that will also stop the scan,
1892and which qualify towards defining a successful match: all the scans
1893that define an explicit length must reach their goal pointers for
1894a match to succeed).
d2cc3551
JH
1895
1896For case-insensitiveness, the "casefolding" of Unicode is used
1897instead of upper/lowercasing both the characters, see
1898http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1899
1900=cut */
701a277b 1901I32
d07ddd77 1902Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1903{
e1ec3a88
AL
1904 register const U8 *p1 = (const U8*)s1;
1905 register const U8 *p2 = (const U8*)s2;
1906 register const U8 *f1 = 0, *f2 = 0;
1907 register U8 *e1 = 0, *q1 = 0;
1908 register U8 *e2 = 0, *q2 = 0;
d07ddd77 1909 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
1910 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
1911 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
1912 U8 natbuf[1+1];
1913 STRLEN foldlen1, foldlen2;
d07ddd77 1914 bool match;
332ddc25 1915
d07ddd77
JH
1916 if (pe1)
1917 e1 = *(U8**)pe1;
e1ec3a88
AL
1918 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
1919 f1 = (const U8*)s1 + l1;
d07ddd77
JH
1920 if (pe2)
1921 e2 = *(U8**)pe2;
e1ec3a88
AL
1922 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
1923 f2 = (const U8*)s2 + l2;
d07ddd77
JH
1924
1925 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1926 return 1; /* mismatch; possible infinite loop or false positive */
1927
a6872d42
JH
1928 if (!u1 || !u2)
1929 natbuf[1] = 0; /* Need to terminate the buffer. */
1930
d07ddd77
JH
1931 while ((e1 == 0 || p1 < e1) &&
1932 (f1 == 0 || p1 < f1) &&
1933 (e2 == 0 || p2 < e2) &&
1934 (f2 == 0 || p2 < f2)) {
1935 if (n1 == 0) {
d7f013c8
JH
1936 if (u1)
1937 to_utf8_fold(p1, foldbuf1, &foldlen1);
1938 else {
809e8e66 1939 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
1940 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1941 }
1942 q1 = foldbuf1;
d07ddd77 1943 n1 = foldlen1;
332ddc25 1944 }
d07ddd77 1945 if (n2 == 0) {
d7f013c8
JH
1946 if (u2)
1947 to_utf8_fold(p2, foldbuf2, &foldlen2);
1948 else {
809e8e66 1949 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
1950 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1951 }
1952 q2 = foldbuf2;
d07ddd77 1953 n2 = foldlen2;
332ddc25 1954 }
d07ddd77
JH
1955 while (n1 && n2) {
1956 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1957 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1958 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 1959 return 1; /* mismatch */
d07ddd77 1960 n1 -= UTF8SKIP(q1);
d7f013c8 1961 q1 += UTF8SKIP(q1);
d07ddd77 1962 n2 -= UTF8SKIP(q2);
d7f013c8 1963 q2 += UTF8SKIP(q2);
701a277b 1964 }
d07ddd77 1965 if (n1 == 0)
d7f013c8 1966 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 1967 if (n2 == 0)
d7f013c8
JH
1968 p2 += u2 ? UTF8SKIP(p2) : 1;
1969
d2cc3551 1970 }
5469e704 1971
d07ddd77
JH
1972 /* A match is defined by all the scans that specified
1973 * an explicit length reaching their final goals. */
1974 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
1975
1976 if (match) {
d07ddd77
JH
1977 if (pe1)
1978 *pe1 = (char*)p1;
1979 if (pe2)
1980 *pe2 = (char*)p2;
5469e704
JH
1981 }
1982
1983 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 1984}
701a277b 1985
a49f32c6
NC
1986/*
1987 * Local variables:
1988 * c-indentation-style: bsd
1989 * c-basic-offset: 4
1990 * indent-tabs-mode: t
1991 * End:
1992 *
37442d52
RGS
1993 * ex: set ts=8 sts=4 sw=4 noet:
1994 */