This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Coverage stats showed that there were no tests for taking a slice
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
af3babe4
NC
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
4 * others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3
LW
26#include "perl.h"
27
27da23d5
JH
28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7
JH
31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29
DM
34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1
JH
47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1
JH
53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485
SC
57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6
NIS
90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463
GS
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
104 return d;
105 }
106 if (uv < 0x10000) {
eb160463
GS
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
110 return d;
111 }
112 if (uv < 0x200000) {
eb160463
GS
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463
GS
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
138#endif
139 {
eb160463
GS
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463
GS
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
9041c2e3 169
646ca15d
JH
170/*
171
172Tests if some arbitrary number of bytes begins in a valid UTF-8
173character. Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character. The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
176
177This is the "slow" version as opposed to the "fast" version which is
178the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
179difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
180or less you should use the IS_UTF8_CHAR(), for lengths of five or more
181you should use the _slow(). In practice this means that the _slow()
182will be used very rarely, since the maximum Unicode code point (as of
183Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
184the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
185five bytes or more.
186
187=cut */
c053b435 188STATIC STRLEN
646ca15d
JH
189S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
190{
191 U8 u = *s;
192 STRLEN slen;
193 UV uv, ouv;
194
195 if (UTF8_IS_INVARIANT(u))
196 return 1;
197
198 if (!UTF8_IS_START(u))
199 return 0;
200
201 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
202 return 0;
203
204 slen = len - 1;
205 s++;
77263263
ST
206#ifdef EBCDIC
207 u = NATIVE_TO_UTF(u);
208#endif
646ca15d
JH
209 u &= UTF_START_MASK(len);
210 uv = u;
211 ouv = uv;
212 while (slen--) {
213 if (!UTF8_IS_CONTINUATION(*s))
214 return 0;
215 uv = UTF8_ACCUMULATE(uv, *s);
216 if (uv < ouv)
217 return 0;
218 ouv = uv;
219 s++;
220 }
221
222 if ((STRLEN)UNISKIP(uv) < len)
223 return 0;
224
225 return len;
226}
9041c2e3
NIS
227
228/*
7fc63493 229=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 230
5da9da9e 231Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
232character. Note that an INVARIANT (i.e. ASCII) character is a valid
233UTF-8 character. The actual number of bytes in the UTF-8 character
234will be returned if it is valid, otherwise 0.
9041c2e3 235
82686b01 236=cut */
067a85ef 237STRLEN
7fc63493 238Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 239{
44f8325f 240 const STRLEN len = UTF8SKIP(s);
3b0fc154 241#ifdef IS_UTF8_CHAR
768c67ee 242 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
243 return IS_UTF8_CHAR(s, len) ? len : 0;
244#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 245 return is_utf8_char_slow(s, len);
386d01d6
GS
246}
247
6662521e 248/*
7fc63493 249=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 250
c9ada85f 251Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
252UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
253not mean 'a string that contains code points above 0x7F encoded in UTF-8'
254because a valid ASCII string is a valid UTF-8 string.
6662521e 255
768c67ee
JH
256See also is_utf8_string_loclen() and is_utf8_string_loc().
257
6662521e
GS
258=cut
259*/
260
8e84507e 261bool
7fc63493 262Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 263{
7fc63493
AL
264 const U8* x = s;
265 const U8* send;
067a85ef 266
44f8325f 267 if (!len)
e1ec3a88 268 len = strlen((const char *)s);
1aa99e6b
IH
269 send = s + len;
270
6662521e 271 while (x < send) {
a3b680e6 272 STRLEN c;
1acdb0da
JH
273 /* Inline the easy bits of is_utf8_char() here for speed... */
274 if (UTF8_IS_INVARIANT(*x))
275 c = 1;
276 else if (!UTF8_IS_START(*x))
768c67ee 277 goto out;
1acdb0da
JH
278 else {
279 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
280#ifdef IS_UTF8_CHAR
281 c = UTF8SKIP(x);
768c67ee
JH
282 if (IS_UTF8_CHAR_FAST(c)) {
283 if (!IS_UTF8_CHAR(x, c))
284 goto out;
285 } else if (!is_utf8_char_slow(x, c))
286 goto out;
646ca15d
JH
287#else
288 c = is_utf8_char(x);
289#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 290 if (!c)
768c67ee 291 goto out;
1acdb0da 292 }
6662521e 293 x += c;
6662521e 294 }
768c67ee
JH
295
296 out:
60006e79
JH
297 if (x != send)
298 return FALSE;
067a85ef
A
299
300 return TRUE;
6662521e
GS
301}
302
67e989fb 303/*
814fafa7
NC
304Implemented as a macro in utf8.h
305
306=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
307
308Like is_utf8_string() but stores the location of the failure (in the
309case of "utf8ness failure") or the location s+len (in the case of
310"utf8ness success") in the C<ep>.
311
312See also is_utf8_string_loclen() and is_utf8_string().
313
768c67ee 314=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 315
e3e4599f 316Like is_utf8_string() but stores the location of the failure (in the
768c67ee
JH
317case of "utf8ness failure") or the location s+len (in the case of
318"utf8ness success") in the C<ep>, and the number of UTF-8
319encoded characters in the C<el>.
320
321See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
322
323=cut
324*/
325
326bool
768c67ee 327Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 328{
7fc63493
AL
329 const U8* x = s;
330 const U8* send;
81cd54e3
JH
331 STRLEN c;
332
44f8325f 333 if (!len)
e1ec3a88 334 len = strlen((const char *)s);
81cd54e3 335 send = s + len;
768c67ee
JH
336 if (el)
337 *el = 0;
81cd54e3
JH
338
339 while (x < send) {
340 /* Inline the easy bits of is_utf8_char() here for speed... */
341 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
342 c = 1;
343 else if (!UTF8_IS_START(*x))
344 goto out;
81cd54e3 345 else {
768c67ee
JH
346 /* ... and call is_utf8_char() only if really needed. */
347#ifdef IS_UTF8_CHAR
348 c = UTF8SKIP(x);
349 if (IS_UTF8_CHAR_FAST(c)) {
350 if (!IS_UTF8_CHAR(x, c))
351 c = 0;
352 } else
353 c = is_utf8_char_slow(x, c);
354#else
355 c = is_utf8_char(x);
356#endif /* #ifdef IS_UTF8_CHAR */
357 if (!c)
358 goto out;
81cd54e3 359 }
768c67ee
JH
360 x += c;
361 if (el)
362 (*el)++;
81cd54e3 363 }
768c67ee
JH
364
365 out:
366 if (ep)
367 *ep = x;
368 if (x != send)
81cd54e3 369 return FALSE;
81cd54e3
JH
370
371 return TRUE;
372}
373
374/*
768c67ee 375
7fc63493 376=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 377
9041c2e3
NIS
378Bottom level UTF-8 decode routine.
379Returns the unicode code point value of the first character in the string C<s>
1e54db1a 380which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 381C<retlen> will be set to the length, in bytes, of that character.
67e989fb 382
1e54db1a 383If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
384is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
385it is assumed that the caller will raise a warning, and this function
28d3d195
JH
386will silently just set C<retlen> to C<-1> and return zero. If the
387C<flags> does not contain UTF8_CHECK_ONLY, warnings about
388malformations will be given, C<retlen> will be set to the expected
389length of the UTF-8 character in bytes, and zero will be returned.
390
391The C<flags> can also contain various flags to allow deviations from
392the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 393
9041c2e3
NIS
394Most code should use utf8_to_uvchr() rather than call this directly.
395
37607a96
PK
396=cut
397*/
67e989fb 398
a0ed51b3 399UV
7fc63493 400Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 401{
7fc63493 402 const U8 *s0 = s;
9c5ffd7c 403 UV uv = *s, ouv = 0;
ba210ebe 404 STRLEN len = 1;
7fc63493
AL
405 const bool dowarn = ckWARN_d(WARN_UTF8);
406 const UV startbyte = *s;
ba210ebe 407 STRLEN expectlen = 0;
a0dbb045
JH
408 U32 warning = 0;
409
410/* This list is a superset of the UTF8_ALLOW_XXX. */
411
412#define UTF8_WARN_EMPTY 1
413#define UTF8_WARN_CONTINUATION 2
414#define UTF8_WARN_NON_CONTINUATION 3
415#define UTF8_WARN_FE_FF 4
416#define UTF8_WARN_SHORT 5
417#define UTF8_WARN_OVERFLOW 6
418#define UTF8_WARN_SURROGATE 7
c867b360
JH
419#define UTF8_WARN_LONG 8
420#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
421
422 if (curlen == 0 &&
423 !(flags & UTF8_ALLOW_EMPTY)) {
424 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
425 goto malformed;
426 }
427
1d72bdf6 428 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
429 if (retlen)
430 *retlen = 1;
c4d5f83a 431 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 432 }
67e989fb 433
421a8bf2 434 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 435 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 436 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
437 goto malformed;
438 }
439
421a8bf2 440 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 441 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 442 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
443 goto malformed;
444 }
9041c2e3 445
1d72bdf6 446#ifdef EBCDIC
75383841 447 uv = NATIVE_TO_UTF(uv);
1d72bdf6 448#else
fcc8fcf6
JH
449 if ((uv == 0xfe || uv == 0xff) &&
450 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 451 warning = UTF8_WARN_FE_FF;
ba210ebe 452 goto malformed;
a0ed51b3 453 }
1d72bdf6
NIS
454#endif
455
ba210ebe
JH
456 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
457 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
458 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
459 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
460#ifdef EBCDIC
461 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
462 else { len = 7; uv &= 0x01; }
463#else
ba210ebe
JH
464 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
465 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
466 else { len = 13; uv = 0; } /* whoa! */
467#endif
468
a0ed51b3
LW
469 if (retlen)
470 *retlen = len;
9041c2e3 471
ba210ebe
JH
472 expectlen = len;
473
fcc8fcf6
JH
474 if ((curlen < expectlen) &&
475 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 476 warning = UTF8_WARN_SHORT;
ba210ebe
JH
477 goto malformed;
478 }
479
480 len--;
a0ed51b3 481 s++;
ba210ebe
JH
482 ouv = uv;
483
a0ed51b3 484 while (len--) {
421a8bf2
JH
485 if (!UTF8_IS_CONTINUATION(*s) &&
486 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
487 s--;
488 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 489 goto malformed;
a0ed51b3
LW
490 }
491 else
8850bf83 492 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
493 if (!(uv > ouv)) {
494 /* These cannot be allowed. */
495 if (uv == ouv) {
75dbc644 496 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
497 warning = UTF8_WARN_LONG;
498 goto malformed;
499 }
500 }
501 else { /* uv < ouv */
502 /* This cannot be allowed. */
503 warning = UTF8_WARN_OVERFLOW;
504 goto malformed;
505 }
ba210ebe
JH
506 }
507 s++;
508 ouv = uv;
509 }
510
421a8bf2 511 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 512 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 513 warning = UTF8_WARN_SURROGATE;
ba210ebe 514 goto malformed;
eb160463 515 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 516 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 517 warning = UTF8_WARN_LONG;
ba210ebe 518 goto malformed;
421a8bf2 519 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 520 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 521 warning = UTF8_WARN_FFFF;
a9917092 522 goto malformed;
a0ed51b3 523 }
ba210ebe 524
a0ed51b3 525 return uv;
ba210ebe
JH
526
527malformed:
528
fcc8fcf6 529 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 530 if (retlen)
cc366d4b 531 *retlen = -1;
ba210ebe
JH
532 return 0;
533 }
534
a0dbb045 535 if (dowarn) {
44f8325f 536 SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
a0dbb045
JH
537
538 switch (warning) {
539 case 0: /* Intentionally empty. */ break;
540 case UTF8_WARN_EMPTY:
54667de8 541 Perl_sv_catpv(aTHX_ sv, "(empty string)");
a0dbb045
JH
542 break;
543 case UTF8_WARN_CONTINUATION:
097fb8e2 544 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
545 break;
546 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
547 if (s == s0)
548 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
549 (UV)s[1], startbyte);
551405c4
AL
550 else {
551 const int len = (int)(s-s0);
097fb8e2 552 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4
AL
553 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
554 }
555
a0dbb045
JH
556 break;
557 case UTF8_WARN_FE_FF:
558 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
559 break;
560 case UTF8_WARN_SHORT:
097fb8e2 561 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 562 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 563 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
564 break;
565 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
566 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
567 ouv, *s, startbyte);
a0dbb045
JH
568 break;
569 case UTF8_WARN_SURROGATE:
570 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
571 break;
a0dbb045 572 case UTF8_WARN_LONG:
097fb8e2 573 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 574 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
575 break;
576 case UTF8_WARN_FFFF:
577 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
578 break;
579 default:
54667de8 580 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
a0dbb045
JH
581 break;
582 }
583
584 if (warning) {
44f8325f 585 const char * const s = SvPVX_const(sv);
a0dbb045
JH
586
587 if (PL_op)
9014280d 588 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 589 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 590 else
9014280d 591 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
592 }
593 }
594
ba210ebe 595 if (retlen)
28d3d195 596 *retlen = expectlen ? expectlen : len;
ba210ebe 597
28d3d195 598 return 0;
a0ed51b3
LW
599}
600
8e84507e 601/*
7fc63493 602=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3
NIS
603
604Returns the native character value of the first character in the string C<s>
1e54db1a 605which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
606length, in bytes, of that character.
607
1e54db1a 608If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
609returned and retlen is set, if possible, to -1.
610
611=cut
612*/
613
614UV
7fc63493 615Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 616{
89ebb4a3 617 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 618 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
619}
620
621/*
7fc63493 622=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3
NIS
623
624Returns the Unicode code point of the first character in the string C<s>
1e54db1a 625which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
626length, in bytes, of that character.
627
628This function should only be used when returned UV is considered
629an index into the Unicode semantic tables (e.g. swashes).
630
1e54db1a 631If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 632returned and retlen is set, if possible, to -1.
8e84507e
NIS
633
634=cut
635*/
636
637UV
7fc63493 638Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 639{
9041c2e3 640 /* Call the low level routine asking for checks */
89ebb4a3 641 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 642 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
643}
644
b76347f2 645/*
35a4481c 646=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2
JH
647
648Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
649Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
650up past C<e>, croaks.
b76347f2
JH
651
652=cut
653*/
654
655STRLEN
35a4481c 656Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
657{
658 STRLEN len = 0;
659
8850bf83
JH
660 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
661 * the bitops (especially ~) can create illegal UTF-8.
662 * In other words: in Perl UTF-8 is not just for Unicode. */
663
a3b680e6
AL
664 if (e < s)
665 goto warn_and_return;
b76347f2 666 while (s < e) {
4373e329 667 const U8 t = UTF8SKIP(s);
901b21bf 668 if (e - s < t) {
a3b680e6 669 warn_and_return:
901b21bf
JH
670 if (ckWARN_d(WARN_UTF8)) {
671 if (PL_op)
672 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 673 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
674 else
675 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
676 }
677 return len;
678 }
b76347f2
JH
679 s += t;
680 len++;
681 }
682
683 return len;
684}
685
b06226ff 686/*
35a4481c 687=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 688
1e54db1a 689Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
690and C<b>.
691
692WARNING: use only if you *know* that the pointers point inside the
693same UTF-8 buffer.
694
37607a96
PK
695=cut
696*/
a0ed51b3 697
02eb7b47 698IV
35a4481c 699Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 700{
02eb7b47
JH
701 IV off = 0;
702
8850bf83
JH
703 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
704 * the bitops (especially ~) can create illegal UTF-8.
705 * In other words: in Perl UTF-8 is not just for Unicode. */
706
a0ed51b3
LW
707 if (a < b) {
708 while (a < b) {
35a4481c 709 const U8 c = UTF8SKIP(a);
a3b680e6
AL
710 if (b - a < c)
711 goto warn_and_return;
02eb7b47 712 a += c;
a0ed51b3
LW
713 off--;
714 }
715 }
716 else {
717 while (b < a) {
4373e329 718 const U8 c = UTF8SKIP(b);
02eb7b47 719
901b21bf 720 if (a - b < c) {
a3b680e6 721 warn_and_return:
901b21bf
JH
722 if (ckWARN_d(WARN_UTF8)) {
723 if (PL_op)
724 Perl_warner(aTHX_ packWARN(WARN_UTF8),
725 "%s in %s", unees, OP_DESC(PL_op));
726 else
727 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
728 }
729 return off;
730 }
02eb7b47 731 b += c;
a0ed51b3
LW
732 off++;
733 }
734 }
02eb7b47 735
a0ed51b3
LW
736 return off;
737}
738
b06226ff 739/*
37607a96 740=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 741
8850bf83
JH
742Return the UTF-8 pointer C<s> displaced by C<off> characters, either
743forward or backward.
b06226ff
JH
744
745WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
746the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
747on the first byte of character or just after the last byte of a character.
b06226ff 748
37607a96
PK
749=cut
750*/
a0ed51b3
LW
751
752U8 *
4373e329 753Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 754{
8850bf83
JH
755 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
756 * the bitops (especially ~) can create illegal UTF-8.
757 * In other words: in Perl UTF-8 is not just for Unicode. */
758
a0ed51b3
LW
759 if (off >= 0) {
760 while (off--)
761 s += UTF8SKIP(s);
762 }
763 else {
764 while (off++) {
765 s--;
8850bf83
JH
766 while (UTF8_IS_CONTINUATION(*s))
767 s--;
a0ed51b3
LW
768 }
769 }
4373e329 770 return (U8 *)s;
a0ed51b3
LW
771}
772
6940069f 773/*
eebe1485 774=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 775
1e54db1a 776Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
777Unlike C<bytes_to_utf8>, this over-writes the original string, and
778updates len to contain the new length.
67e989fb 779Returns zero on failure, setting C<len> to -1.
6940069f
GS
780
781=cut
782*/
783
784U8 *
37607a96 785Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 786{
6940069f
GS
787 U8 *send;
788 U8 *d;
dcad2880 789 U8 *save = s;
246fae53 790
1e54db1a 791 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880
JH
792 for (send = s + *len; s < send; ) {
793 U8 c = *s++;
794
1d72bdf6
NIS
795 if (!UTF8_IS_INVARIANT(c) &&
796 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
797 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
798 *len = -1;
799 return 0;
800 }
246fae53 801 }
dcad2880
JH
802
803 d = s = save;
6940069f 804 while (s < send) {
ed646e6e 805 STRLEN ulen;
9041c2e3 806 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 807 s += ulen;
6940069f
GS
808 }
809 *d = '\0';
246fae53 810 *len = d - save;
6940069f
GS
811 return save;
812}
813
814/*
e1ec3a88 815=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 816
1e54db1a 817Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 818Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
819the newly-created string, and updates C<len> to contain the new
820length. Returns the original string if no conversion occurs, C<len>
821is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8220 if C<s> is converted or contains all 7bit characters.
f9a63242 823
37607a96
PK
824=cut
825*/
f9a63242
JH
826
827U8 *
e1ec3a88 828Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 829{
f9a63242 830 U8 *d;
e1ec3a88
AL
831 const U8 *start = s;
832 const U8 *send;
f9a63242
JH
833 I32 count = 0;
834
835 if (!*is_utf8)
73d840c0 836 return (U8 *)start;
f9a63242 837
1e54db1a 838 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 839 for (send = s + *len; s < send;) {
e1ec3a88 840 U8 c = *s++;
1d72bdf6 841 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
842 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
843 (c = *s++) && UTF8_IS_CONTINUATION(c))
844 count++;
845 else
73d840c0 846 return (U8 *)start;
db42d148 847 }
f9a63242
JH
848 }
849
850 *is_utf8 = 0;
851
a02a5408 852 Newxz(d, (*len) - count + 1, U8);
ef9edfd0 853 s = start; start = d;
f9a63242
JH
854 while (s < send) {
855 U8 c = *s++;
c4d5f83a
NIS
856 if (!UTF8_IS_INVARIANT(c)) {
857 /* Then it is two-byte encoded */
858 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
859 c = ASCII_TO_NATIVE(c);
860 }
861 *d++ = c;
f9a63242
JH
862 }
863 *d = '\0';
864 *len = d - start;
73d840c0 865 return (U8 *)start;
f9a63242
JH
866}
867
868/*
35a4481c 869=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 870
1e54db1a 871Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
872Returns a pointer to the newly-created string, and sets C<len> to
873reflect the new length.
6940069f 874
1e54db1a 875If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
876see sv_recode_to_utf8().
877
497711e7 878=cut
6940069f
GS
879*/
880
881U8*
35a4481c 882Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 883{
35a4481c 884 const U8 * const send = s + (*len);
6940069f
GS
885 U8 *d;
886 U8 *dst;
6940069f 887
a02a5408 888 Newxz(d, (*len) * 2 + 1, U8);
6940069f
GS
889 dst = d;
890
891 while (s < send) {
35a4481c 892 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 893 if (UNI_IS_INVARIANT(uv))
eb160463 894 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 895 else {
eb160463
GS
896 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
897 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
898 }
899 }
900 *d = '\0';
6662521e 901 *len = d-dst;
6940069f
GS
902 return dst;
903}
904
a0ed51b3 905/*
dea0fc0b 906 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
907 *
908 * Destination must be pre-extended to 3/2 source. Do not use in-place.
909 * We optimize for native, for obvious reasons. */
910
911U8*
dea0fc0b 912Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 913{
dea0fc0b
JH
914 U8* pend;
915 U8* dstart = d;
916
1de9afcd
RGS
917 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
918 d[0] = 0;
919 *newlen = 1;
920 return d;
921 }
922
dea0fc0b 923 if (bytelen & 1)
014ead4b 924 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b
JH
925
926 pend = p + bytelen;
927
a0ed51b3 928 while (p < pend) {
dea0fc0b
JH
929 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
930 p += 2;
a0ed51b3 931 if (uv < 0x80) {
eb160463 932 *d++ = (U8)uv;
a0ed51b3
LW
933 continue;
934 }
935 if (uv < 0x800) {
eb160463
GS
936 *d++ = (U8)(( uv >> 6) | 0xc0);
937 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
938 continue;
939 }
940 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
DT
941 UV low = (p[0] << 8) + p[1];
942 p += 2;
dea0fc0b
JH
943 if (low < 0xdc00 || low >= 0xdfff)
944 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
945 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
946 }
947 if (uv < 0x10000) {
eb160463
GS
948 *d++ = (U8)(( uv >> 12) | 0xe0);
949 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
950 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
951 continue;
952 }
953 else {
eb160463
GS
954 *d++ = (U8)(( uv >> 18) | 0xf0);
955 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
956 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
957 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
958 continue;
959 }
960 }
dea0fc0b 961 *newlen = d - dstart;
a0ed51b3
LW
962 return d;
963}
964
965/* Note: this one is slightly destructive of the source. */
966
967U8*
dea0fc0b 968Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
969{
970 U8* s = (U8*)p;
971 U8* send = s + bytelen;
972 while (s < send) {
973 U8 tmp = s[0];
974 s[0] = s[1];
975 s[1] = tmp;
976 s += 2;
977 }
dea0fc0b 978 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
979}
980
981/* for now these are all defined (inefficiently) in terms of the utf8 versions */
982
983bool
84afefe6 984Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 985{
89ebb4a3 986 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 987 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
988 return is_utf8_alnum(tmpbuf);
989}
990
991bool
84afefe6 992Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 993{
89ebb4a3 994 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 995 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
996 return is_utf8_alnumc(tmpbuf);
997}
998
999bool
84afefe6 1000Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1001{
89ebb4a3 1002 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1003 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1004 return is_utf8_idfirst(tmpbuf);
1005}
1006
1007bool
84afefe6 1008Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1009{
89ebb4a3 1010 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1011 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1012 return is_utf8_alpha(tmpbuf);
1013}
1014
1015bool
84afefe6 1016Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1017{
89ebb4a3 1018 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1019 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1020 return is_utf8_ascii(tmpbuf);
1021}
1022
1023bool
84afefe6 1024Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1025{
89ebb4a3 1026 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1027 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1028 return is_utf8_space(tmpbuf);
1029}
1030
1031bool
84afefe6 1032Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1033{
89ebb4a3 1034 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1035 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1036 return is_utf8_digit(tmpbuf);
1037}
1038
1039bool
84afefe6 1040Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1041{
89ebb4a3 1042 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1043 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1044 return is_utf8_upper(tmpbuf);
1045}
1046
1047bool
84afefe6 1048Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1049{
89ebb4a3 1050 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1051 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1052 return is_utf8_lower(tmpbuf);
1053}
1054
1055bool
84afefe6 1056Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1057{
89ebb4a3 1058 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1059 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1060 return is_utf8_cntrl(tmpbuf);
1061}
1062
1063bool
84afefe6 1064Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1065{
89ebb4a3 1066 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1067 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1068 return is_utf8_graph(tmpbuf);
1069}
1070
1071bool
84afefe6 1072Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1073{
89ebb4a3 1074 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1075 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1076 return is_utf8_print(tmpbuf);
1077}
1078
b8c5462f 1079bool
84afefe6 1080Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1081{
89ebb4a3 1082 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1083 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1084 return is_utf8_punct(tmpbuf);
1085}
1086
4d61ec05 1087bool
84afefe6 1088Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1089{
89ebb4a3 1090 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1091 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1092 return is_utf8_xdigit(tmpbuf);
1093}
1094
84afefe6
JH
1095UV
1096Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1097{
0ebc6274
JH
1098 uvchr_to_utf8(p, c);
1099 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1100}
1101
84afefe6
JH
1102UV
1103Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1104{
0ebc6274
JH
1105 uvchr_to_utf8(p, c);
1106 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1107}
1108
84afefe6
JH
1109UV
1110Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1111{
0ebc6274
JH
1112 uvchr_to_utf8(p, c);
1113 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1114}
1115
84afefe6
JH
1116UV
1117Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1118{
0ebc6274
JH
1119 uvchr_to_utf8(p, c);
1120 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1121}
1122
a0ed51b3
LW
1123/* for now these all assume no locale info available for Unicode > 255 */
1124
1125bool
84afefe6 1126Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1127{
1128 return is_uni_alnum(c); /* XXX no locale support yet */
1129}
1130
1131bool
84afefe6 1132Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1133{
1134 return is_uni_alnumc(c); /* XXX no locale support yet */
1135}
1136
1137bool
84afefe6 1138Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1139{
1140 return is_uni_idfirst(c); /* XXX no locale support yet */
1141}
1142
1143bool
84afefe6 1144Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1145{
1146 return is_uni_alpha(c); /* XXX no locale support yet */
1147}
1148
1149bool
84afefe6 1150Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1151{
1152 return is_uni_ascii(c); /* XXX no locale support yet */
1153}
1154
1155bool
84afefe6 1156Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1157{
1158 return is_uni_space(c); /* XXX no locale support yet */
1159}
1160
1161bool
84afefe6 1162Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1163{
1164 return is_uni_digit(c); /* XXX no locale support yet */
1165}
1166
1167bool
84afefe6 1168Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1169{
1170 return is_uni_upper(c); /* XXX no locale support yet */
1171}
1172
1173bool
84afefe6 1174Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1175{
1176 return is_uni_lower(c); /* XXX no locale support yet */
1177}
1178
1179bool
84afefe6 1180Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1181{
1182 return is_uni_cntrl(c); /* XXX no locale support yet */
1183}
1184
1185bool
84afefe6 1186Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1187{
1188 return is_uni_graph(c); /* XXX no locale support yet */
1189}
1190
1191bool
84afefe6 1192Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1193{
1194 return is_uni_print(c); /* XXX no locale support yet */
1195}
1196
b8c5462f 1197bool
84afefe6 1198Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1199{
1200 return is_uni_punct(c); /* XXX no locale support yet */
1201}
1202
4d61ec05 1203bool
84afefe6 1204Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1205{
1206 return is_uni_xdigit(c); /* XXX no locale support yet */
1207}
1208
b7ac61fa
JH
1209U32
1210Perl_to_uni_upper_lc(pTHX_ U32 c)
1211{
ee099d14
JH
1212 /* XXX returns only the first character -- do not use XXX */
1213 /* XXX no locale support yet */
1214 STRLEN len;
89ebb4a3 1215 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1216 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1217}
1218
1219U32
1220Perl_to_uni_title_lc(pTHX_ U32 c)
1221{
ee099d14
JH
1222 /* XXX returns only the first character XXX -- do not use XXX */
1223 /* XXX no locale support yet */
1224 STRLEN len;
89ebb4a3 1225 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1226 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1227}
1228
1229U32
1230Perl_to_uni_lower_lc(pTHX_ U32 c)
1231{
ee099d14
JH
1232 /* XXX returns only the first character -- do not use XXX */
1233 /* XXX no locale support yet */
1234 STRLEN len;
89ebb4a3 1235 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1236 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1237}
1238
a0ed51b3 1239bool
5141f98e 1240S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
1241 const char *const swashname)
1242{
1243 if (!is_utf8_char(p))
1244 return FALSE;
1245 if (!*swash)
1246 *swash = swash_init("utf8", swashname, &PL_sv_undef, 0, 0);
1247 return swash_fetch(*swash, p, TRUE) != 0;
1248}
1249
1250bool
7fc63493 1251Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1252{
671c33bf
NC
1253 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1254 * descendant of isalnum(3), in other words, it doesn't
1255 * contain the '_'. --jhi */
1256 return S_is_utf8_common(aTHX_ p, &PL_utf8_alnum, "IsWord");
a0ed51b3
LW
1257}
1258
1259bool
7fc63493 1260Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1261{
671c33bf 1262 return S_is_utf8_common(aTHX_ p, &PL_utf8_alnumc, "IsAlnumC");
b8c5462f
JH
1263}
1264
1265bool
7fc63493 1266Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1267{
82686b01
JH
1268 if (*p == '_')
1269 return TRUE;
bde6a22d
NC
1270 /* is_utf8_idstart would be more logical. */
1271 return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1272}
1273
1274bool
7fc63493 1275Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01
JH
1276{
1277 if (*p == '_')
1278 return TRUE;
bde6a22d 1279 return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1280}
1281
1282bool
7fc63493 1283Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1284{
bde6a22d 1285 return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1286}
1287
1288bool
7fc63493 1289Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1290{
bde6a22d 1291 return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii");
b8c5462f
JH
1292}
1293
1294bool
7fc63493 1295Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1296{
bde6a22d 1297 return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3
LW
1298}
1299
1300bool
7fc63493 1301Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1302{
bde6a22d 1303 return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1304}
1305
1306bool
7fc63493 1307Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1308{
bde6a22d 1309 return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1310}
1311
1312bool
7fc63493 1313Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1314{
bde6a22d 1315 return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1316}
1317
1318bool
7fc63493 1319Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1320{
bde6a22d 1321 return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f
JH
1322}
1323
1324bool
7fc63493 1325Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1326{
bde6a22d 1327 return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1328}
1329
1330bool
7fc63493 1331Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1332{
bde6a22d 1333 return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1334}
1335
1336bool
7fc63493 1337Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1338{
bde6a22d 1339 return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1340}
1341
1342bool
7fc63493 1343Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1344{
bde6a22d 1345 return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f
JH
1346}
1347
1348bool
7fc63493 1349Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1350{
bde6a22d 1351 return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1352}
1353
6b5c0936
JH
1354/*
1355=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1356
1357The "p" contains the pointer to the UTF-8 string encoding
1358the character that is being converted.
1359
1360The "ustrp" is a pointer to the character buffer to put the
1361conversion result to. The "lenp" is a pointer to the length
1362of the result.
1363
0134edef 1364The "swashp" is a pointer to the swash to use.
6b5c0936 1365
0134edef
JH
1366Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1367and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1368but not always, a multicharacter mapping), is tried first.
6b5c0936 1369
0134edef
JH
1370The "special" is a string like "utf8::ToSpecLower", which means the
1371hash %utf8::ToSpecLower. The access to the hash is through
1372Perl_to_utf8_case().
6b5c0936 1373
0134edef
JH
1374The "normal" is a string like "ToLower" which means the swash
1375%utf8::ToLower.
1376
1377=cut */
6b5c0936 1378
2104c8d9 1379UV
9a957fbc
AL
1380Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1381 SV **swashp, const char *normal, const char *special)
a0ed51b3 1382{
89ebb4a3 1383 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1384 STRLEN len = 0;
a0ed51b3 1385
aec46f14 1386 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1387 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1388 * are necessary in EBCDIC, they are redundant no-ops
1389 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1390 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1391 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1392
1393 if (!*swashp) /* load on-demand */
1394 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1395
b08cf34e
JH
1396 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1397 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1398 /* It might be "special" (sometimes, but not always,
2a37f04d 1399 * a multicharacter mapping) */
983ffd37 1400 HV *hv;
b08cf34e
JH
1401 SV **svp;
1402
1403 if ((hv = get_hv(special, FALSE)) &&
1404 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1405 (*svp)) {
cfd0369c 1406 const char *s;
47654450 1407
cfd0369c 1408 s = SvPV_const(*svp, len);
47654450
JH
1409 if (len == 1)
1410 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1411 else {
2f9475ad
JH
1412#ifdef EBCDIC
1413 /* If we have EBCDIC we need to remap the characters
1414 * since any characters in the low 256 are Unicode
1415 * code points, not EBCDIC. */
7cda7a3d 1416 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1417
1418 d = tmpbuf;
b08cf34e 1419 if (SvUTF8(*svp)) {
2f9475ad
JH
1420 STRLEN tlen = 0;
1421
1422 while (t < tend) {
1423 UV c = utf8_to_uvchr(t, &tlen);
1424 if (tlen > 0) {
1425 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1426 t += tlen;
1427 }
1428 else
1429 break;
1430 }
1431 }
1432 else {
36fec512
JH
1433 while (t < tend) {
1434 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1435 t++;
1436 }
2f9475ad
JH
1437 }
1438 len = d - tmpbuf;
1439 Copy(tmpbuf, ustrp, len, U8);
1440#else
d2dcd0fb 1441 Copy(s, ustrp, len, U8);
2f9475ad 1442#endif
29e98929 1443 }
983ffd37 1444 }
0134edef
JH
1445 }
1446
1447 if (!len && *swashp) {
1448 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1449
1450 if (uv2) {
1451 /* It was "normal" (a single character mapping). */
1452 UV uv3 = UNI_TO_NATIVE(uv2);
1453
e9101d72 1454 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1455 }
1456 }
1feea2c7 1457
0134edef
JH
1458 if (!len) /* Neither: just copy. */
1459 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1460
2a37f04d
JH
1461 if (lenp)
1462 *lenp = len;
1463
0134edef 1464 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1465}
1466
d3e79532 1467/*
7fc63493 1468=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1469
1470Convert the UTF-8 encoded character at p to its uppercase version and
1471store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1472that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1473the uppercase version may be longer than the original character.
d3e79532
JH
1474
1475The first character of the uppercased version is returned
1476(but note, as explained above, that there may be more.)
1477
1478=cut */
1479
2104c8d9 1480UV
7fc63493 1481Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1482{
983ffd37 1483 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1484 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1485}
a0ed51b3 1486
d3e79532 1487/*
7fc63493 1488=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1489
1490Convert the UTF-8 encoded character at p to its titlecase version and
1491store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1492that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1493titlecase version may be longer than the original character.
d3e79532
JH
1494
1495The first character of the titlecased version is returned
1496(but note, as explained above, that there may be more.)
1497
1498=cut */
1499
983ffd37 1500UV
7fc63493 1501Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37
JH
1502{
1503 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1504 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1505}
1506
d3e79532 1507/*
7fc63493 1508=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1509
1510Convert the UTF-8 encoded character at p to its lowercase version and
1511store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1512that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1513lowercase version may be longer than the original character.
d3e79532
JH
1514
1515The first character of the lowercased version is returned
1516(but note, as explained above, that there may be more.)
1517
1518=cut */
1519
2104c8d9 1520UV
7fc63493 1521Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1522{
983ffd37 1523 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1524 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1525}
1526
d3e79532 1527/*
7fc63493 1528=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1529
1530Convert the UTF-8 encoded character at p to its foldcase version and
1531store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1532that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1533foldcase version may be longer than the original character (up to
1534three characters).
1535
1536The first character of the foldcased version is returned
1537(but note, as explained above, that there may be more.)
1538
1539=cut */
1540
b4e400f9 1541UV
7fc63493 1542Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9
JH
1543{
1544 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1545 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1546}
1547
1548/* a "swash" is a swatch hash */
1549
1550SV*
7fc63493 1551Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1552{
27da23d5 1553 dVAR;
a0ed51b3 1554 SV* retval;
9a957fbc 1555 SV* const tokenbufsv = sv_newmortal();
8e84507e 1556 dSP;
7fc63493
AL
1557 const size_t pkg_len = strlen(pkg);
1558 const size_t name_len = strlen(name);
aec46f14 1559 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1560 SV* errsv_save;
ce3b816e 1561
96ca9f55
DM
1562 PUSHSTACKi(PERLSI_MAGIC);
1563 ENTER;
1564 SAVEI32(PL_hints);
1565 PL_hints = 0;
1566 save_re_context();
1b026014 1567 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1568 ENTER;
f8be5cf0 1569 errsv_save = newSVsv(ERRSV);
71bed85a
NC
1570 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1571 Nullsv);
f8be5cf0
JH
1572 if (!SvTRUE(ERRSV))
1573 sv_setsv(ERRSV, errsv_save);
1574 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1575 LEAVE;
1576 }
1577 SPAGAIN;
a0ed51b3
LW
1578 PUSHMARK(SP);
1579 EXTEND(SP,5);
71bed85a
NC
1580 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1581 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1582 PUSHs(listsv);
1583 PUSHs(sv_2mortal(newSViv(minbits)));
1584 PUSHs(sv_2mortal(newSViv(none)));
1585 PUTBACK;
923e4eb5 1586 if (IN_PERL_COMPILETIME) {
bf1fed83 1587 /* XXX ought to be handled by lex_start */
82686b01 1588 SAVEI32(PL_in_my);
2b4bd638 1589 PL_in_my = 0;
bf1fed83 1590 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1591 }
f8be5cf0 1592 errsv_save = newSVsv(ERRSV);
864dbfa3 1593 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1594 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1595 else
e24b16f9 1596 retval = &PL_sv_undef;
f8be5cf0
JH
1597 if (!SvTRUE(ERRSV))
1598 sv_setsv(ERRSV, errsv_save);
1599 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1600 LEAVE;
1601 POPSTACK;
923e4eb5 1602 if (IN_PERL_COMPILETIME) {
bf1fed83 1603 STRLEN len;
aec46f14 1604 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83
JH
1605
1606 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1607 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1608 }
bc45ce41
JH
1609 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1610 if (SvPOK(retval))
35c1215d
NC
1611 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1612 retval);
cea2e8a9 1613 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1614 }
a0ed51b3
LW
1615 return retval;
1616}
1617
035d37be
JH
1618
1619/* This API is wrong for special case conversions since we may need to
1620 * return several Unicode characters for a single Unicode character
1621 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1622 * the lower-level routine, and it is similarly broken for returning
1623 * multiple values. --jhi */
a0ed51b3 1624UV
7fc63493 1625Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
a0ed51b3 1626{
27da23d5 1627 dVAR;
aec46f14 1628 HV* const hv = (HV*)SvRV(sv);
3568d838
JH
1629 U32 klen;
1630 U32 off;
a0ed51b3 1631 STRLEN slen;
7d85a32c 1632 STRLEN needents;
cfd0369c 1633 const U8 *tmps = NULL;
a0ed51b3
LW
1634 U32 bit;
1635 SV *retval;
3568d838
JH
1636 U8 tmputf8[2];
1637 UV c = NATIVE_TO_ASCII(*ptr);
1638
1639 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463
GS
1640 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1641 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838
JH
1642 ptr = tmputf8;
1643 }
1644 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1645 * then the "swatch" is a vec() for al the chars which start
1646 * with 0xAA..0xYY
1647 * So the key in the hash (klen) is length of encoded char -1
1648 */
1649 klen = UTF8SKIP(ptr) - 1;
1650 off = ptr[klen];
a0ed51b3 1651
7d85a32c
JH
1652 if (klen == 0)
1653 {
1654 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1655 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c
JH
1656 */
1657 needents = UTF_CONTINUATION_MARK;
1658 off = NATIVE_TO_UTF(ptr[klen]);
1659 }
1660 else
1661 {
1662 /* If char is encoded then swatch is for the prefix */
1663 needents = (1 << UTF_ACCUMULATION_SHIFT);
1664 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1665 }
1666
a0ed51b3
LW
1667 /*
1668 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1669 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1670 * it's nothing to sniff at.) Pity we usually come through at least
1671 * two function calls to get here...
1672 *
1673 * NB: this code assumes that swatches are never modified, once generated!
1674 */
1675
3568d838 1676 if (hv == PL_last_swash_hv &&
a0ed51b3 1677 klen == PL_last_swash_klen &&
27da23d5 1678 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1679 {
1680 tmps = PL_last_swash_tmps;
1681 slen = PL_last_swash_slen;
1682 }
1683 else {
1684 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1685 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3
LW
1686
1687 /* If not cached, generate it via utf8::SWASHGET */
cfd0369c 1688 if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
a0ed51b3 1689 dSP;
2b9d42f0
NIS
1690 /* We use utf8n_to_uvuni() as we want an index into
1691 Unicode tables, not a native character number.
1692 */
aec46f14 1693 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1694 ckWARN(WARN_UTF8) ?
1695 0 : UTF8_ALLOW_ANY);
f8be5cf0 1696 SV *errsv_save;
a0ed51b3
LW
1697 ENTER;
1698 SAVETMPS;
1699 save_re_context();
1700 PUSHSTACKi(PERLSI_MAGIC);
1701 PUSHMARK(SP);
1702 EXTEND(SP,3);
1703 PUSHs((SV*)sv);
ffbc6a93 1704 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838
JH
1705 PUSHs(sv_2mortal(newSViv((klen) ?
1706 (code_point & ~(needents - 1)) : 0)));
a0ed51b3
LW
1707 PUSHs(sv_2mortal(newSViv(needents)));
1708 PUTBACK;
f8be5cf0 1709 errsv_save = newSVsv(ERRSV);
864dbfa3 1710 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1711 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1712 else
e24b16f9 1713 retval = &PL_sv_undef;
f8be5cf0
JH
1714 if (!SvTRUE(ERRSV))
1715 sv_setsv(ERRSV, errsv_save);
1716 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1717 POPSTACK;
1718 FREETMPS;
1719 LEAVE;
923e4eb5 1720 if (IN_PERL_COMPILETIME)
eb160463 1721 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1722
e1ec3a88 1723 svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
a0ed51b3 1724
7d85a32c 1725 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1726 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1727 }
1728
1729 PL_last_swash_hv = hv;
1730 PL_last_swash_klen = klen;
cfd0369c
NC
1731 /* FIXME change interpvar.h? */
1732 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1733 PL_last_swash_slen = slen;
1734 if (klen)
1735 Copy(ptr, PL_last_swash_key, klen, U8);
1736 }
1737
9faf8d75 1738 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1739 case 1:
1740 bit = 1 << (off & 7);
1741 off >>= 3;
1742 return (tmps[off] & bit) != 0;
1743 case 8:
1744 return tmps[off];
1745 case 16:
1746 off <<= 1;
1747 return (tmps[off] << 8) + tmps[off + 1] ;
1748 case 32:
1749 off <<= 2;
1750 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1751 }
cea2e8a9 1752 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1753 return 0;
1754}
2b9d42f0 1755
b851fbc1
JH
1756U8 *
1757Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1758{
1759 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1760}
2b9d42f0
NIS
1761
1762/*
d2cc3551
JH
1763=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1764
1765Build to the scalar dsv a displayable version of the string spv,
1766length len, the displayable version being at most pvlim bytes long
1767(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1768
9e55ce06 1769The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1770isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
1771to display the \\[nrfta\\] as the backslashed versions (like '\n')
1772(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1773UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1774UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1775
d2cc3551
JH
1776The pointer to the PV of the dsv is returned.
1777
1778=cut */
e6b2e755 1779char *
e1ec3a88 1780Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
1781{
1782 int truncated = 0;
e1ec3a88 1783 const char *s, *e;
e6b2e755
JH
1784
1785 sv_setpvn(dsv, "", 0);
e1ec3a88 1786 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 1787 UV u;
a49f32c6
NC
1788 /* This serves double duty as a flag and a character to print after
1789 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1790 */
1791 char ok = 0;
c728cb41 1792
e6b2e755
JH
1793 if (pvlim && SvCUR(dsv) >= pvlim) {
1794 truncated++;
1795 break;
1796 }
1797 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1798 if (u < 256) {
a3b680e6 1799 const unsigned char c = (unsigned char)u & 0xFF;
c728cb41 1800 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
a49f32c6 1801 switch (c) {
c728cb41 1802 case '\n':
a49f32c6 1803 ok = 'n'; break;
c728cb41 1804 case '\r':
a49f32c6 1805 ok = 'r'; break;
c728cb41 1806 case '\t':
a49f32c6 1807 ok = 't'; break;
c728cb41 1808 case '\f':
a49f32c6 1809 ok = 'f'; break;
c728cb41 1810 case '\a':
a49f32c6 1811 ok = 'a'; break;
c728cb41 1812 case '\\':
a49f32c6 1813 ok = '\\'; break;
c728cb41
JH
1814 default: break;
1815 }
a49f32c6
NC
1816 if (ok) {
1817 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1818 }
c728cb41 1819 }
00e86452 1820 /* isPRINT() is the locale-blind version. */
a49f32c6
NC
1821 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1822 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1823 ok = 1;
0a2ef054 1824 }
c728cb41
JH
1825 }
1826 if (!ok)
9e55ce06 1827 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
1828 }
1829 if (truncated)
1830 sv_catpvn(dsv, "...", 3);
1831
1832 return SvPVX(dsv);
1833}
2b9d42f0 1834
d2cc3551
JH
1835/*
1836=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1837
1838Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1839the displayable version being at most pvlim bytes long
d2cc3551 1840(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
1841
1842The flags argument is as in pv_uni_display().
1843
d2cc3551
JH
1844The pointer to the PV of the dsv is returned.
1845
1846=cut */
e6b2e755
JH
1847char *
1848Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1849{
cfd0369c
NC
1850 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
1851 SvCUR(ssv), pvlim, flags);
701a277b
JH
1852}
1853
d2cc3551 1854/*
d07ddd77 1855=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
1856
1857Return true if the strings s1 and s2 differ case-insensitively, false
1858if not (if they are equal case-insensitively). If u1 is true, the
1859string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
1860the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1861are false, the respective string is assumed to be in native 8-bit
1862encoding.
1863
1864If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1865in there (they will point at the beginning of the I<next> character).
1866If the pointers behind pe1 or pe2 are non-NULL, they are the end
1867pointers beyond which scanning will not continue under any
4cdaeff7 1868circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
1869s2+l2 will be used as goal end pointers that will also stop the scan,
1870and which qualify towards defining a successful match: all the scans
1871that define an explicit length must reach their goal pointers for
1872a match to succeed).
d2cc3551
JH
1873
1874For case-insensitiveness, the "casefolding" of Unicode is used
1875instead of upper/lowercasing both the characters, see
1876http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1877
1878=cut */
701a277b 1879I32
d07ddd77 1880Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1881{
e1ec3a88
AL
1882 register const U8 *p1 = (const U8*)s1;
1883 register const U8 *p2 = (const U8*)s2;
1884 register const U8 *f1 = 0, *f2 = 0;
1885 register U8 *e1 = 0, *q1 = 0;
1886 register U8 *e2 = 0, *q2 = 0;
d07ddd77 1887 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
1888 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
1889 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
1890 U8 natbuf[1+1];
1891 STRLEN foldlen1, foldlen2;
d07ddd77 1892 bool match;
332ddc25 1893
d07ddd77
JH
1894 if (pe1)
1895 e1 = *(U8**)pe1;
e1ec3a88
AL
1896 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
1897 f1 = (const U8*)s1 + l1;
d07ddd77
JH
1898 if (pe2)
1899 e2 = *(U8**)pe2;
e1ec3a88
AL
1900 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
1901 f2 = (const U8*)s2 + l2;
d07ddd77
JH
1902
1903 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1904 return 1; /* mismatch; possible infinite loop or false positive */
1905
a6872d42
JH
1906 if (!u1 || !u2)
1907 natbuf[1] = 0; /* Need to terminate the buffer. */
1908
d07ddd77
JH
1909 while ((e1 == 0 || p1 < e1) &&
1910 (f1 == 0 || p1 < f1) &&
1911 (e2 == 0 || p2 < e2) &&
1912 (f2 == 0 || p2 < f2)) {
1913 if (n1 == 0) {
d7f013c8
JH
1914 if (u1)
1915 to_utf8_fold(p1, foldbuf1, &foldlen1);
1916 else {
809e8e66 1917 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
1918 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1919 }
1920 q1 = foldbuf1;
d07ddd77 1921 n1 = foldlen1;
332ddc25 1922 }
d07ddd77 1923 if (n2 == 0) {
d7f013c8
JH
1924 if (u2)
1925 to_utf8_fold(p2, foldbuf2, &foldlen2);
1926 else {
809e8e66 1927 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
1928 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1929 }
1930 q2 = foldbuf2;
d07ddd77 1931 n2 = foldlen2;
332ddc25 1932 }
d07ddd77
JH
1933 while (n1 && n2) {
1934 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1935 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1936 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 1937 return 1; /* mismatch */
d07ddd77 1938 n1 -= UTF8SKIP(q1);
d7f013c8 1939 q1 += UTF8SKIP(q1);
d07ddd77 1940 n2 -= UTF8SKIP(q2);
d7f013c8 1941 q2 += UTF8SKIP(q2);
701a277b 1942 }
d07ddd77 1943 if (n1 == 0)
d7f013c8 1944 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 1945 if (n2 == 0)
d7f013c8
JH
1946 p2 += u2 ? UTF8SKIP(p2) : 1;
1947
d2cc3551 1948 }
5469e704 1949
d07ddd77
JH
1950 /* A match is defined by all the scans that specified
1951 * an explicit length reaching their final goals. */
1952 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
1953
1954 if (match) {
d07ddd77
JH
1955 if (pe1)
1956 *pe1 = (char*)p1;
1957 if (pe2)
1958 *pe2 = (char*)p2;
5469e704
JH
1959 }
1960
1961 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 1962}
701a277b 1963
a49f32c6
NC
1964/*
1965 * Local variables:
1966 * c-indentation-style: bsd
1967 * c-basic-offset: 4
1968 * indent-tabs-mode: t
1969 * End:
1970 *
37442d52
RGS
1971 * ex: set ts=8 sts=4 sw=4 noet:
1972 */