This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix apidoc entries for PUSHMARK and newXSproto.
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
b5f8cc5c 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
a0ed51b3
LW
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3
LW
25#include "perl.h"
26
901b21bf
JH
27static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
28
ccfc67b7
JH
29/*
30=head1 Unicode Support
a0ed51b3 31
b851fbc1 32=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 33
1e54db1a 34Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
eebe1485
SC
35of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
36bytes available. The return value is the pointer to the byte after the
9041c2e3 37end of the new character. In other words,
eebe1485 38
b851fbc1
JH
39 d = uvuni_to_utf8_flags(d, uv, flags);
40
41or, in most cases,
42
9041c2e3 43 d = uvuni_to_utf8(d, uv);
eebe1485 44
b851fbc1
JH
45(which is equivalent to)
46
47 d = uvuni_to_utf8_flags(d, uv, 0);
48
eebe1485
SC
49is the recommended Unicode-aware way of saying
50
51 *(d++) = uv;
52
53=cut
54*/
55
dfe13c55 56U8 *
b851fbc1 57Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 58{
62961d2e 59 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
60 if (UNICODE_IS_SURROGATE(uv) &&
61 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 62 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
63 else if (
64 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
65 !(flags & UNICODE_ALLOW_FDD0))
66 ||
c867b360 67 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
68 !(flags & UNICODE_ALLOW_FFFF))) &&
69 /* UNICODE_ALLOW_SUPER includes
2a20b9da 70 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
71 ((uv <= PERL_UNICODE_MAX) ||
72 !(flags & UNICODE_ALLOW_SUPER))
73 )
9014280d 74 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
75 "Unicode character 0x%04"UVxf" is illegal", uv);
76 }
c4d5f83a 77 if (UNI_IS_INVARIANT(uv)) {
eb160463 78 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
79 return d;
80 }
2d331972 81#if defined(EBCDIC)
1d72bdf6
NIS
82 else {
83 STRLEN len = UNISKIP(uv);
84 U8 *p = d+len-1;
85 while (p > d) {
eb160463 86 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
87 uv >>= UTF_ACCUMULATION_SHIFT;
88 }
eb160463 89 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
90 return d+len;
91 }
92#else /* Non loop style */
a0ed51b3 93 if (uv < 0x800) {
eb160463
GS
94 *d++ = (U8)(( uv >> 6) | 0xc0);
95 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
96 return d;
97 }
98 if (uv < 0x10000) {
eb160463
GS
99 *d++ = (U8)(( uv >> 12) | 0xe0);
100 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
101 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
102 return d;
103 }
104 if (uv < 0x200000) {
eb160463
GS
105 *d++ = (U8)(( uv >> 18) | 0xf0);
106 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
107 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
108 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
109 return d;
110 }
111 if (uv < 0x4000000) {
eb160463
GS
112 *d++ = (U8)(( uv >> 24) | 0xf8);
113 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x80000000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 30) | 0xfc);
121 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
124 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
125 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
126 return d;
127 }
6b8eaf93 128#ifdef HAS_QUAD
d7578b48 129 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
130#endif
131 {
eb160463
GS
132 *d++ = 0xfe; /* Can't match U+FEFF! */
133 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
134 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
135 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
136 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
137 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
138 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
139 return d;
140 }
6b8eaf93 141#ifdef HAS_QUAD
a0ed51b3 142 {
eb160463
GS
143 *d++ = 0xff; /* Can't match U+FFFE! */
144 *d++ = 0x80; /* 6 Reserved bits */
145 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
146 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
147 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
148 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
149 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
150 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
151 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
152 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
153 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
154 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
155 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
156 return d;
157 }
158#endif
1d72bdf6 159#endif /* Loop style */
a0ed51b3 160}
b851fbc1
JH
161
162U8 *
163Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
164{
165 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
166}
9041c2e3
NIS
167
168
169/*
eebe1485
SC
170=for apidoc A|STRLEN|is_utf8_char|U8 *s
171
5da9da9e 172Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
173character. Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character. The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
9041c2e3 176
82686b01 177=cut */
067a85ef 178STRLEN
386d01d6
GS
179Perl_is_utf8_char(pTHX_ U8 *s)
180{
181 U8 u = *s;
067a85ef
A
182 STRLEN slen, len;
183 UV uv, ouv;
386d01d6 184
1d72bdf6 185 if (UTF8_IS_INVARIANT(u))
386d01d6
GS
186 return 1;
187
60006e79 188 if (!UTF8_IS_START(u))
386d01d6
GS
189 return 0;
190
9f07fdcd 191 len = UTF8SKIP(s);
386d01d6 192
60006e79 193 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef
A
194 return 0;
195
386d01d6
GS
196 slen = len - 1;
197 s++;
6eb6869e 198 u &= UTF_START_MASK(len);
1d72bdf6 199 uv = u;
067a85ef 200 ouv = uv;
386d01d6 201 while (slen--) {
60006e79 202 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 203 return 0;
8850bf83 204 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 205 if (uv < ouv)
067a85ef
A
206 return 0;
207 ouv = uv;
386d01d6
GS
208 s++;
209 }
067a85ef 210
eb160463 211 if ((STRLEN)UNISKIP(uv) < len)
067a85ef
A
212 return 0;
213
386d01d6
GS
214 return len;
215}
216
6662521e 217/*
eebe1485 218=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 219
c9ada85f 220Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
221UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
222not mean 'a string that contains code points above 0x7F encoded in UTF-8'
223because a valid ASCII string is a valid UTF-8 string.
6662521e
GS
224
225=cut
226*/
227
8e84507e 228bool
6662521e
GS
229Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
230{
067a85ef 231 U8* x = s;
1aa99e6b 232 U8* send;
067a85ef
A
233 STRLEN c;
234
1aa99e6b 235 if (!len)
6cd5fe39 236 len = strlen((char *)s);
1aa99e6b
IH
237 send = s + len;
238
6662521e 239 while (x < send) {
1acdb0da
JH
240 /* Inline the easy bits of is_utf8_char() here for speed... */
241 if (UTF8_IS_INVARIANT(*x))
242 c = 1;
243 else if (!UTF8_IS_START(*x))
244 return FALSE;
245 else {
246 /* ... and call is_utf8_char() only if really needed. */
247 c = is_utf8_char(x);
248 if (!c)
249 return FALSE;
250 }
6662521e 251 x += c;
6662521e 252 }
60006e79
JH
253 if (x != send)
254 return FALSE;
067a85ef
A
255
256 return TRUE;
6662521e
GS
257}
258
67e989fb 259/*
81cd54e3
JH
260=for apidoc A|bool|is_utf8_string_loc|U8 *s|STRLEN len|U8 **p
261
262Like is_ut8_string but store the location of the failure in
263the last argument.
264
265=cut
266*/
267
268bool
269Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **p)
270{
271 U8* x = s;
272 U8* send;
273 STRLEN c;
274
275 if (!len)
276 len = strlen((char *)s);
277 send = s + len;
278
279 while (x < send) {
280 /* Inline the easy bits of is_utf8_char() here for speed... */
281 if (UTF8_IS_INVARIANT(*x))
282 c = 1;
283 else if (!UTF8_IS_START(*x)) {
284 if (p)
285 *p = x;
286 return FALSE;
287 }
288 else {
289 /* ... and call is_utf8_char() only if really needed. */
290 c = is_utf8_char(x);
291 if (!c) {
292 if (p)
293 *p = x;
294 return FALSE;
295 }
296 }
297 x += c;
298 }
299 if (x != send) {
300 if (p)
301 *p = x;
302 return FALSE;
303 }
304
305 return TRUE;
306}
307
308/*
9041c2e3 309=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 310
9041c2e3
NIS
311Bottom level UTF-8 decode routine.
312Returns the unicode code point value of the first character in the string C<s>
1e54db1a 313which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 314C<retlen> will be set to the length, in bytes, of that character.
67e989fb 315
1e54db1a 316If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
317is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
318it is assumed that the caller will raise a warning, and this function
28d3d195
JH
319will silently just set C<retlen> to C<-1> and return zero. If the
320C<flags> does not contain UTF8_CHECK_ONLY, warnings about
321malformations will be given, C<retlen> will be set to the expected
322length of the UTF-8 character in bytes, and zero will be returned.
323
324The C<flags> can also contain various flags to allow deviations from
325the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 326
9041c2e3
NIS
327Most code should use utf8_to_uvchr() rather than call this directly.
328
37607a96
PK
329=cut
330*/
67e989fb 331
a0ed51b3 332UV
37607a96 333Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 334{
097fb8e2 335 U8 *s0 = s;
9c5ffd7c 336 UV uv = *s, ouv = 0;
ba210ebe
JH
337 STRLEN len = 1;
338 bool dowarn = ckWARN_d(WARN_UTF8);
4f849cb6 339 UV startbyte = *s;
ba210ebe 340 STRLEN expectlen = 0;
a0dbb045
JH
341 U32 warning = 0;
342
343/* This list is a superset of the UTF8_ALLOW_XXX. */
344
345#define UTF8_WARN_EMPTY 1
346#define UTF8_WARN_CONTINUATION 2
347#define UTF8_WARN_NON_CONTINUATION 3
348#define UTF8_WARN_FE_FF 4
349#define UTF8_WARN_SHORT 5
350#define UTF8_WARN_OVERFLOW 6
351#define UTF8_WARN_SURROGATE 7
c867b360
JH
352#define UTF8_WARN_LONG 8
353#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
354
355 if (curlen == 0 &&
356 !(flags & UTF8_ALLOW_EMPTY)) {
357 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
358 goto malformed;
359 }
360
1d72bdf6 361 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
362 if (retlen)
363 *retlen = 1;
c4d5f83a 364 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 365 }
67e989fb 366
421a8bf2 367 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 368 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 369 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
370 goto malformed;
371 }
372
421a8bf2 373 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 374 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 375 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
376 goto malformed;
377 }
9041c2e3 378
1d72bdf6 379#ifdef EBCDIC
75383841 380 uv = NATIVE_TO_UTF(uv);
1d72bdf6 381#else
fcc8fcf6
JH
382 if ((uv == 0xfe || uv == 0xff) &&
383 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 384 warning = UTF8_WARN_FE_FF;
ba210ebe 385 goto malformed;
a0ed51b3 386 }
1d72bdf6
NIS
387#endif
388
ba210ebe
JH
389 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
390 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
391 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
392 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
393#ifdef EBCDIC
394 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
395 else { len = 7; uv &= 0x01; }
396#else
ba210ebe
JH
397 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
398 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
399 else { len = 13; uv = 0; } /* whoa! */
400#endif
401
a0ed51b3
LW
402 if (retlen)
403 *retlen = len;
9041c2e3 404
ba210ebe
JH
405 expectlen = len;
406
fcc8fcf6
JH
407 if ((curlen < expectlen) &&
408 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 409 warning = UTF8_WARN_SHORT;
ba210ebe
JH
410 goto malformed;
411 }
412
413 len--;
a0ed51b3 414 s++;
ba210ebe
JH
415 ouv = uv;
416
a0ed51b3 417 while (len--) {
421a8bf2
JH
418 if (!UTF8_IS_CONTINUATION(*s) &&
419 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
420 s--;
421 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 422 goto malformed;
a0ed51b3
LW
423 }
424 else
8850bf83 425 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
426 if (!(uv > ouv)) {
427 /* These cannot be allowed. */
428 if (uv == ouv) {
429 if (!(flags & UTF8_ALLOW_LONG)) {
430 warning = UTF8_WARN_LONG;
431 goto malformed;
432 }
433 }
434 else { /* uv < ouv */
435 /* This cannot be allowed. */
436 warning = UTF8_WARN_OVERFLOW;
437 goto malformed;
438 }
ba210ebe
JH
439 }
440 s++;
441 ouv = uv;
442 }
443
421a8bf2 444 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 445 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 446 warning = UTF8_WARN_SURROGATE;
ba210ebe 447 goto malformed;
eb160463 448 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 449 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 450 warning = UTF8_WARN_LONG;
ba210ebe 451 goto malformed;
421a8bf2 452 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 453 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 454 warning = UTF8_WARN_FFFF;
a9917092 455 goto malformed;
a0ed51b3 456 }
ba210ebe 457
a0ed51b3 458 return uv;
ba210ebe
JH
459
460malformed:
461
fcc8fcf6 462 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 463 if (retlen)
cc366d4b 464 *retlen = -1;
ba210ebe
JH
465 return 0;
466 }
467
a0dbb045
JH
468 if (dowarn) {
469 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
470
471 switch (warning) {
472 case 0: /* Intentionally empty. */ break;
473 case UTF8_WARN_EMPTY:
474 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
475 break;
476 case UTF8_WARN_CONTINUATION:
097fb8e2 477 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
478 break;
479 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
480 if (s == s0)
481 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
482 (UV)s[1], startbyte);
483 else
484 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
485 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
486
a0dbb045
JH
487 break;
488 case UTF8_WARN_FE_FF:
489 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
490 break;
491 case UTF8_WARN_SHORT:
097fb8e2
JH
492 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
493 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
b31f83c2 494 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
495 break;
496 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
497 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
498 ouv, *s, startbyte);
a0dbb045
JH
499 break;
500 case UTF8_WARN_SURROGATE:
501 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
502 break;
a0dbb045 503 case UTF8_WARN_LONG:
097fb8e2
JH
504 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
505 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
506 break;
507 case UTF8_WARN_FFFF:
508 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
509 break;
510 default:
511 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
512 break;
513 }
514
515 if (warning) {
516 char *s = SvPVX(sv);
517
518 if (PL_op)
9014280d 519 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 520 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 521 else
9014280d 522 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
523 }
524 }
525
ba210ebe 526 if (retlen)
28d3d195 527 *retlen = expectlen ? expectlen : len;
ba210ebe 528
28d3d195 529 return 0;
a0ed51b3
LW
530}
531
8e84507e 532/*
37607a96 533=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3
NIS
534
535Returns the native character value of the first character in the string C<s>
1e54db1a 536which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
537length, in bytes, of that character.
538
1e54db1a 539If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
540returned and retlen is set, if possible, to -1.
541
542=cut
543*/
544
545UV
37607a96 546Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3 547{
872c91ae
JH
548 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
549 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
550}
551
552/*
37607a96 553=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3
NIS
554
555Returns the Unicode code point of the first character in the string C<s>
1e54db1a 556which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
557length, in bytes, of that character.
558
559This function should only be used when returned UV is considered
560an index into the Unicode semantic tables (e.g. swashes).
561
1e54db1a 562If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 563returned and retlen is set, if possible, to -1.
8e84507e
NIS
564
565=cut
566*/
567
568UV
37607a96 569Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 570{
9041c2e3 571 /* Call the low level routine asking for checks */
872c91ae
JH
572 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
573 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
574}
575
b76347f2 576/*
37607a96 577=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2
JH
578
579Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
580Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
581up past C<e>, croaks.
b76347f2
JH
582
583=cut
584*/
585
586STRLEN
37607a96 587Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2
JH
588{
589 STRLEN len = 0;
590
8850bf83
JH
591 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
592 * the bitops (especially ~) can create illegal UTF-8.
593 * In other words: in Perl UTF-8 is not just for Unicode. */
594
901b21bf
JH
595 if (e < s) {
596 if (ckWARN_d(WARN_UTF8)) {
597 if (PL_op)
598 Perl_warner(aTHX_ packWARN(WARN_UTF8),
599 "%s in %s", unees, OP_DESC(PL_op));
600 else
601 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
602 }
603 return 0;
604 }
b76347f2 605 while (s < e) {
02eb7b47 606 U8 t = UTF8SKIP(s);
b76347f2 607
901b21bf
JH
608 if (e - s < t) {
609 if (ckWARN_d(WARN_UTF8)) {
610 if (PL_op)
611 Perl_warner(aTHX_ packWARN(WARN_UTF8),
612 unees, OP_DESC(PL_op));
613 else
614 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
615 }
616 return len;
617 }
b76347f2
JH
618 s += t;
619 len++;
620 }
621
622 return len;
623}
624
b06226ff 625/*
eebe1485 626=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 627
1e54db1a 628Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
629and C<b>.
630
631WARNING: use only if you *know* that the pointers point inside the
632same UTF-8 buffer.
633
37607a96
PK
634=cut
635*/
a0ed51b3 636
02eb7b47 637IV
864dbfa3 638Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 639{
02eb7b47
JH
640 IV off = 0;
641
8850bf83
JH
642 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
643 * the bitops (especially ~) can create illegal UTF-8.
644 * In other words: in Perl UTF-8 is not just for Unicode. */
645
a0ed51b3
LW
646 if (a < b) {
647 while (a < b) {
02eb7b47
JH
648 U8 c = UTF8SKIP(a);
649
901b21bf
JH
650 if (b - a < c) {
651 if (ckWARN_d(WARN_UTF8)) {
652 if (PL_op)
653 Perl_warner(aTHX_ packWARN(WARN_UTF8),
654 "%s in %s", unees, OP_DESC(PL_op));
655 else
656 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
657 }
658 return off;
659 }
02eb7b47 660 a += c;
a0ed51b3
LW
661 off--;
662 }
663 }
664 else {
665 while (b < a) {
02eb7b47
JH
666 U8 c = UTF8SKIP(b);
667
901b21bf
JH
668 if (a - b < c) {
669 if (ckWARN_d(WARN_UTF8)) {
670 if (PL_op)
671 Perl_warner(aTHX_ packWARN(WARN_UTF8),
672 "%s in %s", unees, OP_DESC(PL_op));
673 else
674 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
675 }
676 return off;
677 }
02eb7b47 678 b += c;
a0ed51b3
LW
679 off++;
680 }
681 }
02eb7b47 682
a0ed51b3
LW
683 return off;
684}
685
b06226ff 686/*
37607a96 687=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 688
8850bf83
JH
689Return the UTF-8 pointer C<s> displaced by C<off> characters, either
690forward or backward.
b06226ff
JH
691
692WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
693the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
694on the first byte of character or just after the last byte of a character.
b06226ff 695
37607a96
PK
696=cut
697*/
a0ed51b3
LW
698
699U8 *
864dbfa3 700Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 701{
8850bf83
JH
702 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
703 * the bitops (especially ~) can create illegal UTF-8.
704 * In other words: in Perl UTF-8 is not just for Unicode. */
705
a0ed51b3
LW
706 if (off >= 0) {
707 while (off--)
708 s += UTF8SKIP(s);
709 }
710 else {
711 while (off++) {
712 s--;
8850bf83
JH
713 while (UTF8_IS_CONTINUATION(*s))
714 s--;
a0ed51b3
LW
715 }
716 }
717 return s;
718}
719
6940069f 720/*
eebe1485 721=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 722
1e54db1a 723Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
724Unlike C<bytes_to_utf8>, this over-writes the original string, and
725updates len to contain the new length.
67e989fb 726Returns zero on failure, setting C<len> to -1.
6940069f
GS
727
728=cut
729*/
730
731U8 *
37607a96 732Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 733{
6940069f
GS
734 U8 *send;
735 U8 *d;
dcad2880 736 U8 *save = s;
246fae53 737
1e54db1a 738 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880
JH
739 for (send = s + *len; s < send; ) {
740 U8 c = *s++;
741
1d72bdf6
NIS
742 if (!UTF8_IS_INVARIANT(c) &&
743 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
744 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
745 *len = -1;
746 return 0;
747 }
246fae53 748 }
dcad2880
JH
749
750 d = s = save;
6940069f 751 while (s < send) {
ed646e6e 752 STRLEN ulen;
9041c2e3 753 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 754 s += ulen;
6940069f
GS
755 }
756 *d = '\0';
246fae53 757 *len = d - save;
6940069f
GS
758 return save;
759}
760
761/*
f9a63242
JH
762=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
763
1e54db1a 764Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
f9a63242 765Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
766the newly-created string, and updates C<len> to contain the new
767length. Returns the original string if no conversion occurs, C<len>
768is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7690 if C<s> is converted or contains all 7bit characters.
f9a63242 770
37607a96
PK
771=cut
772*/
f9a63242
JH
773
774U8 *
37607a96 775Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 776{
f9a63242
JH
777 U8 *d;
778 U8 *start = s;
db42d148 779 U8 *send;
f9a63242
JH
780 I32 count = 0;
781
782 if (!*is_utf8)
783 return start;
784
1e54db1a 785 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242
JH
786 for (send = s + *len; s < send;) {
787 U8 c = *s++;
1d72bdf6 788 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
789 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
790 (c = *s++) && UTF8_IS_CONTINUATION(c))
791 count++;
792 else
f9a63242 793 return start;
db42d148 794 }
f9a63242
JH
795 }
796
797 *is_utf8 = 0;
798
f9a63242 799 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 800 s = start; start = d;
f9a63242
JH
801 while (s < send) {
802 U8 c = *s++;
c4d5f83a
NIS
803 if (!UTF8_IS_INVARIANT(c)) {
804 /* Then it is two-byte encoded */
805 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
806 c = ASCII_TO_NATIVE(c);
807 }
808 *d++ = c;
f9a63242
JH
809 }
810 *d = '\0';
811 *len = d - start;
812 return start;
813}
814
815/*
eebe1485 816=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 817
1e54db1a 818Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
819Returns a pointer to the newly-created string, and sets C<len> to
820reflect the new length.
6940069f 821
1e54db1a 822If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
823see sv_recode_to_utf8().
824
497711e7 825=cut
6940069f
GS
826*/
827
828U8*
37607a96 829Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 830{
6940069f
GS
831 U8 *send;
832 U8 *d;
833 U8 *dst;
6662521e 834 send = s + (*len);
6940069f 835
6662521e 836 Newz(801, d, (*len) * 2 + 1, U8);
6940069f
GS
837 dst = d;
838
839 while (s < send) {
db42d148 840 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 841 if (UNI_IS_INVARIANT(uv))
eb160463 842 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 843 else {
eb160463
GS
844 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
845 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
846 }
847 }
848 *d = '\0';
6662521e 849 *len = d-dst;
6940069f
GS
850 return dst;
851}
852
a0ed51b3 853/*
dea0fc0b 854 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
855 *
856 * Destination must be pre-extended to 3/2 source. Do not use in-place.
857 * We optimize for native, for obvious reasons. */
858
859U8*
dea0fc0b 860Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 861{
dea0fc0b
JH
862 U8* pend;
863 U8* dstart = d;
864
865 if (bytelen & 1)
a7867d0a 866 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b
JH
867
868 pend = p + bytelen;
869
a0ed51b3 870 while (p < pend) {
dea0fc0b
JH
871 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
872 p += 2;
a0ed51b3 873 if (uv < 0x80) {
eb160463 874 *d++ = (U8)uv;
a0ed51b3
LW
875 continue;
876 }
877 if (uv < 0x800) {
eb160463
GS
878 *d++ = (U8)(( uv >> 6) | 0xc0);
879 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
880 continue;
881 }
882 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
TD
883 UV low = (p[0] << 8) + p[1];
884 p += 2;
dea0fc0b
JH
885 if (low < 0xdc00 || low >= 0xdfff)
886 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
887 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
888 }
889 if (uv < 0x10000) {
eb160463
GS
890 *d++ = (U8)(( uv >> 12) | 0xe0);
891 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
892 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
893 continue;
894 }
895 else {
eb160463
GS
896 *d++ = (U8)(( uv >> 18) | 0xf0);
897 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
898 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
899 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
900 continue;
901 }
902 }
dea0fc0b 903 *newlen = d - dstart;
a0ed51b3
LW
904 return d;
905}
906
907/* Note: this one is slightly destructive of the source. */
908
909U8*
dea0fc0b 910Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
911{
912 U8* s = (U8*)p;
913 U8* send = s + bytelen;
914 while (s < send) {
915 U8 tmp = s[0];
916 s[0] = s[1];
917 s[1] = tmp;
918 s += 2;
919 }
dea0fc0b 920 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
921}
922
923/* for now these are all defined (inefficiently) in terms of the utf8 versions */
924
925bool
84afefe6 926Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 927{
ad391ad9 928 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 929 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
930 return is_utf8_alnum(tmpbuf);
931}
932
933bool
84afefe6 934Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 935{
ad391ad9 936 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 937 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
938 return is_utf8_alnumc(tmpbuf);
939}
940
941bool
84afefe6 942Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 943{
ad391ad9 944 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 945 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
946 return is_utf8_idfirst(tmpbuf);
947}
948
949bool
84afefe6 950Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 951{
ad391ad9 952 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 953 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
954 return is_utf8_alpha(tmpbuf);
955}
956
957bool
84afefe6 958Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 959{
ad391ad9 960 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 961 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
962 return is_utf8_ascii(tmpbuf);
963}
964
965bool
84afefe6 966Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 967{
ad391ad9 968 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 969 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
970 return is_utf8_space(tmpbuf);
971}
972
973bool
84afefe6 974Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 975{
ad391ad9 976 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 977 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
978 return is_utf8_digit(tmpbuf);
979}
980
981bool
84afefe6 982Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 983{
ad391ad9 984 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 985 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
986 return is_utf8_upper(tmpbuf);
987}
988
989bool
84afefe6 990Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 991{
ad391ad9 992 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 993 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
994 return is_utf8_lower(tmpbuf);
995}
996
997bool
84afefe6 998Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 999{
ad391ad9 1000 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1001 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1002 return is_utf8_cntrl(tmpbuf);
1003}
1004
1005bool
84afefe6 1006Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1007{
ad391ad9 1008 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1009 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1010 return is_utf8_graph(tmpbuf);
1011}
1012
1013bool
84afefe6 1014Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1015{
ad391ad9 1016 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1017 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1018 return is_utf8_print(tmpbuf);
1019}
1020
b8c5462f 1021bool
84afefe6 1022Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1023{
ad391ad9 1024 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1025 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1026 return is_utf8_punct(tmpbuf);
1027}
1028
4d61ec05 1029bool
84afefe6 1030Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1031{
e7ae6809 1032 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
230880c1 1033 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1034 return is_utf8_xdigit(tmpbuf);
1035}
1036
84afefe6
JH
1037UV
1038Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1039{
0ebc6274
JH
1040 uvchr_to_utf8(p, c);
1041 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1042}
1043
84afefe6
JH
1044UV
1045Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1046{
0ebc6274
JH
1047 uvchr_to_utf8(p, c);
1048 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1049}
1050
84afefe6
JH
1051UV
1052Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1053{
0ebc6274
JH
1054 uvchr_to_utf8(p, c);
1055 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1056}
1057
84afefe6
JH
1058UV
1059Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1060{
0ebc6274
JH
1061 uvchr_to_utf8(p, c);
1062 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1063}
1064
a0ed51b3
LW
1065/* for now these all assume no locale info available for Unicode > 255 */
1066
1067bool
84afefe6 1068Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1069{
1070 return is_uni_alnum(c); /* XXX no locale support yet */
1071}
1072
1073bool
84afefe6 1074Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1075{
1076 return is_uni_alnumc(c); /* XXX no locale support yet */
1077}
1078
1079bool
84afefe6 1080Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1081{
1082 return is_uni_idfirst(c); /* XXX no locale support yet */
1083}
1084
1085bool
84afefe6 1086Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1087{
1088 return is_uni_alpha(c); /* XXX no locale support yet */
1089}
1090
1091bool
84afefe6 1092Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1093{
1094 return is_uni_ascii(c); /* XXX no locale support yet */
1095}
1096
1097bool
84afefe6 1098Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1099{
1100 return is_uni_space(c); /* XXX no locale support yet */
1101}
1102
1103bool
84afefe6 1104Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1105{
1106 return is_uni_digit(c); /* XXX no locale support yet */
1107}
1108
1109bool
84afefe6 1110Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1111{
1112 return is_uni_upper(c); /* XXX no locale support yet */
1113}
1114
1115bool
84afefe6 1116Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1117{
1118 return is_uni_lower(c); /* XXX no locale support yet */
1119}
1120
1121bool
84afefe6 1122Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1123{
1124 return is_uni_cntrl(c); /* XXX no locale support yet */
1125}
1126
1127bool
84afefe6 1128Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1129{
1130 return is_uni_graph(c); /* XXX no locale support yet */
1131}
1132
1133bool
84afefe6 1134Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1135{
1136 return is_uni_print(c); /* XXX no locale support yet */
1137}
1138
b8c5462f 1139bool
84afefe6 1140Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1141{
1142 return is_uni_punct(c); /* XXX no locale support yet */
1143}
1144
4d61ec05 1145bool
84afefe6 1146Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1147{
1148 return is_uni_xdigit(c); /* XXX no locale support yet */
1149}
1150
b7ac61fa
JH
1151U32
1152Perl_to_uni_upper_lc(pTHX_ U32 c)
1153{
ee099d14
JH
1154 /* XXX returns only the first character -- do not use XXX */
1155 /* XXX no locale support yet */
1156 STRLEN len;
1157 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1158 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1159}
1160
1161U32
1162Perl_to_uni_title_lc(pTHX_ U32 c)
1163{
ee099d14
JH
1164 /* XXX returns only the first character XXX -- do not use XXX */
1165 /* XXX no locale support yet */
1166 STRLEN len;
1167 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1168 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1169}
1170
1171U32
1172Perl_to_uni_lower_lc(pTHX_ U32 c)
1173{
ee099d14
JH
1174 /* XXX returns only the first character -- do not use XXX */
1175 /* XXX no locale support yet */
1176 STRLEN len;
1177 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1178 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1179}
1180
a0ed51b3 1181bool
864dbfa3 1182Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1183{
386d01d6
GS
1184 if (!is_utf8_char(p))
1185 return FALSE;
a0ed51b3 1186 if (!PL_utf8_alnum)
289d4f09
ML
1187 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1188 * descendant of isalnum(3), in other words, it doesn't
1189 * contain the '_'. --jhi */
1190 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
eb160463 1191 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1192/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1193#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1194 if (!PL_utf8_alnum)
1195 PL_utf8_alnum = swash_init("utf8", "",
1196 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1197 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1198#endif
1199}
1200
1201bool
b8c5462f
JH
1202Perl_is_utf8_alnumc(pTHX_ U8 *p)
1203{
386d01d6
GS
1204 if (!is_utf8_char(p))
1205 return FALSE;
b8c5462f
JH
1206 if (!PL_utf8_alnum)
1207 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
eb160463 1208 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1209/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1210#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1211 if (!PL_utf8_alnum)
1212 PL_utf8_alnum = swash_init("utf8", "",
1213 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1214 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1215#endif
1216}
1217
1218bool
82686b01 1219Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
a0ed51b3 1220{
82686b01
JH
1221 if (*p == '_')
1222 return TRUE;
1223 if (!is_utf8_char(p))
1224 return FALSE;
1225 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1226 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
eb160463 1227 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
82686b01
JH
1228}
1229
1230bool
1231Perl_is_utf8_idcont(pTHX_ U8 *p)
1232{
1233 if (*p == '_')
1234 return TRUE;
1235 if (!is_utf8_char(p))
1236 return FALSE;
1237 if (!PL_utf8_idcont)
1238 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
eb160463 1239 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
a0ed51b3
LW
1240}
1241
1242bool
864dbfa3 1243Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1244{
386d01d6
GS
1245 if (!is_utf8_char(p))
1246 return FALSE;
a0ed51b3 1247 if (!PL_utf8_alpha)
e24b16f9 1248 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
eb160463 1249 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
a0ed51b3
LW
1250}
1251
1252bool
b8c5462f
JH
1253Perl_is_utf8_ascii(pTHX_ U8 *p)
1254{
386d01d6
GS
1255 if (!is_utf8_char(p))
1256 return FALSE;
b8c5462f
JH
1257 if (!PL_utf8_ascii)
1258 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
eb160463 1259 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
b8c5462f
JH
1260}
1261
1262bool
864dbfa3 1263Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1264{
386d01d6
GS
1265 if (!is_utf8_char(p))
1266 return FALSE;
a0ed51b3 1267 if (!PL_utf8_space)
3bec3564 1268 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
eb160463 1269 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
a0ed51b3
LW
1270}
1271
1272bool
864dbfa3 1273Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1274{
386d01d6
GS
1275 if (!is_utf8_char(p))
1276 return FALSE;
a0ed51b3 1277 if (!PL_utf8_digit)
e24b16f9 1278 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
eb160463 1279 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
a0ed51b3
LW
1280}
1281
1282bool
864dbfa3 1283Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1284{
386d01d6
GS
1285 if (!is_utf8_char(p))
1286 return FALSE;
a0ed51b3 1287 if (!PL_utf8_upper)
c65e4d19 1288 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
eb160463 1289 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
a0ed51b3
LW
1290}
1291
1292bool
864dbfa3 1293Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1294{
386d01d6
GS
1295 if (!is_utf8_char(p))
1296 return FALSE;
a0ed51b3 1297 if (!PL_utf8_lower)
c65e4d19 1298 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
eb160463 1299 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
a0ed51b3
LW
1300}
1301
1302bool
b8c5462f
JH
1303Perl_is_utf8_cntrl(pTHX_ U8 *p)
1304{
386d01d6
GS
1305 if (!is_utf8_char(p))
1306 return FALSE;
b8c5462f
JH
1307 if (!PL_utf8_cntrl)
1308 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
eb160463 1309 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
b8c5462f
JH
1310}
1311
1312bool
1313Perl_is_utf8_graph(pTHX_ U8 *p)
1314{
386d01d6
GS
1315 if (!is_utf8_char(p))
1316 return FALSE;
b8c5462f
JH
1317 if (!PL_utf8_graph)
1318 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
eb160463 1319 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
b8c5462f
JH
1320}
1321
1322bool
864dbfa3 1323Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1324{
386d01d6
GS
1325 if (!is_utf8_char(p))
1326 return FALSE;
a0ed51b3 1327 if (!PL_utf8_print)
e24b16f9 1328 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
eb160463 1329 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
a0ed51b3
LW
1330}
1331
1332bool
b8c5462f
JH
1333Perl_is_utf8_punct(pTHX_ U8 *p)
1334{
386d01d6
GS
1335 if (!is_utf8_char(p))
1336 return FALSE;
b8c5462f
JH
1337 if (!PL_utf8_punct)
1338 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
eb160463 1339 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
b8c5462f
JH
1340}
1341
1342bool
1343Perl_is_utf8_xdigit(pTHX_ U8 *p)
1344{
386d01d6
GS
1345 if (!is_utf8_char(p))
1346 return FALSE;
b8c5462f
JH
1347 if (!PL_utf8_xdigit)
1348 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
eb160463 1349 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
b8c5462f
JH
1350}
1351
1352bool
864dbfa3 1353Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1354{
386d01d6
GS
1355 if (!is_utf8_char(p))
1356 return FALSE;
a0ed51b3 1357 if (!PL_utf8_mark)
e24b16f9 1358 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
eb160463 1359 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
a0ed51b3
LW
1360}
1361
6b5c0936
JH
1362/*
1363=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1364
1365The "p" contains the pointer to the UTF-8 string encoding
1366the character that is being converted.
1367
1368The "ustrp" is a pointer to the character buffer to put the
1369conversion result to. The "lenp" is a pointer to the length
1370of the result.
1371
0134edef 1372The "swashp" is a pointer to the swash to use.
6b5c0936 1373
0134edef
JH
1374Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1375and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1376but not always, a multicharacter mapping), is tried first.
6b5c0936 1377
0134edef
JH
1378The "special" is a string like "utf8::ToSpecLower", which means the
1379hash %utf8::ToSpecLower. The access to the hash is through
1380Perl_to_utf8_case().
6b5c0936 1381
0134edef
JH
1382The "normal" is a string like "ToLower" which means the swash
1383%utf8::ToLower.
1384
1385=cut */
6b5c0936 1386
2104c8d9 1387UV
a6872d42 1388Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
a0ed51b3 1389{
0134edef 1390 UV uv0, uv1;
2f9475ad 1391 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
0134edef 1392 STRLEN len = 0;
a0ed51b3 1393
1feea2c7
JH
1394 uv0 = utf8_to_uvchr(p, 0);
1395 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1396 * are necessary in EBCDIC, they are redundant no-ops
1397 * in ASCII-ish platforms, and hopefully optimized away. */
1398 uv1 = NATIVE_TO_UNI(uv0);
1399 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1400
1401 if (!*swashp) /* load on-demand */
1402 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1403
b08cf34e
JH
1404 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1405 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1406 /* It might be "special" (sometimes, but not always,
2a37f04d 1407 * a multicharacter mapping) */
983ffd37 1408 HV *hv;
b08cf34e
JH
1409 SV **svp;
1410
1411 if ((hv = get_hv(special, FALSE)) &&
1412 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1413 (*svp)) {
1414 char *s;
47654450 1415
b08cf34e 1416 s = SvPV(*svp, len);
47654450
JH
1417 if (len == 1)
1418 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1419 else {
2f9475ad
JH
1420#ifdef EBCDIC
1421 /* If we have EBCDIC we need to remap the characters
1422 * since any characters in the low 256 are Unicode
1423 * code points, not EBCDIC. */
7cda7a3d 1424 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1425
1426 d = tmpbuf;
b08cf34e 1427 if (SvUTF8(*svp)) {
2f9475ad
JH
1428 STRLEN tlen = 0;
1429
1430 while (t < tend) {
1431 UV c = utf8_to_uvchr(t, &tlen);
1432 if (tlen > 0) {
1433 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1434 t += tlen;
1435 }
1436 else
1437 break;
1438 }
1439 }
1440 else {
36fec512
JH
1441 while (t < tend) {
1442 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1443 t++;
1444 }
2f9475ad
JH
1445 }
1446 len = d - tmpbuf;
1447 Copy(tmpbuf, ustrp, len, U8);
1448#else
d2dcd0fb 1449 Copy(s, ustrp, len, U8);
2f9475ad 1450#endif
29e98929 1451 }
983ffd37 1452 }
0134edef
JH
1453 }
1454
1455 if (!len && *swashp) {
1456 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1457
1458 if (uv2) {
1459 /* It was "normal" (a single character mapping). */
1460 UV uv3 = UNI_TO_NATIVE(uv2);
1461
e9101d72 1462 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1463 }
1464 }
1feea2c7 1465
0134edef
JH
1466 if (!len) /* Neither: just copy. */
1467 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1468
2a37f04d
JH
1469 if (lenp)
1470 *lenp = len;
1471
0134edef 1472 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1473}
1474
d3e79532
JH
1475/*
1476=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1477
1478Convert the UTF-8 encoded character at p to its uppercase version and
1479store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1480that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1481uppercase version may be longer than the original character (up to two
1482characters).
1483
1484The first character of the uppercased version is returned
1485(but note, as explained above, that there may be more.)
1486
1487=cut */
1488
2104c8d9 1489UV
983ffd37 1490Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1491{
983ffd37 1492 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1493 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1494}
a0ed51b3 1495
d3e79532
JH
1496/*
1497=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1498
1499Convert the UTF-8 encoded character at p to its titlecase version and
1500store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1501that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1502titlecase version may be longer than the original character (up to two
1503characters).
1504
1505The first character of the titlecased version is returned
1506(but note, as explained above, that there may be more.)
1507
1508=cut */
1509
983ffd37
JH
1510UV
1511Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1512{
1513 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1514 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1515}
1516
d3e79532
JH
1517/*
1518=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1519
1520Convert the UTF-8 encoded character at p to its lowercase version and
1521store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1522that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1523lowercase version may be longer than the original character (up to two
1524characters).
1525
1526The first character of the lowercased version is returned
1527(but note, as explained above, that there may be more.)
1528
1529=cut */
1530
2104c8d9 1531UV
a2a2844f 1532Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1533{
983ffd37 1534 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1535 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1536}
1537
d3e79532
JH
1538/*
1539=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1540
1541Convert the UTF-8 encoded character at p to its foldcase version and
1542store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1543that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1544foldcase version may be longer than the original character (up to
1545three characters).
1546
1547The first character of the foldcased version is returned
1548(but note, as explained above, that there may be more.)
1549
1550=cut */
1551
b4e400f9
JH
1552UV
1553Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1554{
1555 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1556 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1557}
1558
1559/* a "swash" is a swatch hash */
1560
1561SV*
864dbfa3 1562Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3
LW
1563{
1564 SV* retval;
bf1fed83 1565 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1566 dSP;
71bed85a
NC
1567 size_t pkg_len = strlen(pkg);
1568 size_t name_len = strlen(name);
1569 HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1570 SV* errsv_save;
ce3b816e 1571
1b026014 1572 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1573 ENTER;
f8be5cf0 1574 errsv_save = newSVsv(ERRSV);
71bed85a
NC
1575 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1576 Nullsv);
f8be5cf0
JH
1577 if (!SvTRUE(ERRSV))
1578 sv_setsv(ERRSV, errsv_save);
1579 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1580 LEAVE;
1581 }
1582 SPAGAIN;
a0ed51b3
LW
1583 PUSHSTACKi(PERLSI_MAGIC);
1584 PUSHMARK(SP);
1585 EXTEND(SP,5);
71bed85a
NC
1586 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1587 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1588 PUSHs(listsv);
1589 PUSHs(sv_2mortal(newSViv(minbits)));
1590 PUSHs(sv_2mortal(newSViv(none)));
1591 PUTBACK;
1592 ENTER;
1593 SAVEI32(PL_hints);
1594 PL_hints = 0;
1595 save_re_context();
923e4eb5 1596 if (IN_PERL_COMPILETIME) {
bf1fed83 1597 /* XXX ought to be handled by lex_start */
82686b01 1598 SAVEI32(PL_in_my);
2b4bd638 1599 PL_in_my = 0;
bf1fed83 1600 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1601 }
f8be5cf0 1602 errsv_save = newSVsv(ERRSV);
864dbfa3 1603 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1604 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1605 else
e24b16f9 1606 retval = &PL_sv_undef;
f8be5cf0
JH
1607 if (!SvTRUE(ERRSV))
1608 sv_setsv(ERRSV, errsv_save);
1609 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1610 LEAVE;
1611 POPSTACK;
923e4eb5 1612 if (IN_PERL_COMPILETIME) {
bf1fed83
JH
1613 STRLEN len;
1614 char* pv = SvPV(tokenbufsv, len);
1615
1616 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1617 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1618 }
bc45ce41
JH
1619 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1620 if (SvPOK(retval))
35c1215d
NC
1621 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1622 retval);
cea2e8a9 1623 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1624 }
a0ed51b3
LW
1625 return retval;
1626}
1627
035d37be
JH
1628
1629/* This API is wrong for special case conversions since we may need to
1630 * return several Unicode characters for a single Unicode character
1631 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1632 * the lower-level routine, and it is similarly broken for returning
1633 * multiple values. --jhi */
a0ed51b3 1634UV
3568d838 1635Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3
LW
1636{
1637 HV* hv = (HV*)SvRV(sv);
3568d838
JH
1638 U32 klen;
1639 U32 off;
a0ed51b3 1640 STRLEN slen;
7d85a32c 1641 STRLEN needents;
4ea42e7f 1642 U8 *tmps = NULL;
a0ed51b3
LW
1643 U32 bit;
1644 SV *retval;
3568d838
JH
1645 U8 tmputf8[2];
1646 UV c = NATIVE_TO_ASCII(*ptr);
1647
1648 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463
GS
1649 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1650 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838
JH
1651 ptr = tmputf8;
1652 }
1653 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1654 * then the "swatch" is a vec() for al the chars which start
1655 * with 0xAA..0xYY
1656 * So the key in the hash (klen) is length of encoded char -1
1657 */
1658 klen = UTF8SKIP(ptr) - 1;
1659 off = ptr[klen];
a0ed51b3 1660
7d85a32c
JH
1661 if (klen == 0)
1662 {
1663 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1664 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c
JH
1665 */
1666 needents = UTF_CONTINUATION_MARK;
1667 off = NATIVE_TO_UTF(ptr[klen]);
1668 }
1669 else
1670 {
1671 /* If char is encoded then swatch is for the prefix */
1672 needents = (1 << UTF_ACCUMULATION_SHIFT);
1673 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1674 }
1675
a0ed51b3
LW
1676 /*
1677 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1678 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1679 * it's nothing to sniff at.) Pity we usually come through at least
1680 * two function calls to get here...
1681 *
1682 * NB: this code assumes that swatches are never modified, once generated!
1683 */
1684
3568d838 1685 if (hv == PL_last_swash_hv &&
a0ed51b3 1686 klen == PL_last_swash_klen &&
3568d838 1687 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1688 {
1689 tmps = PL_last_swash_tmps;
1690 slen = PL_last_swash_slen;
1691 }
1692 else {
1693 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1694 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3
LW
1695
1696 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1697 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1698 dSP;
2b9d42f0
NIS
1699 /* We use utf8n_to_uvuni() as we want an index into
1700 Unicode tables, not a native character number.
1701 */
872c91ae
JH
1702 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1703 ckWARN(WARN_UTF8) ?
1704 0 : UTF8_ALLOW_ANY);
f8be5cf0 1705 SV *errsv_save;
a0ed51b3
LW
1706 ENTER;
1707 SAVETMPS;
1708 save_re_context();
1709 PUSHSTACKi(PERLSI_MAGIC);
1710 PUSHMARK(SP);
1711 EXTEND(SP,3);
1712 PUSHs((SV*)sv);
ffbc6a93 1713 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838
JH
1714 PUSHs(sv_2mortal(newSViv((klen) ?
1715 (code_point & ~(needents - 1)) : 0)));
a0ed51b3
LW
1716 PUSHs(sv_2mortal(newSViv(needents)));
1717 PUTBACK;
f8be5cf0 1718 errsv_save = newSVsv(ERRSV);
864dbfa3 1719 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1720 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1721 else
e24b16f9 1722 retval = &PL_sv_undef;
f8be5cf0
JH
1723 if (!SvTRUE(ERRSV))
1724 sv_setsv(ERRSV, errsv_save);
1725 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1726 POPSTACK;
1727 FREETMPS;
1728 LEAVE;
923e4eb5 1729 if (IN_PERL_COMPILETIME)
eb160463 1730 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1731
dfe13c55 1732 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1733
7d85a32c 1734 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1735 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1736 }
1737
1738 PL_last_swash_hv = hv;
1739 PL_last_swash_klen = klen;
1740 PL_last_swash_tmps = tmps;
1741 PL_last_swash_slen = slen;
1742 if (klen)
1743 Copy(ptr, PL_last_swash_key, klen, U8);
1744 }
1745
9faf8d75 1746 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1747 case 1:
1748 bit = 1 << (off & 7);
1749 off >>= 3;
1750 return (tmps[off] & bit) != 0;
1751 case 8:
1752 return tmps[off];
1753 case 16:
1754 off <<= 1;
1755 return (tmps[off] << 8) + tmps[off + 1] ;
1756 case 32:
1757 off <<= 2;
1758 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1759 }
cea2e8a9 1760 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1761 return 0;
1762}
2b9d42f0
NIS
1763
1764
1765/*
37607a96 1766=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1767
1e54db1a 1768Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2b9d42f0
NIS
1769of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1770bytes available. The return value is the pointer to the byte after the
1771end of the new character. In other words,
1772
1773 d = uvchr_to_utf8(d, uv);
1774
1775is the recommended wide native character-aware way of saying
1776
1777 *(d++) = uv;
1778
1779=cut
1780*/
1781
1782/* On ASCII machines this is normally a macro but we want a
1783 real function in case XS code wants it
1784*/
1785#undef Perl_uvchr_to_utf8
1786U8 *
1787Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1788{
b851fbc1 1789 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0
NIS
1790}
1791
b851fbc1
JH
1792U8 *
1793Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1794{
1795 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1796}
2b9d42f0
NIS
1797
1798/*
37607a96 1799=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0
NIS
1800
1801Returns the native character value of the first character in the string C<s>
1e54db1a 1802which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2b9d42f0
NIS
1803length, in bytes, of that character.
1804
1805Allows length and flags to be passed to low level routine.
1806
1807=cut
1808*/
0a2ef054
JH
1809/* On ASCII machines this is normally a macro but we want
1810 a real function in case XS code wants it
2b9d42f0
NIS
1811*/
1812#undef Perl_utf8n_to_uvchr
1813UV
37607a96 1814Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0
NIS
1815{
1816 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1817 return UNI_TO_NATIVE(uv);
1818}
1819
d2cc3551
JH
1820/*
1821=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1822
1823Build to the scalar dsv a displayable version of the string spv,
1824length len, the displayable version being at most pvlim bytes long
1825(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1826
9e55ce06 1827The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1828isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
1829to display the \\[nrfta\\] as the backslashed versions (like '\n')
1830(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1831UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1832UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1833
d2cc3551
JH
1834The pointer to the PV of the dsv is returned.
1835
1836=cut */
e6b2e755
JH
1837char *
1838Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1839{
1840 int truncated = 0;
1841 char *s, *e;
1842
1843 sv_setpvn(dsv, "", 0);
1844 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1845 UV u;
c728cb41
JH
1846 bool ok = FALSE;
1847
e6b2e755
JH
1848 if (pvlim && SvCUR(dsv) >= pvlim) {
1849 truncated++;
1850 break;
1851 }
1852 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1853 if (u < 256) {
c728cb41
JH
1854 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1855 switch (u & 0xFF) {
1856 case '\n':
1857 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1858 case '\r':
1859 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1860 case '\t':
1861 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1862 case '\f':
1863 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1864 case '\a':
1865 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1866 case '\\':
d79a7a3d 1867 Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
c728cb41
JH
1868 default: break;
1869 }
1870 }
00e86452
JH
1871 /* isPRINT() is the locale-blind version. */
1872 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
2c4547fe 1873 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
0a2ef054
JH
1874 ok = TRUE;
1875 }
c728cb41
JH
1876 }
1877 if (!ok)
9e55ce06 1878 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
1879 }
1880 if (truncated)
1881 sv_catpvn(dsv, "...", 3);
1882
1883 return SvPVX(dsv);
1884}
2b9d42f0 1885
d2cc3551
JH
1886/*
1887=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1888
1889Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1890the displayable version being at most pvlim bytes long
d2cc3551 1891(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
1892
1893The flags argument is as in pv_uni_display().
1894
d2cc3551
JH
1895The pointer to the PV of the dsv is returned.
1896
1897=cut */
e6b2e755
JH
1898char *
1899Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1900{
701a277b
JH
1901 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1902 pvlim, flags);
1903}
1904
d2cc3551 1905/*
d07ddd77 1906=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
1907
1908Return true if the strings s1 and s2 differ case-insensitively, false
1909if not (if they are equal case-insensitively). If u1 is true, the
1910string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
1911the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1912are false, the respective string is assumed to be in native 8-bit
1913encoding.
1914
1915If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1916in there (they will point at the beginning of the I<next> character).
1917If the pointers behind pe1 or pe2 are non-NULL, they are the end
1918pointers beyond which scanning will not continue under any
1919circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1920s2+l2 will be used as goal end pointers that will also stop the scan,
1921and which qualify towards defining a successful match: all the scans
1922that define an explicit length must reach their goal pointers for
1923a match to succeed).
d2cc3551
JH
1924
1925For case-insensitiveness, the "casefolding" of Unicode is used
1926instead of upper/lowercasing both the characters, see
1927http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1928
1929=cut */
701a277b 1930I32
d07ddd77 1931Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1932{
5469e704
JH
1933 register U8 *p1 = (U8*)s1;
1934 register U8 *p2 = (U8*)s2;
d07ddd77
JH
1935 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1936 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1937 STRLEN n1 = 0, n2 = 0;
ffce6cc2
JH
1938 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1939 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
d7f013c8
JH
1940 U8 natbuf[1+1];
1941 STRLEN foldlen1, foldlen2;
d07ddd77 1942 bool match;
332ddc25 1943
d07ddd77
JH
1944 if (pe1)
1945 e1 = *(U8**)pe1;
eb160463 1946 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
d07ddd77
JH
1947 f1 = (U8*)s1 + l1;
1948 if (pe2)
1949 e2 = *(U8**)pe2;
eb160463 1950 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
d07ddd77
JH
1951 f2 = (U8*)s2 + l2;
1952
1953 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1954 return 1; /* mismatch; possible infinite loop or false positive */
1955
a6872d42
JH
1956 if (!u1 || !u2)
1957 natbuf[1] = 0; /* Need to terminate the buffer. */
1958
d07ddd77
JH
1959 while ((e1 == 0 || p1 < e1) &&
1960 (f1 == 0 || p1 < f1) &&
1961 (e2 == 0 || p2 < e2) &&
1962 (f2 == 0 || p2 < f2)) {
1963 if (n1 == 0) {
d7f013c8
JH
1964 if (u1)
1965 to_utf8_fold(p1, foldbuf1, &foldlen1);
1966 else {
f5cee151 1967 natbuf[0] = *p1;
d7f013c8
JH
1968 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1969 }
1970 q1 = foldbuf1;
d07ddd77 1971 n1 = foldlen1;
332ddc25 1972 }
d07ddd77 1973 if (n2 == 0) {
d7f013c8
JH
1974 if (u2)
1975 to_utf8_fold(p2, foldbuf2, &foldlen2);
1976 else {
f5cee151 1977 natbuf[0] = *p2;
d7f013c8
JH
1978 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1979 }
1980 q2 = foldbuf2;
d07ddd77 1981 n2 = foldlen2;
332ddc25 1982 }
d07ddd77
JH
1983 while (n1 && n2) {
1984 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1985 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1986 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 1987 return 1; /* mismatch */
d07ddd77 1988 n1 -= UTF8SKIP(q1);
d7f013c8 1989 q1 += UTF8SKIP(q1);
d07ddd77 1990 n2 -= UTF8SKIP(q2);
d7f013c8 1991 q2 += UTF8SKIP(q2);
701a277b 1992 }
d07ddd77 1993 if (n1 == 0)
d7f013c8 1994 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 1995 if (n2 == 0)
d7f013c8
JH
1996 p2 += u2 ? UTF8SKIP(p2) : 1;
1997
d2cc3551 1998 }
5469e704 1999
d07ddd77
JH
2000 /* A match is defined by all the scans that specified
2001 * an explicit length reaching their final goals. */
2002 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2003
2004 if (match) {
d07ddd77
JH
2005 if (pe1)
2006 *pe1 = (char*)p1;
2007 if (pe2)
2008 *pe2 = (char*)p2;
5469e704
JH
2009 }
2010
2011 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2012}
701a277b 2013