This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #25147] C<use strict; print if foo> didn't give the
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
4bb101f2 3 * Copyright (C) 2000, 2001, 2002, 2003, by Larry Wall and others
a0ed51b3
LW
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3
LW
25#include "perl.h"
26
901b21bf
JH
27static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
28
ccfc67b7
JH
29/*
30=head1 Unicode Support
a0ed51b3 31
b851fbc1 32=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 33
1e54db1a 34Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
eebe1485
SC
35of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
36bytes available. The return value is the pointer to the byte after the
9041c2e3 37end of the new character. In other words,
eebe1485 38
b851fbc1
JH
39 d = uvuni_to_utf8_flags(d, uv, flags);
40
41or, in most cases,
42
9041c2e3 43 d = uvuni_to_utf8(d, uv);
eebe1485 44
b851fbc1
JH
45(which is equivalent to)
46
47 d = uvuni_to_utf8_flags(d, uv, 0);
48
eebe1485
SC
49is the recommended Unicode-aware way of saying
50
51 *(d++) = uv;
52
53=cut
54*/
55
dfe13c55 56U8 *
b851fbc1 57Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 58{
62961d2e 59 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
60 if (UNICODE_IS_SURROGATE(uv) &&
61 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 62 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
63 else if (
64 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
65 !(flags & UNICODE_ALLOW_FDD0))
66 ||
c867b360 67 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
68 !(flags & UNICODE_ALLOW_FFFF))) &&
69 /* UNICODE_ALLOW_SUPER includes
2a20b9da 70 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
71 ((uv <= PERL_UNICODE_MAX) ||
72 !(flags & UNICODE_ALLOW_SUPER))
73 )
9014280d 74 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
75 "Unicode character 0x%04"UVxf" is illegal", uv);
76 }
c4d5f83a 77 if (UNI_IS_INVARIANT(uv)) {
eb160463 78 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
79 return d;
80 }
2d331972 81#if defined(EBCDIC)
1d72bdf6
NIS
82 else {
83 STRLEN len = UNISKIP(uv);
84 U8 *p = d+len-1;
85 while (p > d) {
eb160463 86 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
87 uv >>= UTF_ACCUMULATION_SHIFT;
88 }
eb160463 89 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
90 return d+len;
91 }
92#else /* Non loop style */
a0ed51b3 93 if (uv < 0x800) {
eb160463
GS
94 *d++ = (U8)(( uv >> 6) | 0xc0);
95 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
96 return d;
97 }
98 if (uv < 0x10000) {
eb160463
GS
99 *d++ = (U8)(( uv >> 12) | 0xe0);
100 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
101 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
102 return d;
103 }
104 if (uv < 0x200000) {
eb160463
GS
105 *d++ = (U8)(( uv >> 18) | 0xf0);
106 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
107 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
108 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
109 return d;
110 }
111 if (uv < 0x4000000) {
eb160463
GS
112 *d++ = (U8)(( uv >> 24) | 0xf8);
113 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x80000000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 30) | 0xfc);
121 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
124 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
125 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
126 return d;
127 }
6b8eaf93 128#ifdef HAS_QUAD
d7578b48 129 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
130#endif
131 {
eb160463
GS
132 *d++ = 0xfe; /* Can't match U+FEFF! */
133 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
134 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
135 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
136 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
137 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
138 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
139 return d;
140 }
6b8eaf93 141#ifdef HAS_QUAD
a0ed51b3 142 {
eb160463
GS
143 *d++ = 0xff; /* Can't match U+FFFE! */
144 *d++ = 0x80; /* 6 Reserved bits */
145 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
146 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
147 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
148 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
149 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
150 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
151 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
152 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
153 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
154 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
155 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
156 return d;
157 }
158#endif
1d72bdf6 159#endif /* Loop style */
a0ed51b3 160}
b851fbc1
JH
161
162U8 *
163Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
164{
165 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
166}
9041c2e3
NIS
167
168
169/*
eebe1485
SC
170=for apidoc A|STRLEN|is_utf8_char|U8 *s
171
5da9da9e 172Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
173character. Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character. The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
9041c2e3 176
82686b01 177=cut */
067a85ef 178STRLEN
386d01d6
GS
179Perl_is_utf8_char(pTHX_ U8 *s)
180{
181 U8 u = *s;
067a85ef
A
182 STRLEN slen, len;
183 UV uv, ouv;
386d01d6 184
1d72bdf6 185 if (UTF8_IS_INVARIANT(u))
386d01d6
GS
186 return 1;
187
60006e79 188 if (!UTF8_IS_START(u))
386d01d6
GS
189 return 0;
190
9f07fdcd 191 len = UTF8SKIP(s);
386d01d6 192
60006e79 193 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef
A
194 return 0;
195
386d01d6
GS
196 slen = len - 1;
197 s++;
6eb6869e 198 u &= UTF_START_MASK(len);
1d72bdf6 199 uv = u;
067a85ef 200 ouv = uv;
386d01d6 201 while (slen--) {
60006e79 202 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 203 return 0;
8850bf83 204 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 205 if (uv < ouv)
067a85ef
A
206 return 0;
207 ouv = uv;
386d01d6
GS
208 s++;
209 }
067a85ef 210
eb160463 211 if ((STRLEN)UNISKIP(uv) < len)
067a85ef
A
212 return 0;
213
386d01d6
GS
214 return len;
215}
216
6662521e 217/*
eebe1485 218=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 219
c9ada85f 220Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
221UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
222not mean 'a string that contains code points above 0x7F encoded in UTF-8'
223because a valid ASCII string is a valid UTF-8 string.
6662521e
GS
224
225=cut
226*/
227
8e84507e 228bool
6662521e
GS
229Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
230{
067a85ef 231 U8* x = s;
1aa99e6b 232 U8* send;
067a85ef
A
233 STRLEN c;
234
1aa99e6b 235 if (!len)
6cd5fe39 236 len = strlen((char *)s);
1aa99e6b
IH
237 send = s + len;
238
6662521e 239 while (x < send) {
1acdb0da
JH
240 /* Inline the easy bits of is_utf8_char() here for speed... */
241 if (UTF8_IS_INVARIANT(*x))
242 c = 1;
243 else if (!UTF8_IS_START(*x))
244 return FALSE;
245 else {
246 /* ... and call is_utf8_char() only if really needed. */
247 c = is_utf8_char(x);
248 if (!c)
249 return FALSE;
250 }
6662521e 251 x += c;
6662521e 252 }
60006e79
JH
253 if (x != send)
254 return FALSE;
067a85ef
A
255
256 return TRUE;
6662521e
GS
257}
258
67e989fb 259/*
81cd54e3
JH
260=for apidoc A|bool|is_utf8_string_loc|U8 *s|STRLEN len|U8 **p
261
262Like is_ut8_string but store the location of the failure in
263the last argument.
264
265=cut
266*/
267
268bool
269Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **p)
270{
271 U8* x = s;
272 U8* send;
273 STRLEN c;
274
275 if (!len)
276 len = strlen((char *)s);
277 send = s + len;
278
279 while (x < send) {
280 /* Inline the easy bits of is_utf8_char() here for speed... */
281 if (UTF8_IS_INVARIANT(*x))
282 c = 1;
283 else if (!UTF8_IS_START(*x)) {
284 if (p)
285 *p = x;
286 return FALSE;
287 }
288 else {
289 /* ... and call is_utf8_char() only if really needed. */
290 c = is_utf8_char(x);
291 if (!c) {
292 if (p)
293 *p = x;
294 return FALSE;
295 }
296 }
297 x += c;
298 }
299 if (x != send) {
300 if (p)
301 *p = x;
302 return FALSE;
303 }
304
305 return TRUE;
306}
307
308/*
9041c2e3 309=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 310
9041c2e3
NIS
311Bottom level UTF-8 decode routine.
312Returns the unicode code point value of the first character in the string C<s>
1e54db1a 313which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 314C<retlen> will be set to the length, in bytes, of that character.
67e989fb 315
1e54db1a 316If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
317is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
318it is assumed that the caller will raise a warning, and this function
28d3d195
JH
319will silently just set C<retlen> to C<-1> and return zero. If the
320C<flags> does not contain UTF8_CHECK_ONLY, warnings about
321malformations will be given, C<retlen> will be set to the expected
322length of the UTF-8 character in bytes, and zero will be returned.
323
324The C<flags> can also contain various flags to allow deviations from
325the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 326
9041c2e3
NIS
327Most code should use utf8_to_uvchr() rather than call this directly.
328
37607a96
PK
329=cut
330*/
67e989fb 331
a0ed51b3 332UV
37607a96 333Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 334{
097fb8e2 335 U8 *s0 = s;
9c5ffd7c 336 UV uv = *s, ouv = 0;
ba210ebe
JH
337 STRLEN len = 1;
338 bool dowarn = ckWARN_d(WARN_UTF8);
4f849cb6 339 UV startbyte = *s;
ba210ebe 340 STRLEN expectlen = 0;
a0dbb045
JH
341 U32 warning = 0;
342
343/* This list is a superset of the UTF8_ALLOW_XXX. */
344
345#define UTF8_WARN_EMPTY 1
346#define UTF8_WARN_CONTINUATION 2
347#define UTF8_WARN_NON_CONTINUATION 3
348#define UTF8_WARN_FE_FF 4
349#define UTF8_WARN_SHORT 5
350#define UTF8_WARN_OVERFLOW 6
351#define UTF8_WARN_SURROGATE 7
c867b360
JH
352#define UTF8_WARN_LONG 8
353#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
354
355 if (curlen == 0 &&
356 !(flags & UTF8_ALLOW_EMPTY)) {
357 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
358 goto malformed;
359 }
360
1d72bdf6 361 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
362 if (retlen)
363 *retlen = 1;
c4d5f83a 364 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 365 }
67e989fb 366
421a8bf2 367 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 368 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 369 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
370 goto malformed;
371 }
372
421a8bf2 373 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 374 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 375 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
376 goto malformed;
377 }
9041c2e3 378
1d72bdf6 379#ifdef EBCDIC
75383841 380 uv = NATIVE_TO_UTF(uv);
1d72bdf6 381#else
fcc8fcf6
JH
382 if ((uv == 0xfe || uv == 0xff) &&
383 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 384 warning = UTF8_WARN_FE_FF;
ba210ebe 385 goto malformed;
a0ed51b3 386 }
1d72bdf6
NIS
387#endif
388
ba210ebe
JH
389 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
390 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
391 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
392 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
393#ifdef EBCDIC
394 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
395 else { len = 7; uv &= 0x01; }
396#else
ba210ebe
JH
397 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
398 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
399 else { len = 13; uv = 0; } /* whoa! */
400#endif
401
a0ed51b3
LW
402 if (retlen)
403 *retlen = len;
9041c2e3 404
ba210ebe
JH
405 expectlen = len;
406
fcc8fcf6
JH
407 if ((curlen < expectlen) &&
408 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 409 warning = UTF8_WARN_SHORT;
ba210ebe
JH
410 goto malformed;
411 }
412
413 len--;
a0ed51b3 414 s++;
ba210ebe
JH
415 ouv = uv;
416
a0ed51b3 417 while (len--) {
421a8bf2
JH
418 if (!UTF8_IS_CONTINUATION(*s) &&
419 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
420 s--;
421 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 422 goto malformed;
a0ed51b3
LW
423 }
424 else
8850bf83 425 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
426 if (!(uv > ouv)) {
427 /* These cannot be allowed. */
428 if (uv == ouv) {
429 if (!(flags & UTF8_ALLOW_LONG)) {
430 warning = UTF8_WARN_LONG;
431 goto malformed;
432 }
433 }
434 else { /* uv < ouv */
435 /* This cannot be allowed. */
436 warning = UTF8_WARN_OVERFLOW;
437 goto malformed;
438 }
ba210ebe
JH
439 }
440 s++;
441 ouv = uv;
442 }
443
421a8bf2 444 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 445 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 446 warning = UTF8_WARN_SURROGATE;
ba210ebe 447 goto malformed;
eb160463 448 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 449 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 450 warning = UTF8_WARN_LONG;
ba210ebe 451 goto malformed;
421a8bf2 452 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 453 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 454 warning = UTF8_WARN_FFFF;
a9917092 455 goto malformed;
a0ed51b3 456 }
ba210ebe 457
a0ed51b3 458 return uv;
ba210ebe
JH
459
460malformed:
461
fcc8fcf6 462 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 463 if (retlen)
cc366d4b 464 *retlen = -1;
ba210ebe
JH
465 return 0;
466 }
467
a0dbb045
JH
468 if (dowarn) {
469 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
470
471 switch (warning) {
472 case 0: /* Intentionally empty. */ break;
473 case UTF8_WARN_EMPTY:
474 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
475 break;
476 case UTF8_WARN_CONTINUATION:
097fb8e2 477 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
478 break;
479 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
480 if (s == s0)
481 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
482 (UV)s[1], startbyte);
483 else
484 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
485 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
486
a0dbb045
JH
487 break;
488 case UTF8_WARN_FE_FF:
489 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
490 break;
491 case UTF8_WARN_SHORT:
097fb8e2
JH
492 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
493 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
b31f83c2 494 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
495 break;
496 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
497 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
498 ouv, *s, startbyte);
a0dbb045
JH
499 break;
500 case UTF8_WARN_SURROGATE:
501 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
502 break;
a0dbb045 503 case UTF8_WARN_LONG:
097fb8e2
JH
504 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
505 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
506 break;
507 case UTF8_WARN_FFFF:
508 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
509 break;
510 default:
511 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
512 break;
513 }
514
515 if (warning) {
516 char *s = SvPVX(sv);
517
518 if (PL_op)
9014280d 519 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 520 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 521 else
9014280d 522 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
523 }
524 }
525
ba210ebe 526 if (retlen)
28d3d195 527 *retlen = expectlen ? expectlen : len;
ba210ebe 528
28d3d195 529 return 0;
a0ed51b3
LW
530}
531
8e84507e 532/*
37607a96 533=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3
NIS
534
535Returns the native character value of the first character in the string C<s>
1e54db1a 536which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
537length, in bytes, of that character.
538
1e54db1a 539If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
540returned and retlen is set, if possible, to -1.
541
542=cut
543*/
544
545UV
37607a96 546Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3 547{
872c91ae
JH
548 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
549 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
550}
551
552/*
37607a96 553=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3
NIS
554
555Returns the Unicode code point of the first character in the string C<s>
1e54db1a 556which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
557length, in bytes, of that character.
558
559This function should only be used when returned UV is considered
560an index into the Unicode semantic tables (e.g. swashes).
561
1e54db1a 562If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 563returned and retlen is set, if possible, to -1.
8e84507e
NIS
564
565=cut
566*/
567
568UV
37607a96 569Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 570{
9041c2e3 571 /* Call the low level routine asking for checks */
872c91ae
JH
572 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
573 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
574}
575
b76347f2 576/*
37607a96 577=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2
JH
578
579Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
580Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
581up past C<e>, croaks.
b76347f2
JH
582
583=cut
584*/
585
586STRLEN
37607a96 587Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2
JH
588{
589 STRLEN len = 0;
590
8850bf83
JH
591 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
592 * the bitops (especially ~) can create illegal UTF-8.
593 * In other words: in Perl UTF-8 is not just for Unicode. */
594
901b21bf
JH
595 if (e < s) {
596 if (ckWARN_d(WARN_UTF8)) {
597 if (PL_op)
598 Perl_warner(aTHX_ packWARN(WARN_UTF8),
599 "%s in %s", unees, OP_DESC(PL_op));
600 else
601 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
602 }
603 return 0;
604 }
b76347f2 605 while (s < e) {
02eb7b47 606 U8 t = UTF8SKIP(s);
b76347f2 607
901b21bf
JH
608 if (e - s < t) {
609 if (ckWARN_d(WARN_UTF8)) {
610 if (PL_op)
611 Perl_warner(aTHX_ packWARN(WARN_UTF8),
612 unees, OP_DESC(PL_op));
613 else
614 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
615 }
616 return len;
617 }
b76347f2
JH
618 s += t;
619 len++;
620 }
621
622 return len;
623}
624
b06226ff 625/*
eebe1485 626=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 627
1e54db1a 628Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
629and C<b>.
630
631WARNING: use only if you *know* that the pointers point inside the
632same UTF-8 buffer.
633
37607a96
PK
634=cut
635*/
a0ed51b3 636
02eb7b47 637IV
864dbfa3 638Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 639{
02eb7b47
JH
640 IV off = 0;
641
8850bf83
JH
642 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
643 * the bitops (especially ~) can create illegal UTF-8.
644 * In other words: in Perl UTF-8 is not just for Unicode. */
645
a0ed51b3
LW
646 if (a < b) {
647 while (a < b) {
02eb7b47
JH
648 U8 c = UTF8SKIP(a);
649
901b21bf
JH
650 if (b - a < c) {
651 if (ckWARN_d(WARN_UTF8)) {
652 if (PL_op)
653 Perl_warner(aTHX_ packWARN(WARN_UTF8),
654 "%s in %s", unees, OP_DESC(PL_op));
655 else
656 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
657 }
658 return off;
659 }
02eb7b47 660 a += c;
a0ed51b3
LW
661 off--;
662 }
663 }
664 else {
665 while (b < a) {
02eb7b47
JH
666 U8 c = UTF8SKIP(b);
667
901b21bf
JH
668 if (a - b < c) {
669 if (ckWARN_d(WARN_UTF8)) {
670 if (PL_op)
671 Perl_warner(aTHX_ packWARN(WARN_UTF8),
672 "%s in %s", unees, OP_DESC(PL_op));
673 else
674 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
675 }
676 return off;
677 }
02eb7b47 678 b += c;
a0ed51b3
LW
679 off++;
680 }
681 }
02eb7b47 682
a0ed51b3
LW
683 return off;
684}
685
b06226ff 686/*
37607a96 687=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 688
8850bf83
JH
689Return the UTF-8 pointer C<s> displaced by C<off> characters, either
690forward or backward.
b06226ff
JH
691
692WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
693the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
694on the first byte of character or just after the last byte of a character.
b06226ff 695
37607a96
PK
696=cut
697*/
a0ed51b3
LW
698
699U8 *
864dbfa3 700Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 701{
8850bf83
JH
702 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
703 * the bitops (especially ~) can create illegal UTF-8.
704 * In other words: in Perl UTF-8 is not just for Unicode. */
705
a0ed51b3
LW
706 if (off >= 0) {
707 while (off--)
708 s += UTF8SKIP(s);
709 }
710 else {
711 while (off++) {
712 s--;
8850bf83
JH
713 while (UTF8_IS_CONTINUATION(*s))
714 s--;
a0ed51b3
LW
715 }
716 }
717 return s;
718}
719
6940069f 720/*
eebe1485 721=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 722
1e54db1a 723Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
724Unlike C<bytes_to_utf8>, this over-writes the original string, and
725updates len to contain the new length.
67e989fb 726Returns zero on failure, setting C<len> to -1.
6940069f
GS
727
728=cut
729*/
730
731U8 *
37607a96 732Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 733{
6940069f
GS
734 U8 *send;
735 U8 *d;
dcad2880 736 U8 *save = s;
246fae53 737
1e54db1a 738 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880
JH
739 for (send = s + *len; s < send; ) {
740 U8 c = *s++;
741
1d72bdf6
NIS
742 if (!UTF8_IS_INVARIANT(c) &&
743 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
744 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
745 *len = -1;
746 return 0;
747 }
246fae53 748 }
dcad2880
JH
749
750 d = s = save;
6940069f 751 while (s < send) {
ed646e6e 752 STRLEN ulen;
9041c2e3 753 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 754 s += ulen;
6940069f
GS
755 }
756 *d = '\0';
246fae53 757 *len = d - save;
6940069f
GS
758 return save;
759}
760
761/*
f9a63242
JH
762=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
763
1e54db1a 764Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
f9a63242 765Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
766the newly-created string, and updates C<len> to contain the new
767length. Returns the original string if no conversion occurs, C<len>
768is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7690 if C<s> is converted or contains all 7bit characters.
f9a63242 770
37607a96
PK
771=cut
772*/
f9a63242
JH
773
774U8 *
37607a96 775Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 776{
f9a63242
JH
777 U8 *d;
778 U8 *start = s;
db42d148 779 U8 *send;
f9a63242
JH
780 I32 count = 0;
781
782 if (!*is_utf8)
783 return start;
784
1e54db1a 785 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242
JH
786 for (send = s + *len; s < send;) {
787 U8 c = *s++;
1d72bdf6 788 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
789 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
790 (c = *s++) && UTF8_IS_CONTINUATION(c))
791 count++;
792 else
f9a63242 793 return start;
db42d148 794 }
f9a63242
JH
795 }
796
797 *is_utf8 = 0;
798
f9a63242 799 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 800 s = start; start = d;
f9a63242
JH
801 while (s < send) {
802 U8 c = *s++;
c4d5f83a
NIS
803 if (!UTF8_IS_INVARIANT(c)) {
804 /* Then it is two-byte encoded */
805 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
806 c = ASCII_TO_NATIVE(c);
807 }
808 *d++ = c;
f9a63242
JH
809 }
810 *d = '\0';
811 *len = d - start;
812 return start;
813}
814
815/*
eebe1485 816=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 817
1e54db1a 818Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
819Returns a pointer to the newly-created string, and sets C<len> to
820reflect the new length.
6940069f 821
1e54db1a 822If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
823see sv_recode_to_utf8().
824
497711e7 825=cut
6940069f
GS
826*/
827
828U8*
37607a96 829Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 830{
6940069f
GS
831 U8 *send;
832 U8 *d;
833 U8 *dst;
6662521e 834 send = s + (*len);
6940069f 835
6662521e 836 Newz(801, d, (*len) * 2 + 1, U8);
6940069f
GS
837 dst = d;
838
839 while (s < send) {
db42d148 840 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 841 if (UNI_IS_INVARIANT(uv))
eb160463 842 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 843 else {
eb160463
GS
844 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
845 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
846 }
847 }
848 *d = '\0';
6662521e 849 *len = d-dst;
6940069f
GS
850 return dst;
851}
852
a0ed51b3 853/*
dea0fc0b 854 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
855 *
856 * Destination must be pre-extended to 3/2 source. Do not use in-place.
857 * We optimize for native, for obvious reasons. */
858
859U8*
dea0fc0b 860Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 861{
dea0fc0b
JH
862 U8* pend;
863 U8* dstart = d;
864
865 if (bytelen & 1)
a7867d0a 866 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
dea0fc0b
JH
867
868 pend = p + bytelen;
869
a0ed51b3 870 while (p < pend) {
dea0fc0b
JH
871 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
872 p += 2;
a0ed51b3 873 if (uv < 0x80) {
eb160463 874 *d++ = (U8)uv;
a0ed51b3
LW
875 continue;
876 }
877 if (uv < 0x800) {
eb160463
GS
878 *d++ = (U8)(( uv >> 6) | 0xc0);
879 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
880 continue;
881 }
882 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
TD
883 UV low = (p[0] << 8) + p[1];
884 p += 2;
dea0fc0b
JH
885 if (low < 0xdc00 || low >= 0xdfff)
886 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
887 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
888 }
889 if (uv < 0x10000) {
eb160463
GS
890 *d++ = (U8)(( uv >> 12) | 0xe0);
891 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
892 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
893 continue;
894 }
895 else {
eb160463
GS
896 *d++ = (U8)(( uv >> 18) | 0xf0);
897 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
898 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
899 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
900 continue;
901 }
902 }
dea0fc0b 903 *newlen = d - dstart;
a0ed51b3
LW
904 return d;
905}
906
907/* Note: this one is slightly destructive of the source. */
908
909U8*
dea0fc0b 910Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
911{
912 U8* s = (U8*)p;
913 U8* send = s + bytelen;
914 while (s < send) {
915 U8 tmp = s[0];
916 s[0] = s[1];
917 s[1] = tmp;
918 s += 2;
919 }
dea0fc0b 920 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
921}
922
923/* for now these are all defined (inefficiently) in terms of the utf8 versions */
924
925bool
84afefe6 926Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 927{
ad391ad9 928 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 929 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
930 return is_utf8_alnum(tmpbuf);
931}
932
933bool
84afefe6 934Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 935{
ad391ad9 936 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 937 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
938 return is_utf8_alnumc(tmpbuf);
939}
940
941bool
84afefe6 942Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 943{
ad391ad9 944 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 945 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
946 return is_utf8_idfirst(tmpbuf);
947}
948
949bool
84afefe6 950Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 951{
ad391ad9 952 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 953 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
954 return is_utf8_alpha(tmpbuf);
955}
956
957bool
84afefe6 958Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 959{
ad391ad9 960 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 961 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
962 return is_utf8_ascii(tmpbuf);
963}
964
965bool
84afefe6 966Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 967{
ad391ad9 968 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 969 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
970 return is_utf8_space(tmpbuf);
971}
972
973bool
84afefe6 974Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 975{
ad391ad9 976 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 977 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
978 return is_utf8_digit(tmpbuf);
979}
980
981bool
84afefe6 982Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 983{
ad391ad9 984 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 985 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
986 return is_utf8_upper(tmpbuf);
987}
988
989bool
84afefe6 990Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 991{
ad391ad9 992 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 993 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
994 return is_utf8_lower(tmpbuf);
995}
996
997bool
84afefe6 998Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 999{
ad391ad9 1000 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1001 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1002 return is_utf8_cntrl(tmpbuf);
1003}
1004
1005bool
84afefe6 1006Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1007{
ad391ad9 1008 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1009 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1010 return is_utf8_graph(tmpbuf);
1011}
1012
1013bool
84afefe6 1014Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1015{
ad391ad9 1016 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1017 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1018 return is_utf8_print(tmpbuf);
1019}
1020
b8c5462f 1021bool
84afefe6 1022Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1023{
ad391ad9 1024 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1025 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1026 return is_utf8_punct(tmpbuf);
1027}
1028
4d61ec05 1029bool
84afefe6 1030Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1031{
e7ae6809 1032 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
230880c1 1033 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1034 return is_utf8_xdigit(tmpbuf);
1035}
1036
84afefe6
JH
1037UV
1038Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1039{
0ebc6274
JH
1040 uvchr_to_utf8(p, c);
1041 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1042}
1043
84afefe6
JH
1044UV
1045Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1046{
0ebc6274
JH
1047 uvchr_to_utf8(p, c);
1048 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1049}
1050
84afefe6
JH
1051UV
1052Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1053{
0ebc6274
JH
1054 uvchr_to_utf8(p, c);
1055 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1056}
1057
84afefe6
JH
1058UV
1059Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1060{
0ebc6274
JH
1061 uvchr_to_utf8(p, c);
1062 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1063}
1064
a0ed51b3
LW
1065/* for now these all assume no locale info available for Unicode > 255 */
1066
1067bool
84afefe6 1068Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1069{
1070 return is_uni_alnum(c); /* XXX no locale support yet */
1071}
1072
1073bool
84afefe6 1074Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1075{
1076 return is_uni_alnumc(c); /* XXX no locale support yet */
1077}
1078
1079bool
84afefe6 1080Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1081{
1082 return is_uni_idfirst(c); /* XXX no locale support yet */
1083}
1084
1085bool
84afefe6 1086Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1087{
1088 return is_uni_alpha(c); /* XXX no locale support yet */
1089}
1090
1091bool
84afefe6 1092Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1093{
1094 return is_uni_ascii(c); /* XXX no locale support yet */
1095}
1096
1097bool
84afefe6 1098Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1099{
1100 return is_uni_space(c); /* XXX no locale support yet */
1101}
1102
1103bool
84afefe6 1104Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1105{
1106 return is_uni_digit(c); /* XXX no locale support yet */
1107}
1108
1109bool
84afefe6 1110Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1111{
1112 return is_uni_upper(c); /* XXX no locale support yet */
1113}
1114
1115bool
84afefe6 1116Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1117{
1118 return is_uni_lower(c); /* XXX no locale support yet */
1119}
1120
1121bool
84afefe6 1122Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1123{
1124 return is_uni_cntrl(c); /* XXX no locale support yet */
1125}
1126
1127bool
84afefe6 1128Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1129{
1130 return is_uni_graph(c); /* XXX no locale support yet */
1131}
1132
1133bool
84afefe6 1134Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1135{
1136 return is_uni_print(c); /* XXX no locale support yet */
1137}
1138
b8c5462f 1139bool
84afefe6 1140Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1141{
1142 return is_uni_punct(c); /* XXX no locale support yet */
1143}
1144
4d61ec05 1145bool
84afefe6 1146Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1147{
1148 return is_uni_xdigit(c); /* XXX no locale support yet */
1149}
1150
b7ac61fa
JH
1151U32
1152Perl_to_uni_upper_lc(pTHX_ U32 c)
1153{
ee099d14
JH
1154 /* XXX returns only the first character -- do not use XXX */
1155 /* XXX no locale support yet */
1156 STRLEN len;
1157 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1158 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1159}
1160
1161U32
1162Perl_to_uni_title_lc(pTHX_ U32 c)
1163{
ee099d14
JH
1164 /* XXX returns only the first character XXX -- do not use XXX */
1165 /* XXX no locale support yet */
1166 STRLEN len;
1167 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1168 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1169}
1170
1171U32
1172Perl_to_uni_lower_lc(pTHX_ U32 c)
1173{
ee099d14
JH
1174 /* XXX returns only the first character -- do not use XXX */
1175 /* XXX no locale support yet */
1176 STRLEN len;
1177 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1178 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1179}
1180
a0ed51b3 1181bool
864dbfa3 1182Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1183{
386d01d6
GS
1184 if (!is_utf8_char(p))
1185 return FALSE;
a0ed51b3 1186 if (!PL_utf8_alnum)
289d4f09
ML
1187 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1188 * descendant of isalnum(3), in other words, it doesn't
1189 * contain the '_'. --jhi */
1190 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
eb160463 1191 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1192/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1193#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1194 if (!PL_utf8_alnum)
1195 PL_utf8_alnum = swash_init("utf8", "",
1196 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1197 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1198#endif
1199}
1200
1201bool
b8c5462f
JH
1202Perl_is_utf8_alnumc(pTHX_ U8 *p)
1203{
386d01d6
GS
1204 if (!is_utf8_char(p))
1205 return FALSE;
b8c5462f
JH
1206 if (!PL_utf8_alnum)
1207 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
eb160463 1208 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1209/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1210#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1211 if (!PL_utf8_alnum)
1212 PL_utf8_alnum = swash_init("utf8", "",
1213 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1214 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1215#endif
1216}
1217
1218bool
82686b01 1219Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
a0ed51b3 1220{
82686b01
JH
1221 if (*p == '_')
1222 return TRUE;
1223 if (!is_utf8_char(p))
1224 return FALSE;
1225 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1226 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
eb160463 1227 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
82686b01
JH
1228}
1229
1230bool
1231Perl_is_utf8_idcont(pTHX_ U8 *p)
1232{
1233 if (*p == '_')
1234 return TRUE;
1235 if (!is_utf8_char(p))
1236 return FALSE;
1237 if (!PL_utf8_idcont)
1238 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
eb160463 1239 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
a0ed51b3
LW
1240}
1241
1242bool
864dbfa3 1243Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1244{
386d01d6
GS
1245 if (!is_utf8_char(p))
1246 return FALSE;
a0ed51b3 1247 if (!PL_utf8_alpha)
e24b16f9 1248 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
eb160463 1249 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
a0ed51b3
LW
1250}
1251
1252bool
b8c5462f
JH
1253Perl_is_utf8_ascii(pTHX_ U8 *p)
1254{
386d01d6
GS
1255 if (!is_utf8_char(p))
1256 return FALSE;
b8c5462f
JH
1257 if (!PL_utf8_ascii)
1258 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
eb160463 1259 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
b8c5462f
JH
1260}
1261
1262bool
864dbfa3 1263Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1264{
386d01d6
GS
1265 if (!is_utf8_char(p))
1266 return FALSE;
a0ed51b3 1267 if (!PL_utf8_space)
3bec3564 1268 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
eb160463 1269 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
a0ed51b3
LW
1270}
1271
1272bool
864dbfa3 1273Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1274{
386d01d6
GS
1275 if (!is_utf8_char(p))
1276 return FALSE;
a0ed51b3 1277 if (!PL_utf8_digit)
e24b16f9 1278 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
eb160463 1279 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
a0ed51b3
LW
1280}
1281
1282bool
864dbfa3 1283Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1284{
386d01d6
GS
1285 if (!is_utf8_char(p))
1286 return FALSE;
a0ed51b3 1287 if (!PL_utf8_upper)
c65e4d19 1288 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
eb160463 1289 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
a0ed51b3
LW
1290}
1291
1292bool
864dbfa3 1293Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1294{
386d01d6
GS
1295 if (!is_utf8_char(p))
1296 return FALSE;
a0ed51b3 1297 if (!PL_utf8_lower)
c65e4d19 1298 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
eb160463 1299 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
a0ed51b3
LW
1300}
1301
1302bool
b8c5462f
JH
1303Perl_is_utf8_cntrl(pTHX_ U8 *p)
1304{
386d01d6
GS
1305 if (!is_utf8_char(p))
1306 return FALSE;
b8c5462f
JH
1307 if (!PL_utf8_cntrl)
1308 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
eb160463 1309 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
b8c5462f
JH
1310}
1311
1312bool
1313Perl_is_utf8_graph(pTHX_ U8 *p)
1314{
386d01d6
GS
1315 if (!is_utf8_char(p))
1316 return FALSE;
b8c5462f
JH
1317 if (!PL_utf8_graph)
1318 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
eb160463 1319 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
b8c5462f
JH
1320}
1321
1322bool
864dbfa3 1323Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1324{
386d01d6
GS
1325 if (!is_utf8_char(p))
1326 return FALSE;
a0ed51b3 1327 if (!PL_utf8_print)
e24b16f9 1328 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
eb160463 1329 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
a0ed51b3
LW
1330}
1331
1332bool
b8c5462f
JH
1333Perl_is_utf8_punct(pTHX_ U8 *p)
1334{
386d01d6
GS
1335 if (!is_utf8_char(p))
1336 return FALSE;
b8c5462f
JH
1337 if (!PL_utf8_punct)
1338 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
eb160463 1339 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
b8c5462f
JH
1340}
1341
1342bool
1343Perl_is_utf8_xdigit(pTHX_ U8 *p)
1344{
386d01d6
GS
1345 if (!is_utf8_char(p))
1346 return FALSE;
b8c5462f
JH
1347 if (!PL_utf8_xdigit)
1348 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
eb160463 1349 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
b8c5462f
JH
1350}
1351
1352bool
864dbfa3 1353Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1354{
386d01d6
GS
1355 if (!is_utf8_char(p))
1356 return FALSE;
a0ed51b3 1357 if (!PL_utf8_mark)
e24b16f9 1358 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
eb160463 1359 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
a0ed51b3
LW
1360}
1361
6b5c0936
JH
1362/*
1363=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1364
1365The "p" contains the pointer to the UTF-8 string encoding
1366the character that is being converted.
1367
1368The "ustrp" is a pointer to the character buffer to put the
1369conversion result to. The "lenp" is a pointer to the length
1370of the result.
1371
0134edef 1372The "swashp" is a pointer to the swash to use.
6b5c0936 1373
0134edef
JH
1374Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1375and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1376but not always, a multicharacter mapping), is tried first.
6b5c0936 1377
0134edef
JH
1378The "special" is a string like "utf8::ToSpecLower", which means the
1379hash %utf8::ToSpecLower. The access to the hash is through
1380Perl_to_utf8_case().
6b5c0936 1381
0134edef
JH
1382The "normal" is a string like "ToLower" which means the swash
1383%utf8::ToLower.
1384
1385=cut */
6b5c0936 1386
2104c8d9 1387UV
a6872d42 1388Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
a0ed51b3 1389{
0134edef 1390 UV uv0, uv1;
2f9475ad 1391 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
0134edef 1392 STRLEN len = 0;
a0ed51b3 1393
1feea2c7
JH
1394 uv0 = utf8_to_uvchr(p, 0);
1395 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1396 * are necessary in EBCDIC, they are redundant no-ops
1397 * in ASCII-ish platforms, and hopefully optimized away. */
1398 uv1 = NATIVE_TO_UNI(uv0);
1399 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1400
1401 if (!*swashp) /* load on-demand */
1402 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1403
1404 if (special) {
1405 /* It might be "special" (sometimes, but not always,
2a37f04d 1406 * a multicharacter mapping) */
983ffd37
JH
1407 HV *hv;
1408 SV *keysv;
1409 HE *he;
2a37f04d 1410 SV *val;
2f9475ad 1411
983ffd37 1412 if ((hv = get_hv(special, FALSE)) &&
1feea2c7 1413 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
2a37f04d
JH
1414 (he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1415 (val = HeVAL(he))) {
47654450 1416 char *s;
47654450 1417
2a37f04d 1418 s = SvPV(val, len);
47654450
JH
1419 if (len == 1)
1420 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1421 else {
2f9475ad
JH
1422#ifdef EBCDIC
1423 /* If we have EBCDIC we need to remap the characters
1424 * since any characters in the low 256 are Unicode
1425 * code points, not EBCDIC. */
7cda7a3d 1426 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1427
1428 d = tmpbuf;
1429 if (SvUTF8(val)) {
1430 STRLEN tlen = 0;
1431
1432 while (t < tend) {
1433 UV c = utf8_to_uvchr(t, &tlen);
1434 if (tlen > 0) {
1435 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1436 t += tlen;
1437 }
1438 else
1439 break;
1440 }
1441 }
1442 else {
36fec512
JH
1443 while (t < tend) {
1444 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1445 t++;
1446 }
2f9475ad
JH
1447 }
1448 len = d - tmpbuf;
1449 Copy(tmpbuf, ustrp, len, U8);
1450#else
d2dcd0fb 1451 Copy(s, ustrp, len, U8);
2f9475ad 1452#endif
29e98929 1453 }
983ffd37 1454 }
0134edef
JH
1455 }
1456
1457 if (!len && *swashp) {
1458 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1459
1460 if (uv2) {
1461 /* It was "normal" (a single character mapping). */
1462 UV uv3 = UNI_TO_NATIVE(uv2);
1463
e9101d72 1464 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1465 }
1466 }
1feea2c7 1467
0134edef
JH
1468 if (!len) /* Neither: just copy. */
1469 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1470
2a37f04d
JH
1471 if (lenp)
1472 *lenp = len;
1473
0134edef 1474 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1475}
1476
d3e79532
JH
1477/*
1478=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1479
1480Convert the UTF-8 encoded character at p to its uppercase version and
1481store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1482that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1483uppercase version may be longer than the original character (up to two
1484characters).
1485
1486The first character of the uppercased version is returned
1487(but note, as explained above, that there may be more.)
1488
1489=cut */
1490
2104c8d9 1491UV
983ffd37 1492Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1493{
983ffd37 1494 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1495 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1496}
a0ed51b3 1497
d3e79532
JH
1498/*
1499=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1500
1501Convert the UTF-8 encoded character at p to its titlecase version and
1502store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1503that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1504titlecase version may be longer than the original character (up to two
1505characters).
1506
1507The first character of the titlecased version is returned
1508(but note, as explained above, that there may be more.)
1509
1510=cut */
1511
983ffd37
JH
1512UV
1513Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1514{
1515 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1516 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1517}
1518
d3e79532
JH
1519/*
1520=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1521
1522Convert the UTF-8 encoded character at p to its lowercase version and
1523store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1524that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1525lowercase version may be longer than the original character (up to two
1526characters).
1527
1528The first character of the lowercased version is returned
1529(but note, as explained above, that there may be more.)
1530
1531=cut */
1532
2104c8d9 1533UV
a2a2844f 1534Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1535{
983ffd37 1536 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1537 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1538}
1539
d3e79532
JH
1540/*
1541=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1542
1543Convert the UTF-8 encoded character at p to its foldcase version and
1544store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1545that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1546foldcase version may be longer than the original character (up to
1547three characters).
1548
1549The first character of the foldcased version is returned
1550(but note, as explained above, that there may be more.)
1551
1552=cut */
1553
b4e400f9
JH
1554UV
1555Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1556{
1557 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1558 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1559}
1560
1561/* a "swash" is a swatch hash */
1562
1563SV*
864dbfa3 1564Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3
LW
1565{
1566 SV* retval;
bf1fed83 1567 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1568 dSP;
1b026014 1569 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
f8be5cf0 1570 SV* errsv_save;
ce3b816e 1571
1b026014 1572 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1573 ENTER;
f8be5cf0 1574 errsv_save = newSVsv(ERRSV);
ce3b816e 1575 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
f8be5cf0
JH
1576 if (!SvTRUE(ERRSV))
1577 sv_setsv(ERRSV, errsv_save);
1578 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1579 LEAVE;
1580 }
1581 SPAGAIN;
a0ed51b3
LW
1582 PUSHSTACKi(PERLSI_MAGIC);
1583 PUSHMARK(SP);
1584 EXTEND(SP,5);
1585 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1586 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1587 PUSHs(listsv);
1588 PUSHs(sv_2mortal(newSViv(minbits)));
1589 PUSHs(sv_2mortal(newSViv(none)));
1590 PUTBACK;
1591 ENTER;
1592 SAVEI32(PL_hints);
1593 PL_hints = 0;
1594 save_re_context();
923e4eb5 1595 if (IN_PERL_COMPILETIME) {
bf1fed83 1596 /* XXX ought to be handled by lex_start */
82686b01 1597 SAVEI32(PL_in_my);
2b4bd638 1598 PL_in_my = 0;
bf1fed83 1599 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1600 }
f8be5cf0 1601 errsv_save = newSVsv(ERRSV);
864dbfa3 1602 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1603 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1604 else
e24b16f9 1605 retval = &PL_sv_undef;
f8be5cf0
JH
1606 if (!SvTRUE(ERRSV))
1607 sv_setsv(ERRSV, errsv_save);
1608 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1609 LEAVE;
1610 POPSTACK;
923e4eb5 1611 if (IN_PERL_COMPILETIME) {
bf1fed83
JH
1612 STRLEN len;
1613 char* pv = SvPV(tokenbufsv, len);
1614
1615 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1616 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1617 }
bc45ce41
JH
1618 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1619 if (SvPOK(retval))
35c1215d
NC
1620 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1621 retval);
cea2e8a9 1622 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1623 }
a0ed51b3
LW
1624 return retval;
1625}
1626
035d37be
JH
1627
1628/* This API is wrong for special case conversions since we may need to
1629 * return several Unicode characters for a single Unicode character
1630 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1631 * the lower-level routine, and it is similarly broken for returning
1632 * multiple values. --jhi */
a0ed51b3 1633UV
3568d838 1634Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3
LW
1635{
1636 HV* hv = (HV*)SvRV(sv);
3568d838
JH
1637 U32 klen;
1638 U32 off;
a0ed51b3 1639 STRLEN slen;
7d85a32c 1640 STRLEN needents;
4ea42e7f 1641 U8 *tmps = NULL;
a0ed51b3
LW
1642 U32 bit;
1643 SV *retval;
3568d838
JH
1644 U8 tmputf8[2];
1645 UV c = NATIVE_TO_ASCII(*ptr);
1646
1647 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463
GS
1648 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1649 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838
JH
1650 ptr = tmputf8;
1651 }
1652 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1653 * then the "swatch" is a vec() for al the chars which start
1654 * with 0xAA..0xYY
1655 * So the key in the hash (klen) is length of encoded char -1
1656 */
1657 klen = UTF8SKIP(ptr) - 1;
1658 off = ptr[klen];
a0ed51b3 1659
7d85a32c
JH
1660 if (klen == 0)
1661 {
1662 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1663 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c
JH
1664 */
1665 needents = UTF_CONTINUATION_MARK;
1666 off = NATIVE_TO_UTF(ptr[klen]);
1667 }
1668 else
1669 {
1670 /* If char is encoded then swatch is for the prefix */
1671 needents = (1 << UTF_ACCUMULATION_SHIFT);
1672 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1673 }
1674
a0ed51b3
LW
1675 /*
1676 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1677 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1678 * it's nothing to sniff at.) Pity we usually come through at least
1679 * two function calls to get here...
1680 *
1681 * NB: this code assumes that swatches are never modified, once generated!
1682 */
1683
3568d838 1684 if (hv == PL_last_swash_hv &&
a0ed51b3 1685 klen == PL_last_swash_klen &&
3568d838 1686 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1687 {
1688 tmps = PL_last_swash_tmps;
1689 slen = PL_last_swash_slen;
1690 }
1691 else {
1692 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1693 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3
LW
1694
1695 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1696 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1697 dSP;
2b9d42f0
NIS
1698 /* We use utf8n_to_uvuni() as we want an index into
1699 Unicode tables, not a native character number.
1700 */
872c91ae
JH
1701 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1702 ckWARN(WARN_UTF8) ?
1703 0 : UTF8_ALLOW_ANY);
f8be5cf0 1704 SV *errsv_save;
a0ed51b3
LW
1705 ENTER;
1706 SAVETMPS;
1707 save_re_context();
1708 PUSHSTACKi(PERLSI_MAGIC);
1709 PUSHMARK(SP);
1710 EXTEND(SP,3);
1711 PUSHs((SV*)sv);
ffbc6a93 1712 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838
JH
1713 PUSHs(sv_2mortal(newSViv((klen) ?
1714 (code_point & ~(needents - 1)) : 0)));
a0ed51b3
LW
1715 PUSHs(sv_2mortal(newSViv(needents)));
1716 PUTBACK;
f8be5cf0 1717 errsv_save = newSVsv(ERRSV);
864dbfa3 1718 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1719 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1720 else
e24b16f9 1721 retval = &PL_sv_undef;
f8be5cf0
JH
1722 if (!SvTRUE(ERRSV))
1723 sv_setsv(ERRSV, errsv_save);
1724 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1725 POPSTACK;
1726 FREETMPS;
1727 LEAVE;
923e4eb5 1728 if (IN_PERL_COMPILETIME)
eb160463 1729 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1730
dfe13c55 1731 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1732
7d85a32c 1733 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1734 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1735 }
1736
1737 PL_last_swash_hv = hv;
1738 PL_last_swash_klen = klen;
1739 PL_last_swash_tmps = tmps;
1740 PL_last_swash_slen = slen;
1741 if (klen)
1742 Copy(ptr, PL_last_swash_key, klen, U8);
1743 }
1744
9faf8d75 1745 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1746 case 1:
1747 bit = 1 << (off & 7);
1748 off >>= 3;
1749 return (tmps[off] & bit) != 0;
1750 case 8:
1751 return tmps[off];
1752 case 16:
1753 off <<= 1;
1754 return (tmps[off] << 8) + tmps[off + 1] ;
1755 case 32:
1756 off <<= 2;
1757 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1758 }
cea2e8a9 1759 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1760 return 0;
1761}
2b9d42f0
NIS
1762
1763
1764/*
37607a96 1765=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1766
1e54db1a 1767Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2b9d42f0
NIS
1768of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1769bytes available. The return value is the pointer to the byte after the
1770end of the new character. In other words,
1771
1772 d = uvchr_to_utf8(d, uv);
1773
1774is the recommended wide native character-aware way of saying
1775
1776 *(d++) = uv;
1777
1778=cut
1779*/
1780
1781/* On ASCII machines this is normally a macro but we want a
1782 real function in case XS code wants it
1783*/
1784#undef Perl_uvchr_to_utf8
1785U8 *
1786Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1787{
b851fbc1 1788 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0
NIS
1789}
1790
b851fbc1
JH
1791U8 *
1792Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1793{
1794 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1795}
2b9d42f0
NIS
1796
1797/*
37607a96 1798=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0
NIS
1799
1800Returns the native character value of the first character in the string C<s>
1e54db1a 1801which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2b9d42f0
NIS
1802length, in bytes, of that character.
1803
1804Allows length and flags to be passed to low level routine.
1805
1806=cut
1807*/
0a2ef054
JH
1808/* On ASCII machines this is normally a macro but we want
1809 a real function in case XS code wants it
2b9d42f0
NIS
1810*/
1811#undef Perl_utf8n_to_uvchr
1812UV
37607a96 1813Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0
NIS
1814{
1815 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1816 return UNI_TO_NATIVE(uv);
1817}
1818
d2cc3551
JH
1819/*
1820=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1821
1822Build to the scalar dsv a displayable version of the string spv,
1823length len, the displayable version being at most pvlim bytes long
1824(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1825
9e55ce06 1826The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1827isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
1828to display the \\[nrfta\\] as the backslashed versions (like '\n')
1829(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1830UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1831UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1832
d2cc3551
JH
1833The pointer to the PV of the dsv is returned.
1834
1835=cut */
e6b2e755
JH
1836char *
1837Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1838{
1839 int truncated = 0;
1840 char *s, *e;
1841
1842 sv_setpvn(dsv, "", 0);
1843 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1844 UV u;
c728cb41
JH
1845 bool ok = FALSE;
1846
e6b2e755
JH
1847 if (pvlim && SvCUR(dsv) >= pvlim) {
1848 truncated++;
1849 break;
1850 }
1851 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1852 if (u < 256) {
c728cb41
JH
1853 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1854 switch (u & 0xFF) {
1855 case '\n':
1856 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1857 case '\r':
1858 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1859 case '\t':
1860 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1861 case '\f':
1862 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1863 case '\a':
1864 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1865 case '\\':
d79a7a3d 1866 Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
c728cb41
JH
1867 default: break;
1868 }
1869 }
00e86452
JH
1870 /* isPRINT() is the locale-blind version. */
1871 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
2c4547fe 1872 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
0a2ef054
JH
1873 ok = TRUE;
1874 }
c728cb41
JH
1875 }
1876 if (!ok)
9e55ce06 1877 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
1878 }
1879 if (truncated)
1880 sv_catpvn(dsv, "...", 3);
1881
1882 return SvPVX(dsv);
1883}
2b9d42f0 1884
d2cc3551
JH
1885/*
1886=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1887
1888Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1889the displayable version being at most pvlim bytes long
d2cc3551 1890(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
1891
1892The flags argument is as in pv_uni_display().
1893
d2cc3551
JH
1894The pointer to the PV of the dsv is returned.
1895
1896=cut */
e6b2e755
JH
1897char *
1898Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1899{
701a277b
JH
1900 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1901 pvlim, flags);
1902}
1903
d2cc3551 1904/*
d07ddd77 1905=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
1906
1907Return true if the strings s1 and s2 differ case-insensitively, false
1908if not (if they are equal case-insensitively). If u1 is true, the
1909string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
1910the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1911are false, the respective string is assumed to be in native 8-bit
1912encoding.
1913
1914If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1915in there (they will point at the beginning of the I<next> character).
1916If the pointers behind pe1 or pe2 are non-NULL, they are the end
1917pointers beyond which scanning will not continue under any
1918circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1919s2+l2 will be used as goal end pointers that will also stop the scan,
1920and which qualify towards defining a successful match: all the scans
1921that define an explicit length must reach their goal pointers for
1922a match to succeed).
d2cc3551
JH
1923
1924For case-insensitiveness, the "casefolding" of Unicode is used
1925instead of upper/lowercasing both the characters, see
1926http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1927
1928=cut */
701a277b 1929I32
d07ddd77 1930Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1931{
5469e704
JH
1932 register U8 *p1 = (U8*)s1;
1933 register U8 *p2 = (U8*)s2;
d07ddd77
JH
1934 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1935 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1936 STRLEN n1 = 0, n2 = 0;
ffce6cc2
JH
1937 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1938 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
d7f013c8
JH
1939 U8 natbuf[1+1];
1940 STRLEN foldlen1, foldlen2;
d07ddd77 1941 bool match;
332ddc25 1942
d07ddd77
JH
1943 if (pe1)
1944 e1 = *(U8**)pe1;
eb160463 1945 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
d07ddd77
JH
1946 f1 = (U8*)s1 + l1;
1947 if (pe2)
1948 e2 = *(U8**)pe2;
eb160463 1949 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
d07ddd77
JH
1950 f2 = (U8*)s2 + l2;
1951
1952 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1953 return 1; /* mismatch; possible infinite loop or false positive */
1954
a6872d42
JH
1955 if (!u1 || !u2)
1956 natbuf[1] = 0; /* Need to terminate the buffer. */
1957
d07ddd77
JH
1958 while ((e1 == 0 || p1 < e1) &&
1959 (f1 == 0 || p1 < f1) &&
1960 (e2 == 0 || p2 < e2) &&
1961 (f2 == 0 || p2 < f2)) {
1962 if (n1 == 0) {
d7f013c8
JH
1963 if (u1)
1964 to_utf8_fold(p1, foldbuf1, &foldlen1);
1965 else {
f5cee151 1966 natbuf[0] = *p1;
d7f013c8
JH
1967 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1968 }
1969 q1 = foldbuf1;
d07ddd77 1970 n1 = foldlen1;
332ddc25 1971 }
d07ddd77 1972 if (n2 == 0) {
d7f013c8
JH
1973 if (u2)
1974 to_utf8_fold(p2, foldbuf2, &foldlen2);
1975 else {
f5cee151 1976 natbuf[0] = *p2;
d7f013c8
JH
1977 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1978 }
1979 q2 = foldbuf2;
d07ddd77 1980 n2 = foldlen2;
332ddc25 1981 }
d07ddd77
JH
1982 while (n1 && n2) {
1983 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1984 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1985 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 1986 return 1; /* mismatch */
d07ddd77 1987 n1 -= UTF8SKIP(q1);
d7f013c8 1988 q1 += UTF8SKIP(q1);
d07ddd77 1989 n2 -= UTF8SKIP(q2);
d7f013c8 1990 q2 += UTF8SKIP(q2);
701a277b 1991 }
d07ddd77 1992 if (n1 == 0)
d7f013c8 1993 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 1994 if (n2 == 0)
d7f013c8
JH
1995 p2 += u2 ? UTF8SKIP(p2) : 1;
1996
d2cc3551 1997 }
5469e704 1998
d07ddd77
JH
1999 /* A match is defined by all the scans that specified
2000 * an explicit length reaching their final goals. */
2001 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2002
2003 if (match) {
d07ddd77
JH
2004 if (pe1)
2005 *pe1 = (char*)p1;
2006 if (pe2)
2007 *pe2 = (char*)p2;
5469e704
JH
2008 }
2009
2010 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2011}
701a277b 2012