This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Silence a const warning
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
af3babe4
NC
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
4 * others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3
LW
26#include "perl.h"
27
27da23d5
JH
28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7
JH
31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29
DM
34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1
JH
47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1
JH
53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485
SC
57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6
NIS
90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463
GS
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
104 return d;
105 }
106 if (uv < 0x10000) {
eb160463
GS
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
110 return d;
111 }
112 if (uv < 0x200000) {
eb160463
GS
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463
GS
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
138#endif
139 {
eb160463
GS
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463
GS
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
b851fbc1
JH
169
170U8 *
171Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
172{
173 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
174}
9041c2e3
NIS
175
176
177/*
7fc63493 178=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 179
5da9da9e 180Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
181character. Note that an INVARIANT (i.e. ASCII) character is a valid
182UTF-8 character. The actual number of bytes in the UTF-8 character
183will be returned if it is valid, otherwise 0.
9041c2e3 184
82686b01 185=cut */
067a85ef 186STRLEN
7fc63493 187Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 188{
3b0fc154
JH
189 STRLEN len;
190#ifdef IS_UTF8_CHAR
9f07fdcd 191 len = UTF8SKIP(s);
3b0fc154
JH
192 if (len <= 4)
193 return IS_UTF8_CHAR(s, len) ? len : 0;
194#endif /* #ifdef IS_UTF8_CHAR */
195 {
196 U8 u = *s;
197 STRLEN slen;
198 UV uv, ouv;
199
200 if (UTF8_IS_INVARIANT(u))
201 return 1;
202
203 if (!UTF8_IS_START(u))
204 return 0;
205
206 len = UTF8SKIP(s);
207
208 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
209 return 0;
210
211 slen = len - 1;
212 s++;
213 u &= UTF_START_MASK(len);
214 uv = u;
215 ouv = uv;
216 while (slen--) {
217 if (!UTF8_IS_CONTINUATION(*s))
218 return 0;
219 uv = UTF8_ACCUMULATE(uv, *s);
220 if (uv < ouv)
221 return 0;
222 ouv = uv;
223 s++;
224 }
225
226 if ((STRLEN)UNISKIP(uv) < len)
227 return 0;
228
229 return len;
386d01d6 230 }
386d01d6
GS
231}
232
6662521e 233/*
7fc63493 234=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 235
c9ada85f 236Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
237UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
238not mean 'a string that contains code points above 0x7F encoded in UTF-8'
239because a valid ASCII string is a valid UTF-8 string.
6662521e
GS
240
241=cut
242*/
243
8e84507e 244bool
7fc63493 245Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 246{
7fc63493
AL
247 const U8* x = s;
248 const U8* send;
067a85ef 249
61468b03 250 if (!len && s)
e1ec3a88 251 len = strlen((const char *)s);
1aa99e6b
IH
252 send = s + len;
253
6662521e 254 while (x < send) {
a3b680e6 255 STRLEN c;
1acdb0da
JH
256 /* Inline the easy bits of is_utf8_char() here for speed... */
257 if (UTF8_IS_INVARIANT(*x))
258 c = 1;
259 else if (!UTF8_IS_START(*x))
260 return FALSE;
261 else {
262 /* ... and call is_utf8_char() only if really needed. */
263 c = is_utf8_char(x);
264 if (!c)
265 return FALSE;
266 }
6662521e 267 x += c;
6662521e 268 }
60006e79
JH
269 if (x != send)
270 return FALSE;
067a85ef
A
271
272 return TRUE;
6662521e
GS
273}
274
67e989fb 275/*
7fc63493 276=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **p
81cd54e3
JH
277
278Like is_ut8_string but store the location of the failure in
279the last argument.
280
281=cut
282*/
283
284bool
7fc63493 285Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **p)
81cd54e3 286{
7fc63493
AL
287 const U8* x = s;
288 const U8* send;
81cd54e3
JH
289 STRLEN c;
290
61468b03 291 if (!len && s)
e1ec3a88 292 len = strlen((const char *)s);
81cd54e3
JH
293 send = s + len;
294
295 while (x < send) {
296 /* Inline the easy bits of is_utf8_char() here for speed... */
297 if (UTF8_IS_INVARIANT(*x))
298 c = 1;
299 else if (!UTF8_IS_START(*x)) {
300 if (p)
301 *p = x;
302 return FALSE;
303 }
304 else {
305 /* ... and call is_utf8_char() only if really needed. */
306 c = is_utf8_char(x);
307 if (!c) {
308 if (p)
309 *p = x;
310 return FALSE;
311 }
312 }
313 x += c;
314 }
315 if (x != send) {
316 if (p)
317 *p = x;
318 return FALSE;
319 }
320
321 return TRUE;
322}
323
324/*
7fc63493 325=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 326
9041c2e3
NIS
327Bottom level UTF-8 decode routine.
328Returns the unicode code point value of the first character in the string C<s>
1e54db1a 329which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 330C<retlen> will be set to the length, in bytes, of that character.
67e989fb 331
1e54db1a 332If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
333is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
334it is assumed that the caller will raise a warning, and this function
28d3d195
JH
335will silently just set C<retlen> to C<-1> and return zero. If the
336C<flags> does not contain UTF8_CHECK_ONLY, warnings about
337malformations will be given, C<retlen> will be set to the expected
338length of the UTF-8 character in bytes, and zero will be returned.
339
340The C<flags> can also contain various flags to allow deviations from
341the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 342
9041c2e3
NIS
343Most code should use utf8_to_uvchr() rather than call this directly.
344
37607a96
PK
345=cut
346*/
67e989fb 347
a0ed51b3 348UV
7fc63493 349Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 350{
7fc63493 351 const U8 *s0 = s;
9c5ffd7c 352 UV uv = *s, ouv = 0;
ba210ebe 353 STRLEN len = 1;
7fc63493
AL
354 const bool dowarn = ckWARN_d(WARN_UTF8);
355 const UV startbyte = *s;
ba210ebe 356 STRLEN expectlen = 0;
a0dbb045
JH
357 U32 warning = 0;
358
359/* This list is a superset of the UTF8_ALLOW_XXX. */
360
361#define UTF8_WARN_EMPTY 1
362#define UTF8_WARN_CONTINUATION 2
363#define UTF8_WARN_NON_CONTINUATION 3
364#define UTF8_WARN_FE_FF 4
365#define UTF8_WARN_SHORT 5
366#define UTF8_WARN_OVERFLOW 6
367#define UTF8_WARN_SURROGATE 7
c867b360
JH
368#define UTF8_WARN_LONG 8
369#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
370
371 if (curlen == 0 &&
372 !(flags & UTF8_ALLOW_EMPTY)) {
373 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
374 goto malformed;
375 }
376
1d72bdf6 377 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
378 if (retlen)
379 *retlen = 1;
c4d5f83a 380 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 381 }
67e989fb 382
421a8bf2 383 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 384 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 385 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
386 goto malformed;
387 }
388
421a8bf2 389 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 390 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 391 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
392 goto malformed;
393 }
9041c2e3 394
1d72bdf6 395#ifdef EBCDIC
75383841 396 uv = NATIVE_TO_UTF(uv);
1d72bdf6 397#else
fcc8fcf6
JH
398 if ((uv == 0xfe || uv == 0xff) &&
399 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 400 warning = UTF8_WARN_FE_FF;
ba210ebe 401 goto malformed;
a0ed51b3 402 }
1d72bdf6
NIS
403#endif
404
ba210ebe
JH
405 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
406 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
407 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
408 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
409#ifdef EBCDIC
410 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
411 else { len = 7; uv &= 0x01; }
412#else
ba210ebe
JH
413 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
414 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
415 else { len = 13; uv = 0; } /* whoa! */
416#endif
417
a0ed51b3
LW
418 if (retlen)
419 *retlen = len;
9041c2e3 420
ba210ebe
JH
421 expectlen = len;
422
fcc8fcf6
JH
423 if ((curlen < expectlen) &&
424 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 425 warning = UTF8_WARN_SHORT;
ba210ebe
JH
426 goto malformed;
427 }
428
429 len--;
a0ed51b3 430 s++;
ba210ebe
JH
431 ouv = uv;
432
a0ed51b3 433 while (len--) {
421a8bf2
JH
434 if (!UTF8_IS_CONTINUATION(*s) &&
435 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
436 s--;
437 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 438 goto malformed;
a0ed51b3
LW
439 }
440 else
8850bf83 441 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
442 if (!(uv > ouv)) {
443 /* These cannot be allowed. */
444 if (uv == ouv) {
75dbc644 445 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
446 warning = UTF8_WARN_LONG;
447 goto malformed;
448 }
449 }
450 else { /* uv < ouv */
451 /* This cannot be allowed. */
452 warning = UTF8_WARN_OVERFLOW;
453 goto malformed;
454 }
ba210ebe
JH
455 }
456 s++;
457 ouv = uv;
458 }
459
421a8bf2 460 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 461 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 462 warning = UTF8_WARN_SURROGATE;
ba210ebe 463 goto malformed;
eb160463 464 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 465 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 466 warning = UTF8_WARN_LONG;
ba210ebe 467 goto malformed;
421a8bf2 468 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 469 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 470 warning = UTF8_WARN_FFFF;
a9917092 471 goto malformed;
a0ed51b3 472 }
ba210ebe 473
a0ed51b3 474 return uv;
ba210ebe
JH
475
476malformed:
477
fcc8fcf6 478 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 479 if (retlen)
cc366d4b 480 *retlen = -1;
ba210ebe
JH
481 return 0;
482 }
483
a0dbb045
JH
484 if (dowarn) {
485 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
486
487 switch (warning) {
488 case 0: /* Intentionally empty. */ break;
489 case UTF8_WARN_EMPTY:
54667de8 490 Perl_sv_catpv(aTHX_ sv, "(empty string)");
a0dbb045
JH
491 break;
492 case UTF8_WARN_CONTINUATION:
097fb8e2 493 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
494 break;
495 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
496 if (s == s0)
497 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
498 (UV)s[1], startbyte);
499 else
500 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
5d7488b2 501 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, (int)expectlen);
097fb8e2 502
a0dbb045
JH
503 break;
504 case UTF8_WARN_FE_FF:
505 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
506 break;
507 case UTF8_WARN_SHORT:
097fb8e2 508 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 509 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 510 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
511 break;
512 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
513 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
514 ouv, *s, startbyte);
a0dbb045
JH
515 break;
516 case UTF8_WARN_SURROGATE:
517 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
518 break;
a0dbb045 519 case UTF8_WARN_LONG:
097fb8e2 520 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 521 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
522 break;
523 case UTF8_WARN_FFFF:
524 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
525 break;
526 default:
54667de8 527 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
a0dbb045
JH
528 break;
529 }
530
531 if (warning) {
504618e9 532 const char *s = SvPVX_const(sv);
a0dbb045
JH
533
534 if (PL_op)
9014280d 535 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 536 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 537 else
9014280d 538 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
539 }
540 }
541
ba210ebe 542 if (retlen)
28d3d195 543 *retlen = expectlen ? expectlen : len;
ba210ebe 544
28d3d195 545 return 0;
a0ed51b3
LW
546}
547
8e84507e 548/*
7fc63493 549=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3
NIS
550
551Returns the native character value of the first character in the string C<s>
1e54db1a 552which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
553length, in bytes, of that character.
554
1e54db1a 555If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
556returned and retlen is set, if possible, to -1.
557
558=cut
559*/
560
561UV
7fc63493 562Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 563{
89ebb4a3 564 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 565 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
566}
567
568/*
7fc63493 569=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3
NIS
570
571Returns the Unicode code point of the first character in the string C<s>
1e54db1a 572which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
573length, in bytes, of that character.
574
575This function should only be used when returned UV is considered
576an index into the Unicode semantic tables (e.g. swashes).
577
1e54db1a 578If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 579returned and retlen is set, if possible, to -1.
8e84507e
NIS
580
581=cut
582*/
583
584UV
7fc63493 585Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 586{
9041c2e3 587 /* Call the low level routine asking for checks */
89ebb4a3 588 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 589 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
590}
591
b76347f2 592/*
35a4481c 593=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2
JH
594
595Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
596Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
597up past C<e>, croaks.
b76347f2
JH
598
599=cut
600*/
601
602STRLEN
35a4481c 603Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
604{
605 STRLEN len = 0;
606
8850bf83
JH
607 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
608 * the bitops (especially ~) can create illegal UTF-8.
609 * In other words: in Perl UTF-8 is not just for Unicode. */
610
a3b680e6
AL
611 if (e < s)
612 goto warn_and_return;
b76347f2 613 while (s < e) {
4373e329 614 const U8 t = UTF8SKIP(s);
901b21bf 615 if (e - s < t) {
a3b680e6 616 warn_and_return:
901b21bf
JH
617 if (ckWARN_d(WARN_UTF8)) {
618 if (PL_op)
619 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 620 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
621 else
622 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
623 }
624 return len;
625 }
b76347f2
JH
626 s += t;
627 len++;
628 }
629
630 return len;
631}
632
b06226ff 633/*
35a4481c 634=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 635
1e54db1a 636Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
637and C<b>.
638
639WARNING: use only if you *know* that the pointers point inside the
640same UTF-8 buffer.
641
37607a96
PK
642=cut
643*/
a0ed51b3 644
02eb7b47 645IV
35a4481c 646Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 647{
02eb7b47
JH
648 IV off = 0;
649
8850bf83
JH
650 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
651 * the bitops (especially ~) can create illegal UTF-8.
652 * In other words: in Perl UTF-8 is not just for Unicode. */
653
a0ed51b3
LW
654 if (a < b) {
655 while (a < b) {
35a4481c 656 const U8 c = UTF8SKIP(a);
a3b680e6
AL
657 if (b - a < c)
658 goto warn_and_return;
02eb7b47 659 a += c;
a0ed51b3
LW
660 off--;
661 }
662 }
663 else {
664 while (b < a) {
4373e329 665 const U8 c = UTF8SKIP(b);
02eb7b47 666
901b21bf 667 if (a - b < c) {
a3b680e6 668 warn_and_return:
901b21bf
JH
669 if (ckWARN_d(WARN_UTF8)) {
670 if (PL_op)
671 Perl_warner(aTHX_ packWARN(WARN_UTF8),
672 "%s in %s", unees, OP_DESC(PL_op));
673 else
674 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
675 }
676 return off;
677 }
02eb7b47 678 b += c;
a0ed51b3
LW
679 off++;
680 }
681 }
02eb7b47 682
a0ed51b3
LW
683 return off;
684}
685
b06226ff 686/*
37607a96 687=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 688
8850bf83
JH
689Return the UTF-8 pointer C<s> displaced by C<off> characters, either
690forward or backward.
b06226ff
JH
691
692WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
693the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
694on the first byte of character or just after the last byte of a character.
b06226ff 695
37607a96
PK
696=cut
697*/
a0ed51b3
LW
698
699U8 *
4373e329 700Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 701{
8850bf83
JH
702 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
703 * the bitops (especially ~) can create illegal UTF-8.
704 * In other words: in Perl UTF-8 is not just for Unicode. */
705
a0ed51b3
LW
706 if (off >= 0) {
707 while (off--)
708 s += UTF8SKIP(s);
709 }
710 else {
711 while (off++) {
712 s--;
8850bf83
JH
713 while (UTF8_IS_CONTINUATION(*s))
714 s--;
a0ed51b3
LW
715 }
716 }
4373e329 717 return (U8 *)s;
a0ed51b3
LW
718}
719
6940069f 720/*
eebe1485 721=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 722
1e54db1a 723Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
724Unlike C<bytes_to_utf8>, this over-writes the original string, and
725updates len to contain the new length.
67e989fb 726Returns zero on failure, setting C<len> to -1.
6940069f
GS
727
728=cut
729*/
730
731U8 *
37607a96 732Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 733{
6940069f
GS
734 U8 *send;
735 U8 *d;
dcad2880 736 U8 *save = s;
246fae53 737
1e54db1a 738 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880
JH
739 for (send = s + *len; s < send; ) {
740 U8 c = *s++;
741
1d72bdf6
NIS
742 if (!UTF8_IS_INVARIANT(c) &&
743 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
744 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
745 *len = -1;
746 return 0;
747 }
246fae53 748 }
dcad2880
JH
749
750 d = s = save;
6940069f 751 while (s < send) {
ed646e6e 752 STRLEN ulen;
9041c2e3 753 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 754 s += ulen;
6940069f
GS
755 }
756 *d = '\0';
246fae53 757 *len = d - save;
6940069f
GS
758 return save;
759}
760
761/*
e1ec3a88 762=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 763
1e54db1a 764Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 765Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
766the newly-created string, and updates C<len> to contain the new
767length. Returns the original string if no conversion occurs, C<len>
768is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7690 if C<s> is converted or contains all 7bit characters.
f9a63242 770
37607a96
PK
771=cut
772*/
f9a63242
JH
773
774U8 *
e1ec3a88 775Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 776{
f9a63242 777 U8 *d;
e1ec3a88
AL
778 const U8 *start = s;
779 const U8 *send;
f9a63242
JH
780 I32 count = 0;
781
782 if (!*is_utf8)
73d840c0 783 return (U8 *)start;
f9a63242 784
1e54db1a 785 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 786 for (send = s + *len; s < send;) {
e1ec3a88 787 U8 c = *s++;
1d72bdf6 788 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
789 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
790 (c = *s++) && UTF8_IS_CONTINUATION(c))
791 count++;
792 else
73d840c0 793 return (U8 *)start;
db42d148 794 }
f9a63242
JH
795 }
796
797 *is_utf8 = 0;
798
f9a63242 799 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 800 s = start; start = d;
f9a63242
JH
801 while (s < send) {
802 U8 c = *s++;
c4d5f83a
NIS
803 if (!UTF8_IS_INVARIANT(c)) {
804 /* Then it is two-byte encoded */
805 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
806 c = ASCII_TO_NATIVE(c);
807 }
808 *d++ = c;
f9a63242
JH
809 }
810 *d = '\0';
811 *len = d - start;
73d840c0 812 return (U8 *)start;
f9a63242
JH
813}
814
815/*
35a4481c 816=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 817
1e54db1a 818Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
819Returns a pointer to the newly-created string, and sets C<len> to
820reflect the new length.
6940069f 821
1e54db1a 822If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
823see sv_recode_to_utf8().
824
497711e7 825=cut
6940069f
GS
826*/
827
828U8*
35a4481c 829Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 830{
35a4481c 831 const U8 * const send = s + (*len);
6940069f
GS
832 U8 *d;
833 U8 *dst;
6940069f 834
6662521e 835 Newz(801, d, (*len) * 2 + 1, U8);
6940069f
GS
836 dst = d;
837
838 while (s < send) {
35a4481c 839 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 840 if (UNI_IS_INVARIANT(uv))
eb160463 841 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 842 else {
eb160463
GS
843 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
844 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
845 }
846 }
847 *d = '\0';
6662521e 848 *len = d-dst;
6940069f
GS
849 return dst;
850}
851
a0ed51b3 852/*
dea0fc0b 853 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
854 *
855 * Destination must be pre-extended to 3/2 source. Do not use in-place.
856 * We optimize for native, for obvious reasons. */
857
858U8*
dea0fc0b 859Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 860{
dea0fc0b
JH
861 U8* pend;
862 U8* dstart = d;
863
1de9afcd
RGS
864 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
865 d[0] = 0;
866 *newlen = 1;
867 return d;
868 }
869
dea0fc0b 870 if (bytelen & 1)
014ead4b 871 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b
JH
872
873 pend = p + bytelen;
874
a0ed51b3 875 while (p < pend) {
dea0fc0b
JH
876 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
877 p += 2;
a0ed51b3 878 if (uv < 0x80) {
eb160463 879 *d++ = (U8)uv;
a0ed51b3
LW
880 continue;
881 }
882 if (uv < 0x800) {
eb160463
GS
883 *d++ = (U8)(( uv >> 6) | 0xc0);
884 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
885 continue;
886 }
887 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
DT
888 UV low = (p[0] << 8) + p[1];
889 p += 2;
dea0fc0b
JH
890 if (low < 0xdc00 || low >= 0xdfff)
891 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
892 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
893 }
894 if (uv < 0x10000) {
eb160463
GS
895 *d++ = (U8)(( uv >> 12) | 0xe0);
896 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
897 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
898 continue;
899 }
900 else {
eb160463
GS
901 *d++ = (U8)(( uv >> 18) | 0xf0);
902 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
903 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
904 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
905 continue;
906 }
907 }
dea0fc0b 908 *newlen = d - dstart;
a0ed51b3
LW
909 return d;
910}
911
912/* Note: this one is slightly destructive of the source. */
913
914U8*
dea0fc0b 915Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
916{
917 U8* s = (U8*)p;
918 U8* send = s + bytelen;
919 while (s < send) {
920 U8 tmp = s[0];
921 s[0] = s[1];
922 s[1] = tmp;
923 s += 2;
924 }
dea0fc0b 925 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
926}
927
928/* for now these are all defined (inefficiently) in terms of the utf8 versions */
929
930bool
84afefe6 931Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 932{
89ebb4a3 933 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 934 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
935 return is_utf8_alnum(tmpbuf);
936}
937
938bool
84afefe6 939Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 940{
89ebb4a3 941 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 942 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
943 return is_utf8_alnumc(tmpbuf);
944}
945
946bool
84afefe6 947Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 948{
89ebb4a3 949 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 950 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
951 return is_utf8_idfirst(tmpbuf);
952}
953
954bool
84afefe6 955Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 956{
89ebb4a3 957 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 958 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
959 return is_utf8_alpha(tmpbuf);
960}
961
962bool
84afefe6 963Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 964{
89ebb4a3 965 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 966 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
967 return is_utf8_ascii(tmpbuf);
968}
969
970bool
84afefe6 971Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 972{
89ebb4a3 973 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 974 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
975 return is_utf8_space(tmpbuf);
976}
977
978bool
84afefe6 979Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 980{
89ebb4a3 981 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 982 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
983 return is_utf8_digit(tmpbuf);
984}
985
986bool
84afefe6 987Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 988{
89ebb4a3 989 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 990 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
991 return is_utf8_upper(tmpbuf);
992}
993
994bool
84afefe6 995Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 996{
89ebb4a3 997 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 998 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
999 return is_utf8_lower(tmpbuf);
1000}
1001
1002bool
84afefe6 1003Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1004{
89ebb4a3 1005 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1006 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1007 return is_utf8_cntrl(tmpbuf);
1008}
1009
1010bool
84afefe6 1011Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1012{
89ebb4a3 1013 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1014 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1015 return is_utf8_graph(tmpbuf);
1016}
1017
1018bool
84afefe6 1019Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1020{
89ebb4a3 1021 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1022 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1023 return is_utf8_print(tmpbuf);
1024}
1025
b8c5462f 1026bool
84afefe6 1027Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1028{
89ebb4a3 1029 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1030 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1031 return is_utf8_punct(tmpbuf);
1032}
1033
4d61ec05 1034bool
84afefe6 1035Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1036{
89ebb4a3 1037 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1038 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1039 return is_utf8_xdigit(tmpbuf);
1040}
1041
84afefe6
JH
1042UV
1043Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1044{
0ebc6274
JH
1045 uvchr_to_utf8(p, c);
1046 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1047}
1048
84afefe6
JH
1049UV
1050Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1051{
0ebc6274
JH
1052 uvchr_to_utf8(p, c);
1053 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1054}
1055
84afefe6
JH
1056UV
1057Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1058{
0ebc6274
JH
1059 uvchr_to_utf8(p, c);
1060 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1061}
1062
84afefe6
JH
1063UV
1064Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1065{
0ebc6274
JH
1066 uvchr_to_utf8(p, c);
1067 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1068}
1069
a0ed51b3
LW
1070/* for now these all assume no locale info available for Unicode > 255 */
1071
1072bool
84afefe6 1073Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1074{
1075 return is_uni_alnum(c); /* XXX no locale support yet */
1076}
1077
1078bool
84afefe6 1079Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1080{
1081 return is_uni_alnumc(c); /* XXX no locale support yet */
1082}
1083
1084bool
84afefe6 1085Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1086{
1087 return is_uni_idfirst(c); /* XXX no locale support yet */
1088}
1089
1090bool
84afefe6 1091Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1092{
1093 return is_uni_alpha(c); /* XXX no locale support yet */
1094}
1095
1096bool
84afefe6 1097Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1098{
1099 return is_uni_ascii(c); /* XXX no locale support yet */
1100}
1101
1102bool
84afefe6 1103Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1104{
1105 return is_uni_space(c); /* XXX no locale support yet */
1106}
1107
1108bool
84afefe6 1109Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1110{
1111 return is_uni_digit(c); /* XXX no locale support yet */
1112}
1113
1114bool
84afefe6 1115Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1116{
1117 return is_uni_upper(c); /* XXX no locale support yet */
1118}
1119
1120bool
84afefe6 1121Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1122{
1123 return is_uni_lower(c); /* XXX no locale support yet */
1124}
1125
1126bool
84afefe6 1127Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1128{
1129 return is_uni_cntrl(c); /* XXX no locale support yet */
1130}
1131
1132bool
84afefe6 1133Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1134{
1135 return is_uni_graph(c); /* XXX no locale support yet */
1136}
1137
1138bool
84afefe6 1139Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1140{
1141 return is_uni_print(c); /* XXX no locale support yet */
1142}
1143
b8c5462f 1144bool
84afefe6 1145Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1146{
1147 return is_uni_punct(c); /* XXX no locale support yet */
1148}
1149
4d61ec05 1150bool
84afefe6 1151Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1152{
1153 return is_uni_xdigit(c); /* XXX no locale support yet */
1154}
1155
b7ac61fa
JH
1156U32
1157Perl_to_uni_upper_lc(pTHX_ U32 c)
1158{
ee099d14
JH
1159 /* XXX returns only the first character -- do not use XXX */
1160 /* XXX no locale support yet */
1161 STRLEN len;
89ebb4a3 1162 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1163 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1164}
1165
1166U32
1167Perl_to_uni_title_lc(pTHX_ U32 c)
1168{
ee099d14
JH
1169 /* XXX returns only the first character XXX -- do not use XXX */
1170 /* XXX no locale support yet */
1171 STRLEN len;
89ebb4a3 1172 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1173 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1174}
1175
1176U32
1177Perl_to_uni_lower_lc(pTHX_ U32 c)
1178{
ee099d14
JH
1179 /* XXX returns only the first character -- do not use XXX */
1180 /* XXX no locale support yet */
1181 STRLEN len;
89ebb4a3 1182 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1183 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1184}
1185
a0ed51b3 1186bool
7fc63493 1187Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1188{
386d01d6
GS
1189 if (!is_utf8_char(p))
1190 return FALSE;
a0ed51b3 1191 if (!PL_utf8_alnum)
289d4f09
ML
1192 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1193 * descendant of isalnum(3), in other words, it doesn't
1194 * contain the '_'. --jhi */
1195 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
eb160463 1196 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1197/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1198#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1199 if (!PL_utf8_alnum)
1200 PL_utf8_alnum = swash_init("utf8", "",
1201 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1202 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1203#endif
1204}
1205
1206bool
7fc63493 1207Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1208{
386d01d6
GS
1209 if (!is_utf8_char(p))
1210 return FALSE;
b8c5462f
JH
1211 if (!PL_utf8_alnum)
1212 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
eb160463 1213 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1214/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1215#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1216 if (!PL_utf8_alnum)
1217 PL_utf8_alnum = swash_init("utf8", "",
1218 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1219 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1220#endif
1221}
1222
1223bool
7fc63493 1224Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1225{
82686b01
JH
1226 if (*p == '_')
1227 return TRUE;
1228 if (!is_utf8_char(p))
1229 return FALSE;
1230 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1231 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
eb160463 1232 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
82686b01
JH
1233}
1234
1235bool
7fc63493 1236Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01
JH
1237{
1238 if (*p == '_')
1239 return TRUE;
1240 if (!is_utf8_char(p))
1241 return FALSE;
1242 if (!PL_utf8_idcont)
1243 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
eb160463 1244 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
a0ed51b3
LW
1245}
1246
1247bool
7fc63493 1248Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1249{
386d01d6
GS
1250 if (!is_utf8_char(p))
1251 return FALSE;
a0ed51b3 1252 if (!PL_utf8_alpha)
e24b16f9 1253 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
eb160463 1254 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
a0ed51b3
LW
1255}
1256
1257bool
7fc63493 1258Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1259{
386d01d6
GS
1260 if (!is_utf8_char(p))
1261 return FALSE;
b8c5462f
JH
1262 if (!PL_utf8_ascii)
1263 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
eb160463 1264 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
b8c5462f
JH
1265}
1266
1267bool
7fc63493 1268Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1269{
386d01d6
GS
1270 if (!is_utf8_char(p))
1271 return FALSE;
a0ed51b3 1272 if (!PL_utf8_space)
3bec3564 1273 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
eb160463 1274 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
a0ed51b3
LW
1275}
1276
1277bool
7fc63493 1278Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1279{
386d01d6
GS
1280 if (!is_utf8_char(p))
1281 return FALSE;
a0ed51b3 1282 if (!PL_utf8_digit)
e24b16f9 1283 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
eb160463 1284 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
a0ed51b3
LW
1285}
1286
1287bool
7fc63493 1288Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1289{
386d01d6
GS
1290 if (!is_utf8_char(p))
1291 return FALSE;
a0ed51b3 1292 if (!PL_utf8_upper)
c65e4d19 1293 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
eb160463 1294 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
a0ed51b3
LW
1295}
1296
1297bool
7fc63493 1298Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1299{
386d01d6
GS
1300 if (!is_utf8_char(p))
1301 return FALSE;
a0ed51b3 1302 if (!PL_utf8_lower)
c65e4d19 1303 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
eb160463 1304 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
a0ed51b3
LW
1305}
1306
1307bool
7fc63493 1308Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1309{
386d01d6
GS
1310 if (!is_utf8_char(p))
1311 return FALSE;
b8c5462f
JH
1312 if (!PL_utf8_cntrl)
1313 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
eb160463 1314 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
b8c5462f
JH
1315}
1316
1317bool
7fc63493 1318Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1319{
386d01d6
GS
1320 if (!is_utf8_char(p))
1321 return FALSE;
b8c5462f
JH
1322 if (!PL_utf8_graph)
1323 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
eb160463 1324 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
b8c5462f
JH
1325}
1326
1327bool
7fc63493 1328Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1329{
386d01d6
GS
1330 if (!is_utf8_char(p))
1331 return FALSE;
a0ed51b3 1332 if (!PL_utf8_print)
e24b16f9 1333 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
eb160463 1334 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
a0ed51b3
LW
1335}
1336
1337bool
7fc63493 1338Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1339{
386d01d6
GS
1340 if (!is_utf8_char(p))
1341 return FALSE;
b8c5462f
JH
1342 if (!PL_utf8_punct)
1343 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
eb160463 1344 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
b8c5462f
JH
1345}
1346
1347bool
7fc63493 1348Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1349{
386d01d6
GS
1350 if (!is_utf8_char(p))
1351 return FALSE;
b8c5462f
JH
1352 if (!PL_utf8_xdigit)
1353 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
eb160463 1354 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
b8c5462f
JH
1355}
1356
1357bool
7fc63493 1358Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1359{
386d01d6
GS
1360 if (!is_utf8_char(p))
1361 return FALSE;
a0ed51b3 1362 if (!PL_utf8_mark)
e24b16f9 1363 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
eb160463 1364 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
a0ed51b3
LW
1365}
1366
6b5c0936
JH
1367/*
1368=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1369
1370The "p" contains the pointer to the UTF-8 string encoding
1371the character that is being converted.
1372
1373The "ustrp" is a pointer to the character buffer to put the
1374conversion result to. The "lenp" is a pointer to the length
1375of the result.
1376
0134edef 1377The "swashp" is a pointer to the swash to use.
6b5c0936 1378
0134edef
JH
1379Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1380and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1381but not always, a multicharacter mapping), is tried first.
6b5c0936 1382
0134edef
JH
1383The "special" is a string like "utf8::ToSpecLower", which means the
1384hash %utf8::ToSpecLower. The access to the hash is through
1385Perl_to_utf8_case().
6b5c0936 1386
0134edef
JH
1387The "normal" is a string like "ToLower" which means the swash
1388%utf8::ToLower.
1389
1390=cut */
6b5c0936 1391
2104c8d9 1392UV
e1ec3a88 1393Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special)
a0ed51b3 1394{
89ebb4a3 1395 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1396 STRLEN len = 0;
a0ed51b3 1397
7fc63493 1398 const UV uv0 = utf8_to_uvchr(p, 0);
1feea2c7
JH
1399 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1400 * are necessary in EBCDIC, they are redundant no-ops
1401 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1402 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1403 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1404
1405 if (!*swashp) /* load on-demand */
1406 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1407
b08cf34e
JH
1408 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1409 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1410 /* It might be "special" (sometimes, but not always,
2a37f04d 1411 * a multicharacter mapping) */
983ffd37 1412 HV *hv;
b08cf34e
JH
1413 SV **svp;
1414
1415 if ((hv = get_hv(special, FALSE)) &&
1416 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1417 (*svp)) {
1418 char *s;
47654450 1419
b08cf34e 1420 s = SvPV(*svp, len);
47654450
JH
1421 if (len == 1)
1422 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1423 else {
2f9475ad
JH
1424#ifdef EBCDIC
1425 /* If we have EBCDIC we need to remap the characters
1426 * since any characters in the low 256 are Unicode
1427 * code points, not EBCDIC. */
7cda7a3d 1428 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1429
1430 d = tmpbuf;
b08cf34e 1431 if (SvUTF8(*svp)) {
2f9475ad
JH
1432 STRLEN tlen = 0;
1433
1434 while (t < tend) {
1435 UV c = utf8_to_uvchr(t, &tlen);
1436 if (tlen > 0) {
1437 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1438 t += tlen;
1439 }
1440 else
1441 break;
1442 }
1443 }
1444 else {
36fec512
JH
1445 while (t < tend) {
1446 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1447 t++;
1448 }
2f9475ad
JH
1449 }
1450 len = d - tmpbuf;
1451 Copy(tmpbuf, ustrp, len, U8);
1452#else
d2dcd0fb 1453 Copy(s, ustrp, len, U8);
2f9475ad 1454#endif
29e98929 1455 }
983ffd37 1456 }
0134edef
JH
1457 }
1458
1459 if (!len && *swashp) {
1460 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1461
1462 if (uv2) {
1463 /* It was "normal" (a single character mapping). */
1464 UV uv3 = UNI_TO_NATIVE(uv2);
1465
e9101d72 1466 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1467 }
1468 }
1feea2c7 1469
0134edef
JH
1470 if (!len) /* Neither: just copy. */
1471 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1472
2a37f04d
JH
1473 if (lenp)
1474 *lenp = len;
1475
0134edef 1476 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1477}
1478
d3e79532 1479/*
7fc63493 1480=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1481
1482Convert the UTF-8 encoded character at p to its uppercase version and
1483store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1484that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1485the uppercase version may be longer than the original character.
d3e79532
JH
1486
1487The first character of the uppercased version is returned
1488(but note, as explained above, that there may be more.)
1489
1490=cut */
1491
2104c8d9 1492UV
7fc63493 1493Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1494{
983ffd37 1495 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1496 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1497}
a0ed51b3 1498
d3e79532 1499/*
7fc63493 1500=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1501
1502Convert the UTF-8 encoded character at p to its titlecase version and
1503store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1504that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1505titlecase version may be longer than the original character.
d3e79532
JH
1506
1507The first character of the titlecased version is returned
1508(but note, as explained above, that there may be more.)
1509
1510=cut */
1511
983ffd37 1512UV
7fc63493 1513Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37
JH
1514{
1515 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1516 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1517}
1518
d3e79532 1519/*
7fc63493 1520=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1521
1522Convert the UTF-8 encoded character at p to its lowercase version and
1523store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1524that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1525lowercase version may be longer than the original character.
d3e79532
JH
1526
1527The first character of the lowercased version is returned
1528(but note, as explained above, that there may be more.)
1529
1530=cut */
1531
2104c8d9 1532UV
7fc63493 1533Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1534{
983ffd37 1535 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1536 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1537}
1538
d3e79532 1539/*
7fc63493 1540=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1541
1542Convert the UTF-8 encoded character at p to its foldcase version and
1543store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1544that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1545foldcase version may be longer than the original character (up to
1546three characters).
1547
1548The first character of the foldcased version is returned
1549(but note, as explained above, that there may be more.)
1550
1551=cut */
1552
b4e400f9 1553UV
7fc63493 1554Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9
JH
1555{
1556 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1557 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1558}
1559
1560/* a "swash" is a swatch hash */
1561
1562SV*
7fc63493 1563Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1564{
27da23d5 1565 dVAR;
a0ed51b3 1566 SV* retval;
243b1711 1567 SV* tokenbufsv = sv_newmortal();
8e84507e 1568 dSP;
7fc63493
AL
1569 const size_t pkg_len = strlen(pkg);
1570 const size_t name_len = strlen(name);
71bed85a 1571 HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1572 SV* errsv_save;
ce3b816e 1573
96ca9f55
DM
1574 PUSHSTACKi(PERLSI_MAGIC);
1575 ENTER;
1576 SAVEI32(PL_hints);
1577 PL_hints = 0;
1578 save_re_context();
1b026014 1579 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1580 ENTER;
f8be5cf0 1581 errsv_save = newSVsv(ERRSV);
71bed85a
NC
1582 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1583 Nullsv);
f8be5cf0
JH
1584 if (!SvTRUE(ERRSV))
1585 sv_setsv(ERRSV, errsv_save);
1586 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1587 LEAVE;
1588 }
1589 SPAGAIN;
a0ed51b3
LW
1590 PUSHMARK(SP);
1591 EXTEND(SP,5);
71bed85a
NC
1592 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1593 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1594 PUSHs(listsv);
1595 PUSHs(sv_2mortal(newSViv(minbits)));
1596 PUSHs(sv_2mortal(newSViv(none)));
1597 PUTBACK;
923e4eb5 1598 if (IN_PERL_COMPILETIME) {
bf1fed83 1599 /* XXX ought to be handled by lex_start */
82686b01 1600 SAVEI32(PL_in_my);
2b4bd638 1601 PL_in_my = 0;
bf1fed83 1602 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1603 }
f8be5cf0 1604 errsv_save = newSVsv(ERRSV);
864dbfa3 1605 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1606 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1607 else
e24b16f9 1608 retval = &PL_sv_undef;
f8be5cf0
JH
1609 if (!SvTRUE(ERRSV))
1610 sv_setsv(ERRSV, errsv_save);
1611 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1612 LEAVE;
1613 POPSTACK;
923e4eb5 1614 if (IN_PERL_COMPILETIME) {
bf1fed83 1615 STRLEN len;
73d840c0 1616 const char* pv = SvPV(tokenbufsv, len);
bf1fed83
JH
1617
1618 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1619 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1620 }
bc45ce41
JH
1621 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1622 if (SvPOK(retval))
35c1215d
NC
1623 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1624 retval);
cea2e8a9 1625 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1626 }
a0ed51b3
LW
1627 return retval;
1628}
1629
035d37be
JH
1630
1631/* This API is wrong for special case conversions since we may need to
1632 * return several Unicode characters for a single Unicode character
1633 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1634 * the lower-level routine, and it is similarly broken for returning
1635 * multiple values. --jhi */
a0ed51b3 1636UV
7fc63493 1637Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
a0ed51b3 1638{
27da23d5 1639 dVAR;
a0ed51b3 1640 HV* hv = (HV*)SvRV(sv);
3568d838
JH
1641 U32 klen;
1642 U32 off;
a0ed51b3 1643 STRLEN slen;
7d85a32c 1644 STRLEN needents;
4ea42e7f 1645 U8 *tmps = NULL;
a0ed51b3
LW
1646 U32 bit;
1647 SV *retval;
3568d838
JH
1648 U8 tmputf8[2];
1649 UV c = NATIVE_TO_ASCII(*ptr);
1650
1651 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463
GS
1652 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1653 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838
JH
1654 ptr = tmputf8;
1655 }
1656 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1657 * then the "swatch" is a vec() for al the chars which start
1658 * with 0xAA..0xYY
1659 * So the key in the hash (klen) is length of encoded char -1
1660 */
1661 klen = UTF8SKIP(ptr) - 1;
1662 off = ptr[klen];
a0ed51b3 1663
7d85a32c
JH
1664 if (klen == 0)
1665 {
1666 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1667 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c
JH
1668 */
1669 needents = UTF_CONTINUATION_MARK;
1670 off = NATIVE_TO_UTF(ptr[klen]);
1671 }
1672 else
1673 {
1674 /* If char is encoded then swatch is for the prefix */
1675 needents = (1 << UTF_ACCUMULATION_SHIFT);
1676 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1677 }
1678
a0ed51b3
LW
1679 /*
1680 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1681 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1682 * it's nothing to sniff at.) Pity we usually come through at least
1683 * two function calls to get here...
1684 *
1685 * NB: this code assumes that swatches are never modified, once generated!
1686 */
1687
3568d838 1688 if (hv == PL_last_swash_hv &&
a0ed51b3 1689 klen == PL_last_swash_klen &&
27da23d5 1690 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1691 {
1692 tmps = PL_last_swash_tmps;
1693 slen = PL_last_swash_slen;
1694 }
1695 else {
1696 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1697 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3
LW
1698
1699 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1700 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1701 dSP;
2b9d42f0
NIS
1702 /* We use utf8n_to_uvuni() as we want an index into
1703 Unicode tables, not a native character number.
1704 */
89ebb4a3 1705 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1706 ckWARN(WARN_UTF8) ?
1707 0 : UTF8_ALLOW_ANY);
f8be5cf0 1708 SV *errsv_save;
a0ed51b3
LW
1709 ENTER;
1710 SAVETMPS;
1711 save_re_context();
1712 PUSHSTACKi(PERLSI_MAGIC);
1713 PUSHMARK(SP);
1714 EXTEND(SP,3);
1715 PUSHs((SV*)sv);
ffbc6a93 1716 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838
JH
1717 PUSHs(sv_2mortal(newSViv((klen) ?
1718 (code_point & ~(needents - 1)) : 0)));
a0ed51b3
LW
1719 PUSHs(sv_2mortal(newSViv(needents)));
1720 PUTBACK;
f8be5cf0 1721 errsv_save = newSVsv(ERRSV);
864dbfa3 1722 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1723 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1724 else
e24b16f9 1725 retval = &PL_sv_undef;
f8be5cf0
JH
1726 if (!SvTRUE(ERRSV))
1727 sv_setsv(ERRSV, errsv_save);
1728 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1729 POPSTACK;
1730 FREETMPS;
1731 LEAVE;
923e4eb5 1732 if (IN_PERL_COMPILETIME)
eb160463 1733 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1734
e1ec3a88 1735 svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
a0ed51b3 1736
7d85a32c 1737 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1738 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1739 }
1740
1741 PL_last_swash_hv = hv;
1742 PL_last_swash_klen = klen;
1743 PL_last_swash_tmps = tmps;
1744 PL_last_swash_slen = slen;
1745 if (klen)
1746 Copy(ptr, PL_last_swash_key, klen, U8);
1747 }
1748
9faf8d75 1749 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1750 case 1:
1751 bit = 1 << (off & 7);
1752 off >>= 3;
1753 return (tmps[off] & bit) != 0;
1754 case 8:
1755 return tmps[off];
1756 case 16:
1757 off <<= 1;
1758 return (tmps[off] << 8) + tmps[off + 1] ;
1759 case 32:
1760 off <<= 2;
1761 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1762 }
cea2e8a9 1763 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1764 return 0;
1765}
2b9d42f0
NIS
1766
1767
1768/*
37607a96 1769=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1770
1e54db1a 1771Adds the UTF-8 representation of the Native codepoint C<uv> to the end
89ebb4a3 1772of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2b9d42f0
NIS
1773bytes available. The return value is the pointer to the byte after the
1774end of the new character. In other words,
1775
1776 d = uvchr_to_utf8(d, uv);
1777
1778is the recommended wide native character-aware way of saying
1779
1780 *(d++) = uv;
1781
1782=cut
1783*/
1784
1785/* On ASCII machines this is normally a macro but we want a
1786 real function in case XS code wants it
1787*/
1788#undef Perl_uvchr_to_utf8
1789U8 *
1790Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1791{
b851fbc1 1792 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0
NIS
1793}
1794
b851fbc1
JH
1795U8 *
1796Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1797{
1798 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1799}
2b9d42f0
NIS
1800
1801/*
37607a96 1802=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0
NIS
1803
1804Returns the native character value of the first character in the string C<s>
1e54db1a 1805which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2b9d42f0
NIS
1806length, in bytes, of that character.
1807
1808Allows length and flags to be passed to low level routine.
1809
1810=cut
1811*/
0a2ef054
JH
1812/* On ASCII machines this is normally a macro but we want
1813 a real function in case XS code wants it
2b9d42f0
NIS
1814*/
1815#undef Perl_utf8n_to_uvchr
1816UV
7fc63493 1817Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0
NIS
1818{
1819 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1820 return UNI_TO_NATIVE(uv);
1821}
1822
d2cc3551
JH
1823/*
1824=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1825
1826Build to the scalar dsv a displayable version of the string spv,
1827length len, the displayable version being at most pvlim bytes long
1828(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1829
9e55ce06 1830The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1831isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
1832to display the \\[nrfta\\] as the backslashed versions (like '\n')
1833(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1834UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1835UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1836
d2cc3551
JH
1837The pointer to the PV of the dsv is returned.
1838
1839=cut */
e6b2e755 1840char *
e1ec3a88 1841Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
1842{
1843 int truncated = 0;
e1ec3a88 1844 const char *s, *e;
e6b2e755
JH
1845
1846 sv_setpvn(dsv, "", 0);
e1ec3a88 1847 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 1848 UV u;
a49f32c6
NC
1849 /* This serves double duty as a flag and a character to print after
1850 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1851 */
1852 char ok = 0;
c728cb41 1853
e6b2e755
JH
1854 if (pvlim && SvCUR(dsv) >= pvlim) {
1855 truncated++;
1856 break;
1857 }
1858 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1859 if (u < 256) {
a3b680e6 1860 const unsigned char c = (unsigned char)u & 0xFF;
c728cb41 1861 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
a49f32c6 1862 switch (c) {
c728cb41 1863 case '\n':
a49f32c6 1864 ok = 'n'; break;
c728cb41 1865 case '\r':
a49f32c6 1866 ok = 'r'; break;
c728cb41 1867 case '\t':
a49f32c6 1868 ok = 't'; break;
c728cb41 1869 case '\f':
a49f32c6 1870 ok = 'f'; break;
c728cb41 1871 case '\a':
a49f32c6 1872 ok = 'a'; break;
c728cb41 1873 case '\\':
a49f32c6 1874 ok = '\\'; break;
c728cb41
JH
1875 default: break;
1876 }
a49f32c6
NC
1877 if (ok) {
1878 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1879 }
c728cb41 1880 }
00e86452 1881 /* isPRINT() is the locale-blind version. */
a49f32c6
NC
1882 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1883 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1884 ok = 1;
0a2ef054 1885 }
c728cb41
JH
1886 }
1887 if (!ok)
9e55ce06 1888 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
1889 }
1890 if (truncated)
1891 sv_catpvn(dsv, "...", 3);
1892
1893 return SvPVX(dsv);
1894}
2b9d42f0 1895
d2cc3551
JH
1896/*
1897=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1898
1899Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1900the displayable version being at most pvlim bytes long
d2cc3551 1901(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
1902
1903The flags argument is as in pv_uni_display().
1904
d2cc3551
JH
1905The pointer to the PV of the dsv is returned.
1906
1907=cut */
e6b2e755
JH
1908char *
1909Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1910{
701a277b
JH
1911 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1912 pvlim, flags);
1913}
1914
d2cc3551 1915/*
d07ddd77 1916=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
1917
1918Return true if the strings s1 and s2 differ case-insensitively, false
1919if not (if they are equal case-insensitively). If u1 is true, the
1920string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
1921the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1922are false, the respective string is assumed to be in native 8-bit
1923encoding.
1924
1925If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1926in there (they will point at the beginning of the I<next> character).
1927If the pointers behind pe1 or pe2 are non-NULL, they are the end
1928pointers beyond which scanning will not continue under any
4cdaeff7 1929circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
1930s2+l2 will be used as goal end pointers that will also stop the scan,
1931and which qualify towards defining a successful match: all the scans
1932that define an explicit length must reach their goal pointers for
1933a match to succeed).
d2cc3551
JH
1934
1935For case-insensitiveness, the "casefolding" of Unicode is used
1936instead of upper/lowercasing both the characters, see
1937http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1938
1939=cut */
701a277b 1940I32
d07ddd77 1941Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1942{
e1ec3a88
AL
1943 register const U8 *p1 = (const U8*)s1;
1944 register const U8 *p2 = (const U8*)s2;
1945 register const U8 *f1 = 0, *f2 = 0;
1946 register U8 *e1 = 0, *q1 = 0;
1947 register U8 *e2 = 0, *q2 = 0;
d07ddd77 1948 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
1949 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
1950 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
1951 U8 natbuf[1+1];
1952 STRLEN foldlen1, foldlen2;
d07ddd77 1953 bool match;
332ddc25 1954
d07ddd77
JH
1955 if (pe1)
1956 e1 = *(U8**)pe1;
e1ec3a88
AL
1957 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
1958 f1 = (const U8*)s1 + l1;
d07ddd77
JH
1959 if (pe2)
1960 e2 = *(U8**)pe2;
e1ec3a88
AL
1961 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
1962 f2 = (const U8*)s2 + l2;
d07ddd77
JH
1963
1964 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1965 return 1; /* mismatch; possible infinite loop or false positive */
1966
a6872d42
JH
1967 if (!u1 || !u2)
1968 natbuf[1] = 0; /* Need to terminate the buffer. */
1969
d07ddd77
JH
1970 while ((e1 == 0 || p1 < e1) &&
1971 (f1 == 0 || p1 < f1) &&
1972 (e2 == 0 || p2 < e2) &&
1973 (f2 == 0 || p2 < f2)) {
1974 if (n1 == 0) {
d7f013c8
JH
1975 if (u1)
1976 to_utf8_fold(p1, foldbuf1, &foldlen1);
1977 else {
f5cee151 1978 natbuf[0] = *p1;
d7f013c8
JH
1979 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1980 }
1981 q1 = foldbuf1;
d07ddd77 1982 n1 = foldlen1;
332ddc25 1983 }
d07ddd77 1984 if (n2 == 0) {
d7f013c8
JH
1985 if (u2)
1986 to_utf8_fold(p2, foldbuf2, &foldlen2);
1987 else {
f5cee151 1988 natbuf[0] = *p2;
d7f013c8
JH
1989 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1990 }
1991 q2 = foldbuf2;
d07ddd77 1992 n2 = foldlen2;
332ddc25 1993 }
d07ddd77
JH
1994 while (n1 && n2) {
1995 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1996 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1997 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 1998 return 1; /* mismatch */
d07ddd77 1999 n1 -= UTF8SKIP(q1);
d7f013c8 2000 q1 += UTF8SKIP(q1);
d07ddd77 2001 n2 -= UTF8SKIP(q2);
d7f013c8 2002 q2 += UTF8SKIP(q2);
701a277b 2003 }
d07ddd77 2004 if (n1 == 0)
d7f013c8 2005 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2006 if (n2 == 0)
d7f013c8
JH
2007 p2 += u2 ? UTF8SKIP(p2) : 1;
2008
d2cc3551 2009 }
5469e704 2010
d07ddd77
JH
2011 /* A match is defined by all the scans that specified
2012 * an explicit length reaching their final goals. */
2013 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2014
2015 if (match) {
d07ddd77
JH
2016 if (pe1)
2017 *pe1 = (char*)p1;
2018 if (pe2)
2019 *pe2 = (char*)p2;
5469e704
JH
2020 }
2021
2022 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2023}
701a277b 2024
a49f32c6
NC
2025/*
2026 * Local variables:
2027 * c-indentation-style: bsd
2028 * c-basic-offset: 4
2029 * indent-tabs-mode: t
2030 * End:
2031 *
37442d52
RGS
2032 * ex: set ts=8 sts=4 sw=4 noet:
2033 */