This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
include flags and refcount in the list of leaked scalars
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
b5f8cc5c 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
a0ed51b3
LW
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
864dbfa3 24#define PERL_IN_UTF8_C
a0ed51b3
LW
25#include "perl.h"
26
901b21bf
JH
27static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
28
ccfc67b7
JH
29/*
30=head1 Unicode Support
a0ed51b3 31
166f8a29
DM
32This file contains various utility functions for manipulating UTF8-encoded
33strings. For the uninitiated, this is a method of representing arbitrary
61296642 34Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
35characters in the ASCII range are unmodified, and a zero byte never appears
36within non-zero characters.
166f8a29 37
b851fbc1 38=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 39
1e54db1a 40Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
eebe1485
SC
41of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
42bytes available. The return value is the pointer to the byte after the
9041c2e3 43end of the new character. In other words,
eebe1485 44
b851fbc1
JH
45 d = uvuni_to_utf8_flags(d, uv, flags);
46
47or, in most cases,
48
9041c2e3 49 d = uvuni_to_utf8(d, uv);
eebe1485 50
b851fbc1
JH
51(which is equivalent to)
52
53 d = uvuni_to_utf8_flags(d, uv, 0);
54
eebe1485
SC
55is the recommended Unicode-aware way of saying
56
57 *(d++) = uv;
58
59=cut
60*/
61
dfe13c55 62U8 *
b851fbc1 63Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 64{
62961d2e 65 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
66 if (UNICODE_IS_SURROGATE(uv) &&
67 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 68 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
69 else if (
70 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
71 !(flags & UNICODE_ALLOW_FDD0))
72 ||
c867b360 73 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
74 !(flags & UNICODE_ALLOW_FFFF))) &&
75 /* UNICODE_ALLOW_SUPER includes
2a20b9da 76 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
77 ((uv <= PERL_UNICODE_MAX) ||
78 !(flags & UNICODE_ALLOW_SUPER))
79 )
9014280d 80 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
81 "Unicode character 0x%04"UVxf" is illegal", uv);
82 }
c4d5f83a 83 if (UNI_IS_INVARIANT(uv)) {
eb160463 84 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
85 return d;
86 }
2d331972 87#if defined(EBCDIC)
1d72bdf6
NIS
88 else {
89 STRLEN len = UNISKIP(uv);
90 U8 *p = d+len-1;
91 while (p > d) {
eb160463 92 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
93 uv >>= UTF_ACCUMULATION_SHIFT;
94 }
eb160463 95 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
96 return d+len;
97 }
98#else /* Non loop style */
a0ed51b3 99 if (uv < 0x800) {
eb160463
GS
100 *d++ = (U8)(( uv >> 6) | 0xc0);
101 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
102 return d;
103 }
104 if (uv < 0x10000) {
eb160463
GS
105 *d++ = (U8)(( uv >> 12) | 0xe0);
106 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
107 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
108 return d;
109 }
110 if (uv < 0x200000) {
eb160463
GS
111 *d++ = (U8)(( uv >> 18) | 0xf0);
112 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
113 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
114 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
115 return d;
116 }
117 if (uv < 0x4000000) {
eb160463
GS
118 *d++ = (U8)(( uv >> 24) | 0xf8);
119 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
120 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
121 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
122 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
123 return d;
124 }
125 if (uv < 0x80000000) {
eb160463
GS
126 *d++ = (U8)(( uv >> 30) | 0xfc);
127 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
128 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
129 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
131 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
132 return d;
133 }
6b8eaf93 134#ifdef HAS_QUAD
d7578b48 135 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
136#endif
137 {
eb160463
GS
138 *d++ = 0xfe; /* Can't match U+FEFF! */
139 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
140 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
141 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
144 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
145 return d;
146 }
6b8eaf93 147#ifdef HAS_QUAD
a0ed51b3 148 {
eb160463
GS
149 *d++ = 0xff; /* Can't match U+FFFE! */
150 *d++ = 0x80; /* 6 Reserved bits */
151 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
152 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
153 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
154 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
161 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
162 return d;
163 }
164#endif
1d72bdf6 165#endif /* Loop style */
a0ed51b3 166}
b851fbc1
JH
167
168U8 *
169Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
170{
171 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
172}
9041c2e3
NIS
173
174
175/*
eebe1485
SC
176=for apidoc A|STRLEN|is_utf8_char|U8 *s
177
5da9da9e 178Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
179character. Note that an INVARIANT (i.e. ASCII) character is a valid
180UTF-8 character. The actual number of bytes in the UTF-8 character
181will be returned if it is valid, otherwise 0.
9041c2e3 182
82686b01 183=cut */
067a85ef 184STRLEN
386d01d6
GS
185Perl_is_utf8_char(pTHX_ U8 *s)
186{
187 U8 u = *s;
067a85ef
A
188 STRLEN slen, len;
189 UV uv, ouv;
386d01d6 190
1d72bdf6 191 if (UTF8_IS_INVARIANT(u))
386d01d6
GS
192 return 1;
193
60006e79 194 if (!UTF8_IS_START(u))
386d01d6
GS
195 return 0;
196
9f07fdcd 197 len = UTF8SKIP(s);
386d01d6 198
60006e79 199 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef
A
200 return 0;
201
386d01d6
GS
202 slen = len - 1;
203 s++;
6eb6869e 204 u &= UTF_START_MASK(len);
1d72bdf6 205 uv = u;
067a85ef 206 ouv = uv;
386d01d6 207 while (slen--) {
60006e79 208 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 209 return 0;
8850bf83 210 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 211 if (uv < ouv)
067a85ef
A
212 return 0;
213 ouv = uv;
386d01d6
GS
214 s++;
215 }
067a85ef 216
eb160463 217 if ((STRLEN)UNISKIP(uv) < len)
067a85ef
A
218 return 0;
219
386d01d6
GS
220 return len;
221}
222
6662521e 223/*
eebe1485 224=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 225
c9ada85f 226Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
227UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
228not mean 'a string that contains code points above 0x7F encoded in UTF-8'
229because a valid ASCII string is a valid UTF-8 string.
6662521e
GS
230
231=cut
232*/
233
8e84507e 234bool
6662521e
GS
235Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
236{
067a85ef 237 U8* x = s;
1aa99e6b 238 U8* send;
067a85ef
A
239 STRLEN c;
240
61468b03 241 if (!len && s)
6cd5fe39 242 len = strlen((char *)s);
1aa99e6b
IH
243 send = s + len;
244
6662521e 245 while (x < send) {
1acdb0da
JH
246 /* Inline the easy bits of is_utf8_char() here for speed... */
247 if (UTF8_IS_INVARIANT(*x))
248 c = 1;
249 else if (!UTF8_IS_START(*x))
250 return FALSE;
251 else {
252 /* ... and call is_utf8_char() only if really needed. */
253 c = is_utf8_char(x);
254 if (!c)
255 return FALSE;
256 }
6662521e 257 x += c;
6662521e 258 }
60006e79
JH
259 if (x != send)
260 return FALSE;
067a85ef
A
261
262 return TRUE;
6662521e
GS
263}
264
67e989fb 265/*
81cd54e3
JH
266=for apidoc A|bool|is_utf8_string_loc|U8 *s|STRLEN len|U8 **p
267
268Like is_ut8_string but store the location of the failure in
269the last argument.
270
271=cut
272*/
273
274bool
275Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **p)
276{
277 U8* x = s;
278 U8* send;
279 STRLEN c;
280
61468b03 281 if (!len && s)
81cd54e3
JH
282 len = strlen((char *)s);
283 send = s + len;
284
285 while (x < send) {
286 /* Inline the easy bits of is_utf8_char() here for speed... */
287 if (UTF8_IS_INVARIANT(*x))
288 c = 1;
289 else if (!UTF8_IS_START(*x)) {
290 if (p)
291 *p = x;
292 return FALSE;
293 }
294 else {
295 /* ... and call is_utf8_char() only if really needed. */
296 c = is_utf8_char(x);
297 if (!c) {
298 if (p)
299 *p = x;
300 return FALSE;
301 }
302 }
303 x += c;
304 }
305 if (x != send) {
306 if (p)
307 *p = x;
308 return FALSE;
309 }
310
311 return TRUE;
312}
313
314/*
9041c2e3 315=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 316
9041c2e3
NIS
317Bottom level UTF-8 decode routine.
318Returns the unicode code point value of the first character in the string C<s>
1e54db1a 319which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 320C<retlen> will be set to the length, in bytes, of that character.
67e989fb 321
1e54db1a 322If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
323is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
324it is assumed that the caller will raise a warning, and this function
28d3d195
JH
325will silently just set C<retlen> to C<-1> and return zero. If the
326C<flags> does not contain UTF8_CHECK_ONLY, warnings about
327malformations will be given, C<retlen> will be set to the expected
328length of the UTF-8 character in bytes, and zero will be returned.
329
330The C<flags> can also contain various flags to allow deviations from
331the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 332
9041c2e3
NIS
333Most code should use utf8_to_uvchr() rather than call this directly.
334
37607a96
PK
335=cut
336*/
67e989fb 337
a0ed51b3 338UV
37607a96 339Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 340{
097fb8e2 341 U8 *s0 = s;
9c5ffd7c 342 UV uv = *s, ouv = 0;
ba210ebe
JH
343 STRLEN len = 1;
344 bool dowarn = ckWARN_d(WARN_UTF8);
4f849cb6 345 UV startbyte = *s;
ba210ebe 346 STRLEN expectlen = 0;
a0dbb045
JH
347 U32 warning = 0;
348
349/* This list is a superset of the UTF8_ALLOW_XXX. */
350
351#define UTF8_WARN_EMPTY 1
352#define UTF8_WARN_CONTINUATION 2
353#define UTF8_WARN_NON_CONTINUATION 3
354#define UTF8_WARN_FE_FF 4
355#define UTF8_WARN_SHORT 5
356#define UTF8_WARN_OVERFLOW 6
357#define UTF8_WARN_SURROGATE 7
c867b360
JH
358#define UTF8_WARN_LONG 8
359#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
360
361 if (curlen == 0 &&
362 !(flags & UTF8_ALLOW_EMPTY)) {
363 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
364 goto malformed;
365 }
366
1d72bdf6 367 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
368 if (retlen)
369 *retlen = 1;
c4d5f83a 370 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 371 }
67e989fb 372
421a8bf2 373 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 374 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 375 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
376 goto malformed;
377 }
378
421a8bf2 379 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 380 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 381 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
382 goto malformed;
383 }
9041c2e3 384
1d72bdf6 385#ifdef EBCDIC
75383841 386 uv = NATIVE_TO_UTF(uv);
1d72bdf6 387#else
fcc8fcf6
JH
388 if ((uv == 0xfe || uv == 0xff) &&
389 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 390 warning = UTF8_WARN_FE_FF;
ba210ebe 391 goto malformed;
a0ed51b3 392 }
1d72bdf6
NIS
393#endif
394
ba210ebe
JH
395 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
396 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
397 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
398 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
399#ifdef EBCDIC
400 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
401 else { len = 7; uv &= 0x01; }
402#else
ba210ebe
JH
403 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
404 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
405 else { len = 13; uv = 0; } /* whoa! */
406#endif
407
a0ed51b3
LW
408 if (retlen)
409 *retlen = len;
9041c2e3 410
ba210ebe
JH
411 expectlen = len;
412
fcc8fcf6
JH
413 if ((curlen < expectlen) &&
414 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 415 warning = UTF8_WARN_SHORT;
ba210ebe
JH
416 goto malformed;
417 }
418
419 len--;
a0ed51b3 420 s++;
ba210ebe
JH
421 ouv = uv;
422
a0ed51b3 423 while (len--) {
421a8bf2
JH
424 if (!UTF8_IS_CONTINUATION(*s) &&
425 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
426 s--;
427 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 428 goto malformed;
a0ed51b3
LW
429 }
430 else
8850bf83 431 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
432 if (!(uv > ouv)) {
433 /* These cannot be allowed. */
434 if (uv == ouv) {
435 if (!(flags & UTF8_ALLOW_LONG)) {
436 warning = UTF8_WARN_LONG;
437 goto malformed;
438 }
439 }
440 else { /* uv < ouv */
441 /* This cannot be allowed. */
442 warning = UTF8_WARN_OVERFLOW;
443 goto malformed;
444 }
ba210ebe
JH
445 }
446 s++;
447 ouv = uv;
448 }
449
421a8bf2 450 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 451 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 452 warning = UTF8_WARN_SURROGATE;
ba210ebe 453 goto malformed;
eb160463 454 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 455 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 456 warning = UTF8_WARN_LONG;
ba210ebe 457 goto malformed;
421a8bf2 458 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 459 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 460 warning = UTF8_WARN_FFFF;
a9917092 461 goto malformed;
a0ed51b3 462 }
ba210ebe 463
a0ed51b3 464 return uv;
ba210ebe
JH
465
466malformed:
467
fcc8fcf6 468 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 469 if (retlen)
cc366d4b 470 *retlen = -1;
ba210ebe
JH
471 return 0;
472 }
473
a0dbb045
JH
474 if (dowarn) {
475 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
476
477 switch (warning) {
478 case 0: /* Intentionally empty. */ break;
479 case UTF8_WARN_EMPTY:
480 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
481 break;
482 case UTF8_WARN_CONTINUATION:
097fb8e2 483 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
484 break;
485 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
486 if (s == s0)
487 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
488 (UV)s[1], startbyte);
489 else
490 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
491 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
492
a0dbb045
JH
493 break;
494 case UTF8_WARN_FE_FF:
495 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
496 break;
497 case UTF8_WARN_SHORT:
097fb8e2
JH
498 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
499 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
b31f83c2 500 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
501 break;
502 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
503 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
504 ouv, *s, startbyte);
a0dbb045
JH
505 break;
506 case UTF8_WARN_SURROGATE:
507 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
508 break;
a0dbb045 509 case UTF8_WARN_LONG:
097fb8e2
JH
510 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
511 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
512 break;
513 case UTF8_WARN_FFFF:
514 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
515 break;
516 default:
517 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
518 break;
519 }
520
521 if (warning) {
522 char *s = SvPVX(sv);
523
524 if (PL_op)
9014280d 525 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 526 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 527 else
9014280d 528 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
529 }
530 }
531
ba210ebe 532 if (retlen)
28d3d195 533 *retlen = expectlen ? expectlen : len;
ba210ebe 534
28d3d195 535 return 0;
a0ed51b3
LW
536}
537
8e84507e 538/*
37607a96 539=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3
NIS
540
541Returns the native character value of the first character in the string C<s>
1e54db1a 542which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
543length, in bytes, of that character.
544
1e54db1a 545If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
546returned and retlen is set, if possible, to -1.
547
548=cut
549*/
550
551UV
37607a96 552Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3 553{
872c91ae
JH
554 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
555 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
556}
557
558/*
37607a96 559=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3
NIS
560
561Returns the Unicode code point of the first character in the string C<s>
1e54db1a 562which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
563length, in bytes, of that character.
564
565This function should only be used when returned UV is considered
566an index into the Unicode semantic tables (e.g. swashes).
567
1e54db1a 568If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 569returned and retlen is set, if possible, to -1.
8e84507e
NIS
570
571=cut
572*/
573
574UV
37607a96 575Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 576{
9041c2e3 577 /* Call the low level routine asking for checks */
872c91ae
JH
578 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
579 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
580}
581
b76347f2 582/*
37607a96 583=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2
JH
584
585Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
586Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
587up past C<e>, croaks.
b76347f2
JH
588
589=cut
590*/
591
592STRLEN
37607a96 593Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2
JH
594{
595 STRLEN len = 0;
596
8850bf83
JH
597 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
598 * the bitops (especially ~) can create illegal UTF-8.
599 * In other words: in Perl UTF-8 is not just for Unicode. */
600
901b21bf
JH
601 if (e < s) {
602 if (ckWARN_d(WARN_UTF8)) {
603 if (PL_op)
604 Perl_warner(aTHX_ packWARN(WARN_UTF8),
605 "%s in %s", unees, OP_DESC(PL_op));
606 else
607 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
608 }
609 return 0;
610 }
b76347f2 611 while (s < e) {
02eb7b47 612 U8 t = UTF8SKIP(s);
b76347f2 613
901b21bf
JH
614 if (e - s < t) {
615 if (ckWARN_d(WARN_UTF8)) {
616 if (PL_op)
617 Perl_warner(aTHX_ packWARN(WARN_UTF8),
618 unees, OP_DESC(PL_op));
619 else
620 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
621 }
622 return len;
623 }
b76347f2
JH
624 s += t;
625 len++;
626 }
627
628 return len;
629}
630
b06226ff 631/*
eebe1485 632=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 633
1e54db1a 634Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
635and C<b>.
636
637WARNING: use only if you *know* that the pointers point inside the
638same UTF-8 buffer.
639
37607a96
PK
640=cut
641*/
a0ed51b3 642
02eb7b47 643IV
864dbfa3 644Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 645{
02eb7b47
JH
646 IV off = 0;
647
8850bf83
JH
648 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
649 * the bitops (especially ~) can create illegal UTF-8.
650 * In other words: in Perl UTF-8 is not just for Unicode. */
651
a0ed51b3
LW
652 if (a < b) {
653 while (a < b) {
02eb7b47
JH
654 U8 c = UTF8SKIP(a);
655
901b21bf
JH
656 if (b - a < c) {
657 if (ckWARN_d(WARN_UTF8)) {
658 if (PL_op)
659 Perl_warner(aTHX_ packWARN(WARN_UTF8),
660 "%s in %s", unees, OP_DESC(PL_op));
661 else
662 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
663 }
664 return off;
665 }
02eb7b47 666 a += c;
a0ed51b3
LW
667 off--;
668 }
669 }
670 else {
671 while (b < a) {
02eb7b47
JH
672 U8 c = UTF8SKIP(b);
673
901b21bf
JH
674 if (a - b < c) {
675 if (ckWARN_d(WARN_UTF8)) {
676 if (PL_op)
677 Perl_warner(aTHX_ packWARN(WARN_UTF8),
678 "%s in %s", unees, OP_DESC(PL_op));
679 else
680 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
681 }
682 return off;
683 }
02eb7b47 684 b += c;
a0ed51b3
LW
685 off++;
686 }
687 }
02eb7b47 688
a0ed51b3
LW
689 return off;
690}
691
b06226ff 692/*
37607a96 693=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 694
8850bf83
JH
695Return the UTF-8 pointer C<s> displaced by C<off> characters, either
696forward or backward.
b06226ff
JH
697
698WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
699the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
700on the first byte of character or just after the last byte of a character.
b06226ff 701
37607a96
PK
702=cut
703*/
a0ed51b3
LW
704
705U8 *
864dbfa3 706Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 707{
8850bf83
JH
708 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
709 * the bitops (especially ~) can create illegal UTF-8.
710 * In other words: in Perl UTF-8 is not just for Unicode. */
711
a0ed51b3
LW
712 if (off >= 0) {
713 while (off--)
714 s += UTF8SKIP(s);
715 }
716 else {
717 while (off++) {
718 s--;
8850bf83
JH
719 while (UTF8_IS_CONTINUATION(*s))
720 s--;
a0ed51b3
LW
721 }
722 }
723 return s;
724}
725
6940069f 726/*
eebe1485 727=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 728
1e54db1a 729Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
730Unlike C<bytes_to_utf8>, this over-writes the original string, and
731updates len to contain the new length.
67e989fb 732Returns zero on failure, setting C<len> to -1.
6940069f
GS
733
734=cut
735*/
736
737U8 *
37607a96 738Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 739{
6940069f
GS
740 U8 *send;
741 U8 *d;
dcad2880 742 U8 *save = s;
246fae53 743
1e54db1a 744 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880
JH
745 for (send = s + *len; s < send; ) {
746 U8 c = *s++;
747
1d72bdf6
NIS
748 if (!UTF8_IS_INVARIANT(c) &&
749 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
750 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
751 *len = -1;
752 return 0;
753 }
246fae53 754 }
dcad2880
JH
755
756 d = s = save;
6940069f 757 while (s < send) {
ed646e6e 758 STRLEN ulen;
9041c2e3 759 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 760 s += ulen;
6940069f
GS
761 }
762 *d = '\0';
246fae53 763 *len = d - save;
6940069f
GS
764 return save;
765}
766
767/*
f9a63242
JH
768=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
769
1e54db1a 770Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
f9a63242 771Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
772the newly-created string, and updates C<len> to contain the new
773length. Returns the original string if no conversion occurs, C<len>
774is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7750 if C<s> is converted or contains all 7bit characters.
f9a63242 776
37607a96
PK
777=cut
778*/
f9a63242
JH
779
780U8 *
37607a96 781Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 782{
f9a63242
JH
783 U8 *d;
784 U8 *start = s;
db42d148 785 U8 *send;
f9a63242
JH
786 I32 count = 0;
787
788 if (!*is_utf8)
789 return start;
790
1e54db1a 791 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242
JH
792 for (send = s + *len; s < send;) {
793 U8 c = *s++;
1d72bdf6 794 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
795 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
796 (c = *s++) && UTF8_IS_CONTINUATION(c))
797 count++;
798 else
f9a63242 799 return start;
db42d148 800 }
f9a63242
JH
801 }
802
803 *is_utf8 = 0;
804
f9a63242 805 Newz(801, d, (*len) - count + 1, U8);
ef9edfd0 806 s = start; start = d;
f9a63242
JH
807 while (s < send) {
808 U8 c = *s++;
c4d5f83a
NIS
809 if (!UTF8_IS_INVARIANT(c)) {
810 /* Then it is two-byte encoded */
811 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
812 c = ASCII_TO_NATIVE(c);
813 }
814 *d++ = c;
f9a63242
JH
815 }
816 *d = '\0';
817 *len = d - start;
818 return start;
819}
820
821/*
eebe1485 822=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 823
1e54db1a 824Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
825Returns a pointer to the newly-created string, and sets C<len> to
826reflect the new length.
6940069f 827
1e54db1a 828If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
829see sv_recode_to_utf8().
830
497711e7 831=cut
6940069f
GS
832*/
833
834U8*
37607a96 835Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 836{
6940069f
GS
837 U8 *send;
838 U8 *d;
839 U8 *dst;
6662521e 840 send = s + (*len);
6940069f 841
6662521e 842 Newz(801, d, (*len) * 2 + 1, U8);
6940069f
GS
843 dst = d;
844
845 while (s < send) {
db42d148 846 UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 847 if (UNI_IS_INVARIANT(uv))
eb160463 848 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 849 else {
eb160463
GS
850 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
851 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
852 }
853 }
854 *d = '\0';
6662521e 855 *len = d-dst;
6940069f
GS
856 return dst;
857}
858
a0ed51b3 859/*
dea0fc0b 860 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
861 *
862 * Destination must be pre-extended to 3/2 source. Do not use in-place.
863 * We optimize for native, for obvious reasons. */
864
865U8*
dea0fc0b 866Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 867{
dea0fc0b
JH
868 U8* pend;
869 U8* dstart = d;
870
1de9afcd
RGS
871 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
872 d[0] = 0;
873 *newlen = 1;
874 return d;
875 }
876
dea0fc0b 877 if (bytelen & 1)
1de9afcd 878 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %d", bytelen);
dea0fc0b
JH
879
880 pend = p + bytelen;
881
a0ed51b3 882 while (p < pend) {
dea0fc0b
JH
883 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
884 p += 2;
a0ed51b3 885 if (uv < 0x80) {
eb160463 886 *d++ = (U8)uv;
a0ed51b3
LW
887 continue;
888 }
889 if (uv < 0x800) {
eb160463
GS
890 *d++ = (U8)(( uv >> 6) | 0xc0);
891 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
892 continue;
893 }
894 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
TD
895 UV low = (p[0] << 8) + p[1];
896 p += 2;
dea0fc0b
JH
897 if (low < 0xdc00 || low >= 0xdfff)
898 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
899 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
900 }
901 if (uv < 0x10000) {
eb160463
GS
902 *d++ = (U8)(( uv >> 12) | 0xe0);
903 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
904 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
905 continue;
906 }
907 else {
eb160463
GS
908 *d++ = (U8)(( uv >> 18) | 0xf0);
909 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
910 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
911 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
912 continue;
913 }
914 }
dea0fc0b 915 *newlen = d - dstart;
a0ed51b3
LW
916 return d;
917}
918
919/* Note: this one is slightly destructive of the source. */
920
921U8*
dea0fc0b 922Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
923{
924 U8* s = (U8*)p;
925 U8* send = s + bytelen;
926 while (s < send) {
927 U8 tmp = s[0];
928 s[0] = s[1];
929 s[1] = tmp;
930 s += 2;
931 }
dea0fc0b 932 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
933}
934
935/* for now these are all defined (inefficiently) in terms of the utf8 versions */
936
937bool
84afefe6 938Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 939{
ad391ad9 940 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 941 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
942 return is_utf8_alnum(tmpbuf);
943}
944
945bool
84afefe6 946Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 947{
ad391ad9 948 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 949 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
950 return is_utf8_alnumc(tmpbuf);
951}
952
953bool
84afefe6 954Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 955{
ad391ad9 956 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 957 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
958 return is_utf8_idfirst(tmpbuf);
959}
960
961bool
84afefe6 962Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 963{
ad391ad9 964 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 965 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
966 return is_utf8_alpha(tmpbuf);
967}
968
969bool
84afefe6 970Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 971{
ad391ad9 972 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 973 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
974 return is_utf8_ascii(tmpbuf);
975}
976
977bool
84afefe6 978Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 979{
ad391ad9 980 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 981 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
982 return is_utf8_space(tmpbuf);
983}
984
985bool
84afefe6 986Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 987{
ad391ad9 988 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 989 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
990 return is_utf8_digit(tmpbuf);
991}
992
993bool
84afefe6 994Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 995{
ad391ad9 996 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 997 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
998 return is_utf8_upper(tmpbuf);
999}
1000
1001bool
84afefe6 1002Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1003{
ad391ad9 1004 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1005 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1006 return is_utf8_lower(tmpbuf);
1007}
1008
1009bool
84afefe6 1010Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1011{
ad391ad9 1012 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1013 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1014 return is_utf8_cntrl(tmpbuf);
1015}
1016
1017bool
84afefe6 1018Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1019{
ad391ad9 1020 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1021 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1022 return is_utf8_graph(tmpbuf);
1023}
1024
1025bool
84afefe6 1026Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1027{
ad391ad9 1028 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1029 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1030 return is_utf8_print(tmpbuf);
1031}
1032
b8c5462f 1033bool
84afefe6 1034Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1035{
ad391ad9 1036 U8 tmpbuf[UTF8_MAXLEN+1];
230880c1 1037 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1038 return is_utf8_punct(tmpbuf);
1039}
1040
4d61ec05 1041bool
84afefe6 1042Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1043{
e7ae6809 1044 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
230880c1 1045 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1046 return is_utf8_xdigit(tmpbuf);
1047}
1048
84afefe6
JH
1049UV
1050Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1051{
0ebc6274
JH
1052 uvchr_to_utf8(p, c);
1053 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1054}
1055
84afefe6
JH
1056UV
1057Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1058{
0ebc6274
JH
1059 uvchr_to_utf8(p, c);
1060 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1061}
1062
84afefe6
JH
1063UV
1064Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1065{
0ebc6274
JH
1066 uvchr_to_utf8(p, c);
1067 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1068}
1069
84afefe6
JH
1070UV
1071Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1072{
0ebc6274
JH
1073 uvchr_to_utf8(p, c);
1074 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1075}
1076
a0ed51b3
LW
1077/* for now these all assume no locale info available for Unicode > 255 */
1078
1079bool
84afefe6 1080Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1081{
1082 return is_uni_alnum(c); /* XXX no locale support yet */
1083}
1084
1085bool
84afefe6 1086Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1087{
1088 return is_uni_alnumc(c); /* XXX no locale support yet */
1089}
1090
1091bool
84afefe6 1092Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1093{
1094 return is_uni_idfirst(c); /* XXX no locale support yet */
1095}
1096
1097bool
84afefe6 1098Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1099{
1100 return is_uni_alpha(c); /* XXX no locale support yet */
1101}
1102
1103bool
84afefe6 1104Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1105{
1106 return is_uni_ascii(c); /* XXX no locale support yet */
1107}
1108
1109bool
84afefe6 1110Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1111{
1112 return is_uni_space(c); /* XXX no locale support yet */
1113}
1114
1115bool
84afefe6 1116Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1117{
1118 return is_uni_digit(c); /* XXX no locale support yet */
1119}
1120
1121bool
84afefe6 1122Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1123{
1124 return is_uni_upper(c); /* XXX no locale support yet */
1125}
1126
1127bool
84afefe6 1128Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1129{
1130 return is_uni_lower(c); /* XXX no locale support yet */
1131}
1132
1133bool
84afefe6 1134Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1135{
1136 return is_uni_cntrl(c); /* XXX no locale support yet */
1137}
1138
1139bool
84afefe6 1140Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1141{
1142 return is_uni_graph(c); /* XXX no locale support yet */
1143}
1144
1145bool
84afefe6 1146Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1147{
1148 return is_uni_print(c); /* XXX no locale support yet */
1149}
1150
b8c5462f 1151bool
84afefe6 1152Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1153{
1154 return is_uni_punct(c); /* XXX no locale support yet */
1155}
1156
4d61ec05 1157bool
84afefe6 1158Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1159{
1160 return is_uni_xdigit(c); /* XXX no locale support yet */
1161}
1162
b7ac61fa
JH
1163U32
1164Perl_to_uni_upper_lc(pTHX_ U32 c)
1165{
ee099d14
JH
1166 /* XXX returns only the first character -- do not use XXX */
1167 /* XXX no locale support yet */
1168 STRLEN len;
1169 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1170 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1171}
1172
1173U32
1174Perl_to_uni_title_lc(pTHX_ U32 c)
1175{
ee099d14
JH
1176 /* XXX returns only the first character XXX -- do not use XXX */
1177 /* XXX no locale support yet */
1178 STRLEN len;
1179 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1180 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1181}
1182
1183U32
1184Perl_to_uni_lower_lc(pTHX_ U32 c)
1185{
ee099d14
JH
1186 /* XXX returns only the first character -- do not use XXX */
1187 /* XXX no locale support yet */
1188 STRLEN len;
1189 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1190 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1191}
1192
a0ed51b3 1193bool
864dbfa3 1194Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1195{
386d01d6
GS
1196 if (!is_utf8_char(p))
1197 return FALSE;
a0ed51b3 1198 if (!PL_utf8_alnum)
289d4f09
ML
1199 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1200 * descendant of isalnum(3), in other words, it doesn't
1201 * contain the '_'. --jhi */
1202 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
eb160463 1203 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1204/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1205#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1206 if (!PL_utf8_alnum)
1207 PL_utf8_alnum = swash_init("utf8", "",
1208 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1209 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1210#endif
1211}
1212
1213bool
b8c5462f
JH
1214Perl_is_utf8_alnumc(pTHX_ U8 *p)
1215{
386d01d6
GS
1216 if (!is_utf8_char(p))
1217 return FALSE;
b8c5462f
JH
1218 if (!PL_utf8_alnum)
1219 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
eb160463 1220 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1221/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1222#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1223 if (!PL_utf8_alnum)
1224 PL_utf8_alnum = swash_init("utf8", "",
1225 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1226 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1227#endif
1228}
1229
1230bool
82686b01 1231Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
a0ed51b3 1232{
82686b01
JH
1233 if (*p == '_')
1234 return TRUE;
1235 if (!is_utf8_char(p))
1236 return FALSE;
1237 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1238 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
eb160463 1239 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
82686b01
JH
1240}
1241
1242bool
1243Perl_is_utf8_idcont(pTHX_ U8 *p)
1244{
1245 if (*p == '_')
1246 return TRUE;
1247 if (!is_utf8_char(p))
1248 return FALSE;
1249 if (!PL_utf8_idcont)
1250 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
eb160463 1251 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
a0ed51b3
LW
1252}
1253
1254bool
864dbfa3 1255Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1256{
386d01d6
GS
1257 if (!is_utf8_char(p))
1258 return FALSE;
a0ed51b3 1259 if (!PL_utf8_alpha)
e24b16f9 1260 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
eb160463 1261 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
a0ed51b3
LW
1262}
1263
1264bool
b8c5462f
JH
1265Perl_is_utf8_ascii(pTHX_ U8 *p)
1266{
386d01d6
GS
1267 if (!is_utf8_char(p))
1268 return FALSE;
b8c5462f
JH
1269 if (!PL_utf8_ascii)
1270 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
eb160463 1271 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
b8c5462f
JH
1272}
1273
1274bool
864dbfa3 1275Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1276{
386d01d6
GS
1277 if (!is_utf8_char(p))
1278 return FALSE;
a0ed51b3 1279 if (!PL_utf8_space)
3bec3564 1280 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
eb160463 1281 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
a0ed51b3
LW
1282}
1283
1284bool
864dbfa3 1285Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1286{
386d01d6
GS
1287 if (!is_utf8_char(p))
1288 return FALSE;
a0ed51b3 1289 if (!PL_utf8_digit)
e24b16f9 1290 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
eb160463 1291 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
a0ed51b3
LW
1292}
1293
1294bool
864dbfa3 1295Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1296{
386d01d6
GS
1297 if (!is_utf8_char(p))
1298 return FALSE;
a0ed51b3 1299 if (!PL_utf8_upper)
c65e4d19 1300 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
eb160463 1301 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
a0ed51b3
LW
1302}
1303
1304bool
864dbfa3 1305Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1306{
386d01d6
GS
1307 if (!is_utf8_char(p))
1308 return FALSE;
a0ed51b3 1309 if (!PL_utf8_lower)
c65e4d19 1310 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
eb160463 1311 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
a0ed51b3
LW
1312}
1313
1314bool
b8c5462f
JH
1315Perl_is_utf8_cntrl(pTHX_ U8 *p)
1316{
386d01d6
GS
1317 if (!is_utf8_char(p))
1318 return FALSE;
b8c5462f
JH
1319 if (!PL_utf8_cntrl)
1320 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
eb160463 1321 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
b8c5462f
JH
1322}
1323
1324bool
1325Perl_is_utf8_graph(pTHX_ U8 *p)
1326{
386d01d6
GS
1327 if (!is_utf8_char(p))
1328 return FALSE;
b8c5462f
JH
1329 if (!PL_utf8_graph)
1330 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
eb160463 1331 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
b8c5462f
JH
1332}
1333
1334bool
864dbfa3 1335Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1336{
386d01d6
GS
1337 if (!is_utf8_char(p))
1338 return FALSE;
a0ed51b3 1339 if (!PL_utf8_print)
e24b16f9 1340 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
eb160463 1341 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
a0ed51b3
LW
1342}
1343
1344bool
b8c5462f
JH
1345Perl_is_utf8_punct(pTHX_ U8 *p)
1346{
386d01d6
GS
1347 if (!is_utf8_char(p))
1348 return FALSE;
b8c5462f
JH
1349 if (!PL_utf8_punct)
1350 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
eb160463 1351 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
b8c5462f
JH
1352}
1353
1354bool
1355Perl_is_utf8_xdigit(pTHX_ U8 *p)
1356{
386d01d6
GS
1357 if (!is_utf8_char(p))
1358 return FALSE;
b8c5462f
JH
1359 if (!PL_utf8_xdigit)
1360 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
eb160463 1361 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
b8c5462f
JH
1362}
1363
1364bool
864dbfa3 1365Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1366{
386d01d6
GS
1367 if (!is_utf8_char(p))
1368 return FALSE;
a0ed51b3 1369 if (!PL_utf8_mark)
e24b16f9 1370 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
eb160463 1371 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
a0ed51b3
LW
1372}
1373
6b5c0936
JH
1374/*
1375=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1376
1377The "p" contains the pointer to the UTF-8 string encoding
1378the character that is being converted.
1379
1380The "ustrp" is a pointer to the character buffer to put the
1381conversion result to. The "lenp" is a pointer to the length
1382of the result.
1383
0134edef 1384The "swashp" is a pointer to the swash to use.
6b5c0936 1385
0134edef
JH
1386Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1387and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1388but not always, a multicharacter mapping), is tried first.
6b5c0936 1389
0134edef
JH
1390The "special" is a string like "utf8::ToSpecLower", which means the
1391hash %utf8::ToSpecLower. The access to the hash is through
1392Perl_to_utf8_case().
6b5c0936 1393
0134edef
JH
1394The "normal" is a string like "ToLower" which means the swash
1395%utf8::ToLower.
1396
1397=cut */
6b5c0936 1398
2104c8d9 1399UV
a6872d42 1400Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
a0ed51b3 1401{
0134edef 1402 UV uv0, uv1;
2f9475ad 1403 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
0134edef 1404 STRLEN len = 0;
a0ed51b3 1405
1feea2c7
JH
1406 uv0 = utf8_to_uvchr(p, 0);
1407 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1408 * are necessary in EBCDIC, they are redundant no-ops
1409 * in ASCII-ish platforms, and hopefully optimized away. */
1410 uv1 = NATIVE_TO_UNI(uv0);
1411 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1412
1413 if (!*swashp) /* load on-demand */
1414 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1415
b08cf34e
JH
1416 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1417 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1418 /* It might be "special" (sometimes, but not always,
2a37f04d 1419 * a multicharacter mapping) */
983ffd37 1420 HV *hv;
b08cf34e
JH
1421 SV **svp;
1422
1423 if ((hv = get_hv(special, FALSE)) &&
1424 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1425 (*svp)) {
1426 char *s;
47654450 1427
b08cf34e 1428 s = SvPV(*svp, len);
47654450
JH
1429 if (len == 1)
1430 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1431 else {
2f9475ad
JH
1432#ifdef EBCDIC
1433 /* If we have EBCDIC we need to remap the characters
1434 * since any characters in the low 256 are Unicode
1435 * code points, not EBCDIC. */
7cda7a3d 1436 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1437
1438 d = tmpbuf;
b08cf34e 1439 if (SvUTF8(*svp)) {
2f9475ad
JH
1440 STRLEN tlen = 0;
1441
1442 while (t < tend) {
1443 UV c = utf8_to_uvchr(t, &tlen);
1444 if (tlen > 0) {
1445 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1446 t += tlen;
1447 }
1448 else
1449 break;
1450 }
1451 }
1452 else {
36fec512
JH
1453 while (t < tend) {
1454 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1455 t++;
1456 }
2f9475ad
JH
1457 }
1458 len = d - tmpbuf;
1459 Copy(tmpbuf, ustrp, len, U8);
1460#else
d2dcd0fb 1461 Copy(s, ustrp, len, U8);
2f9475ad 1462#endif
29e98929 1463 }
983ffd37 1464 }
0134edef
JH
1465 }
1466
1467 if (!len && *swashp) {
1468 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1469
1470 if (uv2) {
1471 /* It was "normal" (a single character mapping). */
1472 UV uv3 = UNI_TO_NATIVE(uv2);
1473
e9101d72 1474 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1475 }
1476 }
1feea2c7 1477
0134edef
JH
1478 if (!len) /* Neither: just copy. */
1479 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1480
2a37f04d
JH
1481 if (lenp)
1482 *lenp = len;
1483
0134edef 1484 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1485}
1486
d3e79532
JH
1487/*
1488=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1489
1490Convert the UTF-8 encoded character at p to its uppercase version and
1491store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1492that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1493uppercase version may be longer than the original character (up to two
1494characters).
1495
1496The first character of the uppercased version is returned
1497(but note, as explained above, that there may be more.)
1498
1499=cut */
1500
2104c8d9 1501UV
983ffd37 1502Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1503{
983ffd37 1504 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1505 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1506}
a0ed51b3 1507
d3e79532
JH
1508/*
1509=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1510
1511Convert the UTF-8 encoded character at p to its titlecase version and
1512store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1513that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1514titlecase version may be longer than the original character (up to two
1515characters).
1516
1517The first character of the titlecased version is returned
1518(but note, as explained above, that there may be more.)
1519
1520=cut */
1521
983ffd37
JH
1522UV
1523Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1524{
1525 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1526 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1527}
1528
d3e79532
JH
1529/*
1530=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1531
1532Convert the UTF-8 encoded character at p to its lowercase version and
1533store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1534that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1535lowercase version may be longer than the original character (up to two
1536characters).
1537
1538The first character of the lowercased version is returned
1539(but note, as explained above, that there may be more.)
1540
1541=cut */
1542
2104c8d9 1543UV
a2a2844f 1544Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1545{
983ffd37 1546 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1547 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1548}
1549
d3e79532
JH
1550/*
1551=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1552
1553Convert the UTF-8 encoded character at p to its foldcase version and
1554store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1555that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1556foldcase version may be longer than the original character (up to
1557three characters).
1558
1559The first character of the foldcased version is returned
1560(but note, as explained above, that there may be more.)
1561
1562=cut */
1563
b4e400f9
JH
1564UV
1565Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1566{
1567 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1568 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1569}
1570
1571/* a "swash" is a swatch hash */
1572
1573SV*
864dbfa3 1574Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3
LW
1575{
1576 SV* retval;
bf1fed83 1577 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
8e84507e 1578 dSP;
71bed85a
NC
1579 size_t pkg_len = strlen(pkg);
1580 size_t name_len = strlen(name);
1581 HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1582 SV* errsv_save;
ce3b816e 1583
1b026014 1584 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1585 ENTER;
f8be5cf0 1586 errsv_save = newSVsv(ERRSV);
71bed85a
NC
1587 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1588 Nullsv);
f8be5cf0
JH
1589 if (!SvTRUE(ERRSV))
1590 sv_setsv(ERRSV, errsv_save);
1591 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1592 LEAVE;
1593 }
1594 SPAGAIN;
a0ed51b3
LW
1595 PUSHSTACKi(PERLSI_MAGIC);
1596 PUSHMARK(SP);
1597 EXTEND(SP,5);
71bed85a
NC
1598 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1599 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1600 PUSHs(listsv);
1601 PUSHs(sv_2mortal(newSViv(minbits)));
1602 PUSHs(sv_2mortal(newSViv(none)));
1603 PUTBACK;
1604 ENTER;
1605 SAVEI32(PL_hints);
1606 PL_hints = 0;
1607 save_re_context();
923e4eb5 1608 if (IN_PERL_COMPILETIME) {
bf1fed83 1609 /* XXX ought to be handled by lex_start */
82686b01 1610 SAVEI32(PL_in_my);
2b4bd638 1611 PL_in_my = 0;
bf1fed83 1612 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1613 }
f8be5cf0 1614 errsv_save = newSVsv(ERRSV);
864dbfa3 1615 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1616 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1617 else
e24b16f9 1618 retval = &PL_sv_undef;
f8be5cf0
JH
1619 if (!SvTRUE(ERRSV))
1620 sv_setsv(ERRSV, errsv_save);
1621 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1622 LEAVE;
1623 POPSTACK;
923e4eb5 1624 if (IN_PERL_COMPILETIME) {
bf1fed83
JH
1625 STRLEN len;
1626 char* pv = SvPV(tokenbufsv, len);
1627
1628 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1629 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1630 }
bc45ce41
JH
1631 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1632 if (SvPOK(retval))
35c1215d
NC
1633 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1634 retval);
cea2e8a9 1635 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1636 }
a0ed51b3
LW
1637 return retval;
1638}
1639
035d37be
JH
1640
1641/* This API is wrong for special case conversions since we may need to
1642 * return several Unicode characters for a single Unicode character
1643 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1644 * the lower-level routine, and it is similarly broken for returning
1645 * multiple values. --jhi */
a0ed51b3 1646UV
3568d838 1647Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3
LW
1648{
1649 HV* hv = (HV*)SvRV(sv);
3568d838
JH
1650 U32 klen;
1651 U32 off;
a0ed51b3 1652 STRLEN slen;
7d85a32c 1653 STRLEN needents;
4ea42e7f 1654 U8 *tmps = NULL;
a0ed51b3
LW
1655 U32 bit;
1656 SV *retval;
3568d838
JH
1657 U8 tmputf8[2];
1658 UV c = NATIVE_TO_ASCII(*ptr);
1659
1660 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463
GS
1661 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1662 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838
JH
1663 ptr = tmputf8;
1664 }
1665 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1666 * then the "swatch" is a vec() for al the chars which start
1667 * with 0xAA..0xYY
1668 * So the key in the hash (klen) is length of encoded char -1
1669 */
1670 klen = UTF8SKIP(ptr) - 1;
1671 off = ptr[klen];
a0ed51b3 1672
7d85a32c
JH
1673 if (klen == 0)
1674 {
1675 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1676 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c
JH
1677 */
1678 needents = UTF_CONTINUATION_MARK;
1679 off = NATIVE_TO_UTF(ptr[klen]);
1680 }
1681 else
1682 {
1683 /* If char is encoded then swatch is for the prefix */
1684 needents = (1 << UTF_ACCUMULATION_SHIFT);
1685 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1686 }
1687
a0ed51b3
LW
1688 /*
1689 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1690 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1691 * it's nothing to sniff at.) Pity we usually come through at least
1692 * two function calls to get here...
1693 *
1694 * NB: this code assumes that swatches are never modified, once generated!
1695 */
1696
3568d838 1697 if (hv == PL_last_swash_hv &&
a0ed51b3 1698 klen == PL_last_swash_klen &&
3568d838 1699 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1700 {
1701 tmps = PL_last_swash_tmps;
1702 slen = PL_last_swash_slen;
1703 }
1704 else {
1705 /* Try our second-level swatch cache, kept in a hash. */
dfe13c55 1706 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
a0ed51b3
LW
1707
1708 /* If not cached, generate it via utf8::SWASHGET */
dfe13c55 1709 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
a0ed51b3 1710 dSP;
2b9d42f0
NIS
1711 /* We use utf8n_to_uvuni() as we want an index into
1712 Unicode tables, not a native character number.
1713 */
872c91ae
JH
1714 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1715 ckWARN(WARN_UTF8) ?
1716 0 : UTF8_ALLOW_ANY);
f8be5cf0 1717 SV *errsv_save;
a0ed51b3
LW
1718 ENTER;
1719 SAVETMPS;
1720 save_re_context();
1721 PUSHSTACKi(PERLSI_MAGIC);
1722 PUSHMARK(SP);
1723 EXTEND(SP,3);
1724 PUSHs((SV*)sv);
ffbc6a93 1725 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838
JH
1726 PUSHs(sv_2mortal(newSViv((klen) ?
1727 (code_point & ~(needents - 1)) : 0)));
a0ed51b3
LW
1728 PUSHs(sv_2mortal(newSViv(needents)));
1729 PUTBACK;
f8be5cf0 1730 errsv_save = newSVsv(ERRSV);
864dbfa3 1731 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1732 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1733 else
e24b16f9 1734 retval = &PL_sv_undef;
f8be5cf0
JH
1735 if (!SvTRUE(ERRSV))
1736 sv_setsv(ERRSV, errsv_save);
1737 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1738 POPSTACK;
1739 FREETMPS;
1740 LEAVE;
923e4eb5 1741 if (IN_PERL_COMPILETIME)
eb160463 1742 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1743
dfe13c55 1744 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
a0ed51b3 1745
7d85a32c 1746 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1747 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1748 }
1749
1750 PL_last_swash_hv = hv;
1751 PL_last_swash_klen = klen;
1752 PL_last_swash_tmps = tmps;
1753 PL_last_swash_slen = slen;
1754 if (klen)
1755 Copy(ptr, PL_last_swash_key, klen, U8);
1756 }
1757
9faf8d75 1758 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1759 case 1:
1760 bit = 1 << (off & 7);
1761 off >>= 3;
1762 return (tmps[off] & bit) != 0;
1763 case 8:
1764 return tmps[off];
1765 case 16:
1766 off <<= 1;
1767 return (tmps[off] << 8) + tmps[off + 1] ;
1768 case 32:
1769 off <<= 2;
1770 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1771 }
cea2e8a9 1772 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1773 return 0;
1774}
2b9d42f0
NIS
1775
1776
1777/*
37607a96 1778=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1779
1e54db1a 1780Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2b9d42f0
NIS
1781of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1782bytes available. The return value is the pointer to the byte after the
1783end of the new character. In other words,
1784
1785 d = uvchr_to_utf8(d, uv);
1786
1787is the recommended wide native character-aware way of saying
1788
1789 *(d++) = uv;
1790
1791=cut
1792*/
1793
1794/* On ASCII machines this is normally a macro but we want a
1795 real function in case XS code wants it
1796*/
1797#undef Perl_uvchr_to_utf8
1798U8 *
1799Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1800{
b851fbc1 1801 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0
NIS
1802}
1803
b851fbc1
JH
1804U8 *
1805Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1806{
1807 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1808}
2b9d42f0
NIS
1809
1810/*
37607a96 1811=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0
NIS
1812
1813Returns the native character value of the first character in the string C<s>
1e54db1a 1814which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2b9d42f0
NIS
1815length, in bytes, of that character.
1816
1817Allows length and flags to be passed to low level routine.
1818
1819=cut
1820*/
0a2ef054
JH
1821/* On ASCII machines this is normally a macro but we want
1822 a real function in case XS code wants it
2b9d42f0
NIS
1823*/
1824#undef Perl_utf8n_to_uvchr
1825UV
37607a96 1826Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0
NIS
1827{
1828 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1829 return UNI_TO_NATIVE(uv);
1830}
1831
d2cc3551
JH
1832/*
1833=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1834
1835Build to the scalar dsv a displayable version of the string spv,
1836length len, the displayable version being at most pvlim bytes long
1837(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1838
9e55ce06 1839The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1840isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
1841to display the \\[nrfta\\] as the backslashed versions (like '\n')
1842(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1843UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1844UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1845
d2cc3551
JH
1846The pointer to the PV of the dsv is returned.
1847
1848=cut */
e6b2e755
JH
1849char *
1850Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1851{
1852 int truncated = 0;
1853 char *s, *e;
1854
1855 sv_setpvn(dsv, "", 0);
1856 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1857 UV u;
c728cb41
JH
1858 bool ok = FALSE;
1859
e6b2e755
JH
1860 if (pvlim && SvCUR(dsv) >= pvlim) {
1861 truncated++;
1862 break;
1863 }
1864 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1865 if (u < 256) {
c728cb41
JH
1866 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1867 switch (u & 0xFF) {
1868 case '\n':
1869 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1870 case '\r':
1871 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1872 case '\t':
1873 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1874 case '\f':
1875 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1876 case '\a':
1877 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1878 case '\\':
d79a7a3d 1879 Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
c728cb41
JH
1880 default: break;
1881 }
1882 }
00e86452
JH
1883 /* isPRINT() is the locale-blind version. */
1884 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
2c4547fe 1885 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
0a2ef054
JH
1886 ok = TRUE;
1887 }
c728cb41
JH
1888 }
1889 if (!ok)
9e55ce06 1890 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
1891 }
1892 if (truncated)
1893 sv_catpvn(dsv, "...", 3);
1894
1895 return SvPVX(dsv);
1896}
2b9d42f0 1897
d2cc3551
JH
1898/*
1899=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1900
1901Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1902the displayable version being at most pvlim bytes long
d2cc3551 1903(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
1904
1905The flags argument is as in pv_uni_display().
1906
d2cc3551
JH
1907The pointer to the PV of the dsv is returned.
1908
1909=cut */
e6b2e755
JH
1910char *
1911Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1912{
701a277b
JH
1913 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1914 pvlim, flags);
1915}
1916
d2cc3551 1917/*
d07ddd77 1918=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
1919
1920Return true if the strings s1 and s2 differ case-insensitively, false
1921if not (if they are equal case-insensitively). If u1 is true, the
1922string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
1923the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1924are false, the respective string is assumed to be in native 8-bit
1925encoding.
1926
1927If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1928in there (they will point at the beginning of the I<next> character).
1929If the pointers behind pe1 or pe2 are non-NULL, they are the end
1930pointers beyond which scanning will not continue under any
1931circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1932s2+l2 will be used as goal end pointers that will also stop the scan,
1933and which qualify towards defining a successful match: all the scans
1934that define an explicit length must reach their goal pointers for
1935a match to succeed).
d2cc3551
JH
1936
1937For case-insensitiveness, the "casefolding" of Unicode is used
1938instead of upper/lowercasing both the characters, see
1939http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1940
1941=cut */
701a277b 1942I32
d07ddd77 1943Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 1944{
5469e704
JH
1945 register U8 *p1 = (U8*)s1;
1946 register U8 *p2 = (U8*)s2;
d07ddd77
JH
1947 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1948 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1949 STRLEN n1 = 0, n2 = 0;
ffce6cc2
JH
1950 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1951 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
d7f013c8
JH
1952 U8 natbuf[1+1];
1953 STRLEN foldlen1, foldlen2;
d07ddd77 1954 bool match;
332ddc25 1955
d07ddd77
JH
1956 if (pe1)
1957 e1 = *(U8**)pe1;
eb160463 1958 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
d07ddd77
JH
1959 f1 = (U8*)s1 + l1;
1960 if (pe2)
1961 e2 = *(U8**)pe2;
eb160463 1962 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
d07ddd77
JH
1963 f2 = (U8*)s2 + l2;
1964
1965 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1966 return 1; /* mismatch; possible infinite loop or false positive */
1967
a6872d42
JH
1968 if (!u1 || !u2)
1969 natbuf[1] = 0; /* Need to terminate the buffer. */
1970
d07ddd77
JH
1971 while ((e1 == 0 || p1 < e1) &&
1972 (f1 == 0 || p1 < f1) &&
1973 (e2 == 0 || p2 < e2) &&
1974 (f2 == 0 || p2 < f2)) {
1975 if (n1 == 0) {
d7f013c8
JH
1976 if (u1)
1977 to_utf8_fold(p1, foldbuf1, &foldlen1);
1978 else {
f5cee151 1979 natbuf[0] = *p1;
d7f013c8
JH
1980 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1981 }
1982 q1 = foldbuf1;
d07ddd77 1983 n1 = foldlen1;
332ddc25 1984 }
d07ddd77 1985 if (n2 == 0) {
d7f013c8
JH
1986 if (u2)
1987 to_utf8_fold(p2, foldbuf2, &foldlen2);
1988 else {
f5cee151 1989 natbuf[0] = *p2;
d7f013c8
JH
1990 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1991 }
1992 q2 = foldbuf2;
d07ddd77 1993 n2 = foldlen2;
332ddc25 1994 }
d07ddd77
JH
1995 while (n1 && n2) {
1996 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1997 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1998 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 1999 return 1; /* mismatch */
d07ddd77 2000 n1 -= UTF8SKIP(q1);
d7f013c8 2001 q1 += UTF8SKIP(q1);
d07ddd77 2002 n2 -= UTF8SKIP(q2);
d7f013c8 2003 q2 += UTF8SKIP(q2);
701a277b 2004 }
d07ddd77 2005 if (n1 == 0)
d7f013c8 2006 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2007 if (n2 == 0)
d7f013c8
JH
2008 p2 += u2 ? UTF8SKIP(p2) : 1;
2009
d2cc3551 2010 }
5469e704 2011
d07ddd77
JH
2012 /* A match is defined by all the scans that specified
2013 * an explicit length reaching their final goals. */
2014 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2015
2016 if (match) {
d07ddd77
JH
2017 if (pe1)
2018 *pe1 = (char*)p1;
2019 if (pe2)
2020 *pe2 = (char*)p2;
5469e704
JH
2021 }
2022
2023 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2024}
701a277b 2025