This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
fix signed/unsigned nit in reentr.c
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
b94e2f88
RGS
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3
LW
26#include "perl.h"
27
27da23d5
JH
28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7
JH
31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29
DM
34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1
JH
47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1
JH
53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485
SC
57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6
NIS
90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463
GS
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
104 return d;
105 }
106 if (uv < 0x10000) {
eb160463
GS
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
110 return d;
111 }
112 if (uv < 0x200000) {
eb160463
GS
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463
GS
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
138#endif
139 {
eb160463
GS
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463
GS
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
9041c2e3 169
646ca15d
JH
170/*
171
172Tests if some arbitrary number of bytes begins in a valid UTF-8
173character. Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character. The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
176
177This is the "slow" version as opposed to the "fast" version which is
178the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
179difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
180or less you should use the IS_UTF8_CHAR(), for lengths of five or more
181you should use the _slow(). In practice this means that the _slow()
182will be used very rarely, since the maximum Unicode code point (as of
183Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
184the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
185five bytes or more.
186
187=cut */
c053b435 188STATIC STRLEN
5f66b61c 189S_is_utf8_char_slow(const U8 *s, const STRLEN len)
646ca15d
JH
190{
191 U8 u = *s;
192 STRLEN slen;
193 UV uv, ouv;
194
195 if (UTF8_IS_INVARIANT(u))
196 return 1;
197
198 if (!UTF8_IS_START(u))
199 return 0;
200
201 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
202 return 0;
203
204 slen = len - 1;
205 s++;
77263263
TS
206#ifdef EBCDIC
207 u = NATIVE_TO_UTF(u);
208#endif
646ca15d
JH
209 u &= UTF_START_MASK(len);
210 uv = u;
211 ouv = uv;
212 while (slen--) {
213 if (!UTF8_IS_CONTINUATION(*s))
214 return 0;
215 uv = UTF8_ACCUMULATE(uv, *s);
216 if (uv < ouv)
217 return 0;
218 ouv = uv;
219 s++;
220 }
221
222 if ((STRLEN)UNISKIP(uv) < len)
223 return 0;
224
225 return len;
226}
9041c2e3
NIS
227
228/*
7fc63493 229=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 230
5da9da9e 231Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
232character. Note that an INVARIANT (i.e. ASCII) character is a valid
233UTF-8 character. The actual number of bytes in the UTF-8 character
234will be returned if it is valid, otherwise 0.
9041c2e3 235
82686b01 236=cut */
067a85ef 237STRLEN
7fc63493 238Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 239{
44f8325f 240 const STRLEN len = UTF8SKIP(s);
96a5add6 241 PERL_UNUSED_CONTEXT;
3b0fc154 242#ifdef IS_UTF8_CHAR
768c67ee 243 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
244 return IS_UTF8_CHAR(s, len) ? len : 0;
245#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 246 return is_utf8_char_slow(s, len);
386d01d6
GS
247}
248
6662521e 249/*
7fc63493 250=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 251
c9ada85f 252Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
253UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
254not mean 'a string that contains code points above 0x7F encoded in UTF-8'
255because a valid ASCII string is a valid UTF-8 string.
6662521e 256
768c67ee
JH
257See also is_utf8_string_loclen() and is_utf8_string_loc().
258
6662521e
GS
259=cut
260*/
261
8e84507e 262bool
7fc63493 263Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 264{
35da51f7 265 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 266 const U8* x = s;
067a85ef 267
96a5add6 268 PERL_UNUSED_CONTEXT;
1aa99e6b 269
6662521e 270 while (x < send) {
a3b680e6 271 STRLEN c;
1acdb0da
JH
272 /* Inline the easy bits of is_utf8_char() here for speed... */
273 if (UTF8_IS_INVARIANT(*x))
274 c = 1;
275 else if (!UTF8_IS_START(*x))
768c67ee 276 goto out;
1acdb0da
JH
277 else {
278 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
279#ifdef IS_UTF8_CHAR
280 c = UTF8SKIP(x);
768c67ee
JH
281 if (IS_UTF8_CHAR_FAST(c)) {
282 if (!IS_UTF8_CHAR(x, c))
3c614e38
AD
283 c = 0;
284 }
285 else
286 c = is_utf8_char_slow(x, c);
646ca15d
JH
287#else
288 c = is_utf8_char(x);
289#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 290 if (!c)
768c67ee 291 goto out;
1acdb0da 292 }
6662521e 293 x += c;
6662521e 294 }
768c67ee
JH
295
296 out:
60006e79
JH
297 if (x != send)
298 return FALSE;
067a85ef
A
299
300 return TRUE;
6662521e
GS
301}
302
67e989fb 303/*
814fafa7
NC
304Implemented as a macro in utf8.h
305
306=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
307
308Like is_utf8_string() but stores the location of the failure (in the
309case of "utf8ness failure") or the location s+len (in the case of
310"utf8ness success") in the C<ep>.
311
312See also is_utf8_string_loclen() and is_utf8_string().
313
768c67ee 314=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 315
e3e4599f 316Like is_utf8_string() but stores the location of the failure (in the
768c67ee
JH
317case of "utf8ness failure") or the location s+len (in the case of
318"utf8ness success") in the C<ep>, and the number of UTF-8
319encoded characters in the C<el>.
320
321See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
322
323=cut
324*/
325
326bool
768c67ee 327Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 328{
35da51f7 329 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 330 const U8* x = s;
81cd54e3 331 STRLEN c;
3ebfea28 332 STRLEN outlen = 0;
96a5add6 333 PERL_UNUSED_CONTEXT;
81cd54e3 334
81cd54e3
JH
335 while (x < send) {
336 /* Inline the easy bits of is_utf8_char() here for speed... */
337 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
338 c = 1;
339 else if (!UTF8_IS_START(*x))
340 goto out;
81cd54e3 341 else {
768c67ee
JH
342 /* ... and call is_utf8_char() only if really needed. */
343#ifdef IS_UTF8_CHAR
344 c = UTF8SKIP(x);
345 if (IS_UTF8_CHAR_FAST(c)) {
346 if (!IS_UTF8_CHAR(x, c))
347 c = 0;
348 } else
349 c = is_utf8_char_slow(x, c);
350#else
351 c = is_utf8_char(x);
352#endif /* #ifdef IS_UTF8_CHAR */
353 if (!c)
354 goto out;
81cd54e3 355 }
768c67ee 356 x += c;
3ebfea28 357 outlen++;
81cd54e3 358 }
768c67ee
JH
359
360 out:
3ebfea28
AL
361 if (el)
362 *el = outlen;
363
768c67ee
JH
364 if (ep)
365 *ep = x;
3ebfea28 366 return (x == send);
81cd54e3
JH
367}
368
369/*
768c67ee 370
7fc63493 371=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 372
9041c2e3
NIS
373Bottom level UTF-8 decode routine.
374Returns the unicode code point value of the first character in the string C<s>
1e54db1a 375which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 376C<retlen> will be set to the length, in bytes, of that character.
67e989fb 377
1e54db1a 378If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
379is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
380it is assumed that the caller will raise a warning, and this function
28d3d195
JH
381will silently just set C<retlen> to C<-1> and return zero. If the
382C<flags> does not contain UTF8_CHECK_ONLY, warnings about
383malformations will be given, C<retlen> will be set to the expected
384length of the UTF-8 character in bytes, and zero will be returned.
385
386The C<flags> can also contain various flags to allow deviations from
387the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 388
9041c2e3
NIS
389Most code should use utf8_to_uvchr() rather than call this directly.
390
37607a96
PK
391=cut
392*/
67e989fb 393
a0ed51b3 394UV
7fc63493 395Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 396{
97aff369 397 dVAR;
d4c19fe8 398 const U8 * const s0 = s;
9c5ffd7c 399 UV uv = *s, ouv = 0;
ba210ebe 400 STRLEN len = 1;
7fc63493
AL
401 const bool dowarn = ckWARN_d(WARN_UTF8);
402 const UV startbyte = *s;
ba210ebe 403 STRLEN expectlen = 0;
a0dbb045
JH
404 U32 warning = 0;
405
406/* This list is a superset of the UTF8_ALLOW_XXX. */
407
408#define UTF8_WARN_EMPTY 1
409#define UTF8_WARN_CONTINUATION 2
410#define UTF8_WARN_NON_CONTINUATION 3
411#define UTF8_WARN_FE_FF 4
412#define UTF8_WARN_SHORT 5
413#define UTF8_WARN_OVERFLOW 6
414#define UTF8_WARN_SURROGATE 7
c867b360
JH
415#define UTF8_WARN_LONG 8
416#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
417
418 if (curlen == 0 &&
419 !(flags & UTF8_ALLOW_EMPTY)) {
420 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
421 goto malformed;
422 }
423
1d72bdf6 424 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
425 if (retlen)
426 *retlen = 1;
c4d5f83a 427 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 428 }
67e989fb 429
421a8bf2 430 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 431 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 432 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
433 goto malformed;
434 }
435
421a8bf2 436 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 437 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 438 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
439 goto malformed;
440 }
9041c2e3 441
1d72bdf6 442#ifdef EBCDIC
75383841 443 uv = NATIVE_TO_UTF(uv);
1d72bdf6 444#else
fcc8fcf6
JH
445 if ((uv == 0xfe || uv == 0xff) &&
446 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 447 warning = UTF8_WARN_FE_FF;
ba210ebe 448 goto malformed;
a0ed51b3 449 }
1d72bdf6
NIS
450#endif
451
ba210ebe
JH
452 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
453 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
454 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
455 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
456#ifdef EBCDIC
457 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
458 else { len = 7; uv &= 0x01; }
459#else
ba210ebe
JH
460 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
461 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
462 else { len = 13; uv = 0; } /* whoa! */
463#endif
464
a0ed51b3
LW
465 if (retlen)
466 *retlen = len;
9041c2e3 467
ba210ebe
JH
468 expectlen = len;
469
fcc8fcf6
JH
470 if ((curlen < expectlen) &&
471 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 472 warning = UTF8_WARN_SHORT;
ba210ebe
JH
473 goto malformed;
474 }
475
476 len--;
a0ed51b3 477 s++;
ba210ebe
JH
478 ouv = uv;
479
a0ed51b3 480 while (len--) {
421a8bf2
JH
481 if (!UTF8_IS_CONTINUATION(*s) &&
482 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
483 s--;
484 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 485 goto malformed;
a0ed51b3
LW
486 }
487 else
8850bf83 488 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
489 if (!(uv > ouv)) {
490 /* These cannot be allowed. */
491 if (uv == ouv) {
75dbc644 492 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
493 warning = UTF8_WARN_LONG;
494 goto malformed;
495 }
496 }
497 else { /* uv < ouv */
498 /* This cannot be allowed. */
499 warning = UTF8_WARN_OVERFLOW;
500 goto malformed;
501 }
ba210ebe
JH
502 }
503 s++;
504 ouv = uv;
505 }
506
421a8bf2 507 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 508 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 509 warning = UTF8_WARN_SURROGATE;
ba210ebe 510 goto malformed;
eb160463 511 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 512 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 513 warning = UTF8_WARN_LONG;
ba210ebe 514 goto malformed;
421a8bf2 515 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 516 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 517 warning = UTF8_WARN_FFFF;
a9917092 518 goto malformed;
a0ed51b3 519 }
ba210ebe 520
a0ed51b3 521 return uv;
ba210ebe
JH
522
523malformed:
524
fcc8fcf6 525 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 526 if (retlen)
cc366d4b 527 *retlen = -1;
ba210ebe
JH
528 return 0;
529 }
530
a0dbb045 531 if (dowarn) {
396482e1 532 SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
a0dbb045
JH
533
534 switch (warning) {
535 case 0: /* Intentionally empty. */ break;
536 case UTF8_WARN_EMPTY:
396482e1 537 sv_catpvs(sv, "(empty string)");
a0dbb045
JH
538 break;
539 case UTF8_WARN_CONTINUATION:
097fb8e2 540 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
541 break;
542 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
543 if (s == s0)
544 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
545 (UV)s[1], startbyte);
551405c4
AL
546 else {
547 const int len = (int)(s-s0);
097fb8e2 548 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4
AL
549 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
550 }
551
a0dbb045
JH
552 break;
553 case UTF8_WARN_FE_FF:
554 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
555 break;
556 case UTF8_WARN_SHORT:
097fb8e2 557 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 558 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 559 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
560 break;
561 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
562 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
563 ouv, *s, startbyte);
a0dbb045
JH
564 break;
565 case UTF8_WARN_SURROGATE:
566 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
567 break;
a0dbb045 568 case UTF8_WARN_LONG:
097fb8e2 569 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 570 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
571 break;
572 case UTF8_WARN_FFFF:
573 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
574 break;
575 default:
396482e1 576 sv_catpvs(sv, "(unknown reason)");
a0dbb045
JH
577 break;
578 }
579
580 if (warning) {
44f8325f 581 const char * const s = SvPVX_const(sv);
a0dbb045
JH
582
583 if (PL_op)
9014280d 584 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 585 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 586 else
9014280d 587 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
588 }
589 }
590
ba210ebe 591 if (retlen)
28d3d195 592 *retlen = expectlen ? expectlen : len;
ba210ebe 593
28d3d195 594 return 0;
a0ed51b3
LW
595}
596
8e84507e 597/*
7fc63493 598=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3
NIS
599
600Returns the native character value of the first character in the string C<s>
1e54db1a 601which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
602length, in bytes, of that character.
603
1e54db1a 604If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
605returned and retlen is set, if possible, to -1.
606
607=cut
608*/
609
610UV
7fc63493 611Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 612{
1754c1a1
NC
613 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
614 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
615}
616
617/*
7fc63493 618=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3
NIS
619
620Returns the Unicode code point of the first character in the string C<s>
1e54db1a 621which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
622length, in bytes, of that character.
623
624This function should only be used when returned UV is considered
625an index into the Unicode semantic tables (e.g. swashes).
626
1e54db1a 627If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 628returned and retlen is set, if possible, to -1.
8e84507e
NIS
629
630=cut
631*/
632
633UV
7fc63493 634Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 635{
9041c2e3 636 /* Call the low level routine asking for checks */
89ebb4a3 637 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 638 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
639}
640
b76347f2 641/*
35a4481c 642=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2
JH
643
644Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
645Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
646up past C<e>, croaks.
b76347f2
JH
647
648=cut
649*/
650
651STRLEN
35a4481c 652Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 653{
97aff369 654 dVAR;
b76347f2
JH
655 STRLEN len = 0;
656
8850bf83
JH
657 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
658 * the bitops (especially ~) can create illegal UTF-8.
659 * In other words: in Perl UTF-8 is not just for Unicode. */
660
a3b680e6
AL
661 if (e < s)
662 goto warn_and_return;
b76347f2 663 while (s < e) {
4373e329 664 const U8 t = UTF8SKIP(s);
901b21bf 665 if (e - s < t) {
a3b680e6 666 warn_and_return:
901b21bf
JH
667 if (ckWARN_d(WARN_UTF8)) {
668 if (PL_op)
669 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 670 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
671 else
672 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
673 }
674 return len;
675 }
b76347f2
JH
676 s += t;
677 len++;
678 }
679
680 return len;
681}
682
b06226ff 683/*
35a4481c 684=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 685
1e54db1a 686Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
687and C<b>.
688
689WARNING: use only if you *know* that the pointers point inside the
690same UTF-8 buffer.
691
37607a96
PK
692=cut
693*/
a0ed51b3 694
02eb7b47 695IV
35a4481c 696Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 697{
bf1665bc 698 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3
LW
699}
700
b06226ff 701/*
37607a96 702=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 703
8850bf83
JH
704Return the UTF-8 pointer C<s> displaced by C<off> characters, either
705forward or backward.
b06226ff
JH
706
707WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
708the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
709on the first byte of character or just after the last byte of a character.
b06226ff 710
37607a96
PK
711=cut
712*/
a0ed51b3
LW
713
714U8 *
4373e329 715Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 716{
96a5add6 717 PERL_UNUSED_CONTEXT;
8850bf83
JH
718 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
719 * the bitops (especially ~) can create illegal UTF-8.
720 * In other words: in Perl UTF-8 is not just for Unicode. */
721
a0ed51b3
LW
722 if (off >= 0) {
723 while (off--)
724 s += UTF8SKIP(s);
725 }
726 else {
727 while (off++) {
728 s--;
8850bf83
JH
729 while (UTF8_IS_CONTINUATION(*s))
730 s--;
a0ed51b3
LW
731 }
732 }
4373e329 733 return (U8 *)s;
a0ed51b3
LW
734}
735
6940069f 736/*
eebe1485 737=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 738
1e54db1a 739Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
740Unlike C<bytes_to_utf8>, this over-writes the original string, and
741updates len to contain the new length.
67e989fb 742Returns zero on failure, setting C<len> to -1.
6940069f 743
95be277c
NC
744If you need a copy of the string, see C<bytes_from_utf8>.
745
6940069f
GS
746=cut
747*/
748
749U8 *
37607a96 750Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 751{
d4c19fe8
AL
752 U8 * const save = s;
753 U8 * const send = s + *len;
6940069f 754 U8 *d;
246fae53 755
1e54db1a 756 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 757 while (s < send) {
dcad2880
JH
758 U8 c = *s++;
759
1d72bdf6
NIS
760 if (!UTF8_IS_INVARIANT(c) &&
761 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
762 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
763 *len = -1;
764 return 0;
765 }
246fae53 766 }
dcad2880
JH
767
768 d = s = save;
6940069f 769 while (s < send) {
ed646e6e 770 STRLEN ulen;
9041c2e3 771 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 772 s += ulen;
6940069f
GS
773 }
774 *d = '\0';
246fae53 775 *len = d - save;
6940069f
GS
776 return save;
777}
778
779/*
e1ec3a88 780=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 781
1e54db1a 782Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 783Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
784the newly-created string, and updates C<len> to contain the new
785length. Returns the original string if no conversion occurs, C<len>
786is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7870 if C<s> is converted or contains all 7bit characters.
f9a63242 788
37607a96
PK
789=cut
790*/
f9a63242
JH
791
792U8 *
e1ec3a88 793Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 794{
f9a63242 795 U8 *d;
e1ec3a88
AL
796 const U8 *start = s;
797 const U8 *send;
f9a63242
JH
798 I32 count = 0;
799
96a5add6 800 PERL_UNUSED_CONTEXT;
f9a63242 801 if (!*is_utf8)
73d840c0 802 return (U8 *)start;
f9a63242 803
1e54db1a 804 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 805 for (send = s + *len; s < send;) {
e1ec3a88 806 U8 c = *s++;
1d72bdf6 807 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
808 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
809 (c = *s++) && UTF8_IS_CONTINUATION(c))
810 count++;
811 else
73d840c0 812 return (U8 *)start;
db42d148 813 }
f9a63242
JH
814 }
815
35da51f7 816 *is_utf8 = FALSE;
f9a63242 817
212542aa 818 Newx(d, (*len) - count + 1, U8);
ef9edfd0 819 s = start; start = d;
f9a63242
JH
820 while (s < send) {
821 U8 c = *s++;
c4d5f83a
NIS
822 if (!UTF8_IS_INVARIANT(c)) {
823 /* Then it is two-byte encoded */
824 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
825 c = ASCII_TO_NATIVE(c);
826 }
827 *d++ = c;
f9a63242
JH
828 }
829 *d = '\0';
830 *len = d - start;
73d840c0 831 return (U8 *)start;
f9a63242
JH
832}
833
834/*
35a4481c 835=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 836
1e54db1a 837Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
838Returns a pointer to the newly-created string, and sets C<len> to
839reflect the new length.
6940069f 840
1e54db1a 841If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
842see sv_recode_to_utf8().
843
497711e7 844=cut
6940069f
GS
845*/
846
847U8*
35a4481c 848Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 849{
35a4481c 850 const U8 * const send = s + (*len);
6940069f
GS
851 U8 *d;
852 U8 *dst;
96a5add6 853 PERL_UNUSED_CONTEXT;
6940069f 854
212542aa 855 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
856 dst = d;
857
858 while (s < send) {
35a4481c 859 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 860 if (UNI_IS_INVARIANT(uv))
eb160463 861 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 862 else {
eb160463
GS
863 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
864 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
865 }
866 }
867 *d = '\0';
6662521e 868 *len = d-dst;
6940069f
GS
869 return dst;
870}
871
a0ed51b3 872/*
dea0fc0b 873 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
874 *
875 * Destination must be pre-extended to 3/2 source. Do not use in-place.
876 * We optimize for native, for obvious reasons. */
877
878U8*
dea0fc0b 879Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 880{
dea0fc0b
JH
881 U8* pend;
882 U8* dstart = d;
883
1de9afcd
RGS
884 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
885 d[0] = 0;
886 *newlen = 1;
887 return d;
888 }
889
dea0fc0b 890 if (bytelen & 1)
014ead4b 891 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b
JH
892
893 pend = p + bytelen;
894
a0ed51b3 895 while (p < pend) {
dea0fc0b
JH
896 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
897 p += 2;
a0ed51b3 898 if (uv < 0x80) {
eb160463 899 *d++ = (U8)uv;
a0ed51b3
LW
900 continue;
901 }
902 if (uv < 0x800) {
eb160463
GS
903 *d++ = (U8)(( uv >> 6) | 0xc0);
904 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
905 continue;
906 }
907 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
TD
908 UV low = (p[0] << 8) + p[1];
909 p += 2;
dea0fc0b
JH
910 if (low < 0xdc00 || low >= 0xdfff)
911 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
912 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
913 }
914 if (uv < 0x10000) {
eb160463
GS
915 *d++ = (U8)(( uv >> 12) | 0xe0);
916 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
917 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
918 continue;
919 }
920 else {
eb160463
GS
921 *d++ = (U8)(( uv >> 18) | 0xf0);
922 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
923 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
924 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
925 continue;
926 }
927 }
dea0fc0b 928 *newlen = d - dstart;
a0ed51b3
LW
929 return d;
930}
931
932/* Note: this one is slightly destructive of the source. */
933
934U8*
dea0fc0b 935Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
936{
937 U8* s = (U8*)p;
d4c19fe8 938 U8* const send = s + bytelen;
a0ed51b3 939 while (s < send) {
d4c19fe8 940 const U8 tmp = s[0];
a0ed51b3
LW
941 s[0] = s[1];
942 s[1] = tmp;
943 s += 2;
944 }
dea0fc0b 945 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
946}
947
948/* for now these are all defined (inefficiently) in terms of the utf8 versions */
949
950bool
84afefe6 951Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 952{
89ebb4a3 953 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 954 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
955 return is_utf8_alnum(tmpbuf);
956}
957
958bool
84afefe6 959Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 960{
89ebb4a3 961 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 962 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
963 return is_utf8_alnumc(tmpbuf);
964}
965
966bool
84afefe6 967Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 968{
89ebb4a3 969 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 970 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
971 return is_utf8_idfirst(tmpbuf);
972}
973
974bool
84afefe6 975Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 976{
89ebb4a3 977 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 978 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
979 return is_utf8_alpha(tmpbuf);
980}
981
982bool
84afefe6 983Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 984{
89ebb4a3 985 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 986 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
987 return is_utf8_ascii(tmpbuf);
988}
989
990bool
84afefe6 991Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 992{
89ebb4a3 993 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 994 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
995 return is_utf8_space(tmpbuf);
996}
997
998bool
84afefe6 999Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1000{
89ebb4a3 1001 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1002 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1003 return is_utf8_digit(tmpbuf);
1004}
1005
1006bool
84afefe6 1007Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1008{
89ebb4a3 1009 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1010 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1011 return is_utf8_upper(tmpbuf);
1012}
1013
1014bool
84afefe6 1015Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1016{
89ebb4a3 1017 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1018 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1019 return is_utf8_lower(tmpbuf);
1020}
1021
1022bool
84afefe6 1023Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1024{
89ebb4a3 1025 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1026 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1027 return is_utf8_cntrl(tmpbuf);
1028}
1029
1030bool
84afefe6 1031Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1032{
89ebb4a3 1033 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1034 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1035 return is_utf8_graph(tmpbuf);
1036}
1037
1038bool
84afefe6 1039Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1040{
89ebb4a3 1041 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1042 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1043 return is_utf8_print(tmpbuf);
1044}
1045
b8c5462f 1046bool
84afefe6 1047Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1048{
89ebb4a3 1049 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1050 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1051 return is_utf8_punct(tmpbuf);
1052}
1053
4d61ec05 1054bool
84afefe6 1055Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1056{
89ebb4a3 1057 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1058 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1059 return is_utf8_xdigit(tmpbuf);
1060}
1061
84afefe6
JH
1062UV
1063Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1064{
0ebc6274
JH
1065 uvchr_to_utf8(p, c);
1066 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1067}
1068
84afefe6
JH
1069UV
1070Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1071{
0ebc6274
JH
1072 uvchr_to_utf8(p, c);
1073 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1074}
1075
84afefe6
JH
1076UV
1077Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1078{
0ebc6274
JH
1079 uvchr_to_utf8(p, c);
1080 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1081}
1082
84afefe6
JH
1083UV
1084Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1085{
0ebc6274
JH
1086 uvchr_to_utf8(p, c);
1087 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1088}
1089
a0ed51b3
LW
1090/* for now these all assume no locale info available for Unicode > 255 */
1091
1092bool
84afefe6 1093Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1094{
1095 return is_uni_alnum(c); /* XXX no locale support yet */
1096}
1097
1098bool
84afefe6 1099Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1100{
1101 return is_uni_alnumc(c); /* XXX no locale support yet */
1102}
1103
1104bool
84afefe6 1105Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1106{
1107 return is_uni_idfirst(c); /* XXX no locale support yet */
1108}
1109
1110bool
84afefe6 1111Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1112{
1113 return is_uni_alpha(c); /* XXX no locale support yet */
1114}
1115
1116bool
84afefe6 1117Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1118{
1119 return is_uni_ascii(c); /* XXX no locale support yet */
1120}
1121
1122bool
84afefe6 1123Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1124{
1125 return is_uni_space(c); /* XXX no locale support yet */
1126}
1127
1128bool
84afefe6 1129Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1130{
1131 return is_uni_digit(c); /* XXX no locale support yet */
1132}
1133
1134bool
84afefe6 1135Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1136{
1137 return is_uni_upper(c); /* XXX no locale support yet */
1138}
1139
1140bool
84afefe6 1141Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1142{
1143 return is_uni_lower(c); /* XXX no locale support yet */
1144}
1145
1146bool
84afefe6 1147Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1148{
1149 return is_uni_cntrl(c); /* XXX no locale support yet */
1150}
1151
1152bool
84afefe6 1153Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1154{
1155 return is_uni_graph(c); /* XXX no locale support yet */
1156}
1157
1158bool
84afefe6 1159Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1160{
1161 return is_uni_print(c); /* XXX no locale support yet */
1162}
1163
b8c5462f 1164bool
84afefe6 1165Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1166{
1167 return is_uni_punct(c); /* XXX no locale support yet */
1168}
1169
4d61ec05 1170bool
84afefe6 1171Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1172{
1173 return is_uni_xdigit(c); /* XXX no locale support yet */
1174}
1175
b7ac61fa
JH
1176U32
1177Perl_to_uni_upper_lc(pTHX_ U32 c)
1178{
ee099d14
JH
1179 /* XXX returns only the first character -- do not use XXX */
1180 /* XXX no locale support yet */
1181 STRLEN len;
89ebb4a3 1182 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1183 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1184}
1185
1186U32
1187Perl_to_uni_title_lc(pTHX_ U32 c)
1188{
ee099d14
JH
1189 /* XXX returns only the first character XXX -- do not use XXX */
1190 /* XXX no locale support yet */
1191 STRLEN len;
89ebb4a3 1192 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1193 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1194}
1195
1196U32
1197Perl_to_uni_lower_lc(pTHX_ U32 c)
1198{
ee099d14
JH
1199 /* XXX returns only the first character -- do not use XXX */
1200 /* XXX no locale support yet */
1201 STRLEN len;
89ebb4a3 1202 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1203 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1204}
1205
7452cf6a 1206static bool
5141f98e 1207S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
1208 const char *const swashname)
1209{
97aff369 1210 dVAR;
bde6a22d
NC
1211 if (!is_utf8_char(p))
1212 return FALSE;
1213 if (!*swash)
711a919c 1214 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
bde6a22d
NC
1215 return swash_fetch(*swash, p, TRUE) != 0;
1216}
1217
1218bool
7fc63493 1219Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1220{
97aff369 1221 dVAR;
671c33bf
NC
1222 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1223 * descendant of isalnum(3), in other words, it doesn't
1224 * contain the '_'. --jhi */
d4c19fe8 1225 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
a0ed51b3
LW
1226}
1227
1228bool
7fc63493 1229Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1230{
97aff369 1231 dVAR;
d4c19fe8 1232 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
b8c5462f
JH
1233}
1234
1235bool
7fc63493 1236Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1237{
97aff369 1238 dVAR;
82686b01
JH
1239 if (*p == '_')
1240 return TRUE;
bde6a22d 1241 /* is_utf8_idstart would be more logical. */
d4c19fe8 1242 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1243}
1244
1245bool
7fc63493 1246Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1247{
97aff369 1248 dVAR;
82686b01
JH
1249 if (*p == '_')
1250 return TRUE;
d4c19fe8 1251 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1252}
1253
1254bool
7fc63493 1255Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1256{
97aff369 1257 dVAR;
d4c19fe8 1258 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1259}
1260
1261bool
7fc63493 1262Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1263{
97aff369 1264 dVAR;
d4c19fe8 1265 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
b8c5462f
JH
1266}
1267
1268bool
7fc63493 1269Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1270{
97aff369 1271 dVAR;
d4c19fe8 1272 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3
LW
1273}
1274
1275bool
7fc63493 1276Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1277{
97aff369 1278 dVAR;
d4c19fe8 1279 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1280}
1281
1282bool
7fc63493 1283Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1284{
97aff369 1285 dVAR;
d4c19fe8 1286 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1287}
1288
1289bool
7fc63493 1290Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1291{
97aff369 1292 dVAR;
d4c19fe8 1293 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1294}
1295
1296bool
7fc63493 1297Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1298{
97aff369 1299 dVAR;
d4c19fe8 1300 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f
JH
1301}
1302
1303bool
7fc63493 1304Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1305{
97aff369 1306 dVAR;
d4c19fe8 1307 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1308}
1309
1310bool
7fc63493 1311Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1312{
97aff369 1313 dVAR;
d4c19fe8 1314 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1315}
1316
1317bool
7fc63493 1318Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1319{
97aff369 1320 dVAR;
d4c19fe8 1321 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1322}
1323
1324bool
7fc63493 1325Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1326{
97aff369 1327 dVAR;
d4c19fe8 1328 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f
JH
1329}
1330
1331bool
7fc63493 1332Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1333{
97aff369 1334 dVAR;
d4c19fe8 1335 return is_utf8_common(p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1336}
1337
6b5c0936
JH
1338/*
1339=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1340
1341The "p" contains the pointer to the UTF-8 string encoding
1342the character that is being converted.
1343
1344The "ustrp" is a pointer to the character buffer to put the
1345conversion result to. The "lenp" is a pointer to the length
1346of the result.
1347
0134edef 1348The "swashp" is a pointer to the swash to use.
6b5c0936 1349
0134edef 1350Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
8fe4d5b2 1351and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 1352but not always, a multicharacter mapping), is tried first.
6b5c0936 1353
0134edef
JH
1354The "special" is a string like "utf8::ToSpecLower", which means the
1355hash %utf8::ToSpecLower. The access to the hash is through
1356Perl_to_utf8_case().
6b5c0936 1357
0134edef
JH
1358The "normal" is a string like "ToLower" which means the swash
1359%utf8::ToLower.
1360
1361=cut */
6b5c0936 1362
2104c8d9 1363UV
9a957fbc
AL
1364Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1365 SV **swashp, const char *normal, const char *special)
a0ed51b3 1366{
97aff369 1367 dVAR;
89ebb4a3 1368 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1369 STRLEN len = 0;
a0ed51b3 1370
aec46f14 1371 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1372 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1373 * are necessary in EBCDIC, they are redundant no-ops
1374 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1375 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1376 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1377
1378 if (!*swashp) /* load on-demand */
1379 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1380
b08cf34e
JH
1381 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1382 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1383 /* It might be "special" (sometimes, but not always,
2a37f04d 1384 * a multicharacter mapping) */
35da51f7 1385 HV * const hv = get_hv(special, FALSE);
b08cf34e
JH
1386 SV **svp;
1387
35da51f7 1388 if (hv &&
b08cf34e
JH
1389 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1390 (*svp)) {
cfd0369c 1391 const char *s;
47654450 1392
cfd0369c 1393 s = SvPV_const(*svp, len);
47654450
JH
1394 if (len == 1)
1395 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1396 else {
2f9475ad
JH
1397#ifdef EBCDIC
1398 /* If we have EBCDIC we need to remap the characters
1399 * since any characters in the low 256 are Unicode
1400 * code points, not EBCDIC. */
7cda7a3d 1401 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1402
1403 d = tmpbuf;
b08cf34e 1404 if (SvUTF8(*svp)) {
2f9475ad
JH
1405 STRLEN tlen = 0;
1406
1407 while (t < tend) {
d4c19fe8 1408 const UV c = utf8_to_uvchr(t, &tlen);
2f9475ad
JH
1409 if (tlen > 0) {
1410 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1411 t += tlen;
1412 }
1413 else
1414 break;
1415 }
1416 }
1417 else {
36fec512
JH
1418 while (t < tend) {
1419 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1420 t++;
1421 }
2f9475ad
JH
1422 }
1423 len = d - tmpbuf;
1424 Copy(tmpbuf, ustrp, len, U8);
1425#else
d2dcd0fb 1426 Copy(s, ustrp, len, U8);
2f9475ad 1427#endif
29e98929 1428 }
983ffd37 1429 }
0134edef
JH
1430 }
1431
1432 if (!len && *swashp) {
d4c19fe8
AL
1433 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1434
0134edef
JH
1435 if (uv2) {
1436 /* It was "normal" (a single character mapping). */
d4c19fe8 1437 const UV uv3 = UNI_TO_NATIVE(uv2);
e9101d72 1438 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1439 }
1440 }
1feea2c7 1441
0134edef
JH
1442 if (!len) /* Neither: just copy. */
1443 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1444
2a37f04d
JH
1445 if (lenp)
1446 *lenp = len;
1447
0134edef 1448 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1449}
1450
d3e79532 1451/*
7fc63493 1452=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1453
1454Convert the UTF-8 encoded character at p to its uppercase version and
1455store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1456that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1457the uppercase version may be longer than the original character.
d3e79532
JH
1458
1459The first character of the uppercased version is returned
1460(but note, as explained above, that there may be more.)
1461
1462=cut */
1463
2104c8d9 1464UV
7fc63493 1465Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1466{
97aff369 1467 dVAR;
983ffd37 1468 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1469 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1470}
a0ed51b3 1471
d3e79532 1472/*
7fc63493 1473=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1474
1475Convert the UTF-8 encoded character at p to its titlecase version and
1476store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1477that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1478titlecase version may be longer than the original character.
d3e79532
JH
1479
1480The first character of the titlecased version is returned
1481(but note, as explained above, that there may be more.)
1482
1483=cut */
1484
983ffd37 1485UV
7fc63493 1486Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1487{
97aff369 1488 dVAR;
983ffd37 1489 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1490 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1491}
1492
d3e79532 1493/*
7fc63493 1494=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1495
1496Convert the UTF-8 encoded character at p to its lowercase version and
1497store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1498that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1499lowercase version may be longer than the original character.
d3e79532
JH
1500
1501The first character of the lowercased version is returned
1502(but note, as explained above, that there may be more.)
1503
1504=cut */
1505
2104c8d9 1506UV
7fc63493 1507Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1508{
97aff369 1509 dVAR;
983ffd37 1510 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1511 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1512}
1513
d3e79532 1514/*
7fc63493 1515=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1516
1517Convert the UTF-8 encoded character at p to its foldcase version and
1518store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1519that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1520foldcase version may be longer than the original character (up to
1521three characters).
1522
1523The first character of the foldcased version is returned
1524(but note, as explained above, that there may be more.)
1525
1526=cut */
1527
b4e400f9 1528UV
7fc63493 1529Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1530{
97aff369 1531 dVAR;
b4e400f9
JH
1532 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1533 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1534}
1535
711a919c
TS
1536/* Note:
1537 * A "swash" is a swatch hash.
1538 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1539 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1540 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1541 */
a0ed51b3 1542SV*
7fc63493 1543Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1544{
27da23d5 1545 dVAR;
a0ed51b3 1546 SV* retval;
9a957fbc 1547 SV* const tokenbufsv = sv_newmortal();
8e84507e 1548 dSP;
7fc63493
AL
1549 const size_t pkg_len = strlen(pkg);
1550 const size_t name_len = strlen(name);
aec46f14 1551 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1552 SV* errsv_save;
ce3b816e 1553
96ca9f55
DM
1554 PUSHSTACKi(PERLSI_MAGIC);
1555 ENTER;
1556 SAVEI32(PL_hints);
1557 PL_hints = 0;
1558 save_re_context();
1b026014 1559 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1560 ENTER;
f8be5cf0 1561 errsv_save = newSVsv(ERRSV);
dc0c6abb
NC
1562 /* It is assumed that callers of this routine are not passing in any
1563 user derived data. */
1564 /* Need to do this after save_re_context() as it will set PL_tainted to
1565 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1566 Even line to create errsv_save can turn on PL_tainted. */
1567 SAVEBOOL(PL_tainted);
1568 PL_tainted = 0;
71bed85a 1569 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
a0714e2c 1570 NULL);
f8be5cf0
JH
1571 if (!SvTRUE(ERRSV))
1572 sv_setsv(ERRSV, errsv_save);
1573 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1574 LEAVE;
1575 }
1576 SPAGAIN;
a0ed51b3
LW
1577 PUSHMARK(SP);
1578 EXTEND(SP,5);
71bed85a
NC
1579 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1580 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1581 PUSHs(listsv);
1582 PUSHs(sv_2mortal(newSViv(minbits)));
1583 PUSHs(sv_2mortal(newSViv(none)));
1584 PUTBACK;
923e4eb5 1585 if (IN_PERL_COMPILETIME) {
bf1fed83 1586 /* XXX ought to be handled by lex_start */
82686b01 1587 SAVEI32(PL_in_my);
2b4bd638 1588 PL_in_my = 0;
bf1fed83 1589 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1590 }
f8be5cf0 1591 errsv_save = newSVsv(ERRSV);
864dbfa3 1592 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1593 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1594 else
e24b16f9 1595 retval = &PL_sv_undef;
f8be5cf0
JH
1596 if (!SvTRUE(ERRSV))
1597 sv_setsv(ERRSV, errsv_save);
1598 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1599 LEAVE;
1600 POPSTACK;
923e4eb5 1601 if (IN_PERL_COMPILETIME) {
bf1fed83 1602 STRLEN len;
aec46f14 1603 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83
JH
1604
1605 Copy(pv, PL_tokenbuf, len+1, char);
623e6609 1606 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1607 }
bc45ce41
JH
1608 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1609 if (SvPOK(retval))
35c1215d 1610 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
95b63a38 1611 (void*)retval);
cea2e8a9 1612 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1613 }
a0ed51b3
LW
1614 return retval;
1615}
1616
035d37be
JH
1617
1618/* This API is wrong for special case conversions since we may need to
1619 * return several Unicode characters for a single Unicode character
1620 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1621 * the lower-level routine, and it is similarly broken for returning
1622 * multiple values. --jhi */
979f2922 1623/* Now SWASHGET is recasted into S_swash_get in this file. */
680c470c
TS
1624
1625/* Note:
1626 * Returns the value of property/mapping C<swash> for the first character
1627 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1628 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1629 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1630 */
a0ed51b3 1631UV
680c470c 1632Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 1633{
27da23d5 1634 dVAR;
680c470c 1635 HV* const hv = (HV*)SvRV(swash);
3568d838
JH
1636 U32 klen;
1637 U32 off;
a0ed51b3 1638 STRLEN slen;
7d85a32c 1639 STRLEN needents;
cfd0369c 1640 const U8 *tmps = NULL;
a0ed51b3 1641 U32 bit;
979f2922 1642 SV *swatch;
3568d838 1643 U8 tmputf8[2];
35da51f7 1644 const UV c = NATIVE_TO_ASCII(*ptr);
3568d838
JH
1645
1646 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
979f2922
TS
1647 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1648 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1649 ptr = tmputf8;
3568d838
JH
1650 }
1651 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1652 * then the "swatch" is a vec() for al the chars which start
1653 * with 0xAA..0xYY
1654 * So the key in the hash (klen) is length of encoded char -1
1655 */
1656 klen = UTF8SKIP(ptr) - 1;
1657 off = ptr[klen];
a0ed51b3 1658
979f2922 1659 if (klen == 0) {
7d85a32c 1660 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1661 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1662 */
979f2922
TS
1663 needents = UTF_CONTINUATION_MARK;
1664 off = NATIVE_TO_UTF(ptr[klen]);
1665 }
1666 else {
7d85a32c 1667 /* If char is encoded then swatch is for the prefix */
979f2922
TS
1668 needents = (1 << UTF_ACCUMULATION_SHIFT);
1669 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1670 }
7d85a32c 1671
a0ed51b3
LW
1672 /*
1673 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1674 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1675 * it's nothing to sniff at.) Pity we usually come through at least
1676 * two function calls to get here...
1677 *
1678 * NB: this code assumes that swatches are never modified, once generated!
1679 */
1680
3568d838 1681 if (hv == PL_last_swash_hv &&
a0ed51b3 1682 klen == PL_last_swash_klen &&
27da23d5 1683 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1684 {
1685 tmps = PL_last_swash_tmps;
1686 slen = PL_last_swash_slen;
1687 }
1688 else {
1689 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1690 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1691
979f2922
TS
1692 /* If not cached, generate it via swash_get */
1693 if (!svp || !SvPOK(*svp)
1694 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0
NIS
1695 /* We use utf8n_to_uvuni() as we want an index into
1696 Unicode tables, not a native character number.
1697 */
aec46f14 1698 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1699 ckWARN(WARN_UTF8) ?
1700 0 : UTF8_ALLOW_ANY);
680c470c 1701 swatch = swash_get(swash,
979f2922
TS
1702 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1703 (klen) ? (code_point & ~(needents - 1)) : 0,
1704 needents);
1705
923e4eb5 1706 if (IN_PERL_COMPILETIME)
623e6609 1707 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1708
979f2922 1709 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 1710
979f2922
TS
1711 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1712 || (slen << 3) < needents)
660a4616 1713 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
a0ed51b3
LW
1714 }
1715
1716 PL_last_swash_hv = hv;
1717 PL_last_swash_klen = klen;
cfd0369c
NC
1718 /* FIXME change interpvar.h? */
1719 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1720 PL_last_swash_slen = slen;
1721 if (klen)
1722 Copy(ptr, PL_last_swash_key, klen, U8);
1723 }
1724
9faf8d75 1725 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1726 case 1:
1727 bit = 1 << (off & 7);
1728 off >>= 3;
1729 return (tmps[off] & bit) != 0;
1730 case 8:
1731 return tmps[off];
1732 case 16:
1733 off <<= 1;
1734 return (tmps[off] << 8) + tmps[off + 1] ;
1735 case 32:
1736 off <<= 2;
1737 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1738 }
660a4616 1739 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
a0ed51b3 1740}
2b9d42f0 1741
979f2922
TS
1742/* Note:
1743 * Returns a swatch (a bit vector string) for a code point sequence
1744 * that starts from the value C<start> and comprises the number C<span>.
1745 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1746 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1747 */
1748STATIC SV*
1749S_swash_get(pTHX_ SV* swash, UV start, UV span)
1750{
1751 SV *swatch;
711a919c 1752 U8 *l, *lend, *x, *xend, *s;
979f2922
TS
1753 STRLEN lcur, xcur, scur;
1754
1755 HV* const hv = (HV*)SvRV(swash);
017a3ce5
GA
1756 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1757 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1758 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1759 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1760 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
0bd48802
AL
1761 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1762 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1763 const STRLEN bits = SvUV(*bitssvp);
1764 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1765 const UV none = SvUV(*nonesvp);
1766 const UV end = start + span;
979f2922
TS
1767
1768 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
660a4616
TS
1769 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1770 (UV)bits);
979f2922
TS
1771 }
1772
1773 /* create and initialize $swatch */
396482e1 1774 swatch = newSVpvs("");
979f2922
TS
1775 scur = octets ? (span * octets) : (span + 7) / 8;
1776 SvGROW(swatch, scur + 1);
1777 s = (U8*)SvPVX(swatch);
1778 if (octets && none) {
0bd48802 1779 const U8* const e = s + scur;
979f2922
TS
1780 while (s < e) {
1781 if (bits == 8)
1782 *s++ = (U8)(none & 0xff);
1783 else if (bits == 16) {
1784 *s++ = (U8)((none >> 8) & 0xff);
1785 *s++ = (U8)( none & 0xff);
1786 }
1787 else if (bits == 32) {
1788 *s++ = (U8)((none >> 24) & 0xff);
1789 *s++ = (U8)((none >> 16) & 0xff);
1790 *s++ = (U8)((none >> 8) & 0xff);
1791 *s++ = (U8)( none & 0xff);
1792 }
1793 }
1794 *s = '\0';
1795 }
1796 else {
1797 (void)memzero((U8*)s, scur + 1);
1798 }
1799 SvCUR_set(swatch, scur);
1800 s = (U8*)SvPVX(swatch);
1801
1802 /* read $swash->{LIST} */
1803 l = (U8*)SvPV(*listsvp, lcur);
1804 lend = l + lcur;
1805 while (l < lend) {
35da51f7 1806 UV min, max, val;
979f2922
TS
1807 STRLEN numlen;
1808 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1809
0bd48802 1810 U8* const nl = (U8*)memchr(l, '\n', lend - l);
979f2922
TS
1811
1812 numlen = lend - l;
1813 min = grok_hex((char *)l, &numlen, &flags, NULL);
1814 if (numlen)
1815 l += numlen;
1816 else if (nl) {
1817 l = nl + 1; /* 1 is length of "\n" */
1818 continue;
1819 }
1820 else {
1821 l = lend; /* to LIST's end at which \n is not found */
1822 break;
1823 }
1824
1825 if (isBLANK(*l)) {
1826 ++l;
1827 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1828 numlen = lend - l;
1829 max = grok_hex((char *)l, &numlen, &flags, NULL);
1830 if (numlen)
1831 l += numlen;
1832 else
1833 max = min;
1834
1835 if (octets) {
1836 if (isBLANK(*l)) {
1837 ++l;
1838 flags = PERL_SCAN_SILENT_ILLDIGIT |
1839 PERL_SCAN_DISALLOW_PREFIX;
1840 numlen = lend - l;
1841 val = grok_hex((char *)l, &numlen, &flags, NULL);
1842 if (numlen)
1843 l += numlen;
1844 else
1845 val = 0;
1846 }
1847 else {
1848 val = 0;
1849 if (typeto) {
1850 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1851 typestr, l);
1852 }
1853 }
1854 }
711a919c
TS
1855 else
1856 val = 0; /* bits == 1, then val should be ignored */
979f2922
TS
1857 }
1858 else {
1859 max = min;
1860 if (octets) {
1861 val = 0;
1862 if (typeto) {
1863 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1864 }
1865 }
711a919c
TS
1866 else
1867 val = 0; /* bits == 1, then val should be ignored */
979f2922
TS
1868 }
1869
1870 if (nl)
1871 l = nl + 1;
1872 else
1873 l = lend;
1874
1875 if (max < start)
1876 continue;
1877
1878 if (octets) {
35da51f7 1879 UV key;
979f2922
TS
1880 if (min < start) {
1881 if (!none || val < none) {
1882 val += start - min;
1883 }
1884 min = start;
1885 }
1886 for (key = min; key <= max; key++) {
1887 STRLEN offset;
1888 if (key >= end)
1889 goto go_out_list;
1890 /* offset must be non-negative (start <= min <= key < end) */
1891 offset = octets * (key - start);
1892 if (bits == 8)
1893 s[offset] = (U8)(val & 0xff);
1894 else if (bits == 16) {
1895 s[offset ] = (U8)((val >> 8) & 0xff);
1896 s[offset + 1] = (U8)( val & 0xff);
1897 }
1898 else if (bits == 32) {
1899 s[offset ] = (U8)((val >> 24) & 0xff);
1900 s[offset + 1] = (U8)((val >> 16) & 0xff);
1901 s[offset + 2] = (U8)((val >> 8) & 0xff);
1902 s[offset + 3] = (U8)( val & 0xff);
1903 }
1904
1905 if (!none || val < none)
1906 ++val;
1907 }
1908 }
711a919c 1909 else { /* bits == 1, then val should be ignored */
35da51f7 1910 UV key;
979f2922
TS
1911 if (min < start)
1912 min = start;
1913 for (key = min; key <= max; key++) {
0bd48802 1914 const STRLEN offset = (STRLEN)(key - start);
979f2922
TS
1915 if (key >= end)
1916 goto go_out_list;
1917 s[offset >> 3] |= 1 << (offset & 7);
1918 }
1919 }
1920 } /* while */
1921 go_out_list:
1922
1923 /* read $swash->{EXTRAS} */
1924 x = (U8*)SvPV(*extssvp, xcur);
1925 xend = x + xcur;
1926 while (x < xend) {
1927 STRLEN namelen;
1928 U8 *namestr;
1929 SV** othersvp;
1930 HV* otherhv;
1931 STRLEN otherbits;
1932 SV **otherbitssvp, *other;
711a919c 1933 U8 *s, *o, *nl;
979f2922
TS
1934 STRLEN slen, olen;
1935
35da51f7 1936 const U8 opc = *x++;
979f2922
TS
1937 if (opc == '\n')
1938 continue;
1939
1940 nl = (U8*)memchr(x, '\n', xend - x);
1941
1942 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1943 if (nl) {
1944 x = nl + 1; /* 1 is length of "\n" */
1945 continue;
1946 }
1947 else {
1948 x = xend; /* to EXTRAS' end at which \n is not found */
1949 break;
1950 }
1951 }
1952
1953 namestr = x;
1954 if (nl) {
1955 namelen = nl - namestr;
1956 x = nl + 1;
1957 }
1958 else {
1959 namelen = xend - namestr;
1960 x = xend;
1961 }
1962
1963 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
660a4616 1964 otherhv = (HV*)SvRV(*othersvp);
017a3ce5 1965 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922
TS
1966 otherbits = (STRLEN)SvUV(*otherbitssvp);
1967 if (bits < otherbits)
660a4616 1968 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
979f2922
TS
1969
1970 /* The "other" swatch must be destroyed after. */
1971 other = swash_get(*othersvp, start, span);
1972 o = (U8*)SvPV(other, olen);
1973
1974 if (!olen)
660a4616 1975 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
979f2922
TS
1976
1977 s = (U8*)SvPV(swatch, slen);
1978 if (bits == 1 && otherbits == 1) {
1979 if (slen != olen)
660a4616 1980 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
979f2922
TS
1981
1982 switch (opc) {
1983 case '+':
1984 while (slen--)
1985 *s++ |= *o++;
1986 break;
1987 case '!':
1988 while (slen--)
1989 *s++ |= ~*o++;
1990 break;
1991 case '-':
1992 while (slen--)
1993 *s++ &= ~*o++;
1994 break;
1995 case '&':
1996 while (slen--)
1997 *s++ &= *o++;
1998 break;
1999 default:
2000 break;
2001 }
2002 }
711a919c 2003 else {
979f2922
TS
2004 STRLEN otheroctets = otherbits >> 3;
2005 STRLEN offset = 0;
35da51f7 2006 U8* const send = s + slen;
979f2922
TS
2007
2008 while (s < send) {
2009 UV otherval = 0;
2010
2011 if (otherbits == 1) {
2012 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2013 ++offset;
2014 }
2015 else {
2016 STRLEN vlen = otheroctets;
2017 otherval = *o++;
2018 while (--vlen) {
2019 otherval <<= 8;
2020 otherval |= *o++;
2021 }
2022 }
2023
711a919c 2024 if (opc == '+' && otherval)
6f207bd3 2025 NOOP; /* replace with otherval */
979f2922
TS
2026 else if (opc == '!' && !otherval)
2027 otherval = 1;
2028 else if (opc == '-' && otherval)
2029 otherval = 0;
2030 else if (opc == '&' && !otherval)
2031 otherval = 0;
2032 else {
711a919c 2033 s += octets; /* no replacement */
979f2922
TS
2034 continue;
2035 }
2036
2037 if (bits == 8)
2038 *s++ = (U8)( otherval & 0xff);
2039 else if (bits == 16) {
2040 *s++ = (U8)((otherval >> 8) & 0xff);
2041 *s++ = (U8)( otherval & 0xff);
2042 }
2043 else if (bits == 32) {
2044 *s++ = (U8)((otherval >> 24) & 0xff);
2045 *s++ = (U8)((otherval >> 16) & 0xff);
2046 *s++ = (U8)((otherval >> 8) & 0xff);
2047 *s++ = (U8)( otherval & 0xff);
2048 }
2049 }
2050 }
2051 sv_free(other); /* through with it! */
2052 } /* while */
2053 return swatch;
2054}
2055
0f830e0b
NC
2056/*
2057=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2058
2059Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2060of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2061bytes available. The return value is the pointer to the byte after the
2062end of the new character. In other words,
2063
2064 d = uvchr_to_utf8(d, uv);
2065
2066is the recommended wide native character-aware way of saying
2067
2068 *(d++) = uv;
2069
2070=cut
2071*/
2072
2073/* On ASCII machines this is normally a macro but we want a
2074 real function in case XS code wants it
2075*/
0f830e0b
NC
2076U8 *
2077Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2078{
2079 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2080}
2081
b851fbc1
JH
2082U8 *
2083Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2084{
2085 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2086}
2b9d42f0
NIS
2087
2088/*
0f830e0b
NC
2089=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2090flags
2091
2092Returns the native character value of the first character in the string
2093C<s>
2094which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2095length, in bytes, of that character.
2096
2097Allows length and flags to be passed to low level routine.
2098
2099=cut
2100*/
2101/* On ASCII machines this is normally a macro but we want
2102 a real function in case XS code wants it
2103*/
0f830e0b
NC
2104UV
2105Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2106U32 flags)
2107{
2108 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2109 return UNI_TO_NATIVE(uv);
2110}
2111
2112/*
d2cc3551
JH
2113=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2114
2115Build to the scalar dsv a displayable version of the string spv,
2116length len, the displayable version being at most pvlim bytes long
2117(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2118
9e55ce06 2119The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 2120isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
2121to display the \\[nrfta\\] as the backslashed versions (like '\n')
2122(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2123UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2124UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2125
d2cc3551
JH
2126The pointer to the PV of the dsv is returned.
2127
2128=cut */
e6b2e755 2129char *
e1ec3a88 2130Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
2131{
2132 int truncated = 0;
e1ec3a88 2133 const char *s, *e;
e6b2e755
JH
2134
2135 sv_setpvn(dsv, "", 0);
e1ec3a88 2136 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 2137 UV u;
a49f32c6
NC
2138 /* This serves double duty as a flag and a character to print after
2139 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2140 */
2141 char ok = 0;
c728cb41 2142
e6b2e755
JH
2143 if (pvlim && SvCUR(dsv) >= pvlim) {
2144 truncated++;
2145 break;
2146 }
2147 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 2148 if (u < 256) {
a3b680e6 2149 const unsigned char c = (unsigned char)u & 0xFF;
0bd48802 2150 if (flags & UNI_DISPLAY_BACKSLASH) {
a49f32c6 2151 switch (c) {
c728cb41 2152 case '\n':
a49f32c6 2153 ok = 'n'; break;
c728cb41 2154 case '\r':
a49f32c6 2155 ok = 'r'; break;
c728cb41 2156 case '\t':
a49f32c6 2157 ok = 't'; break;
c728cb41 2158 case '\f':
a49f32c6 2159 ok = 'f'; break;
c728cb41 2160 case '\a':
a49f32c6 2161 ok = 'a'; break;
c728cb41 2162 case '\\':
a49f32c6 2163 ok = '\\'; break;
c728cb41
JH
2164 default: break;
2165 }
a49f32c6
NC
2166 if (ok) {
2167 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2168 }
c728cb41 2169 }
00e86452 2170 /* isPRINT() is the locale-blind version. */
a49f32c6
NC
2171 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2172 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2173 ok = 1;
0a2ef054 2174 }
c728cb41
JH
2175 }
2176 if (!ok)
9e55ce06 2177 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
2178 }
2179 if (truncated)
396482e1 2180 sv_catpvs(dsv, "...");
e6b2e755
JH
2181
2182 return SvPVX(dsv);
2183}
2b9d42f0 2184
d2cc3551
JH
2185/*
2186=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2187
2188Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 2189the displayable version being at most pvlim bytes long
d2cc3551 2190(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
2191
2192The flags argument is as in pv_uni_display().
2193
d2cc3551
JH
2194The pointer to the PV of the dsv is returned.
2195
d4c19fe8
AL
2196=cut
2197*/
e6b2e755
JH
2198char *
2199Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2200{
cfd0369c
NC
2201 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2202 SvCUR(ssv), pvlim, flags);
701a277b
JH
2203}
2204
d2cc3551 2205/*
d07ddd77 2206=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
2207
2208Return true if the strings s1 and s2 differ case-insensitively, false
2209if not (if they are equal case-insensitively). If u1 is true, the
2210string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
2211the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2212are false, the respective string is assumed to be in native 8-bit
2213encoding.
2214
2215If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2216in there (they will point at the beginning of the I<next> character).
2217If the pointers behind pe1 or pe2 are non-NULL, they are the end
2218pointers beyond which scanning will not continue under any
4cdaeff7 2219circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
2220s2+l2 will be used as goal end pointers that will also stop the scan,
2221and which qualify towards defining a successful match: all the scans
2222that define an explicit length must reach their goal pointers for
2223a match to succeed).
d2cc3551
JH
2224
2225For case-insensitiveness, the "casefolding" of Unicode is used
2226instead of upper/lowercasing both the characters, see
2227http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2228
2229=cut */
701a277b 2230I32
d07ddd77 2231Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2232{
97aff369 2233 dVAR;
e1ec3a88
AL
2234 register const U8 *p1 = (const U8*)s1;
2235 register const U8 *p2 = (const U8*)s2;
cbbf8932 2236 register const U8 *f1 = NULL;
2f73348c 2237 register const U8 *f2 = NULL;
cbbf8932
AL
2238 register U8 *e1 = NULL;
2239 register U8 *q1 = NULL;
2240 register U8 *e2 = NULL;
2241 register U8 *q2 = NULL;
d07ddd77 2242 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
2243 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2244 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2245 U8 natbuf[1+1];
2246 STRLEN foldlen1, foldlen2;
d07ddd77 2247 bool match;
332ddc25 2248
d07ddd77
JH
2249 if (pe1)
2250 e1 = *(U8**)pe1;
e1ec3a88
AL
2251 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2252 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2253 if (pe2)
2254 e2 = *(U8**)pe2;
e1ec3a88
AL
2255 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2256 f2 = (const U8*)s2 + l2;
d07ddd77
JH
2257
2258 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2259 return 1; /* mismatch; possible infinite loop or false positive */
2260
a6872d42
JH
2261 if (!u1 || !u2)
2262 natbuf[1] = 0; /* Need to terminate the buffer. */
2263
d07ddd77
JH
2264 while ((e1 == 0 || p1 < e1) &&
2265 (f1 == 0 || p1 < f1) &&
2266 (e2 == 0 || p2 < e2) &&
2267 (f2 == 0 || p2 < f2)) {
2268 if (n1 == 0) {
d7f013c8
JH
2269 if (u1)
2270 to_utf8_fold(p1, foldbuf1, &foldlen1);
2271 else {
809e8e66 2272 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2273 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2274 }
2275 q1 = foldbuf1;
d07ddd77 2276 n1 = foldlen1;
332ddc25 2277 }
d07ddd77 2278 if (n2 == 0) {
d7f013c8
JH
2279 if (u2)
2280 to_utf8_fold(p2, foldbuf2, &foldlen2);
2281 else {
809e8e66 2282 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2283 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2284 }
2285 q2 = foldbuf2;
d07ddd77 2286 n2 = foldlen2;
332ddc25 2287 }
d07ddd77
JH
2288 while (n1 && n2) {
2289 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2290 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2291 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2292 return 1; /* mismatch */
d07ddd77 2293 n1 -= UTF8SKIP(q1);
d7f013c8 2294 q1 += UTF8SKIP(q1);
d07ddd77 2295 n2 -= UTF8SKIP(q2);
d7f013c8 2296 q2 += UTF8SKIP(q2);
701a277b 2297 }
d07ddd77 2298 if (n1 == 0)
d7f013c8 2299 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2300 if (n2 == 0)
d7f013c8
JH
2301 p2 += u2 ? UTF8SKIP(p2) : 1;
2302
d2cc3551 2303 }
5469e704 2304
d07ddd77
JH
2305 /* A match is defined by all the scans that specified
2306 * an explicit length reaching their final goals. */
2307 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2308
2309 if (match) {
d07ddd77
JH
2310 if (pe1)
2311 *pe1 = (char*)p1;
2312 if (pe2)
2313 *pe2 = (char*)p2;
5469e704
JH
2314 }
2315
2316 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2317}
701a277b 2318
a49f32c6
NC
2319/*
2320 * Local variables:
2321 * c-indentation-style: bsd
2322 * c-basic-offset: 4
2323 * indent-tabs-mode: t
2324 * End:
2325 *
37442d52
RGS
2326 * ex: set ts=8 sts=4 sw=4 noet:
2327 */