This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Merge branch 'blead' into dual/Safe
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550
TC
20 * as is the custom in the West, if you wish to be answered?'
21 * --Gandalf, addressing Théoden's door wardens
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3
LW
33#include "perl.h"
34
a0c21aa1
JH
35#ifndef EBCDIC
36/* Separate prototypes needed because in ASCII systems these
37 * usually macros but they still are compiled as code, too. */
38PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
39PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
40#endif
41
27da23d5
JH
42static const char unees[] =
43 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 44
ccfc67b7
JH
45/*
46=head1 Unicode Support
a0ed51b3 47
166f8a29
DM
48This file contains various utility functions for manipulating UTF8-encoded
49strings. For the uninitiated, this is a method of representing arbitrary
61296642 50Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
51characters in the ASCII range are unmodified, and a zero byte never appears
52within non-zero characters.
166f8a29 53
87cea99e 54=for apidoc uvuni_to_utf8_flags
eebe1485 55
1e54db1a 56Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 57of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 58bytes available. The return value is the pointer to the byte after the
9041c2e3 59end of the new character. In other words,
eebe1485 60
b851fbc1
JH
61 d = uvuni_to_utf8_flags(d, uv, flags);
62
63or, in most cases,
64
9041c2e3 65 d = uvuni_to_utf8(d, uv);
eebe1485 66
b851fbc1
JH
67(which is equivalent to)
68
69 d = uvuni_to_utf8_flags(d, uv, 0);
70
eebe1485
SC
71is the recommended Unicode-aware way of saying
72
73 *(d++) = uv;
74
75=cut
76*/
77
dfe13c55 78U8 *
b851fbc1 79Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 80{
7918f24d
NC
81 PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
82
62961d2e 83 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
84 if (UNICODE_IS_SURROGATE(uv) &&
85 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 86 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
87 else if (
88 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
89 !(flags & UNICODE_ALLOW_FDD0))
90 ||
c867b360 91 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
92 !(flags & UNICODE_ALLOW_FFFF))) &&
93 /* UNICODE_ALLOW_SUPER includes
2a20b9da 94 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
95 ((uv <= PERL_UNICODE_MAX) ||
96 !(flags & UNICODE_ALLOW_SUPER))
97 )
9014280d 98 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
99 "Unicode character 0x%04"UVxf" is illegal", uv);
100 }
c4d5f83a 101 if (UNI_IS_INVARIANT(uv)) {
eb160463 102 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
103 return d;
104 }
2d331972 105#if defined(EBCDIC)
1d72bdf6
NIS
106 else {
107 STRLEN len = UNISKIP(uv);
108 U8 *p = d+len-1;
109 while (p > d) {
eb160463 110 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
111 uv >>= UTF_ACCUMULATION_SHIFT;
112 }
eb160463 113 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
114 return d+len;
115 }
116#else /* Non loop style */
a0ed51b3 117 if (uv < 0x800) {
eb160463
GS
118 *d++ = (U8)(( uv >> 6) | 0xc0);
119 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
120 return d;
121 }
122 if (uv < 0x10000) {
eb160463
GS
123 *d++ = (U8)(( uv >> 12) | 0xe0);
124 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
125 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
126 return d;
127 }
128 if (uv < 0x200000) {
eb160463
GS
129 *d++ = (U8)(( uv >> 18) | 0xf0);
130 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
132 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
133 return d;
134 }
135 if (uv < 0x4000000) {
eb160463
GS
136 *d++ = (U8)(( uv >> 24) | 0xf8);
137 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
138 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
139 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
140 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
141 return d;
142 }
143 if (uv < 0x80000000) {
eb160463
GS
144 *d++ = (U8)(( uv >> 30) | 0xfc);
145 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
146 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
147 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
148 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
149 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
150 return d;
151 }
6b8eaf93 152#ifdef HAS_QUAD
d7578b48 153 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
154#endif
155 {
eb160463
GS
156 *d++ = 0xfe; /* Can't match U+FEFF! */
157 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
162 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
163 return d;
164 }
6b8eaf93 165#ifdef HAS_QUAD
a0ed51b3 166 {
eb160463
GS
167 *d++ = 0xff; /* Can't match U+FFFE! */
168 *d++ = 0x80; /* 6 Reserved bits */
169 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
170 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
171 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
172 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
173 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
174 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
175 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
176 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
177 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
178 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
179 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
180 return d;
181 }
182#endif
1d72bdf6 183#endif /* Loop style */
a0ed51b3 184}
9041c2e3 185
646ca15d
JH
186/*
187
188Tests if some arbitrary number of bytes begins in a valid UTF-8
189character. Note that an INVARIANT (i.e. ASCII) character is a valid
190UTF-8 character. The actual number of bytes in the UTF-8 character
191will be returned if it is valid, otherwise 0.
192
193This is the "slow" version as opposed to the "fast" version which is
194the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
195difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
196or less you should use the IS_UTF8_CHAR(), for lengths of five or more
197you should use the _slow(). In practice this means that the _slow()
198will be used very rarely, since the maximum Unicode code point (as of
199Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
200the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
201five bytes or more.
202
203=cut */
c053b435 204STATIC STRLEN
5f66b61c 205S_is_utf8_char_slow(const U8 *s, const STRLEN len)
646ca15d
JH
206{
207 U8 u = *s;
208 STRLEN slen;
209 UV uv, ouv;
210
7918f24d
NC
211 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
212
646ca15d
JH
213 if (UTF8_IS_INVARIANT(u))
214 return 1;
215
216 if (!UTF8_IS_START(u))
217 return 0;
218
219 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
220 return 0;
221
222 slen = len - 1;
223 s++;
77263263
TS
224#ifdef EBCDIC
225 u = NATIVE_TO_UTF(u);
226#endif
646ca15d
JH
227 u &= UTF_START_MASK(len);
228 uv = u;
229 ouv = uv;
230 while (slen--) {
231 if (!UTF8_IS_CONTINUATION(*s))
232 return 0;
233 uv = UTF8_ACCUMULATE(uv, *s);
234 if (uv < ouv)
235 return 0;
236 ouv = uv;
237 s++;
238 }
239
240 if ((STRLEN)UNISKIP(uv) < len)
241 return 0;
242
243 return len;
244}
9041c2e3
NIS
245
246/*
87cea99e 247=for apidoc is_utf8_char
eebe1485 248
5da9da9e 249Tests if some arbitrary number of bytes begins in a valid UTF-8
2bbc8d55
SP
250character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
251character is a valid UTF-8 character. The actual number of bytes in the UTF-8
252character will be returned if it is valid, otherwise 0.
9041c2e3 253
82686b01 254=cut */
067a85ef 255STRLEN
7fc63493 256Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 257{
44f8325f 258 const STRLEN len = UTF8SKIP(s);
7918f24d
NC
259
260 PERL_ARGS_ASSERT_IS_UTF8_CHAR;
96a5add6 261 PERL_UNUSED_CONTEXT;
3b0fc154 262#ifdef IS_UTF8_CHAR
768c67ee 263 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
264 return IS_UTF8_CHAR(s, len) ? len : 0;
265#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 266 return is_utf8_char_slow(s, len);
386d01d6
GS
267}
268
6662521e 269/*
87cea99e 270=for apidoc is_utf8_string
6662521e 271
c9ada85f 272Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
273UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
274not mean 'a string that contains code points above 0x7F encoded in UTF-8'
275because a valid ASCII string is a valid UTF-8 string.
6662521e 276
768c67ee
JH
277See also is_utf8_string_loclen() and is_utf8_string_loc().
278
6662521e
GS
279=cut
280*/
281
8e84507e 282bool
7fc63493 283Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 284{
35da51f7 285 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 286 const U8* x = s;
067a85ef 287
7918f24d 288 PERL_ARGS_ASSERT_IS_UTF8_STRING;
96a5add6 289 PERL_UNUSED_CONTEXT;
1aa99e6b 290
6662521e 291 while (x < send) {
a3b680e6 292 STRLEN c;
1acdb0da
JH
293 /* Inline the easy bits of is_utf8_char() here for speed... */
294 if (UTF8_IS_INVARIANT(*x))
295 c = 1;
296 else if (!UTF8_IS_START(*x))
768c67ee 297 goto out;
1acdb0da
JH
298 else {
299 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
300#ifdef IS_UTF8_CHAR
301 c = UTF8SKIP(x);
768c67ee
JH
302 if (IS_UTF8_CHAR_FAST(c)) {
303 if (!IS_UTF8_CHAR(x, c))
3c614e38
AD
304 c = 0;
305 }
306 else
307 c = is_utf8_char_slow(x, c);
646ca15d
JH
308#else
309 c = is_utf8_char(x);
310#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 311 if (!c)
768c67ee 312 goto out;
1acdb0da 313 }
6662521e 314 x += c;
6662521e 315 }
768c67ee
JH
316
317 out:
60006e79
JH
318 if (x != send)
319 return FALSE;
067a85ef
A
320
321 return TRUE;
6662521e
GS
322}
323
67e989fb 324/*
814fafa7
NC
325Implemented as a macro in utf8.h
326
87cea99e 327=for apidoc is_utf8_string_loc
814fafa7
NC
328
329Like is_utf8_string() but stores the location of the failure (in the
330case of "utf8ness failure") or the location s+len (in the case of
331"utf8ness success") in the C<ep>.
332
333See also is_utf8_string_loclen() and is_utf8_string().
334
87cea99e 335=for apidoc is_utf8_string_loclen
81cd54e3 336
e3e4599f 337Like is_utf8_string() but stores the location of the failure (in the
768c67ee
JH
338case of "utf8ness failure") or the location s+len (in the case of
339"utf8ness success") in the C<ep>, and the number of UTF-8
340encoded characters in the C<el>.
341
342See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
343
344=cut
345*/
346
347bool
768c67ee 348Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 349{
35da51f7 350 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 351 const U8* x = s;
81cd54e3 352 STRLEN c;
3ebfea28 353 STRLEN outlen = 0;
7918f24d
NC
354
355 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
96a5add6 356 PERL_UNUSED_CONTEXT;
81cd54e3 357
81cd54e3
JH
358 while (x < send) {
359 /* Inline the easy bits of is_utf8_char() here for speed... */
360 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
361 c = 1;
362 else if (!UTF8_IS_START(*x))
363 goto out;
81cd54e3 364 else {
768c67ee
JH
365 /* ... and call is_utf8_char() only if really needed. */
366#ifdef IS_UTF8_CHAR
367 c = UTF8SKIP(x);
368 if (IS_UTF8_CHAR_FAST(c)) {
369 if (!IS_UTF8_CHAR(x, c))
370 c = 0;
371 } else
372 c = is_utf8_char_slow(x, c);
373#else
374 c = is_utf8_char(x);
375#endif /* #ifdef IS_UTF8_CHAR */
376 if (!c)
377 goto out;
81cd54e3 378 }
768c67ee 379 x += c;
3ebfea28 380 outlen++;
81cd54e3 381 }
768c67ee
JH
382
383 out:
3ebfea28
AL
384 if (el)
385 *el = outlen;
386
768c67ee
JH
387 if (ep)
388 *ep = x;
3ebfea28 389 return (x == send);
81cd54e3
JH
390}
391
392/*
768c67ee 393
87cea99e 394=for apidoc utf8n_to_uvuni
67e989fb 395
9041c2e3 396Bottom level UTF-8 decode routine.
38a44b82 397Returns the Unicode code point value of the first character in the string C<s>
1e54db1a 398which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 399C<retlen> will be set to the length, in bytes, of that character.
67e989fb 400
1e54db1a 401If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
402is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
403it is assumed that the caller will raise a warning, and this function
28d3d195
JH
404will silently just set C<retlen> to C<-1> and return zero. If the
405C<flags> does not contain UTF8_CHECK_ONLY, warnings about
406malformations will be given, C<retlen> will be set to the expected
407length of the UTF-8 character in bytes, and zero will be returned.
408
409The C<flags> can also contain various flags to allow deviations from
410the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 411
9041c2e3
NIS
412Most code should use utf8_to_uvchr() rather than call this directly.
413
37607a96
PK
414=cut
415*/
67e989fb 416
a0ed51b3 417UV
7fc63493 418Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 419{
97aff369 420 dVAR;
d4c19fe8 421 const U8 * const s0 = s;
9c5ffd7c 422 UV uv = *s, ouv = 0;
ba210ebe 423 STRLEN len = 1;
7fc63493
AL
424 const bool dowarn = ckWARN_d(WARN_UTF8);
425 const UV startbyte = *s;
ba210ebe 426 STRLEN expectlen = 0;
a0dbb045
JH
427 U32 warning = 0;
428
7918f24d
NC
429 PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
430
a0dbb045
JH
431/* This list is a superset of the UTF8_ALLOW_XXX. */
432
433#define UTF8_WARN_EMPTY 1
434#define UTF8_WARN_CONTINUATION 2
435#define UTF8_WARN_NON_CONTINUATION 3
436#define UTF8_WARN_FE_FF 4
437#define UTF8_WARN_SHORT 5
438#define UTF8_WARN_OVERFLOW 6
439#define UTF8_WARN_SURROGATE 7
c867b360
JH
440#define UTF8_WARN_LONG 8
441#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
442
443 if (curlen == 0 &&
444 !(flags & UTF8_ALLOW_EMPTY)) {
445 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
446 goto malformed;
447 }
448
1d72bdf6 449 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
450 if (retlen)
451 *retlen = 1;
c4d5f83a 452 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 453 }
67e989fb 454
421a8bf2 455 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 456 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 457 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
458 goto malformed;
459 }
460
421a8bf2 461 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 462 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 463 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
464 goto malformed;
465 }
9041c2e3 466
1d72bdf6 467#ifdef EBCDIC
75383841 468 uv = NATIVE_TO_UTF(uv);
1d72bdf6 469#else
fcc8fcf6
JH
470 if ((uv == 0xfe || uv == 0xff) &&
471 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 472 warning = UTF8_WARN_FE_FF;
ba210ebe 473 goto malformed;
a0ed51b3 474 }
1d72bdf6
NIS
475#endif
476
ba210ebe
JH
477 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
478 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
479 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
480 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
481#ifdef EBCDIC
482 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
483 else { len = 7; uv &= 0x01; }
484#else
ba210ebe
JH
485 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
486 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
487 else { len = 13; uv = 0; } /* whoa! */
488#endif
489
a0ed51b3
LW
490 if (retlen)
491 *retlen = len;
9041c2e3 492
ba210ebe
JH
493 expectlen = len;
494
fcc8fcf6
JH
495 if ((curlen < expectlen) &&
496 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 497 warning = UTF8_WARN_SHORT;
ba210ebe
JH
498 goto malformed;
499 }
500
501 len--;
a0ed51b3 502 s++;
ba210ebe
JH
503 ouv = uv;
504
a0ed51b3 505 while (len--) {
421a8bf2
JH
506 if (!UTF8_IS_CONTINUATION(*s) &&
507 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
508 s--;
509 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 510 goto malformed;
a0ed51b3
LW
511 }
512 else
8850bf83 513 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
514 if (!(uv > ouv)) {
515 /* These cannot be allowed. */
516 if (uv == ouv) {
75dbc644 517 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
518 warning = UTF8_WARN_LONG;
519 goto malformed;
520 }
521 }
522 else { /* uv < ouv */
523 /* This cannot be allowed. */
524 warning = UTF8_WARN_OVERFLOW;
525 goto malformed;
526 }
ba210ebe
JH
527 }
528 s++;
529 ouv = uv;
530 }
531
421a8bf2 532 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 533 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 534 warning = UTF8_WARN_SURROGATE;
ba210ebe 535 goto malformed;
eb160463 536 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 537 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 538 warning = UTF8_WARN_LONG;
ba210ebe 539 goto malformed;
421a8bf2 540 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 541 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 542 warning = UTF8_WARN_FFFF;
a9917092 543 goto malformed;
a0ed51b3 544 }
ba210ebe 545
a0ed51b3 546 return uv;
ba210ebe
JH
547
548malformed:
549
fcc8fcf6 550 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 551 if (retlen)
10edeb5d 552 *retlen = ((STRLEN) -1);
ba210ebe
JH
553 return 0;
554 }
555
a0dbb045 556 if (dowarn) {
84bafc02 557 SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
a0dbb045
JH
558
559 switch (warning) {
560 case 0: /* Intentionally empty. */ break;
561 case UTF8_WARN_EMPTY:
396482e1 562 sv_catpvs(sv, "(empty string)");
a0dbb045
JH
563 break;
564 case UTF8_WARN_CONTINUATION:
097fb8e2 565 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
566 break;
567 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
568 if (s == s0)
569 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
570 (UV)s[1], startbyte);
551405c4
AL
571 else {
572 const int len = (int)(s-s0);
097fb8e2 573 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4
AL
574 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
575 }
576
a0dbb045
JH
577 break;
578 case UTF8_WARN_FE_FF:
579 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
580 break;
581 case UTF8_WARN_SHORT:
097fb8e2 582 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 583 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 584 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
585 break;
586 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
587 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
588 ouv, *s, startbyte);
a0dbb045
JH
589 break;
590 case UTF8_WARN_SURROGATE:
591 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
592 break;
a0dbb045 593 case UTF8_WARN_LONG:
097fb8e2 594 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 595 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
596 break;
597 case UTF8_WARN_FFFF:
598 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
599 break;
600 default:
396482e1 601 sv_catpvs(sv, "(unknown reason)");
a0dbb045
JH
602 break;
603 }
604
605 if (warning) {
44f8325f 606 const char * const s = SvPVX_const(sv);
a0dbb045
JH
607
608 if (PL_op)
9014280d 609 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 610 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 611 else
9014280d 612 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
613 }
614 }
615
ba210ebe 616 if (retlen)
28d3d195 617 *retlen = expectlen ? expectlen : len;
ba210ebe 618
28d3d195 619 return 0;
a0ed51b3
LW
620}
621
8e84507e 622/*
87cea99e 623=for apidoc utf8_to_uvchr
9041c2e3
NIS
624
625Returns the native character value of the first character in the string C<s>
1e54db1a 626which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
627length, in bytes, of that character.
628
1e54db1a 629If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
630returned and retlen is set, if possible, to -1.
631
632=cut
633*/
634
635UV
7fc63493 636Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 637{
7918f24d
NC
638 PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
639
1754c1a1
NC
640 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
641 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
642}
643
644/*
87cea99e 645=for apidoc utf8_to_uvuni
9041c2e3
NIS
646
647Returns the Unicode code point of the first character in the string C<s>
1e54db1a 648which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
649length, in bytes, of that character.
650
2bbc8d55 651This function should only be used when the returned UV is considered
9041c2e3
NIS
652an index into the Unicode semantic tables (e.g. swashes).
653
1e54db1a 654If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 655returned and retlen is set, if possible, to -1.
8e84507e
NIS
656
657=cut
658*/
659
660UV
7fc63493 661Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 662{
7918f24d
NC
663 PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
664
9041c2e3 665 /* Call the low level routine asking for checks */
89ebb4a3 666 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 667 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
668}
669
b76347f2 670/*
87cea99e 671=for apidoc utf8_length
b76347f2
JH
672
673Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
674Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
675up past C<e>, croaks.
b76347f2
JH
676
677=cut
678*/
679
680STRLEN
35a4481c 681Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 682{
97aff369 683 dVAR;
b76347f2
JH
684 STRLEN len = 0;
685
7918f24d
NC
686 PERL_ARGS_ASSERT_UTF8_LENGTH;
687
8850bf83
JH
688 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
689 * the bitops (especially ~) can create illegal UTF-8.
690 * In other words: in Perl UTF-8 is not just for Unicode. */
691
a3b680e6
AL
692 if (e < s)
693 goto warn_and_return;
b76347f2 694 while (s < e) {
8e91ec7f
AV
695 if (!UTF8_IS_INVARIANT(*s))
696 s += UTF8SKIP(s);
697 else
698 s++;
699 len++;
700 }
701
702 if (e != s) {
703 len--;
704 warn_and_return:
705 if (ckWARN_d(WARN_UTF8)) {
706 if (PL_op)
707 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 708 "%s in %s", unees, OP_DESC(PL_op));
8e91ec7f
AV
709 else
710 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
901b21bf 711 }
b76347f2
JH
712 }
713
714 return len;
715}
716
b06226ff 717/*
87cea99e 718=for apidoc utf8_distance
b06226ff 719
1e54db1a 720Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
721and C<b>.
722
723WARNING: use only if you *know* that the pointers point inside the
724same UTF-8 buffer.
725
37607a96
PK
726=cut
727*/
a0ed51b3 728
02eb7b47 729IV
35a4481c 730Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 731{
7918f24d
NC
732 PERL_ARGS_ASSERT_UTF8_DISTANCE;
733
bf1665bc 734 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3
LW
735}
736
b06226ff 737/*
87cea99e 738=for apidoc utf8_hop
b06226ff 739
8850bf83
JH
740Return the UTF-8 pointer C<s> displaced by C<off> characters, either
741forward or backward.
b06226ff
JH
742
743WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
744the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
745on the first byte of character or just after the last byte of a character.
b06226ff 746
37607a96
PK
747=cut
748*/
a0ed51b3
LW
749
750U8 *
4373e329 751Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 752{
7918f24d
NC
753 PERL_ARGS_ASSERT_UTF8_HOP;
754
96a5add6 755 PERL_UNUSED_CONTEXT;
8850bf83
JH
756 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
757 * the bitops (especially ~) can create illegal UTF-8.
758 * In other words: in Perl UTF-8 is not just for Unicode. */
759
a0ed51b3
LW
760 if (off >= 0) {
761 while (off--)
762 s += UTF8SKIP(s);
763 }
764 else {
765 while (off++) {
766 s--;
8850bf83
JH
767 while (UTF8_IS_CONTINUATION(*s))
768 s--;
a0ed51b3
LW
769 }
770 }
4373e329 771 return (U8 *)s;
a0ed51b3
LW
772}
773
6940069f 774/*
87cea99e 775=for apidoc utf8_to_bytes
6940069f 776
2bbc8d55 777Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
246fae53
MG
778Unlike C<bytes_to_utf8>, this over-writes the original string, and
779updates len to contain the new length.
67e989fb 780Returns zero on failure, setting C<len> to -1.
6940069f 781
95be277c
NC
782If you need a copy of the string, see C<bytes_from_utf8>.
783
6940069f
GS
784=cut
785*/
786
787U8 *
37607a96 788Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 789{
d4c19fe8
AL
790 U8 * const save = s;
791 U8 * const send = s + *len;
6940069f 792 U8 *d;
246fae53 793
7918f24d
NC
794 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
795
1e54db1a 796 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 797 while (s < send) {
dcad2880
JH
798 U8 c = *s++;
799
1d72bdf6
NIS
800 if (!UTF8_IS_INVARIANT(c) &&
801 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
802 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
10edeb5d 803 *len = ((STRLEN) -1);
dcad2880
JH
804 return 0;
805 }
246fae53 806 }
dcad2880
JH
807
808 d = s = save;
6940069f 809 while (s < send) {
ed646e6e 810 STRLEN ulen;
9041c2e3 811 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 812 s += ulen;
6940069f
GS
813 }
814 *d = '\0';
246fae53 815 *len = d - save;
6940069f
GS
816 return save;
817}
818
819/*
87cea99e 820=for apidoc bytes_from_utf8
f9a63242 821
2bbc8d55 822Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
35a4481c 823Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
824the newly-created string, and updates C<len> to contain the new
825length. Returns the original string if no conversion occurs, C<len>
826is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55
SP
8270 if C<s> is converted or consisted entirely of characters that are invariant
828in utf8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 829
37607a96
PK
830=cut
831*/
f9a63242
JH
832
833U8 *
e1ec3a88 834Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 835{
f9a63242 836 U8 *d;
e1ec3a88
AL
837 const U8 *start = s;
838 const U8 *send;
f9a63242
JH
839 I32 count = 0;
840
7918f24d
NC
841 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
842
96a5add6 843 PERL_UNUSED_CONTEXT;
f9a63242 844 if (!*is_utf8)
73d840c0 845 return (U8 *)start;
f9a63242 846
1e54db1a 847 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 848 for (send = s + *len; s < send;) {
e1ec3a88 849 U8 c = *s++;
1d72bdf6 850 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
851 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
852 (c = *s++) && UTF8_IS_CONTINUATION(c))
853 count++;
854 else
73d840c0 855 return (U8 *)start;
db42d148 856 }
f9a63242
JH
857 }
858
35da51f7 859 *is_utf8 = FALSE;
f9a63242 860
212542aa 861 Newx(d, (*len) - count + 1, U8);
ef9edfd0 862 s = start; start = d;
f9a63242
JH
863 while (s < send) {
864 U8 c = *s++;
c4d5f83a
NIS
865 if (!UTF8_IS_INVARIANT(c)) {
866 /* Then it is two-byte encoded */
867 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
868 c = ASCII_TO_NATIVE(c);
869 }
870 *d++ = c;
f9a63242
JH
871 }
872 *d = '\0';
873 *len = d - start;
73d840c0 874 return (U8 *)start;
f9a63242
JH
875}
876
877/*
87cea99e 878=for apidoc bytes_to_utf8
6940069f 879
2bbc8d55 880Converts a string C<s> of length C<len> from the native encoding into UTF-8.
6662521e
GS
881Returns a pointer to the newly-created string, and sets C<len> to
882reflect the new length.
6940069f 883
2bbc8d55
SP
884A NUL character will be written after the end of the string.
885
886If you want to convert to UTF-8 from encodings other than
887the native (Latin1 or EBCDIC),
c9ada85f
JH
888see sv_recode_to_utf8().
889
497711e7 890=cut
6940069f
GS
891*/
892
893U8*
35a4481c 894Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 895{
35a4481c 896 const U8 * const send = s + (*len);
6940069f
GS
897 U8 *d;
898 U8 *dst;
7918f24d
NC
899
900 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 901 PERL_UNUSED_CONTEXT;
6940069f 902
212542aa 903 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
904 dst = d;
905
906 while (s < send) {
35a4481c 907 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 908 if (UNI_IS_INVARIANT(uv))
eb160463 909 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 910 else {
eb160463
GS
911 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
912 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
913 }
914 }
915 *d = '\0';
6662521e 916 *len = d-dst;
6940069f
GS
917 return dst;
918}
919
a0ed51b3 920/*
dea0fc0b 921 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
922 *
923 * Destination must be pre-extended to 3/2 source. Do not use in-place.
924 * We optimize for native, for obvious reasons. */
925
926U8*
dea0fc0b 927Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 928{
dea0fc0b
JH
929 U8* pend;
930 U8* dstart = d;
931
7918f24d
NC
932 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
933
1de9afcd
RGS
934 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
935 d[0] = 0;
936 *newlen = 1;
937 return d;
938 }
939
dea0fc0b 940 if (bytelen & 1)
f5992bc4 941 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
942
943 pend = p + bytelen;
944
a0ed51b3 945 while (p < pend) {
dea0fc0b
JH
946 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
947 p += 2;
a0ed51b3 948 if (uv < 0x80) {
e294cc5d
JH
949#ifdef EBCDIC
950 *d++ = UNI_TO_NATIVE(uv);
951#else
eb160463 952 *d++ = (U8)uv;
e294cc5d 953#endif
a0ed51b3
LW
954 continue;
955 }
956 if (uv < 0x800) {
eb160463
GS
957 *d++ = (U8)(( uv >> 6) | 0xc0);
958 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
959 continue;
960 }
961 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
TD
962 UV low = (p[0] << 8) + p[1];
963 p += 2;
dea0fc0b
JH
964 if (low < 0xdc00 || low >= 0xdfff)
965 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
966 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
967 }
968 if (uv < 0x10000) {
eb160463
GS
969 *d++ = (U8)(( uv >> 12) | 0xe0);
970 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
971 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
972 continue;
973 }
974 else {
eb160463
GS
975 *d++ = (U8)(( uv >> 18) | 0xf0);
976 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
977 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
978 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
979 continue;
980 }
981 }
dea0fc0b 982 *newlen = d - dstart;
a0ed51b3
LW
983 return d;
984}
985
986/* Note: this one is slightly destructive of the source. */
987
988U8*
dea0fc0b 989Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
990{
991 U8* s = (U8*)p;
d4c19fe8 992 U8* const send = s + bytelen;
7918f24d
NC
993
994 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
995
a0ed51b3 996 while (s < send) {
d4c19fe8 997 const U8 tmp = s[0];
a0ed51b3
LW
998 s[0] = s[1];
999 s[1] = tmp;
1000 s += 2;
1001 }
dea0fc0b 1002 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
1003}
1004
1005/* for now these are all defined (inefficiently) in terms of the utf8 versions */
1006
1007bool
84afefe6 1008Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 1009{
89ebb4a3 1010 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1011 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1012 return is_utf8_alnum(tmpbuf);
1013}
1014
1015bool
84afefe6 1016Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 1017{
89ebb4a3 1018 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1019 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1020 return is_utf8_alnumc(tmpbuf);
1021}
1022
1023bool
84afefe6 1024Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1025{
89ebb4a3 1026 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1027 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1028 return is_utf8_idfirst(tmpbuf);
1029}
1030
1031bool
84afefe6 1032Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1033{
89ebb4a3 1034 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1035 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1036 return is_utf8_alpha(tmpbuf);
1037}
1038
1039bool
84afefe6 1040Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1041{
89ebb4a3 1042 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1043 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1044 return is_utf8_ascii(tmpbuf);
1045}
1046
1047bool
84afefe6 1048Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1049{
89ebb4a3 1050 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1051 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1052 return is_utf8_space(tmpbuf);
1053}
1054
1055bool
84afefe6 1056Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1057{
89ebb4a3 1058 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1059 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1060 return is_utf8_digit(tmpbuf);
1061}
1062
1063bool
84afefe6 1064Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1065{
89ebb4a3 1066 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1067 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1068 return is_utf8_upper(tmpbuf);
1069}
1070
1071bool
84afefe6 1072Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1073{
89ebb4a3 1074 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1075 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1076 return is_utf8_lower(tmpbuf);
1077}
1078
1079bool
84afefe6 1080Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1081{
89ebb4a3 1082 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1083 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1084 return is_utf8_cntrl(tmpbuf);
1085}
1086
1087bool
84afefe6 1088Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1089{
89ebb4a3 1090 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1091 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1092 return is_utf8_graph(tmpbuf);
1093}
1094
1095bool
84afefe6 1096Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1097{
89ebb4a3 1098 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1099 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1100 return is_utf8_print(tmpbuf);
1101}
1102
b8c5462f 1103bool
84afefe6 1104Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1105{
89ebb4a3 1106 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1107 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1108 return is_utf8_punct(tmpbuf);
1109}
1110
4d61ec05 1111bool
84afefe6 1112Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1113{
89ebb4a3 1114 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1115 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1116 return is_utf8_xdigit(tmpbuf);
1117}
1118
84afefe6
JH
1119UV
1120Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1121{
7918f24d
NC
1122 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1123
0ebc6274
JH
1124 uvchr_to_utf8(p, c);
1125 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1126}
1127
84afefe6
JH
1128UV
1129Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1130{
7918f24d
NC
1131 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1132
0ebc6274
JH
1133 uvchr_to_utf8(p, c);
1134 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1135}
1136
84afefe6
JH
1137UV
1138Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1139{
7918f24d
NC
1140 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1141
0ebc6274
JH
1142 uvchr_to_utf8(p, c);
1143 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1144}
1145
84afefe6
JH
1146UV
1147Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1148{
7918f24d
NC
1149 PERL_ARGS_ASSERT_TO_UNI_FOLD;
1150
0ebc6274
JH
1151 uvchr_to_utf8(p, c);
1152 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1153}
1154
a0ed51b3
LW
1155/* for now these all assume no locale info available for Unicode > 255 */
1156
1157bool
84afefe6 1158Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1159{
1160 return is_uni_alnum(c); /* XXX no locale support yet */
1161}
1162
1163bool
84afefe6 1164Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1165{
1166 return is_uni_alnumc(c); /* XXX no locale support yet */
1167}
1168
1169bool
84afefe6 1170Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1171{
1172 return is_uni_idfirst(c); /* XXX no locale support yet */
1173}
1174
1175bool
84afefe6 1176Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1177{
1178 return is_uni_alpha(c); /* XXX no locale support yet */
1179}
1180
1181bool
84afefe6 1182Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1183{
1184 return is_uni_ascii(c); /* XXX no locale support yet */
1185}
1186
1187bool
84afefe6 1188Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1189{
1190 return is_uni_space(c); /* XXX no locale support yet */
1191}
1192
1193bool
84afefe6 1194Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1195{
1196 return is_uni_digit(c); /* XXX no locale support yet */
1197}
1198
1199bool
84afefe6 1200Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1201{
1202 return is_uni_upper(c); /* XXX no locale support yet */
1203}
1204
1205bool
84afefe6 1206Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1207{
1208 return is_uni_lower(c); /* XXX no locale support yet */
1209}
1210
1211bool
84afefe6 1212Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1213{
1214 return is_uni_cntrl(c); /* XXX no locale support yet */
1215}
1216
1217bool
84afefe6 1218Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1219{
1220 return is_uni_graph(c); /* XXX no locale support yet */
1221}
1222
1223bool
84afefe6 1224Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1225{
1226 return is_uni_print(c); /* XXX no locale support yet */
1227}
1228
b8c5462f 1229bool
84afefe6 1230Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1231{
1232 return is_uni_punct(c); /* XXX no locale support yet */
1233}
1234
4d61ec05 1235bool
84afefe6 1236Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1237{
1238 return is_uni_xdigit(c); /* XXX no locale support yet */
1239}
1240
b7ac61fa
JH
1241U32
1242Perl_to_uni_upper_lc(pTHX_ U32 c)
1243{
ee099d14
JH
1244 /* XXX returns only the first character -- do not use XXX */
1245 /* XXX no locale support yet */
1246 STRLEN len;
89ebb4a3 1247 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1248 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1249}
1250
1251U32
1252Perl_to_uni_title_lc(pTHX_ U32 c)
1253{
ee099d14
JH
1254 /* XXX returns only the first character XXX -- do not use XXX */
1255 /* XXX no locale support yet */
1256 STRLEN len;
89ebb4a3 1257 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1258 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1259}
1260
1261U32
1262Perl_to_uni_lower_lc(pTHX_ U32 c)
1263{
ee099d14
JH
1264 /* XXX returns only the first character -- do not use XXX */
1265 /* XXX no locale support yet */
1266 STRLEN len;
89ebb4a3 1267 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1268 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1269}
1270
7452cf6a 1271static bool
5141f98e 1272S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
1273 const char *const swashname)
1274{
97aff369 1275 dVAR;
7918f24d
NC
1276
1277 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1278
bde6a22d
NC
1279 if (!is_utf8_char(p))
1280 return FALSE;
1281 if (!*swash)
711a919c 1282 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
bde6a22d
NC
1283 return swash_fetch(*swash, p, TRUE) != 0;
1284}
1285
1286bool
7fc63493 1287Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1288{
97aff369 1289 dVAR;
7918f24d
NC
1290
1291 PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1292
671c33bf
NC
1293 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1294 * descendant of isalnum(3), in other words, it doesn't
1295 * contain the '_'. --jhi */
d4c19fe8 1296 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
a0ed51b3
LW
1297}
1298
1299bool
7fc63493 1300Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1301{
97aff369 1302 dVAR;
7918f24d
NC
1303
1304 PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
1305
d4c19fe8 1306 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
b8c5462f
JH
1307}
1308
1309bool
7fc63493 1310Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1311{
97aff369 1312 dVAR;
7918f24d
NC
1313
1314 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1315
82686b01
JH
1316 if (*p == '_')
1317 return TRUE;
bde6a22d 1318 /* is_utf8_idstart would be more logical. */
d4c19fe8 1319 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1320}
1321
1322bool
7fc63493 1323Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1324{
97aff369 1325 dVAR;
7918f24d
NC
1326
1327 PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1328
82686b01
JH
1329 if (*p == '_')
1330 return TRUE;
d4c19fe8 1331 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1332}
1333
1334bool
7fc63493 1335Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1336{
97aff369 1337 dVAR;
7918f24d
NC
1338
1339 PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1340
d4c19fe8 1341 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1342}
1343
1344bool
7fc63493 1345Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1346{
97aff369 1347 dVAR;
7918f24d
NC
1348
1349 PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1350
d4c19fe8 1351 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
b8c5462f
JH
1352}
1353
1354bool
7fc63493 1355Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1356{
97aff369 1357 dVAR;
7918f24d
NC
1358
1359 PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1360
d4c19fe8 1361 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3
LW
1362}
1363
1364bool
7fc63493 1365Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1366{
97aff369 1367 dVAR;
7918f24d
NC
1368
1369 PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1370
d4c19fe8 1371 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1372}
1373
1374bool
7fc63493 1375Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1376{
97aff369 1377 dVAR;
7918f24d
NC
1378
1379 PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1380
d4c19fe8 1381 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1382}
1383
1384bool
7fc63493 1385Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1386{
97aff369 1387 dVAR;
7918f24d
NC
1388
1389 PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1390
d4c19fe8 1391 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1392}
1393
1394bool
7fc63493 1395Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1396{
97aff369 1397 dVAR;
7918f24d
NC
1398
1399 PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1400
d4c19fe8 1401 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f
JH
1402}
1403
1404bool
7fc63493 1405Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1406{
97aff369 1407 dVAR;
7918f24d
NC
1408
1409 PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1410
d4c19fe8 1411 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1412}
1413
1414bool
7fc63493 1415Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1416{
97aff369 1417 dVAR;
7918f24d
NC
1418
1419 PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1420
d4c19fe8 1421 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1422}
1423
1424bool
7fc63493 1425Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1426{
97aff369 1427 dVAR;
7918f24d
NC
1428
1429 PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1430
d4c19fe8 1431 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1432}
1433
1434bool
7fc63493 1435Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1436{
97aff369 1437 dVAR;
7918f24d
NC
1438
1439 PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1440
d4c19fe8 1441 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f
JH
1442}
1443
1444bool
7fc63493 1445Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1446{
97aff369 1447 dVAR;
7918f24d
NC
1448
1449 PERL_ARGS_ASSERT_IS_UTF8_MARK;
1450
d4c19fe8 1451 return is_utf8_common(p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1452}
1453
6b5c0936 1454/*
87cea99e 1455=for apidoc to_utf8_case
6b5c0936
JH
1456
1457The "p" contains the pointer to the UTF-8 string encoding
1458the character that is being converted.
1459
1460The "ustrp" is a pointer to the character buffer to put the
1461conversion result to. The "lenp" is a pointer to the length
1462of the result.
1463
0134edef 1464The "swashp" is a pointer to the swash to use.
6b5c0936 1465
0134edef 1466Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
8fe4d5b2 1467and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 1468but not always, a multicharacter mapping), is tried first.
6b5c0936 1469
0134edef
JH
1470The "special" is a string like "utf8::ToSpecLower", which means the
1471hash %utf8::ToSpecLower. The access to the hash is through
1472Perl_to_utf8_case().
6b5c0936 1473
0134edef
JH
1474The "normal" is a string like "ToLower" which means the swash
1475%utf8::ToLower.
1476
1477=cut */
6b5c0936 1478
2104c8d9 1479UV
9a957fbc
AL
1480Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1481 SV **swashp, const char *normal, const char *special)
a0ed51b3 1482{
97aff369 1483 dVAR;
89ebb4a3 1484 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1485 STRLEN len = 0;
aec46f14 1486 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1487 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1488 * are necessary in EBCDIC, they are redundant no-ops
1489 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1490 const UV uv1 = NATIVE_TO_UNI(uv0);
7918f24d
NC
1491
1492 PERL_ARGS_ASSERT_TO_UTF8_CASE;
1493
1feea2c7 1494 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1495
1496 if (!*swashp) /* load on-demand */
1497 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1498
b08cf34e
JH
1499 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1500 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1501 /* It might be "special" (sometimes, but not always,
2a37f04d 1502 * a multicharacter mapping) */
6673a63c 1503 HV * const hv = get_hv(special, 0);
b08cf34e
JH
1504 SV **svp;
1505
35da51f7 1506 if (hv &&
b08cf34e
JH
1507 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1508 (*svp)) {
cfd0369c 1509 const char *s;
47654450 1510
cfd0369c 1511 s = SvPV_const(*svp, len);
47654450
JH
1512 if (len == 1)
1513 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1514 else {
2f9475ad
JH
1515#ifdef EBCDIC
1516 /* If we have EBCDIC we need to remap the characters
1517 * since any characters in the low 256 are Unicode
1518 * code points, not EBCDIC. */
7cda7a3d 1519 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1520
1521 d = tmpbuf;
b08cf34e 1522 if (SvUTF8(*svp)) {
2f9475ad
JH
1523 STRLEN tlen = 0;
1524
1525 while (t < tend) {
d4c19fe8 1526 const UV c = utf8_to_uvchr(t, &tlen);
2f9475ad
JH
1527 if (tlen > 0) {
1528 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1529 t += tlen;
1530 }
1531 else
1532 break;
1533 }
1534 }
1535 else {
36fec512
JH
1536 while (t < tend) {
1537 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1538 t++;
1539 }
2f9475ad
JH
1540 }
1541 len = d - tmpbuf;
1542 Copy(tmpbuf, ustrp, len, U8);
1543#else
d2dcd0fb 1544 Copy(s, ustrp, len, U8);
2f9475ad 1545#endif
29e98929 1546 }
983ffd37 1547 }
0134edef
JH
1548 }
1549
1550 if (!len && *swashp) {
d4c19fe8
AL
1551 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1552
0134edef
JH
1553 if (uv2) {
1554 /* It was "normal" (a single character mapping). */
d4c19fe8 1555 const UV uv3 = UNI_TO_NATIVE(uv2);
e9101d72 1556 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1557 }
1558 }
1feea2c7 1559
0134edef
JH
1560 if (!len) /* Neither: just copy. */
1561 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1562
2a37f04d
JH
1563 if (lenp)
1564 *lenp = len;
1565
0134edef 1566 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1567}
1568
d3e79532 1569/*
87cea99e 1570=for apidoc to_utf8_upper
d3e79532
JH
1571
1572Convert the UTF-8 encoded character at p to its uppercase version and
1573store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1574that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1575the uppercase version may be longer than the original character.
d3e79532
JH
1576
1577The first character of the uppercased version is returned
1578(but note, as explained above, that there may be more.)
1579
1580=cut */
1581
2104c8d9 1582UV
7fc63493 1583Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1584{
97aff369 1585 dVAR;
7918f24d
NC
1586
1587 PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1588
983ffd37 1589 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1590 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1591}
a0ed51b3 1592
d3e79532 1593/*
87cea99e 1594=for apidoc to_utf8_title
d3e79532
JH
1595
1596Convert the UTF-8 encoded character at p to its titlecase version and
1597store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1598that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1599titlecase version may be longer than the original character.
d3e79532
JH
1600
1601The first character of the titlecased version is returned
1602(but note, as explained above, that there may be more.)
1603
1604=cut */
1605
983ffd37 1606UV
7fc63493 1607Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1608{
97aff369 1609 dVAR;
7918f24d
NC
1610
1611 PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1612
983ffd37 1613 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1614 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1615}
1616
d3e79532 1617/*
87cea99e 1618=for apidoc to_utf8_lower
d3e79532
JH
1619
1620Convert the UTF-8 encoded character at p to its lowercase version and
1621store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1622that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1623lowercase version may be longer than the original character.
d3e79532
JH
1624
1625The first character of the lowercased version is returned
1626(but note, as explained above, that there may be more.)
1627
1628=cut */
1629
2104c8d9 1630UV
7fc63493 1631Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1632{
97aff369 1633 dVAR;
7918f24d
NC
1634
1635 PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1636
983ffd37 1637 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1638 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1639}
1640
d3e79532 1641/*
87cea99e 1642=for apidoc to_utf8_fold
d3e79532
JH
1643
1644Convert the UTF-8 encoded character at p to its foldcase version and
1645store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1646that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1647foldcase version may be longer than the original character (up to
1648three characters).
1649
1650The first character of the foldcased version is returned
1651(but note, as explained above, that there may be more.)
1652
1653=cut */
1654
b4e400f9 1655UV
7fc63493 1656Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1657{
97aff369 1658 dVAR;
7918f24d
NC
1659
1660 PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1661
b4e400f9
JH
1662 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1663 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1664}
1665
711a919c
TS
1666/* Note:
1667 * A "swash" is a swatch hash.
1668 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1669 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1670 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1671 */
a0ed51b3 1672SV*
7fc63493 1673Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1674{
27da23d5 1675 dVAR;
a0ed51b3 1676 SV* retval;
8e84507e 1677 dSP;
7fc63493
AL
1678 const size_t pkg_len = strlen(pkg);
1679 const size_t name_len = strlen(name);
da51bb9b 1680 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
f8be5cf0 1681 SV* errsv_save;
ce3b816e 1682
7918f24d
NC
1683 PERL_ARGS_ASSERT_SWASH_INIT;
1684
96ca9f55
DM
1685 PUSHSTACKi(PERLSI_MAGIC);
1686 ENTER;
1687 SAVEI32(PL_hints);
1688 PL_hints = 0;
1689 save_re_context();
1b026014 1690 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1691 ENTER;
f8be5cf0 1692 errsv_save = newSVsv(ERRSV);
dc0c6abb
NC
1693 /* It is assumed that callers of this routine are not passing in any
1694 user derived data. */
1695 /* Need to do this after save_re_context() as it will set PL_tainted to
1696 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1697 Even line to create errsv_save can turn on PL_tainted. */
1698 SAVEBOOL(PL_tainted);
1699 PL_tainted = 0;
71bed85a 1700 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
a0714e2c 1701 NULL);
f8be5cf0
JH
1702 if (!SvTRUE(ERRSV))
1703 sv_setsv(ERRSV, errsv_save);
1704 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1705 LEAVE;
1706 }
1707 SPAGAIN;
a0ed51b3
LW
1708 PUSHMARK(SP);
1709 EXTEND(SP,5);
6e449a3a
MHM
1710 mPUSHp(pkg, pkg_len);
1711 mPUSHp(name, name_len);
a0ed51b3 1712 PUSHs(listsv);
6e449a3a
MHM
1713 mPUSHi(minbits);
1714 mPUSHi(none);
a0ed51b3 1715 PUTBACK;
f8be5cf0 1716 errsv_save = newSVsv(ERRSV);
864dbfa3 1717 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1718 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1719 else
e24b16f9 1720 retval = &PL_sv_undef;
f8be5cf0
JH
1721 if (!SvTRUE(ERRSV))
1722 sv_setsv(ERRSV, errsv_save);
1723 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1724 LEAVE;
1725 POPSTACK;
923e4eb5 1726 if (IN_PERL_COMPILETIME) {
623e6609 1727 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1728 }
bc45ce41
JH
1729 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1730 if (SvPOK(retval))
35c1215d 1731 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
be2597df 1732 SVfARG(retval));
cea2e8a9 1733 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1734 }
a0ed51b3
LW
1735 return retval;
1736}
1737
035d37be
JH
1738
1739/* This API is wrong for special case conversions since we may need to
1740 * return several Unicode characters for a single Unicode character
1741 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1742 * the lower-level routine, and it is similarly broken for returning
1743 * multiple values. --jhi */
979f2922 1744/* Now SWASHGET is recasted into S_swash_get in this file. */
680c470c
TS
1745
1746/* Note:
1747 * Returns the value of property/mapping C<swash> for the first character
1748 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1749 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1750 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1751 */
a0ed51b3 1752UV
680c470c 1753Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 1754{
27da23d5 1755 dVAR;
ef8f7699 1756 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
1757 U32 klen;
1758 U32 off;
a0ed51b3 1759 STRLEN slen;
7d85a32c 1760 STRLEN needents;
cfd0369c 1761 const U8 *tmps = NULL;
a0ed51b3 1762 U32 bit;
979f2922 1763 SV *swatch;
3568d838 1764 U8 tmputf8[2];
35da51f7 1765 const UV c = NATIVE_TO_ASCII(*ptr);
3568d838 1766
7918f24d
NC
1767 PERL_ARGS_ASSERT_SWASH_FETCH;
1768
3568d838 1769 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
979f2922
TS
1770 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1771 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1772 ptr = tmputf8;
3568d838
JH
1773 }
1774 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1775 * then the "swatch" is a vec() for al the chars which start
1776 * with 0xAA..0xYY
1777 * So the key in the hash (klen) is length of encoded char -1
1778 */
1779 klen = UTF8SKIP(ptr) - 1;
1780 off = ptr[klen];
a0ed51b3 1781
979f2922 1782 if (klen == 0) {
7d85a32c 1783 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1784 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1785 */
979f2922
TS
1786 needents = UTF_CONTINUATION_MARK;
1787 off = NATIVE_TO_UTF(ptr[klen]);
1788 }
1789 else {
7d85a32c 1790 /* If char is encoded then swatch is for the prefix */
979f2922
TS
1791 needents = (1 << UTF_ACCUMULATION_SHIFT);
1792 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1793 }
7d85a32c 1794
a0ed51b3
LW
1795 /*
1796 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1797 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1798 * it's nothing to sniff at.) Pity we usually come through at least
1799 * two function calls to get here...
1800 *
1801 * NB: this code assumes that swatches are never modified, once generated!
1802 */
1803
3568d838 1804 if (hv == PL_last_swash_hv &&
a0ed51b3 1805 klen == PL_last_swash_klen &&
27da23d5 1806 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1807 {
1808 tmps = PL_last_swash_tmps;
1809 slen = PL_last_swash_slen;
1810 }
1811 else {
1812 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1813 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1814
979f2922
TS
1815 /* If not cached, generate it via swash_get */
1816 if (!svp || !SvPOK(*svp)
1817 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0
NIS
1818 /* We use utf8n_to_uvuni() as we want an index into
1819 Unicode tables, not a native character number.
1820 */
aec46f14 1821 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1822 ckWARN(WARN_UTF8) ?
1823 0 : UTF8_ALLOW_ANY);
680c470c 1824 swatch = swash_get(swash,
979f2922
TS
1825 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1826 (klen) ? (code_point & ~(needents - 1)) : 0,
1827 needents);
1828
923e4eb5 1829 if (IN_PERL_COMPILETIME)
623e6609 1830 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1831
979f2922 1832 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 1833
979f2922
TS
1834 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1835 || (slen << 3) < needents)
660a4616 1836 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
a0ed51b3
LW
1837 }
1838
1839 PL_last_swash_hv = hv;
16d8f38a 1840 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 1841 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
1842 /* FIXME change interpvar.h? */
1843 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1844 PL_last_swash_slen = slen;
1845 if (klen)
1846 Copy(ptr, PL_last_swash_key, klen, U8);
1847 }
1848
9faf8d75 1849 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1850 case 1:
1851 bit = 1 << (off & 7);
1852 off >>= 3;
1853 return (tmps[off] & bit) != 0;
1854 case 8:
1855 return tmps[off];
1856 case 16:
1857 off <<= 1;
1858 return (tmps[off] << 8) + tmps[off + 1] ;
1859 case 32:
1860 off <<= 2;
1861 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1862 }
660a4616 1863 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
670f1322 1864 NORETURN_FUNCTION_END;
a0ed51b3 1865}
2b9d42f0 1866
979f2922
TS
1867/* Note:
1868 * Returns a swatch (a bit vector string) for a code point sequence
1869 * that starts from the value C<start> and comprises the number C<span>.
1870 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1871 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1872 */
1873STATIC SV*
1874S_swash_get(pTHX_ SV* swash, UV start, UV span)
1875{
1876 SV *swatch;
711a919c 1877 U8 *l, *lend, *x, *xend, *s;
979f2922 1878 STRLEN lcur, xcur, scur;
ef8f7699 1879 HV *const hv = MUTABLE_HV(SvRV(swash));
017a3ce5
GA
1880 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1881 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1882 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1883 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1884 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
0bd48802
AL
1885 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1886 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1887 const STRLEN bits = SvUV(*bitssvp);
1888 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1889 const UV none = SvUV(*nonesvp);
1890 const UV end = start + span;
979f2922 1891
7918f24d
NC
1892 PERL_ARGS_ASSERT_SWASH_GET;
1893
979f2922 1894 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
660a4616
TS
1895 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1896 (UV)bits);
979f2922
TS
1897 }
1898
1899 /* create and initialize $swatch */
979f2922 1900 scur = octets ? (span * octets) : (span + 7) / 8;
e524fe40
NC
1901 swatch = newSV(scur);
1902 SvPOK_on(swatch);
979f2922
TS
1903 s = (U8*)SvPVX(swatch);
1904 if (octets && none) {
0bd48802 1905 const U8* const e = s + scur;
979f2922
TS
1906 while (s < e) {
1907 if (bits == 8)
1908 *s++ = (U8)(none & 0xff);
1909 else if (bits == 16) {
1910 *s++ = (U8)((none >> 8) & 0xff);
1911 *s++ = (U8)( none & 0xff);
1912 }
1913 else if (bits == 32) {
1914 *s++ = (U8)((none >> 24) & 0xff);
1915 *s++ = (U8)((none >> 16) & 0xff);
1916 *s++ = (U8)((none >> 8) & 0xff);
1917 *s++ = (U8)( none & 0xff);
1918 }
1919 }
1920 *s = '\0';
1921 }
1922 else {
1923 (void)memzero((U8*)s, scur + 1);
1924 }
1925 SvCUR_set(swatch, scur);
1926 s = (U8*)SvPVX(swatch);
1927
1928 /* read $swash->{LIST} */
1929 l = (U8*)SvPV(*listsvp, lcur);
1930 lend = l + lcur;
1931 while (l < lend) {
35da51f7 1932 UV min, max, val;
979f2922
TS
1933 STRLEN numlen;
1934 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1935
0bd48802 1936 U8* const nl = (U8*)memchr(l, '\n', lend - l);
979f2922
TS
1937
1938 numlen = lend - l;
1939 min = grok_hex((char *)l, &numlen, &flags, NULL);
1940 if (numlen)
1941 l += numlen;
1942 else if (nl) {
1943 l = nl + 1; /* 1 is length of "\n" */
1944 continue;
1945 }
1946 else {
1947 l = lend; /* to LIST's end at which \n is not found */
1948 break;
1949 }
1950
1951 if (isBLANK(*l)) {
1952 ++l;
1953 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1954 numlen = lend - l;
1955 max = grok_hex((char *)l, &numlen, &flags, NULL);
1956 if (numlen)
1957 l += numlen;
1958 else
1959 max = min;
1960
1961 if (octets) {
1962 if (isBLANK(*l)) {
1963 ++l;
1964 flags = PERL_SCAN_SILENT_ILLDIGIT |
1965 PERL_SCAN_DISALLOW_PREFIX;
1966 numlen = lend - l;
1967 val = grok_hex((char *)l, &numlen, &flags, NULL);
1968 if (numlen)
1969 l += numlen;
1970 else
1971 val = 0;
1972 }
1973 else {
1974 val = 0;
1975 if (typeto) {
1976 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1977 typestr, l);
1978 }
1979 }
1980 }
711a919c
TS
1981 else
1982 val = 0; /* bits == 1, then val should be ignored */
979f2922
TS
1983 }
1984 else {
1985 max = min;
1986 if (octets) {
1987 val = 0;
1988 if (typeto) {
1989 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1990 }
1991 }
711a919c
TS
1992 else
1993 val = 0; /* bits == 1, then val should be ignored */
979f2922
TS
1994 }
1995
1996 if (nl)
1997 l = nl + 1;
1998 else
1999 l = lend;
2000
2001 if (max < start)
2002 continue;
2003
2004 if (octets) {
35da51f7 2005 UV key;
979f2922
TS
2006 if (min < start) {
2007 if (!none || val < none) {
2008 val += start - min;
2009 }
2010 min = start;
2011 }
2012 for (key = min; key <= max; key++) {
2013 STRLEN offset;
2014 if (key >= end)
2015 goto go_out_list;
2016 /* offset must be non-negative (start <= min <= key < end) */
2017 offset = octets * (key - start);
2018 if (bits == 8)
2019 s[offset] = (U8)(val & 0xff);
2020 else if (bits == 16) {
2021 s[offset ] = (U8)((val >> 8) & 0xff);
2022 s[offset + 1] = (U8)( val & 0xff);
2023 }
2024 else if (bits == 32) {
2025 s[offset ] = (U8)((val >> 24) & 0xff);
2026 s[offset + 1] = (U8)((val >> 16) & 0xff);
2027 s[offset + 2] = (U8)((val >> 8) & 0xff);
2028 s[offset + 3] = (U8)( val & 0xff);
2029 }
2030
2031 if (!none || val < none)
2032 ++val;
2033 }
2034 }
711a919c 2035 else { /* bits == 1, then val should be ignored */
35da51f7 2036 UV key;
979f2922
TS
2037 if (min < start)
2038 min = start;
2039 for (key = min; key <= max; key++) {
0bd48802 2040 const STRLEN offset = (STRLEN)(key - start);
979f2922
TS
2041 if (key >= end)
2042 goto go_out_list;
2043 s[offset >> 3] |= 1 << (offset & 7);
2044 }
2045 }
2046 } /* while */
2047 go_out_list:
2048
2049 /* read $swash->{EXTRAS} */
2050 x = (U8*)SvPV(*extssvp, xcur);
2051 xend = x + xcur;
2052 while (x < xend) {
2053 STRLEN namelen;
2054 U8 *namestr;
2055 SV** othersvp;
2056 HV* otherhv;
2057 STRLEN otherbits;
2058 SV **otherbitssvp, *other;
711a919c 2059 U8 *s, *o, *nl;
979f2922
TS
2060 STRLEN slen, olen;
2061
35da51f7 2062 const U8 opc = *x++;
979f2922
TS
2063 if (opc == '\n')
2064 continue;
2065
2066 nl = (U8*)memchr(x, '\n', xend - x);
2067
2068 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2069 if (nl) {
2070 x = nl + 1; /* 1 is length of "\n" */
2071 continue;
2072 }
2073 else {
2074 x = xend; /* to EXTRAS' end at which \n is not found */
2075 break;
2076 }
2077 }
2078
2079 namestr = x;
2080 if (nl) {
2081 namelen = nl - namestr;
2082 x = nl + 1;
2083 }
2084 else {
2085 namelen = xend - namestr;
2086 x = xend;
2087 }
2088
2089 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
ef8f7699 2090 otherhv = MUTABLE_HV(SvRV(*othersvp));
017a3ce5 2091 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922
TS
2092 otherbits = (STRLEN)SvUV(*otherbitssvp);
2093 if (bits < otherbits)
660a4616 2094 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
979f2922
TS
2095
2096 /* The "other" swatch must be destroyed after. */
2097 other = swash_get(*othersvp, start, span);
2098 o = (U8*)SvPV(other, olen);
2099
2100 if (!olen)
660a4616 2101 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
979f2922
TS
2102
2103 s = (U8*)SvPV(swatch, slen);
2104 if (bits == 1 && otherbits == 1) {
2105 if (slen != olen)
660a4616 2106 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
979f2922
TS
2107
2108 switch (opc) {
2109 case '+':
2110 while (slen--)
2111 *s++ |= *o++;
2112 break;
2113 case '!':
2114 while (slen--)
2115 *s++ |= ~*o++;
2116 break;
2117 case '-':
2118 while (slen--)
2119 *s++ &= ~*o++;
2120 break;
2121 case '&':
2122 while (slen--)
2123 *s++ &= *o++;
2124 break;
2125 default:
2126 break;
2127 }
2128 }
711a919c 2129 else {
979f2922
TS
2130 STRLEN otheroctets = otherbits >> 3;
2131 STRLEN offset = 0;
35da51f7 2132 U8* const send = s + slen;
979f2922
TS
2133
2134 while (s < send) {
2135 UV otherval = 0;
2136
2137 if (otherbits == 1) {
2138 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2139 ++offset;
2140 }
2141 else {
2142 STRLEN vlen = otheroctets;
2143 otherval = *o++;
2144 while (--vlen) {
2145 otherval <<= 8;
2146 otherval |= *o++;
2147 }
2148 }
2149
711a919c 2150 if (opc == '+' && otherval)
6f207bd3 2151 NOOP; /* replace with otherval */
979f2922
TS
2152 else if (opc == '!' && !otherval)
2153 otherval = 1;
2154 else if (opc == '-' && otherval)
2155 otherval = 0;
2156 else if (opc == '&' && !otherval)
2157 otherval = 0;
2158 else {
711a919c 2159 s += octets; /* no replacement */
979f2922
TS
2160 continue;
2161 }
2162
2163 if (bits == 8)
2164 *s++ = (U8)( otherval & 0xff);
2165 else if (bits == 16) {
2166 *s++ = (U8)((otherval >> 8) & 0xff);
2167 *s++ = (U8)( otherval & 0xff);
2168 }
2169 else if (bits == 32) {
2170 *s++ = (U8)((otherval >> 24) & 0xff);
2171 *s++ = (U8)((otherval >> 16) & 0xff);
2172 *s++ = (U8)((otherval >> 8) & 0xff);
2173 *s++ = (U8)( otherval & 0xff);
2174 }
2175 }
2176 }
2177 sv_free(other); /* through with it! */
2178 } /* while */
2179 return swatch;
2180}
2181
0f830e0b 2182/*
87cea99e 2183=for apidoc uvchr_to_utf8
0f830e0b
NC
2184
2185Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2186of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2187bytes available. The return value is the pointer to the byte after the
2188end of the new character. In other words,
2189
2190 d = uvchr_to_utf8(d, uv);
2191
2192is the recommended wide native character-aware way of saying
2193
2194 *(d++) = uv;
2195
2196=cut
2197*/
2198
2199/* On ASCII machines this is normally a macro but we want a
2200 real function in case XS code wants it
2201*/
0f830e0b
NC
2202U8 *
2203Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2204{
7918f24d
NC
2205 PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2206
0f830e0b
NC
2207 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2208}
2209
b851fbc1
JH
2210U8 *
2211Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2212{
7918f24d
NC
2213 PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2214
b851fbc1
JH
2215 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2216}
2b9d42f0
NIS
2217
2218/*
87cea99e 2219=for apidoc utf8n_to_uvchr
0f830e0b
NC
2220flags
2221
2222Returns the native character value of the first character in the string
2223C<s>
2224which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2225length, in bytes, of that character.
2226
2227Allows length and flags to be passed to low level routine.
2228
2229=cut
2230*/
2231/* On ASCII machines this is normally a macro but we want
2232 a real function in case XS code wants it
2233*/
0f830e0b
NC
2234UV
2235Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2236U32 flags)
2237{
2238 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
7918f24d
NC
2239
2240 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2241
0f830e0b
NC
2242 return UNI_TO_NATIVE(uv);
2243}
2244
2245/*
87cea99e 2246=for apidoc pv_uni_display
d2cc3551
JH
2247
2248Build to the scalar dsv a displayable version of the string spv,
2249length len, the displayable version being at most pvlim bytes long
2250(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2251
9e55ce06 2252The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 2253isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
2254to display the \\[nrfta\\] as the backslashed versions (like '\n')
2255(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2256UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2257UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2258
d2cc3551
JH
2259The pointer to the PV of the dsv is returned.
2260
2261=cut */
e6b2e755 2262char *
e1ec3a88 2263Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
2264{
2265 int truncated = 0;
e1ec3a88 2266 const char *s, *e;
e6b2e755 2267
7918f24d
NC
2268 PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2269
76f68e9b 2270 sv_setpvs(dsv, "");
7fddd944 2271 SvUTF8_off(dsv);
e1ec3a88 2272 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 2273 UV u;
a49f32c6
NC
2274 /* This serves double duty as a flag and a character to print after
2275 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2276 */
2277 char ok = 0;
c728cb41 2278
e6b2e755
JH
2279 if (pvlim && SvCUR(dsv) >= pvlim) {
2280 truncated++;
2281 break;
2282 }
2283 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 2284 if (u < 256) {
a3b680e6 2285 const unsigned char c = (unsigned char)u & 0xFF;
0bd48802 2286 if (flags & UNI_DISPLAY_BACKSLASH) {
a49f32c6 2287 switch (c) {
c728cb41 2288 case '\n':
a49f32c6 2289 ok = 'n'; break;
c728cb41 2290 case '\r':
a49f32c6 2291 ok = 'r'; break;
c728cb41 2292 case '\t':
a49f32c6 2293 ok = 't'; break;
c728cb41 2294 case '\f':
a49f32c6 2295 ok = 'f'; break;
c728cb41 2296 case '\a':
a49f32c6 2297 ok = 'a'; break;
c728cb41 2298 case '\\':
a49f32c6 2299 ok = '\\'; break;
c728cb41
JH
2300 default: break;
2301 }
a49f32c6 2302 if (ok) {
88c9ea1e 2303 const char string = ok;
76f68e9b 2304 sv_catpvs(dsv, "\\");
5e7aa789 2305 sv_catpvn(dsv, &string, 1);
a49f32c6 2306 }
c728cb41 2307 }
00e86452 2308 /* isPRINT() is the locale-blind version. */
a49f32c6 2309 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
88c9ea1e 2310 const char string = c;
5e7aa789 2311 sv_catpvn(dsv, &string, 1);
a49f32c6 2312 ok = 1;
0a2ef054 2313 }
c728cb41
JH
2314 }
2315 if (!ok)
9e55ce06 2316 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
2317 }
2318 if (truncated)
396482e1 2319 sv_catpvs(dsv, "...");
e6b2e755
JH
2320
2321 return SvPVX(dsv);
2322}
2b9d42f0 2323
d2cc3551 2324/*
87cea99e 2325=for apidoc sv_uni_display
d2cc3551
JH
2326
2327Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 2328the displayable version being at most pvlim bytes long
d2cc3551 2329(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
2330
2331The flags argument is as in pv_uni_display().
2332
d2cc3551
JH
2333The pointer to the PV of the dsv is returned.
2334
d4c19fe8
AL
2335=cut
2336*/
e6b2e755
JH
2337char *
2338Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2339{
7918f24d
NC
2340 PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2341
cfd0369c
NC
2342 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2343 SvCUR(ssv), pvlim, flags);
701a277b
JH
2344}
2345
d2cc3551 2346/*
87cea99e 2347=for apidoc ibcmp_utf8
d2cc3551
JH
2348
2349Return true if the strings s1 and s2 differ case-insensitively, false
2350if not (if they are equal case-insensitively). If u1 is true, the
2351string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
2352the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2353are false, the respective string is assumed to be in native 8-bit
2354encoding.
2355
2356If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2357in there (they will point at the beginning of the I<next> character).
2358If the pointers behind pe1 or pe2 are non-NULL, they are the end
2359pointers beyond which scanning will not continue under any
4cdaeff7 2360circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
2361s2+l2 will be used as goal end pointers that will also stop the scan,
2362and which qualify towards defining a successful match: all the scans
2363that define an explicit length must reach their goal pointers for
2364a match to succeed).
d2cc3551
JH
2365
2366For case-insensitiveness, the "casefolding" of Unicode is used
2367instead of upper/lowercasing both the characters, see
2368http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2369
2370=cut */
701a277b 2371I32
d07ddd77 2372Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2373{
97aff369 2374 dVAR;
e1ec3a88
AL
2375 register const U8 *p1 = (const U8*)s1;
2376 register const U8 *p2 = (const U8*)s2;
cbbf8932 2377 register const U8 *f1 = NULL;
2f73348c 2378 register const U8 *f2 = NULL;
cbbf8932
AL
2379 register U8 *e1 = NULL;
2380 register U8 *q1 = NULL;
2381 register U8 *e2 = NULL;
2382 register U8 *q2 = NULL;
d07ddd77 2383 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
2384 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2385 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2386 U8 natbuf[1+1];
2387 STRLEN foldlen1, foldlen2;
d07ddd77 2388 bool match;
7918f24d
NC
2389
2390 PERL_ARGS_ASSERT_IBCMP_UTF8;
332ddc25 2391
d07ddd77
JH
2392 if (pe1)
2393 e1 = *(U8**)pe1;
a0a388a1 2394 /* assert(e1 || l1); */
e1ec3a88
AL
2395 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2396 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2397 if (pe2)
2398 e2 = *(U8**)pe2;
a0a388a1 2399 /* assert(e2 || l2); */
e1ec3a88
AL
2400 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2401 f2 = (const U8*)s2 + l2;
d07ddd77 2402
a0a388a1
YO
2403 /* This shouldn't happen. However, putting an assert() there makes some
2404 * tests fail. */
2405 /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
d07ddd77
JH
2406 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2407 return 1; /* mismatch; possible infinite loop or false positive */
2408
a6872d42
JH
2409 if (!u1 || !u2)
2410 natbuf[1] = 0; /* Need to terminate the buffer. */
2411
d07ddd77
JH
2412 while ((e1 == 0 || p1 < e1) &&
2413 (f1 == 0 || p1 < f1) &&
2414 (e2 == 0 || p2 < e2) &&
2415 (f2 == 0 || p2 < f2)) {
2416 if (n1 == 0) {
d7f013c8
JH
2417 if (u1)
2418 to_utf8_fold(p1, foldbuf1, &foldlen1);
2419 else {
809e8e66 2420 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2421 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2422 }
2423 q1 = foldbuf1;
d07ddd77 2424 n1 = foldlen1;
332ddc25 2425 }
d07ddd77 2426 if (n2 == 0) {
d7f013c8
JH
2427 if (u2)
2428 to_utf8_fold(p2, foldbuf2, &foldlen2);
2429 else {
809e8e66 2430 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2431 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2432 }
2433 q2 = foldbuf2;
d07ddd77 2434 n2 = foldlen2;
332ddc25 2435 }
d07ddd77
JH
2436 while (n1 && n2) {
2437 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2438 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2439 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2440 return 1; /* mismatch */
d07ddd77 2441 n1 -= UTF8SKIP(q1);
d7f013c8 2442 q1 += UTF8SKIP(q1);
d07ddd77 2443 n2 -= UTF8SKIP(q2);
d7f013c8 2444 q2 += UTF8SKIP(q2);
701a277b 2445 }
d07ddd77 2446 if (n1 == 0)
d7f013c8 2447 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2448 if (n2 == 0)
d7f013c8
JH
2449 p2 += u2 ? UTF8SKIP(p2) : 1;
2450
d2cc3551 2451 }
5469e704 2452
d07ddd77
JH
2453 /* A match is defined by all the scans that specified
2454 * an explicit length reaching their final goals. */
2455 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2456
2457 if (match) {
d07ddd77
JH
2458 if (pe1)
2459 *pe1 = (char*)p1;
2460 if (pe2)
2461 *pe2 = (char*)p2;
5469e704
JH
2462 }
2463
2464 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2465}
701a277b 2466
a49f32c6
NC
2467/*
2468 * Local variables:
2469 * c-indentation-style: bsd
2470 * c-basic-offset: 4
2471 * indent-tabs-mode: t
2472 * End:
2473 *
37442d52
RGS
2474 * ex: set ts=8 sts=4 sw=4 noet:
2475 */