This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Docs: Better[?] development of SUPER
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550
TC
20 * as is the custom in the West, if you wish to be answered?'
21 * --Gandalf, addressing Théoden's door wardens
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3
LW
33#include "perl.h"
34
a0c21aa1
JH
35#ifndef EBCDIC
36/* Separate prototypes needed because in ASCII systems these
37 * usually macros but they still are compiled as code, too. */
38PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
39PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
40#endif
41
27da23d5
JH
42static const char unees[] =
43 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 44
ccfc67b7
JH
45/*
46=head1 Unicode Support
a0ed51b3 47
166f8a29
DM
48This file contains various utility functions for manipulating UTF8-encoded
49strings. For the uninitiated, this is a method of representing arbitrary
61296642 50Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
51characters in the ASCII range are unmodified, and a zero byte never appears
52within non-zero characters.
166f8a29 53
87cea99e 54=for apidoc uvuni_to_utf8_flags
eebe1485 55
1e54db1a 56Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 57of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 58bytes available. The return value is the pointer to the byte after the
9041c2e3 59end of the new character. In other words,
eebe1485 60
b851fbc1
JH
61 d = uvuni_to_utf8_flags(d, uv, flags);
62
63or, in most cases,
64
9041c2e3 65 d = uvuni_to_utf8(d, uv);
eebe1485 66
b851fbc1
JH
67(which is equivalent to)
68
69 d = uvuni_to_utf8_flags(d, uv, 0);
70
eebe1485
SC
71is the recommended Unicode-aware way of saying
72
73 *(d++) = uv;
74
75=cut
76*/
77
dfe13c55 78U8 *
b851fbc1 79Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 80{
7918f24d
NC
81 PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
82
62961d2e 83 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
84 if (UNICODE_IS_SURROGATE(uv) &&
85 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 86 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
87 else if (
88 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
89 !(flags & UNICODE_ALLOW_FDD0))
90 ||
c867b360 91 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
92 !(flags & UNICODE_ALLOW_FFFF))) &&
93 /* UNICODE_ALLOW_SUPER includes
2a20b9da 94 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
95 ((uv <= PERL_UNICODE_MAX) ||
96 !(flags & UNICODE_ALLOW_SUPER))
97 )
9014280d 98 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
99 "Unicode character 0x%04"UVxf" is illegal", uv);
100 }
c4d5f83a 101 if (UNI_IS_INVARIANT(uv)) {
eb160463 102 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
103 return d;
104 }
2d331972 105#if defined(EBCDIC)
1d72bdf6
NIS
106 else {
107 STRLEN len = UNISKIP(uv);
108 U8 *p = d+len-1;
109 while (p > d) {
eb160463 110 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
111 uv >>= UTF_ACCUMULATION_SHIFT;
112 }
eb160463 113 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
114 return d+len;
115 }
116#else /* Non loop style */
a0ed51b3 117 if (uv < 0x800) {
eb160463
GS
118 *d++ = (U8)(( uv >> 6) | 0xc0);
119 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
120 return d;
121 }
122 if (uv < 0x10000) {
eb160463
GS
123 *d++ = (U8)(( uv >> 12) | 0xe0);
124 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
125 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
126 return d;
127 }
128 if (uv < 0x200000) {
eb160463
GS
129 *d++ = (U8)(( uv >> 18) | 0xf0);
130 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
132 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
133 return d;
134 }
135 if (uv < 0x4000000) {
eb160463
GS
136 *d++ = (U8)(( uv >> 24) | 0xf8);
137 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
138 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
139 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
140 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
141 return d;
142 }
143 if (uv < 0x80000000) {
eb160463
GS
144 *d++ = (U8)(( uv >> 30) | 0xfc);
145 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
146 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
147 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
148 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
149 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
150 return d;
151 }
6b8eaf93 152#ifdef HAS_QUAD
d7578b48 153 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
154#endif
155 {
eb160463
GS
156 *d++ = 0xfe; /* Can't match U+FEFF! */
157 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
162 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
163 return d;
164 }
6b8eaf93 165#ifdef HAS_QUAD
a0ed51b3 166 {
eb160463
GS
167 *d++ = 0xff; /* Can't match U+FFFE! */
168 *d++ = 0x80; /* 6 Reserved bits */
169 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
170 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
171 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
172 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
173 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
174 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
175 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
176 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
177 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
178 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
179 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
180 return d;
181 }
182#endif
1d72bdf6 183#endif /* Loop style */
a0ed51b3 184}
9041c2e3 185
646ca15d
JH
186/*
187
188Tests if some arbitrary number of bytes begins in a valid UTF-8
189character. Note that an INVARIANT (i.e. ASCII) character is a valid
190UTF-8 character. The actual number of bytes in the UTF-8 character
191will be returned if it is valid, otherwise 0.
192
193This is the "slow" version as opposed to the "fast" version which is
194the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
195difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
196or less you should use the IS_UTF8_CHAR(), for lengths of five or more
197you should use the _slow(). In practice this means that the _slow()
198will be used very rarely, since the maximum Unicode code point (as of
199Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
200the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
201five bytes or more.
202
203=cut */
c053b435 204STATIC STRLEN
5f66b61c 205S_is_utf8_char_slow(const U8 *s, const STRLEN len)
646ca15d
JH
206{
207 U8 u = *s;
208 STRLEN slen;
209 UV uv, ouv;
210
7918f24d
NC
211 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
212
646ca15d
JH
213 if (UTF8_IS_INVARIANT(u))
214 return 1;
215
216 if (!UTF8_IS_START(u))
217 return 0;
218
219 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
220 return 0;
221
222 slen = len - 1;
223 s++;
77263263
TS
224#ifdef EBCDIC
225 u = NATIVE_TO_UTF(u);
226#endif
646ca15d
JH
227 u &= UTF_START_MASK(len);
228 uv = u;
229 ouv = uv;
230 while (slen--) {
231 if (!UTF8_IS_CONTINUATION(*s))
232 return 0;
233 uv = UTF8_ACCUMULATE(uv, *s);
234 if (uv < ouv)
235 return 0;
236 ouv = uv;
237 s++;
238 }
239
240 if ((STRLEN)UNISKIP(uv) < len)
241 return 0;
242
243 return len;
244}
9041c2e3
NIS
245
246/*
87cea99e 247=for apidoc is_utf8_char
eebe1485 248
5da9da9e 249Tests if some arbitrary number of bytes begins in a valid UTF-8
2bbc8d55
SP
250character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
251character is a valid UTF-8 character. The actual number of bytes in the UTF-8
252character will be returned if it is valid, otherwise 0.
9041c2e3 253
82686b01 254=cut */
067a85ef 255STRLEN
7fc63493 256Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 257{
44f8325f 258 const STRLEN len = UTF8SKIP(s);
7918f24d
NC
259
260 PERL_ARGS_ASSERT_IS_UTF8_CHAR;
96a5add6 261 PERL_UNUSED_CONTEXT;
3b0fc154 262#ifdef IS_UTF8_CHAR
768c67ee 263 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
264 return IS_UTF8_CHAR(s, len) ? len : 0;
265#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 266 return is_utf8_char_slow(s, len);
386d01d6
GS
267}
268
6662521e 269/*
87cea99e 270=for apidoc is_utf8_string
6662521e 271
c9ada85f 272Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
273UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
274not mean 'a string that contains code points above 0x7F encoded in UTF-8'
275because a valid ASCII string is a valid UTF-8 string.
6662521e 276
768c67ee
JH
277See also is_utf8_string_loclen() and is_utf8_string_loc().
278
6662521e
GS
279=cut
280*/
281
8e84507e 282bool
7fc63493 283Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 284{
35da51f7 285 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 286 const U8* x = s;
067a85ef 287
7918f24d 288 PERL_ARGS_ASSERT_IS_UTF8_STRING;
96a5add6 289 PERL_UNUSED_CONTEXT;
1aa99e6b 290
6662521e 291 while (x < send) {
a3b680e6 292 STRLEN c;
1acdb0da
JH
293 /* Inline the easy bits of is_utf8_char() here for speed... */
294 if (UTF8_IS_INVARIANT(*x))
295 c = 1;
296 else if (!UTF8_IS_START(*x))
768c67ee 297 goto out;
1acdb0da
JH
298 else {
299 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
300#ifdef IS_UTF8_CHAR
301 c = UTF8SKIP(x);
768c67ee
JH
302 if (IS_UTF8_CHAR_FAST(c)) {
303 if (!IS_UTF8_CHAR(x, c))
3c614e38
AD
304 c = 0;
305 }
306 else
307 c = is_utf8_char_slow(x, c);
646ca15d
JH
308#else
309 c = is_utf8_char(x);
310#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 311 if (!c)
768c67ee 312 goto out;
1acdb0da 313 }
6662521e 314 x += c;
6662521e 315 }
768c67ee
JH
316
317 out:
60006e79
JH
318 if (x != send)
319 return FALSE;
067a85ef
A
320
321 return TRUE;
6662521e
GS
322}
323
67e989fb 324/*
814fafa7
NC
325Implemented as a macro in utf8.h
326
87cea99e 327=for apidoc is_utf8_string_loc
814fafa7
NC
328
329Like is_utf8_string() but stores the location of the failure (in the
330case of "utf8ness failure") or the location s+len (in the case of
331"utf8ness success") in the C<ep>.
332
333See also is_utf8_string_loclen() and is_utf8_string().
334
87cea99e 335=for apidoc is_utf8_string_loclen
81cd54e3 336
e3e4599f 337Like is_utf8_string() but stores the location of the failure (in the
768c67ee
JH
338case of "utf8ness failure") or the location s+len (in the case of
339"utf8ness success") in the C<ep>, and the number of UTF-8
340encoded characters in the C<el>.
341
342See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
343
344=cut
345*/
346
347bool
768c67ee 348Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 349{
35da51f7 350 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 351 const U8* x = s;
81cd54e3 352 STRLEN c;
3ebfea28 353 STRLEN outlen = 0;
7918f24d
NC
354
355 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
96a5add6 356 PERL_UNUSED_CONTEXT;
81cd54e3 357
81cd54e3
JH
358 while (x < send) {
359 /* Inline the easy bits of is_utf8_char() here for speed... */
360 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
361 c = 1;
362 else if (!UTF8_IS_START(*x))
363 goto out;
81cd54e3 364 else {
768c67ee
JH
365 /* ... and call is_utf8_char() only if really needed. */
366#ifdef IS_UTF8_CHAR
367 c = UTF8SKIP(x);
368 if (IS_UTF8_CHAR_FAST(c)) {
369 if (!IS_UTF8_CHAR(x, c))
370 c = 0;
371 } else
372 c = is_utf8_char_slow(x, c);
373#else
374 c = is_utf8_char(x);
375#endif /* #ifdef IS_UTF8_CHAR */
376 if (!c)
377 goto out;
81cd54e3 378 }
768c67ee 379 x += c;
3ebfea28 380 outlen++;
81cd54e3 381 }
768c67ee
JH
382
383 out:
3ebfea28
AL
384 if (el)
385 *el = outlen;
386
768c67ee
JH
387 if (ep)
388 *ep = x;
3ebfea28 389 return (x == send);
81cd54e3
JH
390}
391
392/*
768c67ee 393
87cea99e 394=for apidoc utf8n_to_uvuni
67e989fb 395
9041c2e3 396Bottom level UTF-8 decode routine.
38a44b82 397Returns the Unicode code point value of the first character in the string C<s>
1e54db1a 398which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 399C<retlen> will be set to the length, in bytes, of that character.
67e989fb 400
1e54db1a 401If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
402is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
403it is assumed that the caller will raise a warning, and this function
28d3d195
JH
404will silently just set C<retlen> to C<-1> and return zero. If the
405C<flags> does not contain UTF8_CHECK_ONLY, warnings about
406malformations will be given, C<retlen> will be set to the expected
407length of the UTF-8 character in bytes, and zero will be returned.
408
409The C<flags> can also contain various flags to allow deviations from
410the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 411
9041c2e3
NIS
412Most code should use utf8_to_uvchr() rather than call this directly.
413
37607a96
PK
414=cut
415*/
67e989fb 416
a0ed51b3 417UV
7fc63493 418Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 419{
97aff369 420 dVAR;
d4c19fe8 421 const U8 * const s0 = s;
9c5ffd7c 422 UV uv = *s, ouv = 0;
ba210ebe 423 STRLEN len = 1;
7fc63493
AL
424 const bool dowarn = ckWARN_d(WARN_UTF8);
425 const UV startbyte = *s;
ba210ebe 426 STRLEN expectlen = 0;
a0dbb045
JH
427 U32 warning = 0;
428
7918f24d
NC
429 PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
430
a0dbb045
JH
431/* This list is a superset of the UTF8_ALLOW_XXX. */
432
433#define UTF8_WARN_EMPTY 1
434#define UTF8_WARN_CONTINUATION 2
435#define UTF8_WARN_NON_CONTINUATION 3
436#define UTF8_WARN_FE_FF 4
437#define UTF8_WARN_SHORT 5
438#define UTF8_WARN_OVERFLOW 6
439#define UTF8_WARN_SURROGATE 7
c867b360
JH
440#define UTF8_WARN_LONG 8
441#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
442
443 if (curlen == 0 &&
444 !(flags & UTF8_ALLOW_EMPTY)) {
445 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
446 goto malformed;
447 }
448
1d72bdf6 449 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
450 if (retlen)
451 *retlen = 1;
c4d5f83a 452 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 453 }
67e989fb 454
421a8bf2 455 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 456 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 457 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
458 goto malformed;
459 }
460
421a8bf2 461 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 462 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 463 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
464 goto malformed;
465 }
9041c2e3 466
1d72bdf6 467#ifdef EBCDIC
75383841 468 uv = NATIVE_TO_UTF(uv);
1d72bdf6 469#else
fcc8fcf6
JH
470 if ((uv == 0xfe || uv == 0xff) &&
471 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 472 warning = UTF8_WARN_FE_FF;
ba210ebe 473 goto malformed;
a0ed51b3 474 }
1d72bdf6
NIS
475#endif
476
ba210ebe
JH
477 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
478 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
479 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
480 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
481#ifdef EBCDIC
482 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
483 else { len = 7; uv &= 0x01; }
484#else
ba210ebe
JH
485 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
486 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
487 else { len = 13; uv = 0; } /* whoa! */
488#endif
489
a0ed51b3
LW
490 if (retlen)
491 *retlen = len;
9041c2e3 492
ba210ebe
JH
493 expectlen = len;
494
fcc8fcf6
JH
495 if ((curlen < expectlen) &&
496 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 497 warning = UTF8_WARN_SHORT;
ba210ebe
JH
498 goto malformed;
499 }
500
501 len--;
a0ed51b3 502 s++;
ba210ebe
JH
503 ouv = uv;
504
a0ed51b3 505 while (len--) {
421a8bf2
JH
506 if (!UTF8_IS_CONTINUATION(*s) &&
507 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
508 s--;
509 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 510 goto malformed;
a0ed51b3
LW
511 }
512 else
8850bf83 513 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
514 if (!(uv > ouv)) {
515 /* These cannot be allowed. */
516 if (uv == ouv) {
75dbc644 517 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
518 warning = UTF8_WARN_LONG;
519 goto malformed;
520 }
521 }
522 else { /* uv < ouv */
523 /* This cannot be allowed. */
524 warning = UTF8_WARN_OVERFLOW;
525 goto malformed;
526 }
ba210ebe
JH
527 }
528 s++;
529 ouv = uv;
530 }
531
421a8bf2 532 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 533 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 534 warning = UTF8_WARN_SURROGATE;
ba210ebe 535 goto malformed;
eb160463 536 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 537 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 538 warning = UTF8_WARN_LONG;
ba210ebe 539 goto malformed;
421a8bf2 540 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 541 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 542 warning = UTF8_WARN_FFFF;
a9917092 543 goto malformed;
a0ed51b3 544 }
ba210ebe 545
a0ed51b3 546 return uv;
ba210ebe
JH
547
548malformed:
549
fcc8fcf6 550 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 551 if (retlen)
10edeb5d 552 *retlen = ((STRLEN) -1);
ba210ebe
JH
553 return 0;
554 }
555
a0dbb045 556 if (dowarn) {
84bafc02 557 SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
a0dbb045
JH
558
559 switch (warning) {
560 case 0: /* Intentionally empty. */ break;
561 case UTF8_WARN_EMPTY:
396482e1 562 sv_catpvs(sv, "(empty string)");
a0dbb045
JH
563 break;
564 case UTF8_WARN_CONTINUATION:
097fb8e2 565 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
566 break;
567 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
568 if (s == s0)
569 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
570 (UV)s[1], startbyte);
551405c4
AL
571 else {
572 const int len = (int)(s-s0);
097fb8e2 573 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4
AL
574 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
575 }
576
a0dbb045
JH
577 break;
578 case UTF8_WARN_FE_FF:
579 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
580 break;
581 case UTF8_WARN_SHORT:
097fb8e2 582 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 583 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 584 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
585 break;
586 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
587 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
588 ouv, *s, startbyte);
a0dbb045
JH
589 break;
590 case UTF8_WARN_SURROGATE:
591 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
592 break;
a0dbb045 593 case UTF8_WARN_LONG:
097fb8e2 594 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 595 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
596 break;
597 case UTF8_WARN_FFFF:
598 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
599 break;
600 default:
396482e1 601 sv_catpvs(sv, "(unknown reason)");
a0dbb045
JH
602 break;
603 }
604
605 if (warning) {
44f8325f 606 const char * const s = SvPVX_const(sv);
a0dbb045
JH
607
608 if (PL_op)
9014280d 609 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 610 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 611 else
9014280d 612 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
613 }
614 }
615
ba210ebe 616 if (retlen)
28d3d195 617 *retlen = expectlen ? expectlen : len;
ba210ebe 618
28d3d195 619 return 0;
a0ed51b3
LW
620}
621
8e84507e 622/*
87cea99e 623=for apidoc utf8_to_uvchr
9041c2e3
NIS
624
625Returns the native character value of the first character in the string C<s>
1e54db1a 626which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
627length, in bytes, of that character.
628
1e54db1a 629If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
630returned and retlen is set, if possible, to -1.
631
632=cut
633*/
634
635UV
7fc63493 636Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 637{
7918f24d
NC
638 PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
639
1754c1a1
NC
640 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
641 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
642}
643
644/*
87cea99e 645=for apidoc utf8_to_uvuni
9041c2e3
NIS
646
647Returns the Unicode code point of the first character in the string C<s>
1e54db1a 648which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
649length, in bytes, of that character.
650
2bbc8d55 651This function should only be used when the returned UV is considered
9041c2e3
NIS
652an index into the Unicode semantic tables (e.g. swashes).
653
1e54db1a 654If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 655returned and retlen is set, if possible, to -1.
8e84507e
NIS
656
657=cut
658*/
659
660UV
7fc63493 661Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 662{
7918f24d
NC
663 PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
664
9041c2e3 665 /* Call the low level routine asking for checks */
89ebb4a3 666 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 667 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
668}
669
b76347f2 670/*
87cea99e 671=for apidoc utf8_length
b76347f2
JH
672
673Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
674Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
675up past C<e>, croaks.
b76347f2
JH
676
677=cut
678*/
679
680STRLEN
35a4481c 681Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 682{
97aff369 683 dVAR;
b76347f2 684 STRLEN len = 0;
10edeb5d 685 U8 t = 0;
b76347f2 686
7918f24d
NC
687 PERL_ARGS_ASSERT_UTF8_LENGTH;
688
8850bf83
JH
689 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
690 * the bitops (especially ~) can create illegal UTF-8.
691 * In other words: in Perl UTF-8 is not just for Unicode. */
692
a3b680e6
AL
693 if (e < s)
694 goto warn_and_return;
b76347f2 695 while (s < e) {
10edeb5d 696 t = UTF8SKIP(s);
901b21bf 697 if (e - s < t) {
a3b680e6 698 warn_and_return:
901b21bf
JH
699 if (ckWARN_d(WARN_UTF8)) {
700 if (PL_op)
701 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 702 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
703 else
704 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
705 }
706 return len;
707 }
b76347f2
JH
708 s += t;
709 len++;
710 }
711
712 return len;
713}
714
b06226ff 715/*
87cea99e 716=for apidoc utf8_distance
b06226ff 717
1e54db1a 718Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
719and C<b>.
720
721WARNING: use only if you *know* that the pointers point inside the
722same UTF-8 buffer.
723
37607a96
PK
724=cut
725*/
a0ed51b3 726
02eb7b47 727IV
35a4481c 728Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 729{
7918f24d
NC
730 PERL_ARGS_ASSERT_UTF8_DISTANCE;
731
bf1665bc 732 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3
LW
733}
734
b06226ff 735/*
87cea99e 736=for apidoc utf8_hop
b06226ff 737
8850bf83
JH
738Return the UTF-8 pointer C<s> displaced by C<off> characters, either
739forward or backward.
b06226ff
JH
740
741WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
742the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
743on the first byte of character or just after the last byte of a character.
b06226ff 744
37607a96
PK
745=cut
746*/
a0ed51b3
LW
747
748U8 *
4373e329 749Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 750{
7918f24d
NC
751 PERL_ARGS_ASSERT_UTF8_HOP;
752
96a5add6 753 PERL_UNUSED_CONTEXT;
8850bf83
JH
754 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
755 * the bitops (especially ~) can create illegal UTF-8.
756 * In other words: in Perl UTF-8 is not just for Unicode. */
757
a0ed51b3
LW
758 if (off >= 0) {
759 while (off--)
760 s += UTF8SKIP(s);
761 }
762 else {
763 while (off++) {
764 s--;
8850bf83
JH
765 while (UTF8_IS_CONTINUATION(*s))
766 s--;
a0ed51b3
LW
767 }
768 }
4373e329 769 return (U8 *)s;
a0ed51b3
LW
770}
771
6940069f 772/*
87cea99e 773=for apidoc utf8_to_bytes
6940069f 774
2bbc8d55 775Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
246fae53
MG
776Unlike C<bytes_to_utf8>, this over-writes the original string, and
777updates len to contain the new length.
67e989fb 778Returns zero on failure, setting C<len> to -1.
6940069f 779
95be277c
NC
780If you need a copy of the string, see C<bytes_from_utf8>.
781
6940069f
GS
782=cut
783*/
784
785U8 *
37607a96 786Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 787{
d4c19fe8
AL
788 U8 * const save = s;
789 U8 * const send = s + *len;
6940069f 790 U8 *d;
246fae53 791
7918f24d
NC
792 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
793
1e54db1a 794 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 795 while (s < send) {
dcad2880
JH
796 U8 c = *s++;
797
1d72bdf6
NIS
798 if (!UTF8_IS_INVARIANT(c) &&
799 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
800 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
10edeb5d 801 *len = ((STRLEN) -1);
dcad2880
JH
802 return 0;
803 }
246fae53 804 }
dcad2880
JH
805
806 d = s = save;
6940069f 807 while (s < send) {
ed646e6e 808 STRLEN ulen;
9041c2e3 809 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 810 s += ulen;
6940069f
GS
811 }
812 *d = '\0';
246fae53 813 *len = d - save;
6940069f
GS
814 return save;
815}
816
817/*
87cea99e 818=for apidoc bytes_from_utf8
f9a63242 819
2bbc8d55 820Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
35a4481c 821Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
822the newly-created string, and updates C<len> to contain the new
823length. Returns the original string if no conversion occurs, C<len>
824is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55
SP
8250 if C<s> is converted or consisted entirely of characters that are invariant
826in utf8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 827
37607a96
PK
828=cut
829*/
f9a63242
JH
830
831U8 *
e1ec3a88 832Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 833{
f9a63242 834 U8 *d;
e1ec3a88
AL
835 const U8 *start = s;
836 const U8 *send;
f9a63242
JH
837 I32 count = 0;
838
7918f24d
NC
839 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
840
96a5add6 841 PERL_UNUSED_CONTEXT;
f9a63242 842 if (!*is_utf8)
73d840c0 843 return (U8 *)start;
f9a63242 844
1e54db1a 845 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 846 for (send = s + *len; s < send;) {
e1ec3a88 847 U8 c = *s++;
1d72bdf6 848 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
849 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
850 (c = *s++) && UTF8_IS_CONTINUATION(c))
851 count++;
852 else
73d840c0 853 return (U8 *)start;
db42d148 854 }
f9a63242
JH
855 }
856
35da51f7 857 *is_utf8 = FALSE;
f9a63242 858
212542aa 859 Newx(d, (*len) - count + 1, U8);
ef9edfd0 860 s = start; start = d;
f9a63242
JH
861 while (s < send) {
862 U8 c = *s++;
c4d5f83a
NIS
863 if (!UTF8_IS_INVARIANT(c)) {
864 /* Then it is two-byte encoded */
865 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
866 c = ASCII_TO_NATIVE(c);
867 }
868 *d++ = c;
f9a63242
JH
869 }
870 *d = '\0';
871 *len = d - start;
73d840c0 872 return (U8 *)start;
f9a63242
JH
873}
874
875/*
87cea99e 876=for apidoc bytes_to_utf8
6940069f 877
2bbc8d55 878Converts a string C<s> of length C<len> from the native encoding into UTF-8.
6662521e
GS
879Returns a pointer to the newly-created string, and sets C<len> to
880reflect the new length.
6940069f 881
2bbc8d55
SP
882A NUL character will be written after the end of the string.
883
884If you want to convert to UTF-8 from encodings other than
885the native (Latin1 or EBCDIC),
c9ada85f
JH
886see sv_recode_to_utf8().
887
497711e7 888=cut
6940069f
GS
889*/
890
891U8*
35a4481c 892Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 893{
35a4481c 894 const U8 * const send = s + (*len);
6940069f
GS
895 U8 *d;
896 U8 *dst;
7918f24d
NC
897
898 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 899 PERL_UNUSED_CONTEXT;
6940069f 900
212542aa 901 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
902 dst = d;
903
904 while (s < send) {
35a4481c 905 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 906 if (UNI_IS_INVARIANT(uv))
eb160463 907 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 908 else {
eb160463
GS
909 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
910 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
911 }
912 }
913 *d = '\0';
6662521e 914 *len = d-dst;
6940069f
GS
915 return dst;
916}
917
a0ed51b3 918/*
dea0fc0b 919 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
920 *
921 * Destination must be pre-extended to 3/2 source. Do not use in-place.
922 * We optimize for native, for obvious reasons. */
923
924U8*
dea0fc0b 925Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 926{
dea0fc0b
JH
927 U8* pend;
928 U8* dstart = d;
929
7918f24d
NC
930 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
931
1de9afcd
RGS
932 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
933 d[0] = 0;
934 *newlen = 1;
935 return d;
936 }
937
dea0fc0b 938 if (bytelen & 1)
f5992bc4 939 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
940
941 pend = p + bytelen;
942
a0ed51b3 943 while (p < pend) {
dea0fc0b
JH
944 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
945 p += 2;
a0ed51b3 946 if (uv < 0x80) {
e294cc5d
JH
947#ifdef EBCDIC
948 *d++ = UNI_TO_NATIVE(uv);
949#else
eb160463 950 *d++ = (U8)uv;
e294cc5d 951#endif
a0ed51b3
LW
952 continue;
953 }
954 if (uv < 0x800) {
eb160463
GS
955 *d++ = (U8)(( uv >> 6) | 0xc0);
956 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
957 continue;
958 }
959 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
TD
960 UV low = (p[0] << 8) + p[1];
961 p += 2;
dea0fc0b
JH
962 if (low < 0xdc00 || low >= 0xdfff)
963 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
964 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
965 }
966 if (uv < 0x10000) {
eb160463
GS
967 *d++ = (U8)(( uv >> 12) | 0xe0);
968 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
969 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
970 continue;
971 }
972 else {
eb160463
GS
973 *d++ = (U8)(( uv >> 18) | 0xf0);
974 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
975 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
976 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
977 continue;
978 }
979 }
dea0fc0b 980 *newlen = d - dstart;
a0ed51b3
LW
981 return d;
982}
983
984/* Note: this one is slightly destructive of the source. */
985
986U8*
dea0fc0b 987Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
988{
989 U8* s = (U8*)p;
d4c19fe8 990 U8* const send = s + bytelen;
7918f24d
NC
991
992 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
993
a0ed51b3 994 while (s < send) {
d4c19fe8 995 const U8 tmp = s[0];
a0ed51b3
LW
996 s[0] = s[1];
997 s[1] = tmp;
998 s += 2;
999 }
dea0fc0b 1000 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
1001}
1002
1003/* for now these are all defined (inefficiently) in terms of the utf8 versions */
1004
1005bool
84afefe6 1006Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 1007{
89ebb4a3 1008 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1009 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1010 return is_utf8_alnum(tmpbuf);
1011}
1012
1013bool
84afefe6 1014Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 1015{
89ebb4a3 1016 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1017 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1018 return is_utf8_alnumc(tmpbuf);
1019}
1020
1021bool
84afefe6 1022Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1023{
89ebb4a3 1024 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1025 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1026 return is_utf8_idfirst(tmpbuf);
1027}
1028
1029bool
84afefe6 1030Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1031{
89ebb4a3 1032 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1033 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1034 return is_utf8_alpha(tmpbuf);
1035}
1036
1037bool
84afefe6 1038Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1039{
89ebb4a3 1040 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1041 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1042 return is_utf8_ascii(tmpbuf);
1043}
1044
1045bool
84afefe6 1046Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1047{
89ebb4a3 1048 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1049 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1050 return is_utf8_space(tmpbuf);
1051}
1052
1053bool
84afefe6 1054Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1055{
89ebb4a3 1056 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1057 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1058 return is_utf8_digit(tmpbuf);
1059}
1060
1061bool
84afefe6 1062Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1063{
89ebb4a3 1064 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1065 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1066 return is_utf8_upper(tmpbuf);
1067}
1068
1069bool
84afefe6 1070Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1071{
89ebb4a3 1072 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1073 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1074 return is_utf8_lower(tmpbuf);
1075}
1076
1077bool
84afefe6 1078Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1079{
89ebb4a3 1080 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1081 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1082 return is_utf8_cntrl(tmpbuf);
1083}
1084
1085bool
84afefe6 1086Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1087{
89ebb4a3 1088 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1089 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1090 return is_utf8_graph(tmpbuf);
1091}
1092
1093bool
84afefe6 1094Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1095{
89ebb4a3 1096 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1097 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1098 return is_utf8_print(tmpbuf);
1099}
1100
b8c5462f 1101bool
84afefe6 1102Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1103{
89ebb4a3 1104 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1105 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1106 return is_utf8_punct(tmpbuf);
1107}
1108
4d61ec05 1109bool
84afefe6 1110Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1111{
89ebb4a3 1112 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1113 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1114 return is_utf8_xdigit(tmpbuf);
1115}
1116
84afefe6
JH
1117UV
1118Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1119{
7918f24d
NC
1120 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1121
0ebc6274
JH
1122 uvchr_to_utf8(p, c);
1123 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1124}
1125
84afefe6
JH
1126UV
1127Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1128{
7918f24d
NC
1129 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1130
0ebc6274
JH
1131 uvchr_to_utf8(p, c);
1132 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1133}
1134
84afefe6
JH
1135UV
1136Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1137{
7918f24d
NC
1138 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1139
0ebc6274
JH
1140 uvchr_to_utf8(p, c);
1141 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1142}
1143
84afefe6
JH
1144UV
1145Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1146{
7918f24d
NC
1147 PERL_ARGS_ASSERT_TO_UNI_FOLD;
1148
0ebc6274
JH
1149 uvchr_to_utf8(p, c);
1150 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1151}
1152
a0ed51b3
LW
1153/* for now these all assume no locale info available for Unicode > 255 */
1154
1155bool
84afefe6 1156Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1157{
1158 return is_uni_alnum(c); /* XXX no locale support yet */
1159}
1160
1161bool
84afefe6 1162Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1163{
1164 return is_uni_alnumc(c); /* XXX no locale support yet */
1165}
1166
1167bool
84afefe6 1168Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1169{
1170 return is_uni_idfirst(c); /* XXX no locale support yet */
1171}
1172
1173bool
84afefe6 1174Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1175{
1176 return is_uni_alpha(c); /* XXX no locale support yet */
1177}
1178
1179bool
84afefe6 1180Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1181{
1182 return is_uni_ascii(c); /* XXX no locale support yet */
1183}
1184
1185bool
84afefe6 1186Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1187{
1188 return is_uni_space(c); /* XXX no locale support yet */
1189}
1190
1191bool
84afefe6 1192Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1193{
1194 return is_uni_digit(c); /* XXX no locale support yet */
1195}
1196
1197bool
84afefe6 1198Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1199{
1200 return is_uni_upper(c); /* XXX no locale support yet */
1201}
1202
1203bool
84afefe6 1204Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1205{
1206 return is_uni_lower(c); /* XXX no locale support yet */
1207}
1208
1209bool
84afefe6 1210Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1211{
1212 return is_uni_cntrl(c); /* XXX no locale support yet */
1213}
1214
1215bool
84afefe6 1216Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1217{
1218 return is_uni_graph(c); /* XXX no locale support yet */
1219}
1220
1221bool
84afefe6 1222Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1223{
1224 return is_uni_print(c); /* XXX no locale support yet */
1225}
1226
b8c5462f 1227bool
84afefe6 1228Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1229{
1230 return is_uni_punct(c); /* XXX no locale support yet */
1231}
1232
4d61ec05 1233bool
84afefe6 1234Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1235{
1236 return is_uni_xdigit(c); /* XXX no locale support yet */
1237}
1238
b7ac61fa
JH
1239U32
1240Perl_to_uni_upper_lc(pTHX_ U32 c)
1241{
ee099d14
JH
1242 /* XXX returns only the first character -- do not use XXX */
1243 /* XXX no locale support yet */
1244 STRLEN len;
89ebb4a3 1245 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1246 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1247}
1248
1249U32
1250Perl_to_uni_title_lc(pTHX_ U32 c)
1251{
ee099d14
JH
1252 /* XXX returns only the first character XXX -- do not use XXX */
1253 /* XXX no locale support yet */
1254 STRLEN len;
89ebb4a3 1255 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1256 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1257}
1258
1259U32
1260Perl_to_uni_lower_lc(pTHX_ U32 c)
1261{
ee099d14
JH
1262 /* XXX returns only the first character -- do not use XXX */
1263 /* XXX no locale support yet */
1264 STRLEN len;
89ebb4a3 1265 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1266 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1267}
1268
7452cf6a 1269static bool
5141f98e 1270S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
1271 const char *const swashname)
1272{
97aff369 1273 dVAR;
7918f24d
NC
1274
1275 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1276
bde6a22d
NC
1277 if (!is_utf8_char(p))
1278 return FALSE;
1279 if (!*swash)
711a919c 1280 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
bde6a22d
NC
1281 return swash_fetch(*swash, p, TRUE) != 0;
1282}
1283
1284bool
7fc63493 1285Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1286{
97aff369 1287 dVAR;
7918f24d
NC
1288
1289 PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1290
671c33bf
NC
1291 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1292 * descendant of isalnum(3), in other words, it doesn't
1293 * contain the '_'. --jhi */
d4c19fe8 1294 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
a0ed51b3
LW
1295}
1296
1297bool
7fc63493 1298Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1299{
97aff369 1300 dVAR;
7918f24d
NC
1301
1302 PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
1303
d4c19fe8 1304 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
b8c5462f
JH
1305}
1306
1307bool
7fc63493 1308Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1309{
97aff369 1310 dVAR;
7918f24d
NC
1311
1312 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1313
82686b01
JH
1314 if (*p == '_')
1315 return TRUE;
bde6a22d 1316 /* is_utf8_idstart would be more logical. */
d4c19fe8 1317 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1318}
1319
1320bool
7fc63493 1321Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1322{
97aff369 1323 dVAR;
7918f24d
NC
1324
1325 PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1326
82686b01
JH
1327 if (*p == '_')
1328 return TRUE;
d4c19fe8 1329 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1330}
1331
1332bool
7fc63493 1333Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1334{
97aff369 1335 dVAR;
7918f24d
NC
1336
1337 PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1338
d4c19fe8 1339 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1340}
1341
1342bool
7fc63493 1343Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1344{
97aff369 1345 dVAR;
7918f24d
NC
1346
1347 PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1348
d4c19fe8 1349 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
b8c5462f
JH
1350}
1351
1352bool
7fc63493 1353Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1354{
97aff369 1355 dVAR;
7918f24d
NC
1356
1357 PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1358
d4c19fe8 1359 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3
LW
1360}
1361
1362bool
7fc63493 1363Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1364{
97aff369 1365 dVAR;
7918f24d
NC
1366
1367 PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1368
d4c19fe8 1369 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1370}
1371
1372bool
7fc63493 1373Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1374{
97aff369 1375 dVAR;
7918f24d
NC
1376
1377 PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1378
d4c19fe8 1379 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1380}
1381
1382bool
7fc63493 1383Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1384{
97aff369 1385 dVAR;
7918f24d
NC
1386
1387 PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1388
d4c19fe8 1389 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1390}
1391
1392bool
7fc63493 1393Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1394{
97aff369 1395 dVAR;
7918f24d
NC
1396
1397 PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1398
d4c19fe8 1399 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f
JH
1400}
1401
1402bool
7fc63493 1403Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1404{
97aff369 1405 dVAR;
7918f24d
NC
1406
1407 PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1408
d4c19fe8 1409 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1410}
1411
1412bool
7fc63493 1413Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1414{
97aff369 1415 dVAR;
7918f24d
NC
1416
1417 PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1418
d4c19fe8 1419 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1420}
1421
1422bool
7fc63493 1423Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1424{
97aff369 1425 dVAR;
7918f24d
NC
1426
1427 PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1428
d4c19fe8 1429 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1430}
1431
1432bool
7fc63493 1433Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1434{
97aff369 1435 dVAR;
7918f24d
NC
1436
1437 PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1438
d4c19fe8 1439 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f
JH
1440}
1441
1442bool
7fc63493 1443Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1444{
97aff369 1445 dVAR;
7918f24d
NC
1446
1447 PERL_ARGS_ASSERT_IS_UTF8_MARK;
1448
d4c19fe8 1449 return is_utf8_common(p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1450}
1451
6b5c0936 1452/*
87cea99e 1453=for apidoc to_utf8_case
6b5c0936
JH
1454
1455The "p" contains the pointer to the UTF-8 string encoding
1456the character that is being converted.
1457
1458The "ustrp" is a pointer to the character buffer to put the
1459conversion result to. The "lenp" is a pointer to the length
1460of the result.
1461
0134edef 1462The "swashp" is a pointer to the swash to use.
6b5c0936 1463
0134edef 1464Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
8fe4d5b2 1465and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 1466but not always, a multicharacter mapping), is tried first.
6b5c0936 1467
0134edef
JH
1468The "special" is a string like "utf8::ToSpecLower", which means the
1469hash %utf8::ToSpecLower. The access to the hash is through
1470Perl_to_utf8_case().
6b5c0936 1471
0134edef
JH
1472The "normal" is a string like "ToLower" which means the swash
1473%utf8::ToLower.
1474
1475=cut */
6b5c0936 1476
2104c8d9 1477UV
9a957fbc
AL
1478Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1479 SV **swashp, const char *normal, const char *special)
a0ed51b3 1480{
97aff369 1481 dVAR;
89ebb4a3 1482 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1483 STRLEN len = 0;
aec46f14 1484 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1485 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1486 * are necessary in EBCDIC, they are redundant no-ops
1487 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1488 const UV uv1 = NATIVE_TO_UNI(uv0);
7918f24d
NC
1489
1490 PERL_ARGS_ASSERT_TO_UTF8_CASE;
1491
1feea2c7 1492 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1493
1494 if (!*swashp) /* load on-demand */
1495 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1496
b08cf34e
JH
1497 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1498 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1499 /* It might be "special" (sometimes, but not always,
2a37f04d 1500 * a multicharacter mapping) */
6673a63c 1501 HV * const hv = get_hv(special, 0);
b08cf34e
JH
1502 SV **svp;
1503
35da51f7 1504 if (hv &&
b08cf34e
JH
1505 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1506 (*svp)) {
cfd0369c 1507 const char *s;
47654450 1508
cfd0369c 1509 s = SvPV_const(*svp, len);
47654450
JH
1510 if (len == 1)
1511 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1512 else {
2f9475ad
JH
1513#ifdef EBCDIC
1514 /* If we have EBCDIC we need to remap the characters
1515 * since any characters in the low 256 are Unicode
1516 * code points, not EBCDIC. */
7cda7a3d 1517 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1518
1519 d = tmpbuf;
b08cf34e 1520 if (SvUTF8(*svp)) {
2f9475ad
JH
1521 STRLEN tlen = 0;
1522
1523 while (t < tend) {
d4c19fe8 1524 const UV c = utf8_to_uvchr(t, &tlen);
2f9475ad
JH
1525 if (tlen > 0) {
1526 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1527 t += tlen;
1528 }
1529 else
1530 break;
1531 }
1532 }
1533 else {
36fec512
JH
1534 while (t < tend) {
1535 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1536 t++;
1537 }
2f9475ad
JH
1538 }
1539 len = d - tmpbuf;
1540 Copy(tmpbuf, ustrp, len, U8);
1541#else
d2dcd0fb 1542 Copy(s, ustrp, len, U8);
2f9475ad 1543#endif
29e98929 1544 }
983ffd37 1545 }
0134edef
JH
1546 }
1547
1548 if (!len && *swashp) {
d4c19fe8
AL
1549 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1550
0134edef
JH
1551 if (uv2) {
1552 /* It was "normal" (a single character mapping). */
d4c19fe8 1553 const UV uv3 = UNI_TO_NATIVE(uv2);
e9101d72 1554 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1555 }
1556 }
1feea2c7 1557
0134edef
JH
1558 if (!len) /* Neither: just copy. */
1559 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1560
2a37f04d
JH
1561 if (lenp)
1562 *lenp = len;
1563
0134edef 1564 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1565}
1566
d3e79532 1567/*
87cea99e 1568=for apidoc to_utf8_upper
d3e79532
JH
1569
1570Convert the UTF-8 encoded character at p to its uppercase version and
1571store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1572that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1573the uppercase version may be longer than the original character.
d3e79532
JH
1574
1575The first character of the uppercased version is returned
1576(but note, as explained above, that there may be more.)
1577
1578=cut */
1579
2104c8d9 1580UV
7fc63493 1581Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1582{
97aff369 1583 dVAR;
7918f24d
NC
1584
1585 PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1586
983ffd37 1587 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1588 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1589}
a0ed51b3 1590
d3e79532 1591/*
87cea99e 1592=for apidoc to_utf8_title
d3e79532
JH
1593
1594Convert the UTF-8 encoded character at p to its titlecase version and
1595store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1596that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1597titlecase version may be longer than the original character.
d3e79532
JH
1598
1599The first character of the titlecased version is returned
1600(but note, as explained above, that there may be more.)
1601
1602=cut */
1603
983ffd37 1604UV
7fc63493 1605Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1606{
97aff369 1607 dVAR;
7918f24d
NC
1608
1609 PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1610
983ffd37 1611 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1612 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1613}
1614
d3e79532 1615/*
87cea99e 1616=for apidoc to_utf8_lower
d3e79532
JH
1617
1618Convert the UTF-8 encoded character at p to its lowercase version and
1619store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1620that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1621lowercase version may be longer than the original character.
d3e79532
JH
1622
1623The first character of the lowercased version is returned
1624(but note, as explained above, that there may be more.)
1625
1626=cut */
1627
2104c8d9 1628UV
7fc63493 1629Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1630{
97aff369 1631 dVAR;
7918f24d
NC
1632
1633 PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1634
983ffd37 1635 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1636 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1637}
1638
d3e79532 1639/*
87cea99e 1640=for apidoc to_utf8_fold
d3e79532
JH
1641
1642Convert the UTF-8 encoded character at p to its foldcase version and
1643store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1644that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1645foldcase version may be longer than the original character (up to
1646three characters).
1647
1648The first character of the foldcased version is returned
1649(but note, as explained above, that there may be more.)
1650
1651=cut */
1652
b4e400f9 1653UV
7fc63493 1654Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1655{
97aff369 1656 dVAR;
7918f24d
NC
1657
1658 PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1659
b4e400f9
JH
1660 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1661 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1662}
1663
711a919c
TS
1664/* Note:
1665 * A "swash" is a swatch hash.
1666 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1667 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1668 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1669 */
a0ed51b3 1670SV*
7fc63493 1671Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1672{
27da23d5 1673 dVAR;
a0ed51b3 1674 SV* retval;
8e84507e 1675 dSP;
7fc63493
AL
1676 const size_t pkg_len = strlen(pkg);
1677 const size_t name_len = strlen(name);
da51bb9b 1678 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
f8be5cf0 1679 SV* errsv_save;
ce3b816e 1680
7918f24d
NC
1681 PERL_ARGS_ASSERT_SWASH_INIT;
1682
96ca9f55
DM
1683 PUSHSTACKi(PERLSI_MAGIC);
1684 ENTER;
1685 SAVEI32(PL_hints);
1686 PL_hints = 0;
1687 save_re_context();
1b026014 1688 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1689 ENTER;
f8be5cf0 1690 errsv_save = newSVsv(ERRSV);
dc0c6abb
NC
1691 /* It is assumed that callers of this routine are not passing in any
1692 user derived data. */
1693 /* Need to do this after save_re_context() as it will set PL_tainted to
1694 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1695 Even line to create errsv_save can turn on PL_tainted. */
1696 SAVEBOOL(PL_tainted);
1697 PL_tainted = 0;
71bed85a 1698 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
a0714e2c 1699 NULL);
f8be5cf0
JH
1700 if (!SvTRUE(ERRSV))
1701 sv_setsv(ERRSV, errsv_save);
1702 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1703 LEAVE;
1704 }
1705 SPAGAIN;
a0ed51b3
LW
1706 PUSHMARK(SP);
1707 EXTEND(SP,5);
6e449a3a
MHM
1708 mPUSHp(pkg, pkg_len);
1709 mPUSHp(name, name_len);
a0ed51b3 1710 PUSHs(listsv);
6e449a3a
MHM
1711 mPUSHi(minbits);
1712 mPUSHi(none);
a0ed51b3 1713 PUTBACK;
f8be5cf0 1714 errsv_save = newSVsv(ERRSV);
864dbfa3 1715 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1716 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1717 else
e24b16f9 1718 retval = &PL_sv_undef;
f8be5cf0
JH
1719 if (!SvTRUE(ERRSV))
1720 sv_setsv(ERRSV, errsv_save);
1721 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1722 LEAVE;
1723 POPSTACK;
923e4eb5 1724 if (IN_PERL_COMPILETIME) {
623e6609 1725 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1726 }
bc45ce41
JH
1727 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1728 if (SvPOK(retval))
35c1215d 1729 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
be2597df 1730 SVfARG(retval));
cea2e8a9 1731 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1732 }
a0ed51b3
LW
1733 return retval;
1734}
1735
035d37be
JH
1736
1737/* This API is wrong for special case conversions since we may need to
1738 * return several Unicode characters for a single Unicode character
1739 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1740 * the lower-level routine, and it is similarly broken for returning
1741 * multiple values. --jhi */
979f2922 1742/* Now SWASHGET is recasted into S_swash_get in this file. */
680c470c
TS
1743
1744/* Note:
1745 * Returns the value of property/mapping C<swash> for the first character
1746 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1747 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1748 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1749 */
a0ed51b3 1750UV
680c470c 1751Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 1752{
27da23d5 1753 dVAR;
ef8f7699 1754 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
1755 U32 klen;
1756 U32 off;
a0ed51b3 1757 STRLEN slen;
7d85a32c 1758 STRLEN needents;
cfd0369c 1759 const U8 *tmps = NULL;
a0ed51b3 1760 U32 bit;
979f2922 1761 SV *swatch;
3568d838 1762 U8 tmputf8[2];
35da51f7 1763 const UV c = NATIVE_TO_ASCII(*ptr);
3568d838 1764
7918f24d
NC
1765 PERL_ARGS_ASSERT_SWASH_FETCH;
1766
3568d838 1767 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
979f2922
TS
1768 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1769 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1770 ptr = tmputf8;
3568d838
JH
1771 }
1772 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1773 * then the "swatch" is a vec() for al the chars which start
1774 * with 0xAA..0xYY
1775 * So the key in the hash (klen) is length of encoded char -1
1776 */
1777 klen = UTF8SKIP(ptr) - 1;
1778 off = ptr[klen];
a0ed51b3 1779
979f2922 1780 if (klen == 0) {
7d85a32c 1781 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1782 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1783 */
979f2922
TS
1784 needents = UTF_CONTINUATION_MARK;
1785 off = NATIVE_TO_UTF(ptr[klen]);
1786 }
1787 else {
7d85a32c 1788 /* If char is encoded then swatch is for the prefix */
979f2922
TS
1789 needents = (1 << UTF_ACCUMULATION_SHIFT);
1790 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1791 }
7d85a32c 1792
a0ed51b3
LW
1793 /*
1794 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1795 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1796 * it's nothing to sniff at.) Pity we usually come through at least
1797 * two function calls to get here...
1798 *
1799 * NB: this code assumes that swatches are never modified, once generated!
1800 */
1801
3568d838 1802 if (hv == PL_last_swash_hv &&
a0ed51b3 1803 klen == PL_last_swash_klen &&
27da23d5 1804 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1805 {
1806 tmps = PL_last_swash_tmps;
1807 slen = PL_last_swash_slen;
1808 }
1809 else {
1810 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1811 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1812
979f2922
TS
1813 /* If not cached, generate it via swash_get */
1814 if (!svp || !SvPOK(*svp)
1815 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0
NIS
1816 /* We use utf8n_to_uvuni() as we want an index into
1817 Unicode tables, not a native character number.
1818 */
aec46f14 1819 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1820 ckWARN(WARN_UTF8) ?
1821 0 : UTF8_ALLOW_ANY);
680c470c 1822 swatch = swash_get(swash,
979f2922
TS
1823 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1824 (klen) ? (code_point & ~(needents - 1)) : 0,
1825 needents);
1826
923e4eb5 1827 if (IN_PERL_COMPILETIME)
623e6609 1828 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1829
979f2922 1830 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 1831
979f2922
TS
1832 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1833 || (slen << 3) < needents)
660a4616 1834 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
a0ed51b3
LW
1835 }
1836
1837 PL_last_swash_hv = hv;
16d8f38a 1838 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 1839 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
1840 /* FIXME change interpvar.h? */
1841 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1842 PL_last_swash_slen = slen;
1843 if (klen)
1844 Copy(ptr, PL_last_swash_key, klen, U8);
1845 }
1846
9faf8d75 1847 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1848 case 1:
1849 bit = 1 << (off & 7);
1850 off >>= 3;
1851 return (tmps[off] & bit) != 0;
1852 case 8:
1853 return tmps[off];
1854 case 16:
1855 off <<= 1;
1856 return (tmps[off] << 8) + tmps[off + 1] ;
1857 case 32:
1858 off <<= 2;
1859 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1860 }
660a4616 1861 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
670f1322 1862 NORETURN_FUNCTION_END;
a0ed51b3 1863}
2b9d42f0 1864
979f2922
TS
1865/* Note:
1866 * Returns a swatch (a bit vector string) for a code point sequence
1867 * that starts from the value C<start> and comprises the number C<span>.
1868 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1869 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1870 */
1871STATIC SV*
1872S_swash_get(pTHX_ SV* swash, UV start, UV span)
1873{
1874 SV *swatch;
711a919c 1875 U8 *l, *lend, *x, *xend, *s;
979f2922 1876 STRLEN lcur, xcur, scur;
ef8f7699 1877 HV *const hv = MUTABLE_HV(SvRV(swash));
017a3ce5
GA
1878 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1879 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1880 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1881 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1882 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
0bd48802
AL
1883 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1884 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1885 const STRLEN bits = SvUV(*bitssvp);
1886 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1887 const UV none = SvUV(*nonesvp);
1888 const UV end = start + span;
979f2922 1889
7918f24d
NC
1890 PERL_ARGS_ASSERT_SWASH_GET;
1891
979f2922 1892 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
660a4616
TS
1893 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1894 (UV)bits);
979f2922
TS
1895 }
1896
1897 /* create and initialize $swatch */
979f2922 1898 scur = octets ? (span * octets) : (span + 7) / 8;
e524fe40
NC
1899 swatch = newSV(scur);
1900 SvPOK_on(swatch);
979f2922
TS
1901 s = (U8*)SvPVX(swatch);
1902 if (octets && none) {
0bd48802 1903 const U8* const e = s + scur;
979f2922
TS
1904 while (s < e) {
1905 if (bits == 8)
1906 *s++ = (U8)(none & 0xff);
1907 else if (bits == 16) {
1908 *s++ = (U8)((none >> 8) & 0xff);
1909 *s++ = (U8)( none & 0xff);
1910 }
1911 else if (bits == 32) {
1912 *s++ = (U8)((none >> 24) & 0xff);
1913 *s++ = (U8)((none >> 16) & 0xff);
1914 *s++ = (U8)((none >> 8) & 0xff);
1915 *s++ = (U8)( none & 0xff);
1916 }
1917 }
1918 *s = '\0';
1919 }
1920 else {
1921 (void)memzero((U8*)s, scur + 1);
1922 }
1923 SvCUR_set(swatch, scur);
1924 s = (U8*)SvPVX(swatch);
1925
1926 /* read $swash->{LIST} */
1927 l = (U8*)SvPV(*listsvp, lcur);
1928 lend = l + lcur;
1929 while (l < lend) {
35da51f7 1930 UV min, max, val;
979f2922
TS
1931 STRLEN numlen;
1932 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1933
0bd48802 1934 U8* const nl = (U8*)memchr(l, '\n', lend - l);
979f2922
TS
1935
1936 numlen = lend - l;
1937 min = grok_hex((char *)l, &numlen, &flags, NULL);
1938 if (numlen)
1939 l += numlen;
1940 else if (nl) {
1941 l = nl + 1; /* 1 is length of "\n" */
1942 continue;
1943 }
1944 else {
1945 l = lend; /* to LIST's end at which \n is not found */
1946 break;
1947 }
1948
1949 if (isBLANK(*l)) {
1950 ++l;
1951 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1952 numlen = lend - l;
1953 max = grok_hex((char *)l, &numlen, &flags, NULL);
1954 if (numlen)
1955 l += numlen;
1956 else
1957 max = min;
1958
1959 if (octets) {
1960 if (isBLANK(*l)) {
1961 ++l;
1962 flags = PERL_SCAN_SILENT_ILLDIGIT |
1963 PERL_SCAN_DISALLOW_PREFIX;
1964 numlen = lend - l;
1965 val = grok_hex((char *)l, &numlen, &flags, NULL);
1966 if (numlen)
1967 l += numlen;
1968 else
1969 val = 0;
1970 }
1971 else {
1972 val = 0;
1973 if (typeto) {
1974 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1975 typestr, l);
1976 }
1977 }
1978 }
711a919c
TS
1979 else
1980 val = 0; /* bits == 1, then val should be ignored */
979f2922
TS
1981 }
1982 else {
1983 max = min;
1984 if (octets) {
1985 val = 0;
1986 if (typeto) {
1987 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1988 }
1989 }
711a919c
TS
1990 else
1991 val = 0; /* bits == 1, then val should be ignored */
979f2922
TS
1992 }
1993
1994 if (nl)
1995 l = nl + 1;
1996 else
1997 l = lend;
1998
1999 if (max < start)
2000 continue;
2001
2002 if (octets) {
35da51f7 2003 UV key;
979f2922
TS
2004 if (min < start) {
2005 if (!none || val < none) {
2006 val += start - min;
2007 }
2008 min = start;
2009 }
2010 for (key = min; key <= max; key++) {
2011 STRLEN offset;
2012 if (key >= end)
2013 goto go_out_list;
2014 /* offset must be non-negative (start <= min <= key < end) */
2015 offset = octets * (key - start);
2016 if (bits == 8)
2017 s[offset] = (U8)(val & 0xff);
2018 else if (bits == 16) {
2019 s[offset ] = (U8)((val >> 8) & 0xff);
2020 s[offset + 1] = (U8)( val & 0xff);
2021 }
2022 else if (bits == 32) {
2023 s[offset ] = (U8)((val >> 24) & 0xff);
2024 s[offset + 1] = (U8)((val >> 16) & 0xff);
2025 s[offset + 2] = (U8)((val >> 8) & 0xff);
2026 s[offset + 3] = (U8)( val & 0xff);
2027 }
2028
2029 if (!none || val < none)
2030 ++val;
2031 }
2032 }
711a919c 2033 else { /* bits == 1, then val should be ignored */
35da51f7 2034 UV key;
979f2922
TS
2035 if (min < start)
2036 min = start;
2037 for (key = min; key <= max; key++) {
0bd48802 2038 const STRLEN offset = (STRLEN)(key - start);
979f2922
TS
2039 if (key >= end)
2040 goto go_out_list;
2041 s[offset >> 3] |= 1 << (offset & 7);
2042 }
2043 }
2044 } /* while */
2045 go_out_list:
2046
2047 /* read $swash->{EXTRAS} */
2048 x = (U8*)SvPV(*extssvp, xcur);
2049 xend = x + xcur;
2050 while (x < xend) {
2051 STRLEN namelen;
2052 U8 *namestr;
2053 SV** othersvp;
2054 HV* otherhv;
2055 STRLEN otherbits;
2056 SV **otherbitssvp, *other;
711a919c 2057 U8 *s, *o, *nl;
979f2922
TS
2058 STRLEN slen, olen;
2059
35da51f7 2060 const U8 opc = *x++;
979f2922
TS
2061 if (opc == '\n')
2062 continue;
2063
2064 nl = (U8*)memchr(x, '\n', xend - x);
2065
2066 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2067 if (nl) {
2068 x = nl + 1; /* 1 is length of "\n" */
2069 continue;
2070 }
2071 else {
2072 x = xend; /* to EXTRAS' end at which \n is not found */
2073 break;
2074 }
2075 }
2076
2077 namestr = x;
2078 if (nl) {
2079 namelen = nl - namestr;
2080 x = nl + 1;
2081 }
2082 else {
2083 namelen = xend - namestr;
2084 x = xend;
2085 }
2086
2087 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
ef8f7699 2088 otherhv = MUTABLE_HV(SvRV(*othersvp));
017a3ce5 2089 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922
TS
2090 otherbits = (STRLEN)SvUV(*otherbitssvp);
2091 if (bits < otherbits)
660a4616 2092 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
979f2922
TS
2093
2094 /* The "other" swatch must be destroyed after. */
2095 other = swash_get(*othersvp, start, span);
2096 o = (U8*)SvPV(other, olen);
2097
2098 if (!olen)
660a4616 2099 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
979f2922
TS
2100
2101 s = (U8*)SvPV(swatch, slen);
2102 if (bits == 1 && otherbits == 1) {
2103 if (slen != olen)
660a4616 2104 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
979f2922
TS
2105
2106 switch (opc) {
2107 case '+':
2108 while (slen--)
2109 *s++ |= *o++;
2110 break;
2111 case '!':
2112 while (slen--)
2113 *s++ |= ~*o++;
2114 break;
2115 case '-':
2116 while (slen--)
2117 *s++ &= ~*o++;
2118 break;
2119 case '&':
2120 while (slen--)
2121 *s++ &= *o++;
2122 break;
2123 default:
2124 break;
2125 }
2126 }
711a919c 2127 else {
979f2922
TS
2128 STRLEN otheroctets = otherbits >> 3;
2129 STRLEN offset = 0;
35da51f7 2130 U8* const send = s + slen;
979f2922
TS
2131
2132 while (s < send) {
2133 UV otherval = 0;
2134
2135 if (otherbits == 1) {
2136 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2137 ++offset;
2138 }
2139 else {
2140 STRLEN vlen = otheroctets;
2141 otherval = *o++;
2142 while (--vlen) {
2143 otherval <<= 8;
2144 otherval |= *o++;
2145 }
2146 }
2147
711a919c 2148 if (opc == '+' && otherval)
6f207bd3 2149 NOOP; /* replace with otherval */
979f2922
TS
2150 else if (opc == '!' && !otherval)
2151 otherval = 1;
2152 else if (opc == '-' && otherval)
2153 otherval = 0;
2154 else if (opc == '&' && !otherval)
2155 otherval = 0;
2156 else {
711a919c 2157 s += octets; /* no replacement */
979f2922
TS
2158 continue;
2159 }
2160
2161 if (bits == 8)
2162 *s++ = (U8)( otherval & 0xff);
2163 else if (bits == 16) {
2164 *s++ = (U8)((otherval >> 8) & 0xff);
2165 *s++ = (U8)( otherval & 0xff);
2166 }
2167 else if (bits == 32) {
2168 *s++ = (U8)((otherval >> 24) & 0xff);
2169 *s++ = (U8)((otherval >> 16) & 0xff);
2170 *s++ = (U8)((otherval >> 8) & 0xff);
2171 *s++ = (U8)( otherval & 0xff);
2172 }
2173 }
2174 }
2175 sv_free(other); /* through with it! */
2176 } /* while */
2177 return swatch;
2178}
2179
0f830e0b 2180/*
87cea99e 2181=for apidoc uvchr_to_utf8
0f830e0b
NC
2182
2183Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2184of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2185bytes available. The return value is the pointer to the byte after the
2186end of the new character. In other words,
2187
2188 d = uvchr_to_utf8(d, uv);
2189
2190is the recommended wide native character-aware way of saying
2191
2192 *(d++) = uv;
2193
2194=cut
2195*/
2196
2197/* On ASCII machines this is normally a macro but we want a
2198 real function in case XS code wants it
2199*/
0f830e0b
NC
2200U8 *
2201Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2202{
7918f24d
NC
2203 PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2204
0f830e0b
NC
2205 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2206}
2207
b851fbc1
JH
2208U8 *
2209Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2210{
7918f24d
NC
2211 PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2212
b851fbc1
JH
2213 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2214}
2b9d42f0
NIS
2215
2216/*
87cea99e 2217=for apidoc utf8n_to_uvchr
0f830e0b
NC
2218flags
2219
2220Returns the native character value of the first character in the string
2221C<s>
2222which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2223length, in bytes, of that character.
2224
2225Allows length and flags to be passed to low level routine.
2226
2227=cut
2228*/
2229/* On ASCII machines this is normally a macro but we want
2230 a real function in case XS code wants it
2231*/
0f830e0b
NC
2232UV
2233Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2234U32 flags)
2235{
2236 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
7918f24d
NC
2237
2238 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2239
0f830e0b
NC
2240 return UNI_TO_NATIVE(uv);
2241}
2242
2243/*
87cea99e 2244=for apidoc pv_uni_display
d2cc3551
JH
2245
2246Build to the scalar dsv a displayable version of the string spv,
2247length len, the displayable version being at most pvlim bytes long
2248(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2249
9e55ce06 2250The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 2251isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
2252to display the \\[nrfta\\] as the backslashed versions (like '\n')
2253(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2254UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2255UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2256
d2cc3551
JH
2257The pointer to the PV of the dsv is returned.
2258
2259=cut */
e6b2e755 2260char *
e1ec3a88 2261Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
2262{
2263 int truncated = 0;
e1ec3a88 2264 const char *s, *e;
e6b2e755 2265
7918f24d
NC
2266 PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2267
76f68e9b 2268 sv_setpvs(dsv, "");
7fddd944 2269 SvUTF8_off(dsv);
e1ec3a88 2270 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 2271 UV u;
a49f32c6
NC
2272 /* This serves double duty as a flag and a character to print after
2273 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2274 */
2275 char ok = 0;
c728cb41 2276
e6b2e755
JH
2277 if (pvlim && SvCUR(dsv) >= pvlim) {
2278 truncated++;
2279 break;
2280 }
2281 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 2282 if (u < 256) {
a3b680e6 2283 const unsigned char c = (unsigned char)u & 0xFF;
0bd48802 2284 if (flags & UNI_DISPLAY_BACKSLASH) {
a49f32c6 2285 switch (c) {
c728cb41 2286 case '\n':
a49f32c6 2287 ok = 'n'; break;
c728cb41 2288 case '\r':
a49f32c6 2289 ok = 'r'; break;
c728cb41 2290 case '\t':
a49f32c6 2291 ok = 't'; break;
c728cb41 2292 case '\f':
a49f32c6 2293 ok = 'f'; break;
c728cb41 2294 case '\a':
a49f32c6 2295 ok = 'a'; break;
c728cb41 2296 case '\\':
a49f32c6 2297 ok = '\\'; break;
c728cb41
JH
2298 default: break;
2299 }
a49f32c6 2300 if (ok) {
88c9ea1e 2301 const char string = ok;
76f68e9b 2302 sv_catpvs(dsv, "\\");
5e7aa789 2303 sv_catpvn(dsv, &string, 1);
a49f32c6 2304 }
c728cb41 2305 }
00e86452 2306 /* isPRINT() is the locale-blind version. */
a49f32c6 2307 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
88c9ea1e 2308 const char string = c;
5e7aa789 2309 sv_catpvn(dsv, &string, 1);
a49f32c6 2310 ok = 1;
0a2ef054 2311 }
c728cb41
JH
2312 }
2313 if (!ok)
9e55ce06 2314 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
2315 }
2316 if (truncated)
396482e1 2317 sv_catpvs(dsv, "...");
e6b2e755
JH
2318
2319 return SvPVX(dsv);
2320}
2b9d42f0 2321
d2cc3551 2322/*
87cea99e 2323=for apidoc sv_uni_display
d2cc3551
JH
2324
2325Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 2326the displayable version being at most pvlim bytes long
d2cc3551 2327(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
2328
2329The flags argument is as in pv_uni_display().
2330
d2cc3551
JH
2331The pointer to the PV of the dsv is returned.
2332
d4c19fe8
AL
2333=cut
2334*/
e6b2e755
JH
2335char *
2336Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2337{
7918f24d
NC
2338 PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2339
cfd0369c
NC
2340 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2341 SvCUR(ssv), pvlim, flags);
701a277b
JH
2342}
2343
d2cc3551 2344/*
87cea99e 2345=for apidoc ibcmp_utf8
d2cc3551
JH
2346
2347Return true if the strings s1 and s2 differ case-insensitively, false
2348if not (if they are equal case-insensitively). If u1 is true, the
2349string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
2350the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2351are false, the respective string is assumed to be in native 8-bit
2352encoding.
2353
2354If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2355in there (they will point at the beginning of the I<next> character).
2356If the pointers behind pe1 or pe2 are non-NULL, they are the end
2357pointers beyond which scanning will not continue under any
4cdaeff7 2358circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
2359s2+l2 will be used as goal end pointers that will also stop the scan,
2360and which qualify towards defining a successful match: all the scans
2361that define an explicit length must reach their goal pointers for
2362a match to succeed).
d2cc3551
JH
2363
2364For case-insensitiveness, the "casefolding" of Unicode is used
2365instead of upper/lowercasing both the characters, see
2366http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2367
2368=cut */
701a277b 2369I32
d07ddd77 2370Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2371{
97aff369 2372 dVAR;
e1ec3a88
AL
2373 register const U8 *p1 = (const U8*)s1;
2374 register const U8 *p2 = (const U8*)s2;
cbbf8932 2375 register const U8 *f1 = NULL;
2f73348c 2376 register const U8 *f2 = NULL;
cbbf8932
AL
2377 register U8 *e1 = NULL;
2378 register U8 *q1 = NULL;
2379 register U8 *e2 = NULL;
2380 register U8 *q2 = NULL;
d07ddd77 2381 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
2382 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2383 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2384 U8 natbuf[1+1];
2385 STRLEN foldlen1, foldlen2;
d07ddd77 2386 bool match;
7918f24d
NC
2387
2388 PERL_ARGS_ASSERT_IBCMP_UTF8;
332ddc25 2389
d07ddd77
JH
2390 if (pe1)
2391 e1 = *(U8**)pe1;
a0a388a1 2392 /* assert(e1 || l1); */
e1ec3a88
AL
2393 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2394 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2395 if (pe2)
2396 e2 = *(U8**)pe2;
a0a388a1 2397 /* assert(e2 || l2); */
e1ec3a88
AL
2398 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2399 f2 = (const U8*)s2 + l2;
d07ddd77 2400
a0a388a1
YO
2401 /* This shouldn't happen. However, putting an assert() there makes some
2402 * tests fail. */
2403 /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
d07ddd77
JH
2404 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2405 return 1; /* mismatch; possible infinite loop or false positive */
2406
a6872d42
JH
2407 if (!u1 || !u2)
2408 natbuf[1] = 0; /* Need to terminate the buffer. */
2409
d07ddd77
JH
2410 while ((e1 == 0 || p1 < e1) &&
2411 (f1 == 0 || p1 < f1) &&
2412 (e2 == 0 || p2 < e2) &&
2413 (f2 == 0 || p2 < f2)) {
2414 if (n1 == 0) {
d7f013c8
JH
2415 if (u1)
2416 to_utf8_fold(p1, foldbuf1, &foldlen1);
2417 else {
809e8e66 2418 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2419 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2420 }
2421 q1 = foldbuf1;
d07ddd77 2422 n1 = foldlen1;
332ddc25 2423 }
d07ddd77 2424 if (n2 == 0) {
d7f013c8
JH
2425 if (u2)
2426 to_utf8_fold(p2, foldbuf2, &foldlen2);
2427 else {
809e8e66 2428 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2429 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2430 }
2431 q2 = foldbuf2;
d07ddd77 2432 n2 = foldlen2;
332ddc25 2433 }
d07ddd77
JH
2434 while (n1 && n2) {
2435 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2436 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2437 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2438 return 1; /* mismatch */
d07ddd77 2439 n1 -= UTF8SKIP(q1);
d7f013c8 2440 q1 += UTF8SKIP(q1);
d07ddd77 2441 n2 -= UTF8SKIP(q2);
d7f013c8 2442 q2 += UTF8SKIP(q2);
701a277b 2443 }
d07ddd77 2444 if (n1 == 0)
d7f013c8 2445 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2446 if (n2 == 0)
d7f013c8
JH
2447 p2 += u2 ? UTF8SKIP(p2) : 1;
2448
d2cc3551 2449 }
5469e704 2450
d07ddd77
JH
2451 /* A match is defined by all the scans that specified
2452 * an explicit length reaching their final goals. */
2453 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2454
2455 if (match) {
d07ddd77
JH
2456 if (pe1)
2457 *pe1 = (char*)p1;
2458 if (pe2)
2459 *pe2 = (char*)p2;
5469e704
JH
2460 }
2461
2462 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2463}
701a277b 2464
a49f32c6
NC
2465/*
2466 * Local variables:
2467 * c-indentation-style: bsd
2468 * c-basic-offset: 4
2469 * indent-tabs-mode: t
2470 * End:
2471 *
37442d52
RGS
2472 * ex: set ts=8 sts=4 sw=4 noet:
2473 */