This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add a section on applying a patch
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550
TC
20 * as is the custom in the West, if you wish to be answered?'
21 * --Gandalf, addressing Théoden's door wardens
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3
LW
33#include "perl.h"
34
a0c21aa1
JH
35#ifndef EBCDIC
36/* Separate prototypes needed because in ASCII systems these
37 * usually macros but they still are compiled as code, too. */
38PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
39PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
40#endif
41
27da23d5
JH
42static const char unees[] =
43 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 44
ccfc67b7
JH
45/*
46=head1 Unicode Support
a0ed51b3 47
166f8a29
DM
48This file contains various utility functions for manipulating UTF8-encoded
49strings. For the uninitiated, this is a method of representing arbitrary
61296642 50Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
51characters in the ASCII range are unmodified, and a zero byte never appears
52within non-zero characters.
166f8a29 53
87cea99e 54=for apidoc uvuni_to_utf8_flags
eebe1485 55
1e54db1a 56Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 57of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 58bytes available. The return value is the pointer to the byte after the
9041c2e3 59end of the new character. In other words,
eebe1485 60
b851fbc1
JH
61 d = uvuni_to_utf8_flags(d, uv, flags);
62
63or, in most cases,
64
9041c2e3 65 d = uvuni_to_utf8(d, uv);
eebe1485 66
b851fbc1
JH
67(which is equivalent to)
68
69 d = uvuni_to_utf8_flags(d, uv, 0);
70
eebe1485
SC
71is the recommended Unicode-aware way of saying
72
73 *(d++) = uv;
74
75=cut
76*/
77
dfe13c55 78U8 *
b851fbc1 79Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 80{
7918f24d
NC
81 PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
82
62961d2e 83 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
84 if (UNICODE_IS_SURROGATE(uv) &&
85 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 86 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
87 else if (
88 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
89 !(flags & UNICODE_ALLOW_FDD0))
90 ||
c867b360 91 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
92 !(flags & UNICODE_ALLOW_FFFF))) &&
93 /* UNICODE_ALLOW_SUPER includes
2a20b9da 94 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
95 ((uv <= PERL_UNICODE_MAX) ||
96 !(flags & UNICODE_ALLOW_SUPER))
97 )
9014280d 98 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
99 "Unicode character 0x%04"UVxf" is illegal", uv);
100 }
c4d5f83a 101 if (UNI_IS_INVARIANT(uv)) {
eb160463 102 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
103 return d;
104 }
2d331972 105#if defined(EBCDIC)
1d72bdf6
NIS
106 else {
107 STRLEN len = UNISKIP(uv);
108 U8 *p = d+len-1;
109 while (p > d) {
eb160463 110 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
111 uv >>= UTF_ACCUMULATION_SHIFT;
112 }
eb160463 113 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
114 return d+len;
115 }
116#else /* Non loop style */
a0ed51b3 117 if (uv < 0x800) {
eb160463
GS
118 *d++ = (U8)(( uv >> 6) | 0xc0);
119 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
120 return d;
121 }
122 if (uv < 0x10000) {
eb160463
GS
123 *d++ = (U8)(( uv >> 12) | 0xe0);
124 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
125 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
126 return d;
127 }
128 if (uv < 0x200000) {
eb160463
GS
129 *d++ = (U8)(( uv >> 18) | 0xf0);
130 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
132 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
133 return d;
134 }
135 if (uv < 0x4000000) {
eb160463
GS
136 *d++ = (U8)(( uv >> 24) | 0xf8);
137 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
138 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
139 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
140 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
141 return d;
142 }
143 if (uv < 0x80000000) {
eb160463
GS
144 *d++ = (U8)(( uv >> 30) | 0xfc);
145 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
146 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
147 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
148 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
149 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
150 return d;
151 }
6b8eaf93 152#ifdef HAS_QUAD
d7578b48 153 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
154#endif
155 {
eb160463
GS
156 *d++ = 0xfe; /* Can't match U+FEFF! */
157 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
162 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
163 return d;
164 }
6b8eaf93 165#ifdef HAS_QUAD
a0ed51b3 166 {
eb160463
GS
167 *d++ = 0xff; /* Can't match U+FFFE! */
168 *d++ = 0x80; /* 6 Reserved bits */
169 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
170 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
171 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
172 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
173 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
174 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
175 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
176 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
177 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
178 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
179 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
180 return d;
181 }
182#endif
1d72bdf6 183#endif /* Loop style */
a0ed51b3 184}
9041c2e3 185
646ca15d
JH
186/*
187
188Tests if some arbitrary number of bytes begins in a valid UTF-8
189character. Note that an INVARIANT (i.e. ASCII) character is a valid
190UTF-8 character. The actual number of bytes in the UTF-8 character
191will be returned if it is valid, otherwise 0.
192
193This is the "slow" version as opposed to the "fast" version which is
194the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
195difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
196or less you should use the IS_UTF8_CHAR(), for lengths of five or more
197you should use the _slow(). In practice this means that the _slow()
198will be used very rarely, since the maximum Unicode code point (as of
199Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
200the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
201five bytes or more.
202
203=cut */
c053b435 204STATIC STRLEN
5f66b61c 205S_is_utf8_char_slow(const U8 *s, const STRLEN len)
646ca15d
JH
206{
207 U8 u = *s;
208 STRLEN slen;
209 UV uv, ouv;
210
7918f24d
NC
211 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
212
646ca15d
JH
213 if (UTF8_IS_INVARIANT(u))
214 return 1;
215
216 if (!UTF8_IS_START(u))
217 return 0;
218
219 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
220 return 0;
221
222 slen = len - 1;
223 s++;
77263263
ST
224#ifdef EBCDIC
225 u = NATIVE_TO_UTF(u);
226#endif
646ca15d
JH
227 u &= UTF_START_MASK(len);
228 uv = u;
229 ouv = uv;
230 while (slen--) {
231 if (!UTF8_IS_CONTINUATION(*s))
232 return 0;
233 uv = UTF8_ACCUMULATE(uv, *s);
234 if (uv < ouv)
235 return 0;
236 ouv = uv;
237 s++;
238 }
239
240 if ((STRLEN)UNISKIP(uv) < len)
241 return 0;
242
243 return len;
244}
9041c2e3
NIS
245
246/*
87cea99e 247=for apidoc is_utf8_char
eebe1485 248
5da9da9e 249Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
250character. Note that an INVARIANT (i.e. ASCII) character is a valid
251UTF-8 character. The actual number of bytes in the UTF-8 character
252will be returned if it is valid, otherwise 0.
9041c2e3 253
82686b01 254=cut */
067a85ef 255STRLEN
7fc63493 256Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 257{
44f8325f 258 const STRLEN len = UTF8SKIP(s);
7918f24d
NC
259
260 PERL_ARGS_ASSERT_IS_UTF8_CHAR;
96a5add6 261 PERL_UNUSED_CONTEXT;
3b0fc154 262#ifdef IS_UTF8_CHAR
768c67ee 263 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
264 return IS_UTF8_CHAR(s, len) ? len : 0;
265#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 266 return is_utf8_char_slow(s, len);
386d01d6
GS
267}
268
6662521e 269/*
87cea99e 270=for apidoc is_utf8_string
6662521e 271
c9ada85f 272Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
273UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
274not mean 'a string that contains code points above 0x7F encoded in UTF-8'
275because a valid ASCII string is a valid UTF-8 string.
6662521e 276
768c67ee
JH
277See also is_utf8_string_loclen() and is_utf8_string_loc().
278
6662521e
GS
279=cut
280*/
281
8e84507e 282bool
7fc63493 283Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 284{
35da51f7 285 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 286 const U8* x = s;
067a85ef 287
7918f24d 288 PERL_ARGS_ASSERT_IS_UTF8_STRING;
96a5add6 289 PERL_UNUSED_CONTEXT;
1aa99e6b 290
6662521e 291 while (x < send) {
a3b680e6 292 STRLEN c;
1acdb0da
JH
293 /* Inline the easy bits of is_utf8_char() here for speed... */
294 if (UTF8_IS_INVARIANT(*x))
295 c = 1;
296 else if (!UTF8_IS_START(*x))
768c67ee 297 goto out;
1acdb0da
JH
298 else {
299 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
300#ifdef IS_UTF8_CHAR
301 c = UTF8SKIP(x);
768c67ee
JH
302 if (IS_UTF8_CHAR_FAST(c)) {
303 if (!IS_UTF8_CHAR(x, c))
3c614e38
AD
304 c = 0;
305 }
306 else
307 c = is_utf8_char_slow(x, c);
646ca15d
JH
308#else
309 c = is_utf8_char(x);
310#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 311 if (!c)
768c67ee 312 goto out;
1acdb0da 313 }
6662521e 314 x += c;
6662521e 315 }
768c67ee
JH
316
317 out:
60006e79
JH
318 if (x != send)
319 return FALSE;
067a85ef
A
320
321 return TRUE;
6662521e
GS
322}
323
67e989fb 324/*
814fafa7
NC
325Implemented as a macro in utf8.h
326
87cea99e 327=for apidoc is_utf8_string_loc
814fafa7
NC
328
329Like is_utf8_string() but stores the location of the failure (in the
330case of "utf8ness failure") or the location s+len (in the case of
331"utf8ness success") in the C<ep>.
332
333See also is_utf8_string_loclen() and is_utf8_string().
334
87cea99e 335=for apidoc is_utf8_string_loclen
81cd54e3 336
e3e4599f 337Like is_utf8_string() but stores the location of the failure (in the
768c67ee
JH
338case of "utf8ness failure") or the location s+len (in the case of
339"utf8ness success") in the C<ep>, and the number of UTF-8
340encoded characters in the C<el>.
341
342See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
343
344=cut
345*/
346
347bool
768c67ee 348Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 349{
35da51f7 350 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 351 const U8* x = s;
81cd54e3 352 STRLEN c;
3ebfea28 353 STRLEN outlen = 0;
7918f24d
NC
354
355 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
96a5add6 356 PERL_UNUSED_CONTEXT;
81cd54e3 357
81cd54e3
JH
358 while (x < send) {
359 /* Inline the easy bits of is_utf8_char() here for speed... */
360 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
361 c = 1;
362 else if (!UTF8_IS_START(*x))
363 goto out;
81cd54e3 364 else {
768c67ee
JH
365 /* ... and call is_utf8_char() only if really needed. */
366#ifdef IS_UTF8_CHAR
367 c = UTF8SKIP(x);
368 if (IS_UTF8_CHAR_FAST(c)) {
369 if (!IS_UTF8_CHAR(x, c))
370 c = 0;
371 } else
372 c = is_utf8_char_slow(x, c);
373#else
374 c = is_utf8_char(x);
375#endif /* #ifdef IS_UTF8_CHAR */
376 if (!c)
377 goto out;
81cd54e3 378 }
768c67ee 379 x += c;
3ebfea28 380 outlen++;
81cd54e3 381 }
768c67ee
JH
382
383 out:
3ebfea28
AL
384 if (el)
385 *el = outlen;
386
768c67ee
JH
387 if (ep)
388 *ep = x;
3ebfea28 389 return (x == send);
81cd54e3
JH
390}
391
392/*
768c67ee 393
87cea99e 394=for apidoc utf8n_to_uvuni
67e989fb 395
9041c2e3 396Bottom level UTF-8 decode routine.
38a44b82 397Returns the Unicode code point value of the first character in the string C<s>
1e54db1a 398which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 399C<retlen> will be set to the length, in bytes, of that character.
67e989fb 400
1e54db1a 401If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
402is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
403it is assumed that the caller will raise a warning, and this function
28d3d195
JH
404will silently just set C<retlen> to C<-1> and return zero. If the
405C<flags> does not contain UTF8_CHECK_ONLY, warnings about
406malformations will be given, C<retlen> will be set to the expected
407length of the UTF-8 character in bytes, and zero will be returned.
408
409The C<flags> can also contain various flags to allow deviations from
410the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 411
9041c2e3
NIS
412Most code should use utf8_to_uvchr() rather than call this directly.
413
37607a96
PK
414=cut
415*/
67e989fb 416
a0ed51b3 417UV
7fc63493 418Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 419{
97aff369 420 dVAR;
d4c19fe8 421 const U8 * const s0 = s;
9c5ffd7c 422 UV uv = *s, ouv = 0;
ba210ebe 423 STRLEN len = 1;
7fc63493
AL
424 const bool dowarn = ckWARN_d(WARN_UTF8);
425 const UV startbyte = *s;
ba210ebe 426 STRLEN expectlen = 0;
a0dbb045
JH
427 U32 warning = 0;
428
7918f24d
NC
429 PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
430
a0dbb045
JH
431/* This list is a superset of the UTF8_ALLOW_XXX. */
432
433#define UTF8_WARN_EMPTY 1
434#define UTF8_WARN_CONTINUATION 2
435#define UTF8_WARN_NON_CONTINUATION 3
436#define UTF8_WARN_FE_FF 4
437#define UTF8_WARN_SHORT 5
438#define UTF8_WARN_OVERFLOW 6
439#define UTF8_WARN_SURROGATE 7
c867b360
JH
440#define UTF8_WARN_LONG 8
441#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
442
443 if (curlen == 0 &&
444 !(flags & UTF8_ALLOW_EMPTY)) {
445 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
446 goto malformed;
447 }
448
1d72bdf6 449 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
450 if (retlen)
451 *retlen = 1;
c4d5f83a 452 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 453 }
67e989fb 454
421a8bf2 455 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 456 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 457 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
458 goto malformed;
459 }
460
421a8bf2 461 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 462 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 463 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
464 goto malformed;
465 }
9041c2e3 466
1d72bdf6 467#ifdef EBCDIC
75383841 468 uv = NATIVE_TO_UTF(uv);
1d72bdf6 469#else
fcc8fcf6
JH
470 if ((uv == 0xfe || uv == 0xff) &&
471 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 472 warning = UTF8_WARN_FE_FF;
ba210ebe 473 goto malformed;
a0ed51b3 474 }
1d72bdf6
NIS
475#endif
476
ba210ebe
JH
477 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
478 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
479 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
480 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
481#ifdef EBCDIC
482 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
483 else { len = 7; uv &= 0x01; }
484#else
ba210ebe
JH
485 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
486 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
487 else { len = 13; uv = 0; } /* whoa! */
488#endif
489
a0ed51b3
LW
490 if (retlen)
491 *retlen = len;
9041c2e3 492
ba210ebe
JH
493 expectlen = len;
494
fcc8fcf6
JH
495 if ((curlen < expectlen) &&
496 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 497 warning = UTF8_WARN_SHORT;
ba210ebe
JH
498 goto malformed;
499 }
500
501 len--;
a0ed51b3 502 s++;
ba210ebe
JH
503 ouv = uv;
504
a0ed51b3 505 while (len--) {
421a8bf2
JH
506 if (!UTF8_IS_CONTINUATION(*s) &&
507 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
508 s--;
509 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 510 goto malformed;
a0ed51b3
LW
511 }
512 else
8850bf83 513 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
514 if (!(uv > ouv)) {
515 /* These cannot be allowed. */
516 if (uv == ouv) {
75dbc644 517 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
518 warning = UTF8_WARN_LONG;
519 goto malformed;
520 }
521 }
522 else { /* uv < ouv */
523 /* This cannot be allowed. */
524 warning = UTF8_WARN_OVERFLOW;
525 goto malformed;
526 }
ba210ebe
JH
527 }
528 s++;
529 ouv = uv;
530 }
531
421a8bf2 532 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 533 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 534 warning = UTF8_WARN_SURROGATE;
ba210ebe 535 goto malformed;
eb160463 536 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 537 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 538 warning = UTF8_WARN_LONG;
ba210ebe 539 goto malformed;
421a8bf2 540 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 541 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 542 warning = UTF8_WARN_FFFF;
a9917092 543 goto malformed;
a0ed51b3 544 }
ba210ebe 545
a0ed51b3 546 return uv;
ba210ebe
JH
547
548malformed:
549
fcc8fcf6 550 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 551 if (retlen)
10edeb5d 552 *retlen = ((STRLEN) -1);
ba210ebe
JH
553 return 0;
554 }
555
a0dbb045 556 if (dowarn) {
84bafc02 557 SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
a0dbb045
JH
558
559 switch (warning) {
560 case 0: /* Intentionally empty. */ break;
561 case UTF8_WARN_EMPTY:
396482e1 562 sv_catpvs(sv, "(empty string)");
a0dbb045
JH
563 break;
564 case UTF8_WARN_CONTINUATION:
097fb8e2 565 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
566 break;
567 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
568 if (s == s0)
569 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
570 (UV)s[1], startbyte);
551405c4
AL
571 else {
572 const int len = (int)(s-s0);
097fb8e2 573 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4
AL
574 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
575 }
576
a0dbb045
JH
577 break;
578 case UTF8_WARN_FE_FF:
579 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
580 break;
581 case UTF8_WARN_SHORT:
097fb8e2 582 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 583 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 584 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
585 break;
586 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
587 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
588 ouv, *s, startbyte);
a0dbb045
JH
589 break;
590 case UTF8_WARN_SURROGATE:
591 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
592 break;
a0dbb045 593 case UTF8_WARN_LONG:
097fb8e2 594 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 595 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
596 break;
597 case UTF8_WARN_FFFF:
598 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
599 break;
600 default:
396482e1 601 sv_catpvs(sv, "(unknown reason)");
a0dbb045
JH
602 break;
603 }
604
605 if (warning) {
44f8325f 606 const char * const s = SvPVX_const(sv);
a0dbb045
JH
607
608 if (PL_op)
9014280d 609 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 610 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 611 else
9014280d 612 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
613 }
614 }
615
ba210ebe 616 if (retlen)
28d3d195 617 *retlen = expectlen ? expectlen : len;
ba210ebe 618
28d3d195 619 return 0;
a0ed51b3
LW
620}
621
8e84507e 622/*
87cea99e 623=for apidoc utf8_to_uvchr
9041c2e3
NIS
624
625Returns the native character value of the first character in the string C<s>
1e54db1a 626which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
627length, in bytes, of that character.
628
1e54db1a 629If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
630returned and retlen is set, if possible, to -1.
631
632=cut
633*/
634
635UV
7fc63493 636Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 637{
7918f24d
NC
638 PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
639
1754c1a1
NC
640 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
641 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
642}
643
644/*
87cea99e 645=for apidoc utf8_to_uvuni
9041c2e3
NIS
646
647Returns the Unicode code point of the first character in the string C<s>
1e54db1a 648which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
649length, in bytes, of that character.
650
651This function should only be used when returned UV is considered
652an index into the Unicode semantic tables (e.g. swashes).
653
1e54db1a 654If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 655returned and retlen is set, if possible, to -1.
8e84507e
NIS
656
657=cut
658*/
659
660UV
7fc63493 661Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 662{
7918f24d
NC
663 PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
664
9041c2e3 665 /* Call the low level routine asking for checks */
89ebb4a3 666 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 667 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
668}
669
b76347f2 670/*
87cea99e 671=for apidoc utf8_length
b76347f2
JH
672
673Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
674Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
675up past C<e>, croaks.
b76347f2
JH
676
677=cut
678*/
679
680STRLEN
35a4481c 681Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 682{
97aff369 683 dVAR;
b76347f2 684 STRLEN len = 0;
10edeb5d 685 U8 t = 0;
b76347f2 686
7918f24d
NC
687 PERL_ARGS_ASSERT_UTF8_LENGTH;
688
8850bf83
JH
689 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
690 * the bitops (especially ~) can create illegal UTF-8.
691 * In other words: in Perl UTF-8 is not just for Unicode. */
692
a3b680e6
AL
693 if (e < s)
694 goto warn_and_return;
b76347f2 695 while (s < e) {
10edeb5d 696 t = UTF8SKIP(s);
901b21bf 697 if (e - s < t) {
a3b680e6 698 warn_and_return:
901b21bf
JH
699 if (ckWARN_d(WARN_UTF8)) {
700 if (PL_op)
701 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 702 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
703 else
704 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
705 }
706 return len;
707 }
b76347f2
JH
708 s += t;
709 len++;
710 }
711
712 return len;
713}
714
b06226ff 715/*
87cea99e 716=for apidoc utf8_distance
b06226ff 717
1e54db1a 718Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
719and C<b>.
720
721WARNING: use only if you *know* that the pointers point inside the
722same UTF-8 buffer.
723
37607a96
PK
724=cut
725*/
a0ed51b3 726
02eb7b47 727IV
35a4481c 728Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 729{
7918f24d
NC
730 PERL_ARGS_ASSERT_UTF8_DISTANCE;
731
bf1665bc 732 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3
LW
733}
734
b06226ff 735/*
87cea99e 736=for apidoc utf8_hop
b06226ff 737
8850bf83
JH
738Return the UTF-8 pointer C<s> displaced by C<off> characters, either
739forward or backward.
b06226ff
JH
740
741WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
742the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
743on the first byte of character or just after the last byte of a character.
b06226ff 744
37607a96
PK
745=cut
746*/
a0ed51b3
LW
747
748U8 *
4373e329 749Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 750{
7918f24d
NC
751 PERL_ARGS_ASSERT_UTF8_HOP;
752
96a5add6 753 PERL_UNUSED_CONTEXT;
8850bf83
JH
754 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
755 * the bitops (especially ~) can create illegal UTF-8.
756 * In other words: in Perl UTF-8 is not just for Unicode. */
757
a0ed51b3
LW
758 if (off >= 0) {
759 while (off--)
760 s += UTF8SKIP(s);
761 }
762 else {
763 while (off++) {
764 s--;
8850bf83
JH
765 while (UTF8_IS_CONTINUATION(*s))
766 s--;
a0ed51b3
LW
767 }
768 }
4373e329 769 return (U8 *)s;
a0ed51b3
LW
770}
771
6940069f 772/*
87cea99e 773=for apidoc utf8_to_bytes
6940069f 774
1e54db1a 775Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
776Unlike C<bytes_to_utf8>, this over-writes the original string, and
777updates len to contain the new length.
67e989fb 778Returns zero on failure, setting C<len> to -1.
6940069f 779
95be277c
NC
780If you need a copy of the string, see C<bytes_from_utf8>.
781
6940069f
GS
782=cut
783*/
784
785U8 *
37607a96 786Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 787{
d4c19fe8
AL
788 U8 * const save = s;
789 U8 * const send = s + *len;
6940069f 790 U8 *d;
246fae53 791
7918f24d
NC
792 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
793
1e54db1a 794 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 795 while (s < send) {
dcad2880
JH
796 U8 c = *s++;
797
1d72bdf6
NIS
798 if (!UTF8_IS_INVARIANT(c) &&
799 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
800 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
10edeb5d 801 *len = ((STRLEN) -1);
dcad2880
JH
802 return 0;
803 }
246fae53 804 }
dcad2880
JH
805
806 d = s = save;
6940069f 807 while (s < send) {
ed646e6e 808 STRLEN ulen;
9041c2e3 809 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 810 s += ulen;
6940069f
GS
811 }
812 *d = '\0';
246fae53 813 *len = d - save;
6940069f
GS
814 return save;
815}
816
817/*
87cea99e 818=for apidoc bytes_from_utf8
f9a63242 819
1e54db1a 820Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 821Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
822the newly-created string, and updates C<len> to contain the new
823length. Returns the original string if no conversion occurs, C<len>
824is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8250 if C<s> is converted or contains all 7bit characters.
f9a63242 826
37607a96
PK
827=cut
828*/
f9a63242
JH
829
830U8 *
e1ec3a88 831Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 832{
f9a63242 833 U8 *d;
e1ec3a88
AL
834 const U8 *start = s;
835 const U8 *send;
f9a63242
JH
836 I32 count = 0;
837
7918f24d
NC
838 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
839
96a5add6 840 PERL_UNUSED_CONTEXT;
f9a63242 841 if (!*is_utf8)
73d840c0 842 return (U8 *)start;
f9a63242 843
1e54db1a 844 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 845 for (send = s + *len; s < send;) {
e1ec3a88 846 U8 c = *s++;
1d72bdf6 847 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
848 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
849 (c = *s++) && UTF8_IS_CONTINUATION(c))
850 count++;
851 else
73d840c0 852 return (U8 *)start;
db42d148 853 }
f9a63242
JH
854 }
855
35da51f7 856 *is_utf8 = FALSE;
f9a63242 857
212542aa 858 Newx(d, (*len) - count + 1, U8);
ef9edfd0 859 s = start; start = d;
f9a63242
JH
860 while (s < send) {
861 U8 c = *s++;
c4d5f83a
NIS
862 if (!UTF8_IS_INVARIANT(c)) {
863 /* Then it is two-byte encoded */
864 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
865 c = ASCII_TO_NATIVE(c);
866 }
867 *d++ = c;
f9a63242
JH
868 }
869 *d = '\0';
870 *len = d - start;
73d840c0 871 return (U8 *)start;
f9a63242
JH
872}
873
874/*
87cea99e 875=for apidoc bytes_to_utf8
6940069f 876
1e54db1a 877Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
878Returns a pointer to the newly-created string, and sets C<len> to
879reflect the new length.
6940069f 880
1e54db1a 881If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
882see sv_recode_to_utf8().
883
497711e7 884=cut
6940069f
GS
885*/
886
887U8*
35a4481c 888Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 889{
35a4481c 890 const U8 * const send = s + (*len);
6940069f
GS
891 U8 *d;
892 U8 *dst;
7918f24d
NC
893
894 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 895 PERL_UNUSED_CONTEXT;
6940069f 896
212542aa 897 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
898 dst = d;
899
900 while (s < send) {
35a4481c 901 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 902 if (UNI_IS_INVARIANT(uv))
eb160463 903 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 904 else {
eb160463
GS
905 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
906 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
907 }
908 }
909 *d = '\0';
6662521e 910 *len = d-dst;
6940069f
GS
911 return dst;
912}
913
a0ed51b3 914/*
dea0fc0b 915 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
916 *
917 * Destination must be pre-extended to 3/2 source. Do not use in-place.
918 * We optimize for native, for obvious reasons. */
919
920U8*
dea0fc0b 921Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 922{
dea0fc0b
JH
923 U8* pend;
924 U8* dstart = d;
925
7918f24d
NC
926 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
927
1de9afcd
RGS
928 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
929 d[0] = 0;
930 *newlen = 1;
931 return d;
932 }
933
dea0fc0b 934 if (bytelen & 1)
f5992bc4 935 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
936
937 pend = p + bytelen;
938
a0ed51b3 939 while (p < pend) {
dea0fc0b
JH
940 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
941 p += 2;
a0ed51b3 942 if (uv < 0x80) {
e294cc5d
JH
943#ifdef EBCDIC
944 *d++ = UNI_TO_NATIVE(uv);
945#else
eb160463 946 *d++ = (U8)uv;
e294cc5d 947#endif
a0ed51b3
LW
948 continue;
949 }
950 if (uv < 0x800) {
eb160463
GS
951 *d++ = (U8)(( uv >> 6) | 0xc0);
952 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
953 continue;
954 }
955 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
DT
956 UV low = (p[0] << 8) + p[1];
957 p += 2;
dea0fc0b
JH
958 if (low < 0xdc00 || low >= 0xdfff)
959 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
960 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
961 }
962 if (uv < 0x10000) {
eb160463
GS
963 *d++ = (U8)(( uv >> 12) | 0xe0);
964 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
965 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
966 continue;
967 }
968 else {
eb160463
GS
969 *d++ = (U8)(( uv >> 18) | 0xf0);
970 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
971 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
972 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
973 continue;
974 }
975 }
dea0fc0b 976 *newlen = d - dstart;
a0ed51b3
LW
977 return d;
978}
979
980/* Note: this one is slightly destructive of the source. */
981
982U8*
dea0fc0b 983Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
984{
985 U8* s = (U8*)p;
d4c19fe8 986 U8* const send = s + bytelen;
7918f24d
NC
987
988 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
989
a0ed51b3 990 while (s < send) {
d4c19fe8 991 const U8 tmp = s[0];
a0ed51b3
LW
992 s[0] = s[1];
993 s[1] = tmp;
994 s += 2;
995 }
dea0fc0b 996 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
997}
998
999/* for now these are all defined (inefficiently) in terms of the utf8 versions */
1000
1001bool
84afefe6 1002Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 1003{
89ebb4a3 1004 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1005 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1006 return is_utf8_alnum(tmpbuf);
1007}
1008
1009bool
84afefe6 1010Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 1011{
89ebb4a3 1012 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1013 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1014 return is_utf8_alnumc(tmpbuf);
1015}
1016
1017bool
84afefe6 1018Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1019{
89ebb4a3 1020 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1021 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1022 return is_utf8_idfirst(tmpbuf);
1023}
1024
1025bool
84afefe6 1026Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1027{
89ebb4a3 1028 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1029 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1030 return is_utf8_alpha(tmpbuf);
1031}
1032
1033bool
84afefe6 1034Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1035{
89ebb4a3 1036 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1037 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1038 return is_utf8_ascii(tmpbuf);
1039}
1040
1041bool
84afefe6 1042Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1043{
89ebb4a3 1044 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1045 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1046 return is_utf8_space(tmpbuf);
1047}
1048
1049bool
84afefe6 1050Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1051{
89ebb4a3 1052 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1053 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1054 return is_utf8_digit(tmpbuf);
1055}
1056
1057bool
84afefe6 1058Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1059{
89ebb4a3 1060 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1061 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1062 return is_utf8_upper(tmpbuf);
1063}
1064
1065bool
84afefe6 1066Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1067{
89ebb4a3 1068 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1069 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1070 return is_utf8_lower(tmpbuf);
1071}
1072
1073bool
84afefe6 1074Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1075{
89ebb4a3 1076 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1077 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1078 return is_utf8_cntrl(tmpbuf);
1079}
1080
1081bool
84afefe6 1082Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1083{
89ebb4a3 1084 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1085 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1086 return is_utf8_graph(tmpbuf);
1087}
1088
1089bool
84afefe6 1090Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1091{
89ebb4a3 1092 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1093 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1094 return is_utf8_print(tmpbuf);
1095}
1096
b8c5462f 1097bool
84afefe6 1098Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1099{
89ebb4a3 1100 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1101 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1102 return is_utf8_punct(tmpbuf);
1103}
1104
4d61ec05 1105bool
84afefe6 1106Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1107{
89ebb4a3 1108 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1109 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1110 return is_utf8_xdigit(tmpbuf);
1111}
1112
84afefe6
JH
1113UV
1114Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1115{
7918f24d
NC
1116 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1117
0ebc6274
JH
1118 uvchr_to_utf8(p, c);
1119 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1120}
1121
84afefe6
JH
1122UV
1123Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1124{
7918f24d
NC
1125 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1126
0ebc6274
JH
1127 uvchr_to_utf8(p, c);
1128 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1129}
1130
84afefe6
JH
1131UV
1132Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1133{
7918f24d
NC
1134 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1135
0ebc6274
JH
1136 uvchr_to_utf8(p, c);
1137 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1138}
1139
84afefe6
JH
1140UV
1141Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1142{
7918f24d
NC
1143 PERL_ARGS_ASSERT_TO_UNI_FOLD;
1144
0ebc6274
JH
1145 uvchr_to_utf8(p, c);
1146 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1147}
1148
a0ed51b3
LW
1149/* for now these all assume no locale info available for Unicode > 255 */
1150
1151bool
84afefe6 1152Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1153{
1154 return is_uni_alnum(c); /* XXX no locale support yet */
1155}
1156
1157bool
84afefe6 1158Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1159{
1160 return is_uni_alnumc(c); /* XXX no locale support yet */
1161}
1162
1163bool
84afefe6 1164Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1165{
1166 return is_uni_idfirst(c); /* XXX no locale support yet */
1167}
1168
1169bool
84afefe6 1170Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1171{
1172 return is_uni_alpha(c); /* XXX no locale support yet */
1173}
1174
1175bool
84afefe6 1176Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1177{
1178 return is_uni_ascii(c); /* XXX no locale support yet */
1179}
1180
1181bool
84afefe6 1182Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1183{
1184 return is_uni_space(c); /* XXX no locale support yet */
1185}
1186
1187bool
84afefe6 1188Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1189{
1190 return is_uni_digit(c); /* XXX no locale support yet */
1191}
1192
1193bool
84afefe6 1194Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1195{
1196 return is_uni_upper(c); /* XXX no locale support yet */
1197}
1198
1199bool
84afefe6 1200Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1201{
1202 return is_uni_lower(c); /* XXX no locale support yet */
1203}
1204
1205bool
84afefe6 1206Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1207{
1208 return is_uni_cntrl(c); /* XXX no locale support yet */
1209}
1210
1211bool
84afefe6 1212Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1213{
1214 return is_uni_graph(c); /* XXX no locale support yet */
1215}
1216
1217bool
84afefe6 1218Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1219{
1220 return is_uni_print(c); /* XXX no locale support yet */
1221}
1222
b8c5462f 1223bool
84afefe6 1224Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1225{
1226 return is_uni_punct(c); /* XXX no locale support yet */
1227}
1228
4d61ec05 1229bool
84afefe6 1230Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1231{
1232 return is_uni_xdigit(c); /* XXX no locale support yet */
1233}
1234
b7ac61fa
JH
1235U32
1236Perl_to_uni_upper_lc(pTHX_ U32 c)
1237{
ee099d14
JH
1238 /* XXX returns only the first character -- do not use XXX */
1239 /* XXX no locale support yet */
1240 STRLEN len;
89ebb4a3 1241 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1242 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1243}
1244
1245U32
1246Perl_to_uni_title_lc(pTHX_ U32 c)
1247{
ee099d14
JH
1248 /* XXX returns only the first character XXX -- do not use XXX */
1249 /* XXX no locale support yet */
1250 STRLEN len;
89ebb4a3 1251 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1252 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1253}
1254
1255U32
1256Perl_to_uni_lower_lc(pTHX_ U32 c)
1257{
ee099d14
JH
1258 /* XXX returns only the first character -- do not use XXX */
1259 /* XXX no locale support yet */
1260 STRLEN len;
89ebb4a3 1261 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1262 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1263}
1264
7452cf6a 1265static bool
5141f98e 1266S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
1267 const char *const swashname)
1268{
97aff369 1269 dVAR;
7918f24d
NC
1270
1271 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1272
bde6a22d
NC
1273 if (!is_utf8_char(p))
1274 return FALSE;
1275 if (!*swash)
711a919c 1276 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
bde6a22d
NC
1277 return swash_fetch(*swash, p, TRUE) != 0;
1278}
1279
1280bool
7fc63493 1281Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1282{
97aff369 1283 dVAR;
7918f24d
NC
1284
1285 PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1286
671c33bf
NC
1287 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1288 * descendant of isalnum(3), in other words, it doesn't
1289 * contain the '_'. --jhi */
d4c19fe8 1290 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
a0ed51b3
LW
1291}
1292
1293bool
7fc63493 1294Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1295{
97aff369 1296 dVAR;
7918f24d
NC
1297
1298 PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
1299
d4c19fe8 1300 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
b8c5462f
JH
1301}
1302
1303bool
7fc63493 1304Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1305{
97aff369 1306 dVAR;
7918f24d
NC
1307
1308 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1309
82686b01
JH
1310 if (*p == '_')
1311 return TRUE;
bde6a22d 1312 /* is_utf8_idstart would be more logical. */
d4c19fe8 1313 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1314}
1315
1316bool
7fc63493 1317Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1318{
97aff369 1319 dVAR;
7918f24d
NC
1320
1321 PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1322
82686b01
JH
1323 if (*p == '_')
1324 return TRUE;
d4c19fe8 1325 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1326}
1327
1328bool
7fc63493 1329Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1330{
97aff369 1331 dVAR;
7918f24d
NC
1332
1333 PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1334
d4c19fe8 1335 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1336}
1337
1338bool
7fc63493 1339Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1340{
97aff369 1341 dVAR;
7918f24d
NC
1342
1343 PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1344
d4c19fe8 1345 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
b8c5462f
JH
1346}
1347
1348bool
7fc63493 1349Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1350{
97aff369 1351 dVAR;
7918f24d
NC
1352
1353 PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1354
d4c19fe8 1355 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3
LW
1356}
1357
1358bool
7fc63493 1359Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1360{
97aff369 1361 dVAR;
7918f24d
NC
1362
1363 PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1364
d4c19fe8 1365 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1366}
1367
1368bool
7fc63493 1369Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1370{
97aff369 1371 dVAR;
7918f24d
NC
1372
1373 PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1374
d4c19fe8 1375 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1376}
1377
1378bool
7fc63493 1379Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1380{
97aff369 1381 dVAR;
7918f24d
NC
1382
1383 PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1384
d4c19fe8 1385 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1386}
1387
1388bool
7fc63493 1389Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1390{
97aff369 1391 dVAR;
7918f24d
NC
1392
1393 PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1394
d4c19fe8 1395 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f
JH
1396}
1397
1398bool
7fc63493 1399Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1400{
97aff369 1401 dVAR;
7918f24d
NC
1402
1403 PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1404
d4c19fe8 1405 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1406}
1407
1408bool
7fc63493 1409Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1410{
97aff369 1411 dVAR;
7918f24d
NC
1412
1413 PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1414
d4c19fe8 1415 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1416}
1417
1418bool
7fc63493 1419Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1420{
97aff369 1421 dVAR;
7918f24d
NC
1422
1423 PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1424
d4c19fe8 1425 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1426}
1427
1428bool
7fc63493 1429Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1430{
97aff369 1431 dVAR;
7918f24d
NC
1432
1433 PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1434
d4c19fe8 1435 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f
JH
1436}
1437
1438bool
7fc63493 1439Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1440{
97aff369 1441 dVAR;
7918f24d
NC
1442
1443 PERL_ARGS_ASSERT_IS_UTF8_MARK;
1444
d4c19fe8 1445 return is_utf8_common(p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1446}
1447
6b5c0936 1448/*
87cea99e 1449=for apidoc to_utf8_case
6b5c0936
JH
1450
1451The "p" contains the pointer to the UTF-8 string encoding
1452the character that is being converted.
1453
1454The "ustrp" is a pointer to the character buffer to put the
1455conversion result to. The "lenp" is a pointer to the length
1456of the result.
1457
0134edef 1458The "swashp" is a pointer to the swash to use.
6b5c0936 1459
0134edef 1460Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
8fe4d5b2 1461and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 1462but not always, a multicharacter mapping), is tried first.
6b5c0936 1463
0134edef
JH
1464The "special" is a string like "utf8::ToSpecLower", which means the
1465hash %utf8::ToSpecLower. The access to the hash is through
1466Perl_to_utf8_case().
6b5c0936 1467
0134edef
JH
1468The "normal" is a string like "ToLower" which means the swash
1469%utf8::ToLower.
1470
1471=cut */
6b5c0936 1472
2104c8d9 1473UV
9a957fbc
AL
1474Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1475 SV **swashp, const char *normal, const char *special)
a0ed51b3 1476{
97aff369 1477 dVAR;
89ebb4a3 1478 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1479 STRLEN len = 0;
aec46f14 1480 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1481 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1482 * are necessary in EBCDIC, they are redundant no-ops
1483 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1484 const UV uv1 = NATIVE_TO_UNI(uv0);
7918f24d
NC
1485
1486 PERL_ARGS_ASSERT_TO_UTF8_CASE;
1487
1feea2c7 1488 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1489
1490 if (!*swashp) /* load on-demand */
1491 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1492
b08cf34e
JH
1493 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1494 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1495 /* It might be "special" (sometimes, but not always,
2a37f04d 1496 * a multicharacter mapping) */
35da51f7 1497 HV * const hv = get_hv(special, FALSE);
b08cf34e
JH
1498 SV **svp;
1499
35da51f7 1500 if (hv &&
b08cf34e
JH
1501 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1502 (*svp)) {
cfd0369c 1503 const char *s;
47654450 1504
cfd0369c 1505 s = SvPV_const(*svp, len);
47654450
JH
1506 if (len == 1)
1507 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1508 else {
2f9475ad
JH
1509#ifdef EBCDIC
1510 /* If we have EBCDIC we need to remap the characters
1511 * since any characters in the low 256 are Unicode
1512 * code points, not EBCDIC. */
7cda7a3d 1513 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1514
1515 d = tmpbuf;
b08cf34e 1516 if (SvUTF8(*svp)) {
2f9475ad
JH
1517 STRLEN tlen = 0;
1518
1519 while (t < tend) {
d4c19fe8 1520 const UV c = utf8_to_uvchr(t, &tlen);
2f9475ad
JH
1521 if (tlen > 0) {
1522 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1523 t += tlen;
1524 }
1525 else
1526 break;
1527 }
1528 }
1529 else {
36fec512
JH
1530 while (t < tend) {
1531 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1532 t++;
1533 }
2f9475ad
JH
1534 }
1535 len = d - tmpbuf;
1536 Copy(tmpbuf, ustrp, len, U8);
1537#else
d2dcd0fb 1538 Copy(s, ustrp, len, U8);
2f9475ad 1539#endif
29e98929 1540 }
983ffd37 1541 }
0134edef
JH
1542 }
1543
1544 if (!len && *swashp) {
d4c19fe8
AL
1545 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1546
0134edef
JH
1547 if (uv2) {
1548 /* It was "normal" (a single character mapping). */
d4c19fe8 1549 const UV uv3 = UNI_TO_NATIVE(uv2);
e9101d72 1550 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1551 }
1552 }
1feea2c7 1553
0134edef
JH
1554 if (!len) /* Neither: just copy. */
1555 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1556
2a37f04d
JH
1557 if (lenp)
1558 *lenp = len;
1559
0134edef 1560 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1561}
1562
d3e79532 1563/*
87cea99e 1564=for apidoc to_utf8_upper
d3e79532
JH
1565
1566Convert the UTF-8 encoded character at p to its uppercase version and
1567store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1568that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1569the uppercase version may be longer than the original character.
d3e79532
JH
1570
1571The first character of the uppercased version is returned
1572(but note, as explained above, that there may be more.)
1573
1574=cut */
1575
2104c8d9 1576UV
7fc63493 1577Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1578{
97aff369 1579 dVAR;
7918f24d
NC
1580
1581 PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1582
983ffd37 1583 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1584 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1585}
a0ed51b3 1586
d3e79532 1587/*
87cea99e 1588=for apidoc to_utf8_title
d3e79532
JH
1589
1590Convert the UTF-8 encoded character at p to its titlecase version and
1591store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1592that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1593titlecase version may be longer than the original character.
d3e79532
JH
1594
1595The first character of the titlecased version is returned
1596(but note, as explained above, that there may be more.)
1597
1598=cut */
1599
983ffd37 1600UV
7fc63493 1601Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1602{
97aff369 1603 dVAR;
7918f24d
NC
1604
1605 PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1606
983ffd37 1607 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1608 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1609}
1610
d3e79532 1611/*
87cea99e 1612=for apidoc to_utf8_lower
d3e79532
JH
1613
1614Convert the UTF-8 encoded character at p to its lowercase version and
1615store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1616that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1617lowercase version may be longer than the original character.
d3e79532
JH
1618
1619The first character of the lowercased version is returned
1620(but note, as explained above, that there may be more.)
1621
1622=cut */
1623
2104c8d9 1624UV
7fc63493 1625Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1626{
97aff369 1627 dVAR;
7918f24d
NC
1628
1629 PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1630
983ffd37 1631 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1632 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1633}
1634
d3e79532 1635/*
87cea99e 1636=for apidoc to_utf8_fold
d3e79532
JH
1637
1638Convert the UTF-8 encoded character at p to its foldcase version and
1639store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1640that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1641foldcase version may be longer than the original character (up to
1642three characters).
1643
1644The first character of the foldcased version is returned
1645(but note, as explained above, that there may be more.)
1646
1647=cut */
1648
b4e400f9 1649UV
7fc63493 1650Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1651{
97aff369 1652 dVAR;
7918f24d
NC
1653
1654 PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1655
b4e400f9
JH
1656 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1657 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1658}
1659
711a919c
ST
1660/* Note:
1661 * A "swash" is a swatch hash.
1662 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1663 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1664 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1665 */
a0ed51b3 1666SV*
7fc63493 1667Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1668{
27da23d5 1669 dVAR;
a0ed51b3 1670 SV* retval;
8e84507e 1671 dSP;
7fc63493
AL
1672 const size_t pkg_len = strlen(pkg);
1673 const size_t name_len = strlen(name);
da51bb9b 1674 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
f8be5cf0 1675 SV* errsv_save;
ce3b816e 1676
7918f24d
NC
1677 PERL_ARGS_ASSERT_SWASH_INIT;
1678
96ca9f55
DM
1679 PUSHSTACKi(PERLSI_MAGIC);
1680 ENTER;
1681 SAVEI32(PL_hints);
1682 PL_hints = 0;
1683 save_re_context();
1b026014 1684 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1685 ENTER;
f8be5cf0 1686 errsv_save = newSVsv(ERRSV);
dc0c6abb
NC
1687 /* It is assumed that callers of this routine are not passing in any
1688 user derived data. */
1689 /* Need to do this after save_re_context() as it will set PL_tainted to
1690 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1691 Even line to create errsv_save can turn on PL_tainted. */
1692 SAVEBOOL(PL_tainted);
1693 PL_tainted = 0;
71bed85a 1694 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
a0714e2c 1695 NULL);
f8be5cf0
JH
1696 if (!SvTRUE(ERRSV))
1697 sv_setsv(ERRSV, errsv_save);
1698 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1699 LEAVE;
1700 }
1701 SPAGAIN;
a0ed51b3
LW
1702 PUSHMARK(SP);
1703 EXTEND(SP,5);
6e449a3a
MHM
1704 mPUSHp(pkg, pkg_len);
1705 mPUSHp(name, name_len);
a0ed51b3 1706 PUSHs(listsv);
6e449a3a
MHM
1707 mPUSHi(minbits);
1708 mPUSHi(none);
a0ed51b3 1709 PUTBACK;
f8be5cf0 1710 errsv_save = newSVsv(ERRSV);
864dbfa3 1711 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1712 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1713 else
e24b16f9 1714 retval = &PL_sv_undef;
f8be5cf0
JH
1715 if (!SvTRUE(ERRSV))
1716 sv_setsv(ERRSV, errsv_save);
1717 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1718 LEAVE;
1719 POPSTACK;
923e4eb5 1720 if (IN_PERL_COMPILETIME) {
623e6609 1721 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1722 }
bc45ce41
JH
1723 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1724 if (SvPOK(retval))
35c1215d 1725 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
be2597df 1726 SVfARG(retval));
cea2e8a9 1727 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1728 }
a0ed51b3
LW
1729 return retval;
1730}
1731
035d37be
JH
1732
1733/* This API is wrong for special case conversions since we may need to
1734 * return several Unicode characters for a single Unicode character
1735 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1736 * the lower-level routine, and it is similarly broken for returning
1737 * multiple values. --jhi */
979f2922 1738/* Now SWASHGET is recasted into S_swash_get in this file. */
680c470c
ST
1739
1740/* Note:
1741 * Returns the value of property/mapping C<swash> for the first character
1742 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1743 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1744 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1745 */
a0ed51b3 1746UV
680c470c 1747Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 1748{
27da23d5 1749 dVAR;
ef8f7699 1750 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
1751 U32 klen;
1752 U32 off;
a0ed51b3 1753 STRLEN slen;
7d85a32c 1754 STRLEN needents;
cfd0369c 1755 const U8 *tmps = NULL;
a0ed51b3 1756 U32 bit;
979f2922 1757 SV *swatch;
3568d838 1758 U8 tmputf8[2];
35da51f7 1759 const UV c = NATIVE_TO_ASCII(*ptr);
3568d838 1760
7918f24d
NC
1761 PERL_ARGS_ASSERT_SWASH_FETCH;
1762
3568d838 1763 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
979f2922
ST
1764 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1765 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1766 ptr = tmputf8;
3568d838
JH
1767 }
1768 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1769 * then the "swatch" is a vec() for al the chars which start
1770 * with 0xAA..0xYY
1771 * So the key in the hash (klen) is length of encoded char -1
1772 */
1773 klen = UTF8SKIP(ptr) - 1;
1774 off = ptr[klen];
a0ed51b3 1775
979f2922 1776 if (klen == 0) {
7d85a32c 1777 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1778 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1779 */
979f2922
ST
1780 needents = UTF_CONTINUATION_MARK;
1781 off = NATIVE_TO_UTF(ptr[klen]);
1782 }
1783 else {
7d85a32c 1784 /* If char is encoded then swatch is for the prefix */
979f2922
ST
1785 needents = (1 << UTF_ACCUMULATION_SHIFT);
1786 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1787 }
7d85a32c 1788
a0ed51b3
LW
1789 /*
1790 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1791 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1792 * it's nothing to sniff at.) Pity we usually come through at least
1793 * two function calls to get here...
1794 *
1795 * NB: this code assumes that swatches are never modified, once generated!
1796 */
1797
3568d838 1798 if (hv == PL_last_swash_hv &&
a0ed51b3 1799 klen == PL_last_swash_klen &&
27da23d5 1800 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1801 {
1802 tmps = PL_last_swash_tmps;
1803 slen = PL_last_swash_slen;
1804 }
1805 else {
1806 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1807 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1808
979f2922
ST
1809 /* If not cached, generate it via swash_get */
1810 if (!svp || !SvPOK(*svp)
1811 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0
NIS
1812 /* We use utf8n_to_uvuni() as we want an index into
1813 Unicode tables, not a native character number.
1814 */
aec46f14 1815 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1816 ckWARN(WARN_UTF8) ?
1817 0 : UTF8_ALLOW_ANY);
680c470c 1818 swatch = swash_get(swash,
979f2922
ST
1819 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1820 (klen) ? (code_point & ~(needents - 1)) : 0,
1821 needents);
1822
923e4eb5 1823 if (IN_PERL_COMPILETIME)
623e6609 1824 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1825
979f2922 1826 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 1827
979f2922
ST
1828 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1829 || (slen << 3) < needents)
660a4616 1830 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
a0ed51b3
LW
1831 }
1832
1833 PL_last_swash_hv = hv;
16d8f38a 1834 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 1835 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
1836 /* FIXME change interpvar.h? */
1837 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1838 PL_last_swash_slen = slen;
1839 if (klen)
1840 Copy(ptr, PL_last_swash_key, klen, U8);
1841 }
1842
9faf8d75 1843 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1844 case 1:
1845 bit = 1 << (off & 7);
1846 off >>= 3;
1847 return (tmps[off] & bit) != 0;
1848 case 8:
1849 return tmps[off];
1850 case 16:
1851 off <<= 1;
1852 return (tmps[off] << 8) + tmps[off + 1] ;
1853 case 32:
1854 off <<= 2;
1855 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1856 }
660a4616 1857 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
670f1322 1858 NORETURN_FUNCTION_END;
a0ed51b3 1859}
2b9d42f0 1860
979f2922
ST
1861/* Note:
1862 * Returns a swatch (a bit vector string) for a code point sequence
1863 * that starts from the value C<start> and comprises the number C<span>.
1864 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1865 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1866 */
1867STATIC SV*
1868S_swash_get(pTHX_ SV* swash, UV start, UV span)
1869{
1870 SV *swatch;
711a919c 1871 U8 *l, *lend, *x, *xend, *s;
979f2922 1872 STRLEN lcur, xcur, scur;
ef8f7699 1873 HV *const hv = MUTABLE_HV(SvRV(swash));
017a3ce5
GA
1874 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1875 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1876 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1877 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1878 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
0bd48802
AL
1879 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1880 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1881 const STRLEN bits = SvUV(*bitssvp);
1882 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1883 const UV none = SvUV(*nonesvp);
1884 const UV end = start + span;
979f2922 1885
7918f24d
NC
1886 PERL_ARGS_ASSERT_SWASH_GET;
1887
979f2922 1888 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
660a4616
ST
1889 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1890 (UV)bits);
979f2922
ST
1891 }
1892
1893 /* create and initialize $swatch */
979f2922 1894 scur = octets ? (span * octets) : (span + 7) / 8;
e524fe40
NC
1895 swatch = newSV(scur);
1896 SvPOK_on(swatch);
979f2922
ST
1897 s = (U8*)SvPVX(swatch);
1898 if (octets && none) {
0bd48802 1899 const U8* const e = s + scur;
979f2922
ST
1900 while (s < e) {
1901 if (bits == 8)
1902 *s++ = (U8)(none & 0xff);
1903 else if (bits == 16) {
1904 *s++ = (U8)((none >> 8) & 0xff);
1905 *s++ = (U8)( none & 0xff);
1906 }
1907 else if (bits == 32) {
1908 *s++ = (U8)((none >> 24) & 0xff);
1909 *s++ = (U8)((none >> 16) & 0xff);
1910 *s++ = (U8)((none >> 8) & 0xff);
1911 *s++ = (U8)( none & 0xff);
1912 }
1913 }
1914 *s = '\0';
1915 }
1916 else {
1917 (void)memzero((U8*)s, scur + 1);
1918 }
1919 SvCUR_set(swatch, scur);
1920 s = (U8*)SvPVX(swatch);
1921
1922 /* read $swash->{LIST} */
1923 l = (U8*)SvPV(*listsvp, lcur);
1924 lend = l + lcur;
1925 while (l < lend) {
35da51f7 1926 UV min, max, val;
979f2922
ST
1927 STRLEN numlen;
1928 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1929
0bd48802 1930 U8* const nl = (U8*)memchr(l, '\n', lend - l);
979f2922
ST
1931
1932 numlen = lend - l;
1933 min = grok_hex((char *)l, &numlen, &flags, NULL);
1934 if (numlen)
1935 l += numlen;
1936 else if (nl) {
1937 l = nl + 1; /* 1 is length of "\n" */
1938 continue;
1939 }
1940 else {
1941 l = lend; /* to LIST's end at which \n is not found */
1942 break;
1943 }
1944
1945 if (isBLANK(*l)) {
1946 ++l;
1947 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1948 numlen = lend - l;
1949 max = grok_hex((char *)l, &numlen, &flags, NULL);
1950 if (numlen)
1951 l += numlen;
1952 else
1953 max = min;
1954
1955 if (octets) {
1956 if (isBLANK(*l)) {
1957 ++l;
1958 flags = PERL_SCAN_SILENT_ILLDIGIT |
1959 PERL_SCAN_DISALLOW_PREFIX;
1960 numlen = lend - l;
1961 val = grok_hex((char *)l, &numlen, &flags, NULL);
1962 if (numlen)
1963 l += numlen;
1964 else
1965 val = 0;
1966 }
1967 else {
1968 val = 0;
1969 if (typeto) {
1970 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1971 typestr, l);
1972 }
1973 }
1974 }
711a919c
ST
1975 else
1976 val = 0; /* bits == 1, then val should be ignored */
979f2922
ST
1977 }
1978 else {
1979 max = min;
1980 if (octets) {
1981 val = 0;
1982 if (typeto) {
1983 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1984 }
1985 }
711a919c
ST
1986 else
1987 val = 0; /* bits == 1, then val should be ignored */
979f2922
ST
1988 }
1989
1990 if (nl)
1991 l = nl + 1;
1992 else
1993 l = lend;
1994
1995 if (max < start)
1996 continue;
1997
1998 if (octets) {
35da51f7 1999 UV key;
979f2922
ST
2000 if (min < start) {
2001 if (!none || val < none) {
2002 val += start - min;
2003 }
2004 min = start;
2005 }
2006 for (key = min; key <= max; key++) {
2007 STRLEN offset;
2008 if (key >= end)
2009 goto go_out_list;
2010 /* offset must be non-negative (start <= min <= key < end) */
2011 offset = octets * (key - start);
2012 if (bits == 8)
2013 s[offset] = (U8)(val & 0xff);
2014 else if (bits == 16) {
2015 s[offset ] = (U8)((val >> 8) & 0xff);
2016 s[offset + 1] = (U8)( val & 0xff);
2017 }
2018 else if (bits == 32) {
2019 s[offset ] = (U8)((val >> 24) & 0xff);
2020 s[offset + 1] = (U8)((val >> 16) & 0xff);
2021 s[offset + 2] = (U8)((val >> 8) & 0xff);
2022 s[offset + 3] = (U8)( val & 0xff);
2023 }
2024
2025 if (!none || val < none)
2026 ++val;
2027 }
2028 }
711a919c 2029 else { /* bits == 1, then val should be ignored */
35da51f7 2030 UV key;
979f2922
ST
2031 if (min < start)
2032 min = start;
2033 for (key = min; key <= max; key++) {
0bd48802 2034 const STRLEN offset = (STRLEN)(key - start);
979f2922
ST
2035 if (key >= end)
2036 goto go_out_list;
2037 s[offset >> 3] |= 1 << (offset & 7);
2038 }
2039 }
2040 } /* while */
2041 go_out_list:
2042
2043 /* read $swash->{EXTRAS} */
2044 x = (U8*)SvPV(*extssvp, xcur);
2045 xend = x + xcur;
2046 while (x < xend) {
2047 STRLEN namelen;
2048 U8 *namestr;
2049 SV** othersvp;
2050 HV* otherhv;
2051 STRLEN otherbits;
2052 SV **otherbitssvp, *other;
711a919c 2053 U8 *s, *o, *nl;
979f2922
ST
2054 STRLEN slen, olen;
2055
35da51f7 2056 const U8 opc = *x++;
979f2922
ST
2057 if (opc == '\n')
2058 continue;
2059
2060 nl = (U8*)memchr(x, '\n', xend - x);
2061
2062 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2063 if (nl) {
2064 x = nl + 1; /* 1 is length of "\n" */
2065 continue;
2066 }
2067 else {
2068 x = xend; /* to EXTRAS' end at which \n is not found */
2069 break;
2070 }
2071 }
2072
2073 namestr = x;
2074 if (nl) {
2075 namelen = nl - namestr;
2076 x = nl + 1;
2077 }
2078 else {
2079 namelen = xend - namestr;
2080 x = xend;
2081 }
2082
2083 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
ef8f7699 2084 otherhv = MUTABLE_HV(SvRV(*othersvp));
017a3ce5 2085 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922
ST
2086 otherbits = (STRLEN)SvUV(*otherbitssvp);
2087 if (bits < otherbits)
660a4616 2088 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
979f2922
ST
2089
2090 /* The "other" swatch must be destroyed after. */
2091 other = swash_get(*othersvp, start, span);
2092 o = (U8*)SvPV(other, olen);
2093
2094 if (!olen)
660a4616 2095 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
979f2922
ST
2096
2097 s = (U8*)SvPV(swatch, slen);
2098 if (bits == 1 && otherbits == 1) {
2099 if (slen != olen)
660a4616 2100 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
979f2922
ST
2101
2102 switch (opc) {
2103 case '+':
2104 while (slen--)
2105 *s++ |= *o++;
2106 break;
2107 case '!':
2108 while (slen--)
2109 *s++ |= ~*o++;
2110 break;
2111 case '-':
2112 while (slen--)
2113 *s++ &= ~*o++;
2114 break;
2115 case '&':
2116 while (slen--)
2117 *s++ &= *o++;
2118 break;
2119 default:
2120 break;
2121 }
2122 }
711a919c 2123 else {
979f2922
ST
2124 STRLEN otheroctets = otherbits >> 3;
2125 STRLEN offset = 0;
35da51f7 2126 U8* const send = s + slen;
979f2922
ST
2127
2128 while (s < send) {
2129 UV otherval = 0;
2130
2131 if (otherbits == 1) {
2132 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2133 ++offset;
2134 }
2135 else {
2136 STRLEN vlen = otheroctets;
2137 otherval = *o++;
2138 while (--vlen) {
2139 otherval <<= 8;
2140 otherval |= *o++;
2141 }
2142 }
2143
711a919c 2144 if (opc == '+' && otherval)
6f207bd3 2145 NOOP; /* replace with otherval */
979f2922
ST
2146 else if (opc == '!' && !otherval)
2147 otherval = 1;
2148 else if (opc == '-' && otherval)
2149 otherval = 0;
2150 else if (opc == '&' && !otherval)
2151 otherval = 0;
2152 else {
711a919c 2153 s += octets; /* no replacement */
979f2922
ST
2154 continue;
2155 }
2156
2157 if (bits == 8)
2158 *s++ = (U8)( otherval & 0xff);
2159 else if (bits == 16) {
2160 *s++ = (U8)((otherval >> 8) & 0xff);
2161 *s++ = (U8)( otherval & 0xff);
2162 }
2163 else if (bits == 32) {
2164 *s++ = (U8)((otherval >> 24) & 0xff);
2165 *s++ = (U8)((otherval >> 16) & 0xff);
2166 *s++ = (U8)((otherval >> 8) & 0xff);
2167 *s++ = (U8)( otherval & 0xff);
2168 }
2169 }
2170 }
2171 sv_free(other); /* through with it! */
2172 } /* while */
2173 return swatch;
2174}
2175
0f830e0b 2176/*
87cea99e 2177=for apidoc uvchr_to_utf8
0f830e0b
NC
2178
2179Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2180of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2181bytes available. The return value is the pointer to the byte after the
2182end of the new character. In other words,
2183
2184 d = uvchr_to_utf8(d, uv);
2185
2186is the recommended wide native character-aware way of saying
2187
2188 *(d++) = uv;
2189
2190=cut
2191*/
2192
2193/* On ASCII machines this is normally a macro but we want a
2194 real function in case XS code wants it
2195*/
0f830e0b
NC
2196U8 *
2197Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2198{
7918f24d
NC
2199 PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2200
0f830e0b
NC
2201 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2202}
2203
b851fbc1
JH
2204U8 *
2205Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2206{
7918f24d
NC
2207 PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2208
b851fbc1
JH
2209 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2210}
2b9d42f0
NIS
2211
2212/*
87cea99e 2213=for apidoc utf8n_to_uvchr
0f830e0b
NC
2214flags
2215
2216Returns the native character value of the first character in the string
2217C<s>
2218which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2219length, in bytes, of that character.
2220
2221Allows length and flags to be passed to low level routine.
2222
2223=cut
2224*/
2225/* On ASCII machines this is normally a macro but we want
2226 a real function in case XS code wants it
2227*/
0f830e0b
NC
2228UV
2229Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2230U32 flags)
2231{
2232 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
7918f24d
NC
2233
2234 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2235
0f830e0b
NC
2236 return UNI_TO_NATIVE(uv);
2237}
2238
2239/*
87cea99e 2240=for apidoc pv_uni_display
d2cc3551
JH
2241
2242Build to the scalar dsv a displayable version of the string spv,
2243length len, the displayable version being at most pvlim bytes long
2244(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2245
9e55ce06 2246The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 2247isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
2248to display the \\[nrfta\\] as the backslashed versions (like '\n')
2249(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2250UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2251UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2252
d2cc3551
JH
2253The pointer to the PV of the dsv is returned.
2254
2255=cut */
e6b2e755 2256char *
e1ec3a88 2257Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
2258{
2259 int truncated = 0;
e1ec3a88 2260 const char *s, *e;
e6b2e755 2261
7918f24d
NC
2262 PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2263
76f68e9b 2264 sv_setpvs(dsv, "");
7fddd944 2265 SvUTF8_off(dsv);
e1ec3a88 2266 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 2267 UV u;
a49f32c6
NC
2268 /* This serves double duty as a flag and a character to print after
2269 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2270 */
2271 char ok = 0;
c728cb41 2272
e6b2e755
JH
2273 if (pvlim && SvCUR(dsv) >= pvlim) {
2274 truncated++;
2275 break;
2276 }
2277 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 2278 if (u < 256) {
a3b680e6 2279 const unsigned char c = (unsigned char)u & 0xFF;
0bd48802 2280 if (flags & UNI_DISPLAY_BACKSLASH) {
a49f32c6 2281 switch (c) {
c728cb41 2282 case '\n':
a49f32c6 2283 ok = 'n'; break;
c728cb41 2284 case '\r':
a49f32c6 2285 ok = 'r'; break;
c728cb41 2286 case '\t':
a49f32c6 2287 ok = 't'; break;
c728cb41 2288 case '\f':
a49f32c6 2289 ok = 'f'; break;
c728cb41 2290 case '\a':
a49f32c6 2291 ok = 'a'; break;
c728cb41 2292 case '\\':
a49f32c6 2293 ok = '\\'; break;
c728cb41
JH
2294 default: break;
2295 }
a49f32c6 2296 if (ok) {
88c9ea1e 2297 const char string = ok;
76f68e9b 2298 sv_catpvs(dsv, "\\");
5e7aa789 2299 sv_catpvn(dsv, &string, 1);
a49f32c6 2300 }
c728cb41 2301 }
00e86452 2302 /* isPRINT() is the locale-blind version. */
a49f32c6 2303 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
88c9ea1e 2304 const char string = c;
5e7aa789 2305 sv_catpvn(dsv, &string, 1);
a49f32c6 2306 ok = 1;
0a2ef054 2307 }
c728cb41
JH
2308 }
2309 if (!ok)
9e55ce06 2310 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
2311 }
2312 if (truncated)
396482e1 2313 sv_catpvs(dsv, "...");
e6b2e755
JH
2314
2315 return SvPVX(dsv);
2316}
2b9d42f0 2317
d2cc3551 2318/*
87cea99e 2319=for apidoc sv_uni_display
d2cc3551
JH
2320
2321Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 2322the displayable version being at most pvlim bytes long
d2cc3551 2323(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
2324
2325The flags argument is as in pv_uni_display().
2326
d2cc3551
JH
2327The pointer to the PV of the dsv is returned.
2328
d4c19fe8
AL
2329=cut
2330*/
e6b2e755
JH
2331char *
2332Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2333{
7918f24d
NC
2334 PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2335
cfd0369c
NC
2336 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2337 SvCUR(ssv), pvlim, flags);
701a277b
JH
2338}
2339
d2cc3551 2340/*
87cea99e 2341=for apidoc ibcmp_utf8
d2cc3551
JH
2342
2343Return true if the strings s1 and s2 differ case-insensitively, false
2344if not (if they are equal case-insensitively). If u1 is true, the
2345string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
2346the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2347are false, the respective string is assumed to be in native 8-bit
2348encoding.
2349
2350If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2351in there (they will point at the beginning of the I<next> character).
2352If the pointers behind pe1 or pe2 are non-NULL, they are the end
2353pointers beyond which scanning will not continue under any
4cdaeff7 2354circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
2355s2+l2 will be used as goal end pointers that will also stop the scan,
2356and which qualify towards defining a successful match: all the scans
2357that define an explicit length must reach their goal pointers for
2358a match to succeed).
d2cc3551
JH
2359
2360For case-insensitiveness, the "casefolding" of Unicode is used
2361instead of upper/lowercasing both the characters, see
2362http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2363
2364=cut */
701a277b 2365I32
d07ddd77 2366Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2367{
97aff369 2368 dVAR;
e1ec3a88
AL
2369 register const U8 *p1 = (const U8*)s1;
2370 register const U8 *p2 = (const U8*)s2;
cbbf8932 2371 register const U8 *f1 = NULL;
2f73348c 2372 register const U8 *f2 = NULL;
cbbf8932
AL
2373 register U8 *e1 = NULL;
2374 register U8 *q1 = NULL;
2375 register U8 *e2 = NULL;
2376 register U8 *q2 = NULL;
d07ddd77 2377 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
2378 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2379 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2380 U8 natbuf[1+1];
2381 STRLEN foldlen1, foldlen2;
d07ddd77 2382 bool match;
7918f24d
NC
2383
2384 PERL_ARGS_ASSERT_IBCMP_UTF8;
332ddc25 2385
d07ddd77
JH
2386 if (pe1)
2387 e1 = *(U8**)pe1;
a0a388a1 2388 /* assert(e1 || l1); */
e1ec3a88
AL
2389 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2390 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2391 if (pe2)
2392 e2 = *(U8**)pe2;
a0a388a1 2393 /* assert(e2 || l2); */
e1ec3a88
AL
2394 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2395 f2 = (const U8*)s2 + l2;
d07ddd77 2396
a0a388a1
YO
2397 /* This shouldn't happen. However, putting an assert() there makes some
2398 * tests fail. */
2399 /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
d07ddd77
JH
2400 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2401 return 1; /* mismatch; possible infinite loop or false positive */
2402
a6872d42
JH
2403 if (!u1 || !u2)
2404 natbuf[1] = 0; /* Need to terminate the buffer. */
2405
d07ddd77
JH
2406 while ((e1 == 0 || p1 < e1) &&
2407 (f1 == 0 || p1 < f1) &&
2408 (e2 == 0 || p2 < e2) &&
2409 (f2 == 0 || p2 < f2)) {
2410 if (n1 == 0) {
d7f013c8
JH
2411 if (u1)
2412 to_utf8_fold(p1, foldbuf1, &foldlen1);
2413 else {
809e8e66 2414 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2415 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2416 }
2417 q1 = foldbuf1;
d07ddd77 2418 n1 = foldlen1;
332ddc25 2419 }
d07ddd77 2420 if (n2 == 0) {
d7f013c8
JH
2421 if (u2)
2422 to_utf8_fold(p2, foldbuf2, &foldlen2);
2423 else {
809e8e66 2424 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2425 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2426 }
2427 q2 = foldbuf2;
d07ddd77 2428 n2 = foldlen2;
332ddc25 2429 }
d07ddd77
JH
2430 while (n1 && n2) {
2431 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2432 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2433 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2434 return 1; /* mismatch */
d07ddd77 2435 n1 -= UTF8SKIP(q1);
d7f013c8 2436 q1 += UTF8SKIP(q1);
d07ddd77 2437 n2 -= UTF8SKIP(q2);
d7f013c8 2438 q2 += UTF8SKIP(q2);
701a277b 2439 }
d07ddd77 2440 if (n1 == 0)
d7f013c8 2441 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2442 if (n2 == 0)
d7f013c8
JH
2443 p2 += u2 ? UTF8SKIP(p2) : 1;
2444
d2cc3551 2445 }
5469e704 2446
d07ddd77
JH
2447 /* A match is defined by all the scans that specified
2448 * an explicit length reaching their final goals. */
2449 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2450
2451 if (match) {
d07ddd77
JH
2452 if (pe1)
2453 *pe1 = (char*)p1;
2454 if (pe2)
2455 *pe2 = (char*)p2;
5469e704
JH
2456 }
2457
2458 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2459}
701a277b 2460
a49f32c6
NC
2461/*
2462 * Local variables:
2463 * c-indentation-style: bsd
2464 * c-basic-offset: 4
2465 * indent-tabs-mode: t
2466 * End:
2467 *
37442d52
RGS
2468 * ex: set ts=8 sts=4 sw=4 noet:
2469 */