This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
pv_uni_display () omitted backslash in output string
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
fdf8c088 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3
LW
26#include "perl.h"
27
a0c21aa1
JH
28#ifndef EBCDIC
29/* Separate prototypes needed because in ASCII systems these
30 * usually macros but they still are compiled as code, too. */
31PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
32PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
33#endif
34
27da23d5
JH
35static const char unees[] =
36 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 37
ccfc67b7
JH
38/*
39=head1 Unicode Support
a0ed51b3 40
166f8a29
DM
41This file contains various utility functions for manipulating UTF8-encoded
42strings. For the uninitiated, this is a method of representing arbitrary
61296642 43Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
44characters in the ASCII range are unmodified, and a zero byte never appears
45within non-zero characters.
166f8a29 46
b851fbc1 47=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 48
1e54db1a 49Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 50of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 51bytes available. The return value is the pointer to the byte after the
9041c2e3 52end of the new character. In other words,
eebe1485 53
b851fbc1
JH
54 d = uvuni_to_utf8_flags(d, uv, flags);
55
56or, in most cases,
57
9041c2e3 58 d = uvuni_to_utf8(d, uv);
eebe1485 59
b851fbc1
JH
60(which is equivalent to)
61
62 d = uvuni_to_utf8_flags(d, uv, 0);
63
eebe1485
SC
64is the recommended Unicode-aware way of saying
65
66 *(d++) = uv;
67
68=cut
69*/
70
dfe13c55 71U8 *
b851fbc1 72Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 73{
7918f24d
NC
74 PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
75
62961d2e 76 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
77 if (UNICODE_IS_SURROGATE(uv) &&
78 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 79 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
80 else if (
81 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
82 !(flags & UNICODE_ALLOW_FDD0))
83 ||
c867b360 84 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
85 !(flags & UNICODE_ALLOW_FFFF))) &&
86 /* UNICODE_ALLOW_SUPER includes
2a20b9da 87 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
88 ((uv <= PERL_UNICODE_MAX) ||
89 !(flags & UNICODE_ALLOW_SUPER))
90 )
9014280d 91 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
92 "Unicode character 0x%04"UVxf" is illegal", uv);
93 }
c4d5f83a 94 if (UNI_IS_INVARIANT(uv)) {
eb160463 95 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
96 return d;
97 }
2d331972 98#if defined(EBCDIC)
1d72bdf6
NIS
99 else {
100 STRLEN len = UNISKIP(uv);
101 U8 *p = d+len-1;
102 while (p > d) {
eb160463 103 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
104 uv >>= UTF_ACCUMULATION_SHIFT;
105 }
eb160463 106 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
107 return d+len;
108 }
109#else /* Non loop style */
a0ed51b3 110 if (uv < 0x800) {
eb160463
GS
111 *d++ = (U8)(( uv >> 6) | 0xc0);
112 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
113 return d;
114 }
115 if (uv < 0x10000) {
eb160463
GS
116 *d++ = (U8)(( uv >> 12) | 0xe0);
117 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
118 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
119 return d;
120 }
121 if (uv < 0x200000) {
eb160463
GS
122 *d++ = (U8)(( uv >> 18) | 0xf0);
123 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
124 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
125 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
126 return d;
127 }
128 if (uv < 0x4000000) {
eb160463
GS
129 *d++ = (U8)(( uv >> 24) | 0xf8);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
134 return d;
135 }
136 if (uv < 0x80000000) {
eb160463
GS
137 *d++ = (U8)(( uv >> 30) | 0xfc);
138 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
139 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
140 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
141 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
142 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
143 return d;
144 }
6b8eaf93 145#ifdef HAS_QUAD
d7578b48 146 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
147#endif
148 {
eb160463
GS
149 *d++ = 0xfe; /* Can't match U+FEFF! */
150 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
151 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
152 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
153 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
154 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
155 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
156 return d;
157 }
6b8eaf93 158#ifdef HAS_QUAD
a0ed51b3 159 {
eb160463
GS
160 *d++ = 0xff; /* Can't match U+FFFE! */
161 *d++ = 0x80; /* 6 Reserved bits */
162 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
163 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
164 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
165 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
166 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
167 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
168 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
169 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
170 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
171 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
172 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
173 return d;
174 }
175#endif
1d72bdf6 176#endif /* Loop style */
a0ed51b3 177}
9041c2e3 178
646ca15d
JH
179/*
180
181Tests if some arbitrary number of bytes begins in a valid UTF-8
182character. Note that an INVARIANT (i.e. ASCII) character is a valid
183UTF-8 character. The actual number of bytes in the UTF-8 character
184will be returned if it is valid, otherwise 0.
185
186This is the "slow" version as opposed to the "fast" version which is
187the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
188difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
189or less you should use the IS_UTF8_CHAR(), for lengths of five or more
190you should use the _slow(). In practice this means that the _slow()
191will be used very rarely, since the maximum Unicode code point (as of
192Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
193the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
194five bytes or more.
195
196=cut */
c053b435 197STATIC STRLEN
5f66b61c 198S_is_utf8_char_slow(const U8 *s, const STRLEN len)
646ca15d
JH
199{
200 U8 u = *s;
201 STRLEN slen;
202 UV uv, ouv;
203
7918f24d
NC
204 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
205
646ca15d
JH
206 if (UTF8_IS_INVARIANT(u))
207 return 1;
208
209 if (!UTF8_IS_START(u))
210 return 0;
211
212 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
213 return 0;
214
215 slen = len - 1;
216 s++;
77263263
ST
217#ifdef EBCDIC
218 u = NATIVE_TO_UTF(u);
219#endif
646ca15d
JH
220 u &= UTF_START_MASK(len);
221 uv = u;
222 ouv = uv;
223 while (slen--) {
224 if (!UTF8_IS_CONTINUATION(*s))
225 return 0;
226 uv = UTF8_ACCUMULATE(uv, *s);
227 if (uv < ouv)
228 return 0;
229 ouv = uv;
230 s++;
231 }
232
233 if ((STRLEN)UNISKIP(uv) < len)
234 return 0;
235
236 return len;
237}
9041c2e3
NIS
238
239/*
7fc63493 240=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 241
5da9da9e 242Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
243character. Note that an INVARIANT (i.e. ASCII) character is a valid
244UTF-8 character. The actual number of bytes in the UTF-8 character
245will be returned if it is valid, otherwise 0.
9041c2e3 246
82686b01 247=cut */
067a85ef 248STRLEN
7fc63493 249Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 250{
44f8325f 251 const STRLEN len = UTF8SKIP(s);
7918f24d
NC
252
253 PERL_ARGS_ASSERT_IS_UTF8_CHAR;
96a5add6 254 PERL_UNUSED_CONTEXT;
3b0fc154 255#ifdef IS_UTF8_CHAR
768c67ee 256 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
257 return IS_UTF8_CHAR(s, len) ? len : 0;
258#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 259 return is_utf8_char_slow(s, len);
386d01d6
GS
260}
261
6662521e 262/*
7fc63493 263=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 264
c9ada85f 265Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
266UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
267not mean 'a string that contains code points above 0x7F encoded in UTF-8'
268because a valid ASCII string is a valid UTF-8 string.
6662521e 269
768c67ee
JH
270See also is_utf8_string_loclen() and is_utf8_string_loc().
271
6662521e
GS
272=cut
273*/
274
8e84507e 275bool
7fc63493 276Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 277{
35da51f7 278 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 279 const U8* x = s;
067a85ef 280
7918f24d 281 PERL_ARGS_ASSERT_IS_UTF8_STRING;
96a5add6 282 PERL_UNUSED_CONTEXT;
1aa99e6b 283
6662521e 284 while (x < send) {
a3b680e6 285 STRLEN c;
1acdb0da
JH
286 /* Inline the easy bits of is_utf8_char() here for speed... */
287 if (UTF8_IS_INVARIANT(*x))
288 c = 1;
289 else if (!UTF8_IS_START(*x))
768c67ee 290 goto out;
1acdb0da
JH
291 else {
292 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
293#ifdef IS_UTF8_CHAR
294 c = UTF8SKIP(x);
768c67ee
JH
295 if (IS_UTF8_CHAR_FAST(c)) {
296 if (!IS_UTF8_CHAR(x, c))
3c614e38
AD
297 c = 0;
298 }
299 else
300 c = is_utf8_char_slow(x, c);
646ca15d
JH
301#else
302 c = is_utf8_char(x);
303#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 304 if (!c)
768c67ee 305 goto out;
1acdb0da 306 }
6662521e 307 x += c;
6662521e 308 }
768c67ee
JH
309
310 out:
60006e79
JH
311 if (x != send)
312 return FALSE;
067a85ef
A
313
314 return TRUE;
6662521e
GS
315}
316
67e989fb 317/*
814fafa7
NC
318Implemented as a macro in utf8.h
319
320=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
321
322Like is_utf8_string() but stores the location of the failure (in the
323case of "utf8ness failure") or the location s+len (in the case of
324"utf8ness success") in the C<ep>.
325
326See also is_utf8_string_loclen() and is_utf8_string().
327
768c67ee 328=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 329
e3e4599f 330Like is_utf8_string() but stores the location of the failure (in the
768c67ee
JH
331case of "utf8ness failure") or the location s+len (in the case of
332"utf8ness success") in the C<ep>, and the number of UTF-8
333encoded characters in the C<el>.
334
335See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
336
337=cut
338*/
339
340bool
768c67ee 341Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 342{
35da51f7 343 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 344 const U8* x = s;
81cd54e3 345 STRLEN c;
3ebfea28 346 STRLEN outlen = 0;
7918f24d
NC
347
348 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
96a5add6 349 PERL_UNUSED_CONTEXT;
81cd54e3 350
81cd54e3
JH
351 while (x < send) {
352 /* Inline the easy bits of is_utf8_char() here for speed... */
353 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
354 c = 1;
355 else if (!UTF8_IS_START(*x))
356 goto out;
81cd54e3 357 else {
768c67ee
JH
358 /* ... and call is_utf8_char() only if really needed. */
359#ifdef IS_UTF8_CHAR
360 c = UTF8SKIP(x);
361 if (IS_UTF8_CHAR_FAST(c)) {
362 if (!IS_UTF8_CHAR(x, c))
363 c = 0;
364 } else
365 c = is_utf8_char_slow(x, c);
366#else
367 c = is_utf8_char(x);
368#endif /* #ifdef IS_UTF8_CHAR */
369 if (!c)
370 goto out;
81cd54e3 371 }
768c67ee 372 x += c;
3ebfea28 373 outlen++;
81cd54e3 374 }
768c67ee
JH
375
376 out:
3ebfea28
AL
377 if (el)
378 *el = outlen;
379
768c67ee
JH
380 if (ep)
381 *ep = x;
3ebfea28 382 return (x == send);
81cd54e3
JH
383}
384
385/*
768c67ee 386
7fc63493 387=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 388
9041c2e3 389Bottom level UTF-8 decode routine.
38a44b82 390Returns the Unicode code point value of the first character in the string C<s>
1e54db1a 391which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 392C<retlen> will be set to the length, in bytes, of that character.
67e989fb 393
1e54db1a 394If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
395is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
396it is assumed that the caller will raise a warning, and this function
28d3d195
JH
397will silently just set C<retlen> to C<-1> and return zero. If the
398C<flags> does not contain UTF8_CHECK_ONLY, warnings about
399malformations will be given, C<retlen> will be set to the expected
400length of the UTF-8 character in bytes, and zero will be returned.
401
402The C<flags> can also contain various flags to allow deviations from
403the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 404
9041c2e3
NIS
405Most code should use utf8_to_uvchr() rather than call this directly.
406
37607a96
PK
407=cut
408*/
67e989fb 409
a0ed51b3 410UV
7fc63493 411Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 412{
97aff369 413 dVAR;
d4c19fe8 414 const U8 * const s0 = s;
9c5ffd7c 415 UV uv = *s, ouv = 0;
ba210ebe 416 STRLEN len = 1;
7fc63493
AL
417 const bool dowarn = ckWARN_d(WARN_UTF8);
418 const UV startbyte = *s;
ba210ebe 419 STRLEN expectlen = 0;
a0dbb045
JH
420 U32 warning = 0;
421
7918f24d
NC
422 PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
423
a0dbb045
JH
424/* This list is a superset of the UTF8_ALLOW_XXX. */
425
426#define UTF8_WARN_EMPTY 1
427#define UTF8_WARN_CONTINUATION 2
428#define UTF8_WARN_NON_CONTINUATION 3
429#define UTF8_WARN_FE_FF 4
430#define UTF8_WARN_SHORT 5
431#define UTF8_WARN_OVERFLOW 6
432#define UTF8_WARN_SURROGATE 7
c867b360
JH
433#define UTF8_WARN_LONG 8
434#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
435
436 if (curlen == 0 &&
437 !(flags & UTF8_ALLOW_EMPTY)) {
438 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
439 goto malformed;
440 }
441
1d72bdf6 442 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
443 if (retlen)
444 *retlen = 1;
c4d5f83a 445 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 446 }
67e989fb 447
421a8bf2 448 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 449 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 450 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
451 goto malformed;
452 }
453
421a8bf2 454 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 455 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 456 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
457 goto malformed;
458 }
9041c2e3 459
1d72bdf6 460#ifdef EBCDIC
75383841 461 uv = NATIVE_TO_UTF(uv);
1d72bdf6 462#else
fcc8fcf6
JH
463 if ((uv == 0xfe || uv == 0xff) &&
464 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 465 warning = UTF8_WARN_FE_FF;
ba210ebe 466 goto malformed;
a0ed51b3 467 }
1d72bdf6
NIS
468#endif
469
ba210ebe
JH
470 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
471 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
472 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
473 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
474#ifdef EBCDIC
475 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
476 else { len = 7; uv &= 0x01; }
477#else
ba210ebe
JH
478 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
479 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
480 else { len = 13; uv = 0; } /* whoa! */
481#endif
482
a0ed51b3
LW
483 if (retlen)
484 *retlen = len;
9041c2e3 485
ba210ebe
JH
486 expectlen = len;
487
fcc8fcf6
JH
488 if ((curlen < expectlen) &&
489 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 490 warning = UTF8_WARN_SHORT;
ba210ebe
JH
491 goto malformed;
492 }
493
494 len--;
a0ed51b3 495 s++;
ba210ebe
JH
496 ouv = uv;
497
a0ed51b3 498 while (len--) {
421a8bf2
JH
499 if (!UTF8_IS_CONTINUATION(*s) &&
500 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
501 s--;
502 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 503 goto malformed;
a0ed51b3
LW
504 }
505 else
8850bf83 506 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
507 if (!(uv > ouv)) {
508 /* These cannot be allowed. */
509 if (uv == ouv) {
75dbc644 510 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
511 warning = UTF8_WARN_LONG;
512 goto malformed;
513 }
514 }
515 else { /* uv < ouv */
516 /* This cannot be allowed. */
517 warning = UTF8_WARN_OVERFLOW;
518 goto malformed;
519 }
ba210ebe
JH
520 }
521 s++;
522 ouv = uv;
523 }
524
421a8bf2 525 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 526 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 527 warning = UTF8_WARN_SURROGATE;
ba210ebe 528 goto malformed;
eb160463 529 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 530 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 531 warning = UTF8_WARN_LONG;
ba210ebe 532 goto malformed;
421a8bf2 533 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 534 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 535 warning = UTF8_WARN_FFFF;
a9917092 536 goto malformed;
a0ed51b3 537 }
ba210ebe 538
a0ed51b3 539 return uv;
ba210ebe
JH
540
541malformed:
542
fcc8fcf6 543 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 544 if (retlen)
10edeb5d 545 *retlen = ((STRLEN) -1);
ba210ebe
JH
546 return 0;
547 }
548
a0dbb045 549 if (dowarn) {
84bafc02 550 SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
a0dbb045
JH
551
552 switch (warning) {
553 case 0: /* Intentionally empty. */ break;
554 case UTF8_WARN_EMPTY:
396482e1 555 sv_catpvs(sv, "(empty string)");
a0dbb045
JH
556 break;
557 case UTF8_WARN_CONTINUATION:
097fb8e2 558 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
559 break;
560 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
561 if (s == s0)
562 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
563 (UV)s[1], startbyte);
551405c4
AL
564 else {
565 const int len = (int)(s-s0);
097fb8e2 566 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4
AL
567 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
568 }
569
a0dbb045
JH
570 break;
571 case UTF8_WARN_FE_FF:
572 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
573 break;
574 case UTF8_WARN_SHORT:
097fb8e2 575 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 576 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 577 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
578 break;
579 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
580 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
581 ouv, *s, startbyte);
a0dbb045
JH
582 break;
583 case UTF8_WARN_SURROGATE:
584 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
585 break;
a0dbb045 586 case UTF8_WARN_LONG:
097fb8e2 587 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 588 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
589 break;
590 case UTF8_WARN_FFFF:
591 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
592 break;
593 default:
396482e1 594 sv_catpvs(sv, "(unknown reason)");
a0dbb045
JH
595 break;
596 }
597
598 if (warning) {
44f8325f 599 const char * const s = SvPVX_const(sv);
a0dbb045
JH
600
601 if (PL_op)
9014280d 602 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 603 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 604 else
9014280d 605 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
606 }
607 }
608
ba210ebe 609 if (retlen)
28d3d195 610 *retlen = expectlen ? expectlen : len;
ba210ebe 611
28d3d195 612 return 0;
a0ed51b3
LW
613}
614
8e84507e 615/*
7fc63493 616=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3
NIS
617
618Returns the native character value of the first character in the string C<s>
1e54db1a 619which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
620length, in bytes, of that character.
621
1e54db1a 622If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
623returned and retlen is set, if possible, to -1.
624
625=cut
626*/
627
628UV
7fc63493 629Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 630{
7918f24d
NC
631 PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
632
1754c1a1
NC
633 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
634 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
635}
636
637/*
7fc63493 638=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3
NIS
639
640Returns the Unicode code point of the first character in the string C<s>
1e54db1a 641which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
642length, in bytes, of that character.
643
644This function should only be used when returned UV is considered
645an index into the Unicode semantic tables (e.g. swashes).
646
1e54db1a 647If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 648returned and retlen is set, if possible, to -1.
8e84507e
NIS
649
650=cut
651*/
652
653UV
7fc63493 654Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 655{
7918f24d
NC
656 PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
657
9041c2e3 658 /* Call the low level routine asking for checks */
89ebb4a3 659 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 660 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
661}
662
b76347f2 663/*
35a4481c 664=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2
JH
665
666Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
667Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
668up past C<e>, croaks.
b76347f2
JH
669
670=cut
671*/
672
673STRLEN
35a4481c 674Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 675{
97aff369 676 dVAR;
b76347f2 677 STRLEN len = 0;
10edeb5d 678 U8 t = 0;
b76347f2 679
7918f24d
NC
680 PERL_ARGS_ASSERT_UTF8_LENGTH;
681
8850bf83
JH
682 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
683 * the bitops (especially ~) can create illegal UTF-8.
684 * In other words: in Perl UTF-8 is not just for Unicode. */
685
a3b680e6
AL
686 if (e < s)
687 goto warn_and_return;
b76347f2 688 while (s < e) {
10edeb5d 689 t = UTF8SKIP(s);
901b21bf 690 if (e - s < t) {
a3b680e6 691 warn_and_return:
901b21bf
JH
692 if (ckWARN_d(WARN_UTF8)) {
693 if (PL_op)
694 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 695 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
696 else
697 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
698 }
699 return len;
700 }
b76347f2
JH
701 s += t;
702 len++;
703 }
704
705 return len;
706}
707
b06226ff 708/*
35a4481c 709=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 710
1e54db1a 711Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
712and C<b>.
713
714WARNING: use only if you *know* that the pointers point inside the
715same UTF-8 buffer.
716
37607a96
PK
717=cut
718*/
a0ed51b3 719
02eb7b47 720IV
35a4481c 721Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 722{
7918f24d
NC
723 PERL_ARGS_ASSERT_UTF8_DISTANCE;
724
bf1665bc 725 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3
LW
726}
727
b06226ff 728/*
37607a96 729=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 730
8850bf83
JH
731Return the UTF-8 pointer C<s> displaced by C<off> characters, either
732forward or backward.
b06226ff
JH
733
734WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
735the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
736on the first byte of character or just after the last byte of a character.
b06226ff 737
37607a96
PK
738=cut
739*/
a0ed51b3
LW
740
741U8 *
4373e329 742Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 743{
7918f24d
NC
744 PERL_ARGS_ASSERT_UTF8_HOP;
745
96a5add6 746 PERL_UNUSED_CONTEXT;
8850bf83
JH
747 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
748 * the bitops (especially ~) can create illegal UTF-8.
749 * In other words: in Perl UTF-8 is not just for Unicode. */
750
a0ed51b3
LW
751 if (off >= 0) {
752 while (off--)
753 s += UTF8SKIP(s);
754 }
755 else {
756 while (off++) {
757 s--;
8850bf83
JH
758 while (UTF8_IS_CONTINUATION(*s))
759 s--;
a0ed51b3
LW
760 }
761 }
4373e329 762 return (U8 *)s;
a0ed51b3
LW
763}
764
6940069f 765/*
eebe1485 766=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 767
1e54db1a 768Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
769Unlike C<bytes_to_utf8>, this over-writes the original string, and
770updates len to contain the new length.
67e989fb 771Returns zero on failure, setting C<len> to -1.
6940069f 772
95be277c
NC
773If you need a copy of the string, see C<bytes_from_utf8>.
774
6940069f
GS
775=cut
776*/
777
778U8 *
37607a96 779Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 780{
d4c19fe8
AL
781 U8 * const save = s;
782 U8 * const send = s + *len;
6940069f 783 U8 *d;
246fae53 784
7918f24d
NC
785 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
786
1e54db1a 787 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 788 while (s < send) {
dcad2880
JH
789 U8 c = *s++;
790
1d72bdf6
NIS
791 if (!UTF8_IS_INVARIANT(c) &&
792 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
793 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
10edeb5d 794 *len = ((STRLEN) -1);
dcad2880
JH
795 return 0;
796 }
246fae53 797 }
dcad2880
JH
798
799 d = s = save;
6940069f 800 while (s < send) {
ed646e6e 801 STRLEN ulen;
9041c2e3 802 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 803 s += ulen;
6940069f
GS
804 }
805 *d = '\0';
246fae53 806 *len = d - save;
6940069f
GS
807 return save;
808}
809
810/*
e1ec3a88 811=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 812
1e54db1a 813Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 814Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
815the newly-created string, and updates C<len> to contain the new
816length. Returns the original string if no conversion occurs, C<len>
817is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8180 if C<s> is converted or contains all 7bit characters.
f9a63242 819
37607a96
PK
820=cut
821*/
f9a63242
JH
822
823U8 *
e1ec3a88 824Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 825{
f9a63242 826 U8 *d;
e1ec3a88
AL
827 const U8 *start = s;
828 const U8 *send;
f9a63242
JH
829 I32 count = 0;
830
7918f24d
NC
831 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
832
96a5add6 833 PERL_UNUSED_CONTEXT;
f9a63242 834 if (!*is_utf8)
73d840c0 835 return (U8 *)start;
f9a63242 836
1e54db1a 837 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 838 for (send = s + *len; s < send;) {
e1ec3a88 839 U8 c = *s++;
1d72bdf6 840 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
841 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
842 (c = *s++) && UTF8_IS_CONTINUATION(c))
843 count++;
844 else
73d840c0 845 return (U8 *)start;
db42d148 846 }
f9a63242
JH
847 }
848
35da51f7 849 *is_utf8 = FALSE;
f9a63242 850
212542aa 851 Newx(d, (*len) - count + 1, U8);
ef9edfd0 852 s = start; start = d;
f9a63242
JH
853 while (s < send) {
854 U8 c = *s++;
c4d5f83a
NIS
855 if (!UTF8_IS_INVARIANT(c)) {
856 /* Then it is two-byte encoded */
857 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
858 c = ASCII_TO_NATIVE(c);
859 }
860 *d++ = c;
f9a63242
JH
861 }
862 *d = '\0';
863 *len = d - start;
73d840c0 864 return (U8 *)start;
f9a63242
JH
865}
866
867/*
35a4481c 868=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 869
1e54db1a 870Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
871Returns a pointer to the newly-created string, and sets C<len> to
872reflect the new length.
6940069f 873
1e54db1a 874If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
875see sv_recode_to_utf8().
876
497711e7 877=cut
6940069f
GS
878*/
879
880U8*
35a4481c 881Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 882{
35a4481c 883 const U8 * const send = s + (*len);
6940069f
GS
884 U8 *d;
885 U8 *dst;
7918f24d
NC
886
887 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 888 PERL_UNUSED_CONTEXT;
6940069f 889
212542aa 890 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
891 dst = d;
892
893 while (s < send) {
35a4481c 894 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 895 if (UNI_IS_INVARIANT(uv))
eb160463 896 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 897 else {
eb160463
GS
898 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
899 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
900 }
901 }
902 *d = '\0';
6662521e 903 *len = d-dst;
6940069f
GS
904 return dst;
905}
906
a0ed51b3 907/*
dea0fc0b 908 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
909 *
910 * Destination must be pre-extended to 3/2 source. Do not use in-place.
911 * We optimize for native, for obvious reasons. */
912
913U8*
dea0fc0b 914Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 915{
dea0fc0b
JH
916 U8* pend;
917 U8* dstart = d;
918
7918f24d
NC
919 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
920
1de9afcd
RGS
921 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
922 d[0] = 0;
923 *newlen = 1;
924 return d;
925 }
926
dea0fc0b 927 if (bytelen & 1)
f5992bc4 928 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
929
930 pend = p + bytelen;
931
a0ed51b3 932 while (p < pend) {
dea0fc0b
JH
933 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
934 p += 2;
a0ed51b3 935 if (uv < 0x80) {
e294cc5d
JH
936#ifdef EBCDIC
937 *d++ = UNI_TO_NATIVE(uv);
938#else
eb160463 939 *d++ = (U8)uv;
e294cc5d 940#endif
a0ed51b3
LW
941 continue;
942 }
943 if (uv < 0x800) {
eb160463
GS
944 *d++ = (U8)(( uv >> 6) | 0xc0);
945 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
946 continue;
947 }
948 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
DT
949 UV low = (p[0] << 8) + p[1];
950 p += 2;
dea0fc0b
JH
951 if (low < 0xdc00 || low >= 0xdfff)
952 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
953 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
954 }
955 if (uv < 0x10000) {
eb160463
GS
956 *d++ = (U8)(( uv >> 12) | 0xe0);
957 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
958 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
959 continue;
960 }
961 else {
eb160463
GS
962 *d++ = (U8)(( uv >> 18) | 0xf0);
963 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
964 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
965 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
966 continue;
967 }
968 }
dea0fc0b 969 *newlen = d - dstart;
a0ed51b3
LW
970 return d;
971}
972
973/* Note: this one is slightly destructive of the source. */
974
975U8*
dea0fc0b 976Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
977{
978 U8* s = (U8*)p;
d4c19fe8 979 U8* const send = s + bytelen;
7918f24d
NC
980
981 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
982
a0ed51b3 983 while (s < send) {
d4c19fe8 984 const U8 tmp = s[0];
a0ed51b3
LW
985 s[0] = s[1];
986 s[1] = tmp;
987 s += 2;
988 }
dea0fc0b 989 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
990}
991
992/* for now these are all defined (inefficiently) in terms of the utf8 versions */
993
994bool
84afefe6 995Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 996{
89ebb4a3 997 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 998 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
999 return is_utf8_alnum(tmpbuf);
1000}
1001
1002bool
84afefe6 1003Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 1004{
89ebb4a3 1005 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1006 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1007 return is_utf8_alnumc(tmpbuf);
1008}
1009
1010bool
84afefe6 1011Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1012{
89ebb4a3 1013 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1014 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1015 return is_utf8_idfirst(tmpbuf);
1016}
1017
1018bool
84afefe6 1019Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1020{
89ebb4a3 1021 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1022 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1023 return is_utf8_alpha(tmpbuf);
1024}
1025
1026bool
84afefe6 1027Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1028{
89ebb4a3 1029 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1030 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1031 return is_utf8_ascii(tmpbuf);
1032}
1033
1034bool
84afefe6 1035Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1036{
89ebb4a3 1037 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1038 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1039 return is_utf8_space(tmpbuf);
1040}
1041
1042bool
84afefe6 1043Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1044{
89ebb4a3 1045 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1046 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1047 return is_utf8_digit(tmpbuf);
1048}
1049
1050bool
84afefe6 1051Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1052{
89ebb4a3 1053 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1054 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1055 return is_utf8_upper(tmpbuf);
1056}
1057
1058bool
84afefe6 1059Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1060{
89ebb4a3 1061 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1062 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1063 return is_utf8_lower(tmpbuf);
1064}
1065
1066bool
84afefe6 1067Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1068{
89ebb4a3 1069 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1070 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1071 return is_utf8_cntrl(tmpbuf);
1072}
1073
1074bool
84afefe6 1075Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1076{
89ebb4a3 1077 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1078 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1079 return is_utf8_graph(tmpbuf);
1080}
1081
1082bool
84afefe6 1083Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1084{
89ebb4a3 1085 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1086 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1087 return is_utf8_print(tmpbuf);
1088}
1089
b8c5462f 1090bool
84afefe6 1091Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1092{
89ebb4a3 1093 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1094 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1095 return is_utf8_punct(tmpbuf);
1096}
1097
4d61ec05 1098bool
84afefe6 1099Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1100{
89ebb4a3 1101 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1102 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1103 return is_utf8_xdigit(tmpbuf);
1104}
1105
84afefe6
JH
1106UV
1107Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1108{
7918f24d
NC
1109 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1110
0ebc6274
JH
1111 uvchr_to_utf8(p, c);
1112 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1113}
1114
84afefe6
JH
1115UV
1116Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1117{
7918f24d
NC
1118 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1119
0ebc6274
JH
1120 uvchr_to_utf8(p, c);
1121 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1122}
1123
84afefe6
JH
1124UV
1125Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1126{
7918f24d
NC
1127 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1128
0ebc6274
JH
1129 uvchr_to_utf8(p, c);
1130 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1131}
1132
84afefe6
JH
1133UV
1134Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1135{
7918f24d
NC
1136 PERL_ARGS_ASSERT_TO_UNI_FOLD;
1137
0ebc6274
JH
1138 uvchr_to_utf8(p, c);
1139 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1140}
1141
a0ed51b3
LW
1142/* for now these all assume no locale info available for Unicode > 255 */
1143
1144bool
84afefe6 1145Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1146{
1147 return is_uni_alnum(c); /* XXX no locale support yet */
1148}
1149
1150bool
84afefe6 1151Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1152{
1153 return is_uni_alnumc(c); /* XXX no locale support yet */
1154}
1155
1156bool
84afefe6 1157Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1158{
1159 return is_uni_idfirst(c); /* XXX no locale support yet */
1160}
1161
1162bool
84afefe6 1163Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1164{
1165 return is_uni_alpha(c); /* XXX no locale support yet */
1166}
1167
1168bool
84afefe6 1169Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1170{
1171 return is_uni_ascii(c); /* XXX no locale support yet */
1172}
1173
1174bool
84afefe6 1175Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1176{
1177 return is_uni_space(c); /* XXX no locale support yet */
1178}
1179
1180bool
84afefe6 1181Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1182{
1183 return is_uni_digit(c); /* XXX no locale support yet */
1184}
1185
1186bool
84afefe6 1187Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1188{
1189 return is_uni_upper(c); /* XXX no locale support yet */
1190}
1191
1192bool
84afefe6 1193Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1194{
1195 return is_uni_lower(c); /* XXX no locale support yet */
1196}
1197
1198bool
84afefe6 1199Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1200{
1201 return is_uni_cntrl(c); /* XXX no locale support yet */
1202}
1203
1204bool
84afefe6 1205Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1206{
1207 return is_uni_graph(c); /* XXX no locale support yet */
1208}
1209
1210bool
84afefe6 1211Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1212{
1213 return is_uni_print(c); /* XXX no locale support yet */
1214}
1215
b8c5462f 1216bool
84afefe6 1217Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1218{
1219 return is_uni_punct(c); /* XXX no locale support yet */
1220}
1221
4d61ec05 1222bool
84afefe6 1223Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1224{
1225 return is_uni_xdigit(c); /* XXX no locale support yet */
1226}
1227
b7ac61fa
JH
1228U32
1229Perl_to_uni_upper_lc(pTHX_ U32 c)
1230{
ee099d14
JH
1231 /* XXX returns only the first character -- do not use XXX */
1232 /* XXX no locale support yet */
1233 STRLEN len;
89ebb4a3 1234 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1235 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1236}
1237
1238U32
1239Perl_to_uni_title_lc(pTHX_ U32 c)
1240{
ee099d14
JH
1241 /* XXX returns only the first character XXX -- do not use XXX */
1242 /* XXX no locale support yet */
1243 STRLEN len;
89ebb4a3 1244 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1245 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1246}
1247
1248U32
1249Perl_to_uni_lower_lc(pTHX_ U32 c)
1250{
ee099d14
JH
1251 /* XXX returns only the first character -- do not use XXX */
1252 /* XXX no locale support yet */
1253 STRLEN len;
89ebb4a3 1254 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1255 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1256}
1257
7452cf6a 1258static bool
5141f98e 1259S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
1260 const char *const swashname)
1261{
97aff369 1262 dVAR;
7918f24d
NC
1263
1264 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1265
bde6a22d
NC
1266 if (!is_utf8_char(p))
1267 return FALSE;
1268 if (!*swash)
711a919c 1269 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
bde6a22d
NC
1270 return swash_fetch(*swash, p, TRUE) != 0;
1271}
1272
1273bool
7fc63493 1274Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1275{
97aff369 1276 dVAR;
7918f24d
NC
1277
1278 PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1279
671c33bf
NC
1280 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1281 * descendant of isalnum(3), in other words, it doesn't
1282 * contain the '_'. --jhi */
d4c19fe8 1283 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
a0ed51b3
LW
1284}
1285
1286bool
7fc63493 1287Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1288{
97aff369 1289 dVAR;
7918f24d
NC
1290
1291 PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
1292
d4c19fe8 1293 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
b8c5462f
JH
1294}
1295
1296bool
7fc63493 1297Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1298{
97aff369 1299 dVAR;
7918f24d
NC
1300
1301 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1302
82686b01
JH
1303 if (*p == '_')
1304 return TRUE;
bde6a22d 1305 /* is_utf8_idstart would be more logical. */
d4c19fe8 1306 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1307}
1308
1309bool
7fc63493 1310Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1311{
97aff369 1312 dVAR;
7918f24d
NC
1313
1314 PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1315
82686b01
JH
1316 if (*p == '_')
1317 return TRUE;
d4c19fe8 1318 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1319}
1320
1321bool
7fc63493 1322Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1323{
97aff369 1324 dVAR;
7918f24d
NC
1325
1326 PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1327
d4c19fe8 1328 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1329}
1330
1331bool
7fc63493 1332Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1333{
97aff369 1334 dVAR;
7918f24d
NC
1335
1336 PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1337
d4c19fe8 1338 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
b8c5462f
JH
1339}
1340
1341bool
7fc63493 1342Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1343{
97aff369 1344 dVAR;
7918f24d
NC
1345
1346 PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1347
d4c19fe8 1348 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3
LW
1349}
1350
1351bool
7fc63493 1352Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1353{
97aff369 1354 dVAR;
7918f24d
NC
1355
1356 PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1357
d4c19fe8 1358 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1359}
1360
1361bool
7fc63493 1362Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1363{
97aff369 1364 dVAR;
7918f24d
NC
1365
1366 PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1367
d4c19fe8 1368 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1369}
1370
1371bool
7fc63493 1372Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1373{
97aff369 1374 dVAR;
7918f24d
NC
1375
1376 PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1377
d4c19fe8 1378 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1379}
1380
1381bool
7fc63493 1382Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1383{
97aff369 1384 dVAR;
7918f24d
NC
1385
1386 PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1387
d4c19fe8 1388 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f
JH
1389}
1390
1391bool
7fc63493 1392Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1393{
97aff369 1394 dVAR;
7918f24d
NC
1395
1396 PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1397
d4c19fe8 1398 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1399}
1400
1401bool
7fc63493 1402Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1403{
97aff369 1404 dVAR;
7918f24d
NC
1405
1406 PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1407
d4c19fe8 1408 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1409}
1410
1411bool
7fc63493 1412Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1413{
97aff369 1414 dVAR;
7918f24d
NC
1415
1416 PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1417
d4c19fe8 1418 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1419}
1420
1421bool
7fc63493 1422Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1423{
97aff369 1424 dVAR;
7918f24d
NC
1425
1426 PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1427
d4c19fe8 1428 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f
JH
1429}
1430
1431bool
7fc63493 1432Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1433{
97aff369 1434 dVAR;
7918f24d
NC
1435
1436 PERL_ARGS_ASSERT_IS_UTF8_MARK;
1437
d4c19fe8 1438 return is_utf8_common(p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1439}
1440
6b5c0936
JH
1441/*
1442=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1443
1444The "p" contains the pointer to the UTF-8 string encoding
1445the character that is being converted.
1446
1447The "ustrp" is a pointer to the character buffer to put the
1448conversion result to. The "lenp" is a pointer to the length
1449of the result.
1450
0134edef 1451The "swashp" is a pointer to the swash to use.
6b5c0936 1452
0134edef 1453Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
8fe4d5b2 1454and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 1455but not always, a multicharacter mapping), is tried first.
6b5c0936 1456
0134edef
JH
1457The "special" is a string like "utf8::ToSpecLower", which means the
1458hash %utf8::ToSpecLower. The access to the hash is through
1459Perl_to_utf8_case().
6b5c0936 1460
0134edef
JH
1461The "normal" is a string like "ToLower" which means the swash
1462%utf8::ToLower.
1463
1464=cut */
6b5c0936 1465
2104c8d9 1466UV
9a957fbc
AL
1467Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1468 SV **swashp, const char *normal, const char *special)
a0ed51b3 1469{
97aff369 1470 dVAR;
89ebb4a3 1471 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1472 STRLEN len = 0;
aec46f14 1473 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1474 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1475 * are necessary in EBCDIC, they are redundant no-ops
1476 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1477 const UV uv1 = NATIVE_TO_UNI(uv0);
7918f24d
NC
1478
1479 PERL_ARGS_ASSERT_TO_UTF8_CASE;
1480
1feea2c7 1481 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1482
1483 if (!*swashp) /* load on-demand */
1484 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1485
b08cf34e
JH
1486 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1487 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1488 /* It might be "special" (sometimes, but not always,
2a37f04d 1489 * a multicharacter mapping) */
35da51f7 1490 HV * const hv = get_hv(special, FALSE);
b08cf34e
JH
1491 SV **svp;
1492
35da51f7 1493 if (hv &&
b08cf34e
JH
1494 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1495 (*svp)) {
cfd0369c 1496 const char *s;
47654450 1497
cfd0369c 1498 s = SvPV_const(*svp, len);
47654450
JH
1499 if (len == 1)
1500 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1501 else {
2f9475ad
JH
1502#ifdef EBCDIC
1503 /* If we have EBCDIC we need to remap the characters
1504 * since any characters in the low 256 are Unicode
1505 * code points, not EBCDIC. */
7cda7a3d 1506 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1507
1508 d = tmpbuf;
b08cf34e 1509 if (SvUTF8(*svp)) {
2f9475ad
JH
1510 STRLEN tlen = 0;
1511
1512 while (t < tend) {
d4c19fe8 1513 const UV c = utf8_to_uvchr(t, &tlen);
2f9475ad
JH
1514 if (tlen > 0) {
1515 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1516 t += tlen;
1517 }
1518 else
1519 break;
1520 }
1521 }
1522 else {
36fec512
JH
1523 while (t < tend) {
1524 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1525 t++;
1526 }
2f9475ad
JH
1527 }
1528 len = d - tmpbuf;
1529 Copy(tmpbuf, ustrp, len, U8);
1530#else
d2dcd0fb 1531 Copy(s, ustrp, len, U8);
2f9475ad 1532#endif
29e98929 1533 }
983ffd37 1534 }
0134edef
JH
1535 }
1536
1537 if (!len && *swashp) {
d4c19fe8
AL
1538 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1539
0134edef
JH
1540 if (uv2) {
1541 /* It was "normal" (a single character mapping). */
d4c19fe8 1542 const UV uv3 = UNI_TO_NATIVE(uv2);
e9101d72 1543 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1544 }
1545 }
1feea2c7 1546
0134edef
JH
1547 if (!len) /* Neither: just copy. */
1548 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1549
2a37f04d
JH
1550 if (lenp)
1551 *lenp = len;
1552
0134edef 1553 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1554}
1555
d3e79532 1556/*
7fc63493 1557=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1558
1559Convert the UTF-8 encoded character at p to its uppercase version and
1560store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1561that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1562the uppercase version may be longer than the original character.
d3e79532
JH
1563
1564The first character of the uppercased version is returned
1565(but note, as explained above, that there may be more.)
1566
1567=cut */
1568
2104c8d9 1569UV
7fc63493 1570Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1571{
97aff369 1572 dVAR;
7918f24d
NC
1573
1574 PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1575
983ffd37 1576 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1577 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1578}
a0ed51b3 1579
d3e79532 1580/*
7fc63493 1581=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1582
1583Convert the UTF-8 encoded character at p to its titlecase version and
1584store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1585that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1586titlecase version may be longer than the original character.
d3e79532
JH
1587
1588The first character of the titlecased version is returned
1589(but note, as explained above, that there may be more.)
1590
1591=cut */
1592
983ffd37 1593UV
7fc63493 1594Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1595{
97aff369 1596 dVAR;
7918f24d
NC
1597
1598 PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1599
983ffd37 1600 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1601 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1602}
1603
d3e79532 1604/*
7fc63493 1605=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1606
1607Convert the UTF-8 encoded character at p to its lowercase version and
1608store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1609that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1610lowercase version may be longer than the original character.
d3e79532
JH
1611
1612The first character of the lowercased version is returned
1613(but note, as explained above, that there may be more.)
1614
1615=cut */
1616
2104c8d9 1617UV
7fc63493 1618Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1619{
97aff369 1620 dVAR;
7918f24d
NC
1621
1622 PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1623
983ffd37 1624 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1625 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1626}
1627
d3e79532 1628/*
7fc63493 1629=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1630
1631Convert the UTF-8 encoded character at p to its foldcase version and
1632store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1633that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1634foldcase version may be longer than the original character (up to
1635three characters).
1636
1637The first character of the foldcased version is returned
1638(but note, as explained above, that there may be more.)
1639
1640=cut */
1641
b4e400f9 1642UV
7fc63493 1643Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1644{
97aff369 1645 dVAR;
7918f24d
NC
1646
1647 PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1648
b4e400f9
JH
1649 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1650 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1651}
1652
711a919c
ST
1653/* Note:
1654 * A "swash" is a swatch hash.
1655 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1656 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1657 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1658 */
a0ed51b3 1659SV*
7fc63493 1660Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1661{
27da23d5 1662 dVAR;
a0ed51b3 1663 SV* retval;
8e84507e 1664 dSP;
7fc63493
AL
1665 const size_t pkg_len = strlen(pkg);
1666 const size_t name_len = strlen(name);
da51bb9b 1667 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
f8be5cf0 1668 SV* errsv_save;
ce3b816e 1669
7918f24d
NC
1670 PERL_ARGS_ASSERT_SWASH_INIT;
1671
96ca9f55
DM
1672 PUSHSTACKi(PERLSI_MAGIC);
1673 ENTER;
1674 SAVEI32(PL_hints);
1675 PL_hints = 0;
1676 save_re_context();
1b026014 1677 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1678 ENTER;
f8be5cf0 1679 errsv_save = newSVsv(ERRSV);
dc0c6abb
NC
1680 /* It is assumed that callers of this routine are not passing in any
1681 user derived data. */
1682 /* Need to do this after save_re_context() as it will set PL_tainted to
1683 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1684 Even line to create errsv_save can turn on PL_tainted. */
1685 SAVEBOOL(PL_tainted);
1686 PL_tainted = 0;
71bed85a 1687 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
a0714e2c 1688 NULL);
f8be5cf0
JH
1689 if (!SvTRUE(ERRSV))
1690 sv_setsv(ERRSV, errsv_save);
1691 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1692 LEAVE;
1693 }
1694 SPAGAIN;
a0ed51b3
LW
1695 PUSHMARK(SP);
1696 EXTEND(SP,5);
6e449a3a
MHM
1697 mPUSHp(pkg, pkg_len);
1698 mPUSHp(name, name_len);
a0ed51b3 1699 PUSHs(listsv);
6e449a3a
MHM
1700 mPUSHi(minbits);
1701 mPUSHi(none);
a0ed51b3 1702 PUTBACK;
f8be5cf0 1703 errsv_save = newSVsv(ERRSV);
864dbfa3 1704 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1705 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1706 else
e24b16f9 1707 retval = &PL_sv_undef;
f8be5cf0
JH
1708 if (!SvTRUE(ERRSV))
1709 sv_setsv(ERRSV, errsv_save);
1710 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1711 LEAVE;
1712 POPSTACK;
923e4eb5 1713 if (IN_PERL_COMPILETIME) {
623e6609 1714 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1715 }
bc45ce41
JH
1716 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1717 if (SvPOK(retval))
35c1215d 1718 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
be2597df 1719 SVfARG(retval));
cea2e8a9 1720 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1721 }
a0ed51b3
LW
1722 return retval;
1723}
1724
035d37be
JH
1725
1726/* This API is wrong for special case conversions since we may need to
1727 * return several Unicode characters for a single Unicode character
1728 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1729 * the lower-level routine, and it is similarly broken for returning
1730 * multiple values. --jhi */
979f2922 1731/* Now SWASHGET is recasted into S_swash_get in this file. */
680c470c
ST
1732
1733/* Note:
1734 * Returns the value of property/mapping C<swash> for the first character
1735 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1736 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1737 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1738 */
a0ed51b3 1739UV
680c470c 1740Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 1741{
27da23d5 1742 dVAR;
680c470c 1743 HV* const hv = (HV*)SvRV(swash);
3568d838
JH
1744 U32 klen;
1745 U32 off;
a0ed51b3 1746 STRLEN slen;
7d85a32c 1747 STRLEN needents;
cfd0369c 1748 const U8 *tmps = NULL;
a0ed51b3 1749 U32 bit;
979f2922 1750 SV *swatch;
3568d838 1751 U8 tmputf8[2];
35da51f7 1752 const UV c = NATIVE_TO_ASCII(*ptr);
3568d838 1753
7918f24d
NC
1754 PERL_ARGS_ASSERT_SWASH_FETCH;
1755
3568d838 1756 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
979f2922
ST
1757 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1758 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1759 ptr = tmputf8;
3568d838
JH
1760 }
1761 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1762 * then the "swatch" is a vec() for al the chars which start
1763 * with 0xAA..0xYY
1764 * So the key in the hash (klen) is length of encoded char -1
1765 */
1766 klen = UTF8SKIP(ptr) - 1;
1767 off = ptr[klen];
a0ed51b3 1768
979f2922 1769 if (klen == 0) {
7d85a32c 1770 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1771 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1772 */
979f2922
ST
1773 needents = UTF_CONTINUATION_MARK;
1774 off = NATIVE_TO_UTF(ptr[klen]);
1775 }
1776 else {
7d85a32c 1777 /* If char is encoded then swatch is for the prefix */
979f2922
ST
1778 needents = (1 << UTF_ACCUMULATION_SHIFT);
1779 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1780 }
7d85a32c 1781
a0ed51b3
LW
1782 /*
1783 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1784 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1785 * it's nothing to sniff at.) Pity we usually come through at least
1786 * two function calls to get here...
1787 *
1788 * NB: this code assumes that swatches are never modified, once generated!
1789 */
1790
3568d838 1791 if (hv == PL_last_swash_hv &&
a0ed51b3 1792 klen == PL_last_swash_klen &&
27da23d5 1793 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1794 {
1795 tmps = PL_last_swash_tmps;
1796 slen = PL_last_swash_slen;
1797 }
1798 else {
1799 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1800 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1801
979f2922
ST
1802 /* If not cached, generate it via swash_get */
1803 if (!svp || !SvPOK(*svp)
1804 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0
NIS
1805 /* We use utf8n_to_uvuni() as we want an index into
1806 Unicode tables, not a native character number.
1807 */
aec46f14 1808 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1809 ckWARN(WARN_UTF8) ?
1810 0 : UTF8_ALLOW_ANY);
680c470c 1811 swatch = swash_get(swash,
979f2922
ST
1812 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1813 (klen) ? (code_point & ~(needents - 1)) : 0,
1814 needents);
1815
923e4eb5 1816 if (IN_PERL_COMPILETIME)
623e6609 1817 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1818
979f2922 1819 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 1820
979f2922
ST
1821 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1822 || (slen << 3) < needents)
660a4616 1823 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
a0ed51b3
LW
1824 }
1825
1826 PL_last_swash_hv = hv;
16d8f38a 1827 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 1828 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
1829 /* FIXME change interpvar.h? */
1830 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1831 PL_last_swash_slen = slen;
1832 if (klen)
1833 Copy(ptr, PL_last_swash_key, klen, U8);
1834 }
1835
9faf8d75 1836 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1837 case 1:
1838 bit = 1 << (off & 7);
1839 off >>= 3;
1840 return (tmps[off] & bit) != 0;
1841 case 8:
1842 return tmps[off];
1843 case 16:
1844 off <<= 1;
1845 return (tmps[off] << 8) + tmps[off + 1] ;
1846 case 32:
1847 off <<= 2;
1848 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1849 }
660a4616 1850 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
670f1322 1851 NORETURN_FUNCTION_END;
a0ed51b3 1852}
2b9d42f0 1853
979f2922
ST
1854/* Note:
1855 * Returns a swatch (a bit vector string) for a code point sequence
1856 * that starts from the value C<start> and comprises the number C<span>.
1857 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1858 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1859 */
1860STATIC SV*
1861S_swash_get(pTHX_ SV* swash, UV start, UV span)
1862{
1863 SV *swatch;
711a919c 1864 U8 *l, *lend, *x, *xend, *s;
979f2922 1865 STRLEN lcur, xcur, scur;
979f2922 1866 HV* const hv = (HV*)SvRV(swash);
017a3ce5
GA
1867 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1868 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1869 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1870 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1871 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
0bd48802
AL
1872 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1873 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1874 const STRLEN bits = SvUV(*bitssvp);
1875 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1876 const UV none = SvUV(*nonesvp);
1877 const UV end = start + span;
979f2922 1878
7918f24d
NC
1879 PERL_ARGS_ASSERT_SWASH_GET;
1880
979f2922 1881 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
660a4616
ST
1882 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1883 (UV)bits);
979f2922
ST
1884 }
1885
1886 /* create and initialize $swatch */
979f2922 1887 scur = octets ? (span * octets) : (span + 7) / 8;
e524fe40
NC
1888 swatch = newSV(scur);
1889 SvPOK_on(swatch);
979f2922
ST
1890 s = (U8*)SvPVX(swatch);
1891 if (octets && none) {
0bd48802 1892 const U8* const e = s + scur;
979f2922
ST
1893 while (s < e) {
1894 if (bits == 8)
1895 *s++ = (U8)(none & 0xff);
1896 else if (bits == 16) {
1897 *s++ = (U8)((none >> 8) & 0xff);
1898 *s++ = (U8)( none & 0xff);
1899 }
1900 else if (bits == 32) {
1901 *s++ = (U8)((none >> 24) & 0xff);
1902 *s++ = (U8)((none >> 16) & 0xff);
1903 *s++ = (U8)((none >> 8) & 0xff);
1904 *s++ = (U8)( none & 0xff);
1905 }
1906 }
1907 *s = '\0';
1908 }
1909 else {
1910 (void)memzero((U8*)s, scur + 1);
1911 }
1912 SvCUR_set(swatch, scur);
1913 s = (U8*)SvPVX(swatch);
1914
1915 /* read $swash->{LIST} */
1916 l = (U8*)SvPV(*listsvp, lcur);
1917 lend = l + lcur;
1918 while (l < lend) {
35da51f7 1919 UV min, max, val;
979f2922
ST
1920 STRLEN numlen;
1921 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1922
0bd48802 1923 U8* const nl = (U8*)memchr(l, '\n', lend - l);
979f2922
ST
1924
1925 numlen = lend - l;
1926 min = grok_hex((char *)l, &numlen, &flags, NULL);
1927 if (numlen)
1928 l += numlen;
1929 else if (nl) {
1930 l = nl + 1; /* 1 is length of "\n" */
1931 continue;
1932 }
1933 else {
1934 l = lend; /* to LIST's end at which \n is not found */
1935 break;
1936 }
1937
1938 if (isBLANK(*l)) {
1939 ++l;
1940 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1941 numlen = lend - l;
1942 max = grok_hex((char *)l, &numlen, &flags, NULL);
1943 if (numlen)
1944 l += numlen;
1945 else
1946 max = min;
1947
1948 if (octets) {
1949 if (isBLANK(*l)) {
1950 ++l;
1951 flags = PERL_SCAN_SILENT_ILLDIGIT |
1952 PERL_SCAN_DISALLOW_PREFIX;
1953 numlen = lend - l;
1954 val = grok_hex((char *)l, &numlen, &flags, NULL);
1955 if (numlen)
1956 l += numlen;
1957 else
1958 val = 0;
1959 }
1960 else {
1961 val = 0;
1962 if (typeto) {
1963 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1964 typestr, l);
1965 }
1966 }
1967 }
711a919c
ST
1968 else
1969 val = 0; /* bits == 1, then val should be ignored */
979f2922
ST
1970 }
1971 else {
1972 max = min;
1973 if (octets) {
1974 val = 0;
1975 if (typeto) {
1976 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1977 }
1978 }
711a919c
ST
1979 else
1980 val = 0; /* bits == 1, then val should be ignored */
979f2922
ST
1981 }
1982
1983 if (nl)
1984 l = nl + 1;
1985 else
1986 l = lend;
1987
1988 if (max < start)
1989 continue;
1990
1991 if (octets) {
35da51f7 1992 UV key;
979f2922
ST
1993 if (min < start) {
1994 if (!none || val < none) {
1995 val += start - min;
1996 }
1997 min = start;
1998 }
1999 for (key = min; key <= max; key++) {
2000 STRLEN offset;
2001 if (key >= end)
2002 goto go_out_list;
2003 /* offset must be non-negative (start <= min <= key < end) */
2004 offset = octets * (key - start);
2005 if (bits == 8)
2006 s[offset] = (U8)(val & 0xff);
2007 else if (bits == 16) {
2008 s[offset ] = (U8)((val >> 8) & 0xff);
2009 s[offset + 1] = (U8)( val & 0xff);
2010 }
2011 else if (bits == 32) {
2012 s[offset ] = (U8)((val >> 24) & 0xff);
2013 s[offset + 1] = (U8)((val >> 16) & 0xff);
2014 s[offset + 2] = (U8)((val >> 8) & 0xff);
2015 s[offset + 3] = (U8)( val & 0xff);
2016 }
2017
2018 if (!none || val < none)
2019 ++val;
2020 }
2021 }
711a919c 2022 else { /* bits == 1, then val should be ignored */
35da51f7 2023 UV key;
979f2922
ST
2024 if (min < start)
2025 min = start;
2026 for (key = min; key <= max; key++) {
0bd48802 2027 const STRLEN offset = (STRLEN)(key - start);
979f2922
ST
2028 if (key >= end)
2029 goto go_out_list;
2030 s[offset >> 3] |= 1 << (offset & 7);
2031 }
2032 }
2033 } /* while */
2034 go_out_list:
2035
2036 /* read $swash->{EXTRAS} */
2037 x = (U8*)SvPV(*extssvp, xcur);
2038 xend = x + xcur;
2039 while (x < xend) {
2040 STRLEN namelen;
2041 U8 *namestr;
2042 SV** othersvp;
2043 HV* otherhv;
2044 STRLEN otherbits;
2045 SV **otherbitssvp, *other;
711a919c 2046 U8 *s, *o, *nl;
979f2922
ST
2047 STRLEN slen, olen;
2048
35da51f7 2049 const U8 opc = *x++;
979f2922
ST
2050 if (opc == '\n')
2051 continue;
2052
2053 nl = (U8*)memchr(x, '\n', xend - x);
2054
2055 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2056 if (nl) {
2057 x = nl + 1; /* 1 is length of "\n" */
2058 continue;
2059 }
2060 else {
2061 x = xend; /* to EXTRAS' end at which \n is not found */
2062 break;
2063 }
2064 }
2065
2066 namestr = x;
2067 if (nl) {
2068 namelen = nl - namestr;
2069 x = nl + 1;
2070 }
2071 else {
2072 namelen = xend - namestr;
2073 x = xend;
2074 }
2075
2076 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
660a4616 2077 otherhv = (HV*)SvRV(*othersvp);
017a3ce5 2078 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922
ST
2079 otherbits = (STRLEN)SvUV(*otherbitssvp);
2080 if (bits < otherbits)
660a4616 2081 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
979f2922
ST
2082
2083 /* The "other" swatch must be destroyed after. */
2084 other = swash_get(*othersvp, start, span);
2085 o = (U8*)SvPV(other, olen);
2086
2087 if (!olen)
660a4616 2088 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
979f2922
ST
2089
2090 s = (U8*)SvPV(swatch, slen);
2091 if (bits == 1 && otherbits == 1) {
2092 if (slen != olen)
660a4616 2093 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
979f2922
ST
2094
2095 switch (opc) {
2096 case '+':
2097 while (slen--)
2098 *s++ |= *o++;
2099 break;
2100 case '!':
2101 while (slen--)
2102 *s++ |= ~*o++;
2103 break;
2104 case '-':
2105 while (slen--)
2106 *s++ &= ~*o++;
2107 break;
2108 case '&':
2109 while (slen--)
2110 *s++ &= *o++;
2111 break;
2112 default:
2113 break;
2114 }
2115 }
711a919c 2116 else {
979f2922
ST
2117 STRLEN otheroctets = otherbits >> 3;
2118 STRLEN offset = 0;
35da51f7 2119 U8* const send = s + slen;
979f2922
ST
2120
2121 while (s < send) {
2122 UV otherval = 0;
2123
2124 if (otherbits == 1) {
2125 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2126 ++offset;
2127 }
2128 else {
2129 STRLEN vlen = otheroctets;
2130 otherval = *o++;
2131 while (--vlen) {
2132 otherval <<= 8;
2133 otherval |= *o++;
2134 }
2135 }
2136
711a919c 2137 if (opc == '+' && otherval)
6f207bd3 2138 NOOP; /* replace with otherval */
979f2922
ST
2139 else if (opc == '!' && !otherval)
2140 otherval = 1;
2141 else if (opc == '-' && otherval)
2142 otherval = 0;
2143 else if (opc == '&' && !otherval)
2144 otherval = 0;
2145 else {
711a919c 2146 s += octets; /* no replacement */
979f2922
ST
2147 continue;
2148 }
2149
2150 if (bits == 8)
2151 *s++ = (U8)( otherval & 0xff);
2152 else if (bits == 16) {
2153 *s++ = (U8)((otherval >> 8) & 0xff);
2154 *s++ = (U8)( otherval & 0xff);
2155 }
2156 else if (bits == 32) {
2157 *s++ = (U8)((otherval >> 24) & 0xff);
2158 *s++ = (U8)((otherval >> 16) & 0xff);
2159 *s++ = (U8)((otherval >> 8) & 0xff);
2160 *s++ = (U8)( otherval & 0xff);
2161 }
2162 }
2163 }
2164 sv_free(other); /* through with it! */
2165 } /* while */
2166 return swatch;
2167}
2168
0f830e0b
NC
2169/*
2170=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2171
2172Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2173of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2174bytes available. The return value is the pointer to the byte after the
2175end of the new character. In other words,
2176
2177 d = uvchr_to_utf8(d, uv);
2178
2179is the recommended wide native character-aware way of saying
2180
2181 *(d++) = uv;
2182
2183=cut
2184*/
2185
2186/* On ASCII machines this is normally a macro but we want a
2187 real function in case XS code wants it
2188*/
0f830e0b
NC
2189U8 *
2190Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2191{
7918f24d
NC
2192 PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2193
0f830e0b
NC
2194 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2195}
2196
b851fbc1
JH
2197U8 *
2198Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2199{
7918f24d
NC
2200 PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2201
b851fbc1
JH
2202 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2203}
2b9d42f0
NIS
2204
2205/*
0f830e0b
NC
2206=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2207flags
2208
2209Returns the native character value of the first character in the string
2210C<s>
2211which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2212length, in bytes, of that character.
2213
2214Allows length and flags to be passed to low level routine.
2215
2216=cut
2217*/
2218/* On ASCII machines this is normally a macro but we want
2219 a real function in case XS code wants it
2220*/
0f830e0b
NC
2221UV
2222Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2223U32 flags)
2224{
2225 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
7918f24d
NC
2226
2227 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2228
0f830e0b
NC
2229 return UNI_TO_NATIVE(uv);
2230}
2231
2232/*
d2cc3551
JH
2233=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2234
2235Build to the scalar dsv a displayable version of the string spv,
2236length len, the displayable version being at most pvlim bytes long
2237(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2238
9e55ce06 2239The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 2240isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
2241to display the \\[nrfta\\] as the backslashed versions (like '\n')
2242(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2243UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2244UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2245
d2cc3551
JH
2246The pointer to the PV of the dsv is returned.
2247
2248=cut */
e6b2e755 2249char *
e1ec3a88 2250Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
2251{
2252 int truncated = 0;
e1ec3a88 2253 const char *s, *e;
e6b2e755 2254
7918f24d
NC
2255 PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2256
e6b2e755 2257 sv_setpvn(dsv, "", 0);
7fddd944 2258 SvUTF8_off(dsv);
e1ec3a88 2259 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 2260 UV u;
a49f32c6
NC
2261 /* This serves double duty as a flag and a character to print after
2262 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2263 */
2264 char ok = 0;
c728cb41 2265
e6b2e755
JH
2266 if (pvlim && SvCUR(dsv) >= pvlim) {
2267 truncated++;
2268 break;
2269 }
2270 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 2271 if (u < 256) {
a3b680e6 2272 const unsigned char c = (unsigned char)u & 0xFF;
0bd48802 2273 if (flags & UNI_DISPLAY_BACKSLASH) {
a49f32c6 2274 switch (c) {
c728cb41 2275 case '\n':
a49f32c6 2276 ok = 'n'; break;
c728cb41 2277 case '\r':
a49f32c6 2278 ok = 'r'; break;
c728cb41 2279 case '\t':
a49f32c6 2280 ok = 't'; break;
c728cb41 2281 case '\f':
a49f32c6 2282 ok = 'f'; break;
c728cb41 2283 case '\a':
a49f32c6 2284 ok = 'a'; break;
c728cb41 2285 case '\\':
a49f32c6 2286 ok = '\\'; break;
c728cb41
JH
2287 default: break;
2288 }
a49f32c6 2289 if (ok) {
88c9ea1e 2290 const char string = ok;
ce0834da 2291 sv_catpvn(dsv, "\\", 1);
5e7aa789 2292 sv_catpvn(dsv, &string, 1);
a49f32c6 2293 }
c728cb41 2294 }
00e86452 2295 /* isPRINT() is the locale-blind version. */
a49f32c6 2296 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
88c9ea1e 2297 const char string = c;
5e7aa789 2298 sv_catpvn(dsv, &string, 1);
a49f32c6 2299 ok = 1;
0a2ef054 2300 }
c728cb41
JH
2301 }
2302 if (!ok)
9e55ce06 2303 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
2304 }
2305 if (truncated)
396482e1 2306 sv_catpvs(dsv, "...");
e6b2e755
JH
2307
2308 return SvPVX(dsv);
2309}
2b9d42f0 2310
d2cc3551
JH
2311/*
2312=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2313
2314Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 2315the displayable version being at most pvlim bytes long
d2cc3551 2316(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
2317
2318The flags argument is as in pv_uni_display().
2319
d2cc3551
JH
2320The pointer to the PV of the dsv is returned.
2321
d4c19fe8
AL
2322=cut
2323*/
e6b2e755
JH
2324char *
2325Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2326{
7918f24d
NC
2327 PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2328
cfd0369c
NC
2329 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2330 SvCUR(ssv), pvlim, flags);
701a277b
JH
2331}
2332
d2cc3551 2333/*
d07ddd77 2334=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
2335
2336Return true if the strings s1 and s2 differ case-insensitively, false
2337if not (if they are equal case-insensitively). If u1 is true, the
2338string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
2339the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2340are false, the respective string is assumed to be in native 8-bit
2341encoding.
2342
2343If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2344in there (they will point at the beginning of the I<next> character).
2345If the pointers behind pe1 or pe2 are non-NULL, they are the end
2346pointers beyond which scanning will not continue under any
4cdaeff7 2347circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
2348s2+l2 will be used as goal end pointers that will also stop the scan,
2349and which qualify towards defining a successful match: all the scans
2350that define an explicit length must reach their goal pointers for
2351a match to succeed).
d2cc3551
JH
2352
2353For case-insensitiveness, the "casefolding" of Unicode is used
2354instead of upper/lowercasing both the characters, see
2355http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2356
2357=cut */
701a277b 2358I32
d07ddd77 2359Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2360{
97aff369 2361 dVAR;
e1ec3a88
AL
2362 register const U8 *p1 = (const U8*)s1;
2363 register const U8 *p2 = (const U8*)s2;
cbbf8932 2364 register const U8 *f1 = NULL;
2f73348c 2365 register const U8 *f2 = NULL;
cbbf8932
AL
2366 register U8 *e1 = NULL;
2367 register U8 *q1 = NULL;
2368 register U8 *e2 = NULL;
2369 register U8 *q2 = NULL;
d07ddd77 2370 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
2371 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2372 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2373 U8 natbuf[1+1];
2374 STRLEN foldlen1, foldlen2;
d07ddd77 2375 bool match;
7918f24d
NC
2376
2377 PERL_ARGS_ASSERT_IBCMP_UTF8;
332ddc25 2378
d07ddd77
JH
2379 if (pe1)
2380 e1 = *(U8**)pe1;
a0a388a1 2381 /* assert(e1 || l1); */
e1ec3a88
AL
2382 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2383 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2384 if (pe2)
2385 e2 = *(U8**)pe2;
a0a388a1 2386 /* assert(e2 || l2); */
e1ec3a88
AL
2387 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2388 f2 = (const U8*)s2 + l2;
d07ddd77 2389
a0a388a1
YO
2390 /* This shouldn't happen. However, putting an assert() there makes some
2391 * tests fail. */
2392 /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
d07ddd77
JH
2393 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2394 return 1; /* mismatch; possible infinite loop or false positive */
2395
a6872d42
JH
2396 if (!u1 || !u2)
2397 natbuf[1] = 0; /* Need to terminate the buffer. */
2398
d07ddd77
JH
2399 while ((e1 == 0 || p1 < e1) &&
2400 (f1 == 0 || p1 < f1) &&
2401 (e2 == 0 || p2 < e2) &&
2402 (f2 == 0 || p2 < f2)) {
2403 if (n1 == 0) {
d7f013c8
JH
2404 if (u1)
2405 to_utf8_fold(p1, foldbuf1, &foldlen1);
2406 else {
809e8e66 2407 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2408 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2409 }
2410 q1 = foldbuf1;
d07ddd77 2411 n1 = foldlen1;
332ddc25 2412 }
d07ddd77 2413 if (n2 == 0) {
d7f013c8
JH
2414 if (u2)
2415 to_utf8_fold(p2, foldbuf2, &foldlen2);
2416 else {
809e8e66 2417 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2418 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2419 }
2420 q2 = foldbuf2;
d07ddd77 2421 n2 = foldlen2;
332ddc25 2422 }
d07ddd77
JH
2423 while (n1 && n2) {
2424 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2425 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2426 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2427 return 1; /* mismatch */
d07ddd77 2428 n1 -= UTF8SKIP(q1);
d7f013c8 2429 q1 += UTF8SKIP(q1);
d07ddd77 2430 n2 -= UTF8SKIP(q2);
d7f013c8 2431 q2 += UTF8SKIP(q2);
701a277b 2432 }
d07ddd77 2433 if (n1 == 0)
d7f013c8 2434 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2435 if (n2 == 0)
d7f013c8
JH
2436 p2 += u2 ? UTF8SKIP(p2) : 1;
2437
d2cc3551 2438 }
5469e704 2439
d07ddd77
JH
2440 /* A match is defined by all the scans that specified
2441 * an explicit length reaching their final goals. */
2442 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2443
2444 if (match) {
d07ddd77
JH
2445 if (pe1)
2446 *pe1 = (char*)p1;
2447 if (pe2)
2448 *pe2 = (char*)p2;
5469e704
JH
2449 }
2450
2451 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2452}
701a277b 2453
a49f32c6
NC
2454/*
2455 * Local variables:
2456 * c-indentation-style: bsd
2457 * c-basic-offset: 4
2458 * indent-tabs-mode: t
2459 * End:
2460 *
37442d52
RGS
2461 * ex: set ts=8 sts=4 sw=4 noet:
2462 */