This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Perl builds fine on Windows 7
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550
TC
20 * as is the custom in the West, if you wish to be answered?'
21 * --Gandalf, addressing Théoden's door wardens
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3
LW
33#include "perl.h"
34
a0c21aa1
JH
35#ifndef EBCDIC
36/* Separate prototypes needed because in ASCII systems these
37 * usually macros but they still are compiled as code, too. */
38PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
39PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
40#endif
41
27da23d5
JH
42static const char unees[] =
43 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 44
ccfc67b7
JH
45/*
46=head1 Unicode Support
a0ed51b3 47
166f8a29
DM
48This file contains various utility functions for manipulating UTF8-encoded
49strings. For the uninitiated, this is a method of representing arbitrary
61296642 50Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
51characters in the ASCII range are unmodified, and a zero byte never appears
52within non-zero characters.
166f8a29 53
eaf7a4d2
CS
54=cut
55*/
56
57/*
58=for apidoc is_ascii_string
59
60Returns true if first C<len> bytes of the given string are ASCII (i.e. none
61of them even raise the question of UTF-8-ness).
62
63See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
64
65=cut
66*/
67
68bool
668b6d8d 69Perl_is_ascii_string(const U8 *s, STRLEN len)
eaf7a4d2
CS
70{
71 const U8* const send = s + (len ? len : strlen((const char *)s));
72 const U8* x = s;
73
74 PERL_ARGS_ASSERT_IS_ASCII_STRING;
eaf7a4d2
CS
75
76 for (; x < send; ++x) {
77 if (!UTF8_IS_INVARIANT(*x))
78 break;
79 }
80
81 return x == send;
82}
83
84/*
87cea99e 85=for apidoc uvuni_to_utf8_flags
eebe1485 86
1e54db1a 87Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 88of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 89bytes available. The return value is the pointer to the byte after the
9041c2e3 90end of the new character. In other words,
eebe1485 91
b851fbc1
JH
92 d = uvuni_to_utf8_flags(d, uv, flags);
93
94or, in most cases,
95
9041c2e3 96 d = uvuni_to_utf8(d, uv);
eebe1485 97
b851fbc1
JH
98(which is equivalent to)
99
100 d = uvuni_to_utf8_flags(d, uv, 0);
101
eebe1485
SC
102is the recommended Unicode-aware way of saying
103
104 *(d++) = uv;
105
106=cut
107*/
108
dfe13c55 109U8 *
b851fbc1 110Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 111{
7918f24d
NC
112 PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
113
62961d2e 114 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
115 if (UNICODE_IS_SURROGATE(uv) &&
116 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 117 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
118 else if (
119 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
120 !(flags & UNICODE_ALLOW_FDD0))
121 ||
c867b360 122 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
123 !(flags & UNICODE_ALLOW_FFFF))) &&
124 /* UNICODE_ALLOW_SUPER includes
2a20b9da 125 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
126 ((uv <= PERL_UNICODE_MAX) ||
127 !(flags & UNICODE_ALLOW_SUPER))
128 )
9014280d 129 Perl_warner(aTHX_ packWARN(WARN_UTF8),
6f6ac1de 130 "Unicode non-character 0x%04"UVxf" is illegal for interchange", uv);
507b9800 131 }
c4d5f83a 132 if (UNI_IS_INVARIANT(uv)) {
eb160463 133 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
134 return d;
135 }
2d331972 136#if defined(EBCDIC)
1d72bdf6
NIS
137 else {
138 STRLEN len = UNISKIP(uv);
139 U8 *p = d+len-1;
140 while (p > d) {
eb160463 141 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
142 uv >>= UTF_ACCUMULATION_SHIFT;
143 }
eb160463 144 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
145 return d+len;
146 }
147#else /* Non loop style */
a0ed51b3 148 if (uv < 0x800) {
eb160463
GS
149 *d++ = (U8)(( uv >> 6) | 0xc0);
150 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
151 return d;
152 }
153 if (uv < 0x10000) {
eb160463
GS
154 *d++ = (U8)(( uv >> 12) | 0xe0);
155 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
156 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
157 return d;
158 }
159 if (uv < 0x200000) {
eb160463
GS
160 *d++ = (U8)(( uv >> 18) | 0xf0);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
164 return d;
165 }
166 if (uv < 0x4000000) {
eb160463
GS
167 *d++ = (U8)(( uv >> 24) | 0xf8);
168 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
169 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
170 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
171 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
172 return d;
173 }
174 if (uv < 0x80000000) {
eb160463
GS
175 *d++ = (U8)(( uv >> 30) | 0xfc);
176 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
177 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
178 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
179 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
180 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
181 return d;
182 }
6b8eaf93 183#ifdef HAS_QUAD
d7578b48 184 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
185#endif
186 {
eb160463
GS
187 *d++ = 0xfe; /* Can't match U+FEFF! */
188 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
189 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
190 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
191 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
192 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
193 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
194 return d;
195 }
6b8eaf93 196#ifdef HAS_QUAD
a0ed51b3 197 {
eb160463
GS
198 *d++ = 0xff; /* Can't match U+FFFE! */
199 *d++ = 0x80; /* 6 Reserved bits */
200 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
201 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
202 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
203 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
204 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
205 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
206 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
207 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
208 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
209 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
210 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
211 return d;
212 }
213#endif
1d72bdf6 214#endif /* Loop style */
a0ed51b3 215}
9041c2e3 216
646ca15d
JH
217/*
218
219Tests if some arbitrary number of bytes begins in a valid UTF-8
220character. Note that an INVARIANT (i.e. ASCII) character is a valid
221UTF-8 character. The actual number of bytes in the UTF-8 character
222will be returned if it is valid, otherwise 0.
223
224This is the "slow" version as opposed to the "fast" version which is
225the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
226difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
227or less you should use the IS_UTF8_CHAR(), for lengths of five or more
228you should use the _slow(). In practice this means that the _slow()
229will be used very rarely, since the maximum Unicode code point (as of
230Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
231the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
232five bytes or more.
233
234=cut */
c053b435 235STATIC STRLEN
5f66b61c 236S_is_utf8_char_slow(const U8 *s, const STRLEN len)
646ca15d
JH
237{
238 U8 u = *s;
239 STRLEN slen;
240 UV uv, ouv;
241
7918f24d
NC
242 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
243
646ca15d
JH
244 if (UTF8_IS_INVARIANT(u))
245 return 1;
246
247 if (!UTF8_IS_START(u))
248 return 0;
249
250 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
251 return 0;
252
253 slen = len - 1;
254 s++;
77263263
TS
255#ifdef EBCDIC
256 u = NATIVE_TO_UTF(u);
257#endif
646ca15d
JH
258 u &= UTF_START_MASK(len);
259 uv = u;
260 ouv = uv;
261 while (slen--) {
262 if (!UTF8_IS_CONTINUATION(*s))
263 return 0;
264 uv = UTF8_ACCUMULATE(uv, *s);
265 if (uv < ouv)
266 return 0;
267 ouv = uv;
268 s++;
269 }
270
271 if ((STRLEN)UNISKIP(uv) < len)
272 return 0;
273
274 return len;
275}
9041c2e3
NIS
276
277/*
87cea99e 278=for apidoc is_utf8_char
eebe1485 279
5da9da9e 280Tests if some arbitrary number of bytes begins in a valid UTF-8
2bbc8d55
SP
281character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
282character is a valid UTF-8 character. The actual number of bytes in the UTF-8
283character will be returned if it is valid, otherwise 0.
9041c2e3 284
82686b01 285=cut */
067a85ef 286STRLEN
668b6d8d 287Perl_is_utf8_char(const U8 *s)
386d01d6 288{
44f8325f 289 const STRLEN len = UTF8SKIP(s);
7918f24d
NC
290
291 PERL_ARGS_ASSERT_IS_UTF8_CHAR;
3b0fc154 292#ifdef IS_UTF8_CHAR
768c67ee 293 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
294 return IS_UTF8_CHAR(s, len) ? len : 0;
295#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 296 return is_utf8_char_slow(s, len);
386d01d6
GS
297}
298
eaf7a4d2 299
6662521e 300/*
87cea99e 301=for apidoc is_utf8_string
6662521e 302
c9ada85f 303Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
304UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
305not mean 'a string that contains code points above 0x7F encoded in UTF-8'
306because a valid ASCII string is a valid UTF-8 string.
6662521e 307
eaf7a4d2 308See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
768c67ee 309
6662521e
GS
310=cut
311*/
312
8e84507e 313bool
668b6d8d 314Perl_is_utf8_string(const U8 *s, STRLEN len)
6662521e 315{
35da51f7 316 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 317 const U8* x = s;
067a85ef 318
7918f24d 319 PERL_ARGS_ASSERT_IS_UTF8_STRING;
1aa99e6b 320
6662521e 321 while (x < send) {
a3b680e6 322 STRLEN c;
1acdb0da
JH
323 /* Inline the easy bits of is_utf8_char() here for speed... */
324 if (UTF8_IS_INVARIANT(*x))
325 c = 1;
326 else if (!UTF8_IS_START(*x))
768c67ee 327 goto out;
1acdb0da
JH
328 else {
329 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
330#ifdef IS_UTF8_CHAR
331 c = UTF8SKIP(x);
768c67ee
JH
332 if (IS_UTF8_CHAR_FAST(c)) {
333 if (!IS_UTF8_CHAR(x, c))
3c614e38
AD
334 c = 0;
335 }
336 else
337 c = is_utf8_char_slow(x, c);
646ca15d
JH
338#else
339 c = is_utf8_char(x);
340#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 341 if (!c)
768c67ee 342 goto out;
1acdb0da 343 }
6662521e 344 x += c;
6662521e 345 }
768c67ee
JH
346
347 out:
60006e79
JH
348 if (x != send)
349 return FALSE;
067a85ef
A
350
351 return TRUE;
6662521e
GS
352}
353
67e989fb 354/*
814fafa7
NC
355Implemented as a macro in utf8.h
356
87cea99e 357=for apidoc is_utf8_string_loc
814fafa7
NC
358
359Like is_utf8_string() but stores the location of the failure (in the
360case of "utf8ness failure") or the location s+len (in the case of
361"utf8ness success") in the C<ep>.
362
363See also is_utf8_string_loclen() and is_utf8_string().
364
87cea99e 365=for apidoc is_utf8_string_loclen
81cd54e3 366
e3e4599f 367Like is_utf8_string() but stores the location of the failure (in the
768c67ee
JH
368case of "utf8ness failure") or the location s+len (in the case of
369"utf8ness success") in the C<ep>, and the number of UTF-8
370encoded characters in the C<el>.
371
372See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
373
374=cut
375*/
376
377bool
668b6d8d 378Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 379{
35da51f7 380 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 381 const U8* x = s;
81cd54e3 382 STRLEN c;
3ebfea28 383 STRLEN outlen = 0;
7918f24d
NC
384
385 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
81cd54e3 386
81cd54e3
JH
387 while (x < send) {
388 /* Inline the easy bits of is_utf8_char() here for speed... */
389 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
390 c = 1;
391 else if (!UTF8_IS_START(*x))
392 goto out;
81cd54e3 393 else {
768c67ee
JH
394 /* ... and call is_utf8_char() only if really needed. */
395#ifdef IS_UTF8_CHAR
396 c = UTF8SKIP(x);
397 if (IS_UTF8_CHAR_FAST(c)) {
398 if (!IS_UTF8_CHAR(x, c))
399 c = 0;
400 } else
401 c = is_utf8_char_slow(x, c);
402#else
403 c = is_utf8_char(x);
404#endif /* #ifdef IS_UTF8_CHAR */
405 if (!c)
406 goto out;
81cd54e3 407 }
768c67ee 408 x += c;
3ebfea28 409 outlen++;
81cd54e3 410 }
768c67ee
JH
411
412 out:
3ebfea28
AL
413 if (el)
414 *el = outlen;
415
768c67ee
JH
416 if (ep)
417 *ep = x;
3ebfea28 418 return (x == send);
81cd54e3
JH
419}
420
421/*
768c67ee 422
87cea99e 423=for apidoc utf8n_to_uvuni
67e989fb 424
9041c2e3 425Bottom level UTF-8 decode routine.
38a44b82 426Returns the Unicode code point value of the first character in the string C<s>
1e54db1a 427which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 428C<retlen> will be set to the length, in bytes, of that character.
67e989fb 429
1e54db1a 430If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
431is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
432it is assumed that the caller will raise a warning, and this function
28d3d195
JH
433will silently just set C<retlen> to C<-1> and return zero. If the
434C<flags> does not contain UTF8_CHECK_ONLY, warnings about
435malformations will be given, C<retlen> will be set to the expected
436length of the UTF-8 character in bytes, and zero will be returned.
437
438The C<flags> can also contain various flags to allow deviations from
439the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 440
9041c2e3
NIS
441Most code should use utf8_to_uvchr() rather than call this directly.
442
37607a96
PK
443=cut
444*/
67e989fb 445
a0ed51b3 446UV
7fc63493 447Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 448{
97aff369 449 dVAR;
d4c19fe8 450 const U8 * const s0 = s;
9c5ffd7c 451 UV uv = *s, ouv = 0;
ba210ebe 452 STRLEN len = 1;
7fc63493
AL
453 const bool dowarn = ckWARN_d(WARN_UTF8);
454 const UV startbyte = *s;
ba210ebe 455 STRLEN expectlen = 0;
a0dbb045
JH
456 U32 warning = 0;
457
7918f24d
NC
458 PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
459
a0dbb045
JH
460/* This list is a superset of the UTF8_ALLOW_XXX. */
461
462#define UTF8_WARN_EMPTY 1
463#define UTF8_WARN_CONTINUATION 2
464#define UTF8_WARN_NON_CONTINUATION 3
465#define UTF8_WARN_FE_FF 4
466#define UTF8_WARN_SHORT 5
467#define UTF8_WARN_OVERFLOW 6
468#define UTF8_WARN_SURROGATE 7
c867b360
JH
469#define UTF8_WARN_LONG 8
470#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
471
472 if (curlen == 0 &&
473 !(flags & UTF8_ALLOW_EMPTY)) {
474 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
475 goto malformed;
476 }
477
1d72bdf6 478 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
479 if (retlen)
480 *retlen = 1;
c4d5f83a 481 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 482 }
67e989fb 483
421a8bf2 484 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 485 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 486 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
487 goto malformed;
488 }
489
421a8bf2 490 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 491 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 492 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
493 goto malformed;
494 }
9041c2e3 495
1d72bdf6 496#ifdef EBCDIC
75383841 497 uv = NATIVE_TO_UTF(uv);
1d72bdf6 498#else
fcc8fcf6
JH
499 if ((uv == 0xfe || uv == 0xff) &&
500 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 501 warning = UTF8_WARN_FE_FF;
ba210ebe 502 goto malformed;
a0ed51b3 503 }
1d72bdf6
NIS
504#endif
505
ba210ebe
JH
506 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
507 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
508 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
509 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
510#ifdef EBCDIC
511 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
512 else { len = 7; uv &= 0x01; }
513#else
ba210ebe
JH
514 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
515 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
516 else { len = 13; uv = 0; } /* whoa! */
517#endif
518
a0ed51b3
LW
519 if (retlen)
520 *retlen = len;
9041c2e3 521
ba210ebe
JH
522 expectlen = len;
523
fcc8fcf6
JH
524 if ((curlen < expectlen) &&
525 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 526 warning = UTF8_WARN_SHORT;
ba210ebe
JH
527 goto malformed;
528 }
529
530 len--;
a0ed51b3 531 s++;
ba210ebe
JH
532 ouv = uv;
533
a0ed51b3 534 while (len--) {
421a8bf2
JH
535 if (!UTF8_IS_CONTINUATION(*s) &&
536 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
537 s--;
538 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 539 goto malformed;
a0ed51b3
LW
540 }
541 else
8850bf83 542 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
543 if (!(uv > ouv)) {
544 /* These cannot be allowed. */
545 if (uv == ouv) {
75dbc644 546 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
547 warning = UTF8_WARN_LONG;
548 goto malformed;
549 }
550 }
551 else { /* uv < ouv */
552 /* This cannot be allowed. */
553 warning = UTF8_WARN_OVERFLOW;
554 goto malformed;
555 }
ba210ebe
JH
556 }
557 s++;
558 ouv = uv;
559 }
560
421a8bf2 561 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 562 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 563 warning = UTF8_WARN_SURROGATE;
ba210ebe 564 goto malformed;
eb160463 565 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 566 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 567 warning = UTF8_WARN_LONG;
ba210ebe 568 goto malformed;
421a8bf2 569 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 570 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 571 warning = UTF8_WARN_FFFF;
a9917092 572 goto malformed;
a0ed51b3 573 }
ba210ebe 574
a0ed51b3 575 return uv;
ba210ebe
JH
576
577malformed:
578
fcc8fcf6 579 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 580 if (retlen)
10edeb5d 581 *retlen = ((STRLEN) -1);
ba210ebe
JH
582 return 0;
583 }
584
a0dbb045 585 if (dowarn) {
84bafc02 586 SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
a0dbb045
JH
587
588 switch (warning) {
589 case 0: /* Intentionally empty. */ break;
590 case UTF8_WARN_EMPTY:
396482e1 591 sv_catpvs(sv, "(empty string)");
a0dbb045
JH
592 break;
593 case UTF8_WARN_CONTINUATION:
097fb8e2 594 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
595 break;
596 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
597 if (s == s0)
598 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
599 (UV)s[1], startbyte);
551405c4
AL
600 else {
601 const int len = (int)(s-s0);
097fb8e2 602 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4
AL
603 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
604 }
605
a0dbb045
JH
606 break;
607 case UTF8_WARN_FE_FF:
608 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
609 break;
610 case UTF8_WARN_SHORT:
097fb8e2 611 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 612 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 613 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
614 break;
615 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
616 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
617 ouv, *s, startbyte);
a0dbb045
JH
618 break;
619 case UTF8_WARN_SURROGATE:
620 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
621 break;
a0dbb045 622 case UTF8_WARN_LONG:
097fb8e2 623 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 624 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
625 break;
626 case UTF8_WARN_FFFF:
627 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
628 break;
629 default:
396482e1 630 sv_catpvs(sv, "(unknown reason)");
a0dbb045
JH
631 break;
632 }
633
634 if (warning) {
44f8325f 635 const char * const s = SvPVX_const(sv);
a0dbb045
JH
636
637 if (PL_op)
9014280d 638 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 639 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 640 else
9014280d 641 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
642 }
643 }
644
ba210ebe 645 if (retlen)
28d3d195 646 *retlen = expectlen ? expectlen : len;
ba210ebe 647
28d3d195 648 return 0;
a0ed51b3
LW
649}
650
8e84507e 651/*
87cea99e 652=for apidoc utf8_to_uvchr
9041c2e3
NIS
653
654Returns the native character value of the first character in the string C<s>
1e54db1a 655which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
656length, in bytes, of that character.
657
1e54db1a 658If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
659returned and retlen is set, if possible, to -1.
660
661=cut
662*/
663
664UV
7fc63493 665Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 666{
7918f24d
NC
667 PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
668
1754c1a1
NC
669 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
670 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
671}
672
673/*
87cea99e 674=for apidoc utf8_to_uvuni
9041c2e3
NIS
675
676Returns the Unicode code point of the first character in the string C<s>
1e54db1a 677which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
678length, in bytes, of that character.
679
2bbc8d55 680This function should only be used when the returned UV is considered
9041c2e3
NIS
681an index into the Unicode semantic tables (e.g. swashes).
682
1e54db1a 683If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 684returned and retlen is set, if possible, to -1.
8e84507e
NIS
685
686=cut
687*/
688
689UV
7fc63493 690Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 691{
7918f24d
NC
692 PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
693
9041c2e3 694 /* Call the low level routine asking for checks */
89ebb4a3 695 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 696 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
697}
698
b76347f2 699/*
87cea99e 700=for apidoc utf8_length
b76347f2
JH
701
702Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
703Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
704up past C<e>, croaks.
b76347f2
JH
705
706=cut
707*/
708
709STRLEN
35a4481c 710Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 711{
97aff369 712 dVAR;
b76347f2
JH
713 STRLEN len = 0;
714
7918f24d
NC
715 PERL_ARGS_ASSERT_UTF8_LENGTH;
716
8850bf83
JH
717 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
718 * the bitops (especially ~) can create illegal UTF-8.
719 * In other words: in Perl UTF-8 is not just for Unicode. */
720
a3b680e6
AL
721 if (e < s)
722 goto warn_and_return;
b76347f2 723 while (s < e) {
8e91ec7f
AV
724 if (!UTF8_IS_INVARIANT(*s))
725 s += UTF8SKIP(s);
726 else
727 s++;
728 len++;
729 }
730
731 if (e != s) {
732 len--;
733 warn_and_return:
9b387841
NC
734 if (PL_op)
735 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
736 "%s in %s", unees, OP_DESC(PL_op));
737 else
738 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
b76347f2
JH
739 }
740
741 return len;
742}
743
b06226ff 744/*
87cea99e 745=for apidoc utf8_distance
b06226ff 746
1e54db1a 747Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
748and C<b>.
749
750WARNING: use only if you *know* that the pointers point inside the
751same UTF-8 buffer.
752
37607a96
PK
753=cut
754*/
a0ed51b3 755
02eb7b47 756IV
35a4481c 757Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 758{
7918f24d
NC
759 PERL_ARGS_ASSERT_UTF8_DISTANCE;
760
bf1665bc 761 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3
LW
762}
763
b06226ff 764/*
87cea99e 765=for apidoc utf8_hop
b06226ff 766
8850bf83
JH
767Return the UTF-8 pointer C<s> displaced by C<off> characters, either
768forward or backward.
b06226ff
JH
769
770WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
771the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
772on the first byte of character or just after the last byte of a character.
b06226ff 773
37607a96
PK
774=cut
775*/
a0ed51b3
LW
776
777U8 *
4373e329 778Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 779{
7918f24d
NC
780 PERL_ARGS_ASSERT_UTF8_HOP;
781
96a5add6 782 PERL_UNUSED_CONTEXT;
8850bf83
JH
783 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
784 * the bitops (especially ~) can create illegal UTF-8.
785 * In other words: in Perl UTF-8 is not just for Unicode. */
786
a0ed51b3
LW
787 if (off >= 0) {
788 while (off--)
789 s += UTF8SKIP(s);
790 }
791 else {
792 while (off++) {
793 s--;
8850bf83
JH
794 while (UTF8_IS_CONTINUATION(*s))
795 s--;
a0ed51b3
LW
796 }
797 }
4373e329 798 return (U8 *)s;
a0ed51b3
LW
799}
800
6940069f 801/*
87cea99e 802=for apidoc utf8_to_bytes
6940069f 803
2bbc8d55 804Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
246fae53
MG
805Unlike C<bytes_to_utf8>, this over-writes the original string, and
806updates len to contain the new length.
67e989fb 807Returns zero on failure, setting C<len> to -1.
6940069f 808
95be277c
NC
809If you need a copy of the string, see C<bytes_from_utf8>.
810
6940069f
GS
811=cut
812*/
813
814U8 *
37607a96 815Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 816{
d4c19fe8
AL
817 U8 * const save = s;
818 U8 * const send = s + *len;
6940069f 819 U8 *d;
246fae53 820
7918f24d
NC
821 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
822
1e54db1a 823 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 824 while (s < send) {
dcad2880
JH
825 U8 c = *s++;
826
1d72bdf6
NIS
827 if (!UTF8_IS_INVARIANT(c) &&
828 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
829 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
10edeb5d 830 *len = ((STRLEN) -1);
dcad2880
JH
831 return 0;
832 }
246fae53 833 }
dcad2880
JH
834
835 d = s = save;
6940069f 836 while (s < send) {
ed646e6e 837 STRLEN ulen;
9041c2e3 838 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 839 s += ulen;
6940069f
GS
840 }
841 *d = '\0';
246fae53 842 *len = d - save;
6940069f
GS
843 return save;
844}
845
846/*
87cea99e 847=for apidoc bytes_from_utf8
f9a63242 848
2bbc8d55 849Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
35a4481c 850Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
851the newly-created string, and updates C<len> to contain the new
852length. Returns the original string if no conversion occurs, C<len>
853is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55
SP
8540 if C<s> is converted or consisted entirely of characters that are invariant
855in utf8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 856
37607a96
PK
857=cut
858*/
f9a63242
JH
859
860U8 *
e1ec3a88 861Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 862{
f9a63242 863 U8 *d;
e1ec3a88
AL
864 const U8 *start = s;
865 const U8 *send;
f9a63242
JH
866 I32 count = 0;
867
7918f24d
NC
868 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
869
96a5add6 870 PERL_UNUSED_CONTEXT;
f9a63242 871 if (!*is_utf8)
73d840c0 872 return (U8 *)start;
f9a63242 873
1e54db1a 874 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 875 for (send = s + *len; s < send;) {
e1ec3a88 876 U8 c = *s++;
1d72bdf6 877 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
878 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
879 (c = *s++) && UTF8_IS_CONTINUATION(c))
880 count++;
881 else
73d840c0 882 return (U8 *)start;
db42d148 883 }
f9a63242
JH
884 }
885
35da51f7 886 *is_utf8 = FALSE;
f9a63242 887
212542aa 888 Newx(d, (*len) - count + 1, U8);
ef9edfd0 889 s = start; start = d;
f9a63242
JH
890 while (s < send) {
891 U8 c = *s++;
c4d5f83a
NIS
892 if (!UTF8_IS_INVARIANT(c)) {
893 /* Then it is two-byte encoded */
894 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
895 c = ASCII_TO_NATIVE(c);
896 }
897 *d++ = c;
f9a63242
JH
898 }
899 *d = '\0';
900 *len = d - start;
73d840c0 901 return (U8 *)start;
f9a63242
JH
902}
903
904/*
87cea99e 905=for apidoc bytes_to_utf8
6940069f 906
2bbc8d55 907Converts a string C<s> of length C<len> from the native encoding into UTF-8.
6662521e
GS
908Returns a pointer to the newly-created string, and sets C<len> to
909reflect the new length.
6940069f 910
2bbc8d55
SP
911A NUL character will be written after the end of the string.
912
913If you want to convert to UTF-8 from encodings other than
914the native (Latin1 or EBCDIC),
c9ada85f
JH
915see sv_recode_to_utf8().
916
497711e7 917=cut
6940069f
GS
918*/
919
920U8*
35a4481c 921Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 922{
35a4481c 923 const U8 * const send = s + (*len);
6940069f
GS
924 U8 *d;
925 U8 *dst;
7918f24d
NC
926
927 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 928 PERL_UNUSED_CONTEXT;
6940069f 929
212542aa 930 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
931 dst = d;
932
933 while (s < send) {
35a4481c 934 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 935 if (UNI_IS_INVARIANT(uv))
eb160463 936 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 937 else {
eb160463
GS
938 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
939 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
940 }
941 }
942 *d = '\0';
6662521e 943 *len = d-dst;
6940069f
GS
944 return dst;
945}
946
a0ed51b3 947/*
dea0fc0b 948 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
949 *
950 * Destination must be pre-extended to 3/2 source. Do not use in-place.
951 * We optimize for native, for obvious reasons. */
952
953U8*
dea0fc0b 954Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 955{
dea0fc0b
JH
956 U8* pend;
957 U8* dstart = d;
958
7918f24d
NC
959 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
960
1de9afcd
RGS
961 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
962 d[0] = 0;
963 *newlen = 1;
964 return d;
965 }
966
dea0fc0b 967 if (bytelen & 1)
f5992bc4 968 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
969
970 pend = p + bytelen;
971
a0ed51b3 972 while (p < pend) {
dea0fc0b
JH
973 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
974 p += 2;
a0ed51b3 975 if (uv < 0x80) {
e294cc5d
JH
976#ifdef EBCDIC
977 *d++ = UNI_TO_NATIVE(uv);
978#else
eb160463 979 *d++ = (U8)uv;
e294cc5d 980#endif
a0ed51b3
LW
981 continue;
982 }
983 if (uv < 0x800) {
eb160463
GS
984 *d++ = (U8)(( uv >> 6) | 0xc0);
985 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
986 continue;
987 }
52b9aa85 988 if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */
01ea242b 989 if (p >= pend) {
dea0fc0b 990 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
01ea242b
NC
991 } else {
992 UV low = (p[0] << 8) + p[1];
993 p += 2;
52b9aa85 994 if (low < 0xdc00 || low > 0xdfff)
01ea242b
NC
995 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
996 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
997 }
dbde1951
NC
998 } else if (uv >= 0xdc00 && uv <= 0xdfff) {
999 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
1000 }
1001 if (uv < 0x10000) {
eb160463
GS
1002 *d++ = (U8)(( uv >> 12) | 0xe0);
1003 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1004 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1005 continue;
1006 }
1007 else {
eb160463
GS
1008 *d++ = (U8)(( uv >> 18) | 0xf0);
1009 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1010 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1011 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1012 continue;
1013 }
1014 }
dea0fc0b 1015 *newlen = d - dstart;
a0ed51b3
LW
1016 return d;
1017}
1018
1019/* Note: this one is slightly destructive of the source. */
1020
1021U8*
dea0fc0b 1022Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
1023{
1024 U8* s = (U8*)p;
d4c19fe8 1025 U8* const send = s + bytelen;
7918f24d
NC
1026
1027 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1028
e0ea5e2d
NC
1029 if (bytelen & 1)
1030 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1031 (UV)bytelen);
1032
a0ed51b3 1033 while (s < send) {
d4c19fe8 1034 const U8 tmp = s[0];
a0ed51b3
LW
1035 s[0] = s[1];
1036 s[1] = tmp;
1037 s += 2;
1038 }
dea0fc0b 1039 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
1040}
1041
1042/* for now these are all defined (inefficiently) in terms of the utf8 versions */
1043
1044bool
84afefe6 1045Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 1046{
89ebb4a3 1047 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1048 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1049 return is_utf8_alnum(tmpbuf);
1050}
1051
1052bool
84afefe6 1053Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1054{
89ebb4a3 1055 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1056 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1057 return is_utf8_idfirst(tmpbuf);
1058}
1059
1060bool
84afefe6 1061Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1062{
89ebb4a3 1063 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1064 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1065 return is_utf8_alpha(tmpbuf);
1066}
1067
1068bool
84afefe6 1069Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1070{
89ebb4a3 1071 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1072 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1073 return is_utf8_ascii(tmpbuf);
1074}
1075
1076bool
84afefe6 1077Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1078{
89ebb4a3 1079 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1080 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1081 return is_utf8_space(tmpbuf);
1082}
1083
1084bool
84afefe6 1085Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1086{
89ebb4a3 1087 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1088 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1089 return is_utf8_digit(tmpbuf);
1090}
1091
1092bool
84afefe6 1093Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1094{
89ebb4a3 1095 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1096 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1097 return is_utf8_upper(tmpbuf);
1098}
1099
1100bool
84afefe6 1101Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1102{
89ebb4a3 1103 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1104 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1105 return is_utf8_lower(tmpbuf);
1106}
1107
1108bool
84afefe6 1109Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1110{
89ebb4a3 1111 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1112 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1113 return is_utf8_cntrl(tmpbuf);
1114}
1115
1116bool
84afefe6 1117Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1118{
89ebb4a3 1119 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1120 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1121 return is_utf8_graph(tmpbuf);
1122}
1123
1124bool
84afefe6 1125Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1126{
89ebb4a3 1127 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1128 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1129 return is_utf8_print(tmpbuf);
1130}
1131
b8c5462f 1132bool
84afefe6 1133Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1134{
89ebb4a3 1135 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1136 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1137 return is_utf8_punct(tmpbuf);
1138}
1139
4d61ec05 1140bool
84afefe6 1141Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1142{
89ebb4a3 1143 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1144 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1145 return is_utf8_xdigit(tmpbuf);
1146}
1147
84afefe6
JH
1148UV
1149Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1150{
7918f24d
NC
1151 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1152
0ebc6274
JH
1153 uvchr_to_utf8(p, c);
1154 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1155}
1156
84afefe6
JH
1157UV
1158Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1159{
7918f24d
NC
1160 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1161
0ebc6274
JH
1162 uvchr_to_utf8(p, c);
1163 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1164}
1165
84afefe6
JH
1166UV
1167Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1168{
7918f24d
NC
1169 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1170
0ebc6274
JH
1171 uvchr_to_utf8(p, c);
1172 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1173}
1174
84afefe6
JH
1175UV
1176Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1177{
7918f24d
NC
1178 PERL_ARGS_ASSERT_TO_UNI_FOLD;
1179
0ebc6274
JH
1180 uvchr_to_utf8(p, c);
1181 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1182}
1183
a0ed51b3
LW
1184/* for now these all assume no locale info available for Unicode > 255 */
1185
1186bool
84afefe6 1187Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1188{
1189 return is_uni_alnum(c); /* XXX no locale support yet */
1190}
1191
1192bool
84afefe6 1193Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1194{
1195 return is_uni_idfirst(c); /* XXX no locale support yet */
1196}
1197
1198bool
84afefe6 1199Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1200{
1201 return is_uni_alpha(c); /* XXX no locale support yet */
1202}
1203
1204bool
84afefe6 1205Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1206{
1207 return is_uni_ascii(c); /* XXX no locale support yet */
1208}
1209
1210bool
84afefe6 1211Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1212{
1213 return is_uni_space(c); /* XXX no locale support yet */
1214}
1215
1216bool
84afefe6 1217Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1218{
1219 return is_uni_digit(c); /* XXX no locale support yet */
1220}
1221
1222bool
84afefe6 1223Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1224{
1225 return is_uni_upper(c); /* XXX no locale support yet */
1226}
1227
1228bool
84afefe6 1229Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1230{
1231 return is_uni_lower(c); /* XXX no locale support yet */
1232}
1233
1234bool
84afefe6 1235Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1236{
1237 return is_uni_cntrl(c); /* XXX no locale support yet */
1238}
1239
1240bool
84afefe6 1241Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1242{
1243 return is_uni_graph(c); /* XXX no locale support yet */
1244}
1245
1246bool
84afefe6 1247Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1248{
1249 return is_uni_print(c); /* XXX no locale support yet */
1250}
1251
b8c5462f 1252bool
84afefe6 1253Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1254{
1255 return is_uni_punct(c); /* XXX no locale support yet */
1256}
1257
4d61ec05 1258bool
84afefe6 1259Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1260{
1261 return is_uni_xdigit(c); /* XXX no locale support yet */
1262}
1263
b7ac61fa
JH
1264U32
1265Perl_to_uni_upper_lc(pTHX_ U32 c)
1266{
ee099d14
JH
1267 /* XXX returns only the first character -- do not use XXX */
1268 /* XXX no locale support yet */
1269 STRLEN len;
89ebb4a3 1270 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1271 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1272}
1273
1274U32
1275Perl_to_uni_title_lc(pTHX_ U32 c)
1276{
ee099d14
JH
1277 /* XXX returns only the first character XXX -- do not use XXX */
1278 /* XXX no locale support yet */
1279 STRLEN len;
89ebb4a3 1280 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1281 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1282}
1283
1284U32
1285Perl_to_uni_lower_lc(pTHX_ U32 c)
1286{
ee099d14
JH
1287 /* XXX returns only the first character -- do not use XXX */
1288 /* XXX no locale support yet */
1289 STRLEN len;
89ebb4a3 1290 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1291 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1292}
1293
7452cf6a 1294static bool
5141f98e 1295S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
1296 const char *const swashname)
1297{
97aff369 1298 dVAR;
7918f24d
NC
1299
1300 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1301
bde6a22d
NC
1302 if (!is_utf8_char(p))
1303 return FALSE;
1304 if (!*swash)
711a919c 1305 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
bde6a22d
NC
1306 return swash_fetch(*swash, p, TRUE) != 0;
1307}
1308
1309bool
7fc63493 1310Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1311{
97aff369 1312 dVAR;
7918f24d
NC
1313
1314 PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1315
671c33bf
NC
1316 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1317 * descendant of isalnum(3), in other words, it doesn't
1318 * contain the '_'. --jhi */
d4c19fe8 1319 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
a0ed51b3
LW
1320}
1321
1322bool
7fc63493 1323Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1324{
97aff369 1325 dVAR;
7918f24d
NC
1326
1327 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1328
82686b01
JH
1329 if (*p == '_')
1330 return TRUE;
bde6a22d 1331 /* is_utf8_idstart would be more logical. */
d4c19fe8 1332 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1333}
1334
1335bool
7fc63493 1336Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1337{
97aff369 1338 dVAR;
7918f24d
NC
1339
1340 PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1341
82686b01
JH
1342 if (*p == '_')
1343 return TRUE;
d4c19fe8 1344 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1345}
1346
1347bool
7fc63493 1348Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1349{
97aff369 1350 dVAR;
7918f24d
NC
1351
1352 PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1353
d4c19fe8 1354 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1355}
1356
1357bool
7fc63493 1358Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1359{
97aff369 1360 dVAR;
7918f24d
NC
1361
1362 PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1363
d4c19fe8 1364 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
b8c5462f
JH
1365}
1366
1367bool
7fc63493 1368Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1369{
97aff369 1370 dVAR;
7918f24d
NC
1371
1372 PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1373
d4c19fe8 1374 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3
LW
1375}
1376
1377bool
7fc63493 1378Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1379{
97aff369 1380 dVAR;
7918f24d
NC
1381
1382 PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1383
d4c19fe8 1384 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1385}
1386
1387bool
7fc63493 1388Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1389{
97aff369 1390 dVAR;
7918f24d
NC
1391
1392 PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1393
d4c19fe8 1394 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1395}
1396
1397bool
7fc63493 1398Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1399{
97aff369 1400 dVAR;
7918f24d
NC
1401
1402 PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1403
d4c19fe8 1404 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1405}
1406
1407bool
7fc63493 1408Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1409{
97aff369 1410 dVAR;
7918f24d
NC
1411
1412 PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1413
d4c19fe8 1414 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f
JH
1415}
1416
1417bool
7fc63493 1418Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1419{
97aff369 1420 dVAR;
7918f24d
NC
1421
1422 PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1423
d4c19fe8 1424 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1425}
1426
1427bool
7fc63493 1428Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1429{
97aff369 1430 dVAR;
7918f24d
NC
1431
1432 PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1433
d4c19fe8 1434 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1435}
1436
1437bool
7fc63493 1438Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1439{
97aff369 1440 dVAR;
7918f24d
NC
1441
1442 PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1443
d4c19fe8 1444 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1445}
1446
1447bool
7fc63493 1448Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1449{
97aff369 1450 dVAR;
7918f24d
NC
1451
1452 PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1453
d4c19fe8 1454 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f
JH
1455}
1456
1457bool
7fc63493 1458Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1459{
97aff369 1460 dVAR;
7918f24d
NC
1461
1462 PERL_ARGS_ASSERT_IS_UTF8_MARK;
1463
d4c19fe8 1464 return is_utf8_common(p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1465}
1466
6b5c0936 1467/*
87cea99e 1468=for apidoc to_utf8_case
6b5c0936
JH
1469
1470The "p" contains the pointer to the UTF-8 string encoding
1471the character that is being converted.
1472
1473The "ustrp" is a pointer to the character buffer to put the
1474conversion result to. The "lenp" is a pointer to the length
1475of the result.
1476
0134edef 1477The "swashp" is a pointer to the swash to use.
6b5c0936 1478
0134edef 1479Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
8fe4d5b2 1480and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 1481but not always, a multicharacter mapping), is tried first.
6b5c0936 1482
0134edef
JH
1483The "special" is a string like "utf8::ToSpecLower", which means the
1484hash %utf8::ToSpecLower. The access to the hash is through
1485Perl_to_utf8_case().
6b5c0936 1486
0134edef
JH
1487The "normal" is a string like "ToLower" which means the swash
1488%utf8::ToLower.
1489
1490=cut */
6b5c0936 1491
2104c8d9 1492UV
9a957fbc
AL
1493Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1494 SV **swashp, const char *normal, const char *special)
a0ed51b3 1495{
97aff369 1496 dVAR;
89ebb4a3 1497 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1498 STRLEN len = 0;
aec46f14 1499 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1500 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1501 * are necessary in EBCDIC, they are redundant no-ops
1502 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1503 const UV uv1 = NATIVE_TO_UNI(uv0);
7918f24d
NC
1504
1505 PERL_ARGS_ASSERT_TO_UTF8_CASE;
1506
1feea2c7 1507 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1508
1509 if (!*swashp) /* load on-demand */
1510 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1511
b08cf34e
JH
1512 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1513 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1514 /* It might be "special" (sometimes, but not always,
2a37f04d 1515 * a multicharacter mapping) */
6673a63c 1516 HV * const hv = get_hv(special, 0);
b08cf34e
JH
1517 SV **svp;
1518
35da51f7 1519 if (hv &&
b08cf34e
JH
1520 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1521 (*svp)) {
cfd0369c 1522 const char *s;
47654450 1523
cfd0369c 1524 s = SvPV_const(*svp, len);
47654450
JH
1525 if (len == 1)
1526 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1527 else {
2f9475ad
JH
1528#ifdef EBCDIC
1529 /* If we have EBCDIC we need to remap the characters
1530 * since any characters in the low 256 are Unicode
1531 * code points, not EBCDIC. */
7cda7a3d 1532 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1533
1534 d = tmpbuf;
b08cf34e 1535 if (SvUTF8(*svp)) {
2f9475ad
JH
1536 STRLEN tlen = 0;
1537
1538 while (t < tend) {
d4c19fe8 1539 const UV c = utf8_to_uvchr(t, &tlen);
2f9475ad
JH
1540 if (tlen > 0) {
1541 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1542 t += tlen;
1543 }
1544 else
1545 break;
1546 }
1547 }
1548 else {
36fec512
JH
1549 while (t < tend) {
1550 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1551 t++;
1552 }
2f9475ad
JH
1553 }
1554 len = d - tmpbuf;
1555 Copy(tmpbuf, ustrp, len, U8);
1556#else
d2dcd0fb 1557 Copy(s, ustrp, len, U8);
2f9475ad 1558#endif
29e98929 1559 }
983ffd37 1560 }
0134edef
JH
1561 }
1562
1563 if (!len && *swashp) {
d4c19fe8
AL
1564 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1565
0134edef
JH
1566 if (uv2) {
1567 /* It was "normal" (a single character mapping). */
d4c19fe8 1568 const UV uv3 = UNI_TO_NATIVE(uv2);
e9101d72 1569 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1570 }
1571 }
1feea2c7 1572
0134edef
JH
1573 if (!len) /* Neither: just copy. */
1574 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1575
2a37f04d
JH
1576 if (lenp)
1577 *lenp = len;
1578
0134edef 1579 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1580}
1581
d3e79532 1582/*
87cea99e 1583=for apidoc to_utf8_upper
d3e79532
JH
1584
1585Convert the UTF-8 encoded character at p to its uppercase version and
1586store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1587that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1588the uppercase version may be longer than the original character.
d3e79532
JH
1589
1590The first character of the uppercased version is returned
1591(but note, as explained above, that there may be more.)
1592
1593=cut */
1594
2104c8d9 1595UV
7fc63493 1596Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1597{
97aff369 1598 dVAR;
7918f24d
NC
1599
1600 PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1601
983ffd37 1602 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1603 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1604}
a0ed51b3 1605
d3e79532 1606/*
87cea99e 1607=for apidoc to_utf8_title
d3e79532
JH
1608
1609Convert the UTF-8 encoded character at p to its titlecase version and
1610store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1611that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1612titlecase version may be longer than the original character.
d3e79532
JH
1613
1614The first character of the titlecased version is returned
1615(but note, as explained above, that there may be more.)
1616
1617=cut */
1618
983ffd37 1619UV
7fc63493 1620Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1621{
97aff369 1622 dVAR;
7918f24d
NC
1623
1624 PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1625
983ffd37 1626 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1627 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1628}
1629
d3e79532 1630/*
87cea99e 1631=for apidoc to_utf8_lower
d3e79532
JH
1632
1633Convert the UTF-8 encoded character at p to its lowercase version and
1634store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1635that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1636lowercase version may be longer than the original character.
d3e79532
JH
1637
1638The first character of the lowercased version is returned
1639(but note, as explained above, that there may be more.)
1640
1641=cut */
1642
2104c8d9 1643UV
7fc63493 1644Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1645{
97aff369 1646 dVAR;
7918f24d
NC
1647
1648 PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1649
983ffd37 1650 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1651 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1652}
1653
d3e79532 1654/*
87cea99e 1655=for apidoc to_utf8_fold
d3e79532
JH
1656
1657Convert the UTF-8 encoded character at p to its foldcase version and
1658store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1659that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1660foldcase version may be longer than the original character (up to
1661three characters).
1662
1663The first character of the foldcased version is returned
1664(but note, as explained above, that there may be more.)
1665
1666=cut */
1667
b4e400f9 1668UV
7fc63493 1669Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1670{
97aff369 1671 dVAR;
7918f24d
NC
1672
1673 PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1674
b4e400f9
JH
1675 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1676 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1677}
1678
711a919c
TS
1679/* Note:
1680 * A "swash" is a swatch hash.
1681 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1682 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1683 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1684 */
a0ed51b3 1685SV*
7fc63493 1686Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1687{
27da23d5 1688 dVAR;
a0ed51b3 1689 SV* retval;
8e84507e 1690 dSP;
7fc63493
AL
1691 const size_t pkg_len = strlen(pkg);
1692 const size_t name_len = strlen(name);
da51bb9b 1693 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
f8be5cf0 1694 SV* errsv_save;
ce3b816e 1695
7918f24d
NC
1696 PERL_ARGS_ASSERT_SWASH_INIT;
1697
96ca9f55
DM
1698 PUSHSTACKi(PERLSI_MAGIC);
1699 ENTER;
1700 SAVEI32(PL_hints);
1701 PL_hints = 0;
1702 save_re_context();
1b026014 1703 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1704 ENTER;
f8be5cf0 1705 errsv_save = newSVsv(ERRSV);
dc0c6abb
NC
1706 /* It is assumed that callers of this routine are not passing in any
1707 user derived data. */
1708 /* Need to do this after save_re_context() as it will set PL_tainted to
1709 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1710 Even line to create errsv_save can turn on PL_tainted. */
1711 SAVEBOOL(PL_tainted);
1712 PL_tainted = 0;
71bed85a 1713 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
a0714e2c 1714 NULL);
f8be5cf0
JH
1715 if (!SvTRUE(ERRSV))
1716 sv_setsv(ERRSV, errsv_save);
1717 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1718 LEAVE;
1719 }
1720 SPAGAIN;
a0ed51b3
LW
1721 PUSHMARK(SP);
1722 EXTEND(SP,5);
6e449a3a
MHM
1723 mPUSHp(pkg, pkg_len);
1724 mPUSHp(name, name_len);
a0ed51b3 1725 PUSHs(listsv);
6e449a3a
MHM
1726 mPUSHi(minbits);
1727 mPUSHi(none);
a0ed51b3 1728 PUTBACK;
f8be5cf0 1729 errsv_save = newSVsv(ERRSV);
864dbfa3 1730 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1731 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1732 else
e24b16f9 1733 retval = &PL_sv_undef;
f8be5cf0
JH
1734 if (!SvTRUE(ERRSV))
1735 sv_setsv(ERRSV, errsv_save);
1736 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1737 LEAVE;
1738 POPSTACK;
923e4eb5 1739 if (IN_PERL_COMPILETIME) {
623e6609 1740 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1741 }
bc45ce41
JH
1742 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1743 if (SvPOK(retval))
35c1215d 1744 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
be2597df 1745 SVfARG(retval));
cea2e8a9 1746 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1747 }
a0ed51b3
LW
1748 return retval;
1749}
1750
035d37be
JH
1751
1752/* This API is wrong for special case conversions since we may need to
1753 * return several Unicode characters for a single Unicode character
1754 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1755 * the lower-level routine, and it is similarly broken for returning
1756 * multiple values. --jhi */
979f2922 1757/* Now SWASHGET is recasted into S_swash_get in this file. */
680c470c
TS
1758
1759/* Note:
1760 * Returns the value of property/mapping C<swash> for the first character
1761 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1762 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1763 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1764 */
a0ed51b3 1765UV
680c470c 1766Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 1767{
27da23d5 1768 dVAR;
ef8f7699 1769 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
1770 U32 klen;
1771 U32 off;
a0ed51b3 1772 STRLEN slen;
7d85a32c 1773 STRLEN needents;
cfd0369c 1774 const U8 *tmps = NULL;
a0ed51b3 1775 U32 bit;
979f2922 1776 SV *swatch;
3568d838 1777 U8 tmputf8[2];
35da51f7 1778 const UV c = NATIVE_TO_ASCII(*ptr);
3568d838 1779
7918f24d
NC
1780 PERL_ARGS_ASSERT_SWASH_FETCH;
1781
3568d838 1782 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
979f2922
TS
1783 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1784 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1785 ptr = tmputf8;
3568d838
JH
1786 }
1787 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1788 * then the "swatch" is a vec() for al the chars which start
1789 * with 0xAA..0xYY
1790 * So the key in the hash (klen) is length of encoded char -1
1791 */
1792 klen = UTF8SKIP(ptr) - 1;
1793 off = ptr[klen];
a0ed51b3 1794
979f2922 1795 if (klen == 0) {
7d85a32c 1796 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1797 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1798 */
979f2922
TS
1799 needents = UTF_CONTINUATION_MARK;
1800 off = NATIVE_TO_UTF(ptr[klen]);
1801 }
1802 else {
7d85a32c 1803 /* If char is encoded then swatch is for the prefix */
979f2922
TS
1804 needents = (1 << UTF_ACCUMULATION_SHIFT);
1805 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1806 }
7d85a32c 1807
a0ed51b3
LW
1808 /*
1809 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1810 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1811 * it's nothing to sniff at.) Pity we usually come through at least
1812 * two function calls to get here...
1813 *
1814 * NB: this code assumes that swatches are never modified, once generated!
1815 */
1816
3568d838 1817 if (hv == PL_last_swash_hv &&
a0ed51b3 1818 klen == PL_last_swash_klen &&
27da23d5 1819 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1820 {
1821 tmps = PL_last_swash_tmps;
1822 slen = PL_last_swash_slen;
1823 }
1824 else {
1825 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1826 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1827
979f2922
TS
1828 /* If not cached, generate it via swash_get */
1829 if (!svp || !SvPOK(*svp)
1830 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0
NIS
1831 /* We use utf8n_to_uvuni() as we want an index into
1832 Unicode tables, not a native character number.
1833 */
aec46f14 1834 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1835 ckWARN(WARN_UTF8) ?
1836 0 : UTF8_ALLOW_ANY);
680c470c 1837 swatch = swash_get(swash,
979f2922
TS
1838 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1839 (klen) ? (code_point & ~(needents - 1)) : 0,
1840 needents);
1841
923e4eb5 1842 if (IN_PERL_COMPILETIME)
623e6609 1843 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1844
979f2922 1845 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 1846
979f2922
TS
1847 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1848 || (slen << 3) < needents)
660a4616 1849 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
a0ed51b3
LW
1850 }
1851
1852 PL_last_swash_hv = hv;
16d8f38a 1853 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 1854 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
1855 /* FIXME change interpvar.h? */
1856 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1857 PL_last_swash_slen = slen;
1858 if (klen)
1859 Copy(ptr, PL_last_swash_key, klen, U8);
1860 }
1861
9faf8d75 1862 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1863 case 1:
1864 bit = 1 << (off & 7);
1865 off >>= 3;
1866 return (tmps[off] & bit) != 0;
1867 case 8:
1868 return tmps[off];
1869 case 16:
1870 off <<= 1;
1871 return (tmps[off] << 8) + tmps[off + 1] ;
1872 case 32:
1873 off <<= 2;
1874 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1875 }
660a4616 1876 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
670f1322 1877 NORETURN_FUNCTION_END;
a0ed51b3 1878}
2b9d42f0 1879
979f2922
TS
1880/* Note:
1881 * Returns a swatch (a bit vector string) for a code point sequence
1882 * that starts from the value C<start> and comprises the number C<span>.
1883 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1884 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1885 */
1886STATIC SV*
1887S_swash_get(pTHX_ SV* swash, UV start, UV span)
1888{
1889 SV *swatch;
711a919c 1890 U8 *l, *lend, *x, *xend, *s;
979f2922 1891 STRLEN lcur, xcur, scur;
ef8f7699 1892 HV *const hv = MUTABLE_HV(SvRV(swash));
017a3ce5
GA
1893 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1894 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1895 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1896 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1897 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
0bd48802
AL
1898 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1899 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1900 const STRLEN bits = SvUV(*bitssvp);
1901 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1902 const UV none = SvUV(*nonesvp);
1903 const UV end = start + span;
979f2922 1904
7918f24d
NC
1905 PERL_ARGS_ASSERT_SWASH_GET;
1906
979f2922 1907 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
660a4616
TS
1908 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1909 (UV)bits);
979f2922
TS
1910 }
1911
1912 /* create and initialize $swatch */
979f2922 1913 scur = octets ? (span * octets) : (span + 7) / 8;
e524fe40
NC
1914 swatch = newSV(scur);
1915 SvPOK_on(swatch);
979f2922
TS
1916 s = (U8*)SvPVX(swatch);
1917 if (octets && none) {
0bd48802 1918 const U8* const e = s + scur;
979f2922
TS
1919 while (s < e) {
1920 if (bits == 8)
1921 *s++ = (U8)(none & 0xff);
1922 else if (bits == 16) {
1923 *s++ = (U8)((none >> 8) & 0xff);
1924 *s++ = (U8)( none & 0xff);
1925 }
1926 else if (bits == 32) {
1927 *s++ = (U8)((none >> 24) & 0xff);
1928 *s++ = (U8)((none >> 16) & 0xff);
1929 *s++ = (U8)((none >> 8) & 0xff);
1930 *s++ = (U8)( none & 0xff);
1931 }
1932 }
1933 *s = '\0';
1934 }
1935 else {
1936 (void)memzero((U8*)s, scur + 1);
1937 }
1938 SvCUR_set(swatch, scur);
1939 s = (U8*)SvPVX(swatch);
1940
1941 /* read $swash->{LIST} */
1942 l = (U8*)SvPV(*listsvp, lcur);
1943 lend = l + lcur;
1944 while (l < lend) {
35da51f7 1945 UV min, max, val;
979f2922
TS
1946 STRLEN numlen;
1947 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1948
0bd48802 1949 U8* const nl = (U8*)memchr(l, '\n', lend - l);
979f2922
TS
1950
1951 numlen = lend - l;
1952 min = grok_hex((char *)l, &numlen, &flags, NULL);
1953 if (numlen)
1954 l += numlen;
1955 else if (nl) {
1956 l = nl + 1; /* 1 is length of "\n" */
1957 continue;
1958 }
1959 else {
1960 l = lend; /* to LIST's end at which \n is not found */
1961 break;
1962 }
1963
1964 if (isBLANK(*l)) {
1965 ++l;
1966 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1967 numlen = lend - l;
1968 max = grok_hex((char *)l, &numlen, &flags, NULL);
1969 if (numlen)
1970 l += numlen;
1971 else
1972 max = min;
1973
1974 if (octets) {
1975 if (isBLANK(*l)) {
1976 ++l;
1977 flags = PERL_SCAN_SILENT_ILLDIGIT |
1978 PERL_SCAN_DISALLOW_PREFIX;
1979 numlen = lend - l;
1980 val = grok_hex((char *)l, &numlen, &flags, NULL);
1981 if (numlen)
1982 l += numlen;
1983 else
1984 val = 0;
1985 }
1986 else {
1987 val = 0;
1988 if (typeto) {
1989 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1990 typestr, l);
1991 }
1992 }
1993 }
711a919c
TS
1994 else
1995 val = 0; /* bits == 1, then val should be ignored */
979f2922
TS
1996 }
1997 else {
1998 max = min;
1999 if (octets) {
2000 val = 0;
2001 if (typeto) {
2002 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
2003 }
2004 }
711a919c
TS
2005 else
2006 val = 0; /* bits == 1, then val should be ignored */
979f2922
TS
2007 }
2008
2009 if (nl)
2010 l = nl + 1;
2011 else
2012 l = lend;
2013
2014 if (max < start)
2015 continue;
2016
2017 if (octets) {
35da51f7 2018 UV key;
979f2922
TS
2019 if (min < start) {
2020 if (!none || val < none) {
2021 val += start - min;
2022 }
2023 min = start;
2024 }
2025 for (key = min; key <= max; key++) {
2026 STRLEN offset;
2027 if (key >= end)
2028 goto go_out_list;
2029 /* offset must be non-negative (start <= min <= key < end) */
2030 offset = octets * (key - start);
2031 if (bits == 8)
2032 s[offset] = (U8)(val & 0xff);
2033 else if (bits == 16) {
2034 s[offset ] = (U8)((val >> 8) & 0xff);
2035 s[offset + 1] = (U8)( val & 0xff);
2036 }
2037 else if (bits == 32) {
2038 s[offset ] = (U8)((val >> 24) & 0xff);
2039 s[offset + 1] = (U8)((val >> 16) & 0xff);
2040 s[offset + 2] = (U8)((val >> 8) & 0xff);
2041 s[offset + 3] = (U8)( val & 0xff);
2042 }
2043
2044 if (!none || val < none)
2045 ++val;
2046 }
2047 }
711a919c 2048 else { /* bits == 1, then val should be ignored */
35da51f7 2049 UV key;
979f2922
TS
2050 if (min < start)
2051 min = start;
2052 for (key = min; key <= max; key++) {
0bd48802 2053 const STRLEN offset = (STRLEN)(key - start);
979f2922
TS
2054 if (key >= end)
2055 goto go_out_list;
2056 s[offset >> 3] |= 1 << (offset & 7);
2057 }
2058 }
2059 } /* while */
2060 go_out_list:
2061
2062 /* read $swash->{EXTRAS} */
2063 x = (U8*)SvPV(*extssvp, xcur);
2064 xend = x + xcur;
2065 while (x < xend) {
2066 STRLEN namelen;
2067 U8 *namestr;
2068 SV** othersvp;
2069 HV* otherhv;
2070 STRLEN otherbits;
2071 SV **otherbitssvp, *other;
711a919c 2072 U8 *s, *o, *nl;
979f2922
TS
2073 STRLEN slen, olen;
2074
35da51f7 2075 const U8 opc = *x++;
979f2922
TS
2076 if (opc == '\n')
2077 continue;
2078
2079 nl = (U8*)memchr(x, '\n', xend - x);
2080
2081 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2082 if (nl) {
2083 x = nl + 1; /* 1 is length of "\n" */
2084 continue;
2085 }
2086 else {
2087 x = xend; /* to EXTRAS' end at which \n is not found */
2088 break;
2089 }
2090 }
2091
2092 namestr = x;
2093 if (nl) {
2094 namelen = nl - namestr;
2095 x = nl + 1;
2096 }
2097 else {
2098 namelen = xend - namestr;
2099 x = xend;
2100 }
2101
2102 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
ef8f7699 2103 otherhv = MUTABLE_HV(SvRV(*othersvp));
017a3ce5 2104 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922
TS
2105 otherbits = (STRLEN)SvUV(*otherbitssvp);
2106 if (bits < otherbits)
660a4616 2107 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
979f2922
TS
2108
2109 /* The "other" swatch must be destroyed after. */
2110 other = swash_get(*othersvp, start, span);
2111 o = (U8*)SvPV(other, olen);
2112
2113 if (!olen)
660a4616 2114 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
979f2922
TS
2115
2116 s = (U8*)SvPV(swatch, slen);
2117 if (bits == 1 && otherbits == 1) {
2118 if (slen != olen)
660a4616 2119 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
979f2922
TS
2120
2121 switch (opc) {
2122 case '+':
2123 while (slen--)
2124 *s++ |= *o++;
2125 break;
2126 case '!':
2127 while (slen--)
2128 *s++ |= ~*o++;
2129 break;
2130 case '-':
2131 while (slen--)
2132 *s++ &= ~*o++;
2133 break;
2134 case '&':
2135 while (slen--)
2136 *s++ &= *o++;
2137 break;
2138 default:
2139 break;
2140 }
2141 }
711a919c 2142 else {
979f2922
TS
2143 STRLEN otheroctets = otherbits >> 3;
2144 STRLEN offset = 0;
35da51f7 2145 U8* const send = s + slen;
979f2922
TS
2146
2147 while (s < send) {
2148 UV otherval = 0;
2149
2150 if (otherbits == 1) {
2151 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2152 ++offset;
2153 }
2154 else {
2155 STRLEN vlen = otheroctets;
2156 otherval = *o++;
2157 while (--vlen) {
2158 otherval <<= 8;
2159 otherval |= *o++;
2160 }
2161 }
2162
711a919c 2163 if (opc == '+' && otherval)
6f207bd3 2164 NOOP; /* replace with otherval */
979f2922
TS
2165 else if (opc == '!' && !otherval)
2166 otherval = 1;
2167 else if (opc == '-' && otherval)
2168 otherval = 0;
2169 else if (opc == '&' && !otherval)
2170 otherval = 0;
2171 else {
711a919c 2172 s += octets; /* no replacement */
979f2922
TS
2173 continue;
2174 }
2175
2176 if (bits == 8)
2177 *s++ = (U8)( otherval & 0xff);
2178 else if (bits == 16) {
2179 *s++ = (U8)((otherval >> 8) & 0xff);
2180 *s++ = (U8)( otherval & 0xff);
2181 }
2182 else if (bits == 32) {
2183 *s++ = (U8)((otherval >> 24) & 0xff);
2184 *s++ = (U8)((otherval >> 16) & 0xff);
2185 *s++ = (U8)((otherval >> 8) & 0xff);
2186 *s++ = (U8)( otherval & 0xff);
2187 }
2188 }
2189 }
2190 sv_free(other); /* through with it! */
2191 } /* while */
2192 return swatch;
2193}
2194
0f830e0b 2195/*
87cea99e 2196=for apidoc uvchr_to_utf8
0f830e0b
NC
2197
2198Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2199of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2200bytes available. The return value is the pointer to the byte after the
2201end of the new character. In other words,
2202
2203 d = uvchr_to_utf8(d, uv);
2204
2205is the recommended wide native character-aware way of saying
2206
2207 *(d++) = uv;
2208
2209=cut
2210*/
2211
2212/* On ASCII machines this is normally a macro but we want a
2213 real function in case XS code wants it
2214*/
0f830e0b
NC
2215U8 *
2216Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2217{
7918f24d
NC
2218 PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2219
0f830e0b
NC
2220 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2221}
2222
b851fbc1
JH
2223U8 *
2224Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2225{
7918f24d
NC
2226 PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2227
b851fbc1
JH
2228 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2229}
2b9d42f0
NIS
2230
2231/*
87cea99e 2232=for apidoc utf8n_to_uvchr
0f830e0b
NC
2233flags
2234
2235Returns the native character value of the first character in the string
2236C<s>
2237which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2238length, in bytes, of that character.
2239
2240Allows length and flags to be passed to low level routine.
2241
2242=cut
2243*/
2244/* On ASCII machines this is normally a macro but we want
2245 a real function in case XS code wants it
2246*/
0f830e0b
NC
2247UV
2248Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2249U32 flags)
2250{
2251 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
7918f24d
NC
2252
2253 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2254
0f830e0b
NC
2255 return UNI_TO_NATIVE(uv);
2256}
2257
2258/*
87cea99e 2259=for apidoc pv_uni_display
d2cc3551
JH
2260
2261Build to the scalar dsv a displayable version of the string spv,
2262length len, the displayable version being at most pvlim bytes long
2263(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2264
9e55ce06 2265The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 2266isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
2267to display the \\[nrfta\\] as the backslashed versions (like '\n')
2268(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2269UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2270UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2271
d2cc3551
JH
2272The pointer to the PV of the dsv is returned.
2273
2274=cut */
e6b2e755 2275char *
e1ec3a88 2276Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
2277{
2278 int truncated = 0;
e1ec3a88 2279 const char *s, *e;
e6b2e755 2280
7918f24d
NC
2281 PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2282
76f68e9b 2283 sv_setpvs(dsv, "");
7fddd944 2284 SvUTF8_off(dsv);
e1ec3a88 2285 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 2286 UV u;
a49f32c6
NC
2287 /* This serves double duty as a flag and a character to print after
2288 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2289 */
2290 char ok = 0;
c728cb41 2291
e6b2e755
JH
2292 if (pvlim && SvCUR(dsv) >= pvlim) {
2293 truncated++;
2294 break;
2295 }
2296 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 2297 if (u < 256) {
a3b680e6 2298 const unsigned char c = (unsigned char)u & 0xFF;
0bd48802 2299 if (flags & UNI_DISPLAY_BACKSLASH) {
a49f32c6 2300 switch (c) {
c728cb41 2301 case '\n':
a49f32c6 2302 ok = 'n'; break;
c728cb41 2303 case '\r':
a49f32c6 2304 ok = 'r'; break;
c728cb41 2305 case '\t':
a49f32c6 2306 ok = 't'; break;
c728cb41 2307 case '\f':
a49f32c6 2308 ok = 'f'; break;
c728cb41 2309 case '\a':
a49f32c6 2310 ok = 'a'; break;
c728cb41 2311 case '\\':
a49f32c6 2312 ok = '\\'; break;
c728cb41
JH
2313 default: break;
2314 }
a49f32c6 2315 if (ok) {
88c9ea1e 2316 const char string = ok;
76f68e9b 2317 sv_catpvs(dsv, "\\");
5e7aa789 2318 sv_catpvn(dsv, &string, 1);
a49f32c6 2319 }
c728cb41 2320 }
00e86452 2321 /* isPRINT() is the locale-blind version. */
a49f32c6 2322 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
88c9ea1e 2323 const char string = c;
5e7aa789 2324 sv_catpvn(dsv, &string, 1);
a49f32c6 2325 ok = 1;
0a2ef054 2326 }
c728cb41
JH
2327 }
2328 if (!ok)
9e55ce06 2329 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
2330 }
2331 if (truncated)
396482e1 2332 sv_catpvs(dsv, "...");
e6b2e755
JH
2333
2334 return SvPVX(dsv);
2335}
2b9d42f0 2336
d2cc3551 2337/*
87cea99e 2338=for apidoc sv_uni_display
d2cc3551
JH
2339
2340Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 2341the displayable version being at most pvlim bytes long
d2cc3551 2342(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
2343
2344The flags argument is as in pv_uni_display().
2345
d2cc3551
JH
2346The pointer to the PV of the dsv is returned.
2347
d4c19fe8
AL
2348=cut
2349*/
e6b2e755
JH
2350char *
2351Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2352{
7918f24d
NC
2353 PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2354
cfd0369c
NC
2355 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2356 SvCUR(ssv), pvlim, flags);
701a277b
JH
2357}
2358
d2cc3551 2359/*
87cea99e 2360=for apidoc ibcmp_utf8
d2cc3551
JH
2361
2362Return true if the strings s1 and s2 differ case-insensitively, false
2363if not (if they are equal case-insensitively). If u1 is true, the
2364string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
2365the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2366are false, the respective string is assumed to be in native 8-bit
2367encoding.
2368
2369If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2370in there (they will point at the beginning of the I<next> character).
2371If the pointers behind pe1 or pe2 are non-NULL, they are the end
2372pointers beyond which scanning will not continue under any
4cdaeff7 2373circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
2374s2+l2 will be used as goal end pointers that will also stop the scan,
2375and which qualify towards defining a successful match: all the scans
2376that define an explicit length must reach their goal pointers for
2377a match to succeed).
d2cc3551
JH
2378
2379For case-insensitiveness, the "casefolding" of Unicode is used
2380instead of upper/lowercasing both the characters, see
2381http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2382
2383=cut */
701a277b 2384I32
d07ddd77 2385Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2386{
97aff369 2387 dVAR;
e1ec3a88
AL
2388 register const U8 *p1 = (const U8*)s1;
2389 register const U8 *p2 = (const U8*)s2;
cbbf8932 2390 register const U8 *f1 = NULL;
2f73348c 2391 register const U8 *f2 = NULL;
cbbf8932
AL
2392 register U8 *e1 = NULL;
2393 register U8 *q1 = NULL;
2394 register U8 *e2 = NULL;
2395 register U8 *q2 = NULL;
d07ddd77 2396 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
2397 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2398 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2399 U8 natbuf[1+1];
2400 STRLEN foldlen1, foldlen2;
d07ddd77 2401 bool match;
7918f24d
NC
2402
2403 PERL_ARGS_ASSERT_IBCMP_UTF8;
332ddc25 2404
d07ddd77
JH
2405 if (pe1)
2406 e1 = *(U8**)pe1;
a0a388a1 2407 /* assert(e1 || l1); */
e1ec3a88
AL
2408 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2409 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2410 if (pe2)
2411 e2 = *(U8**)pe2;
a0a388a1 2412 /* assert(e2 || l2); */
e1ec3a88
AL
2413 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2414 f2 = (const U8*)s2 + l2;
d07ddd77 2415
a0a388a1
YO
2416 /* This shouldn't happen. However, putting an assert() there makes some
2417 * tests fail. */
2418 /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
d07ddd77
JH
2419 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2420 return 1; /* mismatch; possible infinite loop or false positive */
2421
a6872d42
JH
2422 if (!u1 || !u2)
2423 natbuf[1] = 0; /* Need to terminate the buffer. */
2424
d07ddd77
JH
2425 while ((e1 == 0 || p1 < e1) &&
2426 (f1 == 0 || p1 < f1) &&
2427 (e2 == 0 || p2 < e2) &&
2428 (f2 == 0 || p2 < f2)) {
2429 if (n1 == 0) {
d7f013c8
JH
2430 if (u1)
2431 to_utf8_fold(p1, foldbuf1, &foldlen1);
2432 else {
809e8e66 2433 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2434 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2435 }
2436 q1 = foldbuf1;
d07ddd77 2437 n1 = foldlen1;
332ddc25 2438 }
d07ddd77 2439 if (n2 == 0) {
d7f013c8
JH
2440 if (u2)
2441 to_utf8_fold(p2, foldbuf2, &foldlen2);
2442 else {
809e8e66 2443 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2444 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2445 }
2446 q2 = foldbuf2;
d07ddd77 2447 n2 = foldlen2;
332ddc25 2448 }
d07ddd77
JH
2449 while (n1 && n2) {
2450 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2451 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2452 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2453 return 1; /* mismatch */
d07ddd77 2454 n1 -= UTF8SKIP(q1);
d7f013c8 2455 q1 += UTF8SKIP(q1);
d07ddd77 2456 n2 -= UTF8SKIP(q2);
d7f013c8 2457 q2 += UTF8SKIP(q2);
701a277b 2458 }
d07ddd77 2459 if (n1 == 0)
d7f013c8 2460 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2461 if (n2 == 0)
d7f013c8
JH
2462 p2 += u2 ? UTF8SKIP(p2) : 1;
2463
d2cc3551 2464 }
5469e704 2465
d07ddd77
JH
2466 /* A match is defined by all the scans that specified
2467 * an explicit length reaching their final goals. */
2468 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2469
2470 if (match) {
d07ddd77
JH
2471 if (pe1)
2472 *pe1 = (char*)p1;
2473 if (pe2)
2474 *pe2 = (char*)p2;
5469e704
JH
2475 }
2476
2477 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2478}
701a277b 2479
a49f32c6
NC
2480/*
2481 * Local variables:
2482 * c-indentation-style: bsd
2483 * c-basic-offset: 4
2484 * indent-tabs-mode: t
2485 * End:
2486 *
37442d52
RGS
2487 * ex: set ts=8 sts=4 sw=4 noet:
2488 */