This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
do not have perlbug talk about perlthanks
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
81e983c1 34#include "inline_invlist.c"
a0ed51b3 35
27da23d5
JH
36static const char unees[] =
37 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 38
48ef279e 39/*
ccfc67b7 40=head1 Unicode Support
a0ed51b3 41
166f8a29 42This file contains various utility functions for manipulating UTF8-encoded
72d33970 43strings. For the uninitiated, this is a method of representing arbitrary
61296642 44Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
45characters in the ASCII range are unmodified, and a zero byte never appears
46within non-zero characters.
166f8a29 47
eaf7a4d2
CS
48=cut
49*/
50
51/*
52=for apidoc is_ascii_string
53
a1433954 54Returns true if the first C<len> bytes of the string C<s> are the same whether
970ea3cb
KW
55or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
56is, if they are invariant. On ASCII-ish machines, only ASCII characters
57fit this definition, hence the function's name.
eaf7a4d2 58
9f7e3d64
MH
59If C<len> is 0, it will be calculated using C<strlen(s)>.
60
a1433954 61See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
eaf7a4d2
CS
62
63=cut
64*/
65
66bool
668b6d8d 67Perl_is_ascii_string(const U8 *s, STRLEN len)
eaf7a4d2
CS
68{
69 const U8* const send = s + (len ? len : strlen((const char *)s));
70 const U8* x = s;
71
72 PERL_ARGS_ASSERT_IS_ASCII_STRING;
eaf7a4d2
CS
73
74 for (; x < send; ++x) {
75 if (!UTF8_IS_INVARIANT(*x))
76 break;
77 }
78
79 return x == send;
80}
81
82/*
378516de 83=for apidoc uvoffuni_to_utf8_flags
eebe1485 84
a27992cc 85THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af
KW
86Instead, B<Almost all code should use L</uvchr_to_utf8> or
87L</uvchr_to_utf8_flags>>.
a27992cc 88
de69f3af
KW
89This function is like them, but the input is a strict Unicode
90(as opposed to native) code point. Only in very rare circumstances should code
91not be using the native code point.
949cf498 92
de69f3af 93For details, see the description for L</uvchr_to_utf8_flags>.
949cf498 94
eebe1485
SC
95=cut
96*/
97
dfe13c55 98U8 *
378516de 99Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 100{
378516de 101 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
7918f24d 102
d9432125
KW
103 if (UNI_IS_INVARIANT(uv)) {
104 *d++ = (U8) LATIN1_TO_NATIVE(uv);
105 return d;
106 }
107
979f77b6
KW
108 /* The first problematic code point is the first surrogate */
109 if (uv >= UNICODE_SURROGATE_FIRST
0cfa64bf 110 && ckWARN3_d(WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
979f77b6 111 {
949cf498
KW
112 if (UNICODE_IS_SURROGATE(uv)) {
113 if (flags & UNICODE_WARN_SURROGATE) {
8457b38f 114 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
949cf498
KW
115 "UTF-16 surrogate U+%04"UVXf, uv);
116 }
117 if (flags & UNICODE_DISALLOW_SURROGATE) {
118 return NULL;
119 }
120 }
121 else if (UNICODE_IS_SUPER(uv)) {
122 if (flags & UNICODE_WARN_SUPER
123 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
124 {
8457b38f 125 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
949cf498
KW
126 "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
127 }
128 if (flags & UNICODE_DISALLOW_SUPER
129 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
130 {
131 return NULL;
132 }
133 }
134 else if (UNICODE_IS_NONCHAR(uv)) {
135 if (flags & UNICODE_WARN_NONCHAR) {
8457b38f 136 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
949cf498
KW
137 "Unicode non-character U+%04"UVXf" is illegal for open interchange",
138 uv);
139 }
140 if (flags & UNICODE_DISALLOW_NONCHAR) {
141 return NULL;
142 }
143 }
507b9800 144 }
d9432125 145
2d331972 146#if defined(EBCDIC)
d9432125 147 {
5aaebcb3 148 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
149 U8 *p = d+len-1;
150 while (p > d) {
bc3632a8 151 *p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
152 uv >>= UTF_ACCUMULATION_SHIFT;
153 }
bc3632a8 154 *p = (U8) I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
155 return d+len;
156 }
157#else /* Non loop style */
a0ed51b3 158 if (uv < 0x800) {
eb160463
GS
159 *d++ = (U8)(( uv >> 6) | 0xc0);
160 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
161 return d;
162 }
163 if (uv < 0x10000) {
eb160463
GS
164 *d++ = (U8)(( uv >> 12) | 0xe0);
165 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
166 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
167 return d;
168 }
169 if (uv < 0x200000) {
eb160463
GS
170 *d++ = (U8)(( uv >> 18) | 0xf0);
171 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
172 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
173 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
174 return d;
175 }
176 if (uv < 0x4000000) {
eb160463
GS
177 *d++ = (U8)(( uv >> 24) | 0xf8);
178 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
179 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
180 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
181 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
182 return d;
183 }
184 if (uv < 0x80000000) {
eb160463
GS
185 *d++ = (U8)(( uv >> 30) | 0xfc);
186 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
187 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
188 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
189 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
190 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
191 return d;
192 }
6588300d 193#ifdef UTF8_QUAD_MAX
d7578b48 194 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
195#endif
196 {
eb160463
GS
197 *d++ = 0xfe; /* Can't match U+FEFF! */
198 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
199 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
200 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
201 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
202 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
203 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
204 return d;
205 }
6588300d 206#ifdef UTF8_QUAD_MAX
a0ed51b3 207 {
eb160463
GS
208 *d++ = 0xff; /* Can't match U+FFFE! */
209 *d++ = 0x80; /* 6 Reserved bits */
210 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
211 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
212 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
213 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
214 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
215 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
216 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
217 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
218 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
219 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
220 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
221 return d;
222 }
223#endif
537124e4 224#endif /* Non loop style */
a0ed51b3 225}
646ca15d 226/*
07693fe6
KW
227=for apidoc uvchr_to_utf8
228
bcb1a2d4 229Adds the UTF-8 representation of the native code point C<uv> to the end
07693fe6 230of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
72d33970
FC
231bytes available. The return value is the pointer to the byte after the
232end of the new character. In other words,
07693fe6
KW
233
234 d = uvchr_to_utf8(d, uv);
235
236is the recommended wide native character-aware way of saying
237
238 *(d++) = uv;
239
de69f3af
KW
240This function accepts any UV as input. To forbid or warn on non-Unicode code
241points, or those that may be problematic, see L</uvchr_to_utf8_flags>.
242
07693fe6
KW
243=cut
244*/
245
de69f3af
KW
246/* This is also a macro */
247PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
248
07693fe6
KW
249U8 *
250Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
251{
de69f3af 252 return uvchr_to_utf8(d, uv);
07693fe6
KW
253}
254
de69f3af
KW
255/*
256=for apidoc uvchr_to_utf8_flags
257
258Adds the UTF-8 representation of the native code point C<uv> to the end
259of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
72d33970
FC
260bytes available. The return value is the pointer to the byte after the
261end of the new character. In other words,
de69f3af
KW
262
263 d = uvchr_to_utf8_flags(d, uv, flags);
264
265or, in most cases,
266
267 d = uvchr_to_utf8_flags(d, uv, 0);
268
269This is the Unicode-aware way of saying
270
271 *(d++) = uv;
272
273This function will convert to UTF-8 (and not warn) even code points that aren't
274legal Unicode or are problematic, unless C<flags> contains one or more of the
275following flags:
276
277If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
278the function will raise a warning, provided UTF8 warnings are enabled. If instead
279UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
280If both flags are set, the function will both warn and return NULL.
281
4c3cfd5d 282The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags
de69f3af 283affect how the function handles a Unicode non-character. And likewise, the
4c3cfd5d 284UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags affect the handling of
de69f3af
KW
285code points that are
286above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
287even less portable) can be warned and/or disallowed even if other above-Unicode
288code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
289flags.
290
291And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
292above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
293DISALLOW flags.
294
295=cut
296*/
297
298/* This is also a macro */
299PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
300
07693fe6
KW
301U8 *
302Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
303{
de69f3af 304 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
305}
306
307/*
646ca15d 308
f7d739d1 309Tests if the first C<len> bytes of string C<s> form a valid UTF-8
bcb1a2d4
KW
310character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a
311valid UTF-8 character. The number of bytes in the UTF-8 character
646ca15d
JH
312will be returned if it is valid, otherwise 0.
313
314This is the "slow" version as opposed to the "fast" version which is
315the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
316difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
317or less you should use the IS_UTF8_CHAR(), for lengths of five or more
318you should use the _slow(). In practice this means that the _slow()
319will be used very rarely, since the maximum Unicode code point (as of
320Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
537124e4 321the "Perl extended UTF-8" (e.g., the infamous 'v-strings') will encode into
646ca15d
JH
322five bytes or more.
323
324=cut */
7af276bc 325PERL_STATIC_INLINE STRLEN
5f66b61c 326S_is_utf8_char_slow(const U8 *s, const STRLEN len)
646ca15d 327{
cd7e6c88 328 dTHX; /* The function called below requires thread context */
646ca15d 329
cd7e6c88 330 STRLEN actual_len;
646ca15d 331
cd7e6c88 332 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
646ca15d 333
18712bce 334 utf8n_to_uvchr(s, len, &actual_len, UTF8_CHECK_ONLY);
646ca15d 335
cd7e6c88 336 return (actual_len == (STRLEN) -1) ? 0 : actual_len;
646ca15d 337}
9041c2e3
NIS
338
339/*
492a624f
KW
340=for apidoc is_utf8_char_buf
341
342Returns the number of bytes that comprise the first UTF-8 encoded character in
343buffer C<buf>. C<buf_end> should point to one position beyond the end of the
344buffer. 0 is returned if C<buf> does not point to a complete, valid UTF-8
345encoded character.
346
347Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
348machines) is a valid UTF-8 character.
349
350=cut */
351
352STRLEN
353Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
354{
355
356 STRLEN len;
357
358 PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
359
360 if (buf_end <= buf) {
361 return 0;
362 }
363
364 len = buf_end - buf;
365 if (len > UTF8SKIP(buf)) {
366 len = UTF8SKIP(buf);
367 }
368
492a624f
KW
369 if (IS_UTF8_CHAR_FAST(len))
370 return IS_UTF8_CHAR(buf, len) ? len : 0;
492a624f
KW
371 return is_utf8_char_slow(buf, len);
372}
373
374/*
87cea99e 375=for apidoc is_utf8_char
eebe1485 376
5da9da9e 377Tests if some arbitrary number of bytes begins in a valid UTF-8
2bbc8d55
SP
378character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
379character is a valid UTF-8 character. The actual number of bytes in the UTF-8
380character will be returned if it is valid, otherwise 0.
9041c2e3 381
76848387 382This function is deprecated due to the possibility that malformed input could
a1433954 383cause reading beyond the end of the input buffer. Use L</is_utf8_char_buf>
76848387 384instead.
e0328548 385
82686b01 386=cut */
76848387 387
067a85ef 388STRLEN
668b6d8d 389Perl_is_utf8_char(const U8 *s)
386d01d6 390{
7918f24d 391 PERL_ARGS_ASSERT_IS_UTF8_CHAR;
492a624f 392
76848387 393 /* Assumes we have enough space, which is why this is deprecated */
492a624f 394 return is_utf8_char_buf(s, s + UTF8SKIP(s));
386d01d6
GS
395}
396
eaf7a4d2 397
6662521e 398/*
87cea99e 399=for apidoc is_utf8_string
6662521e 400
a1433954 401Returns true if the first C<len> bytes of string C<s> form a valid
9f7e3d64 402UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
e0328548
KW
403using C<strlen(s)> (which means if you use this option, that C<s> has to have a
404terminating NUL byte). Note that all characters being ASCII constitute 'a
405valid UTF-8 string'.
6662521e 406
a1433954 407See also L</is_ascii_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
768c67ee 408
6662521e
GS
409=cut
410*/
411
8e84507e 412bool
668b6d8d 413Perl_is_utf8_string(const U8 *s, STRLEN len)
6662521e 414{
35da51f7 415 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 416 const U8* x = s;
067a85ef 417
7918f24d 418 PERL_ARGS_ASSERT_IS_UTF8_STRING;
1aa99e6b 419
6662521e 420 while (x < send) {
1acdb0da 421 /* Inline the easy bits of is_utf8_char() here for speed... */
e0328548
KW
422 if (UTF8_IS_INVARIANT(*x)) {
423 x++;
424 }
1acdb0da
JH
425 else {
426 /* ... and call is_utf8_char() only if really needed. */
e0328548
KW
427 const STRLEN c = UTF8SKIP(x);
428 const U8* const next_char_ptr = x + c;
429
430 if (next_char_ptr > send) {
431 return FALSE;
432 }
433
768c67ee
JH
434 if (IS_UTF8_CHAR_FAST(c)) {
435 if (!IS_UTF8_CHAR(x, c))
e0328548 436 return FALSE;
3c614e38 437 }
e0328548
KW
438 else if (! is_utf8_char_slow(x, c)) {
439 return FALSE;
440 }
441 x = next_char_ptr;
1acdb0da 442 }
6662521e 443 }
768c67ee 444
067a85ef 445 return TRUE;
6662521e
GS
446}
447
67e989fb 448/*
814fafa7
NC
449Implemented as a macro in utf8.h
450
87cea99e 451=for apidoc is_utf8_string_loc
814fafa7 452
a1433954
KW
453Like L</is_utf8_string> but stores the location of the failure (in the
454case of "utf8ness failure") or the location C<s>+C<len> (in the case of
814fafa7
NC
455"utf8ness success") in the C<ep>.
456
a1433954 457See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
814fafa7 458
87cea99e 459=for apidoc is_utf8_string_loclen
81cd54e3 460
a1433954
KW
461Like L</is_utf8_string>() but stores the location of the failure (in the
462case of "utf8ness failure") or the location C<s>+C<len> (in the case of
768c67ee
JH
463"utf8ness success") in the C<ep>, and the number of UTF-8
464encoded characters in the C<el>.
465
a1433954 466See also L</is_utf8_string_loc>() and L</is_utf8_string>().
81cd54e3
JH
467
468=cut
469*/
470
471bool
668b6d8d 472Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 473{
35da51f7 474 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 475 const U8* x = s;
81cd54e3 476 STRLEN c;
3ebfea28 477 STRLEN outlen = 0;
7918f24d
NC
478
479 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
81cd54e3 480
81cd54e3 481 while (x < send) {
e0328548
KW
482 const U8* next_char_ptr;
483
81cd54e3
JH
484 /* Inline the easy bits of is_utf8_char() here for speed... */
485 if (UTF8_IS_INVARIANT(*x))
e0328548 486 next_char_ptr = x + 1;
81cd54e3 487 else {
768c67ee 488 /* ... and call is_utf8_char() only if really needed. */
768c67ee 489 c = UTF8SKIP(x);
e0328548
KW
490 next_char_ptr = c + x;
491 if (next_char_ptr > send) {
492 goto out;
493 }
768c67ee
JH
494 if (IS_UTF8_CHAR_FAST(c)) {
495 if (!IS_UTF8_CHAR(x, c))
496 c = 0;
497 } else
498 c = is_utf8_char_slow(x, c);
768c67ee
JH
499 if (!c)
500 goto out;
81cd54e3 501 }
e0328548 502 x = next_char_ptr;
3ebfea28 503 outlen++;
81cd54e3 504 }
768c67ee
JH
505
506 out:
3ebfea28
AL
507 if (el)
508 *el = outlen;
509
768c67ee
JH
510 if (ep)
511 *ep = x;
3ebfea28 512 return (x == send);
81cd54e3
JH
513}
514
515/*
768c67ee 516
de69f3af 517=for apidoc utf8n_to_uvchr
378516de
KW
518
519THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af 520Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
67e989fb 521
9041c2e3 522Bottom level UTF-8 decode routine.
de69f3af 523Returns the native code point value of the first character in the string C<s>,
746afd53
KW
524which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
525C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
526the length, in bytes, of that character.
949cf498
KW
527
528The value of C<flags> determines the behavior when C<s> does not point to a
529well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
524080c4
KW
530zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
531next possible position in C<s> that could begin a non-malformed character.
532Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
949cf498
KW
533
534Various ALLOW flags can be set in C<flags> to allow (and not warn on)
535individual types of malformations, such as the sequence being overlong (that
536is, when there is a shorter sequence that can express the same code point;
537overlong sequences are expressly forbidden in the UTF-8 standard due to
538potential security issues). Another malformation example is the first byte of
539a character not being a legal first byte. See F<utf8.h> for the list of such
524080c4
KW
540flags. For allowed 0 length strings, this function returns 0; for allowed
541overlong sequences, the computed code point is returned; for all other allowed
542malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
543determinable reasonable value.
949cf498
KW
544
545The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
546flags) malformation is found. If this flag is set, the routine assumes that
547the caller will raise a warning, and this function will silently just set
d088425d
KW
548C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
549
550Note that this API requires disambiguation between successful decoding a NUL
551character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as
552in both cases, 0 is returned. To disambiguate, upon a zero return, see if the
553first byte of C<s> is 0 as well. If so, the input was a NUL; if not, the input
554had an error.
949cf498
KW
555
556Certain code points are considered problematic. These are Unicode surrogates,
746afd53 557Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 558By default these are considered regular code points, but certain situations
5eafe189 559warrant special handling for them. If C<flags> contains
949cf498
KW
560UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
561malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE,
562UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
563maximum) can be set to disallow these categories individually.
564
565The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
566UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
567for their respective categories, but otherwise the code points are considered
568valid (not malformations). To get a category to both be treated as a
569malformation and raise a warning, specify both the WARN and DISALLOW flags.
570(But note that warnings are not raised if lexically disabled nor if
571UTF8_CHECK_ONLY is also specified.)
572
573Very large code points (above 0x7FFF_FFFF) are considered more problematic than
574the others that are above the Unicode legal maximum. There are several
eb83ed87
KW
575reasons: they require at least 32 bits to represent them on ASCII platforms, are
576not representable at all on EBCDIC platforms, and the original UTF-8
577specification never went above this number (the current 0x10FFFF limit was
578imposed later). (The smaller ones, those that fit into 32 bits, are
579representable by a UV on ASCII platforms, but not by an IV, which means that
580the number of operations that can be performed on them is quite restricted.)
581The UTF-8 encoding on ASCII platforms for these large code points begins with a
582byte containing 0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to
583be treated as malformations, while allowing smaller above-Unicode code points.
584(Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
72d33970
FC
585including these, as malformations.)
586Similarly, UTF8_WARN_FE_FF acts just like
eb83ed87 587the other WARN flags, but applies just to these code points.
949cf498
KW
588
589All other code points corresponding to Unicode characters, including private
590use and those yet to be assigned, are never considered malformed and never
591warn.
67e989fb 592
37607a96
PK
593=cut
594*/
67e989fb 595
a0ed51b3 596UV
de69f3af 597Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 598{
97aff369 599 dVAR;
d4c19fe8 600 const U8 * const s0 = s;
eb83ed87 601 U8 overflow_byte = '\0'; /* Save byte in case of overflow */
0b8d30e8 602 U8 * send;
eb83ed87
KW
603 UV uv = *s;
604 STRLEN expectlen;
949cf498 605 SV* sv = NULL;
eb83ed87
KW
606 UV outlier_ret = 0; /* return value when input is in error or problematic
607 */
608 UV pack_warn = 0; /* Save result of packWARN() for later */
609 bool unexpected_non_continuation = FALSE;
610 bool overflowed = FALSE;
2f8f112e 611 bool do_overlong_test = TRUE; /* May have to skip this test */
a0dbb045 612
eb83ed87 613 const char* const malformed_text = "Malformed UTF-8 character";
7918f24d 614
de69f3af 615 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
a0dbb045 616
eb83ed87
KW
617 /* The order of malformation tests here is important. We should consume as
618 * few bytes as possible in order to not skip any valid character. This is
619 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
620 * http://unicode.org/reports/tr36 for more discussion as to why. For
621 * example, once we've done a UTF8SKIP, we can tell the expected number of
622 * bytes, and could fail right off the bat if the input parameters indicate
623 * that there are too few available. But it could be that just that first
624 * byte is garbled, and the intended character occupies fewer bytes. If we
625 * blindly assumed that the first byte is correct, and skipped based on
626 * that number, we could skip over a valid input character. So instead, we
627 * always examine the sequence byte-by-byte.
628 *
629 * We also should not consume too few bytes, otherwise someone could inject
630 * things. For example, an input could be deliberately designed to
631 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 632 * returning to the caller C<*retlen> pointing to the very next byte (one
eb83ed87
KW
633 * which is actually part of of the overflowing sequence), that could look
634 * legitimate to the caller, which could discard the initial partial
635 * sequence and process the rest, inappropriately */
636
637 /* Zero length strings, if allowed, of necessity are zero */
b5b9af04 638 if (UNLIKELY(curlen == 0)) {
eb83ed87
KW
639 if (retlen) {
640 *retlen = 0;
641 }
a0dbb045 642
eb83ed87
KW
643 if (flags & UTF8_ALLOW_EMPTY) {
644 return 0;
645 }
646 if (! (flags & UTF8_CHECK_ONLY)) {
647 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
648 }
0c443dc2
JH
649 goto malformed;
650 }
651
eb83ed87
KW
652 expectlen = UTF8SKIP(s);
653
654 /* A well-formed UTF-8 character, as the vast majority of calls to this
655 * function will be for, has this expected length. For efficiency, set
656 * things up here to return it. It will be overriden only in those rare
657 * cases where a malformation is found */
658 if (retlen) {
659 *retlen = expectlen;
660 }
661
662 /* An invariant is trivially well-formed */
1d72bdf6 663 if (UTF8_IS_INVARIANT(uv)) {
de69f3af 664 return uv;
a0ed51b3 665 }
67e989fb 666
eb83ed87 667 /* A continuation character can't start a valid sequence */
b5b9af04 668 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
eb83ed87
KW
669 if (flags & UTF8_ALLOW_CONTINUATION) {
670 if (retlen) {
671 *retlen = 1;
672 }
673 return UNICODE_REPLACEMENT;
674 }
ba210ebe 675
eb83ed87
KW
676 if (! (flags & UTF8_CHECK_ONLY)) {
677 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
678 }
679 curlen = 1;
ba210ebe
JH
680 goto malformed;
681 }
9041c2e3 682
dcd27b3c
KW
683 /* Here is not a continuation byte, nor an invariant. The only thing left
684 * is a start byte (possibly for an overlong) */
685
1d72bdf6 686#ifdef EBCDIC
bc3632a8 687 uv = NATIVE_UTF8_TO_I8(uv);
1d72bdf6
NIS
688#endif
689
eb83ed87
KW
690 /* Remove the leading bits that indicate the number of bytes in the
691 * character's whole UTF-8 sequence, leaving just the bits that are part of
692 * the value */
693 uv &= UTF_START_MASK(expectlen);
ba210ebe 694
eb83ed87
KW
695 /* Now, loop through the remaining bytes in the character's sequence,
696 * accumulating each into the working value as we go. Be sure to not look
697 * past the end of the input string */
0b8d30e8
KW
698 send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
699
eb83ed87 700 for (s = s0 + 1; s < send; s++) {
b5b9af04 701 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
eb83ed87
KW
702#ifndef EBCDIC /* Can't overflow in EBCDIC */
703 if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
704
705 /* The original implementors viewed this malformation as more
706 * serious than the others (though I, khw, don't understand
707 * why, since other malformations also give very very wrong
708 * results), so there is no way to turn off checking for it.
709 * Set a flag, but keep going in the loop, so that we absorb
710 * the rest of the bytes that comprise the character. */
711 overflowed = TRUE;
712 overflow_byte = *s; /* Save for warning message's use */
713 }
714#endif
8850bf83 715 uv = UTF8_ACCUMULATE(uv, *s);
eb83ed87
KW
716 }
717 else {
718 /* Here, found a non-continuation before processing all expected
719 * bytes. This byte begins a new character, so quit, even if
720 * allowing this malformation. */
721 unexpected_non_continuation = TRUE;
722 break;
723 }
724 } /* End of loop through the character's bytes */
725
726 /* Save how many bytes were actually in the character */
727 curlen = s - s0;
728
729 /* The loop above finds two types of malformations: non-continuation and/or
730 * overflow. The non-continuation malformation is really a too-short
731 * malformation, as it means that the current character ended before it was
732 * expected to (being terminated prematurely by the beginning of the next
733 * character, whereas in the too-short malformation there just are too few
734 * bytes available to hold the character. In both cases, the check below
735 * that we have found the expected number of bytes would fail if executed.)
736 * Thus the non-continuation malformation is really unnecessary, being a
737 * subset of the too-short malformation. But there may be existing
738 * applications that are expecting the non-continuation type, so we retain
739 * it, and return it in preference to the too-short malformation. (If this
740 * code were being written from scratch, the two types might be collapsed
741 * into one.) I, khw, am also giving priority to returning the
742 * non-continuation and too-short malformations over overflow when multiple
743 * ones are present. I don't know of any real reason to prefer one over
744 * the other, except that it seems to me that multiple-byte errors trumps
745 * errors from a single byte */
b5b9af04 746 if (UNLIKELY(unexpected_non_continuation)) {
eb83ed87
KW
747 if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
748 if (! (flags & UTF8_CHECK_ONLY)) {
749 if (curlen == 1) {
750 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
751 }
752 else {
753 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
a0dbb045
JH
754 }
755 }
eb83ed87
KW
756 goto malformed;
757 }
758 uv = UNICODE_REPLACEMENT;
2f8f112e
KW
759
760 /* Skip testing for overlongs, as the REPLACEMENT may not be the same
761 * as what the original expectations were. */
762 do_overlong_test = FALSE;
eb83ed87
KW
763 if (retlen) {
764 *retlen = curlen;
765 }
766 }
b5b9af04 767 else if (UNLIKELY(curlen < expectlen)) {
eb83ed87
KW
768 if (! (flags & UTF8_ALLOW_SHORT)) {
769 if (! (flags & UTF8_CHECK_ONLY)) {
770 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
a0dbb045 771 }
eb83ed87
KW
772 goto malformed;
773 }
774 uv = UNICODE_REPLACEMENT;
2f8f112e 775 do_overlong_test = FALSE;
eb83ed87
KW
776 if (retlen) {
777 *retlen = curlen;
778 }
779 }
780
781#ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */
2f8f112e 782 if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */
eb83ed87
KW
783 && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
784 {
785 /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
786 * generation of the sv, since no warnings are raised under CHECK */
787 if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF
788 && ckWARN_d(WARN_UTF8))
789 {
42303544
KW
790 /* This message is deliberately not of the same syntax as the other
791 * messages for malformations, for backwards compatibility in the
792 * unlikely event that code is relying on its precise earlier text
793 */
eb83ed87
KW
794 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0));
795 pack_warn = packWARN(WARN_UTF8);
796 }
797 if (flags & UTF8_DISALLOW_FE_FF) {
798 goto malformed;
ba210ebe 799 }
ba210ebe 800 }
b5b9af04 801 if (UNLIKELY(overflowed)) {
ba210ebe 802
eb83ed87
KW
803 /* If the first byte is FF, it will overflow a 32-bit word. If the
804 * first byte is FE, it will overflow a signed 32-bit word. The
805 * above preserves backward compatibility, since its message was used
806 * in earlier versions of this code in preference to overflow */
807 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
ba210ebe 808 goto malformed;
eb83ed87
KW
809 }
810#endif
811
2f8f112e 812 if (do_overlong_test
5aaebcb3 813 && expectlen > (STRLEN) OFFUNISKIP(uv)
2f8f112e
KW
814 && ! (flags & UTF8_ALLOW_LONG))
815 {
eb83ed87
KW
816 /* The overlong malformation has lower precedence than the others.
817 * Note that if this malformation is allowed, we return the actual
818 * value, instead of the replacement character. This is because this
819 * value is actually well-defined. */
820 if (! (flags & UTF8_CHECK_ONLY)) {
5aaebcb3 821 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
eb83ed87
KW
822 }
823 goto malformed;
824 }
825
1a89bb6c 826 /* Here, the input is considered to be well-formed, but it still could be a
eb83ed87
KW
827 * problematic code point that is not allowed by the input parameters. */
828 if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
829 && (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
830 |UTF8_WARN_ILLEGAL_INTERCHANGE)))
831 {
949cf498 832 if (UNICODE_IS_SURROGATE(uv)) {
eb83ed87 833 if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
54f4afef 834 && ckWARN_d(WARN_SURROGATE))
eb83ed87 835 {
111d382d 836 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
54f4afef 837 pack_warn = packWARN(WARN_SURROGATE);
949cf498
KW
838 }
839 if (flags & UTF8_DISALLOW_SURROGATE) {
840 goto disallowed;
841 }
842 }
949cf498 843 else if ((uv > PERL_UNICODE_MAX)) {
eb83ed87 844 if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
54f4afef 845 && ckWARN_d(WARN_NON_UNICODE))
eb83ed87 846 {
111d382d 847 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
54f4afef 848 pack_warn = packWARN(WARN_NON_UNICODE);
949cf498
KW
849 }
850 if (flags & UTF8_DISALLOW_SUPER) {
851 goto disallowed;
852 }
853 }
4190d317
KW
854 else if (UNICODE_IS_NONCHAR(uv)) {
855 if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
54f4afef 856 && ckWARN_d(WARN_NONCHAR))
4190d317
KW
857 {
858 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
54f4afef 859 pack_warn = packWARN(WARN_NONCHAR);
4190d317
KW
860 }
861 if (flags & UTF8_DISALLOW_NONCHAR) {
862 goto disallowed;
863 }
864 }
949cf498 865
eb83ed87 866 if (sv) {
de69f3af
KW
867 outlier_ret = uv; /* Note we don't bother to convert to native,
868 as all the outlier code points are the same
869 in both ASCII and EBCDIC */
eb83ed87
KW
870 goto do_warn;
871 }
872
949cf498
KW
873 /* Here, this is not considered a malformed character, so drop through
874 * to return it */
a0ed51b3 875 }
ba210ebe 876
de69f3af 877 return UNI_TO_NATIVE(uv);
ba210ebe 878
eb83ed87
KW
879 /* There are three cases which get to beyond this point. In all 3 cases:
880 * <sv> if not null points to a string to print as a warning.
881 * <curlen> is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
882 * set.
883 * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
884 * This is done by initializing it to 0, and changing it only
885 * for case 1).
886 * The 3 cases are:
887 * 1) The input is valid but problematic, and to be warned about. The
888 * return value is the resultant code point; <*retlen> is set to
889 * <curlen>, the number of bytes that comprise the code point.
890 * <pack_warn> contains the result of packWARN() for the warning
891 * types. The entry point for this case is the label <do_warn>;
892 * 2) The input is a valid code point but disallowed by the parameters to
893 * this function. The return value is 0. If UTF8_CHECK_ONLY is set,
894 * <*retlen> is -1; otherwise it is <curlen>, the number of bytes that
895 * comprise the code point. <pack_warn> contains the result of
896 * packWARN() for the warning types. The entry point for this case is
897 * the label <disallowed>.
898 * 3) The input is malformed. The return value is 0. If UTF8_CHECK_ONLY
899 * is set, <*retlen> is -1; otherwise it is <curlen>, the number of
900 * bytes that comprise the malformation. All such malformations are
901 * assumed to be warning type <utf8>. The entry point for this case
902 * is the label <malformed>.
903 */
949cf498 904
ba210ebe
JH
905malformed:
906
eb83ed87
KW
907 if (sv && ckWARN_d(WARN_UTF8)) {
908 pack_warn = packWARN(WARN_UTF8);
909 }
910
911disallowed:
912
fcc8fcf6 913 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 914 if (retlen)
10edeb5d 915 *retlen = ((STRLEN) -1);
ba210ebe
JH
916 return 0;
917 }
918
eb83ed87 919do_warn:
5b311467 920
eb83ed87
KW
921 if (pack_warn) { /* <pack_warn> was initialized to 0, and changed only
922 if warnings are to be raised. */
f555bc63 923 const char * const string = SvPVX_const(sv);
a0dbb045 924
f555bc63
KW
925 if (PL_op)
926 Perl_warner(aTHX_ pack_warn, "%s in %s", string, OP_DESC(PL_op));
927 else
928 Perl_warner(aTHX_ pack_warn, "%s", string);
a0dbb045
JH
929 }
930
eb83ed87
KW
931 if (retlen) {
932 *retlen = curlen;
933 }
ba210ebe 934
eb83ed87 935 return outlier_ret;
a0ed51b3
LW
936}
937
8e84507e 938/*
ec5f19d0
KW
939=for apidoc utf8_to_uvchr_buf
940
941Returns the native code point of the first character in the string C<s> which
942is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 943C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 944
524080c4
KW
945If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
946enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
173db420
KW
947NULL) to -1. If those warnings are off, the computed value, if well-defined
948(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
949C<*retlen> is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is
950the next possible position in C<s> that could begin a non-malformed character.
de69f3af 951See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 952returned.
ec5f19d0
KW
953
954=cut
955*/
956
957
958UV
959Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
960{
ec5f19d0
KW
961 assert(s < send);
962
963 return utf8n_to_uvchr(s, send - s, retlen,
964 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
965}
966
27d6c58a 967/* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
3986bb7c 968 * there are no malformations in the input UTF-8 string C<s>. Surrogates,
57b0056d 969 * non-character code points, and non-Unicode code points are allowed. */
27d6c58a
KW
970
971UV
972Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
973{
010ab96b
KW
974 UV expectlen = UTF8SKIP(s);
975 const U8* send = s + expectlen;
9ff2f0f7 976 UV uv = *s;
3986bb7c 977
27d6c58a
KW
978 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
979
010ab96b
KW
980 if (retlen) {
981 *retlen = expectlen;
982 }
983
984 /* An invariant is trivially returned */
985 if (expectlen == 1) {
9ff2f0f7 986 return uv;
010ab96b
KW
987 }
988
9ff2f0f7
KW
989#ifdef EBCDIC
990 uv = NATIVE_UTF8_TO_I8(uv);
991#endif
992
010ab96b
KW
993 /* Remove the leading bits that indicate the number of bytes, leaving just
994 * the bits that are part of the value */
995 uv &= UTF_START_MASK(expectlen);
996
997 /* Now, loop through the remaining bytes, accumulating each into the
998 * working total as we go. (I khw tried unrolling the loop for up to 4
999 * bytes, but there was no performance improvement) */
1000 for (++s; s < send; s++) {
1001 uv = UTF8_ACCUMULATE(uv, *s);
1002 }
1003
3986bb7c 1004 return UNI_TO_NATIVE(uv);
010ab96b 1005
27d6c58a
KW
1006}
1007
ec5f19d0 1008/*
87cea99e 1009=for apidoc utf8_to_uvchr
9041c2e3 1010
6ee84de2 1011Returns the native code point of the first character in the string C<s>
1e54db1a 1012which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
1013length, in bytes, of that character.
1014
4b88fb76 1015Some, but not all, UTF-8 malformations are detected, and in fact, some
977c1d31
KW
1016malformed input could cause reading beyond the end of the input buffer, which
1017is why this function is deprecated. Use L</utf8_to_uvchr_buf> instead.
4b88fb76 1018
524080c4
KW
1019If C<s> points to one of the detected malformations, and UTF8 warnings are
1020enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1021NULL) to -1. If those warnings are off, the computed value if well-defined (or
1022the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1023is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1024next possible position in C<s> that could begin a non-malformed character.
de69f3af 1025See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
9041c2e3
NIS
1026
1027=cut
1028*/
1029
1030UV
7fc63493 1031Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 1032{
7918f24d
NC
1033 PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
1034
2ff6c191 1035 return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
9041c2e3
NIS
1036}
1037
1038/*
ec5f19d0
KW
1039=for apidoc utf8_to_uvuni_buf
1040
de69f3af
KW
1041Only in very rare circumstances should code need to be dealing in Unicode
1042(as opposed to native) code points. In those few cases, use
1043C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
4f83cdcd
KW
1044
1045Returns the Unicode (not-native) code point of the first character in the
1046string C<s> which
ec5f19d0
KW
1047is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1048C<retlen> will be set to the length, in bytes, of that character.
1049
524080c4
KW
1050If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1051enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1052NULL) to -1. If those warnings are off, the computed value if well-defined (or
1053the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1054is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1055next possible position in C<s> that could begin a non-malformed character.
de69f3af 1056See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0
KW
1057
1058=cut
1059*/
1060
1061UV
1062Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1063{
1064 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1065
1066 assert(send > s);
1067
1068 /* Call the low level routine asking for checks */
de69f3af
KW
1069 return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
1070 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
ec5f19d0
KW
1071}
1072
5495102a
KW
1073/* DEPRECATED!
1074 * Like L</utf8_to_uvuni_buf>(), but should only be called when it is known that
2114036c 1075 * there are no malformations in the input UTF-8 string C<s>. Surrogates,
3986bb7c 1076 * non-character code points, and non-Unicode code points are allowed */
27d6c58a
KW
1077
1078UV
1079Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
1080{
1081 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI;
1082
010ab96b 1083 return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
27d6c58a
KW
1084}
1085
ec5f19d0 1086/*
87cea99e 1087=for apidoc utf8_to_uvuni
9041c2e3
NIS
1088
1089Returns the Unicode code point of the first character in the string C<s>
1e54db1a 1090which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
1091length, in bytes, of that character.
1092
4b88fb76 1093Some, but not all, UTF-8 malformations are detected, and in fact, some
977c1d31 1094malformed input could cause reading beyond the end of the input buffer, which
4f83cdcd
KW
1095is one reason why this function is deprecated. The other is that only in
1096extremely limited circumstances should the Unicode versus native code point be
de69f3af 1097of any interest to you. See L</utf8_to_uvuni_buf> for alternatives.
9041c2e3 1098
524080c4
KW
1099If C<s> points to one of the detected malformations, and UTF8 warnings are
1100enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1101NULL) to -1. If those warnings are off, the computed value if well-defined (or
1102the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1103is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1104next possible position in C<s> that could begin a non-malformed character.
de69f3af 1105See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
8e84507e
NIS
1106
1107=cut
1108*/
1109
1110UV
7fc63493 1111Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 1112{
7918f24d
NC
1113 PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
1114
5495102a 1115 return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
8e84507e
NIS
1116}
1117
b76347f2 1118/*
87cea99e 1119=for apidoc utf8_length
b76347f2
JH
1120
1121Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
1122Stops at C<e> (exclusive). If C<e E<lt> s> or if the scan would end
1123up past C<e>, a UTF8 warning is raised (if enabled) and the number of complete characters found is returned.
b76347f2
JH
1124
1125=cut
1126*/
1127
1128STRLEN
35a4481c 1129Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 1130{
97aff369 1131 dVAR;
b76347f2
JH
1132 STRLEN len = 0;
1133
7918f24d
NC
1134 PERL_ARGS_ASSERT_UTF8_LENGTH;
1135
8850bf83
JH
1136 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1137 * the bitops (especially ~) can create illegal UTF-8.
1138 * In other words: in Perl UTF-8 is not just for Unicode. */
1139
a3b680e6
AL
1140 if (e < s)
1141 goto warn_and_return;
b76347f2 1142 while (s < e) {
4cbf4130 1143 s += UTF8SKIP(s);
8e91ec7f
AV
1144 len++;
1145 }
1146
1147 if (e != s) {
1148 len--;
1149 warn_and_return:
9b387841
NC
1150 if (PL_op)
1151 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1152 "%s in %s", unees, OP_DESC(PL_op));
1153 else
61a12c31 1154 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
1155 }
1156
1157 return len;
1158}
1159
b06226ff 1160/*
87cea99e 1161=for apidoc utf8_distance
b06226ff 1162
1e54db1a 1163Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
1164and C<b>.
1165
1166WARNING: use only if you *know* that the pointers point inside the
1167same UTF-8 buffer.
1168
37607a96
PK
1169=cut
1170*/
a0ed51b3 1171
02eb7b47 1172IV
35a4481c 1173Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 1174{
7918f24d
NC
1175 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1176
bf1665bc 1177 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3
LW
1178}
1179
b06226ff 1180/*
87cea99e 1181=for apidoc utf8_hop
b06226ff 1182
8850bf83
JH
1183Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1184forward or backward.
b06226ff
JH
1185
1186WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
1187the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1188on the first byte of character or just after the last byte of a character.
b06226ff 1189
37607a96
PK
1190=cut
1191*/
a0ed51b3
LW
1192
1193U8 *
4373e329 1194Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 1195{
7918f24d
NC
1196 PERL_ARGS_ASSERT_UTF8_HOP;
1197
96a5add6 1198 PERL_UNUSED_CONTEXT;
8850bf83
JH
1199 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1200 * the bitops (especially ~) can create illegal UTF-8.
1201 * In other words: in Perl UTF-8 is not just for Unicode. */
1202
a0ed51b3
LW
1203 if (off >= 0) {
1204 while (off--)
1205 s += UTF8SKIP(s);
1206 }
1207 else {
1208 while (off++) {
1209 s--;
8850bf83
JH
1210 while (UTF8_IS_CONTINUATION(*s))
1211 s--;
a0ed51b3
LW
1212 }
1213 }
4373e329 1214 return (U8 *)s;
a0ed51b3
LW
1215}
1216
6940069f 1217/*
fed3ba5d
NC
1218=for apidoc bytes_cmp_utf8
1219
a1433954 1220Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
1221sequence of characters (stored as UTF-8)
1222in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
1223equal, -1 or -2 if the first string is less than the second string, +1 or +2
1224if the first string is greater than the second string.
1225
1226-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
1227longer string. -2 or +2 is returned if
1228there was a difference between characters
fed3ba5d
NC
1229within the strings.
1230
1231=cut
1232*/
1233
1234int
1235Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1236{
1237 const U8 *const bend = b + blen;
1238 const U8 *const uend = u + ulen;
1239
1240 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
1241
1242 PERL_UNUSED_CONTEXT;
1243
1244 while (b < bend && u < uend) {
1245 U8 c = *u++;
1246 if (!UTF8_IS_INVARIANT(c)) {
1247 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1248 if (u < uend) {
1249 U8 c1 = *u++;
1250 if (UTF8_IS_CONTINUATION(c1)) {
94bb8c36 1251 c = TWO_BYTE_UTF8_TO_NATIVE(c, c1);
fed3ba5d
NC
1252 } else {
1253 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1254 "Malformed UTF-8 character "
1255 "(unexpected non-continuation byte 0x%02x"
1256 ", immediately after start byte 0x%02x)"
1257 /* Dear diag.t, it's in the pod. */
1258 "%s%s", c1, c,
1259 PL_op ? " in " : "",
1260 PL_op ? OP_DESC(PL_op) : "");
1261 return -2;
1262 }
1263 } else {
1264 if (PL_op)
1265 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1266 "%s in %s", unees, OP_DESC(PL_op));
1267 else
61a12c31 1268 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
1269 return -2; /* Really want to return undef :-) */
1270 }
1271 } else {
1272 return -2;
1273 }
1274 }
1275 if (*b != c) {
1276 return *b < c ? -2 : +2;
1277 }
1278 ++b;
1279 }
1280
1281 if (b == bend && u == uend)
1282 return 0;
1283
1284 return b < bend ? +1 : -1;
1285}
1286
1287/*
87cea99e 1288=for apidoc utf8_to_bytes
6940069f 1289
2bbc8d55 1290Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954
KW
1291Unlike L</bytes_to_utf8>, this over-writes the original string, and
1292updates C<len> to contain the new length.
67e989fb 1293Returns zero on failure, setting C<len> to -1.
6940069f 1294
a1433954 1295If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 1296
6940069f
GS
1297=cut
1298*/
1299
1300U8 *
37607a96 1301Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 1302{
d4c19fe8
AL
1303 U8 * const save = s;
1304 U8 * const send = s + *len;
6940069f 1305 U8 *d;
246fae53 1306
7918f24d
NC
1307 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
1308
1e54db1a 1309 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 1310 while (s < send) {
d59937ca
KW
1311 if (! UTF8_IS_INVARIANT(*s)) {
1312 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1313 *len = ((STRLEN) -1);
1314 return 0;
1315 }
1316 s++;
dcad2880 1317 }
d59937ca 1318 s++;
246fae53 1319 }
dcad2880
JH
1320
1321 d = s = save;
6940069f 1322 while (s < send) {
80e0b38f
KW
1323 U8 c = *s++;
1324 if (! UTF8_IS_INVARIANT(c)) {
1325 /* Then it is two-byte encoded */
1326 c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
1327 s++;
1328 }
1329 *d++ = c;
6940069f
GS
1330 }
1331 *d = '\0';
246fae53 1332 *len = d - save;
6940069f
GS
1333 return save;
1334}
1335
1336/*
87cea99e 1337=for apidoc bytes_from_utf8
f9a63242 1338
2bbc8d55 1339Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954 1340Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
1341the newly-created string, and updates C<len> to contain the new
1342length. Returns the original string if no conversion occurs, C<len>
72d33970 1343is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55
SP
13440 if C<s> is converted or consisted entirely of characters that are invariant
1345in utf8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 1346
37607a96
PK
1347=cut
1348*/
f9a63242
JH
1349
1350U8 *
e1ec3a88 1351Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 1352{
f9a63242 1353 U8 *d;
e1ec3a88
AL
1354 const U8 *start = s;
1355 const U8 *send;
f9a63242
JH
1356 I32 count = 0;
1357
7918f24d
NC
1358 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
1359
96a5add6 1360 PERL_UNUSED_CONTEXT;
f9a63242 1361 if (!*is_utf8)
73d840c0 1362 return (U8 *)start;
f9a63242 1363
1e54db1a 1364 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 1365 for (send = s + *len; s < send;) {
d59937ca
KW
1366 if (! UTF8_IS_INVARIANT(*s)) {
1367 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
73d840c0 1368 return (U8 *)start;
d59937ca
KW
1369 }
1370 count++;
1371 s++;
db42d148 1372 }
d59937ca 1373 s++;
f9a63242
JH
1374 }
1375
35da51f7 1376 *is_utf8 = FALSE;
f9a63242 1377
212542aa 1378 Newx(d, (*len) - count + 1, U8);
ef9edfd0 1379 s = start; start = d;
f9a63242
JH
1380 while (s < send) {
1381 U8 c = *s++;
1a91c45d 1382 if (! UTF8_IS_INVARIANT(c)) {
c4d5f83a 1383 /* Then it is two-byte encoded */
1a91c45d
KW
1384 c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
1385 s++;
c4d5f83a
NIS
1386 }
1387 *d++ = c;
f9a63242
JH
1388 }
1389 *d = '\0';
1390 *len = d - start;
73d840c0 1391 return (U8 *)start;
f9a63242
JH
1392}
1393
1394/*
87cea99e 1395=for apidoc bytes_to_utf8
6940069f 1396
ff97e5cf
KW
1397Converts a string C<s> of length C<len> bytes from the native encoding into
1398UTF-8.
6662521e 1399Returns a pointer to the newly-created string, and sets C<len> to
ff97e5cf 1400reflect the new length in bytes.
6940069f 1401
2bbc8d55
SP
1402A NUL character will be written after the end of the string.
1403
1404If you want to convert to UTF-8 from encodings other than
1405the native (Latin1 or EBCDIC),
a1433954 1406see L</sv_recode_to_utf8>().
c9ada85f 1407
497711e7 1408=cut
6940069f
GS
1409*/
1410
c682ebef
FC
1411/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1412 likewise need duplication. */
1413
6940069f 1414U8*
35a4481c 1415Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 1416{
35a4481c 1417 const U8 * const send = s + (*len);
6940069f
GS
1418 U8 *d;
1419 U8 *dst;
7918f24d
NC
1420
1421 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 1422 PERL_UNUSED_CONTEXT;
6940069f 1423
212542aa 1424 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
1425 dst = d;
1426
1427 while (s < send) {
55d09dc8
KW
1428 append_utf8_from_native_byte(*s, &d);
1429 s++;
6940069f
GS
1430 }
1431 *d = '\0';
6662521e 1432 *len = d-dst;
6940069f
GS
1433 return dst;
1434}
1435
a0ed51b3 1436/*
dea0fc0b 1437 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
1438 *
1439 * Destination must be pre-extended to 3/2 source. Do not use in-place.
1440 * We optimize for native, for obvious reasons. */
1441
1442U8*
dea0fc0b 1443Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 1444{
dea0fc0b
JH
1445 U8* pend;
1446 U8* dstart = d;
1447
7918f24d
NC
1448 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1449
dea0fc0b 1450 if (bytelen & 1)
f5992bc4 1451 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
1452
1453 pend = p + bytelen;
1454
a0ed51b3 1455 while (p < pend) {
dea0fc0b
JH
1456 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1457 p += 2;
56d37426
KW
1458 if (UNI_IS_INVARIANT(uv)) {
1459 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
1460 continue;
1461 }
56d37426
KW
1462 if (uv <= MAX_UTF8_TWO_BYTE) {
1463 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1464 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
1465 continue;
1466 }
46956fad
KW
1467#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1468#define LAST_HIGH_SURROGATE 0xDBFF
1469#define FIRST_LOW_SURROGATE 0xDC00
1470#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
1471 if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) {
01ea242b 1472 if (p >= pend) {
dea0fc0b 1473 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
01ea242b
NC
1474 } else {
1475 UV low = (p[0] << 8) + p[1];
1476 p += 2;
46956fad 1477 if (low < FIRST_LOW_SURROGATE || low > LAST_LOW_SURROGATE)
01ea242b 1478 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
46956fad
KW
1479 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
1480 + (low - FIRST_LOW_SURROGATE) + 0x10000;
01ea242b 1481 }
46956fad 1482 } else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) {
dbde1951 1483 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3 1484 }
56d37426
KW
1485#ifdef EBCDIC
1486 d = uvoffuni_to_utf8_flags(d, uv, 0);
1487#else
a0ed51b3 1488 if (uv < 0x10000) {
eb160463
GS
1489 *d++ = (U8)(( uv >> 12) | 0xe0);
1490 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1491 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1492 continue;
1493 }
1494 else {
eb160463
GS
1495 *d++ = (U8)(( uv >> 18) | 0xf0);
1496 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1497 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1498 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1499 continue;
1500 }
56d37426 1501#endif
a0ed51b3 1502 }
dea0fc0b 1503 *newlen = d - dstart;
a0ed51b3
LW
1504 return d;
1505}
1506
1507/* Note: this one is slightly destructive of the source. */
1508
1509U8*
dea0fc0b 1510Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
1511{
1512 U8* s = (U8*)p;
d4c19fe8 1513 U8* const send = s + bytelen;
7918f24d
NC
1514
1515 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1516
e0ea5e2d
NC
1517 if (bytelen & 1)
1518 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1519 (UV)bytelen);
1520
a0ed51b3 1521 while (s < send) {
d4c19fe8 1522 const U8 tmp = s[0];
a0ed51b3
LW
1523 s[0] = s[1];
1524 s[1] = tmp;
1525 s += 2;
1526 }
dea0fc0b 1527 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
1528}
1529
922e8cb4
KW
1530bool
1531Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
1532{
1533 U8 tmpbuf[UTF8_MAXBYTES+1];
1534 uvchr_to_utf8(tmpbuf, c);
1535 return _is_utf8_FOO(classnum, tmpbuf);
1536}
1537
c3fd2246
KW
1538/* for now these are all defined (inefficiently) in terms of the utf8 versions.
1539 * Note that the macros in handy.h that call these short-circuit calling them
1540 * for Latin-1 range inputs */
a0ed51b3
LW
1541
1542bool
84afefe6 1543Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 1544{
89ebb4a3 1545 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1546 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1547 return _is_utf8_FOO(_CC_WORDCHAR, tmpbuf);
a0ed51b3
LW
1548}
1549
1550bool
5092f92a
KW
1551Perl_is_uni_alnumc(pTHX_ UV c)
1552{
1553 U8 tmpbuf[UTF8_MAXBYTES+1];
1554 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1555 return _is_utf8_FOO(_CC_ALPHANUMERIC, tmpbuf);
5092f92a
KW
1556}
1557
f9ae8fb6
JD
1558/* Internal function so we can deprecate the external one, and call
1559 this one from other deprecated functions in this file */
1560
1561PERL_STATIC_INLINE bool
61b19385
KW
1562S_is_utf8_idfirst(pTHX_ const U8 *p)
1563{
1564 dVAR;
1565
1566 if (*p == '_')
1567 return TRUE;
1568 /* is_utf8_idstart would be more logical. */
1569 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1570}
1571
5092f92a 1572bool
84afefe6 1573Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1574{
89ebb4a3 1575 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1576 uvchr_to_utf8(tmpbuf, c);
61b19385 1577 return S_is_utf8_idfirst(aTHX_ tmpbuf);
a0ed51b3
LW
1578}
1579
1580bool
eba68aa0
KW
1581Perl__is_uni_perl_idcont(pTHX_ UV c)
1582{
1583 U8 tmpbuf[UTF8_MAXBYTES+1];
1584 uvchr_to_utf8(tmpbuf, c);
1585 return _is_utf8_perl_idcont(tmpbuf);
1586}
1587
1588bool
f91dcd13
KW
1589Perl__is_uni_perl_idstart(pTHX_ UV c)
1590{
1591 U8 tmpbuf[UTF8_MAXBYTES+1];
1592 uvchr_to_utf8(tmpbuf, c);
1593 return _is_utf8_perl_idstart(tmpbuf);
1594}
1595
1596bool
84afefe6 1597Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1598{
89ebb4a3 1599 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1600 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1601 return _is_utf8_FOO(_CC_ALPHA, tmpbuf);
a0ed51b3
LW
1602}
1603
1604bool
84afefe6 1605Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1606{
bc39fe24 1607 return isASCII(c);
4d61ec05
GS
1608}
1609
1610bool
bdd8600f
KW
1611Perl_is_uni_blank(pTHX_ UV c)
1612{
2cafb56b 1613 return isBLANK_uni(c);
bdd8600f
KW
1614}
1615
1616bool
84afefe6 1617Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1618{
add4123a 1619 return isSPACE_uni(c);
a0ed51b3
LW
1620}
1621
1622bool
84afefe6 1623Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1624{
89ebb4a3 1625 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1626 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1627 return _is_utf8_FOO(_CC_DIGIT, tmpbuf);
a0ed51b3
LW
1628}
1629
1630bool
84afefe6 1631Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1632{
89ebb4a3 1633 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1634 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1635 return _is_utf8_FOO(_CC_UPPER, tmpbuf);
a0ed51b3
LW
1636}
1637
1638bool
84afefe6 1639Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1640{
89ebb4a3 1641 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1642 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1643 return _is_utf8_FOO(_CC_LOWER, tmpbuf);
a0ed51b3
LW
1644}
1645
1646bool
84afefe6 1647Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1648{
7b952154 1649 return isCNTRL_L1(c);
b8c5462f
JH
1650}
1651
1652bool
84afefe6 1653Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1654{
89ebb4a3 1655 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1656 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1657 return _is_utf8_FOO(_CC_GRAPH, tmpbuf);
b8c5462f
JH
1658}
1659
1660bool
84afefe6 1661Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1662{
89ebb4a3 1663 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1664 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1665 return _is_utf8_FOO(_CC_PRINT, tmpbuf);
a0ed51b3
LW
1666}
1667
b8c5462f 1668bool
84afefe6 1669Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1670{
89ebb4a3 1671 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1672 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1673 return _is_utf8_FOO(_CC_PUNCT, tmpbuf);
b8c5462f
JH
1674}
1675
4d61ec05 1676bool
84afefe6 1677Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1678{
4ac6419d 1679 return isXDIGIT_uni(c);
4d61ec05
GS
1680}
1681
3a4c58c9
KW
1682UV
1683Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
1684{
1685 /* We have the latin1-range values compiled into the core, so just use
1686 * those, converting the result to utf8. The only difference between upper
1687 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
1688 * either "SS" or "Ss". Which one to use is passed into the routine in
1689 * 'S_or_s' to avoid a test */
1690
1691 UV converted = toUPPER_LATIN1_MOD(c);
1692
1693 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
1694
1695 assert(S_or_s == 'S' || S_or_s == 's');
1696
6f2d5cbc 1697 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 1698 characters in this range */
3a4c58c9
KW
1699 *p = (U8) converted;
1700 *lenp = 1;
1701 return converted;
1702 }
1703
1704 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
1705 * which it maps to one of them, so as to only have to have one check for
1706 * it in the main case */
1707 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1708 switch (c) {
1709 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1710 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
1711 break;
1712 case MICRO_SIGN:
1713 converted = GREEK_CAPITAL_LETTER_MU;
1714 break;
1715 case LATIN_SMALL_LETTER_SHARP_S:
1716 *(p)++ = 'S';
1717 *p = S_or_s;
1718 *lenp = 2;
1719 return 'S';
1720 default:
1721 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
118e2215 1722 assert(0); /* NOTREACHED */
3a4c58c9
KW
1723 }
1724 }
1725
1726 *(p)++ = UTF8_TWO_BYTE_HI(converted);
1727 *p = UTF8_TWO_BYTE_LO(converted);
1728 *lenp = 2;
1729
1730 return converted;
1731}
1732
50bda2c3
KW
/* Wrappers around Perl_to_utf8_case() that convert one UTF-8 encoded
 * character to the indicated case.  The result may be more than one
 * character.
 *      INP     points to the first byte of the input character
 *      OUTP    is where the changed character(s) are written; it needs to
 *              have space for UTF8_MAXBYTES_CASE+1 bytes
 *      LENP    is set to the length in bytes of what was written to OUTP
 *
 * Each evaluates to the ordinal of the first character written to OUTP */
#define CALL_UPPER_CASE(INP, OUTP, LENP)                                    \
    Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "")
#define CALL_TITLE_CASE(INP, OUTP, LENP)                                    \
    Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "")
#define CALL_LOWER_CASE(INP, OUTP, LENP)                                    \
    Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "")

/* The folding wrapper has the extra input SPECIALS.  Non-zero means the
 * SPECIALS hash is consulted (i.e., full case folding); zero means a simple
 * case fold */
#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS)                           \
    Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf",       \
                      (SPECIALS) ? "" : NULL)
c3fd2246 1749
84afefe6
JH
1750UV
1751Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1752{
3a4c58c9
KW
1753 dVAR;
1754
a1433954
KW
1755 /* Convert the Unicode character whose ordinal is <c> to its uppercase
1756 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
1757 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
1758 * the changed version may be longer than the original character.
1759 *
1760 * The ordinal of the first character of the changed version is returned
1761 * (but note, as explained above, that there may be more.) */
1762
7918f24d
NC
1763 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1764
3a4c58c9
KW
1765 if (c < 256) {
1766 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
1767 }
1768
0ebc6274 1769 uvchr_to_utf8(p, c);
3a4c58c9 1770 return CALL_UPPER_CASE(p, p, lenp);
a0ed51b3
LW
1771}
1772
84afefe6
JH
1773UV
1774Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1775{
3a4c58c9
KW
1776 dVAR;
1777
7918f24d
NC
1778 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1779
3a4c58c9
KW
1780 if (c < 256) {
1781 return _to_upper_title_latin1((U8) c, p, lenp, 's');
1782 }
1783
0ebc6274 1784 uvchr_to_utf8(p, c);
3a4c58c9 1785 return CALL_TITLE_CASE(p, p, lenp);
a0ed51b3
LW
1786}
1787
afc16117
KW
1788STATIC U8
1789S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
1790{
1791 /* We have the latin1-range values compiled into the core, so just use
1792 * those, converting the result to utf8. Since the result is always just
a1433954 1793 * one character, we allow <p> to be NULL */
afc16117
KW
1794
1795 U8 converted = toLOWER_LATIN1(c);
1796
1797 if (p != NULL) {
6f2d5cbc 1798 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
1799 *p = converted;
1800 *lenp = 1;
1801 }
1802 else {
1803 *p = UTF8_TWO_BYTE_HI(converted);
1804 *(p+1) = UTF8_TWO_BYTE_LO(converted);
1805 *lenp = 2;
1806 }
1807 }
1808 return converted;
1809}
1810
84afefe6
JH
1811UV
1812Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1813{
968c5e6a
KW
1814 dVAR;
1815
7918f24d
NC
1816 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1817
afc16117
KW
1818 if (c < 256) {
1819 return to_lower_latin1((U8) c, p, lenp);
bca00c02
KW
1820 }
1821
afc16117 1822 uvchr_to_utf8(p, c);
968c5e6a 1823 return CALL_LOWER_CASE(p, p, lenp);
a0ed51b3
LW
1824}
1825
84afefe6 1826UV
51910141 1827Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 1828{
51910141 1829 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 1830 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 1831 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
1832 *
1833 * Not to be used for locale folds
51910141 1834 */
f673fad4 1835
a1dde8de
KW
1836 UV converted;
1837
1838 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
1839
1ca267a5
KW
1840 assert (! (flags & FOLD_FLAGS_LOCALE));
1841
a1dde8de
KW
1842 if (c == MICRO_SIGN) {
1843 converted = GREEK_SMALL_LETTER_MU;
1844 }
51910141 1845 else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
1ca267a5
KW
1846
1847 /* If can't cross 127/128 boundary, can't return "ss"; instead return
1848 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
1849 * under those circumstances. */
1850 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
1851 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
1852 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
1853 p, *lenp, U8);
1854 return LATIN_SMALL_LETTER_LONG_S;
1855 }
1856 else {
4f489194
KW
1857 *(p)++ = 's';
1858 *p = 's';
1859 *lenp = 2;
1860 return 's';
1ca267a5 1861 }
a1dde8de
KW
1862 }
1863 else { /* In this range the fold of all other characters is their lower
1864 case */
1865 converted = toLOWER_LATIN1(c);
1866 }
1867
6f2d5cbc 1868 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
1869 *p = (U8) converted;
1870 *lenp = 1;
1871 }
1872 else {
1873 *(p)++ = UTF8_TWO_BYTE_HI(converted);
1874 *p = UTF8_TWO_BYTE_LO(converted);
1875 *lenp = 2;
1876 }
1877
1878 return converted;
1879}
1880
1881UV
a0270393 1882Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags)
84afefe6 1883{
4b593389 1884
a0270393
KW
1885 /* Not currently externally documented, and subject to change
1886 * <flags> bits meanings:
1887 * FOLD_FLAGS_FULL iff full folding is to be used;
1888 * FOLD_FLAGS_LOCALE iff in locale
1889 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1890 */
4b593389 1891
36bb2ab6 1892 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 1893
a1dde8de 1894 if (c < 256) {
a0270393 1895 UV result = _to_fold_latin1((U8) c, p, lenp,
1ca267a5 1896 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a0270393
KW
1897 /* It is illegal for the fold to cross the 255/256 boundary under
1898 * locale; in this case return the original */
1899 return (result > 256 && flags & FOLD_FLAGS_LOCALE)
1900 ? c
1901 : result;
a1dde8de
KW
1902 }
1903
a0270393
KW
1904 /* If no special needs, just use the macro */
1905 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
1906 uvchr_to_utf8(p, c);
1907 return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
1908 }
1909 else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
1910 the special flags. */
1911 U8 utf8_c[UTF8_MAXBYTES + 1];
1912 uvchr_to_utf8(utf8_c, c);
1913 return _to_utf8_fold_flags(utf8_c, p, lenp, flags, NULL);
1914 }
84afefe6
JH
1915}
1916
a0ed51b3 1917bool
84afefe6 1918Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 1919{
edfb3318 1920 if (c < 256) {
f4cd282c 1921 return isALNUM_LC(c);
edfb3318 1922 }
922e8cb4 1923 return _is_uni_FOO(_CC_WORDCHAR, c);
a0ed51b3
LW
1924}
1925
1926bool
5092f92a
KW
1927Perl_is_uni_alnumc_lc(pTHX_ UV c)
1928{
1929 if (c < 256) {
f4cd282c 1930 return isALPHANUMERIC_LC(c);
5092f92a 1931 }
922e8cb4 1932 return _is_uni_FOO(_CC_ALPHANUMERIC, c);
5092f92a
KW
1933}
1934
1935bool
84afefe6 1936Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 1937{
edfb3318 1938 if (c < 256) {
f4cd282c 1939 return isIDFIRST_LC(c);
edfb3318
KW
1940 }
1941 return _is_uni_perl_idstart(c);
a0ed51b3
LW
1942}
1943
1944bool
84afefe6 1945Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 1946{
edfb3318 1947 if (c < 256) {
f4cd282c 1948 return isALPHA_LC(c);
edfb3318 1949 }
922e8cb4 1950 return _is_uni_FOO(_CC_ALPHA, c);
a0ed51b3
LW
1951}
1952
1953bool
84afefe6 1954Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 1955{
edfb3318 1956 if (c < 256) {
f4cd282c 1957 return isASCII_LC(c);
edfb3318
KW
1958 }
1959 return 0;
4d61ec05
GS
1960}
1961
1962bool
bdd8600f
KW
1963Perl_is_uni_blank_lc(pTHX_ UV c)
1964{
edfb3318 1965 if (c < 256) {
f4cd282c 1966 return isBLANK_LC(c);
edfb3318 1967 }
61b19385 1968 return isBLANK_uni(c);
bdd8600f
KW
1969}
1970
1971bool
84afefe6 1972Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 1973{
edfb3318 1974 if (c < 256) {
f4cd282c 1975 return isSPACE_LC(c);
edfb3318 1976 }
61b19385 1977 return isSPACE_uni(c);
a0ed51b3
LW
1978}
1979
1980bool
84afefe6 1981Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 1982{
edfb3318 1983 if (c < 256) {
f4cd282c 1984 return isDIGIT_LC(c);
edfb3318 1985 }
922e8cb4 1986 return _is_uni_FOO(_CC_DIGIT, c);
a0ed51b3
LW
1987}
1988
1989bool
84afefe6 1990Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 1991{
edfb3318 1992 if (c < 256) {
f4cd282c 1993 return isUPPER_LC(c);
edfb3318 1994 }
922e8cb4 1995 return _is_uni_FOO(_CC_UPPER, c);
a0ed51b3
LW
1996}
1997
1998bool
84afefe6 1999Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 2000{
edfb3318 2001 if (c < 256) {
f4cd282c 2002 return isLOWER_LC(c);
edfb3318 2003 }
922e8cb4 2004 return _is_uni_FOO(_CC_LOWER, c);
a0ed51b3
LW
2005}
2006
2007bool
84afefe6 2008Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 2009{
edfb3318 2010 if (c < 256) {
f4cd282c 2011 return isCNTRL_LC(c);
edfb3318 2012 }
61b19385 2013 return 0;
b8c5462f
JH
2014}
2015
2016bool
84afefe6 2017Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 2018{
edfb3318 2019 if (c < 256) {
f4cd282c 2020 return isGRAPH_LC(c);
edfb3318 2021 }
922e8cb4 2022 return _is_uni_FOO(_CC_GRAPH, c);
b8c5462f
JH
2023}
2024
2025bool
84afefe6 2026Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 2027{
edfb3318 2028 if (c < 256) {
f4cd282c 2029 return isPRINT_LC(c);
edfb3318 2030 }
922e8cb4 2031 return _is_uni_FOO(_CC_PRINT, c);
a0ed51b3
LW
2032}
2033
b8c5462f 2034bool
84afefe6 2035Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 2036{
edfb3318 2037 if (c < 256) {
f4cd282c 2038 return isPUNCT_LC(c);
edfb3318 2039 }
922e8cb4 2040 return _is_uni_FOO(_CC_PUNCT, c);
b8c5462f
JH
2041}
2042
4d61ec05 2043bool
84afefe6 2044Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 2045{
edfb3318 2046 if (c < 256) {
f4cd282c 2047 return isXDIGIT_LC(c);
edfb3318 2048 }
61b19385 2049 return isXDIGIT_uni(c);
4d61ec05
GS
2050}
2051
b7ac61fa
JH
2052U32
2053Perl_to_uni_upper_lc(pTHX_ U32 c)
2054{
ee099d14
JH
2055 /* XXX returns only the first character -- do not use XXX */
2056 /* XXX no locale support yet */
2057 STRLEN len;
89ebb4a3 2058 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 2059 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
2060}
2061
2062U32
2063Perl_to_uni_title_lc(pTHX_ U32 c)
2064{
ee099d14
JH
2065 /* XXX returns only the first character XXX -- do not use XXX */
2066 /* XXX no locale support yet */
2067 STRLEN len;
89ebb4a3 2068 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 2069 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
2070}
2071
2072U32
2073Perl_to_uni_lower_lc(pTHX_ U32 c)
2074{
ee099d14
JH
2075 /* XXX returns only the first character -- do not use XXX */
2076 /* XXX no locale support yet */
2077 STRLEN len;
89ebb4a3 2078 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 2079 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
2080}
2081
26483009 2082PERL_STATIC_INLINE bool
5141f98e 2083S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
2084 const char *const swashname)
2085{
ea317ccb
KW
2086 /* returns a boolean giving whether or not the UTF8-encoded character that
2087 * starts at <p> is in the swash indicated by <swashname>. <swash>
2088 * contains a pointer to where the swash indicated by <swashname>
2089 * is to be stored; which this routine will do, so that future calls will
2090 * look at <*swash> and only generate a swash if it is not null
2091 *
2092 * Note that it is assumed that the buffer length of <p> is enough to
2093 * contain all the bytes that comprise the character. Thus, <*p> should
2094 * have been checked before this call for mal-formedness enough to assure
2095 * that. */
2096
97aff369 2097 dVAR;
7918f24d
NC
2098
2099 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2100
492a624f 2101 /* The API should have included a length for the UTF-8 character in <p>,
28123549 2102 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
2103 * as far as there being enough bytes available in it to accommodate the
2104 * character without reading beyond the end, and pass that number on to the
2105 * validating routine */
28123549
KW
2106 if (! is_utf8_char_buf(p, p + UTF8SKIP(p))) {
2107 if (ckWARN_d(WARN_UTF8)) {
2108 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
9816f121 2109 "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
28123549
KW
2110 if (ckWARN(WARN_UTF8)) { /* This will output details as to the
2111 what the malformation is */
2112 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
2113 }
2114 }
2115 return FALSE;
2116 }
87367d5f
KW
2117 if (!*swash) {
2118 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2119 *swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags);
2120 }
28123549 2121
bde6a22d
NC
2122 return swash_fetch(*swash, p, TRUE) != 0;
2123}
2124
2125bool
922e8cb4
KW
2126Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
2127{
2128 dVAR;
2129
2130 PERL_ARGS_ASSERT__IS_UTF8_FOO;
2131
2132 assert(classnum < _FIRST_NON_SWASH_CC);
2133
2134 return is_utf8_common(p, &PL_utf8_swash_ptrs[classnum], swash_property_names[classnum]);
2135}
2136
2137bool
7fc63493 2138Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 2139{
97aff369 2140 dVAR;
7918f24d
NC
2141
2142 PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
2143
671c33bf
NC
2144 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
2145 * descendant of isalnum(3), in other words, it doesn't
2146 * contain the '_'. --jhi */
03940dc2 2147 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_WORDCHAR], "IsWord");
a0ed51b3
LW
2148}
2149
2150bool
5092f92a
KW
2151Perl_is_utf8_alnumc(pTHX_ const U8 *p)
2152{
2153 dVAR;
2154
2155 PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
2156
03940dc2 2157 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHANUMERIC], "IsAlnum");
5092f92a
KW
2158}
2159
2160bool
7fc63493 2161Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 2162{
97aff369 2163 dVAR;
7918f24d
NC
2164
2165 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
2166
61b19385 2167 return S_is_utf8_idfirst(aTHX_ p);
82686b01
JH
2168}
2169
2170bool
c11ff943
KW
2171Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */
2172{
2173 dVAR;
2174
2175 PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
2176
2177 if (*p == '_')
2178 return TRUE;
2179 /* is_utf8_idstart would be more logical. */
2180 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
2181}
2182
2183bool
d65654cb 2184Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
b6912c02
KW
2185{
2186 dVAR;
2187
d65654cb 2188 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
b6912c02
KW
2189
2190 return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart");
2191}
2192
2193bool
eba68aa0
KW
2194Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
2195{
2196 dVAR;
2197
2198 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
2199
2200 return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont");
2201}
2202
2203
2204bool
7fc63493 2205Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 2206{
97aff369 2207 dVAR;
7918f24d
NC
2208
2209 PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
2210
d4c19fe8 2211 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
2212}
2213
2214bool
c11ff943
KW
2215Perl_is_utf8_xidcont(pTHX_ const U8 *p)
2216{
2217 dVAR;
2218
2219 PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
2220
c11ff943
KW
2221 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
2222}
2223
2224bool
7fc63493 2225Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 2226{
97aff369 2227 dVAR;
7918f24d
NC
2228
2229 PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
2230
03940dc2 2231 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHA], "IsAlpha");
a0ed51b3
LW
2232}
2233
2234bool
7fc63493 2235Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 2236{
97aff369 2237 dVAR;
7918f24d
NC
2238
2239 PERL_ARGS_ASSERT_IS_UTF8_ASCII;
2240
bc39fe24
KW
2241 /* ASCII characters are the same whether in utf8 or not. So the macro
2242 * works on both utf8 and non-utf8 representations. */
2243 return isASCII(*p);
b8c5462f
JH
2244}
2245
2246bool
bdd8600f
KW
2247Perl_is_utf8_blank(pTHX_ const U8 *p)
2248{
2249 dVAR;
2250
2251 PERL_ARGS_ASSERT_IS_UTF8_BLANK;
2252
2cafb56b 2253 return isBLANK_utf8(p);
bdd8600f
KW
2254}
2255
2256bool
7fc63493 2257Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 2258{
97aff369 2259 dVAR;
7918f24d
NC
2260
2261 PERL_ARGS_ASSERT_IS_UTF8_SPACE;
2262
add4123a 2263 return isSPACE_utf8(p);
a0ed51b3
LW
2264}
2265
2266bool
d1eb3177
YO
2267Perl_is_utf8_perl_space(pTHX_ const U8 *p)
2268{
2269 dVAR;
2270
2271 PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
2272
c4428693
KW
2273 /* Only true if is an ASCII space-like character, and ASCII is invariant
2274 * under utf8, so can just use the macro */
2275 return isSPACE_A(*p);
d1eb3177
YO
2276}
2277
2278bool
2279Perl_is_utf8_perl_word(pTHX_ const U8 *p)
2280{
2281 dVAR;
2282
2283 PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
2284
c4428693
KW
2285 /* Only true if is an ASCII word character, and ASCII is invariant
2286 * under utf8, so can just use the macro */
2287 return isWORDCHAR_A(*p);
d1eb3177
YO
2288}
2289
2290bool
7fc63493 2291Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 2292{
97aff369 2293 dVAR;
7918f24d
NC
2294
2295 PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
2296
03940dc2 2297 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_DIGIT], "IsDigit");
a0ed51b3
LW
2298}
2299
2300bool
d1eb3177
YO
2301Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
2302{
2303 dVAR;
2304
2305 PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
2306
c4428693
KW
2307 /* Only true if is an ASCII digit character, and ASCII is invariant
2308 * under utf8, so can just use the macro */
2309 return isDIGIT_A(*p);
d1eb3177
YO
2310}
2311
2312bool
7fc63493 2313Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 2314{
97aff369 2315 dVAR;
7918f24d
NC
2316
2317 PERL_ARGS_ASSERT_IS_UTF8_UPPER;
2318
03940dc2 2319 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_UPPER], "IsUppercase");
a0ed51b3
LW
2320}
2321
2322bool
7fc63493 2323Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 2324{
97aff369 2325 dVAR;
7918f24d
NC
2326
2327 PERL_ARGS_ASSERT_IS_UTF8_LOWER;
2328
03940dc2 2329 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_LOWER], "IsLowercase");
a0ed51b3
LW
2330}
2331
2332bool
7fc63493 2333Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 2334{
97aff369 2335 dVAR;
7918f24d
NC
2336
2337 PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
2338
a35d759a 2339 return isCNTRL_utf8(p);
b8c5462f
JH
2340}
2341
2342bool
7fc63493 2343Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 2344{
97aff369 2345 dVAR;
7918f24d
NC
2346
2347 PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
2348
03940dc2 2349 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_GRAPH], "IsGraph");
b8c5462f
JH
2350}
2351
2352bool
7fc63493 2353Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 2354{
97aff369 2355 dVAR;
7918f24d
NC
2356
2357 PERL_ARGS_ASSERT_IS_UTF8_PRINT;
2358
03940dc2 2359 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PRINT], "IsPrint");
a0ed51b3
LW
2360}
2361
2362bool
7fc63493 2363Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 2364{
97aff369 2365 dVAR;
7918f24d
NC
2366
2367 PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
2368
03940dc2 2369 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PUNCT], "IsPunct");
b8c5462f
JH
2370}
2371
2372bool
7fc63493 2373Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 2374{
97aff369 2375 dVAR;
7918f24d
NC
2376
2377 PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
2378
4ac6419d 2379 return is_XDIGIT_utf8(p);
b8c5462f
JH
2380}
2381
2382bool
7dbf68d2
KW
2383Perl__is_utf8_mark(pTHX_ const U8 *p)
2384{
2385 dVAR;
2386
2387 PERL_ARGS_ASSERT__IS_UTF8_MARK;
2388
2389 return is_utf8_common(p, &PL_utf8_mark, "IsM");
2390}
2391
2392
2393bool
7fc63493 2394Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 2395{
97aff369 2396 dVAR;
7918f24d
NC
2397
2398 PERL_ARGS_ASSERT_IS_UTF8_MARK;
2399
7dbf68d2 2400 return _is_utf8_mark(p);
a0ed51b3
LW
2401}
2402
6b5c0936 2403/*
87cea99e 2404=for apidoc to_utf8_case
6b5c0936 2405
6fae5207 2406C<p> contains the pointer to the UTF-8 string encoding
a1433954
KW
2407the character that is being converted. This routine assumes that the character
2408at C<p> is well-formed.
6b5c0936 2409
6fae5207
KW
2410C<ustrp> is a pointer to the character buffer to put the
2411conversion result to. C<lenp> is a pointer to the length
6b5c0936
JH
2412of the result.
2413
6fae5207 2414C<swashp> is a pointer to the swash to use.
6b5c0936 2415
a1433954 2416Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
6fae5207 2417and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>. C<special> (usually,
0134edef 2418but not always, a multicharacter mapping), is tried first.
6b5c0936 2419
4a8240a3
KW
2420C<special> is a string, normally C<NULL> or C<"">. C<NULL> means to not use
2421any special mappings; C<""> means to use the special mappings. Values other
2422than these two are treated as the name of the hash containing the special
2423mappings, like C<"utf8::ToSpecLower">.
6b5c0936 2424
6fae5207 2425C<normal> is a string like "ToLower" which means the swash
0134edef
JH
2426%utf8::ToLower.
2427
2428=cut */
6b5c0936 2429
2104c8d9 2430UV
9a957fbc
AL
2431Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
2432 SV **swashp, const char *normal, const char *special)
a0ed51b3 2433{
97aff369 2434 dVAR;
0134edef 2435 STRLEN len = 0;
f4cd282c 2436 const UV uv1 = valid_utf8_to_uvchr(p, NULL);
7918f24d
NC
2437
2438 PERL_ARGS_ASSERT_TO_UTF8_CASE;
2439
9ae3ac1a
KW
2440 /* Note that swash_fetch() doesn't output warnings for these because it
2441 * assumes we will */
8457b38f 2442 if (uv1 >= UNICODE_SURROGATE_FIRST) {
9ae3ac1a 2443 if (uv1 <= UNICODE_SURROGATE_LAST) {
8457b38f
KW
2444 if (ckWARN_d(WARN_SURROGATE)) {
2445 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2446 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
2447 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
2448 }
9ae3ac1a
KW
2449 }
2450 else if (UNICODE_IS_SUPER(uv1)) {
8457b38f
KW
2451 if (ckWARN_d(WARN_NON_UNICODE)) {
2452 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2453 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2454 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
2455 }
9ae3ac1a
KW
2456 }
2457
2458 /* Note that non-characters are perfectly legal, so no warning should
2459 * be given */
2460 }
2461
0134edef 2462 if (!*swashp) /* load on-demand */
5ab9d2ef 2463 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
0134edef 2464
a6f87d8c 2465 if (special) {
0134edef 2466 /* It might be "special" (sometimes, but not always,
2a37f04d 2467 * a multicharacter mapping) */
4a8240a3 2468 HV *hv = NULL;
b08cf34e
JH
2469 SV **svp;
2470
4a8240a3
KW
2471 /* If passed in the specials name, use that; otherwise use any
2472 * given in the swash */
2473 if (*special != '\0') {
2474 hv = get_hv(special, 0);
2475 }
2476 else {
2477 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
2478 if (svp) {
2479 hv = MUTABLE_HV(SvRV(*svp));
2480 }
2481 }
2482
176fe009
KW
2483 if (hv
2484 && (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE))
2485 && (*svp))
2486 {
cfd0369c 2487 const char *s;
47654450 2488
cfd0369c 2489 s = SvPV_const(*svp, len);
47654450 2490 if (len == 1)
f4cd282c 2491 /* EIGHTBIT */
c80e42f3 2492 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2a37f04d 2493 else {
d2dcd0fb 2494 Copy(s, ustrp, len, U8);
29e98929 2495 }
983ffd37 2496 }
0134edef
JH
2497 }
2498
2499 if (!len && *swashp) {
f4cd282c 2500 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is utf8 */);
d4c19fe8 2501
0134edef
JH
2502 if (uv2) {
2503 /* It was "normal" (a single character mapping). */
f4cd282c 2504 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2a37f04d
JH
2505 }
2506 }
1feea2c7 2507
cbe07460
KW
2508 if (len) {
2509 if (lenp) {
2510 *lenp = len;
2511 }
2512 return valid_utf8_to_uvchr(ustrp, 0);
2513 }
2514
2515 /* Here, there was no mapping defined, which means that the code point maps
2516 * to itself. Return the inputs */
bfdf22ec 2517 len = UTF8SKIP(p);
ca9fab46
KW
2518 if (p != ustrp) { /* Don't copy onto itself */
2519 Copy(p, ustrp, len, U8);
2520 }
0134edef 2521
2a37f04d
JH
2522 if (lenp)
2523 *lenp = len;
2524
f4cd282c 2525 return uv1;
cbe07460 2526
a0ed51b3
LW
2527}
2528
051a06d4
KW
2529STATIC UV
2530S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
2531{
2532 /* This is called when changing the case of a utf8-encoded character above
2533 * the Latin1 range, and the operation is in locale. If the result
2534 * contains a character that crosses the 255/256 boundary, disallow the
2535 * change, and return the original code point. See L<perlfunc/lc> for why;
2536 *
a1433954
KW
2537 * p points to the original string whose case was changed; assumed
2538 * by this routine to be well-formed
051a06d4
KW
2539 * result the code point of the first character in the changed-case string
2540 * ustrp points to the changed-case string (<result> represents its first char)
2541 * lenp points to the length of <ustrp> */
2542
2543 UV original; /* To store the first code point of <p> */
2544
2545 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2546
a4f12ed7 2547 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
2548
2549 /* We know immediately if the first character in the string crosses the
2550 * boundary, so can skip */
2551 if (result > 255) {
2552
2553 /* Look at every character in the result; if any cross the
2554 * boundary, the whole thing is disallowed */
2555 U8* s = ustrp + UTF8SKIP(ustrp);
2556 U8* e = ustrp + *lenp;
2557 while (s < e) {
a4f12ed7 2558 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
2559 goto bad_crossing;
2560 }
2561 s += UTF8SKIP(s);
2562 }
2563
2564 /* Here, no characters crossed, result is ok as-is */
2565 return result;
2566 }
2567
2568bad_crossing:
2569
2570 /* Failed, have to return the original */
4b88fb76 2571 original = valid_utf8_to_uvchr(p, lenp);
051a06d4
KW
2572 Copy(p, ustrp, *lenp, char);
2573 return original;
2574}
2575
d3e79532 2576/*
87cea99e 2577=for apidoc to_utf8_upper
d3e79532 2578
1f607577 2579Instead use L</toUPPER_utf8>.
a1433954 2580
d3e79532
JH
2581=cut */
2582
051a06d4
KW
2583/* Not currently externally documented, and subject to change:
2584 * <flags> is set iff locale semantics are to be used for code points < 256
2585 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2586 * were used in the calculation; otherwise unchanged. */
2587
2104c8d9 2588UV
051a06d4 2589Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
a0ed51b3 2590{
97aff369 2591 dVAR;
7918f24d 2592
051a06d4
KW
2593 UV result;
2594
2595 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 2596
3a4c58c9 2597 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2598 if (flags) {
2599 result = toUPPER_LC(*p);
2600 }
2601 else {
81c6c7ce 2602 return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
051a06d4 2603 }
3a4c58c9
KW
2604 }
2605 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2606 if (flags) {
a6d8b88b 2607 U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2608 result = toUPPER_LC(c);
051a06d4
KW
2609 }
2610 else {
94bb8c36 2611 return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2612 ustrp, lenp, 'S');
051a06d4
KW
2613 }
2614 }
2615 else { /* utf8, ord above 255 */
2616 result = CALL_UPPER_CASE(p, ustrp, lenp);
2617
2618 if (flags) {
2619 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2620 }
2621 return result;
2622 }
2623
2624 /* Here, used locale rules. Convert back to utf8 */
2625 if (UTF8_IS_INVARIANT(result)) {
2626 *ustrp = (U8) result;
2627 *lenp = 1;
2628 }
2629 else {
62cb07ea
KW
2630 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2631 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2632 *lenp = 2;
3a4c58c9
KW
2633 }
2634
051a06d4
KW
2635 if (tainted_ptr) {
2636 *tainted_ptr = TRUE;
2637 }
2638 return result;
983ffd37 2639}
a0ed51b3 2640
d3e79532 2641/*
87cea99e 2642=for apidoc to_utf8_title
d3e79532 2643
1f607577 2644Instead use L</toTITLE_utf8>.
a1433954 2645
d3e79532
JH
2646=cut */
2647
051a06d4
KW
2648/* Not currently externally documented, and subject to change:
2649 * <flags> is set iff locale semantics are to be used for code points < 256
2650 * Since titlecase is not defined in POSIX, uppercase is used instead
2651 * for these.
2652 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2653 * were used in the calculation; otherwise unchanged. */
2654
983ffd37 2655UV
051a06d4 2656Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
983ffd37 2657{
97aff369 2658 dVAR;
7918f24d 2659
051a06d4
KW
2660 UV result;
2661
2662 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 2663
3a4c58c9 2664 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2665 if (flags) {
2666 result = toUPPER_LC(*p);
2667 }
2668 else {
81c6c7ce 2669 return _to_upper_title_latin1(*p, ustrp, lenp, 's');
051a06d4 2670 }
3a4c58c9
KW
2671 }
2672 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2673 if (flags) {
a6d8b88b 2674 U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2675 result = toUPPER_LC(c);
051a06d4
KW
2676 }
2677 else {
94bb8c36 2678 return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2679 ustrp, lenp, 's');
051a06d4
KW
2680 }
2681 }
2682 else { /* utf8, ord above 255 */
2683 result = CALL_TITLE_CASE(p, ustrp, lenp);
2684
2685 if (flags) {
2686 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2687 }
2688 return result;
2689 }
2690
2691 /* Here, used locale rules. Convert back to utf8 */
2692 if (UTF8_IS_INVARIANT(result)) {
2693 *ustrp = (U8) result;
2694 *lenp = 1;
2695 }
2696 else {
62cb07ea
KW
2697 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2698 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2699 *lenp = 2;
3a4c58c9
KW
2700 }
2701
051a06d4
KW
2702 if (tainted_ptr) {
2703 *tainted_ptr = TRUE;
2704 }
2705 return result;
a0ed51b3
LW
2706}
2707
d3e79532 2708/*
87cea99e 2709=for apidoc to_utf8_lower
d3e79532 2710
1f607577 2711Instead use L</toLOWER_utf8>.
a1433954 2712
d3e79532
JH
2713=cut */
2714
051a06d4
KW
2715/* Not currently externally documented, and subject to change:
2716 * <flags> is set iff locale semantics are to be used for code points < 256
2717 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2718 * were used in the calculation; otherwise unchanged. */
2719
2104c8d9 2720UV
051a06d4 2721Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
a0ed51b3 2722{
051a06d4
KW
2723 UV result;
2724
97aff369 2725 dVAR;
7918f24d 2726
051a06d4 2727 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 2728
968c5e6a 2729 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2730 if (flags) {
2731 result = toLOWER_LC(*p);
2732 }
2733 else {
81c6c7ce 2734 return to_lower_latin1(*p, ustrp, lenp);
051a06d4 2735 }
968c5e6a
KW
2736 }
2737 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2738 if (flags) {
a6d8b88b 2739 U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2740 result = toLOWER_LC(c);
051a06d4
KW
2741 }
2742 else {
94bb8c36 2743 return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2744 ustrp, lenp);
051a06d4 2745 }
968c5e6a 2746 }
051a06d4
KW
2747 else { /* utf8, ord above 255 */
2748 result = CALL_LOWER_CASE(p, ustrp, lenp);
2749
2750 if (flags) {
2751 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2752 }
968c5e6a 2753
051a06d4
KW
2754 return result;
2755 }
2756
2757 /* Here, used locale rules. Convert back to utf8 */
2758 if (UTF8_IS_INVARIANT(result)) {
2759 *ustrp = (U8) result;
2760 *lenp = 1;
2761 }
2762 else {
62cb07ea
KW
2763 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2764 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
2765 *lenp = 2;
2766 }
2767
2768 if (tainted_ptr) {
2769 *tainted_ptr = TRUE;
2770 }
2771 return result;
b4e400f9
JH
2772}
2773
d3e79532 2774/*
87cea99e 2775=for apidoc to_utf8_fold
d3e79532 2776
1f607577 2777Instead use L</toFOLD_utf8>.
a1433954 2778
d3e79532
JH
2779=cut */
2780
051a06d4
KW
2781/* Not currently externally documented, and subject to change,
2782 * in <flags>
2783 * bit FOLD_FLAGS_LOCALE is set iff locale semantics are to be used for code
2784 * points < 256. Since foldcase is not defined in
2785 * POSIX, lowercase is used instead
2786 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
2787 * otherwise simple folds
a0270393
KW
2788 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2789 * prohibited
051a06d4
KW
2790 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2791 * were used in the calculation; otherwise unchanged. */
36bb2ab6 2792
b4e400f9 2793UV
051a06d4 2794Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, bool* tainted_ptr)
b4e400f9 2795{
97aff369 2796 dVAR;
7918f24d 2797
051a06d4
KW
2798 UV result;
2799
36bb2ab6 2800 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 2801
a0270393
KW
2802 /* These are mutually exclusive */
2803 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2804
50ba90ff
KW
2805 assert(p != ustrp); /* Otherwise overwrites */
2806
a1dde8de 2807 if (UTF8_IS_INVARIANT(*p)) {
051a06d4 2808 if (flags & FOLD_FLAGS_LOCALE) {
d22b930b 2809 result = toFOLD_LC(*p);
051a06d4
KW
2810 }
2811 else {
81c6c7ce 2812 return _to_fold_latin1(*p, ustrp, lenp,
1ca267a5 2813 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2814 }
a1dde8de
KW
2815 }
2816 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2817 if (flags & FOLD_FLAGS_LOCALE) {
a6d8b88b 2818 U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2819 result = toFOLD_LC(c);
051a06d4
KW
2820 }
2821 else {
94bb8c36 2822 return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
51910141 2823 ustrp, lenp,
1ca267a5 2824 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2825 }
a1dde8de 2826 }
051a06d4 2827 else { /* utf8, ord above 255 */
a0270393 2828 result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
a1dde8de 2829
1ca267a5
KW
2830 if (flags & FOLD_FLAGS_LOCALE) {
2831
9fc2026f 2832 /* Special case these characters, as what normally gets returned
1ca267a5
KW
2833 * under locale doesn't work */
2834 if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1
2835 && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8,
2836 sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1))
2837 {
2838 goto return_long_s;
2839 }
9fc2026f
KW
2840 else if (UTF8SKIP(p) == sizeof(LATIN_SMALL_LIGATURE_LONG_S_T) - 1
2841 && memEQ((char *) p, LATIN_SMALL_LIGATURE_LONG_S_T_UTF8,
2842 sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1))
2843 {
2844 goto return_ligature_st;
2845 }
a0270393 2846 return check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 2847 }
a0270393
KW
2848 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
2849 return result;
2850 }
2851 else {
2852 /* This is called when changing the case of a utf8-encoded
9fc2026f
KW
2853 * character above the ASCII range, and the result should not
2854 * contain an ASCII character. */
a0270393
KW
2855
2856 UV original; /* To store the first code point of <p> */
2857
2858 /* Look at every character in the result; if any cross the
2859 * boundary, the whole thing is disallowed */
2860 U8* s = ustrp;
2861 U8* e = ustrp + *lenp;
2862 while (s < e) {
2863 if (isASCII(*s)) {
2864 /* Crossed, have to return the original */
2865 original = valid_utf8_to_uvchr(p, lenp);
1ca267a5 2866
9fc2026f 2867 /* But in these instances, there is an alternative we can
1ca267a5 2868 * return that is valid */
9fc2026f
KW
2869 if (original == LATIN_CAPITAL_LETTER_SHARP_S
2870 || original == LATIN_SMALL_LETTER_SHARP_S)
2871 {
1ca267a5
KW
2872 goto return_long_s;
2873 }
9fc2026f
KW
2874 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
2875 goto return_ligature_st;
2876 }
a0270393
KW
2877 Copy(p, ustrp, *lenp, char);
2878 return original;
2879 }
2880 s += UTF8SKIP(s);
2881 }
051a06d4 2882
a0270393
KW
2883 /* Here, no characters crossed, result is ok as-is */
2884 return result;
2885 }
051a06d4
KW
2886 }
2887
2888 /* Here, used locale rules. Convert back to utf8 */
2889 if (UTF8_IS_INVARIANT(result)) {
2890 *ustrp = (U8) result;
2891 *lenp = 1;
2892 }
2893 else {
62cb07ea
KW
2894 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2895 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
2896 *lenp = 2;
2897 }
2898
2899 if (tainted_ptr) {
2900 *tainted_ptr = TRUE;
2901 }
2902 return result;
1ca267a5
KW
2903
2904 return_long_s:
2905 /* Certain folds to 'ss' are prohibited by the options, but they do allow
2906 * folds to a string of two of these characters. By returning this
2907 * instead, then, e.g.,
2908 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
2909 * works. */
2910
2911 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2912 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2913 ustrp, *lenp, U8);
2914 return LATIN_SMALL_LETTER_LONG_S;
9fc2026f
KW
2915
2916 return_ligature_st:
2917 /* Two folds to 'st' are prohibited by the options; instead we pick one and
2918 * have the other one fold to it */
2919
2920 *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
2921 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
2922 return LATIN_SMALL_LIGATURE_ST;
a0ed51b3
LW
2923}
2924
711a919c 2925/* Note:
f90a9a02 2926 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
711a919c
ST
2927 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
2928 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
2929 */
c4a5db0c 2930
a0ed51b3 2931SV*
7fc63493 2932Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 2933{
c4a5db0c
KW
2934 PERL_ARGS_ASSERT_SWASH_INIT;
2935
2936 /* Returns a copy of a swash initiated by the called function. This is the
2937 * public interface, and returning a copy prevents others from doing
2938 * mischief on the original */
2939
5d3d13d1 2940 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
c4a5db0c
KW
2941}
2942
2943SV*
5d3d13d1 2944Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
c4a5db0c
KW
2945{
2946 /* Initialize and return a swash, creating it if necessary. It does this
87367d5f
KW
2947 * by calling utf8_heavy.pl in the general case. The returned value may be
2948 * the swash's inversion list instead if the input parameters allow it.
2949 * Which is returned should be immaterial to callers, as the only
923b6d4e
KW
2950 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
2951 * and swash_to_invlist() handle both these transparently.
c4a5db0c
KW
2952 *
2953 * This interface should only be used by functions that won't destroy or
2954 * adversely change the swash, as doing so affects all other uses of the
2955 * swash in the program; the general public should use 'Perl_swash_init'
2956 * instead.
2957 *
2958 * pkg is the name of the package that <name> should be in.
2959 * name is the name of the swash to find. Typically it is a Unicode
2960 * property name, including user-defined ones
2961 * listsv is a string to initialize the swash with. It must be of the form
2962 * documented as the subroutine return value in
2963 * L<perlunicode/User-Defined Character Properties>
2964 * minbits is the number of bits required to represent each data element.
2965 * It is '1' for binary properties.
2966 * none I (khw) do not understand this one, but it is used only in tr///.
9a53f6cf 2967 * invlist is an inversion list to initialize the swash with (or NULL)
83199d38
KW
2968 * flags_p if non-NULL is the address of various input and output flag bits
2969 * to the routine, as follows: ('I' means is input to the routine;
2970 * 'O' means output from the routine. Only flags marked O are
2971 * meaningful on return.)
2972 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
2973 * came from a user-defined property. (I O)
5d3d13d1
KW
2974 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
2975 * when the swash cannot be located, to simply return NULL. (I)
87367d5f
KW
2976 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
2977 * return of an inversion list instead of a swash hash if this routine
2978 * thinks that would result in faster execution of swash_fetch() later
2979 * on. (I)
9a53f6cf
KW
2980 *
2981 * Thus there are three possible inputs to find the swash: <name>,
2982 * <listsv>, and <invlist>. At least one must be specified. The result
2983 * will be the union of the specified ones, although <listsv>'s various
2984 * actions can intersect, etc. what <name> gives.
2985 *
2986 * <invlist> is only valid for binary properties */
c4a5db0c 2987
27da23d5 2988 dVAR;
c4a5db0c 2989 SV* retval = &PL_sv_undef;
83199d38 2990 HV* swash_hv = NULL;
87367d5f
KW
2991 const int invlist_swash_boundary =
2992 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
2993 ? 512 /* Based on some benchmarking, but not extensive, see commit
2994 message */
2995 : -1; /* Never return just an inversion list */
9a53f6cf
KW
2996
2997 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
2998 assert(! invlist || minbits == 1);
2999
3000 /* If data was passed in to go out to utf8_heavy to find the swash of, do
3001 * so */
3002 if (listsv != &PL_sv_undef || strNE(name, "")) {
69794297
KW
3003 dSP;
3004 const size_t pkg_len = strlen(pkg);
3005 const size_t name_len = strlen(name);
3006 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
3007 SV* errsv_save;
3008 GV *method;
3009
3010 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
3011
3012 PUSHSTACKi(PERLSI_MAGIC);
ce3b816e 3013 ENTER;
69794297
KW
3014 SAVEHINTS();
3015 save_re_context();
650f067c
JL
3016 /* We might get here via a subroutine signature which uses a utf8
3017 * parameter name, at which point PL_subname will have been set
3018 * but not yet used. */
3019 save_item(PL_subname);
69794297
KW
3020 if (PL_parser && PL_parser->error_count)
3021 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
3022 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
3023 if (!method) { /* demand load utf8 */
3024 ENTER;
db2c6cb3
FC
3025 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3026 GvSV(PL_errgv) = NULL;
69794297
KW
3027 /* It is assumed that callers of this routine are not passing in
3028 * any user derived data. */
3029 /* Need to do this after save_re_context() as it will set
3030 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
3031 * in Perl_magic_get). Even line to create errsv_save can turn on
3032 * PL_tainted. */
284167a5
SM
3033#ifndef NO_TAINT_SUPPORT
3034 SAVEBOOL(TAINT_get);
3035 TAINT_NOT;
3036#endif
69794297
KW
3037 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
3038 NULL);
eed484f9 3039 {
db2c6cb3
FC
3040 /* Not ERRSV, as there is no need to vivify a scalar we are
3041 about to discard. */
3042 SV * const errsv = GvSV(PL_errgv);
3043 if (!SvTRUE(errsv)) {
3044 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3045 SvREFCNT_dec(errsv);
3046 }
eed484f9 3047 }
69794297
KW
3048 LEAVE;
3049 }
3050 SPAGAIN;
3051 PUSHMARK(SP);
3052 EXTEND(SP,5);
3053 mPUSHp(pkg, pkg_len);
3054 mPUSHp(name, name_len);
3055 PUSHs(listsv);
3056 mPUSHi(minbits);
3057 mPUSHi(none);
3058 PUTBACK;
db2c6cb3
FC
3059 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3060 GvSV(PL_errgv) = NULL;
69794297
KW
3061 /* If we already have a pointer to the method, no need to use
3062 * call_method() to repeat the lookup. */
c41800a8
KW
3063 if (method
3064 ? call_sv(MUTABLE_SV(method), G_SCALAR)
69794297
KW
3065 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
3066 {
3067 retval = *PL_stack_sp--;
3068 SvREFCNT_inc(retval);
3069 }
eed484f9 3070 {
db2c6cb3
FC
3071 /* Not ERRSV. See above. */
3072 SV * const errsv = GvSV(PL_errgv);
3073 if (!SvTRUE(errsv)) {
3074 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3075 SvREFCNT_dec(errsv);
3076 }
eed484f9 3077 }
ce3b816e 3078 LEAVE;
69794297
KW
3079 POPSTACK;
3080 if (IN_PERL_COMPILETIME) {
3081 CopHINTS_set(PL_curcop, PL_hints);
3082 }
3083 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
3084 if (SvPOK(retval))
3085
3086 /* If caller wants to handle missing properties, let them */
5d3d13d1 3087 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
69794297
KW
3088 return NULL;
3089 }
3090 Perl_croak(aTHX_
3091 "Can't find Unicode property definition \"%"SVf"\"",
3092 SVfARG(retval));
3093 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
3094 }
9a53f6cf 3095 } /* End of calling the module to find the swash */
36eb48b4 3096
83199d38
KW
3097 /* If this operation fetched a swash, and we will need it later, get it */
3098 if (retval != &PL_sv_undef
3099 && (minbits == 1 || (flags_p
3100 && ! (*flags_p
3101 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
3102 {
3103 swash_hv = MUTABLE_HV(SvRV(retval));
3104
3105 /* If we don't already know that there is a user-defined component to
3106 * this swash, and the user has indicated they wish to know if there is
3107 * one (by passing <flags_p>), find out */
3108 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
3109 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
3110 if (user_defined && SvUV(*user_defined)) {
3111 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
3112 }
3113 }
3114 }
3115
36eb48b4
KW
3116 /* Make sure there is an inversion list for binary properties */
3117 if (minbits == 1) {
3118 SV** swash_invlistsvp = NULL;
3119 SV* swash_invlist = NULL;
9a53f6cf 3120 bool invlist_in_swash_is_valid = FALSE;
02c85471
FC
3121 bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
3122 an unclaimed reference count */
36eb48b4 3123
9a53f6cf 3124 /* If this operation fetched a swash, get its already existing
83199d38 3125 * inversion list, or create one for it */
36eb48b4 3126
83199d38 3127 if (swash_hv) {
5c9f4bd2 3128 swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
9a53f6cf
KW
3129 if (swash_invlistsvp) {
3130 swash_invlist = *swash_invlistsvp;
3131 invlist_in_swash_is_valid = TRUE;
3132 }
3133 else {
36eb48b4 3134 swash_invlist = _swash_to_invlist(retval);
02c85471 3135 swash_invlist_unclaimed = TRUE;
9a53f6cf
KW
3136 }
3137 }
3138
3139 /* If an inversion list was passed in, have to include it */
3140 if (invlist) {
3141
3142 /* Any fetched swash will by now have an inversion list in it;
3143 * otherwise <swash_invlist> will be NULL, indicating that we
3144 * didn't fetch a swash */
3145 if (swash_invlist) {
3146
3147 /* Add the passed-in inversion list, which invalidates the one
3148 * already stored in the swash */
3149 invlist_in_swash_is_valid = FALSE;
3150 _invlist_union(invlist, swash_invlist, &swash_invlist);
3151 }
3152 else {
3153
87367d5f
KW
3154 /* Here, there is no swash already. Set up a minimal one, if
3155 * we are going to return a swash */
3156 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
971d486f 3157 swash_hv = newHV();
4aca0fe6 3158 retval = newRV_noinc(MUTABLE_SV(swash_hv));
87367d5f 3159 }
9a53f6cf
KW
3160 swash_invlist = invlist;
3161 }
9a53f6cf
KW
3162 }
3163
3164 /* Here, we have computed the union of all the passed-in data. It may
3165 * be that there was an inversion list in the swash which didn't get
3166 * touched; otherwise save the one computed one */
87367d5f
KW
3167 if (! invlist_in_swash_is_valid
3168 && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
3169 {
5c9f4bd2 3170 if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
69794297
KW
3171 {
3172 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3173 }
cc34d8c5
FC
3174 /* We just stole a reference count. */
3175 if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
3176 else SvREFCNT_inc_simple_void_NN(swash_invlist);
9a53f6cf 3177 }
87367d5f 3178
c41800a8 3179 /* Use the inversion list stand-alone if small enough */
87367d5f
KW
3180 if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
3181 SvREFCNT_dec(retval);
02c85471
FC
3182 if (!swash_invlist_unclaimed)
3183 SvREFCNT_inc_simple_void_NN(swash_invlist);
3184 retval = newRV_noinc(swash_invlist);
87367d5f 3185 }
36eb48b4
KW
3186 }
3187
a0ed51b3
LW
3188 return retval;
3189}
3190
035d37be
JH
3191
3192/* This API is wrong for special case conversions since we may need to
3193 * return several Unicode characters for a single Unicode character
3194 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
3195 * the lower-level routine, and it is similarly broken for returning
38684baa
KW
3196 * multiple values. --jhi
3197 * For those, you should use to_utf8_case() instead */
b0e3252e 3198/* Now SWASHGET is recast into S_swatch_get in this file. */
680c470c
ST
3199
3200/* Note:
3201 * Returns the value of property/mapping C<swash> for the first character
3202 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
3d0f8846
KW
3203 * assumed to be in well-formed utf8. If C<do_utf8> is false, the string C<ptr>
3204 * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
af2af982
KW
3205 *
3206 * A "swash" is a hash which contains initially the keys/values set up by
3207 * SWASHNEW. The purpose is to be able to completely represent a Unicode
3208 * property for all possible code points. Things are stored in a compact form
3209 * (see utf8_heavy.pl) so that calculation is required to find the actual
3210 * property value for a given code point. As code points are looked up, new
3211 * key/value pairs are added to the hash, so that the calculation doesn't have
3212 * to ever be re-done. Further, each calculation is done, not just for the
3213 * desired one, but for a whole block of code points adjacent to that one.
3214 * For binary properties on ASCII machines, the block is usually for 64 code
3215 * points, starting with a code point evenly divisible by 64. Thus if the
3216 * property value for code point 257 is requested, the code goes out and
3217 * calculates the property values for all 64 code points between 256 and 319,
3218 * and stores these as a single 64-bit long bit vector, called a "swatch",
3219 * under the key for code point 256. The key is the UTF-8 encoding for code
3220 * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
3221 * for a code point is 13 bytes, the key will be 12 bytes long. If the value
3222 * for code point 258 is then requested, this code realizes that it would be
3223 * stored under the key for 256, and would find that value and extract the
3224 * relevant bit, offset from 256.
3225 *
3226 * Non-binary properties are stored in as many bits as necessary to represent
3227 * their values (32 currently, though the code is more general than that), not
3228 * as single bits, but the principle is the same: the value for each key is a
3229 * vector that encompasses the property values for all code points whose UTF-8
3230 * representations are represented by the key. That is, for all code points
3231 * whose UTF-8 representations are length N bytes, and the key is the first N-1
3232 * bytes of that.
680c470c 3233 */
a0ed51b3 3234UV
680c470c 3235Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 3236{
27da23d5 3237 dVAR;
ef8f7699 3238 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
3239 U32 klen;
3240 U32 off;
a0ed51b3 3241 STRLEN slen;
7d85a32c 3242 STRLEN needents;
cfd0369c 3243 const U8 *tmps = NULL;
a0ed51b3 3244 U32 bit;
979f2922 3245 SV *swatch;
08fb1ac5 3246 const U8 c = *ptr;
3568d838 3247
7918f24d
NC
3248 PERL_ARGS_ASSERT_SWASH_FETCH;
3249
87367d5f
KW
3250 /* If it really isn't a hash, it isn't really swash; must be an inversion
3251 * list */
3252 if (SvTYPE(hv) != SVt_PVHV) {
3253 return _invlist_contains_cp((SV*)hv,
3254 (do_utf8)
3255 ? valid_utf8_to_uvchr(ptr, NULL)
3256 : c);
3257 }
3258
08fb1ac5
KW
3259 /* We store the values in a "swatch" which is a vec() value in a swash
3260 * hash. Code points 0-255 are a single vec() stored with key length
3261 * (klen) 0. All other code points have a UTF-8 representation
3262 * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
3263 * share 0xAA..0xYY, which is the key in the hash to that vec. So the key
3264 * length for them is the length of the encoded char - 1. ptr[klen] is the
3265 * final byte in the sequence representing the character */
3266 if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
3267 klen = 0;
3268 needents = 256;
3269 off = c;
3568d838 3270 }
08fb1ac5
KW
3271 else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
3272 klen = 0;
3273 needents = 256;
3274 off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1));
979f2922
ST
3275 }
3276 else {
08fb1ac5
KW
3277 klen = UTF8SKIP(ptr) - 1;
3278
3279 /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
3280 * the vec is the final byte in the sequence. (In EBCDIC this is
3281 * converted to I8 to get consecutive values.) To help you visualize
3282 * all this:
3283 * Straight 1047 After final byte
3284 * UTF-8 UTF-EBCDIC I8 transform
3285 * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
3286 * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
3287 * ...
3288 * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
3289 * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
3290 * ...
3291 * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
3292 * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
3293 * ...
3294 * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
3295 * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
3296 * ...
3297 * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
3298 * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
3299 *
3300 * (There are no discontinuities in the elided (...) entries.)
3301 * The UTF-8 key for these 33 code points is '\xD0' (which also is the
3302 * key for the next 31, up through U+043F, whose UTF-8 final byte is
3303 * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
3304 * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
3305 * index into the vec() swatch (after subtracting 0x80, which we
3306 * actually do with an '&').
3307 * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
3308 * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
3309 * dicontinuities which go away by transforming it into I8, and we
3310 * effectively subtract 0xA0 to get the index. */
979f2922 3311 needents = (1 << UTF_ACCUMULATION_SHIFT);
bc3632a8 3312 off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
979f2922 3313 }
7d85a32c 3314
a0ed51b3
LW
3315 /*
3316 * This single-entry cache saves about 1/3 of the utf8 overhead in test
3317 * suite. (That is, only 7-8% overall over just a hash cache. Still,
3318 * it's nothing to sniff at.) Pity we usually come through at least
3319 * two function calls to get here...
3320 *
3321 * NB: this code assumes that swatches are never modified, once generated!
3322 */
3323
3568d838 3324 if (hv == PL_last_swash_hv &&
a0ed51b3 3325 klen == PL_last_swash_klen &&
27da23d5 3326 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
3327 {
3328 tmps = PL_last_swash_tmps;
3329 slen = PL_last_swash_slen;
3330 }
3331 else {
3332 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 3333 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 3334
b0e3252e 3335 /* If not cached, generate it via swatch_get */
979f2922 3336 if (!svp || !SvPOK(*svp)
08fb1ac5
KW
3337 || !(tmps = (const U8*)SvPV_const(*svp, slen)))
3338 {
3339 if (klen) {
3340 const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
3341 swatch = swatch_get(swash,
3342 code_point & ~((UV)needents - 1),
3343 needents);
3344 }
3345 else { /* For the first 256 code points, the swatch has a key of
3346 length 0 */
3347 swatch = swatch_get(swash, 0, needents);
3348 }
979f2922 3349
923e4eb5 3350 if (IN_PERL_COMPILETIME)
623e6609 3351 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 3352
979f2922 3353 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 3354
979f2922
ST
3355 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
3356 || (slen << 3) < needents)
5637ef5b
NC
3357 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
3358 "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
3359 svp, tmps, (UV)slen, (UV)needents);
a0ed51b3
LW
3360 }
3361
3362 PL_last_swash_hv = hv;
16d8f38a 3363 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 3364 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
3365 /* FIXME change interpvar.h? */
3366 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
3367 PL_last_swash_slen = slen;
3368 if (klen)
3369 Copy(ptr, PL_last_swash_key, klen, U8);
3370 }
3371
9faf8d75 3372 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
3373 case 1:
3374 bit = 1 << (off & 7);
3375 off >>= 3;
3376 return (tmps[off] & bit) != 0;
3377 case 8:
3378 return tmps[off];
3379 case 16:
3380 off <<= 1;
3381 return (tmps[off] << 8) + tmps[off + 1] ;
3382 case 32:
3383 off <<= 2;
3384 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
3385 }
5637ef5b
NC
3386 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
3387 "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
670f1322 3388 NORETURN_FUNCTION_END;
a0ed51b3 3389}
2b9d42f0 3390
319009ee
KW
3391/* Read a single line of the main body of the swash input text. These are of
3392 * the form:
3393 * 0053 0056 0073
3394 * where each number is hex. The first two numbers form the minimum and
3395 * maximum of a range, and the third is the value associated with the range.
3396 * Not all swashes should have a third number
3397 *
3398 * On input: l points to the beginning of the line to be examined; it points
3399 * to somewhere in the string of the whole input text, and is
3400 * terminated by a \n or the null string terminator.
3401 * lend points to the null terminator of that string
3402 * wants_value is non-zero if the swash expects a third number
3403 * typestr is the name of the swash's mapping, like 'ToLower'
3404 * On output: *min, *max, and *val are set to the values read from the line.
3405 * returns a pointer just beyond the line examined. If there was no
3406 * valid min number on the line, returns lend+1
3407 */
3408
3409STATIC U8*
3410S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3411 const bool wants_value, const U8* const typestr)
3412{
3413 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
3414 STRLEN numlen; /* Length of the number */
02470786
KW
3415 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3416 | PERL_SCAN_DISALLOW_PREFIX
3417 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3418
3419 /* nl points to the next \n in the scan */
3420 U8* const nl = (U8*)memchr(l, '\n', lend - l);
3421
3422 /* Get the first number on the line: the range minimum */
3423 numlen = lend - l;
3424 *min = grok_hex((char *)l, &numlen, &flags, NULL);
3425 if (numlen) /* If found a hex number, position past it */
3426 l += numlen;
3427 else if (nl) { /* Else, go handle next line, if any */
3428 return nl + 1; /* 1 is length of "\n" */
3429 }
3430 else { /* Else, no next line */
3431 return lend + 1; /* to LIST's end at which \n is not found */
3432 }
3433
3434 /* The max range value follows, separated by a BLANK */
3435 if (isBLANK(*l)) {
3436 ++l;
02470786
KW
3437 flags = PERL_SCAN_SILENT_ILLDIGIT
3438 | PERL_SCAN_DISALLOW_PREFIX
3439 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3440 numlen = lend - l;
3441 *max = grok_hex((char *)l, &numlen, &flags, NULL);
3442 if (numlen)
3443 l += numlen;
3444 else /* If no value here, it is a single element range */
3445 *max = *min;
3446
3447 /* Non-binary tables have a third entry: what the first element of the
24303724 3448 * range maps to. The map for those currently read here is in hex */
319009ee
KW
3449 if (wants_value) {
3450 if (isBLANK(*l)) {
3451 ++l;
f2a7d0fc
KW
3452 flags = PERL_SCAN_SILENT_ILLDIGIT
3453 | PERL_SCAN_DISALLOW_PREFIX
3454 | PERL_SCAN_SILENT_NON_PORTABLE;
3455 numlen = lend - l;
3456 *val = grok_hex((char *)l, &numlen, &flags, NULL);
3457 if (numlen)
3458 l += numlen;
3459 else
3460 *val = 0;
319009ee
KW
3461 }
3462 else {
3463 *val = 0;
3464 if (typeto) {
dcbac5bb 3465 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
3466 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
3467 typestr, l);
3468 }
3469 }
3470 }
3471 else
3472 *val = 0; /* bits == 1, then any val should be ignored */
3473 }
3474 else { /* Nothing following range min, should be single element with no
3475 mapping expected */
3476 *max = *min;
3477 if (wants_value) {
3478 *val = 0;
3479 if (typeto) {
dcbac5bb 3480 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
3481 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
3482 }
3483 }
3484 else
3485 *val = 0; /* bits == 1, then val should be ignored */
3486 }
3487
3488 /* Position to next line if any, or EOF */
3489 if (nl)
3490 l = nl + 1;
3491 else
3492 l = lend;
3493
3494 return l;
3495}
3496
979f2922
ST
3497/* Note:
3498 * Returns a swatch (a bit vector string) for a code point sequence
3499 * that starts from the value C<start> and comprises the number C<span>.
3500 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3501 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
3502 */
3503STATIC SV*
b0e3252e 3504S_swatch_get(pTHX_ SV* swash, UV start, UV span)
979f2922
ST
3505{
3506 SV *swatch;
77f9f126 3507 U8 *l, *lend, *x, *xend, *s, *send;
979f2922 3508 STRLEN lcur, xcur, scur;
ef8f7699 3509 HV *const hv = MUTABLE_HV(SvRV(swash));
5c9f4bd2 3510 SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
36eb48b4 3511
88d45d28
KW
3512 SV** listsvp = NULL; /* The string containing the main body of the table */
3513 SV** extssvp = NULL;
3514 SV** invert_it_svp = NULL;
3515 U8* typestr = NULL;
786861f5
KW
3516 STRLEN bits;
3517 STRLEN octets; /* if bits == 1, then octets == 0 */
3518 UV none;
3519 UV end = start + span;
972dd592 3520
36eb48b4 3521 if (invlistsvp == NULL) {
786861f5
KW
3522 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3523 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3524 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3525 extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3526 listsvp = hv_fetchs(hv, "LIST", FALSE);
3527 invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3528
3529 bits = SvUV(*bitssvp);
3530 none = SvUV(*nonesvp);
3531 typestr = (U8*)SvPV_nolen(*typesvp);
3532 }
36eb48b4
KW
3533 else {
3534 bits = 1;
3535 none = 0;
3536 }
786861f5 3537 octets = bits >> 3; /* if bits == 1, then octets == 0 */
979f2922 3538
b0e3252e 3539 PERL_ARGS_ASSERT_SWATCH_GET;
7918f24d 3540
979f2922 3541 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
b0e3252e 3542 Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
660a4616 3543 (UV)bits);
979f2922
ST
3544 }
3545
84ea5ef6
KW
3546 /* If overflowed, use the max possible */
3547 if (end < start) {
3548 end = UV_MAX;
3549 span = end - start;
3550 }
3551
979f2922 3552 /* create and initialize $swatch */
979f2922 3553 scur = octets ? (span * octets) : (span + 7) / 8;
e524fe40
NC
3554 swatch = newSV(scur);
3555 SvPOK_on(swatch);
979f2922
ST
3556 s = (U8*)SvPVX(swatch);
3557 if (octets && none) {
0bd48802 3558 const U8* const e = s + scur;
979f2922
ST
3559 while (s < e) {
3560 if (bits == 8)
3561 *s++ = (U8)(none & 0xff);
3562 else if (bits == 16) {
3563 *s++ = (U8)((none >> 8) & 0xff);
3564 *s++ = (U8)( none & 0xff);
3565 }
3566 else if (bits == 32) {
3567 *s++ = (U8)((none >> 24) & 0xff);
3568 *s++ = (U8)((none >> 16) & 0xff);
3569 *s++ = (U8)((none >> 8) & 0xff);
3570 *s++ = (U8)( none & 0xff);
3571 }
3572 }
3573 *s = '\0';
3574 }
3575 else {
3576 (void)memzero((U8*)s, scur + 1);
3577 }
3578 SvCUR_set(swatch, scur);
3579 s = (U8*)SvPVX(swatch);
3580
36eb48b4
KW
3581 if (invlistsvp) { /* If has an inversion list set up use that */
3582 _invlist_populate_swatch(*invlistsvp, start, end, s);
3583 return swatch;
3584 }
3585
3586 /* read $swash->{LIST} */
979f2922
ST
3587 l = (U8*)SvPV(*listsvp, lcur);
3588 lend = l + lcur;
3589 while (l < lend) {
8ed25d53 3590 UV min, max, val, upper;
319009ee
KW
3591 l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3592 cBOOL(octets), typestr);
3593 if (l > lend) {
979f2922
ST
3594 break;
3595 }
3596
972dd592 3597 /* If looking for something beyond this range, go try the next one */
979f2922
ST
3598 if (max < start)
3599 continue;
3600
8ed25d53
KW
3601 /* <end> is generally 1 beyond where we want to set things, but at the
3602 * platform's infinity, where we can't go any higher, we want to
3603 * include the code point at <end> */
3604 upper = (max < end)
3605 ? max
3606 : (max != UV_MAX || end != UV_MAX)
3607 ? end - 1
3608 : end;
3609
979f2922 3610 if (octets) {
35da51f7 3611 UV key;
979f2922
ST
3612 if (min < start) {
3613 if (!none || val < none) {
3614 val += start - min;
3615 }
3616 min = start;
3617 }
8ed25d53 3618 for (key = min; key <= upper; key++) {
979f2922 3619 STRLEN offset;
979f2922
ST
3620 /* offset must be non-negative (start <= min <= key < end) */
3621 offset = octets * (key - start);
3622 if (bits == 8)
3623 s[offset] = (U8)(val & 0xff);
3624 else if (bits == 16) {
3625 s[offset ] = (U8)((val >> 8) & 0xff);
3626 s[offset + 1] = (U8)( val & 0xff);
3627 }