This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Deprecate utf8_to_uni_buf()
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
81e983c1 34#include "inline_invlist.c"
a0ed51b3 35
a0c21aa1 36#ifndef EBCDIC
970ea3cb 37/* Separate prototypes needed because in ASCII systems these are
a0c21aa1
JH
38 * usually macros but they still are compiled as code, too. */
39PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
40PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
41#endif
42
27da23d5
JH
43static const char unees[] =
44 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 45
48ef279e 46/*
ccfc67b7 47=head1 Unicode Support
a0ed51b3 48
166f8a29
DM
49This file contains various utility functions for manipulating UTF8-encoded
50strings. For the uninitiated, this is a method of representing arbitrary
61296642 51Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
52characters in the ASCII range are unmodified, and a zero byte never appears
53within non-zero characters.
166f8a29 54
eaf7a4d2
CS
55=cut
56*/
57
58/*
59=for apidoc is_ascii_string
60
a1433954 61Returns true if the first C<len> bytes of the string C<s> are the same whether
970ea3cb
KW
62or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
63is, if they are invariant. On ASCII-ish machines, only ASCII characters
64fit this definition, hence the function's name.
eaf7a4d2 65
9f7e3d64
MH
66If C<len> is 0, it will be calculated using C<strlen(s)>.
67
a1433954 68See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
eaf7a4d2
CS
69
70=cut
71*/
72
73bool
668b6d8d 74Perl_is_ascii_string(const U8 *s, STRLEN len)
eaf7a4d2
CS
75{
76 const U8* const send = s + (len ? len : strlen((const char *)s));
77 const U8* x = s;
78
79 PERL_ARGS_ASSERT_IS_ASCII_STRING;
eaf7a4d2
CS
80
81 for (; x < send; ++x) {
82 if (!UTF8_IS_INVARIANT(*x))
83 break;
84 }
85
86 return x == send;
87}
88
89/*
87cea99e 90=for apidoc uvuni_to_utf8_flags
eebe1485 91
537124e4 92Adds the UTF-8 representation of the Unicode code point C<uv> to the end
6ee84de2 93of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
eebe1485 94bytes available. The return value is the pointer to the byte after the
9041c2e3 95end of the new character. In other words,
eebe1485 96
b851fbc1
JH
97 d = uvuni_to_utf8_flags(d, uv, flags);
98
99or, in most cases,
100
9041c2e3 101 d = uvuni_to_utf8(d, uv);
eebe1485 102
b851fbc1
JH
103(which is equivalent to)
104
105 d = uvuni_to_utf8_flags(d, uv, 0);
106
949cf498 107This is the recommended Unicode-aware way of saying
eebe1485
SC
108
109 *(d++) = uv;
110
537124e4
KW
111where uv is a code point expressed in Latin-1 or above, not the platform's
112native character set. B<Almost all code should instead use L</uvchr_to_utf8>
113or L</uvchr_to_utf8_flags>>.
114
949cf498
KW
115This function will convert to UTF-8 (and not warn) even code points that aren't
116legal Unicode or are problematic, unless C<flags> contains one or more of the
a1433954
KW
117following flags:
118
949cf498
KW
119If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
120the function will raise a warning, provided UTF8 warnings are enabled. If instead
121UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
122If both flags are set, the function will both warn and return NULL.
123
124The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
537124e4
KW
125affect how the function handles a Unicode non-character. And likewise, the
126UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, affect the handling of
127code points that are
949cf498
KW
128above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
129even less portable) can be warned and/or disallowed even if other above-Unicode
130code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
131flags.
132
133And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
134above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
135DISALLOW flags.
136
137
eebe1485
SC
138=cut
139*/
140
dfe13c55 141U8 *
b851fbc1 142Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 143{
7918f24d
NC
144 PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
145
979f77b6
KW
146 /* The first problematic code point is the first surrogate */
147 if (uv >= UNICODE_SURROGATE_FIRST
148 && ckWARN4_d(WARN_UTF8, WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
149 {
949cf498
KW
150 if (UNICODE_IS_SURROGATE(uv)) {
151 if (flags & UNICODE_WARN_SURROGATE) {
8457b38f 152 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
949cf498
KW
153 "UTF-16 surrogate U+%04"UVXf, uv);
154 }
155 if (flags & UNICODE_DISALLOW_SURROGATE) {
156 return NULL;
157 }
158 }
159 else if (UNICODE_IS_SUPER(uv)) {
160 if (flags & UNICODE_WARN_SUPER
161 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
162 {
8457b38f 163 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
949cf498
KW
164 "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
165 }
166 if (flags & UNICODE_DISALLOW_SUPER
167 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
168 {
169 return NULL;
170 }
171 }
172 else if (UNICODE_IS_NONCHAR(uv)) {
173 if (flags & UNICODE_WARN_NONCHAR) {
8457b38f 174 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
949cf498
KW
175 "Unicode non-character U+%04"UVXf" is illegal for open interchange",
176 uv);
177 }
178 if (flags & UNICODE_DISALLOW_NONCHAR) {
179 return NULL;
180 }
181 }
507b9800 182 }
c4d5f83a 183 if (UNI_IS_INVARIANT(uv)) {
bc3632a8 184 *d++ = (U8) I8_TO_NATIVE_UTF8(uv);
a0ed51b3
LW
185 return d;
186 }
2d331972 187#if defined(EBCDIC)
1d72bdf6
NIS
188 else {
189 STRLEN len = UNISKIP(uv);
190 U8 *p = d+len-1;
191 while (p > d) {
bc3632a8 192 *p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
193 uv >>= UTF_ACCUMULATION_SHIFT;
194 }
bc3632a8 195 *p = (U8) I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
196 return d+len;
197 }
198#else /* Non loop style */
a0ed51b3 199 if (uv < 0x800) {
eb160463
GS
200 *d++ = (U8)(( uv >> 6) | 0xc0);
201 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
202 return d;
203 }
204 if (uv < 0x10000) {
eb160463
GS
205 *d++ = (U8)(( uv >> 12) | 0xe0);
206 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
207 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
208 return d;
209 }
210 if (uv < 0x200000) {
eb160463
GS
211 *d++ = (U8)(( uv >> 18) | 0xf0);
212 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
213 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
214 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
215 return d;
216 }
217 if (uv < 0x4000000) {
eb160463
GS
218 *d++ = (U8)(( uv >> 24) | 0xf8);
219 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
220 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
221 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
222 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
223 return d;
224 }
225 if (uv < 0x80000000) {
eb160463
GS
226 *d++ = (U8)(( uv >> 30) | 0xfc);
227 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
228 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
229 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
230 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
231 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
232 return d;
233 }
6b8eaf93 234#ifdef HAS_QUAD
d7578b48 235 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
236#endif
237 {
eb160463
GS
238 *d++ = 0xfe; /* Can't match U+FEFF! */
239 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
240 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
241 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
242 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
243 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
244 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
245 return d;
246 }
6b8eaf93 247#ifdef HAS_QUAD
a0ed51b3 248 {
eb160463
GS
249 *d++ = 0xff; /* Can't match U+FFFE! */
250 *d++ = 0x80; /* 6 Reserved bits */
251 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
252 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
253 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
254 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
255 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
256 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
257 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
258 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
259 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
260 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
261 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
262 return d;
263 }
264#endif
537124e4 265#endif /* Non loop style */
a0ed51b3 266}
9041c2e3 267
646ca15d
JH
268/*
269
f7d739d1 270Tests if the first C<len> bytes of string C<s> form a valid UTF-8
646ca15d 271character. Note that an INVARIANT (i.e. ASCII) character is a valid
f7d739d1 272UTF-8 character. The number of bytes in the UTF-8 character
646ca15d
JH
273will be returned if it is valid, otherwise 0.
274
275This is the "slow" version as opposed to the "fast" version which is
276the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
277difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
278or less you should use the IS_UTF8_CHAR(), for lengths of five or more
279you should use the _slow(). In practice this means that the _slow()
280will be used very rarely, since the maximum Unicode code point (as of
281Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
537124e4 282the "Perl extended UTF-8" (e.g, the infamous 'v-strings') will encode into
646ca15d
JH
283five bytes or more.
284
285=cut */
7af276bc 286PERL_STATIC_INLINE STRLEN
5f66b61c 287S_is_utf8_char_slow(const U8 *s, const STRLEN len)
646ca15d 288{
cd7e6c88 289 dTHX; /* The function called below requires thread context */
646ca15d 290
cd7e6c88 291 STRLEN actual_len;
646ca15d 292
cd7e6c88 293 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
646ca15d 294
18712bce 295 utf8n_to_uvchr(s, len, &actual_len, UTF8_CHECK_ONLY);
646ca15d 296
cd7e6c88 297 return (actual_len == (STRLEN) -1) ? 0 : actual_len;
646ca15d 298}
9041c2e3
NIS
299
300/*
492a624f
KW
301=for apidoc is_utf8_char_buf
302
303Returns the number of bytes that comprise the first UTF-8 encoded character in
304buffer C<buf>. C<buf_end> should point to one position beyond the end of the
305buffer. 0 is returned if C<buf> does not point to a complete, valid UTF-8
306encoded character.
307
308Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
309machines) is a valid UTF-8 character.
310
311=cut */
312
313STRLEN
314Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
315{
316
317 STRLEN len;
318
319 PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
320
321 if (buf_end <= buf) {
322 return 0;
323 }
324
325 len = buf_end - buf;
326 if (len > UTF8SKIP(buf)) {
327 len = UTF8SKIP(buf);
328 }
329
330#ifdef IS_UTF8_CHAR
331 if (IS_UTF8_CHAR_FAST(len))
332 return IS_UTF8_CHAR(buf, len) ? len : 0;
333#endif /* #ifdef IS_UTF8_CHAR */
334 return is_utf8_char_slow(buf, len);
335}
336
337/*
87cea99e 338=for apidoc is_utf8_char
eebe1485 339
5da9da9e 340Tests if some arbitrary number of bytes begins in a valid UTF-8
2bbc8d55
SP
341character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
342character is a valid UTF-8 character. The actual number of bytes in the UTF-8
343character will be returned if it is valid, otherwise 0.
9041c2e3 344
76848387 345This function is deprecated due to the possibility that malformed input could
a1433954 346cause reading beyond the end of the input buffer. Use L</is_utf8_char_buf>
76848387 347instead.
e0328548 348
82686b01 349=cut */
76848387 350
067a85ef 351STRLEN
668b6d8d 352Perl_is_utf8_char(const U8 *s)
386d01d6 353{
7918f24d 354 PERL_ARGS_ASSERT_IS_UTF8_CHAR;
492a624f 355
76848387 356 /* Assumes we have enough space, which is why this is deprecated */
492a624f 357 return is_utf8_char_buf(s, s + UTF8SKIP(s));
386d01d6
GS
358}
359
eaf7a4d2 360
6662521e 361/*
87cea99e 362=for apidoc is_utf8_string
6662521e 363
a1433954 364Returns true if the first C<len> bytes of string C<s> form a valid
9f7e3d64 365UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
e0328548
KW
366using C<strlen(s)> (which means if you use this option, that C<s> has to have a
367terminating NUL byte). Note that all characters being ASCII constitute 'a
368valid UTF-8 string'.
6662521e 369
a1433954 370See also L</is_ascii_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
768c67ee 371
6662521e
GS
372=cut
373*/
374
8e84507e 375bool
668b6d8d 376Perl_is_utf8_string(const U8 *s, STRLEN len)
6662521e 377{
35da51f7 378 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 379 const U8* x = s;
067a85ef 380
7918f24d 381 PERL_ARGS_ASSERT_IS_UTF8_STRING;
1aa99e6b 382
6662521e 383 while (x < send) {
1acdb0da 384 /* Inline the easy bits of is_utf8_char() here for speed... */
e0328548
KW
385 if (UTF8_IS_INVARIANT(*x)) {
386 x++;
387 }
1acdb0da
JH
388 else {
389 /* ... and call is_utf8_char() only if really needed. */
e0328548
KW
390 const STRLEN c = UTF8SKIP(x);
391 const U8* const next_char_ptr = x + c;
392
393 if (next_char_ptr > send) {
394 return FALSE;
395 }
396
768c67ee
JH
397 if (IS_UTF8_CHAR_FAST(c)) {
398 if (!IS_UTF8_CHAR(x, c))
e0328548 399 return FALSE;
3c614e38 400 }
e0328548
KW
401 else if (! is_utf8_char_slow(x, c)) {
402 return FALSE;
403 }
404 x = next_char_ptr;
1acdb0da 405 }
6662521e 406 }
768c67ee 407
067a85ef 408 return TRUE;
6662521e
GS
409}
410
67e989fb 411/*
814fafa7
NC
412Implemented as a macro in utf8.h
413
87cea99e 414=for apidoc is_utf8_string_loc
814fafa7 415
a1433954
KW
416Like L</is_utf8_string> but stores the location of the failure (in the
417case of "utf8ness failure") or the location C<s>+C<len> (in the case of
814fafa7
NC
418"utf8ness success") in the C<ep>.
419
a1433954 420See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
814fafa7 421
87cea99e 422=for apidoc is_utf8_string_loclen
81cd54e3 423
a1433954
KW
424Like L</is_utf8_string>() but stores the location of the failure (in the
425case of "utf8ness failure") or the location C<s>+C<len> (in the case of
768c67ee
JH
426"utf8ness success") in the C<ep>, and the number of UTF-8
427encoded characters in the C<el>.
428
a1433954 429See also L</is_utf8_string_loc>() and L</is_utf8_string>().
81cd54e3
JH
430
431=cut
432*/
433
434bool
668b6d8d 435Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 436{
35da51f7 437 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 438 const U8* x = s;
81cd54e3 439 STRLEN c;
3ebfea28 440 STRLEN outlen = 0;
7918f24d
NC
441
442 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
81cd54e3 443
81cd54e3 444 while (x < send) {
e0328548
KW
445 const U8* next_char_ptr;
446
81cd54e3
JH
447 /* Inline the easy bits of is_utf8_char() here for speed... */
448 if (UTF8_IS_INVARIANT(*x))
e0328548 449 next_char_ptr = x + 1;
81cd54e3 450 else {
768c67ee 451 /* ... and call is_utf8_char() only if really needed. */
768c67ee 452 c = UTF8SKIP(x);
e0328548
KW
453 next_char_ptr = c + x;
454 if (next_char_ptr > send) {
455 goto out;
456 }
768c67ee
JH
457 if (IS_UTF8_CHAR_FAST(c)) {
458 if (!IS_UTF8_CHAR(x, c))
459 c = 0;
460 } else
461 c = is_utf8_char_slow(x, c);
768c67ee
JH
462 if (!c)
463 goto out;
81cd54e3 464 }
e0328548 465 x = next_char_ptr;
3ebfea28 466 outlen++;
81cd54e3 467 }
768c67ee
JH
468
469 out:
3ebfea28
AL
470 if (el)
471 *el = outlen;
472
768c67ee
JH
473 if (ep)
474 *ep = x;
3ebfea28 475 return (x == send);
81cd54e3
JH
476}
477
478/*
768c67ee 479
87cea99e 480=for apidoc utf8n_to_uvuni
67e989fb 481
9041c2e3 482Bottom level UTF-8 decode routine.
746afd53
KW
483Returns the code point value of the first character in the string C<s>,
484which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
485C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
486the length, in bytes, of that character.
949cf498
KW
487
488The value of C<flags> determines the behavior when C<s> does not point to a
489well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
524080c4
KW
490zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
491next possible position in C<s> that could begin a non-malformed character.
492Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
949cf498
KW
493
494Various ALLOW flags can be set in C<flags> to allow (and not warn on)
495individual types of malformations, such as the sequence being overlong (that
496is, when there is a shorter sequence that can express the same code point;
497overlong sequences are expressly forbidden in the UTF-8 standard due to
498potential security issues). Another malformation example is the first byte of
499a character not being a legal first byte. See F<utf8.h> for the list of such
524080c4
KW
500flags. For allowed 0 length strings, this function returns 0; for allowed
501overlong sequences, the computed code point is returned; for all other allowed
502malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
503determinable reasonable value.
949cf498
KW
504
505The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
506flags) malformation is found. If this flag is set, the routine assumes that
507the caller will raise a warning, and this function will silently just set
d088425d
KW
508C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
509
510Note that this API requires disambiguation between successful decoding a NUL
511character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as
512in both cases, 0 is returned. To disambiguate, upon a zero return, see if the
513first byte of C<s> is 0 as well. If so, the input was a NUL; if not, the input
514had an error.
949cf498
KW
515
516Certain code points are considered problematic. These are Unicode surrogates,
746afd53 517Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 518By default these are considered regular code points, but certain situations
5eafe189 519warrant special handling for them. If C<flags> contains
949cf498
KW
520UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
521malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE,
522UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
523maximum) can be set to disallow these categories individually.
524
525The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
526UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
527for their respective categories, but otherwise the code points are considered
528valid (not malformations). To get a category to both be treated as a
529malformation and raise a warning, specify both the WARN and DISALLOW flags.
530(But note that warnings are not raised if lexically disabled nor if
531UTF8_CHECK_ONLY is also specified.)
532
533Very large code points (above 0x7FFF_FFFF) are considered more problematic than
534the others that are above the Unicode legal maximum. There are several
eb83ed87
KW
535reasons: they requre at least 32 bits to represent them on ASCII platforms, are
536not representable at all on EBCDIC platforms, and the original UTF-8
537specification never went above this number (the current 0x10FFFF limit was
538imposed later). (The smaller ones, those that fit into 32 bits, are
539representable by a UV on ASCII platforms, but not by an IV, which means that
540the number of operations that can be performed on them is quite restricted.)
541The UTF-8 encoding on ASCII platforms for these large code points begins with a
542byte containing 0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to
543be treated as malformations, while allowing smaller above-Unicode code points.
544(Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
545including these, as malformations.) Similarly, UTF8_WARN_FE_FF acts just like
546the other WARN flags, but applies just to these code points.
949cf498
KW
547
548All other code points corresponding to Unicode characters, including private
549use and those yet to be assigned, are never considered malformed and never
550warn.
67e989fb 551
ec5f19d0 552Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
9041c2e3 553
37607a96
PK
554=cut
555*/
67e989fb 556
a0ed51b3 557UV
7fc63493 558Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 559{
97aff369 560 dVAR;
d4c19fe8 561 const U8 * const s0 = s;
eb83ed87 562 U8 overflow_byte = '\0'; /* Save byte in case of overflow */
0b8d30e8 563 U8 * send;
eb83ed87
KW
564 UV uv = *s;
565 STRLEN expectlen;
949cf498 566 SV* sv = NULL;
eb83ed87
KW
567 UV outlier_ret = 0; /* return value when input is in error or problematic
568 */
569 UV pack_warn = 0; /* Save result of packWARN() for later */
570 bool unexpected_non_continuation = FALSE;
571 bool overflowed = FALSE;
2f8f112e 572 bool do_overlong_test = TRUE; /* May have to skip this test */
a0dbb045 573
eb83ed87 574 const char* const malformed_text = "Malformed UTF-8 character";
7918f24d 575
eb83ed87 576 PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
a0dbb045 577
eb83ed87
KW
578 /* The order of malformation tests here is important. We should consume as
579 * few bytes as possible in order to not skip any valid character. This is
580 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
581 * http://unicode.org/reports/tr36 for more discussion as to why. For
582 * example, once we've done a UTF8SKIP, we can tell the expected number of
583 * bytes, and could fail right off the bat if the input parameters indicate
584 * that there are too few available. But it could be that just that first
585 * byte is garbled, and the intended character occupies fewer bytes. If we
586 * blindly assumed that the first byte is correct, and skipped based on
587 * that number, we could skip over a valid input character. So instead, we
588 * always examine the sequence byte-by-byte.
589 *
590 * We also should not consume too few bytes, otherwise someone could inject
591 * things. For example, an input could be deliberately designed to
592 * overflow, and if this code bailed out immediately upon discovering that,
593 * returning to the caller *retlen pointing to the very next byte (one
594 * which is actually part of of the overflowing sequence), that could look
595 * legitimate to the caller, which could discard the initial partial
596 * sequence and process the rest, inappropriately */
597
598 /* Zero length strings, if allowed, of necessity are zero */
b5b9af04 599 if (UNLIKELY(curlen == 0)) {
eb83ed87
KW
600 if (retlen) {
601 *retlen = 0;
602 }
a0dbb045 603
eb83ed87
KW
604 if (flags & UTF8_ALLOW_EMPTY) {
605 return 0;
606 }
607 if (! (flags & UTF8_CHECK_ONLY)) {
608 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
609 }
0c443dc2
JH
610 goto malformed;
611 }
612
eb83ed87
KW
613 expectlen = UTF8SKIP(s);
614
615 /* A well-formed UTF-8 character, as the vast majority of calls to this
616 * function will be for, has this expected length. For efficiency, set
617 * things up here to return it. It will be overriden only in those rare
618 * cases where a malformation is found */
619 if (retlen) {
620 *retlen = expectlen;
621 }
622
623 /* An invariant is trivially well-formed */
1d72bdf6 624 if (UTF8_IS_INVARIANT(uv)) {
38bce45e 625 return uv;
a0ed51b3 626 }
67e989fb 627
eb83ed87 628 /* A continuation character can't start a valid sequence */
b5b9af04 629 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
eb83ed87
KW
630 if (flags & UTF8_ALLOW_CONTINUATION) {
631 if (retlen) {
632 *retlen = 1;
633 }
634 return UNICODE_REPLACEMENT;
635 }
ba210ebe 636
eb83ed87
KW
637 if (! (flags & UTF8_CHECK_ONLY)) {
638 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
639 }
640 curlen = 1;
ba210ebe
JH
641 goto malformed;
642 }
9041c2e3 643
dcd27b3c
KW
644 /* Here is not a continuation byte, nor an invariant. The only thing left
645 * is a start byte (possibly for an overlong) */
646
1d72bdf6 647#ifdef EBCDIC
bc3632a8 648 uv = NATIVE_UTF8_TO_I8(uv);
1d72bdf6
NIS
649#endif
650
eb83ed87
KW
651 /* Remove the leading bits that indicate the number of bytes in the
652 * character's whole UTF-8 sequence, leaving just the bits that are part of
653 * the value */
654 uv &= UTF_START_MASK(expectlen);
ba210ebe 655
eb83ed87
KW
656 /* Now, loop through the remaining bytes in the character's sequence,
657 * accumulating each into the working value as we go. Be sure to not look
658 * past the end of the input string */
0b8d30e8
KW
659 send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
660
eb83ed87 661 for (s = s0 + 1; s < send; s++) {
b5b9af04 662 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
eb83ed87
KW
663#ifndef EBCDIC /* Can't overflow in EBCDIC */
664 if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
665
666 /* The original implementors viewed this malformation as more
667 * serious than the others (though I, khw, don't understand
668 * why, since other malformations also give very very wrong
669 * results), so there is no way to turn off checking for it.
670 * Set a flag, but keep going in the loop, so that we absorb
671 * the rest of the bytes that comprise the character. */
672 overflowed = TRUE;
673 overflow_byte = *s; /* Save for warning message's use */
674 }
675#endif
8850bf83 676 uv = UTF8_ACCUMULATE(uv, *s);
eb83ed87
KW
677 }
678 else {
679 /* Here, found a non-continuation before processing all expected
680 * bytes. This byte begins a new character, so quit, even if
681 * allowing this malformation. */
682 unexpected_non_continuation = TRUE;
683 break;
684 }
685 } /* End of loop through the character's bytes */
686
687 /* Save how many bytes were actually in the character */
688 curlen = s - s0;
689
690 /* The loop above finds two types of malformations: non-continuation and/or
691 * overflow. The non-continuation malformation is really a too-short
692 * malformation, as it means that the current character ended before it was
693 * expected to (being terminated prematurely by the beginning of the next
694 * character, whereas in the too-short malformation there just are too few
695 * bytes available to hold the character. In both cases, the check below
696 * that we have found the expected number of bytes would fail if executed.)
697 * Thus the non-continuation malformation is really unnecessary, being a
698 * subset of the too-short malformation. But there may be existing
699 * applications that are expecting the non-continuation type, so we retain
700 * it, and return it in preference to the too-short malformation. (If this
701 * code were being written from scratch, the two types might be collapsed
702 * into one.) I, khw, am also giving priority to returning the
703 * non-continuation and too-short malformations over overflow when multiple
704 * ones are present. I don't know of any real reason to prefer one over
705 * the other, except that it seems to me that multiple-byte errors trumps
706 * errors from a single byte */
b5b9af04 707 if (UNLIKELY(unexpected_non_continuation)) {
eb83ed87
KW
708 if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
709 if (! (flags & UTF8_CHECK_ONLY)) {
710 if (curlen == 1) {
711 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
712 }
713 else {
714 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
a0dbb045
JH
715 }
716 }
eb83ed87
KW
717 goto malformed;
718 }
719 uv = UNICODE_REPLACEMENT;
2f8f112e
KW
720
721 /* Skip testing for overlongs, as the REPLACEMENT may not be the same
722 * as what the original expectations were. */
723 do_overlong_test = FALSE;
eb83ed87
KW
724 if (retlen) {
725 *retlen = curlen;
726 }
727 }
b5b9af04 728 else if (UNLIKELY(curlen < expectlen)) {
eb83ed87
KW
729 if (! (flags & UTF8_ALLOW_SHORT)) {
730 if (! (flags & UTF8_CHECK_ONLY)) {
731 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
a0dbb045 732 }
eb83ed87
KW
733 goto malformed;
734 }
735 uv = UNICODE_REPLACEMENT;
2f8f112e 736 do_overlong_test = FALSE;
eb83ed87
KW
737 if (retlen) {
738 *retlen = curlen;
739 }
740 }
741
742#ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */
2f8f112e 743 if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */
eb83ed87
KW
744 && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
745 {
746 /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
747 * generation of the sv, since no warnings are raised under CHECK */
748 if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF
749 && ckWARN_d(WARN_UTF8))
750 {
42303544
KW
751 /* This message is deliberately not of the same syntax as the other
752 * messages for malformations, for backwards compatibility in the
753 * unlikely event that code is relying on its precise earlier text
754 */
eb83ed87
KW
755 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0));
756 pack_warn = packWARN(WARN_UTF8);
757 }
758 if (flags & UTF8_DISALLOW_FE_FF) {
759 goto malformed;
ba210ebe 760 }
ba210ebe 761 }
b5b9af04 762 if (UNLIKELY(overflowed)) {
ba210ebe 763
eb83ed87
KW
764 /* If the first byte is FF, it will overflow a 32-bit word. If the
765 * first byte is FE, it will overflow a signed 32-bit word. The
766 * above preserves backward compatibility, since its message was used
767 * in earlier versions of this code in preference to overflow */
768 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
ba210ebe 769 goto malformed;
eb83ed87
KW
770 }
771#endif
772
2f8f112e
KW
773 if (do_overlong_test
774 && expectlen > (STRLEN)UNISKIP(uv)
775 && ! (flags & UTF8_ALLOW_LONG))
776 {
eb83ed87
KW
777 /* The overlong malformation has lower precedence than the others.
778 * Note that if this malformation is allowed, we return the actual
779 * value, instead of the replacement character. This is because this
780 * value is actually well-defined. */
781 if (! (flags & UTF8_CHECK_ONLY)) {
782 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), *s0));
783 }
784 goto malformed;
785 }
786
787 /* Here, the input is considered to be well-formed , but could be a
788 * problematic code point that is not allowed by the input parameters. */
789 if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
790 && (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
791 |UTF8_WARN_ILLEGAL_INTERCHANGE)))
792 {
949cf498 793 if (UNICODE_IS_SURROGATE(uv)) {
eb83ed87
KW
794 if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
795 && ckWARN2_d(WARN_UTF8, WARN_SURROGATE))
796 {
111d382d 797 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
eb83ed87 798 pack_warn = packWARN2(WARN_UTF8, WARN_SURROGATE);
949cf498
KW
799 }
800 if (flags & UTF8_DISALLOW_SURROGATE) {
801 goto disallowed;
802 }
803 }
949cf498 804 else if ((uv > PERL_UNICODE_MAX)) {
eb83ed87
KW
805 if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
806 && ckWARN2_d(WARN_UTF8, WARN_NON_UNICODE))
807 {
111d382d 808 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
eb83ed87 809 pack_warn = packWARN2(WARN_UTF8, WARN_NON_UNICODE);
949cf498
KW
810 }
811 if (flags & UTF8_DISALLOW_SUPER) {
812 goto disallowed;
813 }
814 }
4190d317
KW
815 else if (UNICODE_IS_NONCHAR(uv)) {
816 if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
817 && ckWARN2_d(WARN_UTF8, WARN_NONCHAR))
818 {
819 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
820 pack_warn = packWARN2(WARN_UTF8, WARN_NONCHAR);
821 }
822 if (flags & UTF8_DISALLOW_NONCHAR) {
823 goto disallowed;
824 }
825 }
949cf498 826
eb83ed87
KW
827 if (sv) {
828 outlier_ret = uv;
829 goto do_warn;
830 }
831
949cf498
KW
832 /* Here, this is not considered a malformed character, so drop through
833 * to return it */
a0ed51b3 834 }
ba210ebe 835
a0ed51b3 836 return uv;
ba210ebe 837
eb83ed87
KW
838 /* There are three cases which get to beyond this point. In all 3 cases:
839 * <sv> if not null points to a string to print as a warning.
840 * <curlen> is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
841 * set.
842 * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
843 * This is done by initializing it to 0, and changing it only
844 * for case 1).
845 * The 3 cases are:
846 * 1) The input is valid but problematic, and to be warned about. The
847 * return value is the resultant code point; <*retlen> is set to
848 * <curlen>, the number of bytes that comprise the code point.
849 * <pack_warn> contains the result of packWARN() for the warning
850 * types. The entry point for this case is the label <do_warn>;
851 * 2) The input is a valid code point but disallowed by the parameters to
852 * this function. The return value is 0. If UTF8_CHECK_ONLY is set,
853 * <*relen> is -1; otherwise it is <curlen>, the number of bytes that
854 * comprise the code point. <pack_warn> contains the result of
855 * packWARN() for the warning types. The entry point for this case is
856 * the label <disallowed>.
857 * 3) The input is malformed. The return value is 0. If UTF8_CHECK_ONLY
858 * is set, <*relen> is -1; otherwise it is <curlen>, the number of
859 * bytes that comprise the malformation. All such malformations are
860 * assumed to be warning type <utf8>. The entry point for this case
861 * is the label <malformed>.
862 */
949cf498 863
ba210ebe
JH
864malformed:
865
eb83ed87
KW
866 if (sv && ckWARN_d(WARN_UTF8)) {
867 pack_warn = packWARN(WARN_UTF8);
868 }
869
870disallowed:
871
fcc8fcf6 872 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 873 if (retlen)
10edeb5d 874 *retlen = ((STRLEN) -1);
ba210ebe
JH
875 return 0;
876 }
877
eb83ed87 878do_warn:
5b311467 879
eb83ed87
KW
880 if (pack_warn) { /* <pack_warn> was initialized to 0, and changed only
881 if warnings are to be raised. */
f555bc63 882 const char * const string = SvPVX_const(sv);
a0dbb045 883
f555bc63
KW
884 if (PL_op)
885 Perl_warner(aTHX_ pack_warn, "%s in %s", string, OP_DESC(PL_op));
886 else
887 Perl_warner(aTHX_ pack_warn, "%s", string);
a0dbb045
JH
888 }
889
eb83ed87
KW
890 if (retlen) {
891 *retlen = curlen;
892 }
ba210ebe 893
eb83ed87 894 return outlier_ret;
a0ed51b3
LW
895}
896
8e84507e 897/*
ec5f19d0
KW
898=for apidoc utf8_to_uvchr_buf
899
900Returns the native code point of the first character in the string C<s> which
901is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 902C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 903
524080c4
KW
904If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
905enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
173db420
KW
906NULL) to -1. If those warnings are off, the computed value, if well-defined
907(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
908C<*retlen> is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is
909the next possible position in C<s> that could begin a non-malformed character.
910See L</utf8n_to_uvuni> for details on when the REPLACEMENT CHARACTER is
911returned.
ec5f19d0
KW
912
913=cut
914*/
915
916
917UV
918Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
919{
920 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
921
922 assert(s < send);
923
924 return utf8n_to_uvchr(s, send - s, retlen,
925 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
926}
927
27d6c58a 928/* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
3986bb7c 929 * there are no malformations in the input UTF-8 string C<s>. surrogates,
6dd9dce9
KW
930 * non-character code points, and non-Unicode code points are allowed. A macro
931 * in utf8.h is used to normally avoid this function wrapper */
27d6c58a
KW
932
933UV
934Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
935{
010ab96b
KW
936 UV expectlen = UTF8SKIP(s);
937 const U8* send = s + expectlen;
938 UV uv = NATIVE_UTF8_TO_I8(*s);
3986bb7c 939
27d6c58a
KW
940 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
941
010ab96b
KW
942 if (retlen) {
943 *retlen = expectlen;
944 }
945
946 /* An invariant is trivially returned */
947 if (expectlen == 1) {
948 return LATIN1_TO_NATIVE(uv);
949 }
950
951 /* Remove the leading bits that indicate the number of bytes, leaving just
952 * the bits that are part of the value */
953 uv &= UTF_START_MASK(expectlen);
954
955 /* Now, loop through the remaining bytes, accumulating each into the
956 * working total as we go. (I khw tried unrolling the loop for up to 4
957 * bytes, but there was no performance improvement) */
958 for (++s; s < send; s++) {
959 uv = UTF8_ACCUMULATE(uv, *s);
960 }
961
3986bb7c 962 return UNI_TO_NATIVE(uv);
010ab96b 963
27d6c58a
KW
964}
965
ec5f19d0 966/*
87cea99e 967=for apidoc utf8_to_uvchr
9041c2e3 968
6ee84de2 969Returns the native code point of the first character in the string C<s>
1e54db1a 970which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
971length, in bytes, of that character.
972
4b88fb76 973Some, but not all, UTF-8 malformations are detected, and in fact, some
977c1d31
KW
974malformed input could cause reading beyond the end of the input buffer, which
975is why this function is deprecated. Use L</utf8_to_uvchr_buf> instead.
4b88fb76 976
524080c4
KW
977If C<s> points to one of the detected malformations, and UTF8 warnings are
978enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
979NULL) to -1. If those warnings are off, the computed value if well-defined (or
980the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
981is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
982next possible position in C<s> that could begin a non-malformed character.
983See L</utf8n_to_uvuni> for details on when the REPLACEMENT CHARACTER is returned.
9041c2e3
NIS
984
985=cut
986*/
987
988UV
7fc63493 989Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 990{
7918f24d
NC
991 PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
992
2ff6c191 993 return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
9041c2e3
NIS
994}
995
996/*
ec5f19d0
KW
997=for apidoc utf8_to_uvuni_buf
998
4f83cdcd
KW
999Only in very rare circumstances should code need to be dealing in the Unicode
1000code point. Use L</utf8_to_uvchr_buf> instead.
1001
1002Returns the Unicode (not-native) code point of the first character in the
1003string C<s> which
ec5f19d0
KW
1004is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1005C<retlen> will be set to the length, in bytes, of that character.
1006
524080c4
KW
1007If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1008enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1009NULL) to -1. If those warnings are off, the computed value if well-defined (or
1010the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1011is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1012next possible position in C<s> that could begin a non-malformed character.
1013See L</utf8n_to_uvuni> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0
KW
1014
1015=cut
1016*/
1017
1018UV
1019Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1020{
1021 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1022
1023 assert(send > s);
1024
1025 /* Call the low level routine asking for checks */
1026 return Perl_utf8n_to_uvuni(aTHX_ s, send -s, retlen,
1027 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1028}
1029
5495102a
KW
1030/* DEPRECATED!
1031 * Like L</utf8_to_uvuni_buf>(), but should only be called when it is known that
2114036c 1032 * there are no malformations in the input UTF-8 string C<s>. Surrogates,
3986bb7c 1033 * non-character code points, and non-Unicode code points are allowed */
27d6c58a
KW
1034
1035UV
1036Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
1037{
1038 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI;
1039
010ab96b 1040 return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
27d6c58a
KW
1041}
1042
ec5f19d0 1043/*
87cea99e 1044=for apidoc utf8_to_uvuni
9041c2e3
NIS
1045
1046Returns the Unicode code point of the first character in the string C<s>
1e54db1a 1047which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
1048length, in bytes, of that character.
1049
4b88fb76 1050Some, but not all, UTF-8 malformations are detected, and in fact, some
977c1d31 1051malformed input could cause reading beyond the end of the input buffer, which
4f83cdcd
KW
1052is one reason why this function is deprecated. The other is that only in
1053extremely limited circumstances should the Unicode versus native code point be
1054of any interest to you. Use L</utf8_to_uvchr_buf> instead.
9041c2e3 1055
524080c4
KW
1056If C<s> points to one of the detected malformations, and UTF8 warnings are
1057enabled, zero is returned and C<*retlen> is set (if C<retlen> doesn't point to
1058NULL) to -1. If those warnings are off, the computed value if well-defined (or
1059the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1060is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1061next possible position in C<s> that could begin a non-malformed character.
1062See L</utf8n_to_uvuni> for details on when the REPLACEMENT CHARACTER is returned.
8e84507e
NIS
1063
1064=cut
1065*/
1066
1067UV
7fc63493 1068Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 1069{
7918f24d
NC
1070 PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
1071
5495102a 1072 return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
8e84507e
NIS
1073}
1074
b76347f2 1075/*
87cea99e 1076=for apidoc utf8_length
b76347f2
JH
1077
1078Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
1079Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
1080up past C<e>, croaks.
b76347f2
JH
1081
1082=cut
1083*/
1084
1085STRLEN
35a4481c 1086Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 1087{
97aff369 1088 dVAR;
b76347f2
JH
1089 STRLEN len = 0;
1090
7918f24d
NC
1091 PERL_ARGS_ASSERT_UTF8_LENGTH;
1092
8850bf83
JH
1093 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1094 * the bitops (especially ~) can create illegal UTF-8.
1095 * In other words: in Perl UTF-8 is not just for Unicode. */
1096
a3b680e6
AL
1097 if (e < s)
1098 goto warn_and_return;
b76347f2 1099 while (s < e) {
4cbf4130 1100 s += UTF8SKIP(s);
8e91ec7f
AV
1101 len++;
1102 }
1103
1104 if (e != s) {
1105 len--;
1106 warn_and_return:
9b387841
NC
1107 if (PL_op)
1108 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1109 "%s in %s", unees, OP_DESC(PL_op));
1110 else
61a12c31 1111 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
1112 }
1113
1114 return len;
1115}
1116
b06226ff 1117/*
87cea99e 1118=for apidoc utf8_distance
b06226ff 1119
1e54db1a 1120Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
1121and C<b>.
1122
1123WARNING: use only if you *know* that the pointers point inside the
1124same UTF-8 buffer.
1125
37607a96
PK
1126=cut
1127*/
a0ed51b3 1128
02eb7b47 1129IV
35a4481c 1130Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 1131{
7918f24d
NC
1132 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1133
bf1665bc 1134 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3
LW
1135}
1136
b06226ff 1137/*
87cea99e 1138=for apidoc utf8_hop
b06226ff 1139
8850bf83
JH
1140Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1141forward or backward.
b06226ff
JH
1142
1143WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
1144the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1145on the first byte of character or just after the last byte of a character.
b06226ff 1146
37607a96
PK
1147=cut
1148*/
a0ed51b3
LW
1149
1150U8 *
4373e329 1151Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 1152{
7918f24d
NC
1153 PERL_ARGS_ASSERT_UTF8_HOP;
1154
96a5add6 1155 PERL_UNUSED_CONTEXT;
8850bf83
JH
1156 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1157 * the bitops (especially ~) can create illegal UTF-8.
1158 * In other words: in Perl UTF-8 is not just for Unicode. */
1159
a0ed51b3
LW
1160 if (off >= 0) {
1161 while (off--)
1162 s += UTF8SKIP(s);
1163 }
1164 else {
1165 while (off++) {
1166 s--;
8850bf83
JH
1167 while (UTF8_IS_CONTINUATION(*s))
1168 s--;
a0ed51b3
LW
1169 }
1170 }
4373e329 1171 return (U8 *)s;
a0ed51b3
LW
1172}
1173
6940069f 1174/*
fed3ba5d
NC
1175=for apidoc bytes_cmp_utf8
1176
a1433954
KW
1177Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
1178sequence of characters (stored as UTF-8) in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
1179equal, -1 or -2 if the first string is less than the second string, +1 or +2
1180if the first string is greater than the second string.
1181
1182-1 or +1 is returned if the shorter string was identical to the start of the
1183longer string. -2 or +2 is returned if the was a difference between characters
1184within the strings.
1185
1186=cut
1187*/
1188
1189int
1190Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1191{
1192 const U8 *const bend = b + blen;
1193 const U8 *const uend = u + ulen;
1194
1195 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
1196
1197 PERL_UNUSED_CONTEXT;
1198
1199 while (b < bend && u < uend) {
1200 U8 c = *u++;
1201 if (!UTF8_IS_INVARIANT(c)) {
1202 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1203 if (u < uend) {
1204 U8 c1 = *u++;
1205 if (UTF8_IS_CONTINUATION(c1)) {
94bb8c36 1206 c = TWO_BYTE_UTF8_TO_NATIVE(c, c1);
fed3ba5d
NC
1207 } else {
1208 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1209 "Malformed UTF-8 character "
1210 "(unexpected non-continuation byte 0x%02x"
1211 ", immediately after start byte 0x%02x)"
1212 /* Dear diag.t, it's in the pod. */
1213 "%s%s", c1, c,
1214 PL_op ? " in " : "",
1215 PL_op ? OP_DESC(PL_op) : "");
1216 return -2;
1217 }
1218 } else {
1219 if (PL_op)
1220 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1221 "%s in %s", unees, OP_DESC(PL_op));
1222 else
61a12c31 1223 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
1224 return -2; /* Really want to return undef :-) */
1225 }
1226 } else {
1227 return -2;
1228 }
1229 }
1230 if (*b != c) {
1231 return *b < c ? -2 : +2;
1232 }
1233 ++b;
1234 }
1235
1236 if (b == bend && u == uend)
1237 return 0;
1238
1239 return b < bend ? +1 : -1;
1240}
1241
1242/*
87cea99e 1243=for apidoc utf8_to_bytes
6940069f 1244
2bbc8d55 1245Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954
KW
1246Unlike L</bytes_to_utf8>, this over-writes the original string, and
1247updates C<len> to contain the new length.
67e989fb 1248Returns zero on failure, setting C<len> to -1.
6940069f 1249
a1433954 1250If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 1251
6940069f
GS
1252=cut
1253*/
1254
1255U8 *
37607a96 1256Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 1257{
d4c19fe8
AL
1258 U8 * const save = s;
1259 U8 * const send = s + *len;
6940069f 1260 U8 *d;
246fae53 1261
7918f24d
NC
1262 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
1263
1e54db1a 1264 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 1265 while (s < send) {
dcad2880
JH
1266 U8 c = *s++;
1267
1d72bdf6
NIS
1268 if (!UTF8_IS_INVARIANT(c) &&
1269 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
1270 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
10edeb5d 1271 *len = ((STRLEN) -1);
dcad2880
JH
1272 return 0;
1273 }
246fae53 1274 }
dcad2880
JH
1275
1276 d = s = save;
6940069f 1277 while (s < send) {
ed646e6e 1278 STRLEN ulen;
4b88fb76 1279 *d++ = (U8)utf8_to_uvchr_buf(s, send, &ulen);
ed646e6e 1280 s += ulen;
6940069f
GS
1281 }
1282 *d = '\0';
246fae53 1283 *len = d - save;
6940069f
GS
1284 return save;
1285}
1286
1287/*
87cea99e 1288=for apidoc bytes_from_utf8
f9a63242 1289
2bbc8d55 1290Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954 1291Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
1292the newly-created string, and updates C<len> to contain the new
1293length. Returns the original string if no conversion occurs, C<len>
1294is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55
SP
12950 if C<s> is converted or consisted entirely of characters that are invariant
1296in utf8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 1297
37607a96
PK
1298=cut
1299*/
f9a63242
JH
1300
1301U8 *
e1ec3a88 1302Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 1303{
f9a63242 1304 U8 *d;
e1ec3a88
AL
1305 const U8 *start = s;
1306 const U8 *send;
f9a63242
JH
1307 I32 count = 0;
1308
7918f24d
NC
1309 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
1310
96a5add6 1311 PERL_UNUSED_CONTEXT;
f9a63242 1312 if (!*is_utf8)
73d840c0 1313 return (U8 *)start;
f9a63242 1314
1e54db1a 1315 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 1316 for (send = s + *len; s < send;) {
e1ec3a88 1317 U8 c = *s++;
1d72bdf6 1318 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
1319 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
1320 (c = *s++) && UTF8_IS_CONTINUATION(c))
1321 count++;
1322 else
73d840c0 1323 return (U8 *)start;
db42d148 1324 }
f9a63242
JH
1325 }
1326
35da51f7 1327 *is_utf8 = FALSE;
f9a63242 1328
212542aa 1329 Newx(d, (*len) - count + 1, U8);
ef9edfd0 1330 s = start; start = d;
f9a63242
JH
1331 while (s < send) {
1332 U8 c = *s++;
c4d5f83a
NIS
1333 if (!UTF8_IS_INVARIANT(c)) {
1334 /* Then it is two-byte encoded */
94bb8c36 1335 c = TWO_BYTE_UTF8_TO_NATIVE(c, *s++);
c4d5f83a
NIS
1336 }
1337 *d++ = c;
f9a63242
JH
1338 }
1339 *d = '\0';
1340 *len = d - start;
73d840c0 1341 return (U8 *)start;
f9a63242
JH
1342}
1343
1344/*
87cea99e 1345=for apidoc bytes_to_utf8
6940069f 1346
ff97e5cf
KW
1347Converts a string C<s> of length C<len> bytes from the native encoding into
1348UTF-8.
6662521e 1349Returns a pointer to the newly-created string, and sets C<len> to
ff97e5cf 1350reflect the new length in bytes.
6940069f 1351
2bbc8d55
SP
1352A NUL character will be written after the end of the string.
1353
1354If you want to convert to UTF-8 from encodings other than
1355the native (Latin1 or EBCDIC),
a1433954 1356see L</sv_recode_to_utf8>().
c9ada85f 1357
497711e7 1358=cut
6940069f
GS
1359*/
1360
c682ebef
FC
1361/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1362 likewise need duplication. */
1363
6940069f 1364U8*
35a4481c 1365Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 1366{
35a4481c 1367 const U8 * const send = s + (*len);
6940069f
GS
1368 U8 *d;
1369 U8 *dst;
7918f24d
NC
1370
1371 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 1372 PERL_UNUSED_CONTEXT;
6940069f 1373
212542aa 1374 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
1375 dst = d;
1376
1377 while (s < send) {
55d09dc8
KW
1378 append_utf8_from_native_byte(*s, &d);
1379 s++;
6940069f
GS
1380 }
1381 *d = '\0';
6662521e 1382 *len = d-dst;
6940069f
GS
1383 return dst;
1384}
1385
a0ed51b3 1386/*
dea0fc0b 1387 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
1388 *
1389 * Destination must be pre-extended to 3/2 source. Do not use in-place.
1390 * We optimize for native, for obvious reasons. */
1391
1392U8*
dea0fc0b 1393Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 1394{
dea0fc0b
JH
1395 U8* pend;
1396 U8* dstart = d;
1397
7918f24d
NC
1398 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1399
dea0fc0b 1400 if (bytelen & 1)
f5992bc4 1401 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
1402
1403 pend = p + bytelen;
1404
a0ed51b3 1405 while (p < pend) {
dea0fc0b
JH
1406 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1407 p += 2;
a0ed51b3 1408 if (uv < 0x80) {
eb160463 1409 *d++ = (U8)uv;
a0ed51b3
LW
1410 continue;
1411 }
1412 if (uv < 0x800) {
eb160463
GS
1413 *d++ = (U8)(( uv >> 6) | 0xc0);
1414 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1415 continue;
1416 }
46956fad
KW
1417#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1418#define LAST_HIGH_SURROGATE 0xDBFF
1419#define FIRST_LOW_SURROGATE 0xDC00
1420#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
1421 if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) {
01ea242b 1422 if (p >= pend) {
dea0fc0b 1423 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
01ea242b
NC
1424 } else {
1425 UV low = (p[0] << 8) + p[1];
1426 p += 2;
46956fad 1427 if (low < FIRST_LOW_SURROGATE || low > LAST_LOW_SURROGATE)
01ea242b 1428 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
46956fad
KW
1429 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
1430 + (low - FIRST_LOW_SURROGATE) + 0x10000;
01ea242b 1431 }
46956fad 1432 } else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) {
dbde1951 1433 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
1434 }
1435 if (uv < 0x10000) {
eb160463
GS
1436 *d++ = (U8)(( uv >> 12) | 0xe0);
1437 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1438 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1439 continue;
1440 }
1441 else {
eb160463
GS
1442 *d++ = (U8)(( uv >> 18) | 0xf0);
1443 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1444 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1445 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1446 continue;
1447 }
1448 }
dea0fc0b 1449 *newlen = d - dstart;
a0ed51b3
LW
1450 return d;
1451}
1452
1453/* Note: this one is slightly destructive of the source. */
1454
1455U8*
dea0fc0b 1456Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
1457{
1458 U8* s = (U8*)p;
d4c19fe8 1459 U8* const send = s + bytelen;
7918f24d
NC
1460
1461 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1462
e0ea5e2d
NC
1463 if (bytelen & 1)
1464 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1465 (UV)bytelen);
1466
a0ed51b3 1467 while (s < send) {
d4c19fe8 1468 const U8 tmp = s[0];
a0ed51b3
LW
1469 s[0] = s[1];
1470 s[1] = tmp;
1471 s += 2;
1472 }
dea0fc0b 1473 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
1474}
1475
922e8cb4
KW
1476bool
1477Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
1478{
1479 U8 tmpbuf[UTF8_MAXBYTES+1];
1480 uvchr_to_utf8(tmpbuf, c);
1481 return _is_utf8_FOO(classnum, tmpbuf);
1482}
1483
c3fd2246
KW
1484/* for now these are all defined (inefficiently) in terms of the utf8 versions.
1485 * Note that the macros in handy.h that call these short-circuit calling them
1486 * for Latin-1 range inputs */
a0ed51b3
LW
1487
1488bool
84afefe6 1489Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 1490{
89ebb4a3 1491 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1492 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1493 return _is_utf8_FOO(_CC_WORDCHAR, tmpbuf);
a0ed51b3
LW
1494}
1495
1496bool
5092f92a
KW
1497Perl_is_uni_alnumc(pTHX_ UV c)
1498{
1499 U8 tmpbuf[UTF8_MAXBYTES+1];
1500 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1501 return _is_utf8_FOO(_CC_ALPHANUMERIC, tmpbuf);
5092f92a
KW
1502}
1503
f9ae8fb6
JD
1504/* Internal function so we can deprecate the external one, and call
1505 this one from other deprecated functions in this file */
1506
1507PERL_STATIC_INLINE bool
61b19385
KW
1508S_is_utf8_idfirst(pTHX_ const U8 *p)
1509{
1510 dVAR;
1511
1512 if (*p == '_')
1513 return TRUE;
1514 /* is_utf8_idstart would be more logical. */
1515 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1516}
1517
5092f92a 1518bool
84afefe6 1519Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1520{
89ebb4a3 1521 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1522 uvchr_to_utf8(tmpbuf, c);
61b19385 1523 return S_is_utf8_idfirst(aTHX_ tmpbuf);
a0ed51b3
LW
1524}
1525
1526bool
eba68aa0
KW
1527Perl__is_uni_perl_idcont(pTHX_ UV c)
1528{
1529 U8 tmpbuf[UTF8_MAXBYTES+1];
1530 uvchr_to_utf8(tmpbuf, c);
1531 return _is_utf8_perl_idcont(tmpbuf);
1532}
1533
1534bool
f91dcd13
KW
1535Perl__is_uni_perl_idstart(pTHX_ UV c)
1536{
1537 U8 tmpbuf[UTF8_MAXBYTES+1];
1538 uvchr_to_utf8(tmpbuf, c);
1539 return _is_utf8_perl_idstart(tmpbuf);
1540}
1541
1542bool
84afefe6 1543Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1544{
89ebb4a3 1545 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1546 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1547 return _is_utf8_FOO(_CC_ALPHA, tmpbuf);
a0ed51b3
LW
1548}
1549
1550bool
84afefe6 1551Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1552{
bc39fe24 1553 return isASCII(c);
4d61ec05
GS
1554}
1555
1556bool
bdd8600f
KW
1557Perl_is_uni_blank(pTHX_ UV c)
1558{
2cafb56b 1559 return isBLANK_uni(c);
bdd8600f
KW
1560}
1561
1562bool
84afefe6 1563Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1564{
add4123a 1565 return isSPACE_uni(c);
a0ed51b3
LW
1566}
1567
1568bool
84afefe6 1569Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1570{
89ebb4a3 1571 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1572 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1573 return _is_utf8_FOO(_CC_DIGIT, tmpbuf);
a0ed51b3
LW
1574}
1575
1576bool
84afefe6 1577Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1578{
89ebb4a3 1579 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1580 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1581 return _is_utf8_FOO(_CC_UPPER, tmpbuf);
a0ed51b3
LW
1582}
1583
1584bool
84afefe6 1585Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1586{
89ebb4a3 1587 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1588 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1589 return _is_utf8_FOO(_CC_LOWER, tmpbuf);
a0ed51b3
LW
1590}
1591
1592bool
84afefe6 1593Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1594{
7b952154 1595 return isCNTRL_L1(c);
b8c5462f
JH
1596}
1597
1598bool
84afefe6 1599Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1600{
89ebb4a3 1601 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1602 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1603 return _is_utf8_FOO(_CC_GRAPH, tmpbuf);
b8c5462f
JH
1604}
1605
1606bool
84afefe6 1607Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1608{
89ebb4a3 1609 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1610 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1611 return _is_utf8_FOO(_CC_PRINT, tmpbuf);
a0ed51b3
LW
1612}
1613
b8c5462f 1614bool
84afefe6 1615Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1616{
89ebb4a3 1617 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1618 uvchr_to_utf8(tmpbuf, c);
922e8cb4 1619 return _is_utf8_FOO(_CC_PUNCT, tmpbuf);
b8c5462f
JH
1620}
1621
4d61ec05 1622bool
84afefe6 1623Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1624{
4ac6419d 1625 return isXDIGIT_uni(c);
4d61ec05
GS
1626}
1627
3a4c58c9
KW
1628UV
1629Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
1630{
1631 /* We have the latin1-range values compiled into the core, so just use
1632 * those, converting the result to utf8. The only difference between upper
1633 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
1634 * either "SS" or "Ss". Which one to use is passed into the routine in
1635 * 'S_or_s' to avoid a test */
1636
1637 UV converted = toUPPER_LATIN1_MOD(c);
1638
1639 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
1640
1641 assert(S_or_s == 'S' || S_or_s == 's');
1642
f4cd282c
KW
1643 if (NATIVE_IS_INVARIANT(converted)) { /* No difference between the two for
1644 characters in this range */
3a4c58c9
KW
1645 *p = (U8) converted;
1646 *lenp = 1;
1647 return converted;
1648 }
1649
1650 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
1651 * which it maps to one of them, so as to only have to have one check for
1652 * it in the main case */
1653 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1654 switch (c) {
1655 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1656 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
1657 break;
1658 case MICRO_SIGN:
1659 converted = GREEK_CAPITAL_LETTER_MU;
1660 break;
1661 case LATIN_SMALL_LETTER_SHARP_S:
1662 *(p)++ = 'S';
1663 *p = S_or_s;
1664 *lenp = 2;
1665 return 'S';
1666 default:
1667 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
118e2215 1668 assert(0); /* NOTREACHED */
3a4c58c9
KW
1669 }
1670 }
1671
1672 *(p)++ = UTF8_TWO_BYTE_HI(converted);
1673 *p = UTF8_TWO_BYTE_LO(converted);
1674 *lenp = 2;
1675
1676 return converted;
1677}
1678
50bda2c3
KW
1679/* Call the function to convert a UTF-8 encoded character to the specified case.
1680 * Note that there may be more than one character in the result.
1681 * INP is a pointer to the first byte of the input character
1682 * OUTP will be set to the first byte of the string of changed characters. It
1683 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
1684 * LENP will be set to the length in bytes of the string of changed characters
1685 *
1686 * The functions return the ordinal of the first character in the string of OUTP */
f90a9a02
KW
1687#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUc")
1688#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTc")
1689#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLc")
50bda2c3
KW
1690
1691/* This additionally has the input parameter SPECIALS, which if non-zero will
1692 * cause this to use the SPECIALS hash for folding (meaning get full case
1693 * folding); otherwise, when zero, this implies a simple case fold */
f90a9a02 1694#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecCf" : NULL)
c3fd2246 1695
84afefe6
JH
1696UV
1697Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1698{
3a4c58c9
KW
1699 dVAR;
1700
a1433954
KW
1701 /* Convert the Unicode character whose ordinal is <c> to its uppercase
1702 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
1703 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
1704 * the changed version may be longer than the original character.
1705 *
1706 * The ordinal of the first character of the changed version is returned
1707 * (but note, as explained above, that there may be more.) */
1708
7918f24d
NC
1709 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1710
3a4c58c9
KW
1711 if (c < 256) {
1712 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
1713 }
1714
0ebc6274 1715 uvchr_to_utf8(p, c);
3a4c58c9 1716 return CALL_UPPER_CASE(p, p, lenp);
a0ed51b3
LW
1717}
1718
84afefe6
JH
1719UV
1720Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1721{
3a4c58c9
KW
1722 dVAR;
1723
7918f24d
NC
1724 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1725
3a4c58c9
KW
1726 if (c < 256) {
1727 return _to_upper_title_latin1((U8) c, p, lenp, 's');
1728 }
1729
0ebc6274 1730 uvchr_to_utf8(p, c);
3a4c58c9 1731 return CALL_TITLE_CASE(p, p, lenp);
a0ed51b3
LW
1732}
1733
afc16117
KW
1734STATIC U8
1735S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
1736{
1737 /* We have the latin1-range values compiled into the core, so just use
1738 * those, converting the result to utf8. Since the result is always just
a1433954 1739 * one character, we allow <p> to be NULL */
afc16117
KW
1740
1741 U8 converted = toLOWER_LATIN1(c);
1742
1743 if (p != NULL) {
f4cd282c 1744 if (NATIVE_IS_INVARIANT(converted)) {
afc16117
KW
1745 *p = converted;
1746 *lenp = 1;
1747 }
1748 else {
1749 *p = UTF8_TWO_BYTE_HI(converted);
1750 *(p+1) = UTF8_TWO_BYTE_LO(converted);
1751 *lenp = 2;
1752 }
1753 }
1754 return converted;
1755}
1756
84afefe6
JH
1757UV
1758Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1759{
968c5e6a
KW
1760 dVAR;
1761
7918f24d
NC
1762 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1763
afc16117
KW
1764 if (c < 256) {
1765 return to_lower_latin1((U8) c, p, lenp);
bca00c02
KW
1766 }
1767
afc16117 1768 uvchr_to_utf8(p, c);
968c5e6a 1769 return CALL_LOWER_CASE(p, p, lenp);
a0ed51b3
LW
1770}
1771
84afefe6 1772UV
51910141 1773Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 1774{
51910141 1775 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 1776 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 1777 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
1778 *
1779 * Not to be used for locale folds
51910141 1780 */
f673fad4 1781
a1dde8de
KW
1782 UV converted;
1783
1784 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
1785
1ca267a5
KW
1786 assert (! (flags & FOLD_FLAGS_LOCALE));
1787
a1dde8de
KW
1788 if (c == MICRO_SIGN) {
1789 converted = GREEK_SMALL_LETTER_MU;
1790 }
51910141 1791 else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
1ca267a5
KW
1792
1793 /* If can't cross 127/128 boundary, can't return "ss"; instead return
1794 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
1795 * under those circumstances. */
1796 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
1797 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
1798 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
1799 p, *lenp, U8);
1800 return LATIN_SMALL_LETTER_LONG_S;
1801 }
1802 else {
4f489194
KW
1803 *(p)++ = 's';
1804 *p = 's';
1805 *lenp = 2;
1806 return 's';
1ca267a5 1807 }
a1dde8de
KW
1808 }
1809 else { /* In this range the fold of all other characters is their lower
1810 case */
1811 converted = toLOWER_LATIN1(c);
1812 }
1813
f4cd282c 1814 if (NATIVE_IS_INVARIANT(converted)) {
a1dde8de
KW
1815 *p = (U8) converted;
1816 *lenp = 1;
1817 }
1818 else {
1819 *(p)++ = UTF8_TWO_BYTE_HI(converted);
1820 *p = UTF8_TWO_BYTE_LO(converted);
1821 *lenp = 2;
1822 }
1823
1824 return converted;
1825}
1826
1827UV
a0270393 1828Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags)
84afefe6 1829{
4b593389 1830
a0270393
KW
1831 /* Not currently externally documented, and subject to change
1832 * <flags> bits meanings:
1833 * FOLD_FLAGS_FULL iff full folding is to be used;
1834 * FOLD_FLAGS_LOCALE iff in locale
1835 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1836 */
4b593389 1837
36bb2ab6 1838 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 1839
a1dde8de 1840 if (c < 256) {
a0270393 1841 UV result = _to_fold_latin1((U8) c, p, lenp,
1ca267a5 1842 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a0270393
KW
1843 /* It is illegal for the fold to cross the 255/256 boundary under
1844 * locale; in this case return the original */
1845 return (result > 256 && flags & FOLD_FLAGS_LOCALE)
1846 ? c
1847 : result;
a1dde8de
KW
1848 }
1849
a0270393
KW
1850 /* If no special needs, just use the macro */
1851 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
1852 uvchr_to_utf8(p, c);
1853 return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
1854 }
1855 else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
1856 the special flags. */
1857 U8 utf8_c[UTF8_MAXBYTES + 1];
1858 uvchr_to_utf8(utf8_c, c);
1859 return _to_utf8_fold_flags(utf8_c, p, lenp, flags, NULL);
1860 }
84afefe6
JH
1861}
1862
a0ed51b3 1863bool
84afefe6 1864Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3 1865{
edfb3318 1866 if (c < 256) {
f4cd282c 1867 return isALNUM_LC(c);
edfb3318 1868 }
922e8cb4 1869 return _is_uni_FOO(_CC_WORDCHAR, c);
a0ed51b3
LW
1870}
1871
1872bool
5092f92a
KW
1873Perl_is_uni_alnumc_lc(pTHX_ UV c)
1874{
1875 if (c < 256) {
f4cd282c 1876 return isALPHANUMERIC_LC(c);
5092f92a 1877 }
922e8cb4 1878 return _is_uni_FOO(_CC_ALPHANUMERIC, c);
5092f92a
KW
1879}
1880
1881bool
84afefe6 1882Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3 1883{
edfb3318 1884 if (c < 256) {
f4cd282c 1885 return isIDFIRST_LC(c);
edfb3318
KW
1886 }
1887 return _is_uni_perl_idstart(c);
a0ed51b3
LW
1888}
1889
1890bool
84afefe6 1891Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3 1892{
edfb3318 1893 if (c < 256) {
f4cd282c 1894 return isALPHA_LC(c);
edfb3318 1895 }
922e8cb4 1896 return _is_uni_FOO(_CC_ALPHA, c);
a0ed51b3
LW
1897}
1898
1899bool
84afefe6 1900Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05 1901{
edfb3318 1902 if (c < 256) {
f4cd282c 1903 return isASCII_LC(c);
edfb3318
KW
1904 }
1905 return 0;
4d61ec05
GS
1906}
1907
1908bool
bdd8600f
KW
1909Perl_is_uni_blank_lc(pTHX_ UV c)
1910{
edfb3318 1911 if (c < 256) {
f4cd282c 1912 return isBLANK_LC(c);
edfb3318 1913 }
61b19385 1914 return isBLANK_uni(c);
bdd8600f
KW
1915}
1916
1917bool
84afefe6 1918Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3 1919{
edfb3318 1920 if (c < 256) {
f4cd282c 1921 return isSPACE_LC(c);
edfb3318 1922 }
61b19385 1923 return isSPACE_uni(c);
a0ed51b3
LW
1924}
1925
1926bool
84afefe6 1927Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3 1928{
edfb3318 1929 if (c < 256) {
f4cd282c 1930 return isDIGIT_LC(c);
edfb3318 1931 }
922e8cb4 1932 return _is_uni_FOO(_CC_DIGIT, c);
a0ed51b3
LW
1933}
1934
1935bool
84afefe6 1936Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3 1937{
edfb3318 1938 if (c < 256) {
f4cd282c 1939 return isUPPER_LC(c);
edfb3318 1940 }
922e8cb4 1941 return _is_uni_FOO(_CC_UPPER, c);
a0ed51b3
LW
1942}
1943
1944bool
84afefe6 1945Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3 1946{
edfb3318 1947 if (c < 256) {
f4cd282c 1948 return isLOWER_LC(c);
edfb3318 1949 }
922e8cb4 1950 return _is_uni_FOO(_CC_LOWER, c);
a0ed51b3
LW
1951}
1952
1953bool
84afefe6 1954Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f 1955{
edfb3318 1956 if (c < 256) {
f4cd282c 1957 return isCNTRL_LC(c);
edfb3318 1958 }
61b19385 1959 return 0;
b8c5462f
JH
1960}
1961
1962bool
84afefe6 1963Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f 1964{
edfb3318 1965 if (c < 256) {
f4cd282c 1966 return isGRAPH_LC(c);
edfb3318 1967 }
922e8cb4 1968 return _is_uni_FOO(_CC_GRAPH, c);
b8c5462f
JH
1969}
1970
1971bool
84afefe6 1972Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3 1973{
edfb3318 1974 if (c < 256) {
f4cd282c 1975 return isPRINT_LC(c);
edfb3318 1976 }
922e8cb4 1977 return _is_uni_FOO(_CC_PRINT, c);
a0ed51b3
LW
1978}
1979
b8c5462f 1980bool
84afefe6 1981Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f 1982{
edfb3318 1983 if (c < 256) {
f4cd282c 1984 return isPUNCT_LC(c);
edfb3318 1985 }
922e8cb4 1986 return _is_uni_FOO(_CC_PUNCT, c);
b8c5462f
JH
1987}
1988
4d61ec05 1989bool
84afefe6 1990Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05 1991{
edfb3318 1992 if (c < 256) {
f4cd282c 1993 return isXDIGIT_LC(c);
edfb3318 1994 }
61b19385 1995 return isXDIGIT_uni(c);
4d61ec05
GS
1996}
1997
b7ac61fa
JH
1998U32
1999Perl_to_uni_upper_lc(pTHX_ U32 c)
2000{
ee099d14
JH
2001 /* XXX returns only the first character -- do not use XXX */
2002 /* XXX no locale support yet */
2003 STRLEN len;
89ebb4a3 2004 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 2005 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
2006}
2007
2008U32
2009Perl_to_uni_title_lc(pTHX_ U32 c)
2010{
ee099d14
JH
2011 /* XXX returns only the first character XXX -- do not use XXX */
2012 /* XXX no locale support yet */
2013 STRLEN len;
89ebb4a3 2014 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 2015 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
2016}
2017
2018U32
2019Perl_to_uni_lower_lc(pTHX_ U32 c)
2020{
ee099d14
JH
2021 /* XXX returns only the first character -- do not use XXX */
2022 /* XXX no locale support yet */
2023 STRLEN len;
89ebb4a3 2024 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 2025 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
2026}
2027
26483009 2028PERL_STATIC_INLINE bool
5141f98e 2029S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
2030 const char *const swashname)
2031{
ea317ccb
KW
2032 /* returns a boolean giving whether or not the UTF8-encoded character that
2033 * starts at <p> is in the swash indicated by <swashname>. <swash>
2034 * contains a pointer to where the swash indicated by <swashname>
2035 * is to be stored; which this routine will do, so that future calls will
2036 * look at <*swash> and only generate a swash if it is not null
2037 *
2038 * Note that it is assumed that the buffer length of <p> is enough to
2039 * contain all the bytes that comprise the character. Thus, <*p> should
2040 * have been checked before this call for mal-formedness enough to assure
2041 * that. */
2042
97aff369 2043 dVAR;
7918f24d
NC
2044
2045 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2046
492a624f 2047 /* The API should have included a length for the UTF-8 character in <p>,
28123549 2048 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
2049 * as far as there being enough bytes available in it to accommodate the
2050 * character without reading beyond the end, and pass that number on to the
2051 * validating routine */
28123549
KW
2052 if (! is_utf8_char_buf(p, p + UTF8SKIP(p))) {
2053 if (ckWARN_d(WARN_UTF8)) {
2054 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
9816f121 2055 "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
28123549
KW
2056 if (ckWARN(WARN_UTF8)) { /* This will output details as to the
2057 what the malformation is */
2058 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
2059 }
2060 }
2061 return FALSE;
2062 }
87367d5f
KW
2063 if (!*swash) {
2064 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2065 *swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags);
2066 }
28123549 2067
bde6a22d
NC
2068 return swash_fetch(*swash, p, TRUE) != 0;
2069}
2070
2071bool
922e8cb4
KW
2072Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
2073{
2074 dVAR;
2075
2076 PERL_ARGS_ASSERT__IS_UTF8_FOO;
2077
2078 assert(classnum < _FIRST_NON_SWASH_CC);
2079
2080 return is_utf8_common(p, &PL_utf8_swash_ptrs[classnum], swash_property_names[classnum]);
2081}
2082
2083bool
7fc63493 2084Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 2085{
97aff369 2086 dVAR;
7918f24d
NC
2087
2088 PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
2089
671c33bf
NC
2090 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
2091 * descendant of isalnum(3), in other words, it doesn't
2092 * contain the '_'. --jhi */
03940dc2 2093 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_WORDCHAR], "IsWord");
a0ed51b3
LW
2094}
2095
2096bool
5092f92a
KW
2097Perl_is_utf8_alnumc(pTHX_ const U8 *p)
2098{
2099 dVAR;
2100
2101 PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
2102
03940dc2 2103 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHANUMERIC], "IsAlnum");
5092f92a
KW
2104}
2105
2106bool
7fc63493 2107Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 2108{
97aff369 2109 dVAR;
7918f24d
NC
2110
2111 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
2112
61b19385 2113 return S_is_utf8_idfirst(aTHX_ p);
82686b01
JH
2114}
2115
2116bool
c11ff943
KW
2117Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */
2118{
2119 dVAR;
2120
2121 PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
2122
2123 if (*p == '_')
2124 return TRUE;
2125 /* is_utf8_idstart would be more logical. */
2126 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
2127}
2128
2129bool
d65654cb 2130Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
b6912c02
KW
2131{
2132 dVAR;
2133
d65654cb 2134 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
b6912c02
KW
2135
2136 return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart");
2137}
2138
2139bool
eba68aa0
KW
2140Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
2141{
2142 dVAR;
2143
2144 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
2145
2146 return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont");
2147}
2148
2149
2150bool
7fc63493 2151Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 2152{
97aff369 2153 dVAR;
7918f24d
NC
2154
2155 PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
2156
d4c19fe8 2157 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
2158}
2159
2160bool
c11ff943
KW
2161Perl_is_utf8_xidcont(pTHX_ const U8 *p)
2162{
2163 dVAR;
2164
2165 PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
2166
c11ff943
KW
2167 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
2168}
2169
2170bool
7fc63493 2171Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 2172{
97aff369 2173 dVAR;
7918f24d
NC
2174
2175 PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
2176
03940dc2 2177 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHA], "IsAlpha");
a0ed51b3
LW
2178}
2179
2180bool
7fc63493 2181Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 2182{
97aff369 2183 dVAR;
7918f24d
NC
2184
2185 PERL_ARGS_ASSERT_IS_UTF8_ASCII;
2186
bc39fe24
KW
2187 /* ASCII characters are the same whether in utf8 or not. So the macro
2188 * works on both utf8 and non-utf8 representations. */
2189 return isASCII(*p);
b8c5462f
JH
2190}
2191
2192bool
bdd8600f
KW
2193Perl_is_utf8_blank(pTHX_ const U8 *p)
2194{
2195 dVAR;
2196
2197 PERL_ARGS_ASSERT_IS_UTF8_BLANK;
2198
2cafb56b 2199 return isBLANK_utf8(p);
bdd8600f
KW
2200}
2201
2202bool
7fc63493 2203Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 2204{
97aff369 2205 dVAR;
7918f24d
NC
2206
2207 PERL_ARGS_ASSERT_IS_UTF8_SPACE;
2208
add4123a 2209 return isSPACE_utf8(p);
a0ed51b3
LW
2210}
2211
2212bool
d1eb3177
YO
2213Perl_is_utf8_perl_space(pTHX_ const U8 *p)
2214{
2215 dVAR;
2216
2217 PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
2218
c4428693
KW
2219 /* Only true if is an ASCII space-like character, and ASCII is invariant
2220 * under utf8, so can just use the macro */
2221 return isSPACE_A(*p);
d1eb3177
YO
2222}
2223
2224bool
2225Perl_is_utf8_perl_word(pTHX_ const U8 *p)
2226{
2227 dVAR;
2228
2229 PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
2230
c4428693
KW
2231 /* Only true if is an ASCII word character, and ASCII is invariant
2232 * under utf8, so can just use the macro */
2233 return isWORDCHAR_A(*p);
d1eb3177
YO
2234}
2235
2236bool
7fc63493 2237Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 2238{
97aff369 2239 dVAR;
7918f24d
NC
2240
2241 PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
2242
03940dc2 2243 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_DIGIT], "IsDigit");
a0ed51b3
LW
2244}
2245
2246bool
d1eb3177
YO
2247Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
2248{
2249 dVAR;
2250
2251 PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
2252
c4428693
KW
2253 /* Only true if is an ASCII digit character, and ASCII is invariant
2254 * under utf8, so can just use the macro */
2255 return isDIGIT_A(*p);
d1eb3177
YO
2256}
2257
2258bool
7fc63493 2259Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 2260{
97aff369 2261 dVAR;
7918f24d
NC
2262
2263 PERL_ARGS_ASSERT_IS_UTF8_UPPER;
2264
03940dc2 2265 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_UPPER], "IsUppercase");
a0ed51b3
LW
2266}
2267
2268bool
7fc63493 2269Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 2270{
97aff369 2271 dVAR;
7918f24d
NC
2272
2273 PERL_ARGS_ASSERT_IS_UTF8_LOWER;
2274
03940dc2 2275 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_LOWER], "IsLowercase");
a0ed51b3
LW
2276}
2277
2278bool
7fc63493 2279Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 2280{
97aff369 2281 dVAR;
7918f24d
NC
2282
2283 PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
2284
a35d759a 2285 return isCNTRL_utf8(p);
b8c5462f
JH
2286}
2287
2288bool
7fc63493 2289Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 2290{
97aff369 2291 dVAR;
7918f24d
NC
2292
2293 PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
2294
03940dc2 2295 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_GRAPH], "IsGraph");
b8c5462f
JH
2296}
2297
2298bool
7fc63493 2299Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 2300{
97aff369 2301 dVAR;
7918f24d
NC
2302
2303 PERL_ARGS_ASSERT_IS_UTF8_PRINT;
2304
03940dc2 2305 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PRINT], "IsPrint");
a0ed51b3
LW
2306}
2307
2308bool
7fc63493 2309Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 2310{
97aff369 2311 dVAR;
7918f24d
NC
2312
2313 PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
2314
03940dc2 2315 return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PUNCT], "IsPunct");
b8c5462f
JH
2316}
2317
2318bool
7fc63493 2319Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 2320{
97aff369 2321 dVAR;
7918f24d
NC
2322
2323 PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
2324
4ac6419d 2325 return is_XDIGIT_utf8(p);
b8c5462f
JH
2326}
2327
2328bool
7dbf68d2
KW
2329Perl__is_utf8_mark(pTHX_ const U8 *p)
2330{
2331 dVAR;
2332
2333 PERL_ARGS_ASSERT__IS_UTF8_MARK;
2334
2335 return is_utf8_common(p, &PL_utf8_mark, "IsM");
2336}
2337
2338
2339bool
7fc63493 2340Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 2341{
97aff369 2342 dVAR;
7918f24d
NC
2343
2344 PERL_ARGS_ASSERT_IS_UTF8_MARK;
2345
7dbf68d2 2346 return _is_utf8_mark(p);
a0ed51b3
LW
2347}
2348
6b5c0936 2349/*
87cea99e 2350=for apidoc to_utf8_case
6b5c0936 2351
a1433954
KW
2352The C<p> contains the pointer to the UTF-8 string encoding
2353the character that is being converted. This routine assumes that the character
2354at C<p> is well-formed.
6b5c0936 2355
a1433954
KW
2356The C<ustrp> is a pointer to the character buffer to put the
2357conversion result to. The C<lenp> is a pointer to the length
6b5c0936
JH
2358of the result.
2359
a1433954 2360The C<swashp> is a pointer to the swash to use.
6b5c0936 2361
a1433954
KW
2362Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
2363and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>. The C<special> (usually,
0134edef 2364but not always, a multicharacter mapping), is tried first.
6b5c0936 2365
a1433954 2366The C<special> is a string like "utf8::ToSpecLower", which means the
0134edef
JH
2367hash %utf8::ToSpecLower. The access to the hash is through
2368Perl_to_utf8_case().
6b5c0936 2369
a1433954 2370The C<normal> is a string like "ToLower" which means the swash
0134edef
JH
2371%utf8::ToLower.
2372
2373=cut */
6b5c0936 2374
2104c8d9 2375UV
9a957fbc
AL
2376Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
2377 SV **swashp, const char *normal, const char *special)
a0ed51b3 2378{
97aff369 2379 dVAR;
0134edef 2380 STRLEN len = 0;
f4cd282c 2381 const UV uv1 = valid_utf8_to_uvchr(p, NULL);
7918f24d
NC
2382
2383 PERL_ARGS_ASSERT_TO_UTF8_CASE;
2384
9ae3ac1a
KW
2385 /* Note that swash_fetch() doesn't output warnings for these because it
2386 * assumes we will */
8457b38f 2387 if (uv1 >= UNICODE_SURROGATE_FIRST) {
9ae3ac1a 2388 if (uv1 <= UNICODE_SURROGATE_LAST) {
8457b38f
KW
2389 if (ckWARN_d(WARN_SURROGATE)) {
2390 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2391 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
2392 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
2393 }
9ae3ac1a
KW
2394 }
2395 else if (UNICODE_IS_SUPER(uv1)) {
8457b38f
KW
2396 if (ckWARN_d(WARN_NON_UNICODE)) {
2397 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2398 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2399 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
2400 }
9ae3ac1a
KW
2401 }
2402
2403 /* Note that non-characters are perfectly legal, so no warning should
2404 * be given */
2405 }
2406
0134edef 2407 if (!*swashp) /* load on-demand */
5ab9d2ef 2408 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
0134edef 2409
a6f87d8c 2410 if (special) {
0134edef 2411 /* It might be "special" (sometimes, but not always,
2a37f04d 2412 * a multicharacter mapping) */
6673a63c 2413 HV * const hv = get_hv(special, 0);
b08cf34e
JH
2414 SV **svp;
2415
35da51f7 2416 if (hv &&
f4cd282c 2417 (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) &&
b08cf34e 2418 (*svp)) {
cfd0369c 2419 const char *s;
47654450 2420
cfd0369c 2421 s = SvPV_const(*svp, len);
47654450 2422 if (len == 1)
f4cd282c 2423 /* EIGHTBIT */
c80e42f3 2424 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2a37f04d 2425 else {
d2dcd0fb 2426 Copy(s, ustrp, len, U8);
29e98929 2427 }
983ffd37 2428 }
0134edef
JH
2429 }
2430
2431 if (!len && *swashp) {
f4cd282c 2432 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is utf8 */);
d4c19fe8 2433
0134edef
JH
2434 if (uv2) {
2435 /* It was "normal" (a single character mapping). */
f4cd282c 2436 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2a37f04d
JH
2437 }
2438 }
1feea2c7 2439
cbe07460
KW
2440 if (len) {
2441 if (lenp) {
2442 *lenp = len;
2443 }
2444 return valid_utf8_to_uvchr(ustrp, 0);
2445 }
2446
2447 /* Here, there was no mapping defined, which means that the code point maps
2448 * to itself. Return the inputs */
bfdf22ec 2449 len = UTF8SKIP(p);
ca9fab46
KW
2450 if (p != ustrp) { /* Don't copy onto itself */
2451 Copy(p, ustrp, len, U8);
2452 }
0134edef 2453
2a37f04d
JH
2454 if (lenp)
2455 *lenp = len;
2456
f4cd282c 2457 return uv1;
cbe07460 2458
a0ed51b3
LW
2459}
2460
051a06d4
KW
2461STATIC UV
2462S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
2463{
2464 /* This is called when changing the case of a utf8-encoded character above
2465 * the Latin1 range, and the operation is in locale. If the result
2466 * contains a character that crosses the 255/256 boundary, disallow the
2467 * change, and return the original code point. See L<perlfunc/lc> for why;
2468 *
a1433954
KW
2469 * p points to the original string whose case was changed; assumed
2470 * by this routine to be well-formed
051a06d4
KW
2471 * result the code point of the first character in the changed-case string
2472 * ustrp points to the changed-case string (<result> represents its first char)
2473 * lenp points to the length of <ustrp> */
2474
2475 UV original; /* To store the first code point of <p> */
2476
2477 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2478
a4f12ed7 2479 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
2480
2481 /* We know immediately if the first character in the string crosses the
2482 * boundary, so can skip */
2483 if (result > 255) {
2484
2485 /* Look at every character in the result; if any cross the
2486 * boundary, the whole thing is disallowed */
2487 U8* s = ustrp + UTF8SKIP(ustrp);
2488 U8* e = ustrp + *lenp;
2489 while (s < e) {
a4f12ed7 2490 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
2491 goto bad_crossing;
2492 }
2493 s += UTF8SKIP(s);
2494 }
2495
2496 /* Here, no characters crossed, result is ok as-is */
2497 return result;
2498 }
2499
2500bad_crossing:
2501
2502 /* Failed, have to return the original */
4b88fb76 2503 original = valid_utf8_to_uvchr(p, lenp);
051a06d4
KW
2504 Copy(p, ustrp, *lenp, char);
2505 return original;
2506}
2507
d3e79532 2508/*
87cea99e 2509=for apidoc to_utf8_upper
d3e79532 2510
1f607577 2511Instead use L</toUPPER_utf8>.
a1433954 2512
d3e79532
JH
2513=cut */
2514
051a06d4
KW
2515/* Not currently externally documented, and subject to change:
2516 * <flags> is set iff locale semantics are to be used for code points < 256
2517 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2518 * were used in the calculation; otherwise unchanged. */
2519
2104c8d9 2520UV
051a06d4 2521Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
a0ed51b3 2522{
97aff369 2523 dVAR;
7918f24d 2524
051a06d4
KW
2525 UV result;
2526
2527 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 2528
3a4c58c9 2529 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2530 if (flags) {
2531 result = toUPPER_LC(*p);
2532 }
2533 else {
81c6c7ce 2534 return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
051a06d4 2535 }
3a4c58c9
KW
2536 }
2537 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2538 if (flags) {
94bb8c36 2539 result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
051a06d4
KW
2540 }
2541 else {
94bb8c36 2542 return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2543 ustrp, lenp, 'S');
051a06d4
KW
2544 }
2545 }
2546 else { /* utf8, ord above 255 */
2547 result = CALL_UPPER_CASE(p, ustrp, lenp);
2548
2549 if (flags) {
2550 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2551 }
2552 return result;
2553 }
2554
2555 /* Here, used locale rules. Convert back to utf8 */
2556 if (UTF8_IS_INVARIANT(result)) {
2557 *ustrp = (U8) result;
2558 *lenp = 1;
2559 }
2560 else {
2561 *ustrp = UTF8_EIGHT_BIT_HI(result);
2562 *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2563 *lenp = 2;
3a4c58c9
KW
2564 }
2565
051a06d4
KW
2566 if (tainted_ptr) {
2567 *tainted_ptr = TRUE;
2568 }
2569 return result;
983ffd37 2570}
a0ed51b3 2571
d3e79532 2572/*
87cea99e 2573=for apidoc to_utf8_title
d3e79532 2574
1f607577 2575Instead use L</toTITLE_utf8>.
a1433954 2576
d3e79532
JH
2577=cut */
2578
051a06d4
KW
2579/* Not currently externally documented, and subject to change:
2580 * <flags> is set iff locale semantics are to be used for code points < 256
2581 * Since titlecase is not defined in POSIX, uppercase is used instead
2582 * for these/
2583 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2584 * were used in the calculation; otherwise unchanged. */
2585
983ffd37 2586UV
051a06d4 2587Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
983ffd37 2588{
97aff369 2589 dVAR;
7918f24d 2590
051a06d4
KW
2591 UV result;
2592
2593 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 2594
3a4c58c9 2595 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2596 if (flags) {
2597 result = toUPPER_LC(*p);
2598 }
2599 else {
81c6c7ce 2600 return _to_upper_title_latin1(*p, ustrp, lenp, 's');
051a06d4 2601 }
3a4c58c9
KW
2602 }
2603 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2604 if (flags) {
94bb8c36 2605 result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
051a06d4
KW
2606 }
2607 else {
94bb8c36 2608 return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2609 ustrp, lenp, 's');
051a06d4
KW
2610 }
2611 }
2612 else { /* utf8, ord above 255 */
2613 result = CALL_TITLE_CASE(p, ustrp, lenp);
2614
2615 if (flags) {
2616 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2617 }
2618 return result;
2619 }
2620
2621 /* Here, used locale rules. Convert back to utf8 */
2622 if (UTF8_IS_INVARIANT(result)) {
2623 *ustrp = (U8) result;
2624 *lenp = 1;
2625 }
2626 else {
2627 *ustrp = UTF8_EIGHT_BIT_HI(result);
2628 *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2629 *lenp = 2;
3a4c58c9
KW
2630 }
2631
051a06d4
KW
2632 if (tainted_ptr) {
2633 *tainted_ptr = TRUE;
2634 }
2635 return result;
a0ed51b3
LW
2636}
2637
d3e79532 2638/*
87cea99e 2639=for apidoc to_utf8_lower
d3e79532 2640
1f607577 2641Instead use L</toLOWER_utf8>.
a1433954 2642
d3e79532
JH
2643=cut */
2644
051a06d4
KW
2645/* Not currently externally documented, and subject to change:
2646 * <flags> is set iff locale semantics are to be used for code points < 256
2647 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2648 * were used in the calculation; otherwise unchanged. */
2649
2104c8d9 2650UV
051a06d4 2651Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
a0ed51b3 2652{
051a06d4
KW
2653 UV result;
2654
97aff369 2655 dVAR;
7918f24d 2656
051a06d4 2657 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 2658
968c5e6a 2659 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2660 if (flags) {
2661 result = toLOWER_LC(*p);
2662 }
2663 else {
81c6c7ce 2664 return to_lower_latin1(*p, ustrp, lenp);
051a06d4 2665 }
968c5e6a
KW
2666 }
2667 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2668 if (flags) {
94bb8c36 2669 result = toLOWER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
051a06d4
KW
2670 }
2671 else {
94bb8c36 2672 return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2673 ustrp, lenp);
051a06d4 2674 }
968c5e6a 2675 }
051a06d4
KW
2676 else { /* utf8, ord above 255 */
2677 result = CALL_LOWER_CASE(p, ustrp, lenp);
2678
2679 if (flags) {
2680 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2681 }
968c5e6a 2682
051a06d4
KW
2683 return result;
2684 }
2685
2686 /* Here, used locale rules. Convert back to utf8 */
2687 if (UTF8_IS_INVARIANT(result)) {
2688 *ustrp = (U8) result;
2689 *lenp = 1;
2690 }
2691 else {
2692 *ustrp = UTF8_EIGHT_BIT_HI(result);
2693 *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2694 *lenp = 2;
2695 }
2696
2697 if (tainted_ptr) {
2698 *tainted_ptr = TRUE;
2699 }
2700 return result;
b4e400f9
JH
2701}
2702
d3e79532 2703/*
87cea99e 2704=for apidoc to_utf8_fold
d3e79532 2705
1f607577 2706Instead use L</toFOLD_utf8>.
a1433954 2707
d3e79532
JH
2708=cut */
2709
051a06d4
KW
2710/* Not currently externally documented, and subject to change,
2711 * in <flags>
2712 * bit FOLD_FLAGS_LOCALE is set iff locale semantics are to be used for code
2713 * points < 256. Since foldcase is not defined in
2714 * POSIX, lowercase is used instead
2715 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
2716 * otherwise simple folds
a0270393
KW
2717 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2718 * prohibited
051a06d4
KW
2719 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2720 * were used in the calculation; otherwise unchanged. */
36bb2ab6 2721
b4e400f9 2722UV
051a06d4 2723Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, bool* tainted_ptr)
b4e400f9 2724{
97aff369 2725 dVAR;
7918f24d 2726
051a06d4
KW
2727 UV result;
2728
36bb2ab6 2729 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 2730
a0270393
KW
2731 /* These are mutually exclusive */
2732 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2733
50ba90ff
KW
2734 assert(p != ustrp); /* Otherwise overwrites */
2735
a1dde8de 2736 if (UTF8_IS_INVARIANT(*p)) {
051a06d4 2737 if (flags & FOLD_FLAGS_LOCALE) {
d22b930b 2738 result = toFOLD_LC(*p);
051a06d4
KW
2739 }
2740 else {
81c6c7ce 2741 return _to_fold_latin1(*p, ustrp, lenp,
1ca267a5 2742 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2743 }
a1dde8de
KW
2744 }
2745 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2746 if (flags & FOLD_FLAGS_LOCALE) {
94bb8c36 2747 result = toFOLD_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
051a06d4
KW
2748 }
2749 else {
94bb8c36 2750 return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
51910141 2751 ustrp, lenp,
1ca267a5 2752 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2753 }
a1dde8de 2754 }
051a06d4 2755 else { /* utf8, ord above 255 */
a0270393 2756 result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
a1dde8de 2757
1ca267a5
KW
2758 if (flags & FOLD_FLAGS_LOCALE) {
2759
2760 /* Special case this character, as what normally gets returned
2761 * under locale doesn't work */
2762 if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1
2763 && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8,
2764 sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1))
2765 {
2766 goto return_long_s;
2767 }
a0270393 2768 return check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 2769 }
a0270393
KW
2770 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
2771 return result;
2772 }
2773 else {
2774 /* This is called when changing the case of a utf8-encoded
2775 * character above the Latin1 range, and the result should not
2776 * contain an ASCII character. */
2777
2778 UV original; /* To store the first code point of <p> */
2779
2780 /* Look at every character in the result; if any cross the
2781 * boundary, the whole thing is disallowed */
2782 U8* s = ustrp;
2783 U8* e = ustrp + *lenp;
2784 while (s < e) {
2785 if (isASCII(*s)) {
2786 /* Crossed, have to return the original */
2787 original = valid_utf8_to_uvchr(p, lenp);
1ca267a5
KW
2788
2789 /* But in this one instance, there is an alternative we can
2790 * return that is valid */
2791 if (original == LATIN_CAPITAL_LETTER_SHARP_S) {
2792 goto return_long_s;
2793 }
a0270393
KW
2794 Copy(p, ustrp, *lenp, char);
2795 return original;
2796 }
2797 s += UTF8SKIP(s);
2798 }
051a06d4 2799
a0270393
KW
2800 /* Here, no characters crossed, result is ok as-is */
2801 return result;
2802 }
051a06d4
KW
2803 }
2804
2805 /* Here, used locale rules. Convert back to utf8 */
2806 if (UTF8_IS_INVARIANT(result)) {
2807 *ustrp = (U8) result;
2808 *lenp = 1;
2809 }
2810 else {
2811 *ustrp = UTF8_EIGHT_BIT_HI(result);
2812 *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2813 *lenp = 2;
2814 }
2815
2816 if (tainted_ptr) {
2817 *tainted_ptr = TRUE;
2818 }
2819 return result;
1ca267a5
KW
2820
2821 return_long_s:
2822 /* Certain folds to 'ss' are prohibited by the options, but they do allow
2823 * folds to a string of two of these characters. By returning this
2824 * instead, then, e.g.,
2825 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
2826 * works. */
2827
2828 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2829 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2830 ustrp, *lenp, U8);
2831 return LATIN_SMALL_LETTER_LONG_S;
a0ed51b3
LW
2832}
2833
711a919c 2834/* Note:
f90a9a02 2835 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
711a919c
ST
2836 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
2837 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
2838 */
c4a5db0c 2839
a0ed51b3 2840SV*
7fc63493 2841Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 2842{
c4a5db0c
KW
2843 PERL_ARGS_ASSERT_SWASH_INIT;
2844
2845 /* Returns a copy of a swash initiated by the called function. This is the
2846 * public interface, and returning a copy prevents others from doing
2847 * mischief on the original */
2848
5d3d13d1 2849 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
c4a5db0c
KW
2850}
2851
2852SV*
5d3d13d1 2853Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
c4a5db0c
KW
2854{
2855 /* Initialize and return a swash, creating it if necessary. It does this
87367d5f
KW
2856 * by calling utf8_heavy.pl in the general case. The returned value may be
2857 * the swash's inversion list instead if the input parameters allow it.
2858 * Which is returned should be immaterial to callers, as the only
923b6d4e
KW
2859 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
2860 * and swash_to_invlist() handle both these transparently.
c4a5db0c
KW
2861 *
2862 * This interface should only be used by functions that won't destroy or
2863 * adversely change the swash, as doing so affects all other uses of the
2864 * swash in the program; the general public should use 'Perl_swash_init'
2865 * instead.
2866 *
2867 * pkg is the name of the package that <name> should be in.
2868 * name is the name of the swash to find. Typically it is a Unicode
2869 * property name, including user-defined ones
2870 * listsv is a string to initialize the swash with. It must be of the form
2871 * documented as the subroutine return value in
2872 * L<perlunicode/User-Defined Character Properties>
2873 * minbits is the number of bits required to represent each data element.
2874 * It is '1' for binary properties.
2875 * none I (khw) do not understand this one, but it is used only in tr///.
9a53f6cf 2876 * invlist is an inversion list to initialize the swash with (or NULL)
83199d38
KW
2877 * flags_p if non-NULL is the address of various input and output flag bits
2878 * to the routine, as follows: ('I' means is input to the routine;
2879 * 'O' means output from the routine. Only flags marked O are
2880 * meaningful on return.)
2881 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
2882 * came from a user-defined property. (I O)
5d3d13d1
KW
2883 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
2884 * when the swash cannot be located, to simply return NULL. (I)
87367d5f
KW
2885 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
2886 * return of an inversion list instead of a swash hash if this routine
2887 * thinks that would result in faster execution of swash_fetch() later
2888 * on. (I)
9a53f6cf
KW
2889 *
2890 * Thus there are three possible inputs to find the swash: <name>,
2891 * <listsv>, and <invlist>. At least one must be specified. The result
2892 * will be the union of the specified ones, although <listsv>'s various
2893 * actions can intersect, etc. what <name> gives.
2894 *
2895 * <invlist> is only valid for binary properties */
c4a5db0c 2896
27da23d5 2897 dVAR;
c4a5db0c 2898 SV* retval = &PL_sv_undef;
83199d38 2899 HV* swash_hv = NULL;
87367d5f
KW
2900 const int invlist_swash_boundary =
2901 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
2902 ? 512 /* Based on some benchmarking, but not extensive, see commit
2903 message */
2904 : -1; /* Never return just an inversion list */
9a53f6cf
KW
2905
2906 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
2907 assert(! invlist || minbits == 1);
2908
2909 /* If data was passed in to go out to utf8_heavy to find the swash of, do
2910 * so */
2911 if (listsv != &PL_sv_undef || strNE(name, "")) {
69794297
KW
2912 dSP;
2913 const size_t pkg_len = strlen(pkg);
2914 const size_t name_len = strlen(name);
2915 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
2916 SV* errsv_save;
2917 GV *method;
2918
2919 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
2920
2921 PUSHSTACKi(PERLSI_MAGIC);
ce3b816e 2922 ENTER;
69794297
KW
2923 SAVEHINTS();
2924 save_re_context();
650f067c
JL
2925 /* We might get here via a subroutine signature which uses a utf8
2926 * parameter name, at which point PL_subname will have been set
2927 * but not yet used. */
2928 save_item(PL_subname);
69794297
KW
2929 if (PL_parser && PL_parser->error_count)
2930 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
2931 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
2932 if (!method) { /* demand load utf8 */
2933 ENTER;
db2c6cb3
FC
2934 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
2935 GvSV(PL_errgv) = NULL;
69794297
KW
2936 /* It is assumed that callers of this routine are not passing in
2937 * any user derived data. */
2938 /* Need to do this after save_re_context() as it will set
2939 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
2940 * in Perl_magic_get). Even line to create errsv_save can turn on
2941 * PL_tainted. */
284167a5
SM
2942#ifndef NO_TAINT_SUPPORT
2943 SAVEBOOL(TAINT_get);
2944 TAINT_NOT;
2945#endif
69794297
KW
2946 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
2947 NULL);
eed484f9 2948 {
db2c6cb3
FC
2949 /* Not ERRSV, as there is no need to vivify a scalar we are
2950 about to discard. */
2951 SV * const errsv = GvSV(PL_errgv);
2952 if (!SvTRUE(errsv)) {
2953 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
2954 SvREFCNT_dec(errsv);
2955 }
eed484f9 2956 }
69794297
KW
2957 LEAVE;
2958 }
2959 SPAGAIN;
2960 PUSHMARK(SP);
2961 EXTEND(SP,5);
2962 mPUSHp(pkg, pkg_len);
2963 mPUSHp(name, name_len);
2964 PUSHs(listsv);
2965 mPUSHi(minbits);
2966 mPUSHi(none);
2967 PUTBACK;
db2c6cb3
FC
2968 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
2969 GvSV(PL_errgv) = NULL;
69794297
KW
2970 /* If we already have a pointer to the method, no need to use
2971 * call_method() to repeat the lookup. */
c41800a8
KW
2972 if (method
2973 ? call_sv(MUTABLE_SV(method), G_SCALAR)
69794297
KW
2974 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
2975 {
2976 retval = *PL_stack_sp--;
2977 SvREFCNT_inc(retval);
2978 }
eed484f9 2979 {
db2c6cb3
FC
2980 /* Not ERRSV. See above. */
2981 SV * const errsv = GvSV(PL_errgv);
2982 if (!SvTRUE(errsv)) {
2983 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
2984 SvREFCNT_dec(errsv);
2985 }
eed484f9 2986 }
ce3b816e 2987 LEAVE;
69794297
KW
2988 POPSTACK;
2989 if (IN_PERL_COMPILETIME) {
2990 CopHINTS_set(PL_curcop, PL_hints);
2991 }
2992 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
2993 if (SvPOK(retval))
2994
2995 /* If caller wants to handle missing properties, let them */
5d3d13d1 2996 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
69794297
KW
2997 return NULL;
2998 }
2999 Perl_croak(aTHX_
3000 "Can't find Unicode property definition \"%"SVf"\"",
3001 SVfARG(retval));
3002 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
3003 }
9a53f6cf 3004 } /* End of calling the module to find the swash */
36eb48b4 3005
83199d38
KW
3006 /* If this operation fetched a swash, and we will need it later, get it */
3007 if (retval != &PL_sv_undef
3008 && (minbits == 1 || (flags_p
3009 && ! (*flags_p
3010 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
3011 {
3012 swash_hv = MUTABLE_HV(SvRV(retval));
3013
3014 /* If we don't already know that there is a user-defined component to
3015 * this swash, and the user has indicated they wish to know if there is
3016 * one (by passing <flags_p>), find out */
3017 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
3018 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
3019 if (user_defined && SvUV(*user_defined)) {
3020 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
3021 }
3022 }
3023 }
3024
36eb48b4
KW
3025 /* Make sure there is an inversion list for binary properties */
3026 if (minbits == 1) {
3027 SV** swash_invlistsvp = NULL;
3028 SV* swash_invlist = NULL;
9a53f6cf 3029 bool invlist_in_swash_is_valid = FALSE;
02c85471
FC
3030 bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
3031 an unclaimed reference count */
36eb48b4 3032
9a53f6cf 3033 /* If this operation fetched a swash, get its already existing
83199d38 3034 * inversion list, or create one for it */
36eb48b4 3035
83199d38 3036 if (swash_hv) {
5c9f4bd2 3037 swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
9a53f6cf
KW
3038 if (swash_invlistsvp) {
3039 swash_invlist = *swash_invlistsvp;
3040 invlist_in_swash_is_valid = TRUE;
3041 }
3042 else {
36eb48b4 3043 swash_invlist = _swash_to_invlist(retval);
02c85471 3044 swash_invlist_unclaimed = TRUE;
9a53f6cf
KW
3045 }
3046 }
3047
3048 /* If an inversion list was passed in, have to include it */
3049 if (invlist) {
3050
3051 /* Any fetched swash will by now have an inversion list in it;
3052 * otherwise <swash_invlist> will be NULL, indicating that we
3053 * didn't fetch a swash */
3054 if (swash_invlist) {
3055
3056 /* Add the passed-in inversion list, which invalidates the one
3057 * already stored in the swash */
3058 invlist_in_swash_is_valid = FALSE;
3059 _invlist_union(invlist, swash_invlist, &swash_invlist);
3060 }
3061 else {
3062
87367d5f
KW
3063 /* Here, there is no swash already. Set up a minimal one, if
3064 * we are going to return a swash */
3065 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
971d486f 3066 swash_hv = newHV();
4aca0fe6 3067 retval = newRV_noinc(MUTABLE_SV(swash_hv));
87367d5f 3068 }
9a53f6cf
KW
3069 swash_invlist = invlist;
3070 }
9a53f6cf
KW
3071 }
3072
3073 /* Here, we have computed the union of all the passed-in data. It may
3074 * be that there was an inversion list in the swash which didn't get
3075 * touched; otherwise save the one computed one */
87367d5f
KW
3076 if (! invlist_in_swash_is_valid
3077 && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
3078 {
5c9f4bd2 3079 if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
69794297
KW
3080 {
3081 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3082 }
cc34d8c5
FC
3083 /* We just stole a reference count. */
3084 if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
3085 else SvREFCNT_inc_simple_void_NN(swash_invlist);
9a53f6cf 3086 }
87367d5f 3087
c41800a8 3088 /* Use the inversion list stand-alone if small enough */
87367d5f
KW
3089 if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
3090 SvREFCNT_dec(retval);
02c85471
FC
3091 if (!swash_invlist_unclaimed)
3092 SvREFCNT_inc_simple_void_NN(swash_invlist);
3093 retval = newRV_noinc(swash_invlist);
87367d5f 3094 }
36eb48b4
KW
3095 }
3096
a0ed51b3
LW
3097 return retval;
3098}
3099
035d37be
JH
3100
3101/* This API is wrong for special case conversions since we may need to
3102 * return several Unicode characters for a single Unicode character
3103 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
3104 * the lower-level routine, and it is similarly broken for returning
38684baa
KW
3105 * multiple values. --jhi
3106 * For those, you should use to_utf8_case() instead */
b0e3252e 3107/* Now SWASHGET is recasted into S_swatch_get in this file. */
680c470c
ST
3108
3109/* Note:
3110 * Returns the value of property/mapping C<swash> for the first character
3111 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
3112 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
3113 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
af2af982
KW
3114 *
3115 * A "swash" is a hash which contains initially the keys/values set up by
3116 * SWASHNEW. The purpose is to be able to completely represent a Unicode
3117 * property for all possible code points. Things are stored in a compact form
3118 * (see utf8_heavy.pl) so that calculation is required to find the actual
3119 * property value for a given code point. As code points are looked up, new
3120 * key/value pairs are added to the hash, so that the calculation doesn't have
3121 * to ever be re-done. Further, each calculation is done, not just for the
3122 * desired one, but for a whole block of code points adjacent to that one.
3123 * For binary properties on ASCII machines, the block is usually for 64 code
3124 * points, starting with a code point evenly divisible by 64. Thus if the
3125 * property value for code point 257 is requested, the code goes out and
3126 * calculates the property values for all 64 code points between 256 and 319,
3127 * and stores these as a single 64-bit long bit vector, called a "swatch",
3128 * under the key for code point 256. The key is the UTF-8 encoding for code
3129 * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
3130 * for a code point is 13 bytes, the key will be 12 bytes long. If the value
3131 * for code point 258 is then requested, this code realizes that it would be
3132 * stored under the key for 256, and would find that value and extract the
3133 * relevant bit, offset from 256.
3134 *
3135 * Non-binary properties are stored in as many bits as necessary to represent
3136 * their values (32 currently, though the code is more general than that), not
3137 * as single bits, but the principal is the same: the value for each key is a
3138 * vector that encompasses the property values for all code points whose UTF-8
3139 * representations are represented by the key. That is, for all code points
3140 * whose UTF-8 representations are length N bytes, and the key is the first N-1
3141 * bytes of that.
680c470c 3142 */
a0ed51b3 3143UV
680c470c 3144Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 3145{
27da23d5 3146 dVAR;
ef8f7699 3147 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
3148 U32 klen;
3149 U32 off;
a0ed51b3 3150 STRLEN slen;
7d85a32c 3151 STRLEN needents;
cfd0369c 3152 const U8 *tmps = NULL;
a0ed51b3 3153 U32 bit;
979f2922 3154 SV *swatch;
3568d838 3155 U8 tmputf8[2];
f4cd282c 3156 const UV c = *ptr;
3568d838 3157
7918f24d
NC
3158 PERL_ARGS_ASSERT_SWASH_FETCH;
3159
87367d5f
KW
3160 /* If it really isn't a hash, it isn't really swash; must be an inversion
3161 * list */
3162 if (SvTYPE(hv) != SVt_PVHV) {
3163 return _invlist_contains_cp((SV*)hv,
3164 (do_utf8)
3165 ? valid_utf8_to_uvchr(ptr, NULL)
3166 : c);
3167 }
3168
dbe7a391 3169 /* Convert to utf8 if not already */
f4cd282c 3170 if (!do_utf8 && !NATIVE_IS_INVARIANT(c)) {
979f2922
ST
3171 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
3172 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3173 ptr = tmputf8;
3568d838
JH
3174 }
3175 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
37e2e78e 3176 * then the "swatch" is a vec() for all the chars which start
3568d838
JH
3177 * with 0xAA..0xYY
3178 * So the key in the hash (klen) is length of encoded char -1
3179 */
3180 klen = UTF8SKIP(ptr) - 1;
a0ed51b3 3181
979f2922 3182 if (klen == 0) {
37e2e78e 3183 /* If char is invariant then swatch is for all the invariant chars
1e54db1a 3184 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 3185 */
979f2922 3186 needents = UTF_CONTINUATION_MARK;
bc3632a8 3187 off = NATIVE_UTF8_TO_I8(ptr[klen]);
979f2922
ST
3188 }
3189 else {
7d85a32c 3190 /* If char is encoded then swatch is for the prefix */
979f2922 3191 needents = (1 << UTF_ACCUMULATION_SHIFT);
bc3632a8 3192 off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
979f2922 3193 }
7d85a32c 3194
a0ed51b3
LW
3195 /*
3196 * This single-entry cache saves about 1/3 of the utf8 overhead in test
3197 * suite. (That is, only 7-8% overall over just a hash cache. Still,
3198 * it's nothing to sniff at.) Pity we usually come through at least
3199 * two function calls to get here...
3200 *
3201 * NB: this code assumes that swatches are never modified, once generated!
3202 */
3203
3568d838 3204 if (hv == PL_last_swash_hv &&
a0ed51b3 3205 klen == PL_last_swash_klen &&
27da23d5 3206 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
3207 {
3208 tmps = PL_last_swash_tmps;
3209 slen = PL_last_swash_slen;
3210 }
3211 else {
3212 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 3213 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 3214
b0e3252e 3215 /* If not cached, generate it via swatch_get */
979f2922
ST
3216 if (!svp || !SvPOK(*svp)
3217 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
c80e42f3 3218 const UV code_point = utf8n_to_uvchr(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
3219 ckWARN(WARN_UTF8) ?
3220 0 : UTF8_ALLOW_ANY);
b0e3252e 3221 swatch = swatch_get(swash,
979f2922 3222 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
361ee0fe 3223 (klen) ? (code_point & ~((UV)needents - 1)) : 0,
979f2922
ST
3224 needents);
3225
923e4eb5 3226 if (IN_PERL_COMPILETIME)
623e6609 3227 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 3228
979f2922 3229 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 3230
979f2922
ST
3231 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
3232 || (slen << 3) < needents)
5637ef5b
NC
3233 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
3234 "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
3235 svp, tmps, (UV)slen, (UV)needents);
a0ed51b3
LW
3236 }
3237
3238 PL_last_swash_hv = hv;
16d8f38a 3239 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 3240 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
3241 /* FIXME change interpvar.h? */
3242 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
3243 PL_last_swash_slen = slen;
3244 if (klen)
3245 Copy(ptr, PL_last_swash_key, klen, U8);
3246 }
3247
9faf8d75 3248 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
3249 case 1:
3250 bit = 1 << (off & 7);
3251 off >>= 3;
3252 return (tmps[off] & bit) != 0;
3253 case 8:
3254 return tmps[off];
3255 case 16:
3256 off <<= 1;
3257 return (tmps[off] << 8) + tmps[off + 1] ;
3258 case 32:
3259 off <<= 2;
3260 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
3261 }
5637ef5b
NC
3262 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
3263 "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
670f1322 3264 NORETURN_FUNCTION_END;
a0ed51b3 3265}
2b9d42f0 3266
319009ee
KW
3267/* Read a single line of the main body of the swash input text. These are of
3268 * the form:
3269 * 0053 0056 0073
3270 * where each number is hex. The first two numbers form the minimum and
3271 * maximum of a range, and the third is the value associated with the range.
3272 * Not all swashes should have a third number
3273 *
3274 * On input: l points to the beginning of the line to be examined; it points
3275 * to somewhere in the string of the whole input text, and is
3276 * terminated by a \n or the null string terminator.
3277 * lend points to the null terminator of that string
3278 * wants_value is non-zero if the swash expects a third number
3279 * typestr is the name of the swash's mapping, like 'ToLower'
3280 * On output: *min, *max, and *val are set to the values read from the line.
3281 * returns a pointer just beyond the line examined. If there was no
3282 * valid min number on the line, returns lend+1
3283 */
3284
3285STATIC U8*
3286S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3287 const bool wants_value, const U8* const typestr)
3288{
3289 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
3290 STRLEN numlen; /* Length of the number */
02470786
KW
3291 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3292 | PERL_SCAN_DISALLOW_PREFIX
3293 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3294
3295 /* nl points to the next \n in the scan */
3296 U8* const nl = (U8*)memchr(l, '\n', lend - l);
3297
3298 /* Get the first number on the line: the range minimum */
3299 numlen = lend - l;
3300 *min = grok_hex((char *)l, &numlen, &flags, NULL);
3301 if (numlen) /* If found a hex number, position past it */
3302 l += numlen;
3303 else if (nl) { /* Else, go handle next line, if any */
3304 return nl + 1; /* 1 is length of "\n" */
3305 }
3306 else { /* Else, no next line */
3307 return lend + 1; /* to LIST's end at which \n is not found */
3308 }
3309
3310 /* The max range value follows, separated by a BLANK */
3311 if (isBLANK(*l)) {
3312 ++l;
02470786
KW
3313 flags = PERL_SCAN_SILENT_ILLDIGIT
3314 | PERL_SCAN_DISALLOW_PREFIX
3315 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3316 numlen = lend - l;
3317 *max = grok_hex((char *)l, &numlen, &flags, NULL);
3318 if (numlen)
3319 l += numlen;
3320 else /* If no value here, it is a single element range */
3321 *max = *min;
3322
3323 /* Non-binary tables have a third entry: what the first element of the
3324 * range maps to */
3325 if (wants_value) {
3326 if (isBLANK(*l)) {
3327 ++l;
f90a9a02
KW
3328
3329 /* The ToLc, etc table mappings are not in hex, and must be
3330 * corrected by adding the code point to them */
3331 if (typeto) {
3332 char *after_strtol = (char *) lend;
3333 *val = Strtol((char *)l, &after_strtol, 10);
3334 l = (U8 *) after_strtol;
f90a9a02
KW
3335 }
3336 else { /* Other tables are in hex, and are the correct result
3337 without tweaking */
a9d188b3
KW
3338 flags = PERL_SCAN_SILENT_ILLDIGIT
3339 | PERL_SCAN_DISALLOW_PREFIX
3340 | PERL_SCAN_SILENT_NON_PORTABLE;
3341 numlen = lend - l;
3342 *val = grok_hex((char *)l, &numlen, &flags, NULL);
3343 if (numlen)
3344 l += numlen;
3345 else
3346 *val = 0;
f90a9a02 3347 }
319009ee
KW
3348 }
3349 else {
3350 *val = 0;
3351 if (typeto) {
dcbac5bb 3352 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
3353 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
3354 typestr, l);
3355 }
3356 }
3357 }
3358 else
3359 *val = 0; /* bits == 1, then any val should be ignored */
3360 }
3361 else { /* Nothing following range min, should be single element with no
3362 mapping expected */
3363 *max = *min;
3364 if (wants_value) {
3365 *val = 0;
3366 if (typeto) {
dcbac5bb 3367 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
3368 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
3369 }
3370 }
3371 else
3372 *val = 0; /* bits == 1, then val should be ignored */
3373 }
3374
3375 /* Position to next line if any, or EOF */
3376 if (nl)
3377 l = nl + 1;
3378 else
3379 l = lend;
3380
3381 return l;
3382}
3383
979f2922
ST
3384/* Note:
3385 * Returns a swatch (a bit vector string) for a code point sequence
3386 * that starts from the value C<start> and comprises the number C<span>.
3387 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3388 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
3389 */
3390STATIC SV*
b0e3252e 3391S_swatch_get(pTHX_ SV* swash, UV start, UV span)
979f2922
ST
3392{
3393 SV *swatch;
77f9f126 3394 U8 *l, *lend, *x, *xend, *s, *send;
979f2922 3395 STRLEN lcur, xcur, scur;
ef8f7699 3396 HV *const hv = MUTABLE_HV(SvRV(swash));
5c9f4bd2 3397 SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
36eb48b4 3398
88d45d28
KW
3399 SV** listsvp = NULL; /* The string containing the main body of the table */
3400 SV** extssvp = NULL;
3401 SV** invert_it_svp = NULL;
3402 U8* typestr = NULL;
786861f5
KW
3403 STRLEN bits;
3404 STRLEN octets; /* if bits == 1, then octets == 0 */
3405 UV none;
3406 UV end = start + span;
972dd592 3407
36eb48b4 3408 if (invlistsvp == NULL) {
786861f5
KW
3409 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3410 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3411 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3412 extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3413 listsvp = hv_fetchs(hv, "LIST", FALSE);
3414 invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3415
3416 bits = SvUV(*bitssvp);
3417 none = SvUV(*nonesvp);
3418 typestr = (U8*)SvPV_nolen(*typesvp);
3419 }
36eb48b4
KW
3420 else {
3421 bits = 1;
3422 none = 0;
3423 }
786861f5 3424 octets = bits >> 3; /* if bits == 1, then octets == 0 */
979f2922 3425
b0e3252e 3426 PERL_ARGS_ASSERT_SWATCH_GET;
7918f24d 3427
979f2922 3428 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
b0e3252e 3429 Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
660a4616 3430 (UV)bits);
979f2922
ST
3431 }
3432
84ea5ef6
KW
3433 /* If overflowed, use the max possible */
3434 if (end < start) {
3435 end = UV_MAX;
3436 span = end - start;
3437 }
3438
979f2922 3439 /* create and initialize $swatch */
979f2922 3440 scur = octets ? (span * octets) : (span + 7) / 8;
e524fe40
NC
3441 swatch = newSV(scur);
3442 SvPOK_on(swatch);
979f2922
ST
3443 s = (U8*)SvPVX(swatch);
3444 if (octets && none) {
0bd48802 3445 const U8* const e = s + scur;
979f2922
ST
3446 while (s < e) {
3447 if (bits == 8)
3448 *s++ = (U8)(none & 0xff);
3449 else if (bits == 16) {
3450 *s++ = (U8)((none >> 8) & 0xff);
3451 *s++ = (U8)( none & 0xff);
3452 }
3453 else if (bits == 32) {
3454 *s++ = (U8)((none >> 24) & 0xff);
3455 *s++ = (U8)((none >> 16) & 0xff);
3456 *s++ = (U8)((none >> 8) & 0xff);
3457 *s++ = (U8)( none & 0xff);
3458 }
3459 }
3460 *s = '\0';
3461 }
3462 else {
3463 (void)memzero((U8*)s, scur + 1);
3464 }
3465 SvCUR_set(swatch, scur);
3466 s = (U8*)SvPVX(swatch);
3467
36eb48b4
KW
3468 if (invlistsvp) { /* If has an inversion list set up use that */
3469 _invlist_populate_swatch(*invlistsvp, start, end, s);
3470 return swatch;
3471 }
3472
3473 /* read $swash->{LIST} */
979f2922
ST
3474 l = (U8*)SvPV(*listsvp, lcur);
3475 lend = l + lcur;
3476 while (l < lend) {
8ed25d53 3477 UV min, max, val, upper;
319009ee
KW
3478 l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3479 cBOOL(octets), typestr);
3480 if (l > lend) {
979f2922
ST
3481 break;
3482 }
3483
972dd592 3484 /* If looking for something beyond this range, go try the next one */
979f2922
ST
3485 if (max < start)
3486 continue;
3487
8ed25d53
KW
3488 /* <end> is generally 1 beyond where we want to set things, but at the
3489 * platform's infinity, where we can't go any higher, we want to
3490 * include the code point at <end> */
3491 upper = (max < end)
3492 ? max
3493 : (max != UV_MAX || end != UV_MAX)
3494 ? end - 1
3495 : end;
3496
979f2922 3497 if (octets) {
35da51f7 3498 UV key;
979f2922
ST
3499 if (min < start) {
3500 if (!none || val < none) {
3501 val += start - min;
3502 }
3503 min = start;
3504 }
8ed25d53 3505 for (key = min; key <= upper; key++) {
979f2922 3506 STRLEN offset;
979f2922
ST
3507 /* offset must be non-negative (start <= min <= key < end) */
3508 offset = octets * (key - start);
3509 if (bits == 8)
3510 s[offset] = (U8)(val & 0xff);
3511 else if (bits == 16) {
3512 s[offset ] = (U8)((val >> 8) & 0xff);
3513 s[offset + 1] = (U8)( val & 0xff);
3514 }
3515 else if (bits == 32) {
3516 s[offset ] = (U8)((val >> 24) & 0xff);
3517 s[offset + 1] = (U8)((val >> 16) & 0xff);
3518 s[offset + 2] = (U8)((val >> 8) & 0xff);
3519 s[offset + 3] = (U8)( val & 0xff);
3520 }
3521
3522 if (!none || val < none)
3523 ++val;
3524 }
3525 }
711a919c 3526 else { /* bits == 1, then val should be ignored */
35da51f7 3527 UV key;
979f2922
ST
3528 if (min < start)
3529 min = start;
6cb05c12 3530
8ed25d53 3531 for (key = min; key <= upper; key++) {
0bd48802 3532 const STRLEN offset = (STRLEN)(key - start);
979f2922
ST
3533 s[offset >> 3] |= 1 << (offset & 7);
3534 }
3535 }
3536 } /* while */
979f2922 3537
9479a769 3538 /* Invert if the data says it should be. Assumes that bits == 1 */
77f9f126 3539 if (invert_it_svp && SvUV(*invert_it_svp)) {
0bda3001
KW
3540
3541 /* Unicode properties should come with all bits above PERL_UNICODE_MAX
3542 * be 0, and their inversion should also be 0, as we don't succeed any
3543 * Unicode property matches for non-Unicode code points */
3544 if (start <= PERL_UNICODE_MAX) {
3545
3546 /* The code below assumes that we never cross the
3547 * Unicode/above-Unicode boundary in a range, as otherwise we would
3548 * have to figure out where to stop flipping the bits. Since this
3549 * boundary is divisible by a large power of 2, and swatches comes
3550 * in small powers of 2, this should be a valid assumption */
3551 assert(start + span - 1 <= PERL_UNICODE_MAX);
3552
507a8485
KW
3553 send = s + scur;
3554 while (s < send) {
3555 *s = ~(*s);
3556 s++;
3557 }
0bda3001 3558 }
77f9f126
KW
3559 }
3560
d73c39c5
KW
3561 /* read $swash->{EXTRAS}
3562 * This code also copied to swash_to_invlist() below */
979f2922
ST
3563 x = (U8*)SvPV(*extssvp, xcur);
3564 xend = x + xcur;
3565 while (x < xend) {
3566 STRLEN namelen;
3567 U8 *namestr;
3568 SV** othersvp;
3569 HV* otherhv;
3570 STRLEN otherbits;
3571 SV **otherbitssvp, *other;
711a919c 3572 U8 *s, *o, *nl;
979f2922
ST
3573 STRLEN slen, olen;
3574
35da51f7 3575 const U8 opc = *x++;
979f2922
ST
3576 if (opc == '\n')
3577 continue;
3578
3579 nl = (U8*)memchr(x, '\n', xend - x);
3580
3581 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3582 if (nl) {
3583 x = nl + 1; /* 1 is length of "\n" */
3584 continue;
3585 }
3586 else {
3587 x = xend; /* to EXTRAS' end at which \n is not found */
3588 break;
3589 }
3590 }
3591
3592 namestr = x;
3593 if (nl) {
3594 namelen = nl - namestr;
3595 x = nl + 1;
3596 }
3597 else {
3598 namelen = xend - namestr;
3599 x = xend;
3600 }
3601
3602 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
ef8f7699 3603 otherhv = MUTABLE_HV(SvRV(*othersvp));
017a3ce5 3604 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922
ST
3605 otherbits = (STRLEN)SvUV(*otherbitssvp);
3606 if (bits < otherbits)
5637ef5b
NC
3607 Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
3608 "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
979f2922
ST
3609
3610 /* The "other" swatch must be destroyed after. */
b0e3252e 3611 other = swatch_get(*othersvp, start, span);
979f2922
ST
3612 o = (U8*)SvPV(other, olen);
3613
3614 if (!olen)
b0e3252e 3615 Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
979f2922
ST
3616
3617 s = (U8*)SvPV(swatch, slen);
3618 if (bits == 1 && otherbits == 1) {
3619 if (slen != olen)
5637ef5b
NC
3620 Perl_croak(aTHX_ "panic: swatch_get found swatch length "
3621 "mismatch, slen=%"UVuf", olen=%"UVuf,
3622 (UV)slen, (UV)olen);
979f2922
ST
3623
3624 switch (opc) {
3625 case '+':
3626 while (slen--)
3627 *s++ |= *o++;
3628 break;
3629 case '!':
3630 while (slen--)
3631 *s++ |= ~*o++;
3632 break;
3633 case '-':
3634 while (slen--)
3635 *s++ &= ~*o++;
3636 break;
3637 case '&':
3638 while (slen--)
3639 *s++ &= *o++;
3640 break;
3641 default:
3642 break;
3643 }
3644 }
711a919c 3645 else {
979f2922
ST
3646 STRLEN otheroctets = otherbits >> 3;
3647 STRLEN offset = 0;
35da51f7 3648 U8* const send = s + slen;
979f2922
ST
3649
3650 while (s < send) {
3651 UV otherval = 0;
3652
3653 if (otherbits == 1) {
3654 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
3655 ++offset;
3656 }
3657 else {
3658 STRLEN vlen = otheroctets;