This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlapi (from sv.h) clarifications
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3
LW
33#include "perl.h"
34
a0c21aa1 35#ifndef EBCDIC
970ea3cb 36/* Separate prototypes needed because in ASCII systems these are
a0c21aa1
JH
37 * usually macros but they still are compiled as code, too. */
38PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
39PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
40#endif
41
27da23d5
JH
42static const char unees[] =
43 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 44
48ef279e 45/*
ccfc67b7 46=head1 Unicode Support
a0ed51b3 47
166f8a29
DM
48This file contains various utility functions for manipulating UTF8-encoded
49strings. For the uninitiated, this is a method of representing arbitrary
61296642 50Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
51characters in the ASCII range are unmodified, and a zero byte never appears
52within non-zero characters.
166f8a29 53
eaf7a4d2
CS
54=cut
55*/
56
57/*
58=for apidoc is_ascii_string
59
970ea3cb
KW
60Returns true if the first C<len> bytes of the given string are the same whether
61or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
62is, if they are invariant. On ASCII-ish machines, only ASCII characters
63fit this definition, hence the function's name.
eaf7a4d2 64
9f7e3d64
MH
65If C<len> is 0, it will be calculated using C<strlen(s)>.
66
eaf7a4d2
CS
67See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
68
69=cut
70*/
71
72bool
668b6d8d 73Perl_is_ascii_string(const U8 *s, STRLEN len)
eaf7a4d2
CS
74{
75 const U8* const send = s + (len ? len : strlen((const char *)s));
76 const U8* x = s;
77
78 PERL_ARGS_ASSERT_IS_ASCII_STRING;
eaf7a4d2
CS
79
80 for (; x < send; ++x) {
81 if (!UTF8_IS_INVARIANT(*x))
82 break;
83 }
84
85 return x == send;
86}
87
88/*
87cea99e 89=for apidoc uvuni_to_utf8_flags
eebe1485 90
6ee84de2
KW
91Adds the UTF-8 representation of the code point C<uv> to the end
92of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
eebe1485 93bytes available. The return value is the pointer to the byte after the
9041c2e3 94end of the new character. In other words,
eebe1485 95
b851fbc1
JH
96 d = uvuni_to_utf8_flags(d, uv, flags);
97
98or, in most cases,
99
9041c2e3 100 d = uvuni_to_utf8(d, uv);
eebe1485 101
b851fbc1
JH
102(which is equivalent to)
103
104 d = uvuni_to_utf8_flags(d, uv, 0);
105
949cf498 106This is the recommended Unicode-aware way of saying
eebe1485
SC
107
108 *(d++) = uv;
109
949cf498
KW
110This function will convert to UTF-8 (and not warn) even code points that aren't
111legal Unicode or are problematic, unless C<flags> contains one or more of the
112following flags.
113If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
114the function will raise a warning, provided UTF8 warnings are enabled. If instead
115UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
116If both flags are set, the function will both warn and return NULL.
117
118The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
119affect how the function handles a Unicode non-character. And, likewise for the
120UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, and code points that are
121above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
122even less portable) can be warned and/or disallowed even if other above-Unicode
123code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
124flags.
125
126And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
127above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
128DISALLOW flags.
129
130
eebe1485
SC
131=cut
132*/
133
dfe13c55 134U8 *
b851fbc1 135Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 136{
7918f24d
NC
137 PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
138
949cf498
KW
139 if (ckWARN_d(WARN_UTF8)) {
140 if (UNICODE_IS_SURROGATE(uv)) {
141 if (flags & UNICODE_WARN_SURROGATE) {
8457b38f 142 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
949cf498
KW
143 "UTF-16 surrogate U+%04"UVXf, uv);
144 }
145 if (flags & UNICODE_DISALLOW_SURROGATE) {
146 return NULL;
147 }
148 }
149 else if (UNICODE_IS_SUPER(uv)) {
150 if (flags & UNICODE_WARN_SUPER
151 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
152 {
8457b38f 153 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
949cf498
KW
154 "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
155 }
156 if (flags & UNICODE_DISALLOW_SUPER
157 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
158 {
159 return NULL;
160 }
161 }
162 else if (UNICODE_IS_NONCHAR(uv)) {
163 if (flags & UNICODE_WARN_NONCHAR) {
8457b38f 164 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
949cf498
KW
165 "Unicode non-character U+%04"UVXf" is illegal for open interchange",
166 uv);
167 }
168 if (flags & UNICODE_DISALLOW_NONCHAR) {
169 return NULL;
170 }
171 }
507b9800 172 }
c4d5f83a 173 if (UNI_IS_INVARIANT(uv)) {
eb160463 174 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
175 return d;
176 }
2d331972 177#if defined(EBCDIC)
1d72bdf6
NIS
178 else {
179 STRLEN len = UNISKIP(uv);
180 U8 *p = d+len-1;
181 while (p > d) {
eb160463 182 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
183 uv >>= UTF_ACCUMULATION_SHIFT;
184 }
eb160463 185 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
186 return d+len;
187 }
188#else /* Non loop style */
a0ed51b3 189 if (uv < 0x800) {
eb160463
GS
190 *d++ = (U8)(( uv >> 6) | 0xc0);
191 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
192 return d;
193 }
194 if (uv < 0x10000) {
eb160463
GS
195 *d++ = (U8)(( uv >> 12) | 0xe0);
196 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
197 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
198 return d;
199 }
200 if (uv < 0x200000) {
eb160463
GS
201 *d++ = (U8)(( uv >> 18) | 0xf0);
202 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
203 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
204 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
205 return d;
206 }
207 if (uv < 0x4000000) {
eb160463
GS
208 *d++ = (U8)(( uv >> 24) | 0xf8);
209 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
210 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
211 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
212 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
213 return d;
214 }
215 if (uv < 0x80000000) {
eb160463
GS
216 *d++ = (U8)(( uv >> 30) | 0xfc);
217 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
218 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
219 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
220 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
221 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
222 return d;
223 }
6b8eaf93 224#ifdef HAS_QUAD
d7578b48 225 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
226#endif
227 {
eb160463
GS
228 *d++ = 0xfe; /* Can't match U+FEFF! */
229 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
230 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
231 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
232 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
233 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
234 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
235 return d;
236 }
6b8eaf93 237#ifdef HAS_QUAD
a0ed51b3 238 {
eb160463
GS
239 *d++ = 0xff; /* Can't match U+FFFE! */
240 *d++ = 0x80; /* 6 Reserved bits */
241 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
242 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
243 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
244 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
245 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
246 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
247 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
248 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
249 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
250 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
251 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
252 return d;
253 }
254#endif
1d72bdf6 255#endif /* Loop style */
a0ed51b3 256}
9041c2e3 257
646ca15d
JH
258/*
259
f7d739d1 260Tests if the first C<len> bytes of string C<s> form a valid UTF-8
646ca15d 261character. Note that an INVARIANT (i.e. ASCII) character is a valid
f7d739d1 262UTF-8 character. The number of bytes in the UTF-8 character
646ca15d
JH
263will be returned if it is valid, otherwise 0.
264
265This is the "slow" version as opposed to the "fast" version which is
266the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
267difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
268or less you should use the IS_UTF8_CHAR(), for lengths of five or more
269you should use the _slow(). In practice this means that the _slow()
270will be used very rarely, since the maximum Unicode code point (as of
271Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
272the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
273five bytes or more.
274
275=cut */
c053b435 276STATIC STRLEN
5f66b61c 277S_is_utf8_char_slow(const U8 *s, const STRLEN len)
646ca15d
JH
278{
279 U8 u = *s;
280 STRLEN slen;
281 UV uv, ouv;
282
7918f24d
NC
283 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
284
646ca15d 285 if (UTF8_IS_INVARIANT(u))
f7d739d1 286 return len == 1;
646ca15d
JH
287
288 if (!UTF8_IS_START(u))
289 return 0;
290
291 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
292 return 0;
293
294 slen = len - 1;
295 s++;
77263263
ST
296#ifdef EBCDIC
297 u = NATIVE_TO_UTF(u);
298#endif
646ca15d
JH
299 u &= UTF_START_MASK(len);
300 uv = u;
301 ouv = uv;
302 while (slen--) {
303 if (!UTF8_IS_CONTINUATION(*s))
304 return 0;
305 uv = UTF8_ACCUMULATE(uv, *s);
48ef279e 306 if (uv < ouv)
646ca15d
JH
307 return 0;
308 ouv = uv;
309 s++;
310 }
311
312 if ((STRLEN)UNISKIP(uv) < len)
313 return 0;
314
315 return len;
316}
9041c2e3
NIS
317
318/*
492a624f
KW
319=for apidoc is_utf8_char_buf
320
321Returns the number of bytes that comprise the first UTF-8 encoded character in
322buffer C<buf>. C<buf_end> should point to one position beyond the end of the
323buffer. 0 is returned if C<buf> does not point to a complete, valid UTF-8
324encoded character.
325
326Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
327machines) is a valid UTF-8 character.
328
329=cut */
330
331STRLEN
332Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
333{
334
335 STRLEN len;
336
337 PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
338
339 if (buf_end <= buf) {
340 return 0;
341 }
342
343 len = buf_end - buf;
344 if (len > UTF8SKIP(buf)) {
345 len = UTF8SKIP(buf);
346 }
347
348#ifdef IS_UTF8_CHAR
349 if (IS_UTF8_CHAR_FAST(len))
350 return IS_UTF8_CHAR(buf, len) ? len : 0;
351#endif /* #ifdef IS_UTF8_CHAR */
352 return is_utf8_char_slow(buf, len);
353}
354
355/*
87cea99e 356=for apidoc is_utf8_char
eebe1485 357
76848387
KW
358DEPRECATED!
359
5da9da9e 360Tests if some arbitrary number of bytes begins in a valid UTF-8
2bbc8d55
SP
361character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
362character is a valid UTF-8 character. The actual number of bytes in the UTF-8
363character will be returned if it is valid, otherwise 0.
9041c2e3 364
76848387
KW
365This function is deprecated due to the possibility that malformed input could
366cause reading beyond the end of the input buffer. Use C<is_utf8_char_buf>
367instead.
e0328548 368
82686b01 369=cut */
76848387 370
067a85ef 371STRLEN
668b6d8d 372Perl_is_utf8_char(const U8 *s)
386d01d6 373{
7918f24d 374 PERL_ARGS_ASSERT_IS_UTF8_CHAR;
492a624f 375
76848387 376 /* Assumes we have enough space, which is why this is deprecated */
492a624f 377 return is_utf8_char_buf(s, s + UTF8SKIP(s));
386d01d6
GS
378}
379
eaf7a4d2 380
6662521e 381/*
87cea99e 382=for apidoc is_utf8_string
6662521e 383
c9ada85f 384Returns true if first C<len> bytes of the given string form a valid
9f7e3d64 385UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
e0328548
KW
386using C<strlen(s)> (which means if you use this option, that C<s> has to have a
387terminating NUL byte). Note that all characters being ASCII constitute 'a
388valid UTF-8 string'.
6662521e 389
eaf7a4d2 390See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
768c67ee 391
6662521e
GS
392=cut
393*/
394
8e84507e 395bool
668b6d8d 396Perl_is_utf8_string(const U8 *s, STRLEN len)
6662521e 397{
35da51f7 398 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 399 const U8* x = s;
067a85ef 400
7918f24d 401 PERL_ARGS_ASSERT_IS_UTF8_STRING;
1aa99e6b 402
6662521e 403 while (x < send) {
1acdb0da 404 /* Inline the easy bits of is_utf8_char() here for speed... */
e0328548
KW
405 if (UTF8_IS_INVARIANT(*x)) {
406 x++;
407 }
1acdb0da 408 else if (!UTF8_IS_START(*x))
e0328548 409 return FALSE;
1acdb0da
JH
410 else {
411 /* ... and call is_utf8_char() only if really needed. */
e0328548
KW
412 const STRLEN c = UTF8SKIP(x);
413 const U8* const next_char_ptr = x + c;
414
415 if (next_char_ptr > send) {
416 return FALSE;
417 }
418
768c67ee
JH
419 if (IS_UTF8_CHAR_FAST(c)) {
420 if (!IS_UTF8_CHAR(x, c))
e0328548 421 return FALSE;
3c614e38 422 }
e0328548
KW
423 else if (! is_utf8_char_slow(x, c)) {
424 return FALSE;
425 }
426 x = next_char_ptr;
1acdb0da 427 }
6662521e 428 }
768c67ee 429
067a85ef 430 return TRUE;
6662521e
GS
431}
432
67e989fb 433/*
814fafa7
NC
434Implemented as a macro in utf8.h
435
87cea99e 436=for apidoc is_utf8_string_loc
814fafa7
NC
437
438Like is_utf8_string() but stores the location of the failure (in the
439case of "utf8ness failure") or the location s+len (in the case of
440"utf8ness success") in the C<ep>.
441
442See also is_utf8_string_loclen() and is_utf8_string().
443
87cea99e 444=for apidoc is_utf8_string_loclen
81cd54e3 445
e3e4599f 446Like is_utf8_string() but stores the location of the failure (in the
768c67ee
JH
447case of "utf8ness failure") or the location s+len (in the case of
448"utf8ness success") in the C<ep>, and the number of UTF-8
449encoded characters in the C<el>.
450
451See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
452
453=cut
454*/
455
456bool
668b6d8d 457Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 458{
35da51f7 459 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 460 const U8* x = s;
81cd54e3 461 STRLEN c;
3ebfea28 462 STRLEN outlen = 0;
7918f24d
NC
463
464 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
81cd54e3 465
81cd54e3 466 while (x < send) {
e0328548
KW
467 const U8* next_char_ptr;
468
81cd54e3
JH
469 /* Inline the easy bits of is_utf8_char() here for speed... */
470 if (UTF8_IS_INVARIANT(*x))
e0328548 471 next_char_ptr = x + 1;
768c67ee
JH
472 else if (!UTF8_IS_START(*x))
473 goto out;
81cd54e3 474 else {
768c67ee 475 /* ... and call is_utf8_char() only if really needed. */
768c67ee 476 c = UTF8SKIP(x);
e0328548
KW
477 next_char_ptr = c + x;
478 if (next_char_ptr > send) {
479 goto out;
480 }
768c67ee
JH
481 if (IS_UTF8_CHAR_FAST(c)) {
482 if (!IS_UTF8_CHAR(x, c))
483 c = 0;
484 } else
485 c = is_utf8_char_slow(x, c);
768c67ee
JH
486 if (!c)
487 goto out;
81cd54e3 488 }
e0328548 489 x = next_char_ptr;
3ebfea28 490 outlen++;
81cd54e3 491 }
768c67ee
JH
492
493 out:
3ebfea28
AL
494 if (el)
495 *el = outlen;
496
768c67ee
JH
497 if (ep)
498 *ep = x;
3ebfea28 499 return (x == send);
81cd54e3
JH
500}
501
502/*
768c67ee 503
87cea99e 504=for apidoc utf8n_to_uvuni
67e989fb 505
9041c2e3 506Bottom level UTF-8 decode routine.
949cf498
KW
507Returns the code point value of the first character in the string C<s>
508which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding and no longer than
509C<curlen> bytes; C<retlen> will be set to the length, in bytes, of that
510character.
511
512The value of C<flags> determines the behavior when C<s> does not point to a
513well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
514C<retlen> is set to the expected length of the UTF-8 character in bytes, zero
515is returned, and if UTF-8 warnings haven't been lexically disabled, a warning
516is raised.
517
518Various ALLOW flags can be set in C<flags> to allow (and not warn on)
519individual types of malformations, such as the sequence being overlong (that
520is, when there is a shorter sequence that can express the same code point;
521overlong sequences are expressly forbidden in the UTF-8 standard due to
522potential security issues). Another malformation example is the first byte of
523a character not being a legal first byte. See F<utf8.h> for the list of such
524flags. Of course, the value returned by this function under such conditions is
525not reliable.
526
527The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
528flags) malformation is found. If this flag is set, the routine assumes that
529the caller will raise a warning, and this function will silently just set
530C<retlen> to C<-1> and return zero.
531
532Certain code points are considered problematic. These are Unicode surrogates,
533Unicode non-characters, and code points above the Unicode maximum of 0x10FFF.
534By default these are considered regular code points, but certain situations
5eafe189 535warrant special handling for them. If C<flags> contains
949cf498
KW
536UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
537malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE,
538UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
539maximum) can be set to disallow these categories individually.
540
541The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
542UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
543for their respective categories, but otherwise the code points are considered
544valid (not malformations). To get a category to both be treated as a
545malformation and raise a warning, specify both the WARN and DISALLOW flags.
546(But note that warnings are not raised if lexically disabled nor if
547UTF8_CHECK_ONLY is also specified.)
548
549Very large code points (above 0x7FFF_FFFF) are considered more problematic than
550the others that are above the Unicode legal maximum. There are several
551reasons, one of which is that the original UTF-8 specification never went above
552this number (the current 0x10FFF limit was imposed later). The UTF-8 encoding
5eafe189 553on ASCII platforms for these large code points begins with a byte containing
949cf498
KW
5540xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to be treated as
555malformations, while allowing smaller above-Unicode code points. (Of course
556UTF8_DISALLOW_SUPER will treat all above-Unicode code points, including these,
557as malformations.) Similarly, UTF8_WARN_FE_FF acts just like the other WARN
558flags, but applies just to these code points.
559
560All other code points corresponding to Unicode characters, including private
561use and those yet to be assigned, are never considered malformed and never
562warn.
67e989fb 563
9041c2e3
NIS
564Most code should use utf8_to_uvchr() rather than call this directly.
565
37607a96
PK
566=cut
567*/
67e989fb 568
a0ed51b3 569UV
7fc63493 570Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 571{
97aff369 572 dVAR;
d4c19fe8 573 const U8 * const s0 = s;
9c5ffd7c 574 UV uv = *s, ouv = 0;
ba210ebe 575 STRLEN len = 1;
949cf498 576 bool dowarn = ckWARN_d(WARN_UTF8);
7fc63493 577 const UV startbyte = *s;
ba210ebe 578 STRLEN expectlen = 0;
a0dbb045 579 U32 warning = 0;
949cf498 580 SV* sv = NULL;
a0dbb045 581
7918f24d
NC
582 PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
583
949cf498 584/* This list is a superset of the UTF8_ALLOW_XXX. */
a0dbb045
JH
585
586#define UTF8_WARN_EMPTY 1
587#define UTF8_WARN_CONTINUATION 2
588#define UTF8_WARN_NON_CONTINUATION 3
f9bc8ed7
KW
589#define UTF8_WARN_SHORT 4
590#define UTF8_WARN_OVERFLOW 5
591#define UTF8_WARN_LONG 6
a0dbb045
JH
592
593 if (curlen == 0 &&
594 !(flags & UTF8_ALLOW_EMPTY)) {
595 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
596 goto malformed;
597 }
598
1d72bdf6 599 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
600 if (retlen)
601 *retlen = 1;
c4d5f83a 602 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 603 }
67e989fb 604
421a8bf2 605 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 606 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 607 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
608 goto malformed;
609 }
610
421a8bf2 611 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 612 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 613 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
614 goto malformed;
615 }
9041c2e3 616
1d72bdf6 617#ifdef EBCDIC
75383841 618 uv = NATIVE_TO_UTF(uv);
1d72bdf6 619#else
949cf498
KW
620 if (uv == 0xfe || uv == 0xff) {
621 if (flags & (UTF8_WARN_SUPER|UTF8_WARN_FE_FF)) {
111d382d 622 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
949cf498
KW
623 flags &= ~UTF8_WARN_SUPER; /* Only warn once on this problem */
624 }
625 if (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_FE_FF)) {
626 goto malformed;
627 }
a0ed51b3 628 }
1d72bdf6
NIS
629#endif
630
ba210ebe
JH
631 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
632 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
633 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
634 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
635#ifdef EBCDIC
636 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
637 else { len = 7; uv &= 0x01; }
638#else
ba210ebe
JH
639 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
640 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
641 else { len = 13; uv = 0; } /* whoa! */
642#endif
643
a0ed51b3
LW
644 if (retlen)
645 *retlen = len;
9041c2e3 646
ba210ebe
JH
647 expectlen = len;
648
fcc8fcf6
JH
649 if ((curlen < expectlen) &&
650 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 651 warning = UTF8_WARN_SHORT;
ba210ebe
JH
652 goto malformed;
653 }
654
655 len--;
a0ed51b3 656 s++;
949cf498 657 ouv = uv; /* ouv is the value from the previous iteration */
ba210ebe 658
a0ed51b3 659 while (len--) {
421a8bf2
JH
660 if (!UTF8_IS_CONTINUATION(*s) &&
661 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
662 s--;
663 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 664 goto malformed;
a0ed51b3
LW
665 }
666 else
8850bf83 667 uv = UTF8_ACCUMULATE(uv, *s);
949cf498
KW
668 if (!(uv > ouv)) { /* If the value didn't grow from the previous
669 iteration, something is horribly wrong */
a0dbb045
JH
670 /* These cannot be allowed. */
671 if (uv == ouv) {
75dbc644 672 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
673 warning = UTF8_WARN_LONG;
674 goto malformed;
675 }
676 }
677 else { /* uv < ouv */
678 /* This cannot be allowed. */
679 warning = UTF8_WARN_OVERFLOW;
680 goto malformed;
681 }
ba210ebe
JH
682 }
683 s++;
684 ouv = uv;
685 }
686
949cf498 687 if ((expectlen > (STRLEN)UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 688 warning = UTF8_WARN_LONG;
ba210ebe 689 goto malformed;
949cf498
KW
690 } else if (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE)) {
691 if (UNICODE_IS_SURROGATE(uv)) {
692 if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE) {
111d382d 693 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
949cf498
KW
694 }
695 if (flags & UTF8_DISALLOW_SURROGATE) {
696 goto disallowed;
697 }
698 }
699 else if (UNICODE_IS_NONCHAR(uv)) {
700 if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR ) {
111d382d 701 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
949cf498
KW
702 }
703 if (flags & UTF8_DISALLOW_NONCHAR) {
704 goto disallowed;
705 }
706 }
707 else if ((uv > PERL_UNICODE_MAX)) {
708 if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER) {
111d382d 709 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
949cf498
KW
710 }
711 if (flags & UTF8_DISALLOW_SUPER) {
712 goto disallowed;
713 }
714 }
715
716 /* Here, this is not considered a malformed character, so drop through
717 * to return it */
a0ed51b3 718 }
ba210ebe 719
a0ed51b3 720 return uv;
ba210ebe 721
949cf498
KW
722disallowed: /* Is disallowed, but otherwise not malformed. 'sv' will have been
723 set if there is to be a warning. */
724 if (!sv) {
725 dowarn = 0;
726 }
727
ba210ebe
JH
728malformed:
729
fcc8fcf6 730 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 731 if (retlen)
10edeb5d 732 *retlen = ((STRLEN) -1);
ba210ebe
JH
733 return 0;
734 }
735
a0dbb045 736 if (dowarn) {
949cf498 737 if (! sv) {
5b311467 738 sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
949cf498 739 }
5b311467 740
5e3d7cf5
KW
741 switch (warning) {
742 case 0: /* Intentionally empty. */ break;
743 case UTF8_WARN_EMPTY:
744 sv_catpvs(sv, "(empty string)");
745 break;
746 case UTF8_WARN_CONTINUATION:
747 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
748 break;
749 case UTF8_WARN_NON_CONTINUATION:
750 if (s == s0)
751 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
752 (UV)s[1], startbyte);
753 else {
754 const int len = (int)(s-s0);
755 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
756 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
757 }
758
759 break;
760 case UTF8_WARN_SHORT:
761 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
762 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
763 expectlen = curlen; /* distance for caller to skip */
764 break;
765 case UTF8_WARN_OVERFLOW:
766 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
767 ouv, *s, startbyte);
768 break;
769 case UTF8_WARN_LONG:
770 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
771 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
772 break;
773 default:
774 sv_catpvs(sv, "(unknown reason)");
775 break;
776 }
a0dbb045 777
949cf498 778 if (sv) {
44f8325f 779 const char * const s = SvPVX_const(sv);
a0dbb045
JH
780
781 if (PL_op)
9014280d 782 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 783 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 784 else
9014280d 785 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
786 }
787 }
788
ba210ebe 789 if (retlen)
28d3d195 790 *retlen = expectlen ? expectlen : len;
ba210ebe 791
28d3d195 792 return 0;
a0ed51b3
LW
793}
794
8e84507e 795/*
87cea99e 796=for apidoc utf8_to_uvchr
9041c2e3 797
6ee84de2 798Returns the native code point of the first character in the string C<s>
1e54db1a 799which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
800length, in bytes, of that character.
801
1e54db1a 802If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
803returned and retlen is set, if possible, to -1.
804
805=cut
806*/
807
ae0e24b1 808
9041c2e3 809UV
7fc63493 810Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 811{
7918f24d
NC
812 PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
813
1754c1a1 814 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
ae0e24b1 815 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
816}
817
818/*
87cea99e 819=for apidoc utf8_to_uvuni
9041c2e3
NIS
820
821Returns the Unicode code point of the first character in the string C<s>
1e54db1a 822which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
823length, in bytes, of that character.
824
2bbc8d55 825This function should only be used when the returned UV is considered
9041c2e3
NIS
826an index into the Unicode semantic tables (e.g. swashes).
827
1e54db1a 828If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 829returned and retlen is set, if possible, to -1.
8e84507e
NIS
830
831=cut
832*/
833
834UV
7fc63493 835Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 836{
7918f24d
NC
837 PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
838
9041c2e3 839 /* Call the low level routine asking for checks */
89ebb4a3 840 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
ae0e24b1 841 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
842}
843
b76347f2 844/*
87cea99e 845=for apidoc utf8_length
b76347f2
JH
846
847Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
848Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
849up past C<e>, croaks.
b76347f2
JH
850
851=cut
852*/
853
854STRLEN
35a4481c 855Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 856{
97aff369 857 dVAR;
b76347f2
JH
858 STRLEN len = 0;
859
7918f24d
NC
860 PERL_ARGS_ASSERT_UTF8_LENGTH;
861
8850bf83
JH
862 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
863 * the bitops (especially ~) can create illegal UTF-8.
864 * In other words: in Perl UTF-8 is not just for Unicode. */
865
a3b680e6
AL
866 if (e < s)
867 goto warn_and_return;
b76347f2 868 while (s < e) {
8e91ec7f
AV
869 if (!UTF8_IS_INVARIANT(*s))
870 s += UTF8SKIP(s);
871 else
872 s++;
873 len++;
874 }
875
876 if (e != s) {
877 len--;
878 warn_and_return:
9b387841
NC
879 if (PL_op)
880 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
881 "%s in %s", unees, OP_DESC(PL_op));
882 else
61a12c31 883 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
884 }
885
886 return len;
887}
888
b06226ff 889/*
87cea99e 890=for apidoc utf8_distance
b06226ff 891
1e54db1a 892Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
893and C<b>.
894
895WARNING: use only if you *know* that the pointers point inside the
896same UTF-8 buffer.
897
37607a96
PK
898=cut
899*/
a0ed51b3 900
02eb7b47 901IV
35a4481c 902Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 903{
7918f24d
NC
904 PERL_ARGS_ASSERT_UTF8_DISTANCE;
905
bf1665bc 906 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3
LW
907}
908
b06226ff 909/*
87cea99e 910=for apidoc utf8_hop
b06226ff 911
8850bf83
JH
912Return the UTF-8 pointer C<s> displaced by C<off> characters, either
913forward or backward.
b06226ff
JH
914
915WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
916the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
917on the first byte of character or just after the last byte of a character.
b06226ff 918
37607a96
PK
919=cut
920*/
a0ed51b3
LW
921
922U8 *
4373e329 923Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 924{
7918f24d
NC
925 PERL_ARGS_ASSERT_UTF8_HOP;
926
96a5add6 927 PERL_UNUSED_CONTEXT;
8850bf83
JH
928 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
929 * the bitops (especially ~) can create illegal UTF-8.
930 * In other words: in Perl UTF-8 is not just for Unicode. */
931
a0ed51b3
LW
932 if (off >= 0) {
933 while (off--)
934 s += UTF8SKIP(s);
935 }
936 else {
937 while (off++) {
938 s--;
8850bf83
JH
939 while (UTF8_IS_CONTINUATION(*s))
940 s--;
a0ed51b3
LW
941 }
942 }
4373e329 943 return (U8 *)s;
a0ed51b3
LW
944}
945
6940069f 946/*
fed3ba5d
NC
947=for apidoc bytes_cmp_utf8
948
949Compares the sequence of characters (stored as octets) in b, blen with the
950sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
951equal, -1 or -2 if the first string is less than the second string, +1 or +2
952if the first string is greater than the second string.
953
954-1 or +1 is returned if the shorter string was identical to the start of the
955longer string. -2 or +2 is returned if the was a difference between characters
956within the strings.
957
958=cut
959*/
960
961int
962Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
963{
964 const U8 *const bend = b + blen;
965 const U8 *const uend = u + ulen;
966
967 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
968
969 PERL_UNUSED_CONTEXT;
970
971 while (b < bend && u < uend) {
972 U8 c = *u++;
973 if (!UTF8_IS_INVARIANT(c)) {
974 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
975 if (u < uend) {
976 U8 c1 = *u++;
977 if (UTF8_IS_CONTINUATION(c1)) {
356979f4 978 c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, c1));
fed3ba5d
NC
979 } else {
980 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
981 "Malformed UTF-8 character "
982 "(unexpected non-continuation byte 0x%02x"
983 ", immediately after start byte 0x%02x)"
984 /* Dear diag.t, it's in the pod. */
985 "%s%s", c1, c,
986 PL_op ? " in " : "",
987 PL_op ? OP_DESC(PL_op) : "");
988 return -2;
989 }
990 } else {
991 if (PL_op)
992 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
993 "%s in %s", unees, OP_DESC(PL_op));
994 else
61a12c31 995 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
996 return -2; /* Really want to return undef :-) */
997 }
998 } else {
999 return -2;
1000 }
1001 }
1002 if (*b != c) {
1003 return *b < c ? -2 : +2;
1004 }
1005 ++b;
1006 }
1007
1008 if (b == bend && u == uend)
1009 return 0;
1010
1011 return b < bend ? +1 : -1;
1012}
1013
1014/*
87cea99e 1015=for apidoc utf8_to_bytes
6940069f 1016
2bbc8d55 1017Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
246fae53
MG
1018Unlike C<bytes_to_utf8>, this over-writes the original string, and
1019updates len to contain the new length.
67e989fb 1020Returns zero on failure, setting C<len> to -1.
6940069f 1021
95be277c
NC
1022If you need a copy of the string, see C<bytes_from_utf8>.
1023
6940069f
GS
1024=cut
1025*/
1026
1027U8 *
37607a96 1028Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 1029{
d4c19fe8
AL
1030 U8 * const save = s;
1031 U8 * const send = s + *len;
6940069f 1032 U8 *d;
246fae53 1033
7918f24d
NC
1034 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
1035
1e54db1a 1036 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 1037 while (s < send) {
dcad2880
JH
1038 U8 c = *s++;
1039
1d72bdf6
NIS
1040 if (!UTF8_IS_INVARIANT(c) &&
1041 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
1042 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
10edeb5d 1043 *len = ((STRLEN) -1);
dcad2880
JH
1044 return 0;
1045 }
246fae53 1046 }
dcad2880
JH
1047
1048 d = s = save;
6940069f 1049 while (s < send) {
ed646e6e 1050 STRLEN ulen;
9041c2e3 1051 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 1052 s += ulen;
6940069f
GS
1053 }
1054 *d = '\0';
246fae53 1055 *len = d - save;
6940069f
GS
1056 return save;
1057}
1058
1059/*
87cea99e 1060=for apidoc bytes_from_utf8
f9a63242 1061
2bbc8d55 1062Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
35a4481c 1063Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
1064the newly-created string, and updates C<len> to contain the new
1065length. Returns the original string if no conversion occurs, C<len>
1066is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55
SP
10670 if C<s> is converted or consisted entirely of characters that are invariant
1068in utf8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 1069
37607a96
PK
1070=cut
1071*/
f9a63242
JH
1072
1073U8 *
e1ec3a88 1074Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 1075{
f9a63242 1076 U8 *d;
e1ec3a88
AL
1077 const U8 *start = s;
1078 const U8 *send;
f9a63242
JH
1079 I32 count = 0;
1080
7918f24d
NC
1081 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
1082
96a5add6 1083 PERL_UNUSED_CONTEXT;
f9a63242 1084 if (!*is_utf8)
73d840c0 1085 return (U8 *)start;
f9a63242 1086
1e54db1a 1087 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 1088 for (send = s + *len; s < send;) {
e1ec3a88 1089 U8 c = *s++;
1d72bdf6 1090 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
1091 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
1092 (c = *s++) && UTF8_IS_CONTINUATION(c))
1093 count++;
1094 else
73d840c0 1095 return (U8 *)start;
db42d148 1096 }
f9a63242
JH
1097 }
1098
35da51f7 1099 *is_utf8 = FALSE;
f9a63242 1100
212542aa 1101 Newx(d, (*len) - count + 1, U8);
ef9edfd0 1102 s = start; start = d;
f9a63242
JH
1103 while (s < send) {
1104 U8 c = *s++;
c4d5f83a
NIS
1105 if (!UTF8_IS_INVARIANT(c)) {
1106 /* Then it is two-byte encoded */
356979f4 1107 c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++));
c4d5f83a
NIS
1108 }
1109 *d++ = c;
f9a63242
JH
1110 }
1111 *d = '\0';
1112 *len = d - start;
73d840c0 1113 return (U8 *)start;
f9a63242
JH
1114}
1115
1116/*
87cea99e 1117=for apidoc bytes_to_utf8
6940069f 1118
ff97e5cf
KW
1119Converts a string C<s> of length C<len> bytes from the native encoding into
1120UTF-8.
6662521e 1121Returns a pointer to the newly-created string, and sets C<len> to
ff97e5cf 1122reflect the new length in bytes.
6940069f 1123
2bbc8d55
SP
1124A NUL character will be written after the end of the string.
1125
1126If you want to convert to UTF-8 from encodings other than
1127the native (Latin1 or EBCDIC),
c9ada85f
JH
1128see sv_recode_to_utf8().
1129
497711e7 1130=cut
6940069f
GS
1131*/
1132
c682ebef
FC
1133/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1134 likewise need duplication. */
1135
6940069f 1136U8*
35a4481c 1137Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 1138{
35a4481c 1139 const U8 * const send = s + (*len);
6940069f
GS
1140 U8 *d;
1141 U8 *dst;
7918f24d
NC
1142
1143 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 1144 PERL_UNUSED_CONTEXT;
6940069f 1145
212542aa 1146 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
1147 dst = d;
1148
1149 while (s < send) {
35a4481c 1150 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 1151 if (UNI_IS_INVARIANT(uv))
eb160463 1152 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 1153 else {
eb160463
GS
1154 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
1155 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
1156 }
1157 }
1158 *d = '\0';
6662521e 1159 *len = d-dst;
6940069f
GS
1160 return dst;
1161}
1162
a0ed51b3 1163/*
dea0fc0b 1164 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
1165 *
1166 * Destination must be pre-extended to 3/2 source. Do not use in-place.
1167 * We optimize for native, for obvious reasons. */
1168
1169U8*
dea0fc0b 1170Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 1171{
dea0fc0b
JH
1172 U8* pend;
1173 U8* dstart = d;
1174
7918f24d
NC
1175 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1176
dea0fc0b 1177 if (bytelen & 1)
f5992bc4 1178 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
1179
1180 pend = p + bytelen;
1181
a0ed51b3 1182 while (p < pend) {
dea0fc0b
JH
1183 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1184 p += 2;
a0ed51b3 1185 if (uv < 0x80) {
e294cc5d
JH
1186#ifdef EBCDIC
1187 *d++ = UNI_TO_NATIVE(uv);
1188#else
eb160463 1189 *d++ = (U8)uv;
e294cc5d 1190#endif
a0ed51b3
LW
1191 continue;
1192 }
1193 if (uv < 0x800) {
eb160463
GS
1194 *d++ = (U8)(( uv >> 6) | 0xc0);
1195 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1196 continue;
1197 }
52b9aa85 1198 if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */
01ea242b 1199 if (p >= pend) {
dea0fc0b 1200 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
01ea242b
NC
1201 } else {
1202 UV low = (p[0] << 8) + p[1];
1203 p += 2;
52b9aa85 1204 if (low < 0xdc00 || low > 0xdfff)
01ea242b
NC
1205 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1206 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
1207 }
dbde1951
NC
1208 } else if (uv >= 0xdc00 && uv <= 0xdfff) {
1209 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
1210 }
1211 if (uv < 0x10000) {
eb160463
GS
1212 *d++ = (U8)(( uv >> 12) | 0xe0);
1213 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1214 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1215 continue;
1216 }
1217 else {
eb160463
GS
1218 *d++ = (U8)(( uv >> 18) | 0xf0);
1219 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1220 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1221 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1222 continue;
1223 }
1224 }
dea0fc0b 1225 *newlen = d - dstart;
a0ed51b3
LW
1226 return d;
1227}
1228
1229/* Note: this one is slightly destructive of the source. */
1230
1231U8*
dea0fc0b 1232Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
1233{
1234 U8* s = (U8*)p;
d4c19fe8 1235 U8* const send = s + bytelen;
7918f24d
NC
1236
1237 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1238
e0ea5e2d
NC
1239 if (bytelen & 1)
1240 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1241 (UV)bytelen);
1242
a0ed51b3 1243 while (s < send) {
d4c19fe8 1244 const U8 tmp = s[0];
a0ed51b3
LW
1245 s[0] = s[1];
1246 s[1] = tmp;
1247 s += 2;
1248 }
dea0fc0b 1249 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
1250}
1251
c3fd2246
KW
1252/* for now these are all defined (inefficiently) in terms of the utf8 versions.
1253 * Note that the macros in handy.h that call these short-circuit calling them
1254 * for Latin-1 range inputs */
a0ed51b3
LW
1255
1256bool
84afefe6 1257Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 1258{
89ebb4a3 1259 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1260 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1261 return is_utf8_alnum(tmpbuf);
1262}
1263
1264bool
84afefe6 1265Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1266{
89ebb4a3 1267 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1268 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1269 return is_utf8_idfirst(tmpbuf);
1270}
1271
1272bool
84afefe6 1273Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1274{
89ebb4a3 1275 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1276 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1277 return is_utf8_alpha(tmpbuf);
1278}
1279
1280bool
84afefe6 1281Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1282{
bc39fe24 1283 return isASCII(c);
4d61ec05
GS
1284}
1285
1286bool
84afefe6 1287Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1288{
89ebb4a3 1289 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1290 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1291 return is_utf8_space(tmpbuf);
1292}
1293
1294bool
84afefe6 1295Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1296{
89ebb4a3 1297 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1298 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1299 return is_utf8_digit(tmpbuf);
1300}
1301
1302bool
84afefe6 1303Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1304{
89ebb4a3 1305 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1306 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1307 return is_utf8_upper(tmpbuf);
1308}
1309
1310bool
84afefe6 1311Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1312{
89ebb4a3 1313 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1314 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1315 return is_utf8_lower(tmpbuf);
1316}
1317
1318bool
84afefe6 1319Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1320{
7b952154 1321 return isCNTRL_L1(c);
b8c5462f
JH
1322}
1323
1324bool
84afefe6 1325Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1326{
89ebb4a3 1327 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1328 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1329 return is_utf8_graph(tmpbuf);
1330}
1331
1332bool
84afefe6 1333Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1334{
89ebb4a3 1335 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1336 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1337 return is_utf8_print(tmpbuf);
1338}
1339
b8c5462f 1340bool
84afefe6 1341Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1342{
89ebb4a3 1343 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1344 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1345 return is_utf8_punct(tmpbuf);
1346}
1347
4d61ec05 1348bool
84afefe6 1349Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1350{
89ebb4a3 1351 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1352 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1353 return is_utf8_xdigit(tmpbuf);
1354}
1355
3a4c58c9
KW
1356UV
1357Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
1358{
1359 /* We have the latin1-range values compiled into the core, so just use
1360 * those, converting the result to utf8. The only difference between upper
1361 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
1362 * either "SS" or "Ss". Which one to use is passed into the routine in
1363 * 'S_or_s' to avoid a test */
1364
1365 UV converted = toUPPER_LATIN1_MOD(c);
1366
1367 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
1368
1369 assert(S_or_s == 'S' || S_or_s == 's');
1370
1371 if (UNI_IS_INVARIANT(converted)) { /* No difference between the two for
1372 characters in this range */
1373 *p = (U8) converted;
1374 *lenp = 1;
1375 return converted;
1376 }
1377
1378 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
1379 * which it maps to one of them, so as to only have to have one check for
1380 * it in the main case */
1381 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1382 switch (c) {
1383 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1384 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
1385 break;
1386 case MICRO_SIGN:
1387 converted = GREEK_CAPITAL_LETTER_MU;
1388 break;
1389 case LATIN_SMALL_LETTER_SHARP_S:
1390 *(p)++ = 'S';
1391 *p = S_or_s;
1392 *lenp = 2;
1393 return 'S';
1394 default:
1395 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
1396 /* NOTREACHED */
1397 }
1398 }
1399
1400 *(p)++ = UTF8_TWO_BYTE_HI(converted);
1401 *p = UTF8_TWO_BYTE_LO(converted);
1402 *lenp = 2;
1403
1404 return converted;
1405}
1406
50bda2c3
KW
1407/* Call the function to convert a UTF-8 encoded character to the specified case.
1408 * Note that there may be more than one character in the result.
1409 * INP is a pointer to the first byte of the input character
1410 * OUTP will be set to the first byte of the string of changed characters. It
1411 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
1412 * LENP will be set to the length in bytes of the string of changed characters
1413 *
1414 * The functions return the ordinal of the first character in the string of OUTP */
f90a9a02
KW
1415#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUc")
1416#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTc")
1417#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLc")
50bda2c3
KW
1418
1419/* This additionally has the input parameter SPECIALS, which if non-zero will
1420 * cause this to use the SPECIALS hash for folding (meaning get full case
1421 * folding); otherwise, when zero, this implies a simple case fold */
f90a9a02 1422#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecCf" : NULL)
c3fd2246 1423
84afefe6
JH
1424UV
1425Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1426{
3a4c58c9
KW
1427 dVAR;
1428
c3fd2246
KW
1429 /* Convert the Unicode character whose ordinal is c to its uppercase
1430 * version and store that in UTF-8 in p and its length in bytes in lenp.
1431 * Note that the p needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1432 * the changed version may be longer than the original character.
1433 *
1434 * The ordinal of the first character of the changed version is returned
1435 * (but note, as explained above, that there may be more.) */
1436
7918f24d
NC
1437 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1438
3a4c58c9
KW
1439 if (c < 256) {
1440 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
1441 }
1442
0ebc6274 1443 uvchr_to_utf8(p, c);
3a4c58c9 1444 return CALL_UPPER_CASE(p, p, lenp);
a0ed51b3
LW
1445}
1446
84afefe6
JH
1447UV
1448Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1449{
3a4c58c9
KW
1450 dVAR;
1451
7918f24d
NC
1452 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1453
3a4c58c9
KW
1454 if (c < 256) {
1455 return _to_upper_title_latin1((U8) c, p, lenp, 's');
1456 }
1457
0ebc6274 1458 uvchr_to_utf8(p, c);
3a4c58c9 1459 return CALL_TITLE_CASE(p, p, lenp);
a0ed51b3
LW
1460}
1461
afc16117
KW
1462STATIC U8
1463S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
1464{
1465 /* We have the latin1-range values compiled into the core, so just use
1466 * those, converting the result to utf8. Since the result is always just
1467 * one character, we allow p to be NULL */
1468
1469 U8 converted = toLOWER_LATIN1(c);
1470
1471 if (p != NULL) {
1472 if (UNI_IS_INVARIANT(converted)) {
1473 *p = converted;
1474 *lenp = 1;
1475 }
1476 else {
1477 *p = UTF8_TWO_BYTE_HI(converted);
1478 *(p+1) = UTF8_TWO_BYTE_LO(converted);
1479 *lenp = 2;
1480 }
1481 }
1482 return converted;
1483}
1484
84afefe6
JH
1485UV
1486Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1487{
968c5e6a
KW
1488 dVAR;
1489
7918f24d
NC
1490 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1491
afc16117
KW
1492 if (c < 256) {
1493 return to_lower_latin1((U8) c, p, lenp);
bca00c02
KW
1494 }
1495
afc16117 1496 uvchr_to_utf8(p, c);
968c5e6a 1497 return CALL_LOWER_CASE(p, p, lenp);
a0ed51b3
LW
1498}
1499
84afefe6 1500UV
f673fad4 1501Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const bool flags)
a1dde8de 1502{
f673fad4
KW
1503 /* Corresponds to to_lower_latin1(), flags is TRUE if to use full case
1504 * folding */
1505
a1dde8de
KW
1506 UV converted;
1507
1508 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
1509
1510 if (c == MICRO_SIGN) {
1511 converted = GREEK_SMALL_LETTER_MU;
1512 }
1513 else if (flags && c == LATIN_SMALL_LETTER_SHARP_S) {
1514 *(p)++ = 's';
1515 *p = 's';
1516 *lenp = 2;
1517 return 's';
1518 }
1519 else { /* In this range the fold of all other characters is their lower
1520 case */
1521 converted = toLOWER_LATIN1(c);
1522 }
1523
1524 if (UNI_IS_INVARIANT(converted)) {
1525 *p = (U8) converted;
1526 *lenp = 1;
1527 }
1528 else {
1529 *(p)++ = UTF8_TWO_BYTE_HI(converted);
1530 *p = UTF8_TWO_BYTE_LO(converted);
1531 *lenp = 2;
1532 }
1533
1534 return converted;
1535}
1536
1537UV
f673fad4 1538Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const bool flags)
84afefe6 1539{
4b593389
KW
1540
1541 /* Not currently externally documented, and subject to change, <flags> is
f673fad4 1542 * TRUE iff full folding is to be used */
4b593389 1543
36bb2ab6 1544 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 1545
a1dde8de
KW
1546 if (c < 256) {
1547 return _to_fold_latin1((U8) c, p, lenp, flags);
1548 }
1549
0ebc6274 1550 uvchr_to_utf8(p, c);
a1dde8de 1551 return CALL_FOLD_CASE(p, p, lenp, flags);
84afefe6
JH
1552}
1553
ea317ccb
KW
1554/* for now these all assume no locale info available for Unicode > 255; and
1555 * the corresponding macros in handy.h (like isALNUM_LC_uvchr) should have been
1556 * called instead, so that these don't get called for < 255 */
a0ed51b3
LW
1557
1558bool
84afefe6 1559Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1560{
1561 return is_uni_alnum(c); /* XXX no locale support yet */
1562}
1563
1564bool
84afefe6 1565Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1566{
1567 return is_uni_idfirst(c); /* XXX no locale support yet */
1568}
1569
1570bool
84afefe6 1571Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1572{
1573 return is_uni_alpha(c); /* XXX no locale support yet */
1574}
1575
1576bool
84afefe6 1577Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1578{
1579 return is_uni_ascii(c); /* XXX no locale support yet */
1580}
1581
1582bool
84afefe6 1583Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1584{
1585 return is_uni_space(c); /* XXX no locale support yet */
1586}
1587
1588bool
84afefe6 1589Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1590{
1591 return is_uni_digit(c); /* XXX no locale support yet */
1592}
1593
1594bool
84afefe6 1595Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1596{
1597 return is_uni_upper(c); /* XXX no locale support yet */
1598}
1599
1600bool
84afefe6 1601Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1602{
1603 return is_uni_lower(c); /* XXX no locale support yet */
1604}
1605
1606bool
84afefe6 1607Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1608{
1609 return is_uni_cntrl(c); /* XXX no locale support yet */
1610}
1611
1612bool
84afefe6 1613Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1614{
1615 return is_uni_graph(c); /* XXX no locale support yet */
1616}
1617
1618bool
84afefe6 1619Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1620{
1621 return is_uni_print(c); /* XXX no locale support yet */
1622}
1623
b8c5462f 1624bool
84afefe6 1625Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1626{
1627 return is_uni_punct(c); /* XXX no locale support yet */
1628}
1629
4d61ec05 1630bool
84afefe6 1631Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1632{
1633 return is_uni_xdigit(c); /* XXX no locale support yet */
1634}
1635
b7ac61fa
JH
1636U32
1637Perl_to_uni_upper_lc(pTHX_ U32 c)
1638{
ee099d14
JH
1639 /* XXX returns only the first character -- do not use XXX */
1640 /* XXX no locale support yet */
1641 STRLEN len;
89ebb4a3 1642 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1643 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1644}
1645
1646U32
1647Perl_to_uni_title_lc(pTHX_ U32 c)
1648{
ee099d14
JH
1649 /* XXX returns only the first character XXX -- do not use XXX */
1650 /* XXX no locale support yet */
1651 STRLEN len;
89ebb4a3 1652 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1653 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1654}
1655
1656U32
1657Perl_to_uni_lower_lc(pTHX_ U32 c)
1658{
ee099d14
JH
1659 /* XXX returns only the first character -- do not use XXX */
1660 /* XXX no locale support yet */
1661 STRLEN len;
89ebb4a3 1662 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1663 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1664}
1665
7452cf6a 1666static bool
5141f98e 1667S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
1668 const char *const swashname)
1669{
ea317ccb
KW
1670 /* returns a boolean giving whether or not the UTF8-encoded character that
1671 * starts at <p> is in the swash indicated by <swashname>. <swash>
1672 * contains a pointer to where the swash indicated by <swashname>
1673 * is to be stored; which this routine will do, so that future calls will
1674 * look at <*swash> and only generate a swash if it is not null
1675 *
1676 * Note that it is assumed that the buffer length of <p> is enough to
1677 * contain all the bytes that comprise the character. Thus, <*p> should
1678 * have been checked before this call for mal-formedness enough to assure
1679 * that. */
1680
97aff369 1681 dVAR;
7918f24d
NC
1682
1683 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1684
492a624f
KW
1685 /* The API should have included a length for the UTF-8 character in <p>,
1686 * but it doesn't. We therefor assume that p has been validated at least
1687 * as far as there being enough bytes available in it to accommodate the
1688 * character without reading beyond the end, and pass that number on to the
1689 * validating routine */
1690 if (!is_utf8_char_buf(p, p + UTF8SKIP(p)))
bde6a22d
NC
1691 return FALSE;
1692 if (!*swash)
711a919c 1693 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
bde6a22d
NC
1694 return swash_fetch(*swash, p, TRUE) != 0;
1695}
1696
1697bool
7fc63493 1698Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1699{
97aff369 1700 dVAR;
7918f24d
NC
1701
1702 PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1703
671c33bf
NC
1704 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1705 * descendant of isalnum(3), in other words, it doesn't
1706 * contain the '_'. --jhi */
d4c19fe8 1707 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
a0ed51b3
LW
1708}
1709
1710bool
7fc63493 1711Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1712{
97aff369 1713 dVAR;
7918f24d
NC
1714
1715 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1716
82686b01
JH
1717 if (*p == '_')
1718 return TRUE;
bde6a22d 1719 /* is_utf8_idstart would be more logical. */
d4c19fe8 1720 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1721}
1722
1723bool
c11ff943
KW
1724Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */
1725{
1726 dVAR;
1727
1728 PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
1729
1730 if (*p == '_')
1731 return TRUE;
1732 /* is_utf8_idstart would be more logical. */
1733 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
1734}
1735
1736bool
b6912c02
KW
1737Perl__is_utf8__perl_idstart(pTHX_ const U8 *p)
1738{
1739 dVAR;
1740
1741 PERL_ARGS_ASSERT__IS_UTF8__PERL_IDSTART;
1742
1743 return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart");
1744}
1745
1746bool
7fc63493 1747Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1748{
97aff369 1749 dVAR;
7918f24d
NC
1750
1751 PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1752
d4c19fe8 1753 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1754}
1755
1756bool
c11ff943
KW
1757Perl_is_utf8_xidcont(pTHX_ const U8 *p)
1758{
1759 dVAR;
1760
1761 PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
1762
c11ff943
KW
1763 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
1764}
1765
1766bool
7fc63493 1767Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1768{
97aff369 1769 dVAR;
7918f24d
NC
1770
1771 PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1772
d4c19fe8 1773 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1774}
1775
1776bool
7fc63493 1777Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1778{
97aff369 1779 dVAR;
7918f24d
NC
1780
1781 PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1782
bc39fe24
KW
1783 /* ASCII characters are the same whether in utf8 or not. So the macro
1784 * works on both utf8 and non-utf8 representations. */
1785 return isASCII(*p);
b8c5462f
JH
1786}
1787
1788bool
7fc63493 1789Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1790{
97aff369 1791 dVAR;
7918f24d
NC
1792
1793 PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1794
a34094a9 1795 return is_utf8_common(p, &PL_utf8_space, "IsXPerlSpace");
a0ed51b3
LW
1796}
1797
1798bool
d1eb3177
YO
1799Perl_is_utf8_perl_space(pTHX_ const U8 *p)
1800{
1801 dVAR;
1802
1803 PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
1804
c4428693
KW
1805 /* Only true if is an ASCII space-like character, and ASCII is invariant
1806 * under utf8, so can just use the macro */
1807 return isSPACE_A(*p);
d1eb3177
YO
1808}
1809
1810bool
1811Perl_is_utf8_perl_word(pTHX_ const U8 *p)
1812{
1813 dVAR;
1814
1815 PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
1816
c4428693
KW
1817 /* Only true if is an ASCII word character, and ASCII is invariant
1818 * under utf8, so can just use the macro */
1819 return isWORDCHAR_A(*p);
d1eb3177
YO
1820}
1821
1822bool
7fc63493 1823Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1824{
97aff369 1825 dVAR;
7918f24d
NC
1826
1827 PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1828
d4c19fe8 1829 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1830}
1831
1832bool
d1eb3177
YO
1833Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
1834{
1835 dVAR;
1836
1837 PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
1838
c4428693
KW
1839 /* Only true if is an ASCII digit character, and ASCII is invariant
1840 * under utf8, so can just use the macro */
1841 return isDIGIT_A(*p);
d1eb3177
YO
1842}
1843
1844bool
7fc63493 1845Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1846{
97aff369 1847 dVAR;
7918f24d
NC
1848
1849 PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1850
d4c19fe8 1851 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1852}
1853
1854bool
7fc63493 1855Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1856{
97aff369 1857 dVAR;
7918f24d
NC
1858
1859 PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1860
d4c19fe8 1861 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1862}
1863
1864bool
7fc63493 1865Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1866{
97aff369 1867 dVAR;
7918f24d
NC
1868
1869 PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1870
7b952154
KW
1871 if (isASCII(*p)) {
1872 return isCNTRL_A(*p);
1873 }
1874
1875 /* All controls are in Latin1 */
1876 if (! UTF8_IS_DOWNGRADEABLE_START(*p)) {
1877 return 0;
1878 }
1879 return isCNTRL_L1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
b8c5462f
JH
1880}
1881
1882bool
7fc63493 1883Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1884{
97aff369 1885 dVAR;
7918f24d
NC
1886
1887 PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1888
d4c19fe8 1889 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1890}
1891
1892bool
7fc63493 1893Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1894{
97aff369 1895 dVAR;
7918f24d
NC
1896
1897 PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1898
d4c19fe8 1899 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1900}
1901
1902bool
7fc63493 1903Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1904{
97aff369 1905 dVAR;
7918f24d
NC
1906
1907 PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1908
d4c19fe8 1909 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1910}
1911
1912bool
7fc63493 1913Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1914{
97aff369 1915 dVAR;
7918f24d
NC
1916
1917 PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1918
d1eb3177 1919 return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
b8c5462f
JH
1920}
1921
1922bool
7fc63493 1923Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1924{
97aff369 1925 dVAR;
7918f24d
NC
1926
1927 PERL_ARGS_ASSERT_IS_UTF8_MARK;
1928
d4c19fe8 1929 return is_utf8_common(p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1930}
1931
37e2e78e
KW
1932bool
1933Perl_is_utf8_X_begin(pTHX_ const U8 *p)
1934{
1935 dVAR;
1936
1937 PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
1938
1939 return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
1940}
1941
1942bool
1943Perl_is_utf8_X_extend(pTHX_ const U8 *p)
1944{
1945 dVAR;
1946
1947 PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
1948
1949 return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
1950}
1951
1952bool
1953Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
1954{
1955 dVAR;
1956
1957 PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
1958
1959 return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
1960}
1961
1962bool
1963Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
1964{
1965 dVAR;
1966
1967 PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
1968
1969 return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
1970}
1971
1972bool
1973Perl_is_utf8_X_L(pTHX_ const U8 *p)
1974{
1975 dVAR;
1976
1977 PERL_ARGS_ASSERT_IS_UTF8_X_L;
1978
1979 return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
1980}
1981
1982bool
1983Perl_is_utf8_X_LV(pTHX_ const U8 *p)
1984{
1985 dVAR;
1986
1987 PERL_ARGS_ASSERT_IS_UTF8_X_LV;
1988
1989 return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
1990}
1991
1992bool
1993Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
1994{
1995 dVAR;
1996
1997 PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
1998
1999 return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
2000}
2001
2002bool
2003Perl_is_utf8_X_T(pTHX_ const U8 *p)
2004{
2005 dVAR;
2006
2007 PERL_ARGS_ASSERT_IS_UTF8_X_T;
2008
2009 return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
2010}
2011
2012bool
2013Perl_is_utf8_X_V(pTHX_ const U8 *p)
2014{
2015 dVAR;
2016
2017 PERL_ARGS_ASSERT_IS_UTF8_X_V;
2018
2019 return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
2020}
2021
2022bool
2023Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
2024{
2025 dVAR;
2026
2027 PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
2028
2029 return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
2030}
2031
2e2b2571
KW
2032bool
2033Perl__is_utf8_quotemeta(pTHX_ const U8 *p)
2034{
2035 /* For exclusive use of pp_quotemeta() */
2036
2037 dVAR;
2038
2039 PERL_ARGS_ASSERT__IS_UTF8_QUOTEMETA;
2040
2041 return is_utf8_common(p, &PL_utf8_quotemeta, "_Perl_Quotemeta");
2042}
2043
6b5c0936 2044/*
87cea99e 2045=for apidoc to_utf8_case
6b5c0936
JH
2046
2047The "p" contains the pointer to the UTF-8 string encoding
2048the character that is being converted.
2049
2050The "ustrp" is a pointer to the character buffer to put the
2051conversion result to. The "lenp" is a pointer to the length
2052of the result.
2053
0134edef 2054The "swashp" is a pointer to the swash to use.
6b5c0936 2055
36bb2ab6 2056Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
8fe4d5b2 2057and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 2058but not always, a multicharacter mapping), is tried first.
6b5c0936 2059
0134edef
JH
2060The "special" is a string like "utf8::ToSpecLower", which means the
2061hash %utf8::ToSpecLower. The access to the hash is through
2062Perl_to_utf8_case().
6b5c0936 2063
0134edef
JH
2064The "normal" is a string like "ToLower" which means the swash
2065%utf8::ToLower.
2066
2067=cut */
6b5c0936 2068
2104c8d9 2069UV
9a957fbc
AL
2070Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
2071 SV **swashp, const char *normal, const char *special)
a0ed51b3 2072{
97aff369 2073 dVAR;
89ebb4a3 2074 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 2075 STRLEN len = 0;
aec46f14 2076 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
2077 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
2078 * are necessary in EBCDIC, they are redundant no-ops
2079 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 2080 const UV uv1 = NATIVE_TO_UNI(uv0);
7918f24d
NC
2081
2082 PERL_ARGS_ASSERT_TO_UTF8_CASE;
2083
9ae3ac1a
KW
2084 /* Note that swash_fetch() doesn't output warnings for these because it
2085 * assumes we will */
8457b38f 2086 if (uv1 >= UNICODE_SURROGATE_FIRST) {
9ae3ac1a 2087 if (uv1 <= UNICODE_SURROGATE_LAST) {
8457b38f
KW
2088 if (ckWARN_d(WARN_SURROGATE)) {
2089 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2090 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
2091 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
2092 }
9ae3ac1a
KW
2093 }
2094 else if (UNICODE_IS_SUPER(uv1)) {
8457b38f
KW
2095 if (ckWARN_d(WARN_NON_UNICODE)) {
2096 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2097 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2098 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
2099 }
9ae3ac1a
KW
2100 }
2101
2102 /* Note that non-characters are perfectly legal, so no warning should
2103 * be given */
2104 }
2105
1feea2c7 2106 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
2107
2108 if (!*swashp) /* load on-demand */
2109 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
2110
a6f87d8c 2111 if (special) {
0134edef 2112 /* It might be "special" (sometimes, but not always,
2a37f04d 2113 * a multicharacter mapping) */
6673a63c 2114 HV * const hv = get_hv(special, 0);
b08cf34e
JH
2115 SV **svp;
2116
35da51f7 2117 if (hv &&
b08cf34e
JH
2118 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
2119 (*svp)) {
cfd0369c 2120 const char *s;
47654450 2121
cfd0369c 2122 s = SvPV_const(*svp, len);
47654450
JH
2123 if (len == 1)
2124 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 2125 else {
2f9475ad
JH
2126#ifdef EBCDIC
2127 /* If we have EBCDIC we need to remap the characters
2128 * since any characters in the low 256 are Unicode
2129 * code points, not EBCDIC. */
7cda7a3d 2130 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
2131
2132 d = tmpbuf;
b08cf34e 2133 if (SvUTF8(*svp)) {
2f9475ad
JH
2134 STRLEN tlen = 0;
2135
2136 while (t < tend) {
d4c19fe8 2137 const UV c = utf8_to_uvchr(t, &tlen);
2f9475ad
JH
2138 if (tlen > 0) {
2139 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
2140 t += tlen;
2141 }
2142 else
2143 break;
2144 }
2145 }
2146 else {
36fec512
JH
2147 while (t < tend) {
2148 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
2149 t++;
2150 }
2f9475ad
JH
2151 }
2152 len = d - tmpbuf;
2153 Copy(tmpbuf, ustrp, len, U8);
2154#else
d2dcd0fb 2155 Copy(s, ustrp, len, U8);
2f9475ad 2156#endif
29e98929 2157 }
983ffd37 2158 }
0134edef
JH
2159 }
2160
2161 if (!len && *swashp) {
d4c19fe8
AL
2162 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
2163
0134edef
JH
2164 if (uv2) {
2165 /* It was "normal" (a single character mapping). */
d4c19fe8 2166 const UV uv3 = UNI_TO_NATIVE(uv2);
e9101d72 2167 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
2168 }
2169 }
1feea2c7 2170
37e2e78e
KW
2171 if (!len) /* Neither: just copy. In other words, there was no mapping
2172 defined, which means that the code point maps to itself */
0134edef
JH
2173 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
2174
2a37f04d
JH
2175 if (lenp)
2176 *lenp = len;
2177
0134edef 2178 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
2179}
2180
051a06d4
KW
2181STATIC UV
2182S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
2183{
2184 /* This is called when changing the case of a utf8-encoded character above
2185 * the Latin1 range, and the operation is in locale. If the result
2186 * contains a character that crosses the 255/256 boundary, disallow the
2187 * change, and return the original code point. See L<perlfunc/lc> for why;
2188 *
2189 * p points to the original string whose case was changed
2190 * result the code point of the first character in the changed-case string
2191 * ustrp points to the changed-case string (<result> represents its first char)
2192 * lenp points to the length of <ustrp> */
2193
2194 UV original; /* To store the first code point of <p> */
2195
2196 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2197
2198 assert(! UTF8_IS_INVARIANT(*p) && ! UTF8_IS_DOWNGRADEABLE_START(*p));
2199
2200 /* We know immediately if the first character in the string crosses the
2201 * boundary, so can skip */
2202 if (result > 255) {
2203
2204 /* Look at every character in the result; if any cross the
2205 * boundary, the whole thing is disallowed */
2206 U8* s = ustrp + UTF8SKIP(ustrp);
2207 U8* e = ustrp + *lenp;
2208 while (s < e) {
2209 if (UTF8_IS_INVARIANT(*s) || UTF8_IS_DOWNGRADEABLE_START(*s))
2210 {
2211 goto bad_crossing;
2212 }
2213 s += UTF8SKIP(s);
2214 }
2215
2216 /* Here, no characters crossed, result is ok as-is */
2217 return result;
2218 }
2219
2220bad_crossing:
2221
2222 /* Failed, have to return the original */
2223 original = utf8_to_uvchr(p, lenp);
2224 Copy(p, ustrp, *lenp, char);
2225 return original;
2226}
2227
d3e79532 2228/*
87cea99e 2229=for apidoc to_utf8_upper
d3e79532
JH
2230
2231Convert the UTF-8 encoded character at p to its uppercase version and
2232store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
2233that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
2234the uppercase version may be longer than the original character.
d3e79532
JH
2235
2236The first character of the uppercased version is returned
2237(but note, as explained above, that there may be more.)
2238
2239=cut */
2240
051a06d4
KW
2241/* Not currently externally documented, and subject to change:
2242 * <flags> is set iff locale semantics are to be used for code points < 256
2243 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2244 * were used in the calculation; otherwise unchanged. */
2245
2104c8d9 2246UV
051a06d4 2247Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
a0ed51b3 2248{
97aff369 2249 dVAR;
7918f24d 2250
051a06d4
KW
2251 UV result;
2252
2253 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 2254
3a4c58c9 2255 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2256 if (flags) {
2257 result = toUPPER_LC(*p);
2258 }
2259 else {
81c6c7ce 2260 return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
051a06d4 2261 }
3a4c58c9
KW
2262 }
2263 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4
KW
2264 if (flags) {
2265 result = toUPPER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
2266 }
2267 else {
81c6c7ce
KW
2268 return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
2269 ustrp, lenp, 'S');
051a06d4
KW
2270 }
2271 }
2272 else { /* utf8, ord above 255 */
2273 result = CALL_UPPER_CASE(p, ustrp, lenp);
2274
2275 if (flags) {
2276 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2277 }
2278 return result;
2279 }
2280
2281 /* Here, used locale rules. Convert back to utf8 */
2282 if (UTF8_IS_INVARIANT(result)) {
2283 *ustrp = (U8) result;
2284 *lenp = 1;
2285 }
2286 else {
2287 *ustrp = UTF8_EIGHT_BIT_HI(result);
2288 *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2289 *lenp = 2;
3a4c58c9
KW
2290 }
2291
051a06d4
KW
2292 if (tainted_ptr) {
2293 *tainted_ptr = TRUE;
2294 }
2295 return result;
983ffd37 2296}
a0ed51b3 2297
d3e79532 2298/*
87cea99e 2299=for apidoc to_utf8_title
d3e79532
JH
2300
2301Convert the UTF-8 encoded character at p to its titlecase version and
2302store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
2303that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
2304titlecase version may be longer than the original character.
d3e79532
JH
2305
2306The first character of the titlecased version is returned
2307(but note, as explained above, that there may be more.)
2308
2309=cut */
2310
051a06d4
KW
2311/* Not currently externally documented, and subject to change:
2312 * <flags> is set iff locale semantics are to be used for code points < 256
2313 * Since titlecase is not defined in POSIX, uppercase is used instead
2314 * for these/
2315 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2316 * were used in the calculation; otherwise unchanged. */
2317
983ffd37 2318UV
051a06d4 2319Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
983ffd37 2320{
97aff369 2321 dVAR;
7918f24d 2322
051a06d4
KW
2323 UV result;
2324
2325 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 2326
3a4c58c9 2327 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2328 if (flags) {
2329 result = toUPPER_LC(*p);
2330 }
2331 else {
81c6c7ce 2332 return _to_upper_title_latin1(*p, ustrp, lenp, 's');
051a06d4 2333 }
3a4c58c9
KW
2334 }
2335 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4
KW
2336 if (flags) {
2337 result = toUPPER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
2338 }
2339 else {
81c6c7ce
KW
2340 return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
2341 ustrp, lenp, 's');
051a06d4
KW
2342 }
2343 }
2344 else { /* utf8, ord above 255 */
2345 result = CALL_TITLE_CASE(p, ustrp, lenp);
2346
2347 if (flags) {
2348 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2349 }
2350 return result;
2351 }
2352
2353 /* Here, used locale rules. Convert back to utf8 */
2354 if (UTF8_IS_INVARIANT(result)) {
2355 *ustrp = (U8) result;
2356 *lenp = 1;
2357 }
2358 else {
2359 *ustrp = UTF8_EIGHT_BIT_HI(result);
2360 *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2361 *lenp = 2;
3a4c58c9
KW
2362 }
2363
051a06d4
KW
2364 if (tainted_ptr) {
2365 *tainted_ptr = TRUE;
2366 }
2367 return result;
a0ed51b3
LW
2368}
2369
d3e79532 2370/*
87cea99e 2371=for apidoc to_utf8_lower
d3e79532
JH
2372
2373Convert the UTF-8 encoded character at p to its lowercase version and
2374store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
2375that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
2376lowercase version may be longer than the original character.
d3e79532
JH
2377
2378The first character of the lowercased version is returned
2379(but note, as explained above, that there may be more.)
2380
2381=cut */
2382
051a06d4
KW
2383/* Not currently externally documented, and subject to change:
2384 * <flags> is set iff locale semantics are to be used for code points < 256
2385 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2386 * were used in the calculation; otherwise unchanged. */
2387
2104c8d9 2388UV
051a06d4 2389Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
a0ed51b3 2390{
051a06d4
KW
2391 UV result;
2392
97aff369 2393 dVAR;
7918f24d 2394
051a06d4 2395 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 2396
968c5e6a 2397 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2398 if (flags) {
2399 result = toLOWER_LC(*p);
2400 }
2401 else {
81c6c7ce 2402 return to_lower_latin1(*p, ustrp, lenp);
051a06d4 2403 }
968c5e6a
KW
2404 }
2405 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4
KW
2406 if (flags) {
2407 result = toLOWER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
2408 }
2409 else {
81c6c7ce
KW
2410 return to_lower_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
2411 ustrp, lenp);
051a06d4 2412 }
968c5e6a 2413 }
051a06d4
KW
2414 else { /* utf8, ord above 255 */
2415 result = CALL_LOWER_CASE(p, ustrp, lenp);
2416
2417 if (flags) {
2418 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2419 }
968c5e6a 2420
051a06d4
KW
2421 return result;
2422 }
2423
2424 /* Here, used locale rules. Convert back to utf8 */
2425 if (UTF8_IS_INVARIANT(result)) {
2426 *ustrp = (U8) result;
2427 *lenp = 1;
2428 }
2429 else {
2430 *ustrp = UTF8_EIGHT_BIT_HI(result);
2431 *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2432 *lenp = 2;
2433 }
2434
2435 if (tainted_ptr) {
2436 *tainted_ptr = TRUE;
2437 }
2438 return result;
b4e400f9
JH
2439}
2440
d3e79532 2441/*
87cea99e 2442=for apidoc to_utf8_fold
d3e79532
JH
2443
2444Convert the UTF-8 encoded character at p to its foldcase version and
2445store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 2446that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
2447foldcase version may be longer than the original character (up to
2448three characters).
2449
2450The first character of the foldcased version is returned
2451(but note, as explained above, that there may be more.)
2452
2453=cut */
2454
051a06d4
KW
2455/* Not currently externally documented, and subject to change,
2456 * in <flags>
2457 * bit FOLD_FLAGS_LOCALE is set iff locale semantics are to be used for code
2458 * points < 256. Since foldcase is not defined in
2459 * POSIX, lowercase is used instead
2460 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
2461 * otherwise simple folds
2462 * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2463 * were used in the calculation; otherwise unchanged. */
36bb2ab6 2464
b4e400f9 2465UV
051a06d4 2466Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, bool* tainted_ptr)
b4e400f9 2467{
97aff369 2468 dVAR;
7918f24d 2469
051a06d4
KW
2470 UV result;
2471
36bb2ab6 2472 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 2473
a1dde8de 2474 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2475 if (flags & FOLD_FLAGS_LOCALE) {
2476 result = toLOWER_LC(*p);
2477 }
2478 else {
81c6c7ce
KW
2479 return _to_fold_latin1(*p, ustrp, lenp,
2480 cBOOL(flags & FOLD_FLAGS_FULL));
051a06d4 2481 }
a1dde8de
KW
2482 }
2483 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4
KW
2484 if (flags & FOLD_FLAGS_LOCALE) {
2485 result = toLOWER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
2486 }
2487 else {
81c6c7ce 2488 return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
051a06d4
KW
2489 ustrp, lenp, cBOOL(flags & FOLD_FLAGS_FULL));
2490 }
a1dde8de 2491 }
051a06d4
KW
2492 else { /* utf8, ord above 255 */
2493 result = CALL_FOLD_CASE(p, ustrp, lenp, flags);
a1dde8de 2494
051a06d4
KW
2495 if ((flags & FOLD_FLAGS_LOCALE)) {
2496 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2497 }
2498
2499 return result;
2500 }
2501
2502 /* Here, used locale rules. Convert back to utf8 */
2503 if (UTF8_IS_INVARIANT(result)) {
2504 *ustrp = (U8) result;
2505 *lenp = 1;
2506 }
2507 else {
2508 *ustrp = UTF8_EIGHT_BIT_HI(result);
2509 *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2510 *lenp = 2;
2511 }
2512
2513 if (tainted_ptr) {
2514 *tainted_ptr = TRUE;
2515 }
2516 return result;
a0ed51b3
LW
2517}
2518
711a919c 2519/* Note:
f90a9a02 2520 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
711a919c
ST
2521 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
2522 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
2523 */
c4a5db0c 2524
a0ed51b3 2525SV*
7fc63493 2526Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 2527{
c4a5db0c
KW
2528 PERL_ARGS_ASSERT_SWASH_INIT;
2529
2530 /* Returns a copy of a swash initiated by the called function. This is the
2531 * public interface, and returning a copy prevents others from doing
2532 * mischief on the original */
2533
9a53f6cf 2534 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, FALSE, NULL, FALSE));
c4a5db0c
KW
2535}
2536
2537SV*
9a53f6cf 2538Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, bool return_if_undef, SV* invlist, bool passed_in_invlist_has_user_defined_property)
c4a5db0c
KW
2539{
2540 /* Initialize and return a swash, creating it if necessary. It does this
9a53f6cf 2541 * by calling utf8_heavy.pl in the general case.
c4a5db0c
KW
2542 *
2543 * This interface should only be used by functions that won't destroy or
2544 * adversely change the swash, as doing so affects all other uses of the
2545 * swash in the program; the general public should use 'Perl_swash_init'
2546 * instead.
2547 *
2548 * pkg is the name of the package that <name> should be in.
2549 * name is the name of the swash to find. Typically it is a Unicode
2550 * property name, including user-defined ones
2551 * listsv is a string to initialize the swash with. It must be of the form
2552 * documented as the subroutine return value in
2553 * L<perlunicode/User-Defined Character Properties>
2554 * minbits is the number of bits required to represent each data element.
2555 * It is '1' for binary properties.
2556 * none I (khw) do not understand this one, but it is used only in tr///.
9a53f6cf
KW
2557 * return_if_undef is TRUE if the routine shouldn't croak if it can't find
2558 * the requested property
2559 * invlist is an inversion list to initialize the swash with (or NULL)
2560 * has_user_defined_property is TRUE if <invlist> has some component that
2561 * came from a user-defined property
2562 *
2563 * Thus there are three possible inputs to find the swash: <name>,
2564 * <listsv>, and <invlist>. At least one must be specified. The result
2565 * will be the union of the specified ones, although <listsv>'s various
2566 * actions can intersect, etc. what <name> gives.
2567 *
2568 * <invlist> is only valid for binary properties */
c4a5db0c 2569
27da23d5 2570 dVAR;
c4a5db0c 2571 SV* retval = &PL_sv_undef;
9a53f6cf
KW
2572
2573 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
2574 assert(! invlist || minbits == 1);
2575
2576 /* If data was passed in to go out to utf8_heavy to find the swash of, do
2577 * so */
2578 if (listsv != &PL_sv_undef || strNE(name, "")) {
69794297
KW
2579 dSP;
2580 const size_t pkg_len = strlen(pkg);
2581 const size_t name_len = strlen(name);
2582 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
2583 SV* errsv_save;
2584 GV *method;
2585
2586 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
2587
2588 PUSHSTACKi(PERLSI_MAGIC);
ce3b816e 2589 ENTER;
69794297
KW
2590 SAVEHINTS();
2591 save_re_context();
2592 if (PL_parser && PL_parser->error_count)
2593 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
2594 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
2595 if (!method) { /* demand load utf8 */
2596 ENTER;
2597 errsv_save = newSVsv(ERRSV);
2598 /* It is assumed that callers of this routine are not passing in
2599 * any user derived data. */
2600 /* Need to do this after save_re_context() as it will set
2601 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
2602 * in Perl_magic_get). Even line to create errsv_save can turn on
2603 * PL_tainted. */
2604 SAVEBOOL(PL_tainted);
2605 PL_tainted = 0;
2606 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
2607 NULL);
2608 if (!SvTRUE(ERRSV))
2609 sv_setsv(ERRSV, errsv_save);
2610 SvREFCNT_dec(errsv_save);
2611 LEAVE;
2612 }
2613 SPAGAIN;
2614 PUSHMARK(SP);
2615 EXTEND(SP,5);
2616 mPUSHp(pkg, pkg_len);
2617 mPUSHp(name, name_len);
2618 PUSHs(listsv);
2619 mPUSHi(minbits);
2620 mPUSHi(none);
2621 PUTBACK;
f8be5cf0 2622 errsv_save = newSVsv(ERRSV);
69794297
KW
2623 /* If we already have a pointer to the method, no need to use
2624 * call_method() to repeat the lookup. */
2625 if (method ? call_sv(MUTABLE_SV(method), G_SCALAR)
2626 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
2627 {
2628 retval = *PL_stack_sp--;
2629 SvREFCNT_inc(retval);
2630 }
f8be5cf0
JH
2631 if (!SvTRUE(ERRSV))
2632 sv_setsv(ERRSV, errsv_save);
2633 SvREFCNT_dec(errsv_save);
ce3b816e 2634 LEAVE;
69794297
KW
2635 POPSTACK;
2636 if (IN_PERL_COMPILETIME) {
2637 CopHINTS_set(PL_curcop, PL_hints);
2638 }
2639 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
2640 if (SvPOK(retval))
2641
2642 /* If caller wants to handle missing properties, let them */
2643 if (return_if_undef) {
2644 return NULL;
2645 }
2646 Perl_croak(aTHX_
2647 "Can't find Unicode property definition \"%"SVf"\"",
2648 SVfARG(retval));
2649 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
2650 }
9a53f6cf 2651 } /* End of calling the module to find the swash */
36eb48b4
KW
2652
2653 /* Make sure there is an inversion list for binary properties */
2654 if (minbits == 1) {
2655 SV** swash_invlistsvp = NULL;
2656 SV* swash_invlist = NULL;
9a53f6cf 2657 bool invlist_in_swash_is_valid = FALSE;
88d45d28 2658 HV* swash_hv = NULL;
36eb48b4 2659
9a53f6cf
KW
2660 /* If this operation fetched a swash, get its already existing
2661 * inversion list or create one for it */
2662 if (retval != &PL_sv_undef) {
36eb48b4
KW
2663 swash_hv = MUTABLE_HV(SvRV(retval));
2664
2665 swash_invlistsvp = hv_fetchs(swash_hv, "INVLIST", FALSE);
9a53f6cf
KW
2666 if (swash_invlistsvp) {
2667 swash_invlist = *swash_invlistsvp;
2668 invlist_in_swash_is_valid = TRUE;
2669 }
2670 else {
36eb48b4 2671 swash_invlist = _swash_to_invlist(retval);
9a53f6cf
KW
2672 }
2673 }
2674
2675 /* If an inversion list was passed in, have to include it */
2676 if (invlist) {
2677
2678 /* Any fetched swash will by now have an inversion list in it;
2679 * otherwise <swash_invlist> will be NULL, indicating that we
2680 * didn't fetch a swash */
2681 if (swash_invlist) {
2682
2683 /* Add the passed-in inversion list, which invalidates the one
2684 * already stored in the swash */
2685 invlist_in_swash_is_valid = FALSE;
2686 _invlist_union(invlist, swash_invlist, &swash_invlist);
2687 }
2688 else {
2689
2690 /* Here, there is no swash already. Set up a minimal one */
2691 swash_hv = newHV();
2692 retval = newRV_inc(MUTABLE_SV(swash_hv));
2693 swash_invlist = invlist;
2694 }
2695
2696 if (passed_in_invlist_has_user_defined_property) {
2697 if (! hv_stores(swash_hv, "USER_DEFINED", newSVuv(1))) {
2698 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
2699 }
2700 }
2701 }
2702
2703 /* Here, we have computed the union of all the passed-in data. It may
2704 * be that there was an inversion list in the swash which didn't get
2705 * touched; otherwise save the one computed one */
2706 if (! invlist_in_swash_is_valid) {
69794297
KW
2707 if (! hv_stores(MUTABLE_HV(SvRV(retval)), "INVLIST", swash_invlist))
2708 {
2709 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
2710 }
9a53f6cf 2711 }
36eb48b4
KW
2712 }
2713
a0ed51b3
LW
2714 return retval;
2715}
2716
035d37be
JH
2717
2718/* This API is wrong for special case conversions since we may need to
2719 * return several Unicode characters for a single Unicode character
2720 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
2721 * the lower-level routine, and it is similarly broken for returning
38684baa
KW
2722 * multiple values. --jhi
2723 * For those, you should use to_utf8_case() instead */
b0e3252e 2724/* Now SWASHGET is recasted into S_swatch_get in this file. */
680c470c
ST
2725
2726/* Note:
2727 * Returns the value of property/mapping C<swash> for the first character
2728 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
2729 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
2730 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
af2af982
KW
2731 *
2732 * A "swash" is a hash which contains initially the keys/values set up by
2733 * SWASHNEW. The purpose is to be able to completely represent a Unicode
2734 * property for all possible code points. Things are stored in a compact form
2735 * (see utf8_heavy.pl) so that calculation is required to find the actual
2736 * property value for a given code point. As code points are looked up, new
2737 * key/value pairs are added to the hash, so that the calculation doesn't have
2738 * to ever be re-done. Further, each calculation is done, not just for the
2739 * desired one, but for a whole block of code points adjacent to that one.
2740 * For binary properties on ASCII machines, the block is usually for 64 code
2741 * points, starting with a code point evenly divisible by 64. Thus if the
2742 * property value for code point 257 is requested, the code goes out and
2743 * calculates the property values for all 64 code points between 256 and 319,
2744 * and stores these as a single 64-bit long bit vector, called a "swatch",
2745 * under the key for code point 256. The key is the UTF-8 encoding for code
2746 * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
2747 * for a code point is 13 bytes, the key will be 12 bytes long. If the value
2748 * for code point 258 is then requested, this code realizes that it would be
2749 * stored under the key for 256, and would find that value and extract the
2750 * relevant bit, offset from 256.
2751 *
2752 * Non-binary properties are stored in as many bits as necessary to represent
2753 * their values (32 currently, though the code is more general than that), not
2754 * as single bits, but the principal is the same: the value for each key is a
2755 * vector that encompasses the property values for all code points whose UTF-8
2756 * representations are represented by the key. That is, for all code points
2757 * whose UTF-8 representations are length N bytes, and the key is the first N-1
2758 * bytes of that.
680c470c 2759 */
a0ed51b3 2760UV
680c470c 2761Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 2762{
27da23d5 2763 dVAR;
ef8f7699 2764 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
2765 U32 klen;
2766 U32 off;
a0ed51b3 2767 STRLEN slen;
7d85a32c 2768 STRLEN needents;
cfd0369c 2769 const U8 *tmps = NULL;
a0ed51b3 2770 U32 bit;
979f2922 2771 SV *swatch;
3568d838 2772 U8 tmputf8[2];
35da51f7 2773 const UV c = NATIVE_TO_ASCII(*ptr);
3568d838 2774
7918f24d
NC
2775 PERL_ARGS_ASSERT_SWASH_FETCH;
2776
dbe7a391 2777 /* Convert to utf8 if not already */
3568d838 2778 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
979f2922
ST
2779 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
2780 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
2781 ptr = tmputf8;
3568d838
JH
2782 }
2783 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
37e2e78e 2784 * then the "swatch" is a vec() for all the chars which start
3568d838
JH
2785 * with 0xAA..0xYY
2786 * So the key in the hash (klen) is length of encoded char -1
2787 */
2788 klen = UTF8SKIP(ptr) - 1;
2789 off = ptr[klen];
a0ed51b3 2790
979f2922 2791 if (klen == 0) {
37e2e78e 2792 /* If char is invariant then swatch is for all the invariant chars
1e54db1a 2793 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 2794 */
979f2922
ST
2795 needents = UTF_CONTINUATION_MARK;
2796 off = NATIVE_TO_UTF(ptr[klen]);
2797 }
2798 else {
7d85a32c 2799 /* If char is encoded then swatch is for the prefix */
979f2922
ST
2800 needents = (1 << UTF_ACCUMULATION_SHIFT);
2801 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
2802 }
7d85a32c 2803
a0ed51b3
LW
2804 /*
2805 * This single-entry cache saves about 1/3 of the utf8 overhead in test
2806 * suite. (That is, only 7-8% overall over just a hash cache. Still,
2807 * it's nothing to sniff at.) Pity we usually come through at least
2808 * two function calls to get here...
2809 *
2810 * NB: this code assumes that swatches are never modified, once generated!
2811 */
2812
3568d838 2813 if (hv == PL_last_swash_hv &&
a0ed51b3 2814 klen == PL_last_swash_klen &&
27da23d5 2815 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
2816 {
2817 tmps = PL_last_swash_tmps;
2818 slen = PL_last_swash_slen;
2819 }
2820 else {
2821 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 2822 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 2823
b0e3252e 2824 /* If not cached, generate it via swatch_get */
979f2922
ST
2825 if (!svp || !SvPOK(*svp)
2826 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0
NIS
2827 /* We use utf8n_to_uvuni() as we want an index into
2828 Unicode tables, not a native character number.
2829 */
aec46f14 2830 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
2831 ckWARN(WARN_UTF8) ?
2832 0 : UTF8_ALLOW_ANY);
b0e3252e 2833 swatch = swatch_get(swash,
979f2922 2834 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
361ee0fe 2835 (klen) ? (code_point & ~((UV)needents - 1)) : 0,
979f2922
ST
2836 needents);
2837
923e4eb5 2838 if (IN_PERL_COMPILETIME)
623e6609 2839 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 2840
979f2922 2841 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 2842
979f2922
ST
2843 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
2844 || (slen << 3) < needents)
5637ef5b
NC
2845 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
2846 "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
2847 svp, tmps, (UV)slen, (UV)needents);
a0ed51b3
LW
2848 }
2849
2850 PL_last_swash_hv = hv;
16d8f38a 2851 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 2852 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
2853 /* FIXME change interpvar.h? */
2854 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
2855 PL_last_swash_slen = slen;
2856 if (klen)
2857 Copy(ptr, PL_last_swash_key, klen, U8);
2858 }
2859
a410ec23
KW
2860 if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_NON_UNICODE)) {
2861 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
2862
2863 /* This outputs warnings for binary properties only, assuming that
2864 * to_utf8_case() will output any for non-binary. Also, surrogates
2865 * aren't checked for, as that would warn on things like /\p{Gc=Cs}/ */
2866
9a53f6cf 2867 if (! bitssvp || SvUV(*bitssvp) == 1) {
a410ec23
KW
2868 /* User-defined properties can silently match above-Unicode */
2869 SV** const user_defined_svp = hv_fetchs(hv, "USER_DEFINED", FALSE);
2870 if (! user_defined_svp || ! SvUV(*user_defined_svp)) {
2871 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
2872 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2873 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", code_point);
2874 }
2875 }
2876 }
2877
9faf8d75 2878 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
2879 case 1:
2880 bit = 1 << (off & 7);
2881 off >>= 3;
2882 return (tmps[off] & bit) != 0;
2883 case 8:
2884 return tmps[off];
2885 case 16:
2886 off <<= 1;
2887 return (tmps[off] << 8) + tmps[off + 1] ;
2888 case 32:
2889 off <<= 2;
2890 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
2891 }
5637ef5b
NC
2892 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
2893 "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
670f1322 2894 NORETURN_FUNCTION_END;
a0ed51b3 2895}
2b9d42f0 2896
319009ee
KW
2897/* Read a single line of the main body of the swash input text. These are of
2898 * the form:
2899 * 0053 0056 0073
2900 * where each number is hex. The first two numbers form the minimum and
2901 * maximum of a range, and the third is the value associated with the range.
2902 * Not all swashes should have a third number
2903 *
2904 * On input: l points to the beginning of the line to be examined; it points
2905 * to somewhere in the string of the whole input text, and is
2906 * terminated by a \n or the null string terminator.
2907 * lend points to the null terminator of that string
2908 * wants_value is non-zero if the swash expects a third number
2909 * typestr is the name of the swash's mapping, like 'ToLower'
2910 * On output: *min, *max, and *val are set to the values read from the line.
2911 * returns a pointer just beyond the line examined. If there was no
2912 * valid min number on the line, returns lend+1
2913 */
2914
2915STATIC U8*
2916S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
2917 const bool wants_value, const U8* const typestr)
2918{
2919 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
2920 STRLEN numlen; /* Length of the number */
02470786
KW
2921 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
2922 | PERL_SCAN_DISALLOW_PREFIX
2923 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
2924
2925 /* nl points to the next \n in the scan */
2926 U8* const nl = (U8*)memchr(l, '\n', lend - l);
2927
2928 /* Get the first number on the line: the range minimum */
2929 numlen = lend - l;
2930 *min = grok_hex((char *)l, &numlen, &flags, NULL);
2931 if (numlen) /* If found a hex number, position past it */
2932 l += numlen;
2933 else if (nl) { /* Else, go handle next line, if any */
2934 return nl + 1; /* 1 is length of "\n" */
2935 }
2936 else { /* Else, no next line */
2937 return lend + 1; /* to LIST's end at which \n is not found */
2938 }
2939
2940 /* The max range value follows, separated by a BLANK */
2941 if (isBLANK(*l)) {
2942 ++l;
02470786
KW
2943 flags = PERL_SCAN_SILENT_ILLDIGIT
2944 | PERL_SCAN_DISALLOW_PREFIX
2945 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
2946 numlen = lend - l;
2947 *max = grok_hex((char *)l, &numlen, &flags, NULL);
2948 if (numlen)
2949 l += numlen;
2950 else /* If no value here, it is a single element range */
2951 *max = *min;
2952
2953 /* Non-binary tables have a third entry: what the first element of the
2954 * range maps to */
2955 if (wants_value) {
2956 if (isBLANK(*l)) {
2957 ++l;
f90a9a02
KW
2958
2959 /* The ToLc, etc table mappings are not in hex, and must be
2960 * corrected by adding the code point to them */
2961 if (typeto) {
2962 char *after_strtol = (char *) lend;
2963 *val = Strtol((char *)l, &after_strtol, 10);
2964 l = (U8 *) after_strtol;
f90a9a02
KW
2965 }
2966 else { /* Other tables are in hex, and are the correct result
2967 without tweaking */
a9d188b3
KW
2968 flags = PERL_SCAN_SILENT_ILLDIGIT
2969 | PERL_SCAN_DISALLOW_PREFIX
2970 | PERL_SCAN_SILENT_NON_PORTABLE;
2971 numlen = lend - l;
2972 *val = grok_hex((char *)l, &numlen, &flags, NULL);
2973 if (numlen)
2974 l += numlen;
2975 else
2976 *val = 0;
f90a9a02 2977 }
319009ee
KW
2978 }
2979 else {
2980 *val = 0;
2981 if (typeto) {
dcbac5bb 2982 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
2983 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
2984 typestr, l);
2985 }
2986 }
2987 }
2988 else
2989 *val = 0; /* bits == 1, then any val should be ignored */
2990 }
2991 else { /* Nothing following range min, should be single element with no
2992 mapping expected */
2993 *max = *min;
2994 if (wants_value) {
2995 *val = 0;
2996 if (typeto) {
dcbac5bb 2997 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
2998 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
2999 }
3000 }
3001 else
3002 *val = 0; /* bits == 1, then val should be ignored */
3003 }
3004
3005 /* Position to next line if any, or EOF */
3006 if (nl)
3007 l = nl + 1;
3008 else
3009 l = lend;
3010
3011 return l;
3012}
3013
979f2922
ST
3014/* Note:
3015 * Returns a swatch (a bit vector string) for a code point sequence
3016 * that starts from the value C<start> and comprises the number C<span>.
3017 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3018 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
3019 */
3020STATIC SV*
b0e3252e 3021S_swatch_get(pTHX_ SV* swash, UV start, UV span)
979f2922
ST
3022{
3023 SV *swatch;
77f9f126 3024 U8 *l, *lend, *x, *xend, *s, *send;
979f2922 3025 STRLEN lcur, xcur, scur;
ef8f7699 3026 HV *const hv = MUTABLE_HV(SvRV(swash));
36eb48b4
KW
3027 SV** const invlistsvp = hv_fetchs(hv, "INVLIST", FALSE);
3028
88d45d28
KW
3029 SV** listsvp = NULL; /* The string containing the main body of the table */
3030 SV** extssvp = NULL;
3031 SV** invert_it_svp = NULL;
3032 U8* typestr = NULL;
786861f5
KW
3033 STRLEN bits;
3034 STRLEN octets; /* if bits == 1, then octets == 0 */
3035 UV none;
3036 UV end = start + span;
972dd592 3037
36eb48b4 3038 if (invlistsvp == NULL) {
786861f5
KW
3039 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3040 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3041 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3042 extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3043 listsvp = hv_fetchs(hv, "LIST", FALSE);
3044 invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3045
3046 bits = SvUV(*bitssvp);
3047 none = SvUV(*nonesvp);
3048 typestr = (U8*)SvPV_nolen(*typesvp);
3049 }
36eb48b4
KW
3050 else {
3051 bits = 1;
3052 none = 0;
3053 }
786861f5 3054 octets = bits >> 3; /* if bits == 1, then octets == 0 */
979f2922 3055
b0e3252e 3056 PERL_ARGS_ASSERT_SWATCH_GET;
7918f24d 3057
979f2922 3058 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
b0e3252e 3059 Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
660a4616 3060 (UV)bits);
979f2922
ST
3061 }
3062
84ea5ef6
KW
3063 /* If overflowed, use the max possible */
3064 if (end < start) {
3065 end = UV_MAX;
3066 span = end - start;
3067 }
3068
979f2922 3069 /* create and initialize $swatch */
979f2922 3070 scur = octets ? (span * octets) : (span + 7) / 8;
e524fe40
NC
3071 swatch = newSV(scur);
3072 SvPOK_on(swatch);
979f2922
ST
3073 s = (U8*)SvPVX(swatch);
3074 if (octets && none) {
0bd48802 3075 const U8* const e = s + scur;
979f2922
ST
3076 while (s < e) {
3077 if (bits == 8)
3078 *s++ = (U8)(none & 0xff);
3079 else if (bits == 16) {
3080 *s++ = (U8)((none >> 8) & 0xff);
3081 *s++ = (U8)( none & 0xff);
3082 }
3083 else if (bits == 32) {
3084 *s++ = (U8)((none >> 24) & 0xff);
3085 *s++ = (U8)((none >> 16) & 0xff);
3086 *s++ = (U8)((none >> 8) & 0xff);
3087 *s++ = (U8)( none & 0xff);
3088 }
3089 }
3090 *s = '\0';
3091 }
3092 else {
3093 (void)memzero((U8*)s, scur + 1);
3094 }
3095 SvCUR_set(swatch, scur);
3096 s = (U8*)SvPVX(swatch);
3097
36eb48b4
KW
3098 if (invlistsvp) { /* If has an inversion list set up use that */
3099 _invlist_populate_swatch(*invlistsvp, start, end, s);
3100 return swatch;
3101 }
3102
3103 /* read $swash->{LIST} */
979f2922
ST
3104 l = (U8*)SvPV(*listsvp, lcur);
3105 lend = l + lcur;
3106 while (l < lend) {
8ed25d53 3107 UV min, max, val, upper;
319009ee
KW
3108 l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3109 cBOOL(octets), typestr);
3110 if (l > lend) {
979f2922
ST
3111 break;
3112 }
3113
972dd592 3114 /* If looking for something beyond this range, go try the next one */
979f2922
ST
3115 if (max < start)
3116 continue;
3117
8ed25d53
KW
3118 /* <end> is generally 1 beyond where we want to set things, but at the
3119 * platform's infinity, where we can't go any higher, we want to
3120 * include the code point at <end> */
3121 upper = (max < end)
3122 ? max
3123 : (max != UV_MAX || end != UV_MAX)
3124 ? end - 1
3125 : end;
3126
979f2922 3127 if (octets) {
35da51f7 3128 UV key;
979f2922
ST
3129 if (min < start) {
3130 if (!none || val < none) {
3131 val += start - min;
3132 }
3133 min = start;
3134 }
8ed25d53 3135 for (key = min; key <= upper; key++) {
979f2922 3136 STRLEN offset;
979f2922
ST
3137 /* offset must be non-negative (start <= min <= key < end) */
3138 offset = octets * (key - start);
3139 if (bits == 8)
3140 s[offset] = (U8)(val & 0xff);
3141 else if (bits == 16) {
3142 s[offset ] = (U8)((val >> 8) & 0xff);
3143 s[offset + 1] = (U8)( val & 0xff);
3144 }
3145 else if (bits == 32) {
3146 s[offset ] = (U8)((val >> 24) & 0xff);
3147 s[offset + 1] = (U8)((val >> 16) & 0xff);
3148 s[offset + 2] = (U8)((val >> 8) & 0xff);
3149 s[offset + 3] = (U8)( val & 0xff);
3150 }
3151
3152 if (!none || val < none)
3153 ++val;
3154 }
3155 }
711a919c 3156 else { /* bits == 1, then val should be ignored */
35da51f7 3157 UV key;
979f2922
ST
3158 if (min < start)
3159 min = start;
6cb05c12 3160
8ed25d53 3161 for (key = min; key <= upper; key++) {
0bd48802 3162 const STRLEN offset = (STRLEN)(key - start);
979f2922
ST
3163 s[offset >> 3] |= 1 << (offset & 7);
3164 }
3165 }
3166 } /* while */
979f2922 3167
9479a769 3168 /* Invert if the data says it should be. Assumes that bits == 1 */
77f9f126 3169 if (invert_it_svp && SvUV(*invert_it_svp)) {
0bda3001
KW
3170
3171 /* Unicode properties should come with all bits above PERL_UNICODE_MAX
3172 * be 0, and their inversion should also be 0, as we don't succeed any
3173 * Unicode property matches for non-Unicode code points */
3174 if (start <= PERL_UNICODE_MAX) {
3175
3176 /* The code below assumes that we never cross the
3177 * Unicode/above-Unicode boundary in a range, as otherwise we would
3178 * have to figure out where to stop flipping the bits. Since this
3179 * boundary is divisible by a large power of 2, and swatches comes
3180 * in small powers of 2, this should be a valid assumption */
3181 assert(start + span - 1 <= PERL_UNICODE_MAX);
3182
507a8485
KW
3183 send = s + scur;
3184 while (s < send) {
3185 *s = ~(*s);
3186 s++;
3187 }
0bda3001 3188 }
77f9f126
KW
3189 }
3190
d73c39c5
KW
3191 /* read $swash->{EXTRAS}
3192 * This code also copied to swash_to_invlist() below */
979f2922
ST
3193 x = (U8*)SvPV(*extssvp, xcur);
3194 xend = x + xcur;
3195 while (x < xend) {
3196 STRLEN namelen;
3197 U8 *namestr;
3198 SV** othersvp;
3199 HV* otherhv;
3200 STRLEN otherbits;
3201 SV **otherbitssvp, *other;
711a919c 3202 U8 *s, *o, *nl;
979f2922
ST
3203 STRLEN slen, olen;
3204
35da51f7 3205 const U8 opc = *x++;
979f2922
ST
3206 if (opc == '\n')
3207 continue;
3208
3209 nl = (U8*)memchr(x, '\n', xend - x);
3210
3211 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3212 if (nl) {
3213 x = nl + 1; /* 1 is length of "\n" */
3214 continue;
3215 }
3216 else {
3217 x = xend; /* to EXTRAS' end at which \n is not found */
3218 break;
3219 }
3220 }
3221
3222 namestr = x;
3223 if (nl) {
3224 namelen = nl - namestr;
3225 x = nl + 1;
3226 }
3227 else {
3228 namelen = xend - namestr;
3229 x = xend;
3230 }
3231
3232 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
ef8f7699 3233 otherhv = MUTABLE_HV(SvRV(*othersvp));
017a3ce5 3234 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922
ST
3235 otherbits = (STRLEN)SvUV(*otherbitssvp);
3236 if (bits < otherbits)
5637ef5b
NC
3237 Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
3238 "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
979f2922
ST
3239
3240 /* The "other" swatch must be destroyed after. */
b0e3252e 3241 other = swatch_get(*othersvp, start, span);
979f2922
ST
3242 o = (U8*)SvPV(other, olen);
3243
3244 if (!olen)
b0e3252e 3245 Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
979f2922
ST
3246
3247 s = (U8*)SvPV(swatch, slen);
3248 if (bits == 1 && otherbits == 1) {
3249 if (slen != olen)
5637ef5b
NC
3250 Perl_croak(aTHX_ "panic: swatch_get found swatch length "
3251 "mismatch, slen=%"UVuf", olen=%"UVuf,
3252 (UV)slen, (UV)olen);
979f2922
ST
3253
3254 switch (opc) {
3255 case '+':
3256 while (slen--)
3257 *s++ |= *o++;
3258 break;
3259 case '!':
3260 while (slen--)
3261 *s++ |= ~*o++;
3262 break;
3263 case '-':
3264 while (slen--)
3265 *s++ &= ~*o++;
3266 break;
3267 case '&':
3268 while (slen--)
3269 *s++ &= *o++;
3270 break;
3271 default:
3272 break;
3273 }
3274 }
711a919c 3275 else {
979f2922
ST
3276 STRLEN otheroctets = otherbits >> 3;
3277 STRLEN offset = 0;
35da51f7 3278 U8* const send = s + slen;
979f2922
ST
3279
3280 while (s < send) {
3281 UV otherval = 0;
3282
3283 if (otherbits == 1) {
3284 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
3285 ++offset;
3286 }
3287 else {
3288 STRLEN vlen = otheroctets;
3289 otherval = *o++;
3290 while (--vlen) {
3291 otherval <<= 8;
3292 otherval |= *o++;
3293 }
3294 }
3295
711a919c 3296 if (opc == '+' && otherval)
6f207bd3 3297 NOOP; /* replace with otherval */
979f2922
ST
3298 else if (opc == '!' && !otherval)
3299 otherval = 1;
3300 else if (opc == '-' && otherval)
3301 otherval = 0;
3302 else if (opc == '&' && !otherval)
3303 otherval = 0;
3304 else {
711a919c 3305 s += octets; /* no replacement */
979f2922
ST
3306 continue;
3307 }
3308
3309 if (bits == 8)
3310 *s++ = (U8)( otherval & 0xff);
3311 else if (bits == 16) {
3312 *s++ = (U8)((otherval >> 8) & 0xff);
3313 *s++ = (U8)( otherval & 0xff);
3314 }
3315 else if (bits == 32) {
3316 *s++ = (U8)((otherval >> 24) & 0xff);
3317 *s++ = (U8)((otherval >> 16) & 0xff);
3318 *s++ = (U8)((otherval >> 8) & 0xff);
3319 *s++ = (U8)( otherval & 0xff);
3320 }
3321 }
3322 }
3323 sv_free(other); /* through with it! */
3324 } /* while */
3325 return swatch;
3326}
3327
064c021d 3328HV*
4c2e1131 3329Perl__swash_inversion_hash(pTHX_ SV* const swash)
064c021d
KW
3330{
3331
5662e334
KW
3332 /* Subject to change or removal. For use only in one place in regcomp.c.
3333 * Can't be used on a property that is subject to user override, as it
3334 * relies on the value of SPECIALS in the swash which would be set by
3335 * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
3336 * for overridden properties
064c021d
KW
3337 *
3338 * Returns a hash which is the inversion and closure of a swash mapping.
3339 * For example, consider the input lines:
3340 * 004B 006B
3341 * 004C 006C
3342 * 212A 006B
3343 *
3344 * The returned hash would have two keys, the utf8 for 006B and the utf8 for
3345 * 006C. The value for each key is an array. For 006C, the array would
3346 * have a two elements, the utf8 for itself, and for 004C. For 006B, there
3347 * would be three elements in its array, the utf8 for 006B, 004B and 212A.
3348 *
3349 * Essentially, for any code point, it gives all the code points that map to
3350 * it, or the list of 'froms' for that point.
3351 *
5662e334
KW
3352 * Currently it ignores any additions or deletions from other swashes,
3353 * looking at just the main body of the swash, and if there are SPECIALS
3354 * in the swash, at that hash
3355 *
3356 * The specials hash can be extra code points, and most likely consists of
3357 * maps from single code points to multiple ones (each expressed as a string
3358 * of utf8 characters). This function currently returns only 1-1 mappings.
3359 * However consider this possible input in the specials hash:
3360 * "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074
3361 * "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074
3362 *
3363 * Both FB05 and FB06 map to the same multi-char sequence, which we don't
3364 * currently handle. But it also means that FB05 and FB06 are equivalent in
3365 * a 1-1 mapping which we should handle, and this relationship may not be in
3366 * the main table. Therefore this function examines all the multi-char
3367 * sequences and adds the 1-1 mappings that come out of that. */
064c021d
KW
3368
3369 U8 *l, *lend;
3370 STRLEN lcur;
3371 HV *const hv = MUTABLE_HV(SvRV(swash));
3372
3373 /* The string containing the main body of the table */
3374 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
3375
3376 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3377 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3378 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3379 /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
3380 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
3381 const STRLEN bits = SvUV(*bitssvp);
3382 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
3383 const UV none = SvUV(*nonesvp);
5662e334 3384 SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
064c021d
KW
3385
3386 HV* ret = newHV();
3387
3388 PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
3389
3390 /* Must have at least 8 bits to get the mappings */
3391 if (bits != 8 && bits != 16 && bits != 32) {
3392 Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
3393 (UV)bits);
3394 }
3395
5662e334
KW
3396 if (specials_p) { /* It might be "special" (sometimes, but not always, a
3397 mapping to more than one character */
3398
3399 /* Construct an inverse mapping hash for the specials */
3400 HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
3401 HV * specials_inverse = newHV();
3402 char *char_from; /* the lhs of the map */
3403 I32 from_len; /* its byte length */
3404 char *char_to; /* the rhs of the map */
3405 I32 to_len; /* its byte length */
3406 SV *sv_to; /* and in a sv */
3407 AV* from_list; /* list of things that map to each 'to' */
3408
3409 hv_iterinit(specials_hv);
3410
3411 /* The keys are the characters (in utf8) that map to the corresponding
3412 * utf8 string value. Iterate through the list creating the inverse
3413 * list. */
3414 while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
3415 SV** listp;
3416 if (! SvPOK(sv_to)) {
5637ef5b
NC
3417 Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
3418 "unexpectedly is not a string, flags=%lu",
3419 (unsigned long)SvFLAGS(sv_to));
5662e334
KW
3420 }
3421 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", utf8_to_uvchr((U8*) char_from, 0), utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
3422
3423 /* Each key in the inverse list is a mapped-to value, and the key's
3424 * hash value is a list of the strings (each in utf8) that map to
3425 * it. Those strings are all one character long */
3426 if ((listp = hv_fetch(specials_inverse,
3427 SvPVX(sv_to),
3428 SvCUR(sv_to), 0)))
3429 {
3430 from_list = (AV*) *listp;
3431 }
3432 else { /* No entry yet for it: create one */
3433 from_list = newAV();
3434 if (! hv_store(specials_inverse,
3435 SvPVX(sv_to),
3436 SvCUR(sv_to),
3437 (SV*) from_list, 0))
3438 {
3439 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3440 }
3441 }
3442
3443 /* Here have the list associated with this 'to' (perhaps newly
3444 * created and empty). Just add to it. Note that we ASSUME that
3445 * the input is guaranteed to not have duplications, so we don't
3446 * check for that. Duplications just slow down execution time. */
3447 av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
3448 }
3449
3450 /* Here, 'specials_inverse' contains the inverse mapping. Go through
3451 * it looking for cases like the FB05/FB06 examples above. There would
3452 * be an entry in the hash like
3453 * 'st' => [ FB05, FB06 ]
3454 * In this example we will create two lists that get stored in the
3455 * returned hash, 'ret':
3456 * FB05 => [ FB05, FB06 ]
3457 * FB06 => [ FB05, FB06 ]
3458 *
3459 * Note that there is nothing to do if the array only has one element.
3460 * (In the normal 1-1 case handled below, we don't have to worry about
3461 * two lists, as everything gets tied to the single list that is
3462 * generated for the single character 'to'. But here, we are omitting
3463 * that list, ('st' in the example), so must have multiple lists.) */
3464 while ((from_list = (AV *) hv_iternextsv(specials_inverse,
3465 &char_to, &to_len)))
3466 {
3467 if (av_len(from_list) > 0) {
3468 int i;
3469
3470 /* We iterate over all combinations of i,j to place each code
3471 * point on each list */
3472 for (i = 0; i <= av_len(from_list); i++) {
3473 int j;
3474 AV* i_list = newAV();
3475 SV** entryp = av_fetch(from_list, i, FALSE);
3476 if (entryp == NULL) {
3477 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3478 }
3479 if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
3480 Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
3481 }
3482 if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
3483 (SV*) i_list, FALSE))
3484 {
3485 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3486 }
3487
3488 /* For debugging: UV u = utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
3489 for (j = 0; j <= av_len(from_list); j++) {
3490 entryp = av_fetch(from_list, j, FALSE);
3491 if (entryp == NULL) {
3492 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3493 }
3494
3495 /* When i==j this adds itself to the list */
3496 av_push(i_list, newSVuv(utf8_to_uvchr(
3497 (U8*) SvPVX(*entryp), 0)));
3498 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
3499 }
3500 }
3501 }
3502 }
3503 SvREFCNT_dec(specials_inverse); /* done with it */
3504 } /* End of specials */
3505
064c021d
KW
3506 /* read $swash->{LIST} */
3507 l = (U8*)SvPV(*listsvp, lcur);
3508 lend = l + lcur;
3509
3510 /* Go through each input line */
3511 while (l < lend) {
3512 UV min, max, val;
3513 UV inverse;
3514 l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3515 cBOOL(octets), typestr);
3516 if (l > lend) {
3517 break;
3518 }
3519
3520 /* Each element in the range is to be inverted */
3521 for (inverse = min; inverse <= max; inverse++) {
3522 AV* list;
064c021d
KW
3523 SV** listp;
3524 IV i;
3525 bool found_key = FALSE;
5662e334 3526 bool found_inverse = FALSE;
064c021d
KW
3527
3528 /* The key is the inverse mapping */
3529 char key[UTF8_MAXBYTES+1];
3530 char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
3531 STRLEN key_len = key_end - key;
3532
064c021d
KW
3533 /* Get the list for the map */
3534 if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
3535 list = (AV*) *listp;
3536 }
3537 else { /* No entry yet for it: create one */
3538 list = newAV();
3539 if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
3540 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3541 }
3542 }
3543
5662e334
KW
3544 /* Look through list to see if this inverse mapping already is
3545 * listed, or if there is a mapping to itself already */
508f7cfa 3546 for (i = 0; i <= av_len(list); i++) {
064c021d
KW
3547 SV** entryp = av_fetch(list, i, FALSE);
3548 SV* entry;
3549 if (entryp == NULL) {
3550 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3551 }
3552 entry = *entryp;
5662e334 3553 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, SvUV(entry)));*/
56ca34ca 3554 if (SvUV(entry) == val) {
064c021d 3555 found_key = TRUE;
5662e334
KW
3556 }
3557 if (SvUV(entry) == inverse) {
3558 found_inverse = TRUE;
3559 }
3560
3561 /* No need to continue searching if found everything we are
3562 * looking for */
3563 if (found_key && found_inverse) {
064c021d
KW
3564 break;
3565 }
3566 }
56ca34ca
KW
3567
3568 /* Make sure there is a mapping to itself on the list */
064c021d 3569 if (! found_key) {
d397ff6a 3570 av_push(list, newSVuv(val));
5662e334 3571 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", val, val));*/
064c021d
KW
3572 }
3573
3574
3575 /* Simply add the value to the list */
5662e334
KW
3576 if (! found_inverse) {
3577 av_push(list, newSVuv(inverse));
3578 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", inverse, val));*/
3579 }
064c021d 3580
b0e3252e 3581 /* swatch_get() increments the value of val for each element in the
064c021d
KW
3582 * range. That makes more compact tables possible. You can
3583 * express the capitalization, for example, of all consecutive
3584 * letters with a single line: 0061\t007A\t0041 This maps 0061 to
3585 * 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
bd3f2f94 3586 * and it's not documented; it appears to be used only in
b0e3252e 3587 * implementing tr//; I copied the semantics from swatch_get(), just
bd3f2f94 3588 * in case */
064c021d
KW
3589 if (!none || val < none) {
3590 ++val;
3591 }
3592 }
3593 }
3594
3595 return ret;
3596}
3597
a25abddc 3598SV*
d764b54e
KW
3599Perl__swash_to_invlist(pTHX_ SV* const swash)
3600{
3601
3602 /* Subject to change or removal. For use only in one place in regcomp.c */
3603
3604 U8 *l, *lend;
3605 char *loc;
3606 STRLEN lcur;
3607 HV *const hv = MUTABLE_HV(SvRV(swash));
3608 UV elements = 0; /* Number of elements in the inversion list */
b443038a 3609 U8 empty[] = "";
d764b54e
KW
3610
3611 /* The string containing the main body of the table */
3612 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
3613 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3614 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
d73c39c5 3615 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
77f9f126 3616 SV** const invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
d764b54e
KW
3617
3618 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
3619 const STRLEN bits = SvUV(*bitssvp);
3620 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
d73c39c5
KW
3621 U8 *x, *xend;
3622 STRLEN xcur;
d764b54e 3623
a25abddc 3624 SV* invlist;
d764b54e
KW
3625
3626 PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
3627
3628 /* read $swash->{LIST} */
b443038a
KW
3629 if (SvPOK(*listsvp)) {
3630 l = (U8*)SvPV(*listsvp, lcur);
3631 }
3632 else {
3633 /* LIST legitimately doesn't contain a string during compilation phases
3634 * of Perl itself, before the Unicode tables are generated. In this
3635 * case, just fake things up by creating an empty list */
3636 l = empty;
3637 lcur = 0;
3638 }
d764b54e
KW
3639 loc = (char *) l;
3640 lend = l + lcur;
3641
3642 /* Scan the input to count the number of lines to preallocate array size
3643 * based on worst possible case, which is each line in the input creates 2
3644 * elements in the inversion list: 1) the beginning of a range in the list;
3645 * 2) the beginning of a range not in the list. */
3646 while ((loc = (strchr(loc, '\n'))) != NULL) {
3647 elements += 2;
3648 loc++;
3649 }
3650
3651 /* If the ending is somehow corrupt and isn't a new line, add another
3652 * element for the final range that isn't in the inversion list */
fd05e003
KW
3653 if (! (*lend == '\n'
3654 || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
3655 {
d764b54e
KW
3656 elements++;
3657 }
3658
3659 invlist = _new_invlist(elements);
3660
3661 /* Now go through the input again, adding each range to the list */
3662 while (l < lend) {
3663 UV start, end;
3664 UV val; /* Not used by this function */
3665
3666 l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
3667 cBOOL(octets), typestr);
3668
3669 if (l > lend) {
3670 break;
3671 }
3672
3673 _append_range_to_invlist(invlist, start, end);
3674 }
3675
77f9f126
KW
3676 /* Invert if the data says it should be */
3677 if (invert_it_svp && SvUV(*invert_it_svp)) {
45bb2768 3678 _invlist_invert_prop(invlist);
77f9f126
KW
3679 }
3680
b0e3252e 3681 /* This code is copied from swatch_get()
d73c39c5
KW
3682 * read $swash->{EXTRAS} */
3683 x = (U8*)SvPV(*extssvp, xcur);
3684 xend = x + xcur;
3685 while (x < xend) {
3686 STRLEN namelen;
3687 U8 *namestr;
3688 SV** othersvp;
3689 HV* otherhv;
3690 STRLEN otherbits;
3691 SV **otherbitssvp, *other;
3692 U8 *nl;
3693
3694 const U8 opc = *x++;
3695 if (opc == '\n')
3696 continue;
3697
3698 nl = (U8*)memchr(x, '\n', xend - x);
3699
3700 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3701 if (nl) {
3702 x = nl + 1; /* 1 is length of "\n" */
3703 continue;
3704 }
3705 else {
3706 x = xend; /* to EXTRAS' end at which \n is not found */
3707 break;
3708 }
3709 }
3710
3711 namestr = x;
3712 if (nl) {
3713 namelen = nl - namestr;
3714 x = nl + 1;
3715 }
3716 else {
3717 namelen = xend - namestr;
3718 x = xend;
3719 }
3720
3721 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
3722 otherhv = MUTABLE_HV(SvRV(*othersvp));
3723 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
3724 otherbits = (STRLEN)SvUV(*otherbitssvp);
3725
3726 if (bits != otherbits || bits != 1) {
5637ef5b
NC
3727 Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
3728 "properties, bits=%"UVuf", otherbits=%"UVuf,
3729 (UV)bits, (UV)otherbits);
d73c39c5
KW
3730 }
3731
3732 /* The "other" swatch must be destroyed after. */
3733 other = _swash_to_invlist((SV *)*othersvp);
3734
b0e3252e 3735 /* End of code copied from swatch_get() */
d73c39c5
KW
3736 switch (opc) {
3737 case '+':
3738 _invlist_union(invlist, other, &invlist);
3739 break;
3740 case '!':
3741 _invlist_invert(other);
3742 _invlist_union(invlist, other, &invlist);
3743 break;
3744 case '-':
3745 _invlist_subtract(invlist, other, &invlist);
3746 break;
3747 case '&':
3748 _invlist_intersection(invlist, other, &invlist);
3749 break;
3750 default:
3751 break;
3752 }
3753 sv_free(other); /* through with it! */
3754 }
3755
d764b54e
KW
3756 return invlist;
3757}
3758
0f830e0b 3759/*
87cea99e 3760=for apidoc uvchr_to_utf8
0f830e0b 3761
6ee84de2 3762Adds the UTF-8 representation of the Native code point C<uv> to the end
e0aa61c6 3763of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
0f830e0b
NC
3764bytes available. The return value is the pointer to the byte after the
3765end of the new character. In other words,
3766
3767 d = uvchr_to_utf8(d, uv);
3768
3769is the recommended wide native character-aware way of saying
3770
3771 *(d++) = uv;
3772
3773=cut
3774*/
3775
3776/* On ASCII machines this is normally a macro but we want a
3777 real function in case XS code wants it
3778*/
0f830e0b
NC
3779U8 *
3780Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
3781{
7918f24d
NC
3782 PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
3783
0f830e0b
NC
3784 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
3785}
3786
b851fbc1
JH
3787U8 *
3788Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
3789{
7918f24d
NC
3790 PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
3791
b851fbc1
JH
3792 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
3793}
2b9d42f0
NIS
3794
3795/*
87cea99e 3796=for apidoc utf8n_to_uvchr
0f830e0b 3797
48ef279e 3798Returns the native character value of the first character in the string
0f830e0b
NC
3799C<s>
3800which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
3801length, in bytes, of that character.
3802
6ee84de2 3803length and flags are the same as utf8n_to_uvuni().
0f830e0b
NC
3804
3805=cut
3806*/
3807/* On ASCII machines this is normally a macro but we want
3808 a real function in case XS code wants it
3809*/
0f830e0b 3810UV
48ef279e 3811Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
0f830e0b
NC
3812U32 flags)
3813{
3814 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
7918f24d
NC
3815
3816 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
3817
0f830e0b
NC
3818 return UNI_TO_NATIVE(uv);
3819}
3820
0876b9a0
KW
3821bool
3822Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
3823{
3824 /* May change: warns if surrogates, non-character code points, or
af2af982
KW
3825 * non-Unicode code points are in s which has length len bytes. Returns
3826 * TRUE if none found; FALSE otherwise. The only other validity check is
3827 * to make sure that this won't exceed the string's length */
0876b9a0
KW
3828