This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlapi: Account for EBCDIC extend UTF-8 range
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
a0ed51b3 35
27da23d5
JH
36static const char unees[] =
37 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 38
48ef279e 39/*
ccfc67b7 40=head1 Unicode Support
7fefc6c1 41These are various utility functions for manipulating UTF8-encoded
72d33970 42strings. For the uninitiated, this is a method of representing arbitrary
61296642 43Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
44characters in the ASCII range are unmodified, and a zero byte never appears
45within non-zero characters.
166f8a29 46
eaf7a4d2
CS
47=cut
48*/
49
50/*
7bbfa158 51=for apidoc is_invariant_string
eaf7a4d2 52
b8ed1ca9
KW
53Returns true iff the first C<len> bytes of the string C<s> are the same
54regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
55EBCDIC machines). That is, if they are UTF-8 invariant. On ASCII-ish
56machines, all the ASCII characters and only the ASCII characters fit this
7bbfa158 57definition. On EBCDIC machines, the ASCII-range characters are invariant, but
b8ed1ca9
KW
58so also are the C1 controls and C<\c?> (which isn't in the ASCII range on
59EBCDIC).
eaf7a4d2 60
75200dff
KW
61If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
62use this option, that C<s> can't have embedded C<NUL> characters and has to
63have a terminating C<NUL> byte).
9f7e3d64 64
a1433954 65See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
eaf7a4d2
CS
66
67=cut
68*/
69
70bool
7bbfa158 71Perl_is_invariant_string(const U8 *s, STRLEN len)
eaf7a4d2
CS
72{
73 const U8* const send = s + (len ? len : strlen((const char *)s));
74 const U8* x = s;
75
7bbfa158 76 PERL_ARGS_ASSERT_IS_INVARIANT_STRING;
eaf7a4d2
CS
77
78 for (; x < send; ++x) {
79 if (!UTF8_IS_INVARIANT(*x))
80 break;
81 }
82
83 return x == send;
84}
85
86/*
378516de 87=for apidoc uvoffuni_to_utf8_flags
eebe1485 88
a27992cc 89THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af
KW
90Instead, B<Almost all code should use L</uvchr_to_utf8> or
91L</uvchr_to_utf8_flags>>.
a27992cc 92
de69f3af
KW
93This function is like them, but the input is a strict Unicode
94(as opposed to native) code point. Only in very rare circumstances should code
95not be using the native code point.
949cf498 96
efa9cd84 97For details, see the description for L</uvchr_to_utf8_flags>.
949cf498 98
eebe1485
SC
99=cut
100*/
101
dfe13c55 102U8 *
378516de 103Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 104{
378516de 105 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
7918f24d 106
2d1545e5 107 if (OFFUNI_IS_INVARIANT(uv)) {
d9432125
KW
108 *d++ = (U8) LATIN1_TO_NATIVE(uv);
109 return d;
110 }
111
979f77b6 112 /* The first problematic code point is the first surrogate */
57162cb6 113 if ( flags /* It's common to turn off all these */
80670d37 114 && uv >= UNICODE_SURROGATE_FIRST)
979f77b6 115 {
949cf498
KW
116 if (UNICODE_IS_SURROGATE(uv)) {
117 if (flags & UNICODE_WARN_SURROGATE) {
8457b38f 118 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
949cf498
KW
119 "UTF-16 surrogate U+%04"UVXf, uv);
120 }
121 if (flags & UNICODE_DISALLOW_SURROGATE) {
122 return NULL;
123 }
124 }
125 else if (UNICODE_IS_SUPER(uv)) {
0bcdd8f6 126 if ( (flags & UNICODE_WARN_SUPER)
d35f2ca5 127 || (UNICODE_IS_ABOVE_31_BIT(uv) && (flags & UNICODE_WARN_ABOVE_31_BIT)))
0bcdd8f6
KW
128 {
129 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
130
131 /* Choose the more dire applicable warning */
d35f2ca5 132 (UNICODE_IS_ABOVE_31_BIT(uv))
0bcdd8f6
KW
133 ? "Code point 0x%"UVXf" is not Unicode, and not portable"
134 : "Code point 0x%"UVXf" is not Unicode, may not be portable",
135 uv);
949cf498
KW
136 }
137 if (flags & UNICODE_DISALLOW_SUPER
d35f2ca5 138 || (UNICODE_IS_ABOVE_31_BIT(uv) && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
949cf498
KW
139 {
140 return NULL;
141 }
142 }
143 else if (UNICODE_IS_NONCHAR(uv)) {
144 if (flags & UNICODE_WARN_NONCHAR) {
8457b38f 145 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
ba707cdc 146 "Unicode non-character U+%04"UVXf" is not recommended for open interchange",
949cf498
KW
147 uv);
148 }
149 if (flags & UNICODE_DISALLOW_NONCHAR) {
150 return NULL;
151 }
152 }
507b9800 153 }
d9432125 154
2d331972 155#if defined(EBCDIC)
d9432125 156 {
5aaebcb3 157 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
158 U8 *p = d+len-1;
159 while (p > d) {
bc3632a8 160 *p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
161 uv >>= UTF_ACCUMULATION_SHIFT;
162 }
bc3632a8 163 *p = (U8) I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
164 return d+len;
165 }
166#else /* Non loop style */
a0ed51b3 167 if (uv < 0x800) {
eb160463
GS
168 *d++ = (U8)(( uv >> 6) | 0xc0);
169 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
170 return d;
171 }
172 if (uv < 0x10000) {
eb160463
GS
173 *d++ = (U8)(( uv >> 12) | 0xe0);
174 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
175 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
176 return d;
177 }
178 if (uv < 0x200000) {
eb160463
GS
179 *d++ = (U8)(( uv >> 18) | 0xf0);
180 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
181 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
182 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
183 return d;
184 }
185 if (uv < 0x4000000) {
eb160463
GS
186 *d++ = (U8)(( uv >> 24) | 0xf8);
187 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
188 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
189 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
190 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
191 return d;
192 }
193 if (uv < 0x80000000) {
eb160463
GS
194 *d++ = (U8)(( uv >> 30) | 0xfc);
195 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
196 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
197 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
198 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
199 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
200 return d;
201 }
6588300d 202#ifdef UTF8_QUAD_MAX
d7578b48 203 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
204#endif
205 {
eb160463
GS
206 *d++ = 0xfe; /* Can't match U+FEFF! */
207 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
208 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
209 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
210 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
211 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
212 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
213 return d;
214 }
6588300d 215#ifdef UTF8_QUAD_MAX
a0ed51b3 216 {
eb160463
GS
217 *d++ = 0xff; /* Can't match U+FFFE! */
218 *d++ = 0x80; /* 6 Reserved bits */
219 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
220 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
221 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
222 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
223 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
224 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
225 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
226 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
227 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
228 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
229 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
230 return d;
231 }
232#endif
537124e4 233#endif /* Non loop style */
a0ed51b3 234}
646ca15d 235/*
07693fe6
KW
236=for apidoc uvchr_to_utf8
237
bcb1a2d4 238Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 239of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
240C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
241the byte after the end of the new character. In other words,
07693fe6
KW
242
243 d = uvchr_to_utf8(d, uv);
244
245is the recommended wide native character-aware way of saying
246
247 *(d++) = uv;
248
de69f3af
KW
249This function accepts any UV as input. To forbid or warn on non-Unicode code
250points, or those that may be problematic, see L</uvchr_to_utf8_flags>.
251
07693fe6
KW
252=cut
253*/
254
de69f3af
KW
255/* This is also a macro */
256PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
257
07693fe6
KW
258U8 *
259Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
260{
de69f3af 261 return uvchr_to_utf8(d, uv);
07693fe6
KW
262}
263
de69f3af
KW
264/*
265=for apidoc uvchr_to_utf8_flags
266
267Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 268of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
269C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
270the byte after the end of the new character. In other words,
de69f3af
KW
271
272 d = uvchr_to_utf8_flags(d, uv, flags);
273
274or, in most cases,
275
276 d = uvchr_to_utf8_flags(d, uv, 0);
277
278This is the Unicode-aware way of saying
279
280 *(d++) = uv;
281
282This function will convert to UTF-8 (and not warn) even code points that aren't
283legal Unicode or are problematic, unless C<flags> contains one or more of the
284following flags:
285
796b6530 286If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
de69f3af 287the function will raise a warning, provided UTF8 warnings are enabled. If instead
796b6530 288C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return NULL.
de69f3af
KW
289If both flags are set, the function will both warn and return NULL.
290
796b6530 291The C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
de69f3af 292affect how the function handles a Unicode non-character. And likewise, the
796b6530 293C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags affect the handling of
de69f3af 294code points that are
93e6dbd6
KW
295above the Unicode maximum of 0x10FFFF.
296
297
298The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
299the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
300three DISALLOW flags.
301
ab8e6d41
KW
302Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
303so using them is more problematic than other above-Unicode code points. Perl
304invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
305likely that non-Perl languages will not be able to read files that contain
306these that written by the perl interpreter; nor would Perl understand files
307written by something that uses a different extension. For these reasons, there
308is a separate set of flags that can warn and/or disallow these extremely high
309code points, even if other above-Unicode ones are accepted. These are the
310C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags.
311(Of course C<UNICODE_DISALLOW_SUPER> will treat all
312above-Unicode code points, including these, as malformations; and
313C<UNICODE_WARN_SUPER> warns on these.)
314
315On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
316extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
317than on ASCII. Prior to that, code points 2**31 and higher were simply
318unrepresentable, and a different, incompatible method was used to represent
319code points between 2**30 and 2**31 - 1. The flags C<UNICODE_WARN_ABOVE_31_BIT>
320and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
321platforms, warning and disallowing 2**31 and higher.
de69f3af 322
de69f3af
KW
323=cut
324*/
325
326/* This is also a macro */
327PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
328
07693fe6
KW
329U8 *
330Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
331{
de69f3af 332 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
333}
334
335/*
87cea99e 336=for apidoc is_utf8_string
6662521e 337
a1433954 338Returns true if the first C<len> bytes of string C<s> form a valid
9f7e3d64 339UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
75200dff
KW
340using C<strlen(s)> (which means if you use this option, that C<s> can't have
341embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
342that all characters being ASCII constitute 'a valid UTF-8 string'.
6662521e 343
7bbfa158 344See also L</is_invariant_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
768c67ee 345
6662521e
GS
346=cut
347*/
348
8e84507e 349bool
668b6d8d 350Perl_is_utf8_string(const U8 *s, STRLEN len)
6662521e 351{
35da51f7 352 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 353 const U8* x = s;
067a85ef 354
7918f24d 355 PERL_ARGS_ASSERT_IS_UTF8_STRING;
1aa99e6b 356
6662521e 357 while (x < send) {
6302f837
KW
358 STRLEN len = isUTF8_CHAR(x, send);
359 if (UNLIKELY(! len)) {
360 return FALSE;
361 }
362 x += len;
6662521e 363 }
768c67ee 364
067a85ef 365 return TRUE;
6662521e
GS
366}
367
67e989fb 368/*
814fafa7
NC
369Implemented as a macro in utf8.h
370
87cea99e 371=for apidoc is_utf8_string_loc
814fafa7 372
a1433954
KW
373Like L</is_utf8_string> but stores the location of the failure (in the
374case of "utf8ness failure") or the location C<s>+C<len> (in the case of
814fafa7
NC
375"utf8ness success") in the C<ep>.
376
a1433954 377See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
814fafa7 378
87cea99e 379=for apidoc is_utf8_string_loclen
81cd54e3 380
a1433954
KW
381Like L</is_utf8_string>() but stores the location of the failure (in the
382case of "utf8ness failure") or the location C<s>+C<len> (in the case of
768c67ee
JH
383"utf8ness success") in the C<ep>, and the number of UTF-8
384encoded characters in the C<el>.
385
a1433954 386See also L</is_utf8_string_loc>() and L</is_utf8_string>().
81cd54e3
JH
387
388=cut
389*/
390
391bool
668b6d8d 392Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 393{
35da51f7 394 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 395 const U8* x = s;
3ebfea28 396 STRLEN outlen = 0;
7918f24d
NC
397
398 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
81cd54e3 399
81cd54e3 400 while (x < send) {
6302f837
KW
401 STRLEN len = isUTF8_CHAR(x, send);
402 if (UNLIKELY(! len)) {
403 goto out;
404 }
405 x += len;
406 outlen++;
81cd54e3 407 }
768c67ee
JH
408
409 out:
3ebfea28
AL
410 if (el)
411 *el = outlen;
412
768c67ee
JH
413 if (ep)
414 *ep = x;
3ebfea28 415 return (x == send);
81cd54e3
JH
416}
417
418/*
768c67ee 419
de69f3af 420=for apidoc utf8n_to_uvchr
378516de
KW
421
422THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af 423Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
67e989fb 424
9041c2e3 425Bottom level UTF-8 decode routine.
de69f3af 426Returns the native code point value of the first character in the string C<s>,
746afd53
KW
427which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
428C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
429the length, in bytes, of that character.
949cf498
KW
430
431The value of C<flags> determines the behavior when C<s> does not point to a
432well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
524080c4
KW
433zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
434next possible position in C<s> that could begin a non-malformed character.
435Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
949cf498
KW
436
437Various ALLOW flags can be set in C<flags> to allow (and not warn on)
438individual types of malformations, such as the sequence being overlong (that
439is, when there is a shorter sequence that can express the same code point;
440overlong sequences are expressly forbidden in the UTF-8 standard due to
441potential security issues). Another malformation example is the first byte of
442a character not being a legal first byte. See F<utf8.h> for the list of such
524080c4
KW
443flags. For allowed 0 length strings, this function returns 0; for allowed
444overlong sequences, the computed code point is returned; for all other allowed
445malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
446determinable reasonable value.
949cf498 447
796b6530 448The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
449flags) malformation is found. If this flag is set, the routine assumes that
450the caller will raise a warning, and this function will silently just set
d088425d
KW
451C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
452
75200dff 453Note that this API requires disambiguation between successful decoding a C<NUL>
796b6530 454character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
d088425d 455in both cases, 0 is returned. To disambiguate, upon a zero return, see if the
75200dff
KW
456first byte of C<s> is 0 as well. If so, the input was a C<NUL>; if not, the
457input had an error.
949cf498
KW
458
459Certain code points are considered problematic. These are Unicode surrogates,
746afd53 460Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 461By default these are considered regular code points, but certain situations
5eafe189 462warrant special handling for them. If C<flags> contains
796b6530
KW
463C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all three classes are treated as
464malformations and handled as such. The flags C<UTF8_DISALLOW_SURROGATE>,
465C<UTF8_DISALLOW_NONCHAR>, and C<UTF8_DISALLOW_SUPER> (meaning above the legal
466Unicode maximum) can be set to disallow these categories individually.
467
468The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
469C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
470raised for their respective categories, but otherwise the code points are
471considered valid (not malformations). To get a category to both be treated as
472a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 473(But note that warnings are not raised if lexically disabled nor if
796b6530 474C<UTF8_CHECK_ONLY> is also specified.)
949cf498 475
ab8e6d41
KW
476
477Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
478so using them is more problematic than other above-Unicode code points. Perl
479invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
480likely that non-Perl languages will not be able to read files that contain
481these that written by the perl interpreter; nor would Perl understand files
482written by something that uses a different extension. For these reasons, there
483is a separate set of flags that can warn and/or disallow these extremely high
484code points, even if other above-Unicode ones are accepted. These are the
485C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags.
486(Of course C<UTF8_DISALLOW_SUPER> will treat all
487above-Unicode code points, including these, as malformations; and
488C<UTF8_WARN_SUPER> warns on these.)
489
490On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
491extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
492than on ASCII. Prior to that, code points 2**31 and higher were simply
493unrepresentable, and a different, incompatible method was used to represent
494code points between 2**30 and 2**31 - 1. The flags C<UTF8_WARN_ABOVE_31_BIT>
495and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
496platforms, warning and disallowing 2**31 and higher.
949cf498
KW
497
498All other code points corresponding to Unicode characters, including private
499use and those yet to be assigned, are never considered malformed and never
500warn.
67e989fb 501
37607a96
PK
502=cut
503*/
67e989fb 504
a0ed51b3 505UV
de69f3af 506Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 507{
d4c19fe8 508 const U8 * const s0 = s;
eb83ed87 509 U8 overflow_byte = '\0'; /* Save byte in case of overflow */
0b8d30e8 510 U8 * send;
eb83ed87
KW
511 UV uv = *s;
512 STRLEN expectlen;
949cf498 513 SV* sv = NULL;
eb83ed87
KW
514 UV outlier_ret = 0; /* return value when input is in error or problematic
515 */
516 UV pack_warn = 0; /* Save result of packWARN() for later */
517 bool unexpected_non_continuation = FALSE;
518 bool overflowed = FALSE;
2f8f112e 519 bool do_overlong_test = TRUE; /* May have to skip this test */
a0dbb045 520
eb83ed87 521 const char* const malformed_text = "Malformed UTF-8 character";
7918f24d 522
de69f3af 523 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
a0dbb045 524
eb83ed87
KW
525 /* The order of malformation tests here is important. We should consume as
526 * few bytes as possible in order to not skip any valid character. This is
527 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
528 * http://unicode.org/reports/tr36 for more discussion as to why. For
529 * example, once we've done a UTF8SKIP, we can tell the expected number of
530 * bytes, and could fail right off the bat if the input parameters indicate
531 * that there are too few available. But it could be that just that first
532 * byte is garbled, and the intended character occupies fewer bytes. If we
533 * blindly assumed that the first byte is correct, and skipped based on
534 * that number, we could skip over a valid input character. So instead, we
535 * always examine the sequence byte-by-byte.
536 *
537 * We also should not consume too few bytes, otherwise someone could inject
538 * things. For example, an input could be deliberately designed to
539 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 540 * returning to the caller C<*retlen> pointing to the very next byte (one
eb83ed87
KW
541 * which is actually part of of the overflowing sequence), that could look
542 * legitimate to the caller, which could discard the initial partial
543 * sequence and process the rest, inappropriately */
544
545 /* Zero length strings, if allowed, of necessity are zero */
b5b9af04 546 if (UNLIKELY(curlen == 0)) {
eb83ed87
KW
547 if (retlen) {
548 *retlen = 0;
549 }
a0dbb045 550
eb83ed87
KW
551 if (flags & UTF8_ALLOW_EMPTY) {
552 return 0;
553 }
554 if (! (flags & UTF8_CHECK_ONLY)) {
555 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
556 }
0c443dc2
JH
557 goto malformed;
558 }
559
eb83ed87
KW
560 expectlen = UTF8SKIP(s);
561
562 /* A well-formed UTF-8 character, as the vast majority of calls to this
563 * function will be for, has this expected length. For efficiency, set
564 * things up here to return it. It will be overriden only in those rare
565 * cases where a malformation is found */
566 if (retlen) {
567 *retlen = expectlen;
568 }
569
570 /* An invariant is trivially well-formed */
1d72bdf6 571 if (UTF8_IS_INVARIANT(uv)) {
de69f3af 572 return uv;
a0ed51b3 573 }
67e989fb 574
eb83ed87 575 /* A continuation character can't start a valid sequence */
b5b9af04 576 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
eb83ed87
KW
577 if (flags & UTF8_ALLOW_CONTINUATION) {
578 if (retlen) {
579 *retlen = 1;
580 }
581 return UNICODE_REPLACEMENT;
582 }
ba210ebe 583
eb83ed87
KW
584 if (! (flags & UTF8_CHECK_ONLY)) {
585 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
586 }
587 curlen = 1;
ba210ebe
JH
588 goto malformed;
589 }
9041c2e3 590
dcd27b3c
KW
591 /* Here is not a continuation byte, nor an invariant. The only thing left
592 * is a start byte (possibly for an overlong) */
593
1d72bdf6 594#ifdef EBCDIC
bc3632a8 595 uv = NATIVE_UTF8_TO_I8(uv);
1d72bdf6
NIS
596#endif
597
eb83ed87
KW
598 /* Remove the leading bits that indicate the number of bytes in the
599 * character's whole UTF-8 sequence, leaving just the bits that are part of
600 * the value */
601 uv &= UTF_START_MASK(expectlen);
ba210ebe 602
eb83ed87
KW
603 /* Now, loop through the remaining bytes in the character's sequence,
604 * accumulating each into the working value as we go. Be sure to not look
605 * past the end of the input string */
0b8d30e8
KW
606 send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
607
eb83ed87 608 for (s = s0 + 1; s < send; s++) {
b5b9af04 609 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
eb83ed87
KW
610 if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
611
612 /* The original implementors viewed this malformation as more
613 * serious than the others (though I, khw, don't understand
614 * why, since other malformations also give very very wrong
615 * results), so there is no way to turn off checking for it.
616 * Set a flag, but keep going in the loop, so that we absorb
617 * the rest of the bytes that comprise the character. */
618 overflowed = TRUE;
619 overflow_byte = *s; /* Save for warning message's use */
620 }
8850bf83 621 uv = UTF8_ACCUMULATE(uv, *s);
eb83ed87
KW
622 }
623 else {
624 /* Here, found a non-continuation before processing all expected
625 * bytes. This byte begins a new character, so quit, even if
626 * allowing this malformation. */
627 unexpected_non_continuation = TRUE;
628 break;
629 }
630 } /* End of loop through the character's bytes */
631
632 /* Save how many bytes were actually in the character */
633 curlen = s - s0;
634
635 /* The loop above finds two types of malformations: non-continuation and/or
636 * overflow. The non-continuation malformation is really a too-short
637 * malformation, as it means that the current character ended before it was
638 * expected to (being terminated prematurely by the beginning of the next
639 * character, whereas in the too-short malformation there just are too few
640 * bytes available to hold the character. In both cases, the check below
641 * that we have found the expected number of bytes would fail if executed.)
642 * Thus the non-continuation malformation is really unnecessary, being a
643 * subset of the too-short malformation. But there may be existing
644 * applications that are expecting the non-continuation type, so we retain
645 * it, and return it in preference to the too-short malformation. (If this
646 * code were being written from scratch, the two types might be collapsed
647 * into one.) I, khw, am also giving priority to returning the
648 * non-continuation and too-short malformations over overflow when multiple
649 * ones are present. I don't know of any real reason to prefer one over
650 * the other, except that it seems to me that multiple-byte errors trumps
651 * errors from a single byte */
b5b9af04 652 if (UNLIKELY(unexpected_non_continuation)) {
eb83ed87
KW
653 if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
654 if (! (flags & UTF8_CHECK_ONLY)) {
655 if (curlen == 1) {
656 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
657 }
658 else {
659 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
a0dbb045
JH
660 }
661 }
eb83ed87
KW
662 goto malformed;
663 }
664 uv = UNICODE_REPLACEMENT;
2f8f112e
KW
665
666 /* Skip testing for overlongs, as the REPLACEMENT may not be the same
667 * as what the original expectations were. */
668 do_overlong_test = FALSE;
eb83ed87
KW
669 if (retlen) {
670 *retlen = curlen;
671 }
672 }
b5b9af04 673 else if (UNLIKELY(curlen < expectlen)) {
eb83ed87
KW
674 if (! (flags & UTF8_ALLOW_SHORT)) {
675 if (! (flags & UTF8_CHECK_ONLY)) {
676 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
a0dbb045 677 }
eb83ed87
KW
678 goto malformed;
679 }
680 uv = UNICODE_REPLACEMENT;
2f8f112e 681 do_overlong_test = FALSE;
eb83ed87
KW
682 if (retlen) {
683 *retlen = curlen;
684 }
685 }
686
b5b9af04 687 if (UNLIKELY(overflowed)) {
eb83ed87 688 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
ba210ebe 689 goto malformed;
eb83ed87 690 }
eb83ed87 691
2f8f112e 692 if (do_overlong_test
5aaebcb3 693 && expectlen > (STRLEN) OFFUNISKIP(uv)
2f8f112e
KW
694 && ! (flags & UTF8_ALLOW_LONG))
695 {
eb83ed87
KW
696 /* The overlong malformation has lower precedence than the others.
697 * Note that if this malformation is allowed, we return the actual
698 * value, instead of the replacement character. This is because this
699 * value is actually well-defined. */
700 if (! (flags & UTF8_CHECK_ONLY)) {
5aaebcb3 701 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
eb83ed87
KW
702 }
703 goto malformed;
704 }
705
1a89bb6c 706 /* Here, the input is considered to be well-formed, but it still could be a
eb83ed87
KW
707 * problematic code point that is not allowed by the input parameters. */
708 if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
93e6dbd6
KW
709 && (flags & ( UTF8_DISALLOW_NONCHAR
710 |UTF8_DISALLOW_SURROGATE
711 |UTF8_DISALLOW_SUPER
712 |UTF8_DISALLOW_ABOVE_31_BIT
713 |UTF8_WARN_NONCHAR
714 |UTF8_WARN_SURROGATE
715 |UTF8_WARN_SUPER
716 |UTF8_WARN_ABOVE_31_BIT)))
eb83ed87 717 {
949cf498 718 if (UNICODE_IS_SURROGATE(uv)) {
ea5ced44
KW
719
720 /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
721 * generation of the sv, since no warnings are raised under CHECK */
eb83ed87 722 if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
54f4afef 723 && ckWARN_d(WARN_SURROGATE))
eb83ed87 724 {
111d382d 725 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
54f4afef 726 pack_warn = packWARN(WARN_SURROGATE);
949cf498
KW
727 }
728 if (flags & UTF8_DISALLOW_SURROGATE) {
729 goto disallowed;
730 }
731 }
949cf498 732 else if ((uv > PERL_UNICODE_MAX)) {
eb83ed87 733 if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
ea5ced44 734 && ckWARN_d(WARN_NON_UNICODE))
eb83ed87 735 {
0bcdd8f6
KW
736 sv = sv_2mortal(Perl_newSVpvf(aTHX_
737 "Code point 0x%04"UVXf" is not Unicode, may not be portable",
738 uv));
54f4afef 739 pack_warn = packWARN(WARN_NON_UNICODE);
949cf498 740 }
c0236afe
KW
741
742 /* The maximum code point ever specified by a standard was
743 * 2**31 - 1. Anything larger than that is a Perl extension that
744 * very well may not be understood by other applications (including
745 * earlier perl versions on EBCDIC platforms). On ASCII platforms,
746 * these code points are indicated by the first UTF-8 byte being
d35f2ca5
KW
747 * 0xFE or 0xFF. We test for these after the regular SUPER ones,
748 * and before possibly bailing out, so that the slightly more dire
c0236afe
KW
749 * warning will override the regular one. */
750 if (
751#ifndef EBCDIC
752 (*s0 & 0xFE) == 0xFE /* matches both FE, FF */
753#else
754 /* The I8 for 2**31 (U+80000000) is
755 * \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
756 * and it turns out that on all EBCDIC pages recognized that
757 * the UTF-EBCDIC for that code point is
758 * \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
759 * For the next lower code point, the 1047 UTF-EBCDIC is
760 * \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
761 * The other code pages differ only in the bytes following
762 * \x42. Thus the following works (the minimum continuation
763 * byte is \x41). */
764 *s0 == 0xFE && send - s0 > 7 && ( s0[1] > 0x41
765 || s0[2] > 0x41
766 || s0[3] > 0x41
767 || s0[4] > 0x41
768 || s0[5] > 0x41
769 || s0[6] > 0x41
770 || s0[7] > 0x42)
771#endif
d35f2ca5
KW
772 && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER
773 |UTF8_DISALLOW_ABOVE_31_BIT)))
ea5ced44 774 {
0bcdd8f6 775 if ( ! (flags & UTF8_CHECK_ONLY)
d35f2ca5 776 && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
0bcdd8f6 777 && ckWARN_d(WARN_UTF8))
ea5ced44 778 {
0bcdd8f6
KW
779 sv = sv_2mortal(Perl_newSVpvf(aTHX_
780 "Code point 0x%"UVXf" is not Unicode, and not portable",
781 uv));
ea5ced44
KW
782 pack_warn = packWARN(WARN_UTF8);
783 }
d35f2ca5 784 if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
ea5ced44
KW
785 goto disallowed;
786 }
787 }
c0236afe 788
949cf498
KW
789 if (flags & UTF8_DISALLOW_SUPER) {
790 goto disallowed;
791 }
792 }
4190d317
KW
793 else if (UNICODE_IS_NONCHAR(uv)) {
794 if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
54f4afef 795 && ckWARN_d(WARN_NONCHAR))
4190d317 796 {
ba707cdc 797 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is not recommended for open interchange", uv));
54f4afef 798 pack_warn = packWARN(WARN_NONCHAR);
4190d317
KW
799 }
800 if (flags & UTF8_DISALLOW_NONCHAR) {
801 goto disallowed;
802 }
803 }
949cf498 804
eb83ed87 805 if (sv) {
de69f3af
KW
806 outlier_ret = uv; /* Note we don't bother to convert to native,
807 as all the outlier code points are the same
808 in both ASCII and EBCDIC */
eb83ed87
KW
809 goto do_warn;
810 }
811
949cf498
KW
812 /* Here, this is not considered a malformed character, so drop through
813 * to return it */
a0ed51b3 814 }
ba210ebe 815
de69f3af 816 return UNI_TO_NATIVE(uv);
ba210ebe 817
eb83ed87
KW
818 /* There are three cases which get to beyond this point. In all 3 cases:
819 * <sv> if not null points to a string to print as a warning.
820 * <curlen> is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
821 * set.
822 * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
823 * This is done by initializing it to 0, and changing it only
824 * for case 1).
825 * The 3 cases are:
826 * 1) The input is valid but problematic, and to be warned about. The
827 * return value is the resultant code point; <*retlen> is set to
828 * <curlen>, the number of bytes that comprise the code point.
829 * <pack_warn> contains the result of packWARN() for the warning
830 * types. The entry point for this case is the label <do_warn>;
831 * 2) The input is a valid code point but disallowed by the parameters to
832 * this function. The return value is 0. If UTF8_CHECK_ONLY is set,
833 * <*relen> is -1; otherwise it is <curlen>, the number of bytes that
834 * comprise the code point. <pack_warn> contains the result of
835 * packWARN() for the warning types. The entry point for this case is
836 * the label <disallowed>.
837 * 3) The input is malformed. The return value is 0. If UTF8_CHECK_ONLY
838 * is set, <*relen> is -1; otherwise it is <curlen>, the number of
839 * bytes that comprise the malformation. All such malformations are
840 * assumed to be warning type <utf8>. The entry point for this case
841 * is the label <malformed>.
842 */
949cf498 843
7b52d656 844 malformed:
ba210ebe 845
eb83ed87
KW
846 if (sv && ckWARN_d(WARN_UTF8)) {
847 pack_warn = packWARN(WARN_UTF8);
848 }
849
7b52d656 850 disallowed:
eb83ed87 851
fcc8fcf6 852 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 853 if (retlen)
10edeb5d 854 *retlen = ((STRLEN) -1);
ba210ebe
JH
855 return 0;
856 }
857
7b52d656 858 do_warn:
5b311467 859
eb83ed87
KW
860 if (pack_warn) { /* <pack_warn> was initialized to 0, and changed only
861 if warnings are to be raised. */
f555bc63 862 const char * const string = SvPVX_const(sv);
a0dbb045 863
f555bc63
KW
864 if (PL_op)
865 Perl_warner(aTHX_ pack_warn, "%s in %s", string, OP_DESC(PL_op));
866 else
867 Perl_warner(aTHX_ pack_warn, "%s", string);
a0dbb045
JH
868 }
869
eb83ed87
KW
870 if (retlen) {
871 *retlen = curlen;
872 }
ba210ebe 873
eb83ed87 874 return outlier_ret;
a0ed51b3
LW
875}
876
8e84507e 877/*
ec5f19d0
KW
878=for apidoc utf8_to_uvchr_buf
879
880Returns the native code point of the first character in the string C<s> which
881is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 882C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 883
524080c4
KW
884If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
885enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 886C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 887(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 888C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 889the next possible position in C<s> that could begin a non-malformed character.
de69f3af 890See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 891returned.
ec5f19d0
KW
892
893=cut
894*/
895
896
897UV
898Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
899{
ec5f19d0
KW
900 assert(s < send);
901
902 return utf8n_to_uvchr(s, send - s, retlen,
903 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
904}
905
27d6c58a 906/* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
3986bb7c 907 * there are no malformations in the input UTF-8 string C<s>. surrogates,
57b0056d 908 * non-character code points, and non-Unicode code points are allowed. */
27d6c58a
KW
909
910UV
911Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
912{
010ab96b
KW
913 UV expectlen = UTF8SKIP(s);
914 const U8* send = s + expectlen;
9ff2f0f7 915 UV uv = *s;
3986bb7c 916
27d6c58a 917 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
81611534 918 PERL_UNUSED_CONTEXT;
27d6c58a 919
010ab96b
KW
920 if (retlen) {
921 *retlen = expectlen;
922 }
923
924 /* An invariant is trivially returned */
925 if (expectlen == 1) {
9ff2f0f7 926 return uv;
010ab96b
KW
927 }
928
9ff2f0f7
KW
929#ifdef EBCDIC
930 uv = NATIVE_UTF8_TO_I8(uv);
931#endif
932
010ab96b
KW
933 /* Remove the leading bits that indicate the number of bytes, leaving just
934 * the bits that are part of the value */
935 uv &= UTF_START_MASK(expectlen);
936
937 /* Now, loop through the remaining bytes, accumulating each into the
938 * working total as we go. (I khw tried unrolling the loop for up to 4
939 * bytes, but there was no performance improvement) */
940 for (++s; s < send; s++) {
941 uv = UTF8_ACCUMULATE(uv, *s);
942 }
943
3986bb7c 944 return UNI_TO_NATIVE(uv);
010ab96b 945
27d6c58a
KW
946}
947
ec5f19d0 948/*
ec5f19d0
KW
949=for apidoc utf8_to_uvuni_buf
950
de69f3af
KW
951Only in very rare circumstances should code need to be dealing in Unicode
952(as opposed to native) code points. In those few cases, use
953C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
4f83cdcd
KW
954
955Returns the Unicode (not-native) code point of the first character in the
956string C<s> which
ec5f19d0
KW
957is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
958C<retlen> will be set to the length, in bytes, of that character.
959
524080c4
KW
960If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
961enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
962NULL) to -1. If those warnings are off, the computed value if well-defined (or
963the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
964is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
965next possible position in C<s> that could begin a non-malformed character.
de69f3af 966See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0
KW
967
968=cut
969*/
970
971UV
972Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
973{
974 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
975
976 assert(send > s);
977
978 /* Call the low level routine asking for checks */
de69f3af
KW
979 return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
980 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
ec5f19d0
KW
981}
982
b76347f2 983/*
87cea99e 984=for apidoc utf8_length
b76347f2
JH
985
986Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
987Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
988up past C<e>, croaks.
b76347f2
JH
989
990=cut
991*/
992
993STRLEN
35a4481c 994Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
995{
996 STRLEN len = 0;
997
7918f24d
NC
998 PERL_ARGS_ASSERT_UTF8_LENGTH;
999
8850bf83
JH
1000 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1001 * the bitops (especially ~) can create illegal UTF-8.
1002 * In other words: in Perl UTF-8 is not just for Unicode. */
1003
a3b680e6
AL
1004 if (e < s)
1005 goto warn_and_return;
b76347f2 1006 while (s < e) {
4cbf4130 1007 s += UTF8SKIP(s);
8e91ec7f
AV
1008 len++;
1009 }
1010
1011 if (e != s) {
1012 len--;
1013 warn_and_return:
9b387841
NC
1014 if (PL_op)
1015 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1016 "%s in %s", unees, OP_DESC(PL_op));
1017 else
61a12c31 1018 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
1019 }
1020
1021 return len;
1022}
1023
b06226ff 1024/*
87cea99e 1025=for apidoc utf8_distance
b06226ff 1026
1e54db1a 1027Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
1028and C<b>.
1029
1030WARNING: use only if you *know* that the pointers point inside the
1031same UTF-8 buffer.
1032
37607a96
PK
1033=cut
1034*/
a0ed51b3 1035
02eb7b47 1036IV
35a4481c 1037Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 1038{
7918f24d
NC
1039 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1040
bf1665bc 1041 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3
LW
1042}
1043
b06226ff 1044/*
87cea99e 1045=for apidoc utf8_hop
b06226ff 1046
8850bf83
JH
1047Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1048forward or backward.
b06226ff
JH
1049
1050WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
1051the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1052on the first byte of character or just after the last byte of a character.
b06226ff 1053
37607a96
PK
1054=cut
1055*/
a0ed51b3
LW
1056
1057U8 *
ddeaf645 1058Perl_utf8_hop(const U8 *s, I32 off)
a0ed51b3 1059{
7918f24d
NC
1060 PERL_ARGS_ASSERT_UTF8_HOP;
1061
8850bf83
JH
1062 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1063 * the bitops (especially ~) can create illegal UTF-8.
1064 * In other words: in Perl UTF-8 is not just for Unicode. */
1065
a0ed51b3
LW
1066 if (off >= 0) {
1067 while (off--)
1068 s += UTF8SKIP(s);
1069 }
1070 else {
1071 while (off++) {
1072 s--;
8850bf83
JH
1073 while (UTF8_IS_CONTINUATION(*s))
1074 s--;
a0ed51b3
LW
1075 }
1076 }
4373e329 1077 return (U8 *)s;
a0ed51b3
LW
1078}
1079
6940069f 1080/*
fed3ba5d
NC
1081=for apidoc bytes_cmp_utf8
1082
a1433954 1083Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
1084sequence of characters (stored as UTF-8)
1085in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
1086equal, -1 or -2 if the first string is less than the second string, +1 or +2
1087if the first string is greater than the second string.
1088
1089-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
1090longer string. -2 or +2 is returned if
1091there was a difference between characters
fed3ba5d
NC
1092within the strings.
1093
1094=cut
1095*/
1096
1097int
1098Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1099{
1100 const U8 *const bend = b + blen;
1101 const U8 *const uend = u + ulen;
1102
1103 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
1104
1105 while (b < bend && u < uend) {
1106 U8 c = *u++;
1107 if (!UTF8_IS_INVARIANT(c)) {
1108 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1109 if (u < uend) {
1110 U8 c1 = *u++;
1111 if (UTF8_IS_CONTINUATION(c1)) {
a62b247b 1112 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
fed3ba5d
NC
1113 } else {
1114 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1115 "Malformed UTF-8 character "
1116 "(unexpected non-continuation byte 0x%02x"
1117 ", immediately after start byte 0x%02x)"
1118 /* Dear diag.t, it's in the pod. */
1119 "%s%s", c1, c,
1120 PL_op ? " in " : "",
1121 PL_op ? OP_DESC(PL_op) : "");
1122 return -2;
1123 }
1124 } else {
1125 if (PL_op)
1126 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1127 "%s in %s", unees, OP_DESC(PL_op));
1128 else
61a12c31 1129 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
1130 return -2; /* Really want to return undef :-) */
1131 }
1132 } else {
1133 return -2;
1134 }
1135 }
1136 if (*b != c) {
1137 return *b < c ? -2 : +2;
1138 }
1139 ++b;
1140 }
1141
1142 if (b == bend && u == uend)
1143 return 0;
1144
1145 return b < bend ? +1 : -1;
1146}
1147
1148/*
87cea99e 1149=for apidoc utf8_to_bytes
6940069f 1150
2bbc8d55 1151Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954
KW
1152Unlike L</bytes_to_utf8>, this over-writes the original string, and
1153updates C<len> to contain the new length.
67e989fb 1154Returns zero on failure, setting C<len> to -1.
6940069f 1155
a1433954 1156If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 1157
6940069f
GS
1158=cut
1159*/
1160
1161U8 *
37607a96 1162Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 1163{
d4c19fe8
AL
1164 U8 * const save = s;
1165 U8 * const send = s + *len;
6940069f 1166 U8 *d;
246fae53 1167
7918f24d 1168 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 1169 PERL_UNUSED_CONTEXT;
7918f24d 1170
1e54db1a 1171 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 1172 while (s < send) {
d59937ca
KW
1173 if (! UTF8_IS_INVARIANT(*s)) {
1174 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1175 *len = ((STRLEN) -1);
1176 return 0;
1177 }
1178 s++;
dcad2880 1179 }
d59937ca 1180 s++;
246fae53 1181 }
dcad2880
JH
1182
1183 d = s = save;
6940069f 1184 while (s < send) {
80e0b38f
KW
1185 U8 c = *s++;
1186 if (! UTF8_IS_INVARIANT(c)) {
1187 /* Then it is two-byte encoded */
a62b247b 1188 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
80e0b38f
KW
1189 s++;
1190 }
1191 *d++ = c;
6940069f
GS
1192 }
1193 *d = '\0';
246fae53 1194 *len = d - save;
6940069f
GS
1195 return save;
1196}
1197
1198/*
87cea99e 1199=for apidoc bytes_from_utf8
f9a63242 1200
2bbc8d55 1201Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954 1202Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
1203the newly-created string, and updates C<len> to contain the new
1204length. Returns the original string if no conversion occurs, C<len>
72d33970 1205is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55 12060 if C<s> is converted or consisted entirely of characters that are invariant
4a4088c4 1207in UTF-8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 1208
37607a96
PK
1209=cut
1210*/
f9a63242
JH
1211
1212U8 *
e1ec3a88 1213Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 1214{
f9a63242 1215 U8 *d;
e1ec3a88
AL
1216 const U8 *start = s;
1217 const U8 *send;
f9a63242
JH
1218 I32 count = 0;
1219
7918f24d 1220 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 1221 PERL_UNUSED_CONTEXT;
f9a63242 1222 if (!*is_utf8)
73d840c0 1223 return (U8 *)start;
f9a63242 1224
1e54db1a 1225 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 1226 for (send = s + *len; s < send;) {
d59937ca
KW
1227 if (! UTF8_IS_INVARIANT(*s)) {
1228 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
73d840c0 1229 return (U8 *)start;
d59937ca
KW
1230 }
1231 count++;
1232 s++;
db42d148 1233 }
d59937ca 1234 s++;
f9a63242
JH
1235 }
1236
35da51f7 1237 *is_utf8 = FALSE;
f9a63242 1238
212542aa 1239 Newx(d, (*len) - count + 1, U8);
ef9edfd0 1240 s = start; start = d;
f9a63242
JH
1241 while (s < send) {
1242 U8 c = *s++;
1a91c45d 1243 if (! UTF8_IS_INVARIANT(c)) {
c4d5f83a 1244 /* Then it is two-byte encoded */
a62b247b 1245 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
1a91c45d 1246 s++;
c4d5f83a
NIS
1247 }
1248 *d++ = c;
f9a63242
JH
1249 }
1250 *d = '\0';
1251 *len = d - start;
73d840c0 1252 return (U8 *)start;
f9a63242
JH
1253}
1254
1255/*
87cea99e 1256=for apidoc bytes_to_utf8
6940069f 1257
ff97e5cf
KW
1258Converts a string C<s> of length C<len> bytes from the native encoding into
1259UTF-8.
6662521e 1260Returns a pointer to the newly-created string, and sets C<len> to
ff97e5cf 1261reflect the new length in bytes.
6940069f 1262
75200dff 1263A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
1264
1265If you want to convert to UTF-8 from encodings other than
1266the native (Latin1 or EBCDIC),
a1433954 1267see L</sv_recode_to_utf8>().
c9ada85f 1268
497711e7 1269=cut
6940069f
GS
1270*/
1271
c682ebef
FC
1272/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1273 likewise need duplication. */
1274
6940069f 1275U8*
35a4481c 1276Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 1277{
35a4481c 1278 const U8 * const send = s + (*len);
6940069f
GS
1279 U8 *d;
1280 U8 *dst;
7918f24d
NC
1281
1282 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 1283 PERL_UNUSED_CONTEXT;
6940069f 1284
212542aa 1285 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
1286 dst = d;
1287
1288 while (s < send) {
55d09dc8
KW
1289 append_utf8_from_native_byte(*s, &d);
1290 s++;
6940069f
GS
1291 }
1292 *d = '\0';
6662521e 1293 *len = d-dst;
6940069f
GS
1294 return dst;
1295}
1296
a0ed51b3 1297/*
dea0fc0b 1298 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
1299 *
1300 * Destination must be pre-extended to 3/2 source. Do not use in-place.
1301 * We optimize for native, for obvious reasons. */
1302
1303U8*
dea0fc0b 1304Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 1305{
dea0fc0b
JH
1306 U8* pend;
1307 U8* dstart = d;
1308
7918f24d
NC
1309 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1310
dea0fc0b 1311 if (bytelen & 1)
f5992bc4 1312 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
1313
1314 pend = p + bytelen;
1315
a0ed51b3 1316 while (p < pend) {
dea0fc0b
JH
1317 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1318 p += 2;
2d1545e5 1319 if (OFFUNI_IS_INVARIANT(uv)) {
56d37426 1320 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
1321 continue;
1322 }
56d37426
KW
1323 if (uv <= MAX_UTF8_TWO_BYTE) {
1324 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1325 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
1326 continue;
1327 }
46956fad
KW
1328#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1329#define LAST_HIGH_SURROGATE 0xDBFF
1330#define FIRST_LOW_SURROGATE 0xDC00
1331#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
e23c50db
KW
1332
1333 /* This assumes that most uses will be in the first Unicode plane, not
1334 * needing surrogates */
1335 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
1336 && uv <= UNICODE_SURROGATE_LAST))
1337 {
1338 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
1339 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1340 }
1341 else {
01ea242b 1342 UV low = (p[0] << 8) + p[1];
e23c50db
KW
1343 if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
1344 || UNLIKELY(low > LAST_LOW_SURROGATE))
1345 {
01ea242b 1346 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db
KW
1347 }
1348 p += 2;
46956fad
KW
1349 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
1350 + (low - FIRST_LOW_SURROGATE) + 0x10000;
01ea242b 1351 }
a0ed51b3 1352 }
56d37426
KW
1353#ifdef EBCDIC
1354 d = uvoffuni_to_utf8_flags(d, uv, 0);
1355#else
a0ed51b3 1356 if (uv < 0x10000) {
eb160463
GS
1357 *d++ = (U8)(( uv >> 12) | 0xe0);
1358 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1359 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1360 continue;
1361 }
1362 else {
eb160463
GS
1363 *d++ = (U8)(( uv >> 18) | 0xf0);
1364 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1365 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1366 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1367 continue;
1368 }
56d37426 1369#endif
a0ed51b3 1370 }
dea0fc0b 1371 *newlen = d - dstart;
a0ed51b3
LW
1372 return d;
1373}
1374
1375/* Note: this one is slightly destructive of the source. */
1376
1377U8*
dea0fc0b 1378Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
1379{
1380 U8* s = (U8*)p;
d4c19fe8 1381 U8* const send = s + bytelen;
7918f24d
NC
1382
1383 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1384
e0ea5e2d
NC
1385 if (bytelen & 1)
1386 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1387 (UV)bytelen);
1388
a0ed51b3 1389 while (s < send) {
d4c19fe8 1390 const U8 tmp = s[0];
a0ed51b3
LW
1391 s[0] = s[1];
1392 s[1] = tmp;
1393 s += 2;
1394 }
dea0fc0b 1395 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
1396}
1397
922e8cb4
KW
1398bool
1399Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
1400{
1401 U8 tmpbuf[UTF8_MAXBYTES+1];
1402 uvchr_to_utf8(tmpbuf, c);
1403 return _is_utf8_FOO(classnum, tmpbuf);
1404}
1405
f9ae8fb6
JD
1406/* Internal function so we can deprecate the external one, and call
1407 this one from other deprecated functions in this file */
1408
f2645549
KW
1409bool
1410Perl__is_utf8_idstart(pTHX_ const U8 *p)
61b19385 1411{
f2645549 1412 PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
61b19385
KW
1413
1414 if (*p == '_')
1415 return TRUE;
f25ce844 1416 return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
61b19385
KW
1417}
1418
5092f92a 1419bool
eba68aa0
KW
1420Perl__is_uni_perl_idcont(pTHX_ UV c)
1421{
1422 U8 tmpbuf[UTF8_MAXBYTES+1];
1423 uvchr_to_utf8(tmpbuf, c);
1424 return _is_utf8_perl_idcont(tmpbuf);
1425}
1426
1427bool
f91dcd13
KW
1428Perl__is_uni_perl_idstart(pTHX_ UV c)
1429{
1430 U8 tmpbuf[UTF8_MAXBYTES+1];
1431 uvchr_to_utf8(tmpbuf, c);
1432 return _is_utf8_perl_idstart(tmpbuf);
1433}
1434
3a4c58c9
KW
1435UV
1436Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
1437{
1438 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 1439 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
1440 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
1441 * either "SS" or "Ss". Which one to use is passed into the routine in
1442 * 'S_or_s' to avoid a test */
1443
1444 UV converted = toUPPER_LATIN1_MOD(c);
1445
1446 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
1447
1448 assert(S_or_s == 'S' || S_or_s == 's');
1449
6f2d5cbc 1450 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 1451 characters in this range */
3a4c58c9
KW
1452 *p = (U8) converted;
1453 *lenp = 1;
1454 return converted;
1455 }
1456
1457 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
1458 * which it maps to one of them, so as to only have to have one check for
1459 * it in the main case */
1460 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1461 switch (c) {
1462 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1463 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
1464 break;
1465 case MICRO_SIGN:
1466 converted = GREEK_CAPITAL_LETTER_MU;
1467 break;
79e064b9
KW
1468#if UNICODE_MAJOR_VERSION > 2 \
1469 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
1470 && UNICODE_DOT_DOT_VERSION >= 8)
3a4c58c9
KW
1471 case LATIN_SMALL_LETTER_SHARP_S:
1472 *(p)++ = 'S';
1473 *p = S_or_s;
1474 *lenp = 2;
1475 return 'S';
79e064b9 1476#endif
3a4c58c9
KW
1477 default:
1478 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
e5964223 1479 NOT_REACHED; /* NOTREACHED */
3a4c58c9
KW
1480 }
1481 }
1482
1483 *(p)++ = UTF8_TWO_BYTE_HI(converted);
1484 *p = UTF8_TWO_BYTE_LO(converted);
1485 *lenp = 2;
1486
1487 return converted;
1488}
1489
50bda2c3
KW
1490/* Call the function to convert a UTF-8 encoded character to the specified case.
1491 * Note that there may be more than one character in the result.
1492 * INP is a pointer to the first byte of the input character
1493 * OUTP will be set to the first byte of the string of changed characters. It
1494 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
1495 * LENP will be set to the length in bytes of the string of changed characters
1496 *
1497 * The functions return the ordinal of the first character in the string of OUTP */
4a8240a3
KW
1498#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "")
1499#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "")
1500#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "")
50bda2c3
KW
1501
1502/* This additionally has the input parameter SPECIALS, which if non-zero will
1503 * cause this to use the SPECIALS hash for folding (meaning get full case
1504 * folding); otherwise, when zero, this implies a simple case fold */
4a8240a3 1505#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "" : NULL)
c3fd2246 1506
84afefe6
JH
1507UV
1508Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1509{
a1433954
KW
1510 /* Convert the Unicode character whose ordinal is <c> to its uppercase
1511 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
1512 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
1513 * the changed version may be longer than the original character.
1514 *
1515 * The ordinal of the first character of the changed version is returned
1516 * (but note, as explained above, that there may be more.) */
1517
7918f24d
NC
1518 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1519
3a4c58c9
KW
1520 if (c < 256) {
1521 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
1522 }
1523
0ebc6274 1524 uvchr_to_utf8(p, c);
3a4c58c9 1525 return CALL_UPPER_CASE(p, p, lenp);
a0ed51b3
LW
1526}
1527
84afefe6
JH
1528UV
1529Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1530{
7918f24d
NC
1531 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1532
3a4c58c9
KW
1533 if (c < 256) {
1534 return _to_upper_title_latin1((U8) c, p, lenp, 's');
1535 }
1536
0ebc6274 1537 uvchr_to_utf8(p, c);
3a4c58c9 1538 return CALL_TITLE_CASE(p, p, lenp);
a0ed51b3
LW
1539}
1540
afc16117 1541STATIC U8
81611534 1542S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp)
afc16117
KW
1543{
1544 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 1545 * those, converting the result to UTF-8. Since the result is always just
a1433954 1546 * one character, we allow <p> to be NULL */
afc16117
KW
1547
1548 U8 converted = toLOWER_LATIN1(c);
1549
1550 if (p != NULL) {
6f2d5cbc 1551 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
1552 *p = converted;
1553 *lenp = 1;
1554 }
1555 else {
430c9760
KW
1556 /* Result is known to always be < 256, so can use the EIGHT_BIT
1557 * macros */
1558 *p = UTF8_EIGHT_BIT_HI(converted);
1559 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
afc16117
KW
1560 *lenp = 2;
1561 }
1562 }
1563 return converted;
1564}
1565
84afefe6
JH
1566UV
1567Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1568{
7918f24d
NC
1569 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1570
afc16117
KW
1571 if (c < 256) {
1572 return to_lower_latin1((U8) c, p, lenp);
bca00c02
KW
1573 }
1574
afc16117 1575 uvchr_to_utf8(p, c);
968c5e6a 1576 return CALL_LOWER_CASE(p, p, lenp);
a0ed51b3
LW
1577}
1578
84afefe6 1579UV
51910141 1580Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 1581{
51910141 1582 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 1583 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 1584 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
1585 *
1586 * Not to be used for locale folds
51910141 1587 */
f673fad4 1588
a1dde8de
KW
1589 UV converted;
1590
1591 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
81611534 1592 PERL_UNUSED_CONTEXT;
a1dde8de 1593
1ca267a5
KW
1594 assert (! (flags & FOLD_FLAGS_LOCALE));
1595
a1dde8de
KW
1596 if (c == MICRO_SIGN) {
1597 converted = GREEK_SMALL_LETTER_MU;
1598 }
9b63e895
KW
1599#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
1600 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
1601 || UNICODE_DOT_DOT_VERSION > 0)
51910141 1602 else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
1ca267a5
KW
1603
1604 /* If can't cross 127/128 boundary, can't return "ss"; instead return
1605 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
1606 * under those circumstances. */
1607 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
1608 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
1609 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
1610 p, *lenp, U8);
1611 return LATIN_SMALL_LETTER_LONG_S;
1612 }
1613 else {
4f489194
KW
1614 *(p)++ = 's';
1615 *p = 's';
1616 *lenp = 2;
1617 return 's';
1ca267a5 1618 }
a1dde8de 1619 }
9b63e895 1620#endif
a1dde8de
KW
1621 else { /* In this range the fold of all other characters is their lower
1622 case */
1623 converted = toLOWER_LATIN1(c);
1624 }
1625
6f2d5cbc 1626 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
1627 *p = (U8) converted;
1628 *lenp = 1;
1629 }
1630 else {
1631 *(p)++ = UTF8_TWO_BYTE_HI(converted);
1632 *p = UTF8_TWO_BYTE_LO(converted);
1633 *lenp = 2;
1634 }
1635
1636 return converted;
1637}
1638
1639UV
31f05a37 1640Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 1641{
4b593389 1642
a0270393
KW
1643 /* Not currently externally documented, and subject to change
1644 * <flags> bits meanings:
1645 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
1646 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
1647 * locale are to be used.
a0270393
KW
1648 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1649 */
4b593389 1650
36bb2ab6 1651 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 1652
780fcc9f
KW
1653 if (flags & FOLD_FLAGS_LOCALE) {
1654 /* Treat a UTF-8 locale as not being in locale at all */
1655 if (IN_UTF8_CTYPE_LOCALE) {
1656 flags &= ~FOLD_FLAGS_LOCALE;
1657 }
1658 else {
1659 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
e7b7ac46 1660 goto needs_full_generality;
780fcc9f 1661 }
31f05a37
KW
1662 }
1663
a1dde8de 1664 if (c < 256) {
e7b7ac46 1665 return _to_fold_latin1((U8) c, p, lenp,
31f05a37 1666 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
1667 }
1668
2f306ab9 1669 /* Here, above 255. If no special needs, just use the macro */
a0270393
KW
1670 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
1671 uvchr_to_utf8(p, c);
1672 return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
1673 }
1674 else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
1675 the special flags. */
1676 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
1677
1678 needs_full_generality:
a0270393 1679 uvchr_to_utf8(utf8_c, c);
445bf929 1680 return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
a0270393 1681 }
84afefe6
JH
1682}
1683
26483009 1684PERL_STATIC_INLINE bool
5141f98e 1685S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
f25ce844 1686 const char *const swashname, SV* const invlist)
bde6a22d 1687{
ea317ccb
KW
1688 /* returns a boolean giving whether or not the UTF8-encoded character that
1689 * starts at <p> is in the swash indicated by <swashname>. <swash>
1690 * contains a pointer to where the swash indicated by <swashname>
1691 * is to be stored; which this routine will do, so that future calls will
f25ce844
KW
1692 * look at <*swash> and only generate a swash if it is not null. <invlist>
1693 * is NULL or an inversion list that defines the swash. If not null, it
1694 * saves time during initialization of the swash.
ea317ccb
KW
1695 *
1696 * Note that it is assumed that the buffer length of <p> is enough to
1697 * contain all the bytes that comprise the character. Thus, <*p> should
1698 * have been checked before this call for mal-formedness enough to assure
1699 * that. */
1700
7918f24d
NC
1701 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1702
492a624f 1703 /* The API should have included a length for the UTF-8 character in <p>,
28123549 1704 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
1705 * as far as there being enough bytes available in it to accommodate the
1706 * character without reading beyond the end, and pass that number on to the
1707 * validating routine */
6302f837 1708 if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
28123549
KW
1709 if (ckWARN_d(WARN_UTF8)) {
1710 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
9816f121 1711 "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
28123549
KW
1712 if (ckWARN(WARN_UTF8)) { /* This will output details as to the
1713 what the malformation is */
1714 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
1715 }
1716 }
1717 return FALSE;
1718 }
87367d5f
KW
1719 if (!*swash) {
1720 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
f25ce844
KW
1721 *swash = _core_swash_init("utf8",
1722
1723 /* Only use the name if there is no inversion
1724 * list; otherwise will go out to disk */
1725 (invlist) ? "" : swashname,
1726
1727 &PL_sv_undef, 1, 0, invlist, &flags);
87367d5f 1728 }
28123549 1729
bde6a22d
NC
1730 return swash_fetch(*swash, p, TRUE) != 0;
1731}
1732
1733bool
922e8cb4
KW
1734Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
1735{
922e8cb4
KW
1736 PERL_ARGS_ASSERT__IS_UTF8_FOO;
1737
1738 assert(classnum < _FIRST_NON_SWASH_CC);
1739
f25ce844
KW
1740 return is_utf8_common(p,
1741 &PL_utf8_swash_ptrs[classnum],
1742 swash_property_names[classnum],
1743 PL_XPosix_ptrs[classnum]);
922e8cb4
KW
1744}
1745
1746bool
f2645549 1747Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
a0ed51b3 1748{
f2645549 1749 SV* invlist = NULL;
7918f24d 1750
f2645549 1751 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
7918f24d 1752
f2645549
KW
1753 if (! PL_utf8_perl_idstart) {
1754 invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
1755 }
60071a22 1756 return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
82686b01
JH
1757}
1758
1759bool
f2645549 1760Perl__is_utf8_xidstart(pTHX_ const U8 *p)
c11ff943 1761{
f2645549 1762 PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
c11ff943
KW
1763
1764 if (*p == '_')
1765 return TRUE;
f25ce844 1766 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
c11ff943
KW
1767}
1768
1769bool
eba68aa0
KW
1770Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
1771{
b24b43f7 1772 SV* invlist = NULL;
eba68aa0
KW
1773
1774 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
1775
b24b43f7
KW
1776 if (! PL_utf8_perl_idcont) {
1777 invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
1778 }
60071a22 1779 return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
eba68aa0
KW
1780}
1781
eba68aa0 1782bool
f2645549 1783Perl__is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1784{
f2645549 1785 PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
7918f24d 1786
f25ce844 1787 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
a0ed51b3
LW
1788}
1789
1790bool
f2645549 1791Perl__is_utf8_xidcont(pTHX_ const U8 *p)
c11ff943 1792{
f2645549 1793 PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
c11ff943 1794
f25ce844 1795 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
c11ff943
KW
1796}
1797
1798bool
7dbf68d2
KW
1799Perl__is_utf8_mark(pTHX_ const U8 *p)
1800{
7dbf68d2
KW
1801 PERL_ARGS_ASSERT__IS_UTF8_MARK;
1802
f25ce844 1803 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
7dbf68d2
KW
1804}
1805
6b5c0936 1806/*
87cea99e 1807=for apidoc to_utf8_case
6b5c0936 1808
6fae5207 1809C<p> contains the pointer to the UTF-8 string encoding
a1433954
KW
1810the character that is being converted. This routine assumes that the character
1811at C<p> is well-formed.
6b5c0936 1812
6fae5207
KW
1813C<ustrp> is a pointer to the character buffer to put the
1814conversion result to. C<lenp> is a pointer to the length
6b5c0936
JH
1815of the result.
1816
6fae5207 1817C<swashp> is a pointer to the swash to use.
6b5c0936 1818
a1433954 1819Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
796b6530 1820and loaded by C<SWASHNEW>, using F<lib/utf8_heavy.pl>. C<special> (usually,
0134edef 1821but not always, a multicharacter mapping), is tried first.
6b5c0936 1822
4a8240a3
KW
1823C<special> is a string, normally C<NULL> or C<"">. C<NULL> means to not use
1824any special mappings; C<""> means to use the special mappings. Values other
1825than these two are treated as the name of the hash containing the special
1826mappings, like C<"utf8::ToSpecLower">.
6b5c0936 1827
796b6530
KW
1828C<normal> is a string like C<"ToLower"> which means the swash
1829C<%utf8::ToLower>.
0134edef
JH
1830
1831=cut */
6b5c0936 1832
2104c8d9 1833UV
9a957fbc
AL
1834Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1835 SV **swashp, const char *normal, const char *special)
a0ed51b3 1836{
0134edef 1837 STRLEN len = 0;
f4cd282c 1838 const UV uv1 = valid_utf8_to_uvchr(p, NULL);
7918f24d
NC
1839
1840 PERL_ARGS_ASSERT_TO_UTF8_CASE;
1841
9ae3ac1a
KW
1842 /* Note that swash_fetch() doesn't output warnings for these because it
1843 * assumes we will */
8457b38f 1844 if (uv1 >= UNICODE_SURROGATE_FIRST) {
9ae3ac1a 1845 if (uv1 <= UNICODE_SURROGATE_LAST) {
8457b38f
KW
1846 if (ckWARN_d(WARN_SURROGATE)) {
1847 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
1848 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
1849 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
1850 }
9ae3ac1a
KW
1851 }
1852 else if (UNICODE_IS_SUPER(uv1)) {
8457b38f
KW
1853 if (ckWARN_d(WARN_NON_UNICODE)) {
1854 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
1855 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
1856 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
1857 }
9ae3ac1a
KW
1858 }
1859
1860 /* Note that non-characters are perfectly legal, so no warning should
1861 * be given */
1862 }
1863
0134edef 1864 if (!*swashp) /* load on-demand */
5ab9d2ef 1865 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
0134edef 1866
a6f87d8c 1867 if (special) {
0134edef 1868 /* It might be "special" (sometimes, but not always,
2a37f04d 1869 * a multicharacter mapping) */
4a8240a3 1870 HV *hv = NULL;
b08cf34e
JH
1871 SV **svp;
1872
4a8240a3
KW
1873 /* If passed in the specials name, use that; otherwise use any
1874 * given in the swash */
1875 if (*special != '\0') {
1876 hv = get_hv(special, 0);
1877 }
1878 else {
1879 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
1880 if (svp) {
1881 hv = MUTABLE_HV(SvRV(*svp));
1882 }
1883 }
1884
176fe009 1885 if (hv
5f560d8a 1886 && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
176fe009
KW
1887 && (*svp))
1888 {
cfd0369c 1889 const char *s;
47654450 1890
cfd0369c 1891 s = SvPV_const(*svp, len);
47654450 1892 if (len == 1)
f4cd282c 1893 /* EIGHTBIT */
c80e42f3 1894 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2a37f04d 1895 else {
d2dcd0fb 1896 Copy(s, ustrp, len, U8);
29e98929 1897 }
983ffd37 1898 }
0134edef
JH
1899 }
1900
1901 if (!len && *swashp) {
4a4088c4 1902 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
d4c19fe8 1903
0134edef
JH
1904 if (uv2) {
1905 /* It was "normal" (a single character mapping). */
f4cd282c 1906 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2a37f04d
JH
1907 }
1908 }
1feea2c7 1909
cbe07460
KW
1910 if (len) {
1911 if (lenp) {
1912 *lenp = len;
1913 }
1914 return valid_utf8_to_uvchr(ustrp, 0);
1915 }
1916
1917 /* Here, there was no mapping defined, which means that the code point maps
1918 * to itself. Return the inputs */
bfdf22ec 1919 len = UTF8SKIP(p);
ca9fab46
KW
1920 if (p != ustrp) { /* Don't copy onto itself */
1921 Copy(p, ustrp, len, U8);
1922 }
0134edef 1923
2a37f04d
JH
1924 if (lenp)
1925 *lenp = len;
1926
f4cd282c 1927 return uv1;
cbe07460 1928
a0ed51b3
LW
1929}
1930
051a06d4 1931STATIC UV
357aadde 1932S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
051a06d4 1933{
4a4088c4 1934 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
1935 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
1936 * result contains a character that crosses the 255/256 boundary, disallow
1937 * the change, and return the original code point. See L<perlfunc/lc> for
1938 * why;
051a06d4 1939 *
a1433954
KW
1940 * p points to the original string whose case was changed; assumed
1941 * by this routine to be well-formed
051a06d4
KW
1942 * result the code point of the first character in the changed-case string
1943 * ustrp points to the changed-case string (<result> represents its first char)
1944 * lenp points to the length of <ustrp> */
1945
1946 UV original; /* To store the first code point of <p> */
1947
1948 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
1949
a4f12ed7 1950 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
1951
1952 /* We know immediately if the first character in the string crosses the
1953 * boundary, so can skip */
1954 if (result > 255) {
1955
1956 /* Look at every character in the result; if any cross the
1957 * boundary, the whole thing is disallowed */
1958 U8* s = ustrp + UTF8SKIP(ustrp);
1959 U8* e = ustrp + *lenp;
1960 while (s < e) {
a4f12ed7 1961 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
1962 goto bad_crossing;
1963 }
1964 s += UTF8SKIP(s);
1965 }
1966
613abc6d
KW
1967 /* Here, no characters crossed, result is ok as-is, but we warn. */
1968 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
051a06d4
KW
1969 return result;
1970 }
1971
7b52d656 1972 bad_crossing:
051a06d4
KW
1973
1974 /* Failed, have to return the original */
4b88fb76 1975 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
1976
1977 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
1978 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
1979 "Can't do %s(\"\\x{%"UVXf"}\") on non-UTF-8 locale; "
1980 "resolved to \"\\x{%"UVXf"}\".",
357aadde 1981 OP_DESC(PL_op),
ab0b796c
KW
1982 original,
1983 original);
051a06d4
KW
1984 Copy(p, ustrp, *lenp, char);
1985 return original;
1986}
1987
d3e79532 1988/*
87cea99e 1989=for apidoc to_utf8_upper
d3e79532 1990
1f607577 1991Instead use L</toUPPER_utf8>.
a1433954 1992
d3e79532
JH
1993=cut */
1994
051a06d4 1995/* Not currently externally documented, and subject to change:
31f05a37
KW
1996 * <flags> is set iff iff the rules from the current underlying locale are to
1997 * be used. */
051a06d4 1998
2104c8d9 1999UV
31f05a37 2000Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2001{
051a06d4
KW
2002 UV result;
2003
2004 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 2005
780fcc9f
KW
2006 if (flags) {
2007 /* Treat a UTF-8 locale as not being in locale at all */
2008 if (IN_UTF8_CTYPE_LOCALE) {
2009 flags = FALSE;
2010 }
2011 else {
2012 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2013 }
31f05a37
KW
2014 }
2015
3a4c58c9 2016 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2017 if (flags) {
2018 result = toUPPER_LC(*p);
2019 }
2020 else {
81c6c7ce 2021 return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
051a06d4 2022 }
3a4c58c9
KW
2023 }
2024 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2025 if (flags) {
a62b247b 2026 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2027 result = toUPPER_LC(c);
051a06d4
KW
2028 }
2029 else {
a62b247b 2030 return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2031 ustrp, lenp, 'S');
051a06d4
KW
2032 }
2033 }
4a4088c4 2034 else { /* UTF-8, ord above 255 */
051a06d4
KW
2035 result = CALL_UPPER_CASE(p, ustrp, lenp);
2036
2037 if (flags) {
357aadde 2038 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4
KW
2039 }
2040 return result;
2041 }
2042
4a4088c4 2043 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2044 if (UTF8_IS_INVARIANT(result)) {
2045 *ustrp = (U8) result;
2046 *lenp = 1;
2047 }
2048 else {
62cb07ea
KW
2049 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2050 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2051 *lenp = 2;
3a4c58c9 2052 }
baa60164 2053
051a06d4 2054 return result;
983ffd37 2055}
a0ed51b3 2056
d3e79532 2057/*
87cea99e 2058=for apidoc to_utf8_title
d3e79532 2059
1f607577 2060Instead use L</toTITLE_utf8>.
a1433954 2061
d3e79532
JH
2062=cut */
2063
051a06d4 2064/* Not currently externally documented, and subject to change:
31f05a37
KW
2065 * <flags> is set iff the rules from the current underlying locale are to be
2066 * used. Since titlecase is not defined in POSIX, for other than a
2067 * UTF-8 locale, uppercase is used instead for code points < 256.
445bf929 2068 */
051a06d4 2069
983ffd37 2070UV
31f05a37 2071Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
983ffd37 2072{
051a06d4
KW
2073 UV result;
2074
2075 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 2076
780fcc9f
KW
2077 if (flags) {
2078 /* Treat a UTF-8 locale as not being in locale at all */
2079 if (IN_UTF8_CTYPE_LOCALE) {
2080 flags = FALSE;
2081 }
2082 else {
2083 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2084 }
31f05a37
KW
2085 }
2086
3a4c58c9 2087 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2088 if (flags) {
2089 result = toUPPER_LC(*p);
2090 }
2091 else {
81c6c7ce 2092 return _to_upper_title_latin1(*p, ustrp, lenp, 's');
051a06d4 2093 }
3a4c58c9
KW
2094 }
2095 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2096 if (flags) {
a62b247b 2097 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2098 result = toUPPER_LC(c);
051a06d4
KW
2099 }
2100 else {
a62b247b 2101 return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2102 ustrp, lenp, 's');
051a06d4
KW
2103 }
2104 }
4a4088c4 2105 else { /* UTF-8, ord above 255 */
051a06d4
KW
2106 result = CALL_TITLE_CASE(p, ustrp, lenp);
2107
2108 if (flags) {
357aadde 2109 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4
KW
2110 }
2111 return result;
2112 }
2113
4a4088c4 2114 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2115 if (UTF8_IS_INVARIANT(result)) {
2116 *ustrp = (U8) result;
2117 *lenp = 1;
2118 }
2119 else {
62cb07ea
KW
2120 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2121 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2122 *lenp = 2;
3a4c58c9
KW
2123 }
2124
051a06d4 2125 return result;
a0ed51b3
LW
2126}
2127
d3e79532 2128/*
87cea99e 2129=for apidoc to_utf8_lower
d3e79532 2130
1f607577 2131Instead use L</toLOWER_utf8>.
a1433954 2132
d3e79532
JH
2133=cut */
2134
051a06d4 2135/* Not currently externally documented, and subject to change:
31f05a37
KW
2136 * <flags> is set iff iff the rules from the current underlying locale are to
2137 * be used.
2138 */
051a06d4 2139
2104c8d9 2140UV
31f05a37 2141Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2142{
051a06d4
KW
2143 UV result;
2144
051a06d4 2145 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 2146
780fcc9f
KW
2147 if (flags) {
2148 /* Treat a UTF-8 locale as not being in locale at all */
2149 if (IN_UTF8_CTYPE_LOCALE) {
2150 flags = FALSE;
2151 }
2152 else {
2153 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2154 }
31f05a37
KW
2155 }
2156
968c5e6a 2157 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2158 if (flags) {
2159 result = toLOWER_LC(*p);
2160 }
2161 else {
81c6c7ce 2162 return to_lower_latin1(*p, ustrp, lenp);
051a06d4 2163 }
968c5e6a
KW
2164 }
2165 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2166 if (flags) {
a62b247b 2167 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2168 result = toLOWER_LC(c);
051a06d4
KW
2169 }
2170 else {
a62b247b 2171 return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2172 ustrp, lenp);
051a06d4 2173 }
968c5e6a 2174 }
4a4088c4 2175 else { /* UTF-8, ord above 255 */
051a06d4
KW
2176 result = CALL_LOWER_CASE(p, ustrp, lenp);
2177
2178 if (flags) {
357aadde 2179 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 2180 }
968c5e6a 2181
051a06d4
KW
2182 return result;
2183 }
2184
4a4088c4 2185 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2186 if (UTF8_IS_INVARIANT(result)) {
2187 *ustrp = (U8) result;
2188 *lenp = 1;
2189 }
2190 else {
62cb07ea
KW
2191 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2192 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
2193 *lenp = 2;
2194 }
2195
051a06d4 2196 return result;
b4e400f9
JH
2197}
2198
d3e79532 2199/*
87cea99e 2200=for apidoc to_utf8_fold
d3e79532 2201
1f607577 2202Instead use L</toFOLD_utf8>.
a1433954 2203
d3e79532
JH
2204=cut */
2205
051a06d4
KW
2206/* Not currently externally documented, and subject to change,
2207 * in <flags>
31f05a37
KW
2208 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2209 * locale are to be used.
051a06d4
KW
2210 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
2211 * otherwise simple folds
a0270393
KW
2212 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2213 * prohibited
445bf929 2214 */
36bb2ab6 2215
b4e400f9 2216UV
445bf929 2217Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
b4e400f9 2218{
051a06d4
KW
2219 UV result;
2220
36bb2ab6 2221 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 2222
a0270393
KW
2223 /* These are mutually exclusive */
2224 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2225
50ba90ff
KW
2226 assert(p != ustrp); /* Otherwise overwrites */
2227
780fcc9f
KW
2228 if (flags & FOLD_FLAGS_LOCALE) {
2229 /* Treat a UTF-8 locale as not being in locale at all */
2230 if (IN_UTF8_CTYPE_LOCALE) {
2231 flags &= ~FOLD_FLAGS_LOCALE;
2232 }
2233 else {
2234 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2235 }
31f05a37
KW
2236 }
2237
a1dde8de 2238 if (UTF8_IS_INVARIANT(*p)) {
051a06d4 2239 if (flags & FOLD_FLAGS_LOCALE) {
d22b930b 2240 result = toFOLD_LC(*p);
051a06d4
KW
2241 }
2242 else {
81c6c7ce 2243 return _to_fold_latin1(*p, ustrp, lenp,
1ca267a5 2244 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2245 }
a1dde8de
KW
2246 }
2247 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2248 if (flags & FOLD_FLAGS_LOCALE) {
a62b247b 2249 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2250 result = toFOLD_LC(c);
051a06d4
KW
2251 }
2252 else {
a62b247b 2253 return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
51910141 2254 ustrp, lenp,
1ca267a5 2255 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2256 }
a1dde8de 2257 }
4a4088c4 2258 else { /* UTF-8, ord above 255 */
a0270393 2259 result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
a1dde8de 2260
1ca267a5
KW
2261 if (flags & FOLD_FLAGS_LOCALE) {
2262
76f2ffcd 2263# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
0766489e
KW
2264 const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
2265
2266# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
2267# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
76f2ffcd
KW
2268
2269 const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
76f2ffcd 2270
538e84ed
KW
2271 /* Special case these two characters, as what normally gets
2272 * returned under locale doesn't work */
76f2ffcd
KW
2273 if (UTF8SKIP(p) == cap_sharp_s_len
2274 && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
1ca267a5 2275 {
ab0b796c
KW
2276 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2277 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2278 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
2279 "resolved to \"\\x{17F}\\x{17F}\".");
1ca267a5
KW
2280 goto return_long_s;
2281 }
0766489e
KW
2282 else
2283#endif
2284 if (UTF8SKIP(p) == long_s_t_len
76f2ffcd 2285 && memEQ((char *) p, LONG_S_T, long_s_t_len))
9fc2026f 2286 {
ab0b796c
KW
2287 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2288 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2289 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
2290 "resolved to \"\\x{FB06}\".");
9fc2026f
KW
2291 goto return_ligature_st;
2292 }
74894415
KW
2293
2294#if UNICODE_MAJOR_VERSION == 3 \
2295 && UNICODE_DOT_VERSION == 0 \
2296 && UNICODE_DOT_DOT_VERSION == 1
2297# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
2298
2299 /* And special case this on this Unicode version only, for the same
2300 * reaons the other two are special cased. They would cross the
2301 * 255/256 boundary which is forbidden under /l, and so the code
2302 * wouldn't catch that they are equivalent (which they are only in
2303 * this release) */
2304 else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
2305 && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
2306 {
2307 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2308 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2309 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
2310 "resolved to \"\\x{0131}\".");
2311 goto return_dotless_i;
2312 }
2313#endif
2314
357aadde 2315 return check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 2316 }
a0270393
KW
2317 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
2318 return result;
2319 }
2320 else {
4a4088c4 2321 /* This is called when changing the case of a UTF-8-encoded
9fc2026f
KW
2322 * character above the ASCII range, and the result should not
2323 * contain an ASCII character. */
a0270393
KW
2324
2325 UV original; /* To store the first code point of <p> */
2326
2327 /* Look at every character in the result; if any cross the
2328 * boundary, the whole thing is disallowed */
2329 U8* s = ustrp;
2330 U8* e = ustrp + *lenp;
2331 while (s < e) {
2332 if (isASCII(*s)) {
2333 /* Crossed, have to return the original */
2334 original = valid_utf8_to_uvchr(p, lenp);
1ca267a5 2335
9fc2026f 2336 /* But in these instances, there is an alternative we can
1ca267a5 2337 * return that is valid */
0766489e
KW
2338 if (original == LATIN_SMALL_LETTER_SHARP_S
2339#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
2340 || original == LATIN_CAPITAL_LETTER_SHARP_S
2341#endif
2342 ) {
1ca267a5
KW
2343 goto return_long_s;
2344 }
9fc2026f
KW
2345 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
2346 goto return_ligature_st;
2347 }
74894415
KW
2348#if UNICODE_MAJOR_VERSION == 3 \
2349 && UNICODE_DOT_VERSION == 0 \
2350 && UNICODE_DOT_DOT_VERSION == 1
2351
2352 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
2353 goto return_dotless_i;
2354 }
2355#endif
a0270393
KW
2356 Copy(p, ustrp, *lenp, char);
2357 return original;
2358 }
2359 s += UTF8SKIP(s);
2360 }
051a06d4 2361
a0270393
KW
2362 /* Here, no characters crossed, result is ok as-is */
2363 return result;
2364 }
051a06d4
KW
2365 }
2366
4a4088c4 2367 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2368 if (UTF8_IS_INVARIANT(result)) {
2369 *ustrp = (U8) result;
2370 *lenp = 1;
2371 }
2372 else {
62cb07ea
KW
2373 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2374 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
2375 *lenp = 2;
2376 }
2377
051a06d4 2378 return result;
1ca267a5
KW
2379
2380 return_long_s:
2381 /* Certain folds to 'ss' are prohibited by the options, but they do allow
2382 * folds to a string of two of these characters. By returning this
2383 * instead, then, e.g.,
2384 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
2385 * works. */
2386
2387 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2388 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2389 ustrp, *lenp, U8);
2390 return LATIN_SMALL_LETTER_LONG_S;
9fc2026f
KW
2391
2392 return_ligature_st:
2393 /* Two folds to 'st' are prohibited by the options; instead we pick one and
2394 * have the other one fold to it */
2395
2396 *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
2397 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
2398 return LATIN_SMALL_LIGATURE_ST;
74894415
KW
2399
2400#if UNICODE_MAJOR_VERSION == 3 \
2401 && UNICODE_DOT_VERSION == 0 \
2402 && UNICODE_DOT_DOT_VERSION == 1
2403
2404 return_dotless_i:
2405 *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
2406 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
2407 return LATIN_SMALL_LETTER_DOTLESS_I;
2408
2409#endif
2410
a0ed51b3
LW
2411}
2412
711a919c 2413/* Note:
f90a9a02 2414 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
711a919c
TS
2415 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
2416 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
2417 */
c4a5db0c 2418
a0ed51b3 2419SV*
7fc63493 2420Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 2421{
c4a5db0c
KW
2422 PERL_ARGS_ASSERT_SWASH_INIT;
2423
2424 /* Returns a copy of a swash initiated by the called function. This is the
2425 * public interface, and returning a copy prevents others from doing
2426 * mischief on the original */
2427
5d3d13d1 2428 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
c4a5db0c
KW
2429}
2430
2431SV*
5d3d13d1 2432Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
c4a5db0c 2433{
2c1f00b9
YO
2434
2435 /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
2436 * use the following define */
2437
2438#define CORE_SWASH_INIT_RETURN(x) \
2439 PL_curpm= old_PL_curpm; \
2440 return x
2441
c4a5db0c 2442 /* Initialize and return a swash, creating it if necessary. It does this
87367d5f
KW
2443 * by calling utf8_heavy.pl in the general case. The returned value may be
2444 * the swash's inversion list instead if the input parameters allow it.
2445 * Which is returned should be immaterial to callers, as the only
923b6d4e
KW
2446 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
2447 * and swash_to_invlist() handle both these transparently.
c4a5db0c
KW
2448 *
2449 * This interface should only be used by functions that won't destroy or
2450 * adversely change the swash, as doing so affects all other uses of the
2451 * swash in the program; the general public should use 'Perl_swash_init'
2452 * instead.
2453 *
2454 * pkg is the name of the package that <name> should be in.
2455 * name is the name of the swash to find. Typically it is a Unicode
2456 * property name, including user-defined ones
2457 * listsv is a string to initialize the swash with. It must be of the form
2458 * documented as the subroutine return value in
2459 * L<perlunicode/User-Defined Character Properties>
2460 * minbits is the number of bits required to represent each data element.
2461 * It is '1' for binary properties.
2462 * none I (khw) do not understand this one, but it is used only in tr///.
9a53f6cf 2463 * invlist is an inversion list to initialize the swash with (or NULL)
83199d38
KW
2464 * flags_p if non-NULL is the address of various input and output flag bits
2465 * to the routine, as follows: ('I' means is input to the routine;
2466 * 'O' means output from the routine. Only flags marked O are
2467 * meaningful on return.)
2468 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
2469 * came from a user-defined property. (I O)
5d3d13d1
KW
2470 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
2471 * when the swash cannot be located, to simply return NULL. (I)
87367d5f
KW
2472 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
2473 * return of an inversion list instead of a swash hash if this routine
2474 * thinks that would result in faster execution of swash_fetch() later
2475 * on. (I)
9a53f6cf
KW
2476 *
2477 * Thus there are three possible inputs to find the swash: <name>,
2478 * <listsv>, and <invlist>. At least one must be specified. The result
2479 * will be the union of the specified ones, although <listsv>'s various
aabbdbda
KW
2480 * actions can intersect, etc. what <name> gives. To avoid going out to
2481 * disk at all, <invlist> should specify completely what the swash should
2482 * have, and <listsv> should be &PL_sv_undef and <name> should be "".
9a53f6cf
KW
2483 *
2484 * <invlist> is only valid for binary properties */
c4a5db0c 2485
2c1f00b9
YO
2486 PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
2487
c4a5db0c 2488 SV* retval = &PL_sv_undef;
83199d38 2489 HV* swash_hv = NULL;
87367d5f
KW
2490 const int invlist_swash_boundary =
2491 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
2492 ? 512 /* Based on some benchmarking, but not extensive, see commit
2493 message */
2494 : -1; /* Never return just an inversion list */
9a53f6cf
KW
2495
2496 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
2497 assert(! invlist || minbits == 1);
2498
2c1f00b9
YO
2499 PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
2500 that triggered the swash init and the swash init perl logic itself.
2501 See perl #122747 */
2502
9a53f6cf
KW
2503 /* If data was passed in to go out to utf8_heavy to find the swash of, do
2504 * so */
2505 if (listsv != &PL_sv_undef || strNE(name, "")) {
69794297
KW
2506 dSP;
2507 const size_t pkg_len = strlen(pkg);
2508 const size_t name_len = strlen(name);
2509 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
2510 SV* errsv_save;
2511 GV *method;
2512
2513 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
2514
2515 PUSHSTACKi(PERLSI_MAGIC);
ce3b816e 2516 ENTER;
69794297 2517 SAVEHINTS();
2782061f 2518 save_re_context();
650f067c
JL
2519 /* We might get here via a subroutine signature which uses a utf8
2520 * parameter name, at which point PL_subname will have been set
2521 * but not yet used. */
2522 save_item(PL_subname);
69794297
KW
2523 if (PL_parser && PL_parser->error_count)
2524 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
2525 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
4a4088c4 2526 if (!method) { /* demand load UTF-8 */
69794297 2527 ENTER;
db2c6cb3
FC
2528 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
2529 GvSV(PL_errgv) = NULL;
1a419e6b 2530#ifndef NO_TAINT_SUPPORT
69794297
KW
2531 /* It is assumed that callers of this routine are not passing in
2532 * any user derived data. */
2782061f
DM
2533 /* Need to do this after save_re_context() as it will set
2534 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
2535 * in Perl_magic_get). Even line to create errsv_save can turn on
2536 * PL_tainted. */
284167a5
S
2537 SAVEBOOL(TAINT_get);
2538 TAINT_NOT;
2539#endif
69794297
KW
2540 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
2541 NULL);
eed484f9 2542 {
db2c6cb3
FC
2543 /* Not ERRSV, as there is no need to vivify a scalar we are
2544 about to discard. */
2545 SV * const errsv = GvSV(PL_errgv);
2546 if (!SvTRUE(errsv)) {
2547 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
2548 SvREFCNT_dec(errsv);
2549 }
eed484f9 2550 }
69794297
KW
2551 LEAVE;
2552 }
2553 SPAGAIN;
2554 PUSHMARK(SP);
2555 EXTEND(SP,5);
2556 mPUSHp(pkg, pkg_len);
2557 mPUSHp(name, name_len);
2558 PUSHs(listsv);
2559 mPUSHi(minbits);
2560 mPUSHi(none);
2561 PUTBACK;
db2c6cb3
FC
2562 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
2563 GvSV(PL_errgv) = NULL;
69794297
KW
2564 /* If we already have a pointer to the method, no need to use
2565 * call_method() to repeat the lookup. */
c41800a8
KW
2566 if (method
2567 ? call_sv(MUTABLE_SV(method), G_SCALAR)
69794297
KW
2568 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
2569 {
2570 retval = *PL_stack_sp--;
2571 SvREFCNT_inc(retval);
2572 }
eed484f9 2573 {
db2c6cb3
FC
2574 /* Not ERRSV. See above. */
2575 SV * const errsv = GvSV(PL_errgv);
2576 if (!SvTRUE(errsv)) {
2577 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
2578 SvREFCNT_dec(errsv);
2579 }
eed484f9 2580 }
ce3b816e 2581 LEAVE;
69794297
KW
2582 POPSTACK;
2583 if (IN_PERL_COMPILETIME) {
2584 CopHINTS_set(PL_curcop, PL_hints);
2585 }
2586 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
2587 if (SvPOK(retval))
2588
2589 /* If caller wants to handle missing properties, let them */
5d3d13d1 2590 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
2c1f00b9 2591 CORE_SWASH_INIT_RETURN(NULL);
69794297
KW
2592 }
2593 Perl_croak(aTHX_
2594 "Can't find Unicode property definition \"%"SVf"\"",
2595 SVfARG(retval));
a25b5927 2596 NOT_REACHED; /* NOTREACHED */
69794297 2597 }
9a53f6cf 2598 } /* End of calling the module to find the swash */
36eb48b4 2599
83199d38
KW
2600 /* If this operation fetched a swash, and we will need it later, get it */
2601 if (retval != &PL_sv_undef
2602 && (minbits == 1 || (flags_p
2603 && ! (*flags_p
2604 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
2605 {
2606 swash_hv = MUTABLE_HV(SvRV(retval));
2607
2608 /* If we don't already know that there is a user-defined component to
2609 * this swash, and the user has indicated they wish to know if there is
2610 * one (by passing <flags_p>), find out */
2611 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
2612 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
2613 if (user_defined && SvUV(*user_defined)) {
2614 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
2615 }
2616 }
2617 }
2618
36eb48b4
KW
2619 /* Make sure there is an inversion list for binary properties */
2620 if (minbits == 1) {
2621 SV** swash_invlistsvp = NULL;
2622 SV* swash_invlist = NULL;
9a53f6cf 2623 bool invlist_in_swash_is_valid = FALSE;
02c85471
FC
2624 bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
2625 an unclaimed reference count */
36eb48b4 2626
9a53f6cf 2627 /* If this operation fetched a swash, get its already existing
83199d38 2628 * inversion list, or create one for it */
36eb48b4 2629
83199d38 2630 if (swash_hv) {
5c9f4bd2 2631 swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
9a53f6cf
KW
2632 if (swash_invlistsvp) {
2633 swash_invlist = *swash_invlistsvp;
2634 invlist_in_swash_is_valid = TRUE;
2635 }
2636 else {
36eb48b4 2637 swash_invlist = _swash_to_invlist(retval);
02c85471 2638 swash_invlist_unclaimed = TRUE;
9a53f6cf
KW
2639 }
2640 }
2641
2642 /* If an inversion list was passed in, have to include it */
2643 if (invlist) {
2644
2645 /* Any fetched swash will by now have an inversion list in it;
2646 * otherwise <swash_invlist> will be NULL, indicating that we
2647 * didn't fetch a swash */
2648 if (swash_invlist) {
2649
2650 /* Add the passed-in inversion list, which invalidates the one
2651 * already stored in the swash */
2652 invlist_in_swash_is_valid = FALSE;
2653 _invlist_union(invlist, swash_invlist, &swash_invlist);
2654 }
2655 else {
2656
87367d5f
KW
2657 /* Here, there is no swash already. Set up a minimal one, if
2658 * we are going to return a swash */
2659 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
971d486f 2660 swash_hv = newHV();
4aca0fe6 2661 retval = newRV_noinc(MUTABLE_SV(swash_hv));
87367d5f 2662 }
9a53f6cf
KW
2663 swash_invlist = invlist;
2664 }
9a53f6cf
KW
2665 }
2666
2667 /* Here, we have computed the union of all the passed-in data. It may
2668 * be that there was an inversion list in the swash which didn't get
538e84ed 2669 * touched; otherwise save the computed one */
87367d5f
KW
2670 if (! invlist_in_swash_is_valid
2671 && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
2672 {
5c9f4bd2 2673 if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
69794297
KW
2674 {
2675 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
2676 }
cc34d8c5
FC
2677 /* We just stole a reference count. */
2678 if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
2679 else SvREFCNT_inc_simple_void_NN(swash_invlist);
9a53f6cf 2680 }
87367d5f 2681
dbfdbd26
KW
2682 SvREADONLY_on(swash_invlist);
2683
c41800a8 2684 /* Use the inversion list stand-alone if small enough */
87367d5f
KW
2685 if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
2686 SvREFCNT_dec(retval);
02c85471
FC
2687 if (!swash_invlist_unclaimed)
2688 SvREFCNT_inc_simple_void_NN(swash_invlist);
2689 retval = newRV_noinc(swash_invlist);
87367d5f 2690 }
36eb48b4
KW
2691 }
2692
2c1f00b9
YO
2693 CORE_SWASH_INIT_RETURN(retval);
2694#undef CORE_SWASH_INIT_RETURN
a0ed51b3
LW
2695}
2696
035d37be
JH
2697
2698/* This API is wrong for special case conversions since we may need to
2699 * return several Unicode characters for a single Unicode character
2700 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
2701 * the lower-level routine, and it is similarly broken for returning
38684baa
KW
2702 * multiple values. --jhi
2703 * For those, you should use to_utf8_case() instead */
b0e3252e 2704/* Now SWASHGET is recasted into S_swatch_get in this file. */
680c470c
TS
2705
2706/* Note:
2707 * Returns the value of property/mapping C<swash> for the first character
2708 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
4a4088c4 2709 * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
3d0f8846 2710 * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
af2af982
KW
2711 *
2712 * A "swash" is a hash which contains initially the keys/values set up by
2713 * SWASHNEW. The purpose is to be able to completely represent a Unicode
2714 * property for all possible code points. Things are stored in a compact form
2715 * (see utf8_heavy.pl) so that calculation is required to find the actual
2716 * property value for a given code point. As code points are looked up, new
2717 * key/value pairs are added to the hash, so that the calculation doesn't have
2718 * to ever be re-done. Further, each calculation is done, not just for the
2719 * desired one, but for a whole block of code points adjacent to that one.
2720 * For binary properties on ASCII machines, the block is usually for 64 code
2721 * points, starting with a code point evenly divisible by 64. Thus if the
2722 * property value for code point 257 is requested, the code goes out and
2723 * calculates the property values for all 64 code points between 256 and 319,
2724 * and stores these as a single 64-bit long bit vector, called a "swatch",
2725 * under the key for code point 256. The key is the UTF-8 encoding for code
2726 * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
2727 * for a code point is 13 bytes, the key will be 12 bytes long. If the value
2728 * for code point 258 is then requested, this code realizes that it would be
2729 * stored under the key for 256, and would find that value and extract the
2730 * relevant bit, offset from 256.
2731 *
2732 * Non-binary properties are stored in as many bits as necessary to represent
2733 * their values (32 currently, though the code is more general than that), not
2734 * as single bits, but the principal is the same: the value for each key is a
2735 * vector that encompasses the property values for all code points whose UTF-8
2736 * representations are represented by the key. That is, for all code points
2737 * whose UTF-8 representations are length N bytes, and the key is the first N-1
2738 * bytes of that.
680c470c 2739 */
a0ed51b3 2740UV
680c470c 2741Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 2742{
ef8f7699 2743 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
2744 U32 klen;
2745 U32 off;
9b56a019 2746 STRLEN slen = 0;
7d85a32c 2747 STRLEN needents;
cfd0369c 2748 const U8 *tmps = NULL;
979f2922 2749 SV *swatch;
08fb1ac5 2750 const U8 c = *ptr;
3568d838 2751
7918f24d
NC
2752 PERL_ARGS_ASSERT_SWASH_FETCH;
2753
87367d5f
KW
2754 /* If it really isn't a hash, it isn't really swash; must be an inversion
2755 * list */
2756 if (SvTYPE(hv) != SVt_PVHV) {
2757 return _invlist_contains_cp((SV*)hv,
2758 (do_utf8)
2759 ? valid_utf8_to_uvchr(ptr, NULL)
2760 : c);
2761 }
2762
08fb1ac5
KW
2763 /* We store the values in a "swatch" which is a vec() value in a swash
2764 * hash. Code points 0-255 are a single vec() stored with key length
2765 * (klen) 0. All other code points have a UTF-8 representation
2766 * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
2767 * share 0xAA..0xYY, which is the key in the hash to that vec. So the key
2768 * length for them is the length of the encoded char - 1. ptr[klen] is the
2769 * final byte in the sequence representing the character */
2770 if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
2771 klen = 0;
2772 needents = 256;
2773 off = c;
3568d838 2774 }
08fb1ac5
KW
2775 else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2776 klen = 0;
2777 needents = 256;
a62b247b 2778 off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
979f2922
TS
2779 }
2780 else {
08fb1ac5
KW
2781 klen = UTF8SKIP(ptr) - 1;
2782
2783 /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
2784 * the vec is the final byte in the sequence. (In EBCDIC this is
2785 * converted to I8 to get consecutive values.) To help you visualize
2786 * all this:
2787 * Straight 1047 After final byte
2788 * UTF-8 UTF-EBCDIC I8 transform
2789 * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
2790 * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
2791 * ...
2792 * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
2793 * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
2794 * ...
2795 * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
2796 * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
2797 * ...
2798 * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
2799 * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
2800 * ...
2801 * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
2802 * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
2803 *
2804 * (There are no discontinuities in the elided (...) entries.)
2805 * The UTF-8 key for these 33 code points is '\xD0' (which also is the
2806 * key for the next 31, up through U+043F, whose UTF-8 final byte is
2807 * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
2808 * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
2809 * index into the vec() swatch (after subtracting 0x80, which we
2810 * actually do with an '&').
2811 * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
2812 * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
2813 * dicontinuities which go away by transforming it into I8, and we
2814 * effectively subtract 0xA0 to get the index. */
979f2922 2815 needents = (1 << UTF_ACCUMULATION_SHIFT);
bc3632a8 2816 off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
979f2922 2817 }
7d85a32c 2818
a0ed51b3 2819 /*
4a4088c4 2820 * This single-entry cache saves about 1/3 of the UTF-8 overhead in test
a0ed51b3
LW
2821 * suite. (That is, only 7-8% overall over just a hash cache. Still,
2822 * it's nothing to sniff at.) Pity we usually come through at least
2823 * two function calls to get here...
2824 *
2825 * NB: this code assumes that swatches are never modified, once generated!
2826 */
2827
3568d838 2828 if (hv == PL_last_swash_hv &&
a0ed51b3 2829 klen == PL_last_swash_klen &&
27da23d5 2830 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
2831 {
2832 tmps = PL_last_swash_tmps;
2833 slen = PL_last_swash_slen;
2834 }
2835 else {
2836 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 2837 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 2838
b0e3252e 2839 /* If not cached, generate it via swatch_get */
979f2922 2840 if (!svp || !SvPOK(*svp)
08fb1ac5
KW
2841 || !(tmps = (const U8*)SvPV_const(*svp, slen)))
2842 {
2843 if (klen) {
2844 const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
2845 swatch = swatch_get(swash,
2846 code_point & ~((UV)needents - 1),
2847 needents);
2848 }
2849 else { /* For the first 256 code points, the swatch has a key of
2850 length 0 */
2851 swatch = swatch_get(swash, 0, needents);
2852 }
979f2922 2853
923e4eb5 2854 if (IN_PERL_COMPILETIME)
623e6609 2855 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 2856
979f2922 2857 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 2858
979f2922
TS
2859 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
2860 || (slen << 3) < needents)
5637ef5b
NC
2861 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
2862 "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
2863 svp, tmps, (UV)slen, (UV)needents);
a0ed51b3
LW
2864 }
2865
2866 PL_last_swash_hv = hv;
16d8f38a 2867 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 2868 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
2869 /* FIXME change interpvar.h? */
2870 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
2871 PL_last_swash_slen = slen;
2872 if (klen)
2873 Copy(ptr, PL_last_swash_key, klen, U8);
2874 }
2875
9faf8d75 2876 switch ((int)((slen << 3) / needents)) {
a0ed51b3 2877 case 1:
e7aca353 2878 return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
a0ed51b3 2879 case 8:
e7aca353 2880 return ((UV) tmps[off]);
a0ed51b3
LW
2881 case 16:
2882 off <<= 1;
e7aca353
JH
2883 return
2884 ((UV) tmps[off ] << 8) +
2885 ((UV) tmps[off + 1]);
a0ed51b3
LW
2886 case 32:
2887 off <<= 2;
e7aca353
JH
2888 return
2889 ((UV) tmps[off ] << 24) +
2890 ((UV) tmps[off + 1] << 16) +
2891 ((UV) tmps[off + 2] << 8) +
2892 ((UV) tmps[off + 3]);
a0ed51b3 2893 }
5637ef5b
NC
2894 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
2895 "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
670f1322 2896 NORETURN_FUNCTION_END;
a0ed51b3 2897}
2b9d42f0 2898
319009ee
KW
2899/* Read a single line of the main body of the swash input text. These are of
2900 * the form:
2901 * 0053 0056 0073
2902 * where each number is hex. The first two numbers form the minimum and
2903 * maximum of a range, and the third is the value associated with the range.
2904 * Not all swashes should have a third number
2905 *
2906 * On input: l points to the beginning of the line to be examined; it points
2907 * to somewhere in the string of the whole input text, and is
2908 * terminated by a \n or the null string terminator.
2909 * lend points to the null terminator of that string
2910 * wants_value is non-zero if the swash expects a third number
2911 * typestr is the name of the swash's mapping, like 'ToLower'
2912 * On output: *min, *max, and *val are set to the values read from the line.
2913 * returns a pointer just beyond the line examined. If there was no
2914 * valid min number on the line, returns lend+1
2915 */
2916
2917STATIC U8*
2918S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
2919 const bool wants_value, const U8* const typestr)
2920{
2921 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
2922 STRLEN numlen; /* Length of the number */
02470786
KW
2923 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
2924 | PERL_SCAN_DISALLOW_PREFIX
2925 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
2926
2927 /* nl points to the next \n in the scan */
2928 U8* const nl = (U8*)memchr(l, '\n', lend - l);
2929
95543e92
KW
2930 PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
2931
319009ee
KW
2932 /* Get the first number on the line: the range minimum */
2933 numlen = lend - l;
2934 *min = grok_hex((char *)l, &numlen, &flags, NULL);
c88850db 2935 *max = *min; /* So can never return without setting max */
319009ee
KW
2936 if (numlen) /* If found a hex number, position past it */
2937 l += numlen;
2938 else if (nl) { /* Else, go handle next line, if any */
2939 return nl + 1; /* 1 is length of "\n" */
2940 }
2941 else { /* Else, no next line */
2942 return lend + 1; /* to LIST's end at which \n is not found */
2943 }
2944
2945 /* The max range value follows, separated by a BLANK */
2946 if (isBLANK(*l)) {
2947 ++l;
02470786
KW
2948 flags = PERL_SCAN_SILENT_ILLDIGIT
2949 | PERL_SCAN_DISALLOW_PREFIX
2950 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
2951 numlen = lend - l;
2952 *max = grok_hex((char *)l, &numlen, &flags, NULL);
2953 if (numlen)
2954 l += numlen;
2955 else /* If no value here, it is a single element range */
2956 *max = *min;
2957
2958 /* Non-binary tables have a third entry: what the first element of the
24303724 2959 * range maps to. The map for those currently read here is in hex */
319009ee
KW
2960 if (wants_value) {
2961 if (isBLANK(*l)) {
2962 ++l;
f2a7d0fc
KW
2963 flags = PERL_SCAN_SILENT_ILLDIGIT
2964 | PERL_SCAN_DISALLOW_PREFIX
2965 | PERL_SCAN_SILENT_NON_PORTABLE;
2966 numlen = lend - l;
2967 *val = grok_hex((char *)l, &numlen, &flags, NULL);
2968 if (numlen)
2969 l += numlen;
2970 else
2971 *val = 0;
319009ee
KW
2972 }
2973 else {
2974 *val = 0;
2975 if (typeto) {
dcbac5bb 2976 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
2977 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
2978 typestr, l);
2979 }
2980 }
2981 }
2982 else
2983 *val = 0; /* bits == 1, then any val should be ignored */
2984 }
2985 else { /* Nothing following range min, should be single element with no
2986 mapping expected */
319009ee
KW
2987 if (wants_value) {
2988 *val = 0;
2989 if (typeto) {
dcbac5bb 2990 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
2991 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
2992 }
2993 }
2994 else
2995 *val = 0; /* bits == 1, then val should be ignored */
2996 }
2997
2998 /* Position to next line if any, or EOF */
2999 if (nl)
3000 l = nl + 1;
3001 else
3002 l = lend;
3003
3004 return l;
3005}
3006
979f2922
TS
3007/* Note:
3008 * Returns a swatch (a bit vector string) for a code point sequence
3009 * that starts from the value C<start> and comprises the number C<span>.
3010 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3011 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
3012 */
3013STATIC SV*
b0e3252e 3014S_swatch_get(pTHX_ SV* swash, UV start, UV span)
979f2922
TS
3015{
3016 SV *swatch;
77f9f126 3017 U8 *l, *lend, *x, *xend, *s, *send;
979f2922 3018 STRLEN lcur, xcur, scur;
ef8f7699 3019 HV *const hv = MUTABLE_HV(SvRV(swash));
5c9f4bd2 3020 SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
36eb48b4 3021
88d45d28
KW
3022 SV** listsvp = NULL; /* The string containing the main body of the table */
3023 SV** extssvp = NULL;
3024 SV** invert_it_svp = NULL;
3025 U8* typestr = NULL;
786861f5
KW
3026 STRLEN bits;
3027 STRLEN octets; /* if bits == 1, then octets == 0 */
3028 UV none;
3029 UV end = start + span;
972dd592 3030
36eb48b4 3031 if (invlistsvp == NULL) {
786861f5
KW
3032 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3033 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3034 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3035 extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3036 listsvp = hv_fetchs(hv, "LIST", FALSE);
3037 invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3038
3039 bits = SvUV(*bitssvp);
3040 none = SvUV(*nonesvp);
3041 typestr = (U8*)SvPV_nolen(*typesvp);
3042 }
36eb48b4
KW
3043 else {
3044 bits = 1;
3045 none = 0;
3046 }
786861f5 3047 octets = bits >> 3; /* if bits == 1, then octets == 0 */
979f2922 3048
b0e3252e 3049 PERL_ARGS_ASSERT_SWATCH_GET;
7918f24d 3050
979f2922 3051 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
b0e3252e 3052 Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
660a4616 3053 (UV)bits);
979f2922
TS
3054 }
3055
84ea5ef6
KW
3056 /* If overflowed, use the max possible */
3057 if (end < start) {
3058 end = UV_MAX;
3059 span = end - start;
3060 }
3061
979f2922 3062 /* create and initialize $swatch */
979f2922 3063 scur = octets ? (span * octets) : (span + 7) / 8;
e524fe40
NC
3064 swatch = newSV(scur);
3065 SvPOK_on(swatch);
979f2922
TS
3066 s = (U8*)SvPVX(swatch);
3067 if (octets && none) {
0bd48802 3068 const U8* const e = s + scur;
979f2922
TS
3069 while (s < e) {
3070 if (bits == 8)
3071 *s++ = (U8)(none & 0xff);
3072 else if (bits == 16) {
3073 *s++ = (U8)((none >> 8) & 0xff);
3074 *s++ = (U8)( none & 0xff);
3075 }
3076 else if (bits == 32) {
3077 *s++ = (U8)((none >> 24) & 0xff);
3078 *s++ = (U8)((none >> 16) & 0xff);
3079 *s++ = (U8)((none >> 8) & 0xff);
3080 *s++ = (U8)( none & 0xff);
3081 }
3082 }
3083 *s = '\0';
3084 }
3085 else {
3086 (void)memzero((U8*)s, scur + 1);
3087 }
3088 SvCUR_set(swatch, scur);
3089 s = (U8*)SvPVX(swatch);
3090
36eb48b4
KW
3091 if (invlistsvp) { /* If has an inversion list set up use that */
3092 _invlist_populate_swatch(*invlistsvp, start, end, s);
3093 return swatch;
3094 }
3095
3096 /* read $swash->{LIST} */
979f2922
TS
3097 l = (U8*)SvPV(*listsvp, lcur);
3098 lend = l + lcur;
3099 while (l < lend) {
8ed25d53 3100 UV min, max, val, upper;
95543e92
KW
3101 l = swash_scan_list_line(l, lend, &min, &max, &val,
3102 cBOOL(octets), typestr);
319009ee 3103 if (l > lend) {
979f2922
TS
3104 break;
3105 }
3106
972dd592 3107 /* If looking for something beyond this range, go try the next one */
979f2922
TS
3108 if (max < start)
3109 continue;
3110
8ed25d53
KW
3111 /* <end> is generally 1 beyond where we want to set things, but at the
3112 * platform's infinity, where we can't go any higher, we want to
3113 * include the code point at <end> */
3114 upper = (max < end)
3115 ? max
3116 : (max != UV_MAX || end != UV_MAX)
3117 ? end - 1
3118 : end;
3119
979f2922 3120 if (octets) {
35da51f7 3121 UV key;
979f2922
TS
3122 if (min < start) {
3123 if (!none || val < none) {
3124 val += start - min;
3125 }
3126 min = start;
3127 }
8ed25d53 3128 for (key = min; key <= upper; key++) {
979f2922 3129 STRLEN offset;
979f2922
TS
3130 /* offset must be non-negative (start <= min <= key < end) */
3131 offset = octets * (key - start);
3132 if (bits == 8)
3133 s[offset] = (U8)(val & 0xff);
3134 else if (bits == 16) {
3135 s[offset ] = (U8)((val >> 8) & 0xff);
3136 s[offset + 1] = (U8)( val & 0xff);
3137 }
3138 else if (bits == 32) {
3139 s[offset ] = (U8)((val >> 24) & 0xff);
3140 s[offset + 1] = (U8)((val >> 16) & 0xff);
3141 s[offset + 2] = (U8)((val >> 8) & 0xff);
3142 s[offset + 3] = (U8)( val & 0xff);
3143 }
3144
3145 if (!none || val < none)
3146 ++val;
3147 }
3148 }
711a919c 3149 else { /* bits == 1, then val should be ignored */
35da51f7 3150 UV key;
979f2922
TS
3151 if (min < start)
3152 min = start;
6cb05c12 3153
8ed25d53 3154 for (key = min; key <= upper; key++) {
0bd48802 3155 const STRLEN offset = (STRLEN)(key - start);
979f2922
TS
3156 s[offset >> 3] |= 1 << (offset & 7);
3157 }
3158 }
3159 } /* while */
979f2922 3160
9479a769 3161 /* Invert if the data says it should be. Assumes that bits == 1 */
77f9f126 3162 if (invert_it_svp && SvUV(*invert_it_svp)) {
0bda3001
KW
3163
3164 /* Unicode properties should come with all bits above PERL_UNICODE_MAX
3165 * be 0, and their inversion should also be 0, as we don't succeed any
3166 * Unicode property matches for non-Unicode code points */
3167 if (start <= PERL_UNICODE_MAX) {
3168
3169 /* The code below assumes that we never cross the
3170 * Unicode/above-Unicode boundary in a range, as otherwise we would
3171 * have to figure out where to stop flipping the bits. Since this
3172 * boundary is divisible by a large power of 2, and swatches comes
3173 * in small powers of 2, this should be a valid assumption */
3174 assert(start + span - 1 <= PERL_UNICODE_MAX);
3175
507a8485
KW
3176 send = s + scur;
3177 while (s < send) {
3178 *s = ~(*s);
3179 s++;
3180 }
0bda3001 3181 }
77f9f126
KW
3182 }
3183
d73c39c5
KW
3184 /* read $swash->{EXTRAS}
3185 * This code also copied to swash_to_invlist() below */
979f2922
TS
3186 x = (U8*)SvPV(*extssvp, xcur);
3187 xend = x + xcur;
3188 while (x < xend) {
3189 STRLEN namelen;
3190 U8 *namestr;
3191 SV** othersvp;
3192 HV* otherhv;
3193 STRLEN otherbits;
3194 SV **otherbitssvp, *other;
711a919c 3195 U8 *s, *o, *nl;
979f2922
TS
3196 STRLEN slen, olen;
3197
35da51f7 3198 const U8 opc = *x++;
979f2922
TS
3199 if (opc == '\n')
3200 continue;
3201
3202 nl = (U8*)memchr(x, '\n', xend - x);
3203
3204 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3205 if (nl) {
3206 x = nl + 1; /* 1 is length of "\n" */
3207 continue;
3208 }
3209 else {
3210 x = xend; /* to EXTRAS' end at which \n is not found */
3211 break;
3212 }
3213 }
3214
3215 namestr = x;
3216 if (nl) {
3217 namelen = nl - namestr;
3218 x = nl + 1;
3219 }
3220 else {
3221 namelen = xend - namestr;
3222 x = xend;
3223 }
3224
3225 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
ef8f7699 3226 otherhv = MUTABLE_HV(SvRV(*othersvp));
017a3ce5 3227 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922
TS
3228 otherbits = (STRLEN)SvUV(*otherbitssvp);
3229 if (bits < otherbits)
5637ef5b
NC
3230 Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
3231 "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
979f2922
TS
3232
3233 /* The "other" swatch must be destroyed after. */
b0e3252e 3234 other = swatch_get(*othersvp, start, span);
979f2922
TS
3235 o = (U8*)SvPV(other, olen);
3236
3237 if (!olen)
b0e3252e 3238 Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
979f2922
TS
3239
3240 s = (U8*)SvPV(swatch, slen);
3241 if (bits == 1 && otherbits == 1) {
3242 if (slen != olen)
5637ef5b
NC
3243 Perl_croak(aTHX_ "panic: swatch_get found swatch length "
3244 "mismatch, slen=%"UVuf", olen=%"UVuf,
3245 (UV)slen, (UV)olen);
979f2922
TS
3246
3247 switch (opc) {
3248 case '+':
3249 while (slen--)
3250 *s++ |= *o++;
3251 break;
3252 case '!':
3253 while (slen--)
3254 *s++ |= ~*o++;
3255 break;
3256 case '-':
3257 while (slen--)
3258 *s++ &= ~*o++;
3259 break;
3260 case '&':
3261 while (slen--)
3262 *s++ &= *o++;
3263 break;
3264 default:
3265 break;
3266 }
3267 }
711a919c 3268 else {
979f2922
TS
3269 STRLEN otheroctets = otherbits >> 3;
3270 STRLEN offset = 0;
35da51f7 3271 U8* const send = s + slen;
979f2922
TS
3272
3273 while (s < send) {
3274 UV otherval = 0;
3275
3276 if (otherbits == 1) {
3277 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
3278 ++offset;
3279 }
3280 else {
3281 STRLEN vlen = otheroctets;
3282 otherval = *o++;
3283 while (--vlen) {
3284 otherval <<= 8;
3285 otherval |= *o++;
3286 }
3287 }
3288
711a919c 3289 if (opc == '+' && otherval)
6f207bd3 3290 NOOP; /* replace with otherval */
979f2922
TS
3291 else if (opc == '!' && !otherval)
3292 otherval = 1;
3293 else if (opc == '-' && otherval)
3294 otherval = 0;
3295 else if (opc == '&' && !otherval)
3296 otherval = 0;
3297 else {
711a919c 3298 s += octets; /* no replacement */
979f2922
TS
3299 continue;
3300 }
3301
3302 if (bits == 8)
3303 *s++ = (U8)( otherval & 0xff);
3304 else if (bits == 16) {
3305 *s++ = (U8)((otherval >> 8) & 0xff);
3306 *s++ = (U8)( otherval & 0xff);
3307 }
3308 else if (bits == 32) {
3309 *s++ = (U8)((otherval >> 24) & 0xff);
3310 *s++ = (U8)((otherval >> 16) & 0xff);
3311 *s++ = (U8)((otherval >> 8) & 0xff);
3312 *s++ = (U8)( otherval & 0xff);
3313 }
3314 }
3315 }
3316 sv_free(other); /* through with it! */
3317 } /* while */
3318 return swatch;
3319}
3320
064c021d 3321HV*
4c2e1131 3322Perl__swash_inversion_hash(pTHX_ SV* const swash)
064c021d
KW
3323{
3324
79a2a0e8 3325 /* Subject to change or removal. For use only in regcomp.c and regexec.c
5662e334
KW
3326 * Can't be used on a property that is subject to user override, as it
3327 * relies on the value of SPECIALS in the swash which would be set by
3328 * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
3329 * for overridden properties
064c021d
KW
3330 *
3331 * Returns a hash which is the inversion and closure of a swash mapping.
3332 * For example, consider the input lines:
3333 * 004B 006B
3334 * 004C 006C
3335 * 212A 006B
3336 *
4a4088c4 3337 * The returned hash would have two keys, the UTF-8 for 006B and the UTF-8 for
064c021d 3338 * 006C. The value for each key is an array. For 006C, the array would
4a4088c4
KW
3339 * have two elements, the UTF-8 for itself, and for 004C. For 006B, there
3340 * would be three elements in its array, the UTF-8 for 006B, 004B and 212A.
064c021d 3341 *
538e84ed
KW
3342 * Note that there are no elements in the hash for 004B, 004C, 212A. The
3343 * keys are only code points that are folded-to, so it isn't a full closure.
3344 *
064c021d
KW
3345 * Essentially, for any code point, it gives all the code points that map to
3346 * it, or the list of 'froms' for that point.
3347 *
5662e334
KW
3348 * Currently it ignores any additions or deletions from other swashes,
3349 * looking at just the main body of the swash, and if there are SPECIALS
3350 * in the swash, at that hash
3351 *
3352 * The specials hash can be extra code points, and most likely consists of
3353 * maps from single code points to multiple ones (each expressed as a string
4a4088c4 3354 * of UTF-8 characters). This function currently returns only 1-1 mappings.
5662e334
KW
3355 * However consider this possible input in the specials hash:
3356 * "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074
3357 * "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074
3358 *
3359 * Both FB05 and FB06 map to the same multi-char sequence, which we don't
3360 * currently handle. But it also means that FB05 and FB06 are equivalent in
3361 * a 1-1 mapping which we should handle, and this relationship may not be in
3362 * the main table. Therefore this function examines all the multi-char
74894415
KW
3363 * sequences and adds the 1-1 mappings that come out of that.
3364 *
3365 * XXX This function was originally intended to be multipurpose, but its
3366 * only use is quite likely to remain for constructing the inversion of
3367 * the CaseFolding (//i) property. If it were more general purpose for
3368 * regex patterns, it would have to do the FB05/FB06 game for simple folds,
3369 * because certain folds are prohibited under /iaa and /il. As an example,
3370 * in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both
3371 * equivalent under /i. But under /iaa and /il, the folds to 'i' are
3372 * prohibited, so we would not figure out that they fold to each other.
3373 * Code could be written to automatically figure this out, similar to the
3374 * code that does this for multi-character folds, but this is the only case
3375 * where something like this is ever likely to happen, as all the single
3376 * char folds to The 0-255 range are now quite settled. Instead there is a
3377 * little special code that is compiled only for this Unicode version. This
3378 * is smaller and didn't require much coding time to do. But this makes
3379 * this routine strongly tied to being used just for CaseFolding. If ever
3380 * it should be generalized, this would have to be fixed */
064c021d
KW
3381
3382 U8 *l, *lend;
3383 STRLEN lcur;
3384 HV *const hv = MUTABLE_HV(SvRV(swash));
3385
923b6d4e
KW
3386 /* The string containing the main body of the table. This will have its
3387 * assertion fail if the swash has been converted to its inversion list */
064c021d
KW
3388 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
3389
3390 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3391 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3392 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3393 /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
3394 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
3395 const STRLEN bits = SvUV(*bitssvp);
3396 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
3397 const UV none = SvUV(*nonesvp);
5662e334 3398 SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
064c021d
KW
3399
3400 HV* ret = newHV();
3401
3402 PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
3403
3404 /* Must have at least 8 bits to get the mappings */
3405 if (bits != 8 && bits != 16 && bits != 32) {
3406 Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
3407 (UV)bits);
3408 }
3409
5662e334
KW
3410 if (specials_p) { /* It might be "special" (sometimes, but not always, a
3411 mapping to more than one character */
3412
3413 /* Construct an inverse mapping hash for the specials */
3414 HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
3415 HV * specials_inverse = newHV();
3416 char *char_from; /* the lhs of the map */
3417 I32 from_len; /* its byte length */
3418 char *char_to; /* the rhs of the map */
3419 I32 to_len; /* its byte length */
3420 SV *sv_to; /* and in a sv */
3421 AV* from_list; /* list of things that map to each 'to' */
3422
3423 hv_iterinit(specials_hv);
3424
4a4088c4
KW
3425 /* The keys are the characters (in UTF-8) that map to the corresponding
3426 * UTF-8 string value. Iterate through the list creating the inverse
5662e334
KW
3427 * list. */
3428 while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
3429 SV** listp;
3430 if (! SvPOK(sv_to)) {
5637ef5b
NC
3431 Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
3432 "unexpectedly is not a string, flags=%lu",
3433 (unsigned long)SvFLAGS(sv_to));
5662e334 3434 }
4b88fb76 3435 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
5662e334
KW
3436
3437 /* Each key in the inverse list is a mapped-to value, and the key's
4a4088c4 3438 * hash value is a list of the strings (each in UTF-8) that map to
5662e334
KW
3439 * it. Those strings are all one character long */
3440 if ((listp = hv_fetch(specials_inverse,
3441 SvPVX(sv_to),
3442 SvCUR(sv_to), 0)))
3443 {
3444 from_list = (AV*) *listp;
3445 }
3446 else { /* No entry yet for it: create one */
3447 from_list = newAV();
3448 if (! hv_store(specials_inverse,
3449 SvPVX(sv_to),
3450 SvCUR(sv_to),
3451 (SV*) from_list, 0))
3452 {
3453 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3454 }
3455 }
3456
3457 /* Here have the list associated with this 'to' (perhaps newly
3458 * created and empty). Just add to it. Note that we ASSUME that
3459 * the input is guaranteed to not have duplications, so we don't
3460 * check for that. Duplications just slow down execution time. */
3461 av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
3462 }
3463
3464 /* Here, 'specials_inverse' contains the inverse mapping. Go through
3465 * it looking for cases like the FB05/FB06 examples above. There would
3466 * be an entry in the hash like
3467 * 'st' => [ FB05, FB06 ]
3468 * In this example we will create two lists that get stored in the
3469 * returned hash, 'ret':
3470 * FB05 => [ FB05, FB06 ]
3471 * FB06 => [ FB05, FB06 ]
3472 *
3473 * Note that there is nothing to do if the array only has one element.
3474 * (In the normal 1-1 case handled below, we don't have to worry about
3475 * two lists, as everything gets tied to the single list that is
3476 * generated for the single character 'to'. But here, we are omitting
3477 * that list, ('st' in the example), so must have multiple lists.) */
3478 while ((from_list = (AV *) hv_iternextsv(specials_inverse,
3479 &char_to, &to_len)))
3480 {
b9f2b683 3481 if (av_tindex(from_list) > 0) {
c70927a6 3482 SSize_t i;
5662e334
KW
3483
3484 /* We iterate over all combinations of i,j to place each code
3485 * point on each list */
b9f2b683 3486 for (i = 0; i <= av_tindex(from_list); i++) {
c70927a6 3487 SSize_t j;
5662e334
KW
3488 AV* i_list = newAV();
3489 SV** entryp = av_fetch(from_list, i, FALSE);
3490 if (entryp == NULL) {
3491 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3492 }
3493 if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
3494 Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
3495 }
3496 if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
3497 (SV*) i_list, FALSE))
3498 {
3499 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3500 }
3501
538e84ed 3502 /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
b9f2b683 3503 for (j = 0; j <= av_tindex(from_list); j++) {
5662e334
KW
3504 entryp = av_fetch(from_list, j, FALSE);
3505 if (entryp == NULL) {
3506 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3507 }
3508
3509 /* When i==j this adds itself to the list */
4b88fb76
KW
3510 av_push(i_list, newSVuv(utf8_to_uvchr_buf(
3511 (U8*) SvPVX(*entryp),
3512 (U8*) SvPVX(*entryp) + SvCUR(*entryp),
3513 0)));
4637d003 3514 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
5662e334
KW
3515 }
3516 }
3517 }
3518 }
3519 SvREFCNT_dec(specials_inverse); /* done with it */
3520 } /* End of specials */
3521
064c021d 3522 /* read $swash->{LIST} */
74894415
KW
3523
3524#if UNICODE_MAJOR_VERSION == 3 \
3525 && UNICODE_DOT_VERSION == 0 \
3526 && UNICODE_DOT_DOT_VERSION == 1
3527
3528 /* For this version only U+130 and U+131 are equivalent under qr//i. Add a
3529 * rule so that things work under /iaa and /il */
3530
3531 SV * mod_listsv = sv_mortalcopy(*listsvp);
3532 sv_catpv(mod_listsv, "130\t130\t131\n");
3533 l = (U8*)SvPV(mod_listsv, lcur);
3534
3535#else
3536
064c021d 3537 l = (U8*)SvPV(*listsvp, lcur);
74894415
KW
3538
3539#endif
3540
064c021d
KW
3541 lend = l + lcur;
3542
3543 /* Go through each input line */
3544 while (l < lend) {
3545 UV min, max, val;
3546 UV inverse;
95543e92
KW
3547 l = swash_scan_list_line(l, lend, &min, &max, &val,
3548 cBOOL(octets), typestr);
064c021d
KW
3549 if (l > lend) {
3550 break;
3551 }
3552
3553 /* Each element in the range is to be inverted */
3554 for (inverse = min; inverse <= max; inverse++) {
3555 AV* list;
064c021d
KW
3556 SV** listp;
3557 IV i;
3558 bool found_key = FALSE;
5662e334 3559 bool found_inverse = FALSE;
064c021d
KW
3560
3561 /* The key is the inverse mapping */
3562 char key[UTF8_MAXBYTES+1];
c80e42f3 3563 char* key_end = (char *) uvchr_to_utf8((U8*) key, val);
064c021d
KW
3564 STRLEN key_len = key_end - key;
3565
064c021d
KW
3566 /* Get the list for the map */
3567 if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
3568 list = (AV*) *listp;
3569 }
3570 else { /* No entry yet for it: create one */
3571 list = newAV();
3572 if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
3573 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3574 }
3575 }
3576
5662e334
KW
3577 /* Look through list to see if this inverse mapping already is
3578 * listed, or if there is a mapping to itself already */
b9f2b683 3579 for (i = 0; i <= av_tindex(list); i++) {
064c021d
KW
3580 SV** entryp = av_fetch(list, i, FALSE);
3581 SV* entry;
414db8a1 3582 UV uv;
064c021d
KW
3583 if (entryp == NULL) {
3584 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3585 }
3586 entry = *entryp;
414db8a1
DM
3587 uv = SvUV(entry);
3588 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, uv));*/
3589 if (uv == val) {
064c021d 3590 found_key = TRUE;
5662e334 3591 }
414db8a1 3592 if (uv == inverse) {
5662e334
KW
3593 found_inverse = TRUE;
3594 }
3595
3596 /* No need to continue searching if found everything we are
3597 * looking for */
3598 if (found_key && found_inverse) {
064c021d
KW
3599 break;
3600 }
3601 }
56ca34ca
KW
3602
3603 /* Make sure there is a mapping to itself on the list */
064c021d 3604 if (! found_key) {
d397ff6a 3605 av_push(list, newSVuv(val));
4637d003 3606 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, val, val));*/
064c021d
KW
3607 }
3608
3609
3610 /* Simply add the value to the list */
5662e334
KW
3611 if (! found_inverse) {
3612 av_push(list, newSVuv(inverse));
4637d003 3613 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, inverse, val));*/
5662e334 3614 }
064c021d 3615
b0e3252e 3616 /* swatch_get() increments the value of val for each element in the
064c021d
KW
3617 * range. That makes more compact tables possible. You can
3618 * express the capitalization, for example, of all consecutive
3619 * letters with a single line: 0061\t007A\t0041 This maps 0061 to
3620 * 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
bd3f2f94 3621 * and it's not documented; it appears to be used only in
b0e3252e 3622 * implementing tr//; I copied the semantics from swatch_get(), just
bd3f2f94 3623 * in case */
064c021d
KW
3624 if (!none || val < none) {
3625 ++val;
3626 }
3627 }
3628 }