This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8n_to_uvchr(): Make a parameter const
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
a0ed51b3 35
806547a7 36static const char malformed_text[] = "Malformed UTF-8 character";
27da23d5 37static const char unees[] =
806547a7 38 "Malformed UTF-8 character (unexpected end of string)";
760c7c2f 39static const char cp_above_legal_max[] =
2d212e86 40 "Use of code point 0x%"UVXf" is deprecated; the permissible max is 0x%"UVXf"";
760c7c2f 41
114d9c4d 42#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
901b21bf 43
48ef279e 44/*
ccfc67b7 45=head1 Unicode Support
7fefc6c1 46These are various utility functions for manipulating UTF8-encoded
72d33970 47strings. For the uninitiated, this is a method of representing arbitrary
61296642 48Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
49characters in the ASCII range are unmodified, and a zero byte never appears
50within non-zero characters.
166f8a29 51
eaf7a4d2
CS
52=cut
53*/
54
55/*
378516de 56=for apidoc uvoffuni_to_utf8_flags
eebe1485 57
a27992cc 58THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af
KW
59Instead, B<Almost all code should use L</uvchr_to_utf8> or
60L</uvchr_to_utf8_flags>>.
a27992cc 61
de69f3af
KW
62This function is like them, but the input is a strict Unicode
63(as opposed to native) code point. Only in very rare circumstances should code
64not be using the native code point.
949cf498 65
efa9cd84 66For details, see the description for L</uvchr_to_utf8_flags>.
949cf498 67
eebe1485
SC
68=cut
69*/
70
8ee1cdcb
KW
71#define HANDLE_UNICODE_SURROGATE(uv, flags) \
72 STMT_START { \
73 if (flags & UNICODE_WARN_SURROGATE) { \
74 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
75 "UTF-16 surrogate U+%04"UVXf, uv); \
76 } \
77 if (flags & UNICODE_DISALLOW_SURROGATE) { \
78 return NULL; \
79 } \
80 } STMT_END;
81
82#define HANDLE_UNICODE_NONCHAR(uv, flags) \
83 STMT_START { \
84 if (flags & UNICODE_WARN_NONCHAR) { \
85 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
86 "Unicode non-character U+%04"UVXf" is not " \
87 "recommended for open interchange", uv); \
88 } \
89 if (flags & UNICODE_DISALLOW_NONCHAR) { \
90 return NULL; \
91 } \
92 } STMT_END;
93
ba6ed43c
KW
94/* Use shorter names internally in this file */
95#define SHIFT UTF_ACCUMULATION_SHIFT
96#undef MARK
97#define MARK UTF_CONTINUATION_MARK
98#define MASK UTF_CONTINUATION_MASK
99
dfe13c55 100U8 *
378516de 101Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 102{
378516de 103 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
7918f24d 104
2d1545e5 105 if (OFFUNI_IS_INVARIANT(uv)) {
4c8cd605 106 *d++ = LATIN1_TO_NATIVE(uv);
d9432125
KW
107 return d;
108 }
facc1dc2 109
3ea68d71 110 if (uv <= MAX_UTF8_TWO_BYTE) {
facc1dc2
KW
111 *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
112 *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
3ea68d71
KW
113 return d;
114 }
d9432125 115
ba6ed43c
KW
116 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
117 * below, the 16 is for start bytes E0-EF (which are all the possible ones
118 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
119 * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
120 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
121 * 0x800-0xFFFF on ASCII */
122 if (uv < (16 * (1U << (2 * SHIFT)))) {
123 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
124 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
125 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
126
127#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
128 aren't tested here */
129 /* The most likely code points in this range are below the surrogates.
130 * Do an extra test to quickly exclude those. */
131 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
132 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
133 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
134 {
8ee1cdcb
KW
135 HANDLE_UNICODE_NONCHAR(uv, flags);
136 }
137 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
138 HANDLE_UNICODE_SURROGATE(uv, flags);
760c7c2f 139 }
ba6ed43c
KW
140 }
141#endif
142 return d;
143 }
144
145 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
146 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
147 * happen starting with 4-byte characters on ASCII platforms. We unify the
148 * code for these with EBCDIC, even though some of them require 5-bytes on
149 * those, because khw believes the code saving is worth the very slight
150 * performance hit on these high EBCDIC code points. */
151
152 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
a5bf80e0
KW
153 if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
154 && ckWARN_d(WARN_DEPRECATED))
155 {
156 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
157 cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
158 }
159 if ( (flags & UNICODE_WARN_SUPER)
160 || ( UNICODE_IS_ABOVE_31_BIT(uv)
161 && (flags & UNICODE_WARN_ABOVE_31_BIT)))
162 {
163 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
164
165 /* Choose the more dire applicable warning */
166 (UNICODE_IS_ABOVE_31_BIT(uv))
167 ? "Code point 0x%"UVXf" is not Unicode, and not portable"
168 : "Code point 0x%"UVXf" is not Unicode, may not be portable",
169 uv);
170 }
171 if (flags & UNICODE_DISALLOW_SUPER
172 || ( UNICODE_IS_ABOVE_31_BIT(uv)
173 && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
174 {
175 return NULL;
176 }
177 }
ba6ed43c
KW
178 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
179 HANDLE_UNICODE_NONCHAR(uv, flags);
507b9800 180 }
d9432125 181
ba6ed43c
KW
182 /* Test for and handle 4-byte result. In the test immediately below, the
183 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
184 * characters). The 3 is for 3 continuation bytes; these each contribute
185 * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
186 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
187 * 0x1_0000-0x1F_FFFF on ASCII */
188 if (uv < (8 * (1U << (3 * SHIFT)))) {
189 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
190 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
191 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
192 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
193
194#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
195 characters. The end-plane non-characters for EBCDIC were
196 handled just above */
197 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
198 HANDLE_UNICODE_NONCHAR(uv, flags);
d528804a 199 }
ba6ed43c
KW
200 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
201 HANDLE_UNICODE_SURROGATE(uv, flags);
202 }
203#endif
204
205 return d;
206 }
207
208 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
209 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
210 * format. The unrolled version above turns out to not save all that much
211 * time, and at these high code points (well above the legal Unicode range
212 * on ASCII platforms, and well above anything in common use in EBCDIC),
213 * khw believes that less code outweighs slight performance gains. */
214
d9432125 215 {
5aaebcb3 216 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
217 U8 *p = d+len-1;
218 while (p > d) {
4c8cd605 219 *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
220 uv >>= UTF_ACCUMULATION_SHIFT;
221 }
4c8cd605 222 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
223 return d+len;
224 }
a0ed51b3 225}
a5bf80e0 226
646ca15d 227/*
07693fe6
KW
228=for apidoc uvchr_to_utf8
229
bcb1a2d4 230Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 231of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
232C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
233the byte after the end of the new character. In other words,
07693fe6
KW
234
235 d = uvchr_to_utf8(d, uv);
236
237is the recommended wide native character-aware way of saying
238
239 *(d++) = uv;
240
760c7c2f
KW
241This function accepts any UV as input, but very high code points (above
242C<IV_MAX> on the platform) will raise a deprecation warning. This is
243typically 0x7FFF_FFFF in a 32-bit word.
244
245It is possible to forbid or warn on non-Unicode code points, or those that may
246be problematic by using L</uvchr_to_utf8_flags>.
de69f3af 247
07693fe6
KW
248=cut
249*/
250
de69f3af
KW
251/* This is also a macro */
252PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
253
07693fe6
KW
254U8 *
255Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
256{
de69f3af 257 return uvchr_to_utf8(d, uv);
07693fe6
KW
258}
259
de69f3af
KW
260/*
261=for apidoc uvchr_to_utf8_flags
262
263Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 264of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
265C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
266the byte after the end of the new character. In other words,
de69f3af
KW
267
268 d = uvchr_to_utf8_flags(d, uv, flags);
269
270or, in most cases,
271
272 d = uvchr_to_utf8_flags(d, uv, 0);
273
274This is the Unicode-aware way of saying
275
276 *(d++) = uv;
277
760c7c2f
KW
278If C<flags> is 0, this function accepts any UV as input, but very high code
279points (above C<IV_MAX> for the platform) will raise a deprecation warning.
280This is typically 0x7FFF_FFFF in a 32-bit word.
281
282Specifying C<flags> can further restrict what is allowed and not warned on, as
283follows:
de69f3af 284
796b6530 285If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
7ee537e6
KW
286the function will raise a warning, provided UTF8 warnings are enabled. If
287instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
288NULL. If both flags are set, the function will both warn and return NULL.
de69f3af 289
760c7c2f
KW
290Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
291affect how the function handles a Unicode non-character.
93e6dbd6 292
760c7c2f
KW
293And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
294affect the handling of code points that are above the Unicode maximum of
2950x10FFFF. Languages other than Perl may not be able to accept files that
296contain these.
93e6dbd6
KW
297
298The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
299the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
ecc1615f
KW
300three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
301allowed inputs to the strict UTF-8 traditionally defined by Unicode.
302Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
303C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
304above-Unicode and surrogate flags, but not the non-character ones, as
305defined in
306L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
307See L<perlunicode/Noncharacter code points>.
93e6dbd6 308
ab8e6d41
KW
309Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
310so using them is more problematic than other above-Unicode code points. Perl
311invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
312likely that non-Perl languages will not be able to read files that contain
313these that written by the perl interpreter; nor would Perl understand files
314written by something that uses a different extension. For these reasons, there
315is a separate set of flags that can warn and/or disallow these extremely high
316code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
317C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags. These
318are entirely independent from the deprecation warning for code points above
319C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
320code point that needs more than 31 bits to represent. When that happens,
321effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
32232-bit machines. (Of course C<UNICODE_DISALLOW_SUPER> will treat all
ab8e6d41
KW
323above-Unicode code points, including these, as malformations; and
324C<UNICODE_WARN_SUPER> warns on these.)
325
326On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
327extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
328than on ASCII. Prior to that, code points 2**31 and higher were simply
329unrepresentable, and a different, incompatible method was used to represent
330code points between 2**30 and 2**31 - 1. The flags C<UNICODE_WARN_ABOVE_31_BIT>
331and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
332platforms, warning and disallowing 2**31 and higher.
de69f3af 333
de69f3af
KW
334=cut
335*/
336
337/* This is also a macro */
338PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
339
07693fe6
KW
340U8 *
341Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
342{
de69f3af 343 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
344}
345
83dc0f42
KW
346PERL_STATIC_INLINE bool
347S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
348{
349 /* Returns TRUE if the first code point represented by the Perl-extended-
350 * UTF-8-encoded string starting at 's', and looking no further than 'e -
351 * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
352 *
353 * The function handles the case where the input bytes do not include all
354 * the ones necessary to represent a full character. That is, they may be
355 * the intial bytes of the representation of a code point, but possibly
356 * the final ones necessary for the complete representation may be beyond
357 * 'e - 1'.
358 *
359 * The function assumes that the sequence is well-formed UTF-8 as far as it
360 * goes, and is for a UTF-8 variant code point. If the sequence is
361 * incomplete, the function returns FALSE if there is any well-formed
362 * UTF-8 byte sequence that can complete it in such a way that a code point
363 * < 2**31 is produced; otherwise it returns TRUE.
364 *
365 * Getting this exactly right is slightly tricky, and has to be done in
366 * several places in this file, so is centralized here. It is based on the
367 * following table:
368 *
369 * U+7FFFFFFF (2 ** 31 - 1)
370 * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
371 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
372 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
373 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
374 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
375 * U+80000000 (2 ** 31):
376 * ASCII: \xFE\x82\x80\x80\x80\x80\x80
377 * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
378 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
379 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
380 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
381 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
382 */
383
384#ifdef EBCDIC
385
b343c774
KW
386 /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
387 const U8 * const prefix = (U8 *) "\x41\x41\x41\x41\x41\x41\x42";
83dc0f42
KW
388 const STRLEN prefix_len = sizeof(prefix) - 1;
389 const STRLEN len = e - s;
f880f78a 390 const STRLEN cmp_len = MIN(prefix_len, len - 1);
83dc0f42
KW
391
392#else
393
394 PERL_UNUSED_ARG(e);
395
396#endif
397
398 PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
399
400 assert(! UTF8_IS_INVARIANT(*s));
401
402#ifndef EBCDIC
403
404 /* Technically, a start byte of FE can be for a code point that fits into
405 * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
406 * malformation. */
407 return (*s >= 0xFE);
408
409#else
410
411 /* On the EBCDIC code pages we handle, only 0xFE can mean a 32-bit or
412 * larger code point (0xFF is an invariant). For 0xFE, we need at least 2
413 * bytes, and maybe up through 8 bytes, to be sure if the value is above 31
414 * bits. */
415 if (*s != 0xFE || len == 1) {
416 return FALSE;
417 }
418
419 /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
420 * \x41 and \x42. */
421 return cBOOL(memGT(s + 1, prefix, cmp_len));
422
423#endif
424
425}
426
12a4bed3
KW
427PERL_STATIC_INLINE bool
428S_does_utf8_overflow(const U8 * const s, const U8 * e)
429{
430 const U8 *x;
431 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
432
433 /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
434 * platform, that is if it represents a code point larger than the highest
435 * representable code point. (For ASCII platforms, we could use memcmp()
436 * because we don't have to convert each byte to I8, but it's very rare
437 * input indeed that would approach overflow, so the loop below will likely
438 * only get executed once.
439 *
440 * 'e' must not be beyond a full character. If it is less than a full
441 * character, the function returns FALSE if there is any input beyond 'e'
442 * that could result in a non-overflowing code point */
443
444 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
445 assert(s + UTF8SKIP(s) >= e);
446
447 for (x = s; x < e; x++, y++) {
448
449 /* If this byte is larger than the corresponding highest UTF-8 byte, it
450 * overflows */
451 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
452 return TRUE;
453 }
454
455 /* If not the same as this byte, it must be smaller, doesn't overflow */
456 if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
457 return FALSE;
458 }
459 }
460
461 /* Got to the end and all bytes are the same. If the input is a whole
462 * character, it doesn't overflow. And if it is a partial character,
463 * there's not enough information to tell, so assume doesn't overflow */
464 return FALSE;
465}
466
467PERL_STATIC_INLINE bool
468S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
469{
470 /* Overlongs can occur whenever the number of continuation bytes
471 * changes. That means whenever the number of leading 1 bits in a start
472 * byte increases from the next lower start byte. That happens for start
473 * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following
474 * illegal start bytes have already been excluded, so don't need to be
475 * tested here;
476 * ASCII platforms: C0, C1
477 * EBCDIC platforms C0, C1, C2, C3, C4, E0
478 *
479 * At least a second byte is required to determine if other sequences will
480 * be an overlong. */
481
482 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
483 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
484
485 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
486 assert(len > 1 && UTF8_IS_START(*s));
487
488 /* Each platform has overlongs after the start bytes given above (expressed
489 * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
490 * the logic is the same, except the E0 overlong has already been excluded
491 * on EBCDIC platforms. The values below were found by manually
492 * inspecting the UTF-8 patterns. See the tables in utf8.h and
493 * utfebcdic.h. */
494
495# ifdef EBCDIC
496# define F0_ABOVE_OVERLONG 0xB0
497# define F8_ABOVE_OVERLONG 0xA8
498# define FC_ABOVE_OVERLONG 0xA4
499# define FE_ABOVE_OVERLONG 0xA2
500# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
501 /* I8(0xfe) is FF */
502# else
503
504 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
505 return TRUE;
506 }
507
508# define F0_ABOVE_OVERLONG 0x90
509# define F8_ABOVE_OVERLONG 0x88
510# define FC_ABOVE_OVERLONG 0x84
511# define FE_ABOVE_OVERLONG 0x82
512# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
513# endif
514
515
516 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
517 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
518 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
519 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
520 {
521 return TRUE;
522 }
523
524# if defined(UV_IS_QUAD) || defined(EBCDIC)
525
526 /* Check for the FF overlong. This happens only if all these bytes match;
527 * what comes after them doesn't matter. See tables in utf8.h,
528 * utfebcdic.h. (Can't happen on ASCII 32-bit platforms, as overflows
529 * instead.) */
530
531 if ( len >= sizeof(FF_OVERLONG_PREFIX) - 1
532 && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
533 sizeof(FF_OVERLONG_PREFIX) - 1)))
534 {
535 return TRUE;
536 }
537
538#endif
539
540 return FALSE;
541}
542
543#undef F0_ABOVE_OVERLONG
544#undef F8_ABOVE_OVERLONG
545#undef FC_ABOVE_OVERLONG
546#undef FE_ABOVE_OVERLONG
547#undef FF_OVERLONG_PREFIX
548
35f8c9bd 549STRLEN
edc2c47a 550Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
35f8c9bd 551{
2b479609 552 STRLEN len;
12a4bed3 553 const U8 *x;
35f8c9bd 554
2b479609
KW
555 /* A helper function that should not be called directly.
556 *
557 * This function returns non-zero if the string beginning at 's' and
558 * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
559 * code point; otherwise it returns 0. The examination stops after the
560 * first code point in 's' is validated, not looking at the rest of the
561 * input. If 'e' is such that there are not enough bytes to represent a
562 * complete code point, this function will return non-zero anyway, if the
563 * bytes it does have are well-formed UTF-8 as far as they go, and aren't
564 * excluded by 'flags'.
565 *
566 * A non-zero return gives the number of bytes required to represent the
567 * code point. Be aware that if the input is for a partial character, the
568 * return will be larger than 'e - s'.
569 *
570 * This function assumes that the code point represented is UTF-8 variant.
571 * The caller should have excluded this possibility before calling this
572 * function.
573 *
574 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
575 * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
576 * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
577 * disallowed by the flags. If the input is only for a partial character,
578 * the function will return non-zero if there is any sequence of
579 * well-formed UTF-8 that, when appended to the input sequence, could
580 * result in an allowed code point; otherwise it returns 0. Non characters
581 * cannot be determined based on partial character input. But many of the
582 * other excluded types can be determined with just the first one or two
583 * bytes.
584 *
585 */
586
587 PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
588
589 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
590 |UTF8_DISALLOW_ABOVE_31_BIT)));
591 assert(! UTF8_IS_INVARIANT(*s));
35f8c9bd 592
2b479609 593 /* A variant char must begin with a start byte */
35f8c9bd
KW
594 if (UNLIKELY(! UTF8_IS_START(*s))) {
595 return 0;
596 }
597
edc2c47a
KW
598 /* Examine a maximum of a single whole code point */
599 if (e - s > UTF8SKIP(s)) {
600 e = s + UTF8SKIP(s);
601 }
602
2b479609
KW
603 len = e - s;
604
605 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
606 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
35f8c9bd 607
2b479609
KW
608 /* The code below is derived from this table. Keep in mind that legal
609 * continuation bytes range between \x80..\xBF for UTF-8, and
610 * \xA0..\xBF for I8. Anything above those aren't continuation bytes.
611 * Hence, we don't have to test the upper edge because if any of those
612 * are encountered, the sequence is malformed, and will fail elsewhere
613 * in this function.
614 * UTF-8 UTF-EBCDIC I8
615 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
616 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
617 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
618 *
619 */
620
621#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
622# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
19794540 623# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
2b479609 624
19794540
KW
625# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
626 /* B6 and B7 */ \
627 && ((s1) & 0xFE ) == 0xB6)
2b479609
KW
628#else
629# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
19794540
KW
630# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
631# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
2b479609
KW
632#endif
633
634 if ( (flags & UTF8_DISALLOW_SUPER)
635 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) {
636 return 0; /* Above Unicode */
637 }
638
639 if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT)
640 && UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
641 {
642 return 0; /* Above 31 bits */
643 }
644
645 if (len > 1) {
646 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
647
648 if ( (flags & UTF8_DISALLOW_SUPER)
19794540 649 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
2b479609
KW
650 {
651 return 0; /* Above Unicode */
652 }
653
654 if ( (flags & UTF8_DISALLOW_SURROGATE)
19794540 655 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
2b479609
KW
656 {
657 return 0; /* Surrogate */
658 }
659
660 if ( (flags & UTF8_DISALLOW_NONCHAR)
661 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
662 {
663 return 0; /* Noncharacter code point */
664 }
665 }
666 }
667
668 /* Make sure that all that follows are continuation bytes */
35f8c9bd
KW
669 for (x = s + 1; x < e; x++) {
670 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
671 return 0;
672 }
673 }
674
af13dd8a 675 /* Here is syntactically valid. Next, make sure this isn't the start of an
12a4bed3
KW
676 * overlong. */
677 if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
678 return 0;
af13dd8a
KW
679 }
680
12a4bed3
KW
681 /* And finally, that the code point represented fits in a word on this
682 * platform */
683 if (does_utf8_overflow(s, e)) {
684 return 0;
35f8c9bd
KW
685 }
686
2b479609 687 return UTF8SKIP(s);
35f8c9bd
KW
688}
689
7cf8d05d
KW
690STATIC char *
691S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
692{
693 /* Returns a mortalized C string that is a displayable copy of the 'len'
694 * bytes starting at 's', each in a \xXY format. */
695
696 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
697 trailing NUL */
698 const U8 * const e = s + len;
699 char * output;
700 char * d;
701
702 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
703
704 Newx(output, output_len, char);
705 SAVEFREEPV(output);
706
707 d = output;
708 for (; s < e; s++) {
709 const unsigned high_nibble = (*s & 0xF0) >> 4;
710 const unsigned low_nibble = (*s & 0x0F);
711
712 *d++ = '\\';
713 *d++ = 'x';
714
715 if (high_nibble < 10) {
716 *d++ = high_nibble + '0';
717 }
718 else {
719 *d++ = high_nibble - 10 + 'a';
720 }
721
722 if (low_nibble < 10) {
723 *d++ = low_nibble + '0';
724 }
725 else {
726 *d++ = low_nibble - 10 + 'a';
727 }
728 }
729
730 *d = '\0';
731 return output;
732}
733
806547a7 734PERL_STATIC_INLINE char *
7cf8d05d
KW
735S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
736
737 /* How many bytes to print */
738 const STRLEN print_len,
739
740 /* Which one is the non-continuation */
741 const STRLEN non_cont_byte_pos,
742
743 /* How many bytes should there be? */
744 const STRLEN expect_len)
806547a7
KW
745{
746 /* Return the malformation warning text for an unexpected continuation
747 * byte. */
748
7cf8d05d 749 const char * const where = (non_cont_byte_pos == 1)
806547a7 750 ? "immediately"
7cf8d05d
KW
751 : Perl_form(aTHX_ "%d bytes",
752 (int) non_cont_byte_pos);
806547a7
KW
753
754 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
755
7cf8d05d
KW
756 /* We don't need to pass this parameter, but since it has already been
757 * calculated, it's likely faster to pass it; verify under DEBUGGING */
758 assert(expect_len == UTF8SKIP(s));
759
760 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
761 " %s after start byte 0x%02x; need %d bytes, got %d)",
762 malformed_text,
763 _byte_dump_string(s, print_len),
764 *(s + non_cont_byte_pos),
765 where,
766 *s,
767 (int) expect_len,
768 (int) non_cont_byte_pos);
806547a7
KW
769}
770
35f8c9bd
KW
771/*
772
de69f3af 773=for apidoc utf8n_to_uvchr
378516de
KW
774
775THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af 776Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
67e989fb 777
9041c2e3 778Bottom level UTF-8 decode routine.
de69f3af 779Returns the native code point value of the first character in the string C<s>,
746afd53
KW
780which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
781C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
782the length, in bytes, of that character.
949cf498
KW
783
784The value of C<flags> determines the behavior when C<s> does not point to a
2b5e7bc2
KW
785well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
786causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
787is the next possible position in C<s> that could begin a non-malformed
788character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
789is raised. Some UTF-8 input sequences may contain multiple malformations.
790This function tries to find every possible one in each call, so multiple
791warnings can be raised for each sequence.
949cf498
KW
792
793Various ALLOW flags can be set in C<flags> to allow (and not warn on)
794individual types of malformations, such as the sequence being overlong (that
795is, when there is a shorter sequence that can express the same code point;
796overlong sequences are expressly forbidden in the UTF-8 standard due to
797potential security issues). Another malformation example is the first byte of
798a character not being a legal first byte. See F<utf8.h> for the list of such
524080c4
KW
799flags. For allowed 0 length strings, this function returns 0; for allowed
800overlong sequences, the computed code point is returned; for all other allowed
801malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
802determinable reasonable value.
949cf498 803
796b6530 804The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
805flags) malformation is found. If this flag is set, the routine assumes that
806the caller will raise a warning, and this function will silently just set
d088425d
KW
807C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
808
75200dff 809Note that this API requires disambiguation between successful decoding a C<NUL>
796b6530 810character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
111fa700
KW
811in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
812be set to 1. To disambiguate, upon a zero return, see if the first byte of
813C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
814error.
949cf498
KW
815
816Certain code points are considered problematic. These are Unicode surrogates,
746afd53 817Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 818By default these are considered regular code points, but certain situations
ecc1615f
KW
819warrant special handling for them, which can be specified using the C<flags>
820parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
821three classes are treated as malformations and handled as such. The flags
822C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
823C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
824disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
825restricts the allowed inputs to the strict UTF-8 traditionally defined by
826Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
827definition given by
828L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
829The difference between traditional strictness and C9 strictness is that the
830latter does not forbid non-character code points. (They are still discouraged,
831however.) For more discussion see L<perlunicode/Noncharacter code points>.
832
833The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
834C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
796b6530
KW
835C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
836raised for their respective categories, but otherwise the code points are
837considered valid (not malformations). To get a category to both be treated as
838a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 839(But note that warnings are not raised if lexically disabled nor if
796b6530 840C<UTF8_CHECK_ONLY> is also specified.)
949cf498 841
760c7c2f
KW
842It is now deprecated to have very high code points (above C<IV_MAX> on the
843platforms) and this function will raise a deprecation warning for these (unless
844such warnings are turned off). This value, is typically 0x7FFF_FFFF (2**31 -1)
845in a 32-bit word.
ab8e6d41
KW
846
847Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
848so using them is more problematic than other above-Unicode code points. Perl
849invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
850likely that non-Perl languages will not be able to read files that contain
851these that written by the perl interpreter; nor would Perl understand files
852written by something that uses a different extension. For these reasons, there
853is a separate set of flags that can warn and/or disallow these extremely high
854code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
855C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags. These
856are entirely independent from the deprecation warning for code points above
857C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
858code point that needs more than 31 bits to represent. When that happens,
859effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
86032-bit machines. (Of course C<UTF8_DISALLOW_SUPER> will treat all
ab8e6d41
KW
861above-Unicode code points, including these, as malformations; and
862C<UTF8_WARN_SUPER> warns on these.)
863
864On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
865extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
866than on ASCII. Prior to that, code points 2**31 and higher were simply
867unrepresentable, and a different, incompatible method was used to represent
868code points between 2**30 and 2**31 - 1. The flags C<UTF8_WARN_ABOVE_31_BIT>
869and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
870platforms, warning and disallowing 2**31 and higher.
949cf498
KW
871
872All other code points corresponding to Unicode characters, including private
873use and those yet to be assigned, are never considered malformed and never
874warn.
67e989fb 875
37607a96
PK
876=cut
877*/
67e989fb 878
a0ed51b3 879UV
3757e55d 880Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags)
a0ed51b3 881{
d4c19fe8 882 const U8 * const s0 = s;
2b5e7bc2
KW
883 U8 * send = NULL; /* (initialized to silence compilers' wrong
884 warning) */
885 U32 possible_problems = 0; /* A bit is set here for each potential problem
886 found as we go along */
eb83ed87 887 UV uv = *s;
2b5e7bc2
KW
888 STRLEN expectlen = 0; /* How long should this sequence be?
889 (initialized to silence compilers' wrong
890 warning) */
a0dbb045 891
2b5e7bc2
KW
892 /* The below are used only if there is both an overlong malformation and a
893 * too short one. Otherwise the first two are set to 's0' and 'send', and
894 * the third not used at all */
895 U8 * adjusted_s0 = (U8 *) s0;
896 U8 * adjusted_send;
897 UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
7918f24d 898
de69f3af 899 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
a0dbb045 900
eb83ed87
KW
901 /* The order of malformation tests here is important. We should consume as
902 * few bytes as possible in order to not skip any valid character. This is
903 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
904 * http://unicode.org/reports/tr36 for more discussion as to why. For
905 * example, once we've done a UTF8SKIP, we can tell the expected number of
906 * bytes, and could fail right off the bat if the input parameters indicate
907 * that there are too few available. But it could be that just that first
908 * byte is garbled, and the intended character occupies fewer bytes. If we
909 * blindly assumed that the first byte is correct, and skipped based on
910 * that number, we could skip over a valid input character. So instead, we
911 * always examine the sequence byte-by-byte.
912 *
913 * We also should not consume too few bytes, otherwise someone could inject
914 * things. For example, an input could be deliberately designed to
915 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 916 * returning to the caller C<*retlen> pointing to the very next byte (one
eb83ed87
KW
917 * which is actually part of of the overflowing sequence), that could look
918 * legitimate to the caller, which could discard the initial partial
2b5e7bc2
KW
919 * sequence and process the rest, inappropriately.
920 *
921 * Some possible input sequences are malformed in more than one way. This
922 * function goes to lengths to try to find all of them. This is necessary
923 * for correctness, as the inputs may allow one malformation but not
924 * another, and if we abandon searching for others after finding the
925 * allowed one, we could allow in something that shouldn't have been.
926 */
eb83ed87 927
b5b9af04 928 if (UNLIKELY(curlen == 0)) {
2b5e7bc2
KW
929 possible_problems |= UTF8_GOT_EMPTY;
930 curlen = 0;
931 uv = 0; /* XXX It could be argued that this should be
932 UNICODE_REPLACEMENT? */
933 goto ready_to_handle_errors;
0c443dc2
JH
934 }
935
eb83ed87
KW
936 expectlen = UTF8SKIP(s);
937
938 /* A well-formed UTF-8 character, as the vast majority of calls to this
939 * function will be for, has this expected length. For efficiency, set
940 * things up here to return it. It will be overriden only in those rare
941 * cases where a malformation is found */
942 if (retlen) {
943 *retlen = expectlen;
944 }
945
946 /* An invariant is trivially well-formed */
1d72bdf6 947 if (UTF8_IS_INVARIANT(uv)) {
de69f3af 948 return uv;
a0ed51b3 949 }
67e989fb 950
eb83ed87 951 /* A continuation character can't start a valid sequence */
b5b9af04 952 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
2b5e7bc2
KW
953 possible_problems |= UTF8_GOT_CONTINUATION;
954 curlen = 1;
955 uv = UNICODE_REPLACEMENT;
956 goto ready_to_handle_errors;
ba210ebe 957 }
9041c2e3 958
dcd27b3c
KW
959 /* Here is not a continuation byte, nor an invariant. The only thing left
960 * is a start byte (possibly for an overlong) */
961
534752c1
KW
962 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
963 * that indicate the number of bytes in the character's whole UTF-8
964 * sequence, leaving just the bits that are part of the value. */
965 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
ba210ebe 966
eb83ed87
KW
967 /* Now, loop through the remaining bytes in the character's sequence,
968 * accumulating each into the working value as we go. Be sure to not look
969 * past the end of the input string */
2b5e7bc2
KW
970 send = adjusted_send
971 = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
eb83ed87 972 for (s = s0 + 1; s < send; s++) {
b5b9af04 973 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
8850bf83 974 uv = UTF8_ACCUMULATE(uv, *s);
2b5e7bc2
KW
975 continue;
976 }
977
978 /* Here, found a non-continuation before processing all expected bytes.
979 * This byte indicates the beginning of a new character, so quit, even
980 * if allowing this malformation. */
981 curlen = s - s0; /* Save how many bytes we actually got */
982 possible_problems |= UTF8_GOT_NON_CONTINUATION;
983 goto finish_short;
eb83ed87
KW
984 } /* End of loop through the character's bytes */
985
986 /* Save how many bytes were actually in the character */
987 curlen = s - s0;
988
2b5e7bc2
KW
989 /* Did we get all the continuation bytes that were expected? Note that we
990 * know this result even without executing the loop above. But we had to
991 * do the loop to see if there are unexpected non-continuations. */
992 if (UNLIKELY(curlen < expectlen)) {
993 possible_problems |= UTF8_GOT_SHORT;
2f8f112e 994
2b5e7bc2
KW
995 finish_short:
996 uv_so_far = uv;
997 uv = UNICODE_REPLACEMENT;
eb83ed87
KW
998 }
999
2b5e7bc2
KW
1000 /* Note that there are two types of too-short malformation. One is when
1001 * there is actual wrong data before the normal termination of the
1002 * sequence. The other is that the sequence wasn't complete before the end
1003 * of the data we are allowed to look at, based on the input 'curlen'.
1004 * This means that we were passed data for a partial character, but it is
1005 * valid as far as we saw. The other is definitely invalid. This
1006 * distinction could be important to a caller, so the two types are kept
1007 * separate. */
1008
1009 /* Check for overflow */
1010 if (UNLIKELY(does_utf8_overflow(s0, send))) {
1011 possible_problems |= UTF8_GOT_OVERFLOW;
1012 uv = UNICODE_REPLACEMENT;
eb83ed87 1013 }
eb83ed87 1014
2b5e7bc2
KW
1015 /* Check for overlong. If no problems so far, 'uv' is the correct code
1016 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1017 * we must look at the UTF-8 byte sequence itself to see if it is for an
1018 * overlong */
1019 if ( ( LIKELY(! possible_problems)
1020 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1021 || ( UNLIKELY( possible_problems)
1022 && ( UNLIKELY(! UTF8_IS_START(*s0))
1023 || ( curlen > 1
1024 && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
1025 send - s0))))))
2f8f112e 1026 {
2b5e7bc2
KW
1027 possible_problems |= UTF8_GOT_LONG;
1028
1029 /* A convenience macro that matches either of the too-short conditions.
1030 * */
1031# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1032
1033 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1034 UV min_uv = uv_so_far;
1035 STRLEN i;
1036
1037 /* Here, the input is both overlong and is missing some trailing
1038 * bytes. There is no single code point it could be for, but there
1039 * may be enough information present to determine if what we have
1040 * so far is for an unallowed code point, such as for a surrogate.
1041 * The code below has the intelligence to determine this, but just
1042 * for non-overlong UTF-8 sequences. What we do here is calculate
1043 * the smallest code point the input could represent if there were
1044 * no too short malformation. Then we compute and save the UTF-8
1045 * for that, which is what the code below looks at instead of the
1046 * raw input. It turns out that the smallest such code point is
1047 * all we need. */
1048 for (i = curlen; i < expectlen; i++) {
1049 min_uv = UTF8_ACCUMULATE(min_uv,
1050 I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
1051 }
1052
1053 Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8);
1054 SAVEFREEPV((U8 *) adjusted_s0); /* Needed because we may not get
1055 to free it ourselves if
1056 warnings are made fatal */
1057 adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1058 }
eb83ed87
KW
1059 }
1060
2b5e7bc2
KW
1061 /* Now check that the input isn't for a problematic code point not allowed
1062 * by the input parameters. */
1063 /* isn't problematic if < this */
1064 if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
1065 || ( UNLIKELY(possible_problems)
1066 && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
760c7c2f
KW
1067 && ((flags & ( UTF8_DISALLOW_NONCHAR
1068 |UTF8_DISALLOW_SURROGATE
1069 |UTF8_DISALLOW_SUPER
1070 |UTF8_DISALLOW_ABOVE_31_BIT
1071 |UTF8_WARN_NONCHAR
1072 |UTF8_WARN_SURROGATE
1073 |UTF8_WARN_SUPER
1074 |UTF8_WARN_ABOVE_31_BIT))
2b5e7bc2
KW
1075 /* In case of a malformation, 'uv' is not valid, and has
1076 * been changed to something in the Unicode range.
1077 * Currently we don't output a deprecation message if there
1078 * is already a malformation, so we don't have to special
1079 * case the test immediately below */
760c7c2f
KW
1080 || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1081 && ckWARN_d(WARN_DEPRECATED))))
eb83ed87 1082 {
2b5e7bc2
KW
1083 /* If there were no malformations, or the only malformation is an
1084 * overlong, 'uv' is valid */
1085 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1086 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1087 possible_problems |= UTF8_GOT_SURROGATE;
1088 }
1089 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
1090 possible_problems |= UTF8_GOT_SUPER;
1091 }
1092 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1093 possible_problems |= UTF8_GOT_NONCHAR;
1094 }
1095 }
1096 else { /* Otherwise, need to look at the source UTF-8, possibly
1097 adjusted to be non-overlong */
1098
1099 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1100 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
ea5ced44 1101 {
2b5e7bc2
KW
1102 possible_problems |= UTF8_GOT_SUPER;
1103 }
1104 else if (curlen > 1) {
1105 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
1106 NATIVE_UTF8_TO_I8(*adjusted_s0),
1107 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
ea5ced44 1108 {
2b5e7bc2 1109 possible_problems |= UTF8_GOT_SUPER;
ea5ced44 1110 }
2b5e7bc2
KW
1111 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
1112 NATIVE_UTF8_TO_I8(*adjusted_s0),
1113 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1114 {
1115 possible_problems |= UTF8_GOT_SURROGATE;
ea5ced44
KW
1116 }
1117 }
c0236afe 1118
2b5e7bc2
KW
1119 /* We need a complete well-formed UTF-8 character to discern
1120 * non-characters, so can't look for them here */
1121 }
1122 }
949cf498 1123
2b5e7bc2
KW
1124 ready_to_handle_errors:
1125
1126 /* At this point:
1127 * curlen contains the number of bytes in the sequence that
1128 * this call should advance the input by.
1129 * possible_problems' is 0 if there weren't any problems; otherwise a bit
1130 * is set in it for each potential problem found.
1131 * uv contains the code point the input sequence
1132 * represents; or if there is a problem that prevents
1133 * a well-defined value from being computed, it is
1134 * some subsitute value, typically the REPLACEMENT
1135 * CHARACTER.
1136 * s0 points to the first byte of the character
1137 * send points to just after where that (potentially
1138 * partial) character ends
1139 * adjusted_s0 normally is the same as s0, but in case of an
1140 * overlong for which the UTF-8 matters below, it is
1141 * the first byte of the shortest form representation
1142 * of the input.
1143 * adjusted_send normally is the same as 'send', but if adjusted_s0
1144 * is set to something other than s0, this points one
1145 * beyond its end
1146 */
eb83ed87 1147
2b5e7bc2
KW
1148 if (UNLIKELY(possible_problems)) {
1149 bool disallowed = FALSE;
1150 const U32 orig_problems = possible_problems;
1151
1152 while (possible_problems) { /* Handle each possible problem */
1153 UV pack_warn = 0;
1154 char * message = NULL;
1155
1156 /* Each 'if' clause handles one problem. They are ordered so that
1157 * the first ones' messages will be displayed before the later
1158 * ones; this is kinda in decreasing severity order */
1159 if (possible_problems & UTF8_GOT_OVERFLOW) {
1160
1161 /* Overflow means also got a super and above 31 bits, but we
1162 * handle all three cases here */
1163 possible_problems
1164 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
1165
1166 disallowed = TRUE;
1167
1168 /* The warnings code explicitly says it doesn't handle the case
1169 * of packWARN2 and two categories which have parent-child
1170 * relationship. Even if it works now to raise the warning if
1171 * either is enabled, it wouldn't necessarily do so in the
1172 * future. We output (only) the most dire warning*/
1173 if (! (flags & UTF8_CHECK_ONLY)) {
1174 if (ckWARN_d(WARN_UTF8)) {
1175 pack_warn = packWARN(WARN_UTF8);
1176 }
1177 else if (ckWARN_d(WARN_NON_UNICODE)) {
1178 pack_warn = packWARN(WARN_NON_UNICODE);
1179 }
1180 if (pack_warn) {
1181 message = Perl_form(aTHX_ "%s: %s (overflows)",
1182 malformed_text,
1183 _byte_dump_string(s0, send - s0));
1184 }
1185 }
1186 }
1187 else if (possible_problems & UTF8_GOT_EMPTY) {
1188 possible_problems &= ~UTF8_GOT_EMPTY;
1189
1190 if (! (flags & UTF8_ALLOW_EMPTY)) {
1191 disallowed = TRUE;
1192 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1193 pack_warn = packWARN(WARN_UTF8);
1194 message = Perl_form(aTHX_ "%s (empty string)",
1195 malformed_text);
1196 }
1197 }
1198 }
1199 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1200 possible_problems &= ~UTF8_GOT_CONTINUATION;
1201
1202 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1203 disallowed = TRUE;
1204 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1205 pack_warn = packWARN(WARN_UTF8);
1206 message = Perl_form(aTHX_
1207 "%s: %s (unexpected continuation byte 0x%02x,"
1208 " with no preceding start byte)",
1209 malformed_text,
1210 _byte_dump_string(s0, 1), *s0);
1211 }
1212 }
1213 }
1214 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1215 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1216
1217 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1218 disallowed = TRUE;
1219 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1220 pack_warn = packWARN(WARN_UTF8);
1221 message = Perl_form(aTHX_ "%s",
1222 unexpected_non_continuation_text(s0,
1223 send - s0,
1224 s - s0,
1225 (int) expectlen));
1226 }
1227 }
1228 }
1229 else if (possible_problems & UTF8_GOT_SHORT) {
1230 possible_problems &= ~UTF8_GOT_SHORT;
1231
1232 if (! (flags & UTF8_ALLOW_SHORT)) {
1233 disallowed = TRUE;
1234 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1235 pack_warn = packWARN(WARN_UTF8);
1236 message = Perl_form(aTHX_
1237 "%s: %s (too short; got %d byte%s, need %d)",
1238 malformed_text,
1239 _byte_dump_string(s0, send - s0),
1240 (int)curlen,
1241 curlen == 1 ? "" : "s",
1242 (int)expectlen);
1243 }
1244 }
ba210ebe 1245
2b5e7bc2
KW
1246 }
1247 else if (possible_problems & UTF8_GOT_LONG) {
1248 possible_problems &= ~UTF8_GOT_LONG;
1249
1250 if (! (flags & UTF8_ALLOW_LONG)) {
1251 disallowed = TRUE;
1252
1253 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1254 pack_warn = packWARN(WARN_UTF8);
1255
1256 /* These error types cause 'uv' to be something that
1257 * isn't what was intended, so can't use it in the
1258 * message. The other error types either can't
1259 * generate an overlong, or else the 'uv' is valid */
1260 if (orig_problems &
1261 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1262 {
1263 message = Perl_form(aTHX_
1264 "%s: %s (any UTF-8 sequence that starts"
1265 " with \"%s\" is overlong which can and"
1266 " should be represented with a"
1267 " different, shorter sequence)",
1268 malformed_text,
1269 _byte_dump_string(s0, send - s0),
1270 _byte_dump_string(s0, curlen));
1271 }
1272 else {
1273 U8 tmpbuf[UTF8_MAXBYTES+1];
1274 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
1275 uv, 0);
1276 message = Perl_form(aTHX_
1277 "%s: %s (overlong; instead use %s to represent"
1278 " U+%0*"UVXf")",
1279 malformed_text,
1280 _byte_dump_string(s0, send - s0),
1281 _byte_dump_string(tmpbuf, e - tmpbuf),
1282 ((uv < 256) ? 2 : 4), /* Field width of 2 for
1283 small code points */
1284 uv);
1285 }
1286 }
1287 }
1288 }
1289 else if (possible_problems & UTF8_GOT_SURROGATE) {
1290 possible_problems &= ~UTF8_GOT_SURROGATE;
1291
1292 /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
1293 * generation of the format, since no warnings are raised under
1294 * CHECK */
1295 if ( (flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY))
1296 == UTF8_WARN_SURROGATE
1297 && ckWARN_d(WARN_SURROGATE))
1298 {
1299 pack_warn = packWARN(WARN_SURROGATE);
1300
1301 /* These are the only errors that can occur with a
1302 * surrogate when the 'uv' isn't valid */
1303 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1304 message = Perl_form(aTHX_
1305 "UTF-16 surrogate (any UTF-8 sequence that"
1306 " starts with \"%s\" is for a surrogate)",
1307 _byte_dump_string(s0, curlen));
1308 }
1309 else {
1310 message = Perl_form(aTHX_
1311 "UTF-16 surrogate U+%04"UVXf"", uv);
1312 }
1313 }
ba210ebe 1314
2b5e7bc2
KW
1315 if (flags & UTF8_DISALLOW_SURROGATE) {
1316 disallowed = TRUE;
1317 }
1318 }
1319 else if (possible_problems & UTF8_GOT_SUPER) {
1320 possible_problems &= ~UTF8_GOT_SUPER;
949cf498 1321
2b5e7bc2
KW
1322 if ( (flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY))
1323 == UTF8_WARN_SUPER
1324 && ckWARN_d(WARN_NON_UNICODE))
1325 {
1326 pack_warn = packWARN(WARN_NON_UNICODE);
1327
1328 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1329 message = Perl_form(aTHX_
1330 "Any UTF-8 sequence that starts with"
1331 " \"%s\" is for a non-Unicode code point,"
1332 " may not be portable",
1333 _byte_dump_string(s0, curlen));
1334 }
1335 else {
1336 message = Perl_form(aTHX_
1337 "Code point 0x%04"UVXf" is not"
1338 " Unicode, may not be portable",
1339 uv);
1340 }
1341 }
ba210ebe 1342
2b5e7bc2
KW
1343 /* The maximum code point ever specified by a standard was
1344 * 2**31 - 1. Anything larger than that is a Perl extension
1345 * that very well may not be understood by other applications
1346 * (including earlier perl versions on EBCDIC platforms). We
1347 * test for these after the regular SUPER ones, and before
1348 * possibly bailing out, so that the slightly more dire warning
1349 * will override the regular one. */
1350 if ( (flags & (UTF8_WARN_ABOVE_31_BIT
1351 |UTF8_WARN_SUPER
1352 |UTF8_DISALLOW_ABOVE_31_BIT))
1353 && ( ( UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
1354 && UNLIKELY(is_utf8_cp_above_31_bits(
1355 adjusted_s0,
1356 adjusted_send)))
1357 || ( LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
1358 && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv)))))
1359 {
1360 if ( ! (flags & UTF8_CHECK_ONLY)
1361 && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
1362 && ckWARN_d(WARN_UTF8))
1363 {
1364 pack_warn = packWARN(WARN_UTF8);
1365
1366 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1367 message = Perl_form(aTHX_
1368 "Any UTF-8 sequence that starts with"
1369 " \"%s\" is for a non-Unicode code"
1370 " point, and is not portable",
1371 _byte_dump_string(s0, curlen));
1372 }
1373 else {
1374 message = Perl_form(aTHX_
1375 "Code point 0x%"UVXf" is not Unicode,"
1376 " and not portable",
1377 uv);
1378 }
1379 }
1380
1381 if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
1382 disallowed = TRUE;
1383 }
1384 }
eb83ed87 1385
2b5e7bc2
KW
1386 if (flags & UTF8_DISALLOW_SUPER) {
1387 disallowed = TRUE;
1388 }
eb83ed87 1389
2b5e7bc2
KW
1390 /* The deprecated warning overrides any non-deprecated one. If
1391 * there are other problems, a deprecation message is not
1392 * really helpful, so don't bother to raise it in that case.
1393 * This also keeps the code from having to handle the case
1394 * where 'uv' is not valid. */
1395 if ( ! (orig_problems
1396 & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1397 && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1398 && ckWARN_d(WARN_DEPRECATED))
1399 {
1400 message = Perl_form(aTHX_ cp_above_legal_max,
1401 uv, MAX_NON_DEPRECATED_CP);
1402 pack_warn = packWARN(WARN_DEPRECATED);
1403 }
1404 }
1405 else if (possible_problems & UTF8_GOT_NONCHAR) {
1406 possible_problems &= ~UTF8_GOT_NONCHAR;
ba210ebe 1407
2b5e7bc2
KW
1408 if ( (flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY))
1409 == UTF8_WARN_NONCHAR
1410 && ckWARN_d(WARN_NONCHAR))
1411 {
1412 /* The code above should have guaranteed that we don't
1413 * get here with errors other than overlong */
1414 assert (! (orig_problems
1415 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
1416
1417 pack_warn = packWARN(WARN_NONCHAR);
1418 message = Perl_form(aTHX_ "Unicode non-character"
1419 " U+%04"UVXf" is not recommended"
1420 " for open interchange", uv);
1421 }
5b311467 1422
2b5e7bc2
KW
1423 if (flags & UTF8_DISALLOW_NONCHAR) {
1424 disallowed = TRUE;
1425 }
1426 } /* End of looking through the possible flags */
1427
1428 /* Display the message (if any) for the problem being handled in
1429 * this iteration of the loop */
1430 if (message) {
1431 if (PL_op)
1432 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
1433 OP_DESC(PL_op));
1434 else
1435 Perl_warner(aTHX_ pack_warn, "%s", message);
1436 }
1437 } /* End of 'while (possible_problems) {' */
a0dbb045 1438
2b5e7bc2
KW
1439 /* Since there was a possible problem, the returned length may need to
1440 * be changed from the one stored at the beginning of this function.
1441 * Instead of trying to figure out if that's needed, just do it. */
1442 if (retlen) {
1443 *retlen = curlen;
1444 }
a0dbb045 1445
2b5e7bc2
KW
1446 if (disallowed) {
1447 if (flags & UTF8_CHECK_ONLY && retlen) {
1448 *retlen = ((STRLEN) -1);
1449 }
1450 return 0;
1451 }
eb83ed87 1452 }
ba210ebe 1453
2b5e7bc2 1454 return UNI_TO_NATIVE(uv);
a0ed51b3
LW
1455}
1456
8e84507e 1457/*
ec5f19d0
KW
1458=for apidoc utf8_to_uvchr_buf
1459
1460Returns the native code point of the first character in the string C<s> which
1461is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 1462C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 1463
524080c4
KW
1464If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1465enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 1466C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 1467(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 1468C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 1469the next possible position in C<s> that could begin a non-malformed character.
de69f3af 1470See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 1471returned.
ec5f19d0 1472
760c7c2f
KW
1473Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1474unless those are turned off.
1475
ec5f19d0 1476=cut
52be2536
KW
1477
1478Also implemented as a macro in utf8.h
1479
ec5f19d0
KW
1480*/
1481
1482
1483UV
1484Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1485{
ec5f19d0
KW
1486 assert(s < send);
1487
1488 return utf8n_to_uvchr(s, send - s, retlen,
1489 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1490}
1491
52be2536
KW
1492/* This is marked as deprecated
1493 *
ec5f19d0
KW
1494=for apidoc utf8_to_uvuni_buf
1495
de69f3af
KW
1496Only in very rare circumstances should code need to be dealing in Unicode
1497(as opposed to native) code points. In those few cases, use
1498C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
4f83cdcd
KW
1499
1500Returns the Unicode (not-native) code point of the first character in the
1501string C<s> which
ec5f19d0
KW
1502is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1503C<retlen> will be set to the length, in bytes, of that character.
1504
524080c4
KW
1505If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1506enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1507NULL) to -1. If those warnings are off, the computed value if well-defined (or
1508the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1509is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1510next possible position in C<s> that could begin a non-malformed character.
de69f3af 1511See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0 1512
760c7c2f
KW
1513Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1514unless those are turned off.
1515
ec5f19d0
KW
1516=cut
1517*/
1518
1519UV
1520Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1521{
1522 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1523
1524 assert(send > s);
1525
5962d97e
KW
1526 /* Call the low level routine, asking for checks */
1527 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
ec5f19d0
KW
1528}
1529
b76347f2 1530/*
87cea99e 1531=for apidoc utf8_length
b76347f2
JH
1532
1533Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
1534Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
1535up past C<e>, croaks.
b76347f2
JH
1536
1537=cut
1538*/
1539
1540STRLEN
35a4481c 1541Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
1542{
1543 STRLEN len = 0;
1544
7918f24d
NC
1545 PERL_ARGS_ASSERT_UTF8_LENGTH;
1546
8850bf83
JH
1547 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1548 * the bitops (especially ~) can create illegal UTF-8.
1549 * In other words: in Perl UTF-8 is not just for Unicode. */
1550
a3b680e6
AL
1551 if (e < s)
1552 goto warn_and_return;
b76347f2 1553 while (s < e) {
4cbf4130 1554 s += UTF8SKIP(s);
8e91ec7f
AV
1555 len++;
1556 }
1557
1558 if (e != s) {
1559 len--;
1560 warn_and_return:
9b387841
NC
1561 if (PL_op)
1562 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1563 "%s in %s", unees, OP_DESC(PL_op));
1564 else
61a12c31 1565 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
1566 }
1567
1568 return len;
1569}
1570
b06226ff 1571/*
fed3ba5d
NC
1572=for apidoc bytes_cmp_utf8
1573
a1433954 1574Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
1575sequence of characters (stored as UTF-8)
1576in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
1577equal, -1 or -2 if the first string is less than the second string, +1 or +2
1578if the first string is greater than the second string.
1579
1580-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
1581longer string. -2 or +2 is returned if
1582there was a difference between characters
fed3ba5d
NC
1583within the strings.
1584
1585=cut
1586*/
1587
1588int
1589Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1590{
1591 const U8 *const bend = b + blen;
1592 const U8 *const uend = u + ulen;
1593
1594 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
1595
1596 while (b < bend && u < uend) {
1597 U8 c = *u++;
1598 if (!UTF8_IS_INVARIANT(c)) {
1599 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1600 if (u < uend) {
1601 U8 c1 = *u++;
1602 if (UTF8_IS_CONTINUATION(c1)) {
a62b247b 1603 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
fed3ba5d 1604 } else {
2b5e7bc2 1605 /* diag_listed_as: Malformed UTF-8 character%s */
fed3ba5d 1606 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
806547a7 1607 "%s %s%s",
7cf8d05d 1608 unexpected_non_continuation_text(u - 1, 2, 1, 2),
806547a7
KW
1609 PL_op ? " in " : "",
1610 PL_op ? OP_DESC(PL_op) : "");
fed3ba5d
NC
1611 return -2;
1612 }
1613 } else {
1614 if (PL_op)
1615 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1616 "%s in %s", unees, OP_DESC(PL_op));
1617 else
61a12c31 1618 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
1619 return -2; /* Really want to return undef :-) */
1620 }
1621 } else {
1622 return -2;
1623 }
1624 }
1625 if (*b != c) {
1626 return *b < c ? -2 : +2;
1627 }
1628 ++b;
1629 }
1630
1631 if (b == bend && u == uend)
1632 return 0;
1633
1634 return b < bend ? +1 : -1;
1635}
1636
1637/*
87cea99e 1638=for apidoc utf8_to_bytes
6940069f 1639
2bbc8d55 1640Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954
KW
1641Unlike L</bytes_to_utf8>, this over-writes the original string, and
1642updates C<len> to contain the new length.
67e989fb 1643Returns zero on failure, setting C<len> to -1.
6940069f 1644
a1433954 1645If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 1646
6940069f
GS
1647=cut
1648*/
1649
1650U8 *
37607a96 1651Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 1652{
d4c19fe8
AL
1653 U8 * const save = s;
1654 U8 * const send = s + *len;
6940069f 1655 U8 *d;
246fae53 1656
7918f24d 1657 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 1658 PERL_UNUSED_CONTEXT;
7918f24d 1659
1e54db1a 1660 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 1661 while (s < send) {
d59937ca
KW
1662 if (! UTF8_IS_INVARIANT(*s)) {
1663 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1664 *len = ((STRLEN) -1);
1665 return 0;
1666 }
1667 s++;
dcad2880 1668 }
d59937ca 1669 s++;
246fae53 1670 }
dcad2880
JH
1671
1672 d = s = save;
6940069f 1673 while (s < send) {
80e0b38f
KW
1674 U8 c = *s++;
1675 if (! UTF8_IS_INVARIANT(c)) {
1676 /* Then it is two-byte encoded */
a62b247b 1677 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
80e0b38f
KW
1678 s++;
1679 }
1680 *d++ = c;
6940069f
GS
1681 }
1682 *d = '\0';
246fae53 1683 *len = d - save;
6940069f
GS
1684 return save;
1685}
1686
1687/*
87cea99e 1688=for apidoc bytes_from_utf8
f9a63242 1689
2bbc8d55 1690Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954 1691Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
1692the newly-created string, and updates C<len> to contain the new
1693length. Returns the original string if no conversion occurs, C<len>
72d33970 1694is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55 16950 if C<s> is converted or consisted entirely of characters that are invariant
4a4088c4 1696in UTF-8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 1697
37607a96
PK
1698=cut
1699*/
f9a63242
JH
1700
1701U8 *
e1ec3a88 1702Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 1703{
f9a63242 1704 U8 *d;
e1ec3a88
AL
1705 const U8 *start = s;
1706 const U8 *send;
f9a63242
JH
1707 I32 count = 0;
1708
7918f24d 1709 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 1710 PERL_UNUSED_CONTEXT;
f9a63242 1711 if (!*is_utf8)
73d840c0 1712 return (U8 *)start;
f9a63242 1713
1e54db1a 1714 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 1715 for (send = s + *len; s < send;) {
d59937ca
KW
1716 if (! UTF8_IS_INVARIANT(*s)) {
1717 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
73d840c0 1718 return (U8 *)start;
d59937ca
KW
1719 }
1720 count++;
1721 s++;
db42d148 1722 }
d59937ca 1723 s++;
f9a63242
JH
1724 }
1725
35da51f7 1726 *is_utf8 = FALSE;
f9a63242 1727
212542aa 1728 Newx(d, (*len) - count + 1, U8);
ef9edfd0 1729 s = start; start = d;
f9a63242
JH
1730 while (s < send) {
1731 U8 c = *s++;
1a91c45d 1732 if (! UTF8_IS_INVARIANT(c)) {
c4d5f83a 1733 /* Then it is two-byte encoded */
a62b247b 1734 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
1a91c45d 1735 s++;
c4d5f83a
NIS
1736 }
1737 *d++ = c;
f9a63242
JH
1738 }
1739 *d = '\0';
1740 *len = d - start;
73d840c0 1741 return (U8 *)start;
f9a63242
JH
1742}
1743
1744/*
87cea99e 1745=for apidoc bytes_to_utf8
6940069f 1746
ff97e5cf
KW
1747Converts a string C<s> of length C<len> bytes from the native encoding into
1748UTF-8.
6662521e 1749Returns a pointer to the newly-created string, and sets C<len> to
ff97e5cf 1750reflect the new length in bytes.
6940069f 1751
75200dff 1752A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
1753
1754If you want to convert to UTF-8 from encodings other than
1755the native (Latin1 or EBCDIC),
a1433954 1756see L</sv_recode_to_utf8>().
c9ada85f 1757
497711e7 1758=cut
6940069f
GS
1759*/
1760
c682ebef
FC
1761/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1762 likewise need duplication. */
1763
6940069f 1764U8*
35a4481c 1765Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 1766{
35a4481c 1767 const U8 * const send = s + (*len);
6940069f
GS
1768 U8 *d;
1769 U8 *dst;
7918f24d
NC
1770
1771 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 1772 PERL_UNUSED_CONTEXT;
6940069f 1773
212542aa 1774 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
1775 dst = d;
1776
1777 while (s < send) {
55d09dc8
KW
1778 append_utf8_from_native_byte(*s, &d);
1779 s++;
6940069f
GS
1780 }
1781 *d = '\0';
6662521e 1782 *len = d-dst;
6940069f
GS
1783 return dst;
1784}
1785
a0ed51b3 1786/*
dea0fc0b 1787 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
1788 *
1789 * Destination must be pre-extended to 3/2 source. Do not use in-place.
1790 * We optimize for native, for obvious reasons. */
1791
1792U8*
dea0fc0b 1793Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 1794{
dea0fc0b
JH
1795 U8* pend;
1796 U8* dstart = d;
1797
7918f24d
NC
1798 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1799
dea0fc0b 1800 if (bytelen & 1)
f5992bc4 1801 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
1802
1803 pend = p + bytelen;
1804
a0ed51b3 1805 while (p < pend) {
dea0fc0b
JH
1806 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1807 p += 2;
2d1545e5 1808 if (OFFUNI_IS_INVARIANT(uv)) {
56d37426 1809 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
1810 continue;
1811 }
56d37426
KW
1812 if (uv <= MAX_UTF8_TWO_BYTE) {
1813 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1814 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
1815 continue;
1816 }
46956fad
KW
1817#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1818#define LAST_HIGH_SURROGATE 0xDBFF
1819#define FIRST_LOW_SURROGATE 0xDC00
1820#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
e23c50db
KW
1821
1822 /* This assumes that most uses will be in the first Unicode plane, not
1823 * needing surrogates */
1824 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
1825 && uv <= UNICODE_SURROGATE_LAST))
1826 {
1827 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
1828 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1829 }
1830 else {
01ea242b 1831 UV low = (p[0] << 8) + p[1];
e23c50db
KW
1832 if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
1833 || UNLIKELY(low > LAST_LOW_SURROGATE))
1834 {
01ea242b 1835 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db
KW
1836 }
1837 p += 2;
46956fad
KW
1838 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
1839 + (low - FIRST_LOW_SURROGATE) + 0x10000;
01ea242b 1840 }
a0ed51b3 1841 }
56d37426
KW
1842#ifdef EBCDIC
1843 d = uvoffuni_to_utf8_flags(d, uv, 0);
1844#else
a0ed51b3 1845 if (uv < 0x10000) {
eb160463
GS
1846 *d++ = (U8)(( uv >> 12) | 0xe0);
1847 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1848 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1849 continue;
1850 }
1851 else {
eb160463
GS
1852 *d++ = (U8)(( uv >> 18) | 0xf0);
1853 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1854 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1855 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1856 continue;
1857 }
56d37426 1858#endif
a0ed51b3 1859 }
dea0fc0b 1860 *newlen = d - dstart;
a0ed51b3
LW
1861 return d;
1862}
1863
1864/* Note: this one is slightly destructive of the source. */
1865
1866U8*
dea0fc0b 1867Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
1868{
1869 U8* s = (U8*)p;
d4c19fe8 1870 U8* const send = s + bytelen;
7918f24d
NC
1871
1872 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1873
e0ea5e2d
NC
1874 if (bytelen & 1)
1875 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1876 (UV)bytelen);
1877
a0ed51b3 1878 while (s < send) {
d4c19fe8 1879 const U8 tmp = s[0];
a0ed51b3
LW
1880 s[0] = s[1];
1881 s[1] = tmp;
1882 s += 2;
1883 }
dea0fc0b 1884 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
1885}
1886
922e8cb4
KW
1887bool
1888Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
1889{
1890 U8 tmpbuf[UTF8_MAXBYTES+1];
1891 uvchr_to_utf8(tmpbuf, c);
1892 return _is_utf8_FOO(classnum, tmpbuf);
1893}
1894
f9ae8fb6
JD
1895/* Internal function so we can deprecate the external one, and call
1896 this one from other deprecated functions in this file */
1897
f2645549
KW
1898bool
1899Perl__is_utf8_idstart(pTHX_ const U8 *p)
61b19385 1900{
f2645549 1901 PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
61b19385
KW
1902
1903 if (*p == '_')
1904 return TRUE;
f25ce844 1905 return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
61b19385
KW
1906}
1907
5092f92a 1908bool
eba68aa0
KW
1909Perl__is_uni_perl_idcont(pTHX_ UV c)
1910{
1911 U8 tmpbuf[UTF8_MAXBYTES+1];
1912 uvchr_to_utf8(tmpbuf, c);
1913 return _is_utf8_perl_idcont(tmpbuf);
1914}
1915
1916bool
f91dcd13
KW
1917Perl__is_uni_perl_idstart(pTHX_ UV c)
1918{
1919 U8 tmpbuf[UTF8_MAXBYTES+1];
1920 uvchr_to_utf8(tmpbuf, c);
1921 return _is_utf8_perl_idstart(tmpbuf);
1922}
1923
3a4c58c9
KW
1924UV
1925Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
1926{
1927 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 1928 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
1929 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
1930 * either "SS" or "Ss". Which one to use is passed into the routine in
1931 * 'S_or_s' to avoid a test */
1932
1933 UV converted = toUPPER_LATIN1_MOD(c);
1934
1935 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
1936
1937 assert(S_or_s == 'S' || S_or_s == 's');
1938
6f2d5cbc 1939 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 1940 characters in this range */
3a4c58c9
KW
1941 *p = (U8) converted;
1942 *lenp = 1;
1943 return converted;
1944 }
1945
1946 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
1947 * which it maps to one of them, so as to only have to have one check for
1948 * it in the main case */
1949 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1950 switch (c) {
1951 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1952 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
1953 break;
1954 case MICRO_SIGN:
1955 converted = GREEK_CAPITAL_LETTER_MU;
1956 break;
79e064b9
KW
1957#if UNICODE_MAJOR_VERSION > 2 \
1958 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
1959 && UNICODE_DOT_DOT_VERSION >= 8)
3a4c58c9
KW
1960 case LATIN_SMALL_LETTER_SHARP_S:
1961 *(p)++ = 'S';
1962 *p = S_or_s;
1963 *lenp = 2;
1964 return 'S';
79e064b9 1965#endif
3a4c58c9
KW
1966 default:
1967 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
e5964223 1968 NOT_REACHED; /* NOTREACHED */
3a4c58c9
KW
1969 }
1970 }
1971
1972 *(p)++ = UTF8_TWO_BYTE_HI(converted);
1973 *p = UTF8_TWO_BYTE_LO(converted);
1974 *lenp = 2;
1975
1976 return converted;
1977}
1978
50bda2c3
KW
1979/* Call the function to convert a UTF-8 encoded character to the specified case.
1980 * Note that there may be more than one character in the result.
1981 * INP is a pointer to the first byte of the input character
1982 * OUTP will be set to the first byte of the string of changed characters. It
1983 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
1984 * LENP will be set to the length in bytes of the string of changed characters
1985 *
1986 * The functions return the ordinal of the first character in the string of OUTP */
b9992569
KW
1987#define CALL_UPPER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
1988#define CALL_TITLE_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
1989#define CALL_LOWER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
50bda2c3 1990
b9992569
KW
1991/* This additionally has the input parameter 'specials', which if non-zero will
1992 * cause this to use the specials hash for folding (meaning get full case
50bda2c3 1993 * folding); otherwise, when zero, this implies a simple case fold */
b9992569 1994#define CALL_FOLD_CASE(uv, s, d, lenp, specials) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
c3fd2246 1995
84afefe6
JH
1996UV
1997Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1998{
a1433954
KW
1999 /* Convert the Unicode character whose ordinal is <c> to its uppercase
2000 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2001 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
2002 * the changed version may be longer than the original character.
2003 *
2004 * The ordinal of the first character of the changed version is returned
2005 * (but note, as explained above, that there may be more.) */
2006
7918f24d
NC
2007 PERL_ARGS_ASSERT_TO_UNI_UPPER;
2008
3a4c58c9
KW
2009 if (c < 256) {
2010 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2011 }
2012
0ebc6274 2013 uvchr_to_utf8(p, c);
b9992569 2014 return CALL_UPPER_CASE(c, p, p, lenp);
a0ed51b3
LW
2015}
2016
84afefe6
JH
2017UV
2018Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2019{
7918f24d
NC
2020 PERL_ARGS_ASSERT_TO_UNI_TITLE;
2021
3a4c58c9
KW
2022 if (c < 256) {
2023 return _to_upper_title_latin1((U8) c, p, lenp, 's');
2024 }
2025
0ebc6274 2026 uvchr_to_utf8(p, c);
b9992569 2027 return CALL_TITLE_CASE(c, p, p, lenp);
a0ed51b3
LW
2028}
2029
afc16117 2030STATIC U8
81611534 2031S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp)
afc16117
KW
2032{
2033 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2034 * those, converting the result to UTF-8. Since the result is always just
a1433954 2035 * one character, we allow <p> to be NULL */
afc16117
KW
2036
2037 U8 converted = toLOWER_LATIN1(c);
2038
2039 if (p != NULL) {
6f2d5cbc 2040 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
2041 *p = converted;
2042 *lenp = 1;
2043 }
2044 else {
430c9760
KW
2045 /* Result is known to always be < 256, so can use the EIGHT_BIT
2046 * macros */
2047 *p = UTF8_EIGHT_BIT_HI(converted);
2048 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
afc16117
KW
2049 *lenp = 2;
2050 }
2051 }
2052 return converted;
2053}
2054
84afefe6
JH
2055UV
2056Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2057{
7918f24d
NC
2058 PERL_ARGS_ASSERT_TO_UNI_LOWER;
2059
afc16117
KW
2060 if (c < 256) {
2061 return to_lower_latin1((U8) c, p, lenp);
bca00c02
KW
2062 }
2063
afc16117 2064 uvchr_to_utf8(p, c);
b9992569 2065 return CALL_LOWER_CASE(c, p, p, lenp);
a0ed51b3
LW
2066}
2067
84afefe6 2068UV
51910141 2069Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 2070{
51910141 2071 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 2072 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 2073 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
2074 *
2075 * Not to be used for locale folds
51910141 2076 */
f673fad4 2077
a1dde8de
KW
2078 UV converted;
2079
2080 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
81611534 2081 PERL_UNUSED_CONTEXT;
a1dde8de 2082
1ca267a5
KW
2083 assert (! (flags & FOLD_FLAGS_LOCALE));
2084
659a7c2d 2085 if (UNLIKELY(c == MICRO_SIGN)) {
a1dde8de
KW
2086 converted = GREEK_SMALL_LETTER_MU;
2087 }
9b63e895
KW
2088#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
2089 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
2090 || UNICODE_DOT_DOT_VERSION > 0)
659a7c2d
KW
2091 else if ( (flags & FOLD_FLAGS_FULL)
2092 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
2093 {
1ca267a5
KW
2094 /* If can't cross 127/128 boundary, can't return "ss"; instead return
2095 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
2096 * under those circumstances. */
2097 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
2098 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2099 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2100 p, *lenp, U8);
2101 return LATIN_SMALL_LETTER_LONG_S;
2102 }
2103 else {
4f489194
KW
2104 *(p)++ = 's';
2105 *p = 's';
2106 *lenp = 2;
2107 return 's';
1ca267a5 2108 }
a1dde8de 2109 }
9b63e895 2110#endif
a1dde8de
KW
2111 else { /* In this range the fold of all other characters is their lower
2112 case */
2113 converted = toLOWER_LATIN1(c);
2114 }
2115
6f2d5cbc 2116 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
2117 *p = (U8) converted;
2118 *lenp = 1;
2119 }
2120 else {
2121 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2122 *p = UTF8_TWO_BYTE_LO(converted);
2123 *lenp = 2;
2124 }
2125
2126 return converted;
2127}
2128
2129UV
31f05a37 2130Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 2131{
4b593389 2132
a0270393
KW
2133 /* Not currently externally documented, and subject to change
2134 * <flags> bits meanings:
2135 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
2136 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2137 * locale are to be used.
a0270393
KW
2138 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2139 */
4b593389 2140
36bb2ab6 2141 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 2142
780fcc9f
KW
2143 if (flags & FOLD_FLAGS_LOCALE) {
2144 /* Treat a UTF-8 locale as not being in locale at all */
2145 if (IN_UTF8_CTYPE_LOCALE) {
2146 flags &= ~FOLD_FLAGS_LOCALE;
2147 }
2148 else {
2149 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
e7b7ac46 2150 goto needs_full_generality;
780fcc9f 2151 }
31f05a37
KW
2152 }
2153
a1dde8de 2154 if (c < 256) {
e7b7ac46 2155 return _to_fold_latin1((U8) c, p, lenp,
31f05a37 2156 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
2157 }
2158
2f306ab9 2159 /* Here, above 255. If no special needs, just use the macro */
a0270393
KW
2160 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
2161 uvchr_to_utf8(p, c);
b9992569 2162 return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
a0270393
KW
2163 }
2164 else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
2165 the special flags. */
2166 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
2167
2168 needs_full_generality:
a0270393 2169 uvchr_to_utf8(utf8_c, c);
445bf929 2170 return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
a0270393 2171 }
84afefe6
JH
2172}
2173
26483009 2174PERL_STATIC_INLINE bool
5141f98e 2175S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
f25ce844 2176 const char *const swashname, SV* const invlist)
bde6a22d 2177{
ea317ccb
KW
2178 /* returns a boolean giving whether or not the UTF8-encoded character that
2179 * starts at <p> is in the swash indicated by <swashname>. <swash>
2180 * contains a pointer to where the swash indicated by <swashname>
2181 * is to be stored; which this routine will do, so that future calls will
f25ce844
KW
2182 * look at <*swash> and only generate a swash if it is not null. <invlist>
2183 * is NULL or an inversion list that defines the swash. If not null, it
2184 * saves time during initialization of the swash.
ea317ccb
KW
2185 *
2186 * Note that it is assumed that the buffer length of <p> is enough to
2187 * contain all the bytes that comprise the character. Thus, <*p> should
2188 * have been checked before this call for mal-formedness enough to assure
2189 * that. */
2190
7918f24d
NC
2191 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2192
492a624f 2193 /* The API should have included a length for the UTF-8 character in <p>,
28123549 2194 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
2195 * as far as there being enough bytes available in it to accommodate the
2196 * character without reading beyond the end, and pass that number on to the
2197 * validating routine */
6302f837 2198 if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
28123549
KW
2199 if (ckWARN_d(WARN_UTF8)) {
2200 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
9816f121 2201 "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
28123549
KW
2202 if (ckWARN(WARN_UTF8)) { /* This will output details as to the
2203 what the malformation is */
2204 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
2205 }
2206 }
2207 return FALSE;
2208 }
87367d5f
KW
2209 if (!*swash) {
2210 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
f25ce844
KW
2211 *swash = _core_swash_init("utf8",
2212
2213 /* Only use the name if there is no inversion
2214 * list; otherwise will go out to disk */
2215 (invlist) ? "" : swashname,
2216
2217 &PL_sv_undef, 1, 0, invlist, &flags);
87367d5f 2218 }
28123549 2219
bde6a22d
NC
2220 return swash_fetch(*swash, p, TRUE) != 0;
2221}
2222
2223bool
922e8cb4
KW
2224Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
2225{
922e8cb4
KW
2226 PERL_ARGS_ASSERT__IS_UTF8_FOO;
2227
2228 assert(classnum < _FIRST_NON_SWASH_CC);
2229
f25ce844
KW
2230 return is_utf8_common(p,
2231 &PL_utf8_swash_ptrs[classnum],
2232 swash_property_names[classnum],
2233 PL_XPosix_ptrs[classnum]);
922e8cb4
KW
2234}
2235
2236bool
f2645549 2237Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
a0ed51b3 2238{
f2645549 2239 SV* invlist = NULL;
7918f24d 2240
f2645549 2241 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
7918f24d 2242
f2645549
KW
2243 if (! PL_utf8_perl_idstart) {
2244 invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
2245 }
60071a22 2246 return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
82686b01
JH
2247}
2248
2249bool
f2645549 2250Perl__is_utf8_xidstart(pTHX_ const U8 *p)
c11ff943 2251{
f2645549 2252 PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
c11ff943
KW
2253
2254 if (*p == '_')
2255 return TRUE;
f25ce844 2256 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
c11ff943
KW
2257}
2258
2259bool
eba68aa0
KW
2260Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
2261{
b24b43f7 2262 SV* invlist = NULL;
eba68aa0
KW
2263
2264 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
2265
b24b43f7
KW
2266 if (! PL_utf8_perl_idcont) {
2267 invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
2268 }
60071a22 2269 return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
eba68aa0
KW
2270}
2271
eba68aa0 2272bool
f2645549 2273Perl__is_utf8_idcont(pTHX_ const U8 *p)
82686b01 2274{
f2645549 2275 PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
7918f24d 2276
f25ce844 2277 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
a0ed51b3
LW
2278}
2279
2280bool
f2645549 2281Perl__is_utf8_xidcont(pTHX_ const U8 *p)
c11ff943 2282{
f2645549 2283 PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
c11ff943 2284
f25ce844 2285 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
c11ff943
KW
2286}
2287
2288bool
7dbf68d2
KW
2289Perl__is_utf8_mark(pTHX_ const U8 *p)
2290{
7dbf68d2
KW
2291 PERL_ARGS_ASSERT__IS_UTF8_MARK;
2292
f25ce844 2293 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
7dbf68d2
KW
2294}
2295
6b5c0936 2296/*
87cea99e 2297=for apidoc to_utf8_case
6b5c0936 2298
9da1e7cb
KW
2299Instead use the appropriate one of L</toUPPER_utf8>,
2300L</toTITLE_utf8>,
2301L</toLOWER_utf8>,
2302or L</toFOLD_utf8>.
2303
6fae5207 2304C<p> contains the pointer to the UTF-8 string encoding
a1433954
KW
2305the character that is being converted. This routine assumes that the character
2306at C<p> is well-formed.
6b5c0936 2307
6fae5207
KW
2308C<ustrp> is a pointer to the character buffer to put the
2309conversion result to. C<lenp> is a pointer to the length
6b5c0936
JH
2310of the result.
2311
6fae5207 2312C<swashp> is a pointer to the swash to use.
6b5c0936 2313
a1433954 2314Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
796b6530 2315and loaded by C<SWASHNEW>, using F<lib/utf8_heavy.pl>. C<special> (usually,
0134edef 2316but not always, a multicharacter mapping), is tried first.
6b5c0936 2317
4a8240a3
KW
2318C<special> is a string, normally C<NULL> or C<"">. C<NULL> means to not use
2319any special mappings; C<""> means to use the special mappings. Values other
2320than these two are treated as the name of the hash containing the special
2321mappings, like C<"utf8::ToSpecLower">.
6b5c0936 2322
796b6530
KW
2323C<normal> is a string like C<"ToLower"> which means the swash
2324C<%utf8::ToLower>.
0134edef 2325
760c7c2f
KW
2326Code points above the platform's C<IV_MAX> will raise a deprecation warning,
2327unless those are turned off.
2328
0134edef 2329=cut */
6b5c0936 2330
2104c8d9 2331UV
9a957fbc
AL
2332Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
2333 SV **swashp, const char *normal, const char *special)
a0ed51b3 2334{
b9992569
KW
2335 PERL_ARGS_ASSERT_TO_UTF8_CASE;
2336
2337 return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special);
2338}
2339
2340 /* change namve uv1 to 'from' */
6a4a25f4 2341STATIC UV
b9992569
KW
2342S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
2343 SV **swashp, const char *normal, const char *special)
2344{
0134edef 2345 STRLEN len = 0;
7918f24d 2346
b9992569 2347 PERL_ARGS_ASSERT__TO_UTF8_CASE;
7918f24d 2348
36eaa811
KW
2349 /* For code points that don't change case, we already know that the output
2350 * of this function is the unchanged input, so we can skip doing look-ups
2351 * for them. Unfortunately the case-changing code points are scattered
2352 * around. But there are some long consecutive ranges where there are no
2353 * case changing code points. By adding tests, we can eliminate the lookup
2354 * for all the ones in such ranges. This is currently done here only for
2355 * just a few cases where the scripts are in common use in modern commerce
2356 * (and scripts adjacent to those which can be included without additional
2357 * tests). */
2358
2359 if (uv1 >= 0x0590) {
2360 /* This keeps from needing further processing the code points most
2361 * likely to be used in the following non-cased scripts: Hebrew,
2362 * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
2363 * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
2364 * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
2365 if (uv1 < 0x10A0) {
2366 goto cases_to_self;
2367 }
2368
2369 /* The following largish code point ranges also don't have case
2370 * changes, but khw didn't think they warranted extra tests to speed
2371 * them up (which would slightly slow down everything else above them):
2372 * 1100..139F Hangul Jamo, Ethiopic
2373 * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
2374 * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
2375 * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
2376 * Combining Diacritical Marks Extended, Balinese,
2377 * Sundanese, Batak, Lepcha, Ol Chiki
2378 * 2000..206F General Punctuation
2379 */
2380
2381 if (uv1 >= 0x2D30) {
2382
2383 /* This keeps the from needing further processing the code points
2384 * most likely to be used in the following non-cased major scripts:
2385 * CJK, Katakana, Hiragana, plus some less-likely scripts.
2386 *
2387 * (0x2D30 above might have to be changed to 2F00 in the unlikely
2388 * event that Unicode eventually allocates the unused block as of
2389 * v8.0 2FE0..2FEF to code points that are cased. khw has verified
2390 * that the test suite will start having failures to alert you
2391 * should that happen) */
2392 if (uv1 < 0xA640) {
2393 goto cases_to_self;
2394 }
2395
2396 if (uv1 >= 0xAC00) {
2397 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
5af9bc97
KW
2398 if (ckWARN_d(WARN_SURROGATE)) {
2399 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2400 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
2401 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
2402 }
2403 goto cases_to_self;
2404 }
36eaa811
KW
2405
2406 /* AC00..FAFF Catches Hangul syllables and private use, plus
2407 * some others */
2408 if (uv1 < 0xFB00) {
2409 goto cases_to_self;
2410
2411 }
2412
5af9bc97
KW
2413 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
2414 if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
2415 && ckWARN_d(WARN_DEPRECATED))
2416 {
2417 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2418 cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
2419 }
2420 if (ckWARN_d(WARN_NON_UNICODE)) {
2421 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2422 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2423 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
2424 }
2425 goto cases_to_self;
2426 }
3bfc1e70
KW
2427#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
2428 if (UNLIKELY(uv1
2429 > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
2430 {
2431
2432 /* As of this writing, this means we avoid swash creation
2433 * for anything beyond low Plane 1 */
2434 goto cases_to_self;
2435 }
2436#endif
36eaa811
KW
2437 }
2438 }
9ae3ac1a 2439
36eaa811
KW
2440 /* Note that non-characters are perfectly legal, so no warning should
2441 * be given. There are so few of them, that it isn't worth the extra
2442 * tests to avoid swash creation */
9ae3ac1a
KW
2443 }
2444
0134edef 2445 if (!*swashp) /* load on-demand */
5ab9d2ef 2446 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
0134edef 2447
a6f87d8c 2448 if (special) {
0134edef 2449 /* It might be "special" (sometimes, but not always,
2a37f04d 2450 * a multicharacter mapping) */
4a8240a3 2451 HV *hv = NULL;
b08cf34e
JH
2452 SV **svp;
2453
4a8240a3
KW
2454 /* If passed in the specials name, use that; otherwise use any
2455 * given in the swash */
2456 if (*special != '\0') {
2457 hv = get_hv(special, 0);
2458 }
2459 else {
2460 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
2461 if (svp) {
2462 hv = MUTABLE_HV(SvRV(*svp));
2463 }
2464 }
2465
176fe009 2466 if (hv
5f560d8a 2467 && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
176fe009
KW
2468 && (*svp))
2469 {
cfd0369c 2470 const char *s;
47654450 2471
cfd0369c 2472 s = SvPV_const(*svp, len);
47654450 2473 if (len == 1)
f4cd282c 2474 /* EIGHTBIT */
c80e42f3 2475 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2a37f04d 2476 else {
d2dcd0fb 2477 Copy(s, ustrp, len, U8);
29e98929 2478 }
983ffd37 2479 }
0134edef
JH
2480 }
2481
2482 if (!len && *swashp) {
4a4088c4 2483 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
d4c19fe8 2484
0134edef
JH
2485 if (uv2) {
2486 /* It was "normal" (a single character mapping). */
f4cd282c 2487 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2a37f04d
JH
2488 }
2489 }
1feea2c7 2490
cbe07460
KW
2491 if (len) {
2492 if (lenp) {
2493 *lenp = len;
2494 }
2495 return valid_utf8_to_uvchr(ustrp, 0);
2496 }
2497
2498 /* Here, there was no mapping defined, which means that the code point maps
2499 * to itself. Return the inputs */
e24dfe9c 2500 cases_to_self:
bfdf22ec 2501 len = UTF8SKIP(p);
ca9fab46
KW
2502 if (p != ustrp) { /* Don't copy onto itself */
2503 Copy(p, ustrp, len, U8);
2504 }
0134edef 2505
2a37f04d
JH
2506 if (lenp)
2507 *lenp = len;
2508
f4cd282c 2509 return uv1;
cbe07460 2510
a0ed51b3
LW
2511}
2512
051a06d4 2513STATIC UV
357aadde 2514S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
051a06d4 2515{
4a4088c4 2516 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
2517 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
2518 * result contains a character that crosses the 255/256 boundary, disallow
2519 * the change, and return the original code point. See L<perlfunc/lc> for
2520 * why;
051a06d4 2521 *
a1433954
KW
2522 * p points to the original string whose case was changed; assumed
2523 * by this routine to be well-formed
051a06d4
KW
2524 * result the code point of the first character in the changed-case string
2525 * ustrp points to the changed-case string (<result> represents its first char)
2526 * lenp points to the length of <ustrp> */
2527
2528 UV original; /* To store the first code point of <p> */
2529
2530 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2531
a4f12ed7 2532 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
2533
2534 /* We know immediately if the first character in the string crosses the
2535 * boundary, so can skip */
2536 if (result > 255) {
2537
2538 /* Look at every character in the result; if any cross the
2539 * boundary, the whole thing is disallowed */
2540 U8* s = ustrp + UTF8SKIP(ustrp);
2541 U8* e = ustrp + *lenp;
2542 while (s < e) {
a4f12ed7 2543 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
2544 goto bad_crossing;
2545 }
2546 s += UTF8SKIP(s);
2547 }
2548
613abc6d
KW
2549 /* Here, no characters crossed, result is ok as-is, but we warn. */
2550 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
051a06d4
KW
2551 return result;
2552 }
2553
7b52d656 2554 bad_crossing:
051a06d4
KW
2555
2556 /* Failed, have to return the original */
4b88fb76 2557 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
2558
2559 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2560 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2561 "Can't do %s(\"\\x{%"UVXf"}\") on non-UTF-8 locale; "
2562 "resolved to \"\\x{%"UVXf"}\".",
357aadde 2563 OP_DESC(PL_op),
ab0b796c
KW
2564 original,
2565 original);
051a06d4
KW
2566 Copy(p, ustrp, *lenp, char);
2567 return original;
2568}
2569
d3e79532 2570/*
87cea99e 2571=for apidoc to_utf8_upper
d3e79532 2572
1f607577 2573Instead use L</toUPPER_utf8>.
a1433954 2574
d3e79532
JH
2575=cut */
2576
051a06d4 2577/* Not currently externally documented, and subject to change:
31f05a37
KW
2578 * <flags> is set iff iff the rules from the current underlying locale are to
2579 * be used. */
051a06d4 2580
2104c8d9 2581UV
31f05a37 2582Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2583{
051a06d4
KW
2584 UV result;
2585
2586 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 2587
780fcc9f
KW
2588 if (flags) {
2589 /* Treat a UTF-8 locale as not being in locale at all */
2590 if (IN_UTF8_CTYPE_LOCALE) {
2591 flags = FALSE;
2592 }
2593 else {
2594 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2595 }
31f05a37
KW
2596 }
2597
3a4c58c9 2598 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2599 if (flags) {
2600 result = toUPPER_LC(*p);
2601 }
2602 else {
81c6c7ce 2603 return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
051a06d4 2604 }
3a4c58c9
KW
2605 }
2606 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2607 if (flags) {
a62b247b 2608 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2609 result = toUPPER_LC(c);
051a06d4
KW
2610 }
2611 else {
a62b247b 2612 return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2613 ustrp, lenp, 'S');
051a06d4
KW
2614 }
2615 }
4a4088c4 2616 else { /* UTF-8, ord above 255 */
b9992569 2617 result = CALL_UPPER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2618
2619 if (flags) {
357aadde 2620 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4
KW
2621 }
2622 return result;
2623 }
2624
4a4088c4 2625 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2626 if (UTF8_IS_INVARIANT(result)) {
2627 *ustrp = (U8) result;
2628 *lenp = 1;
2629 }
2630 else {
62cb07ea
KW
2631 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2632 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2633 *lenp = 2;
3a4c58c9 2634 }
baa60164 2635
051a06d4 2636 return result;
983ffd37 2637}
a0ed51b3 2638
d3e79532 2639/*
87cea99e 2640=for apidoc to_utf8_title
d3e79532 2641
1f607577 2642Instead use L</toTITLE_utf8>.
a1433954 2643
d3e79532
JH
2644=cut */
2645
051a06d4 2646/* Not currently externally documented, and subject to change:
31f05a37
KW
2647 * <flags> is set iff the rules from the current underlying locale are to be
2648 * used. Since titlecase is not defined in POSIX, for other than a
2649 * UTF-8 locale, uppercase is used instead for code points < 256.
445bf929 2650 */
051a06d4 2651
983ffd37 2652UV
31f05a37 2653Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
983ffd37 2654{
051a06d4
KW
2655 UV result;
2656
2657 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 2658
780fcc9f
KW
2659 if (flags) {
2660 /* Treat a UTF-8 locale as not being in locale at all */
2661 if (IN_UTF8_CTYPE_LOCALE) {
2662 flags = FALSE;
2663 }
2664 else {
2665 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2666 }
31f05a37
KW
2667 }
2668
3a4c58c9 2669 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2670 if (flags) {
2671 result = toUPPER_LC(*p);
2672 }
2673 else {
81c6c7ce 2674 return _to_upper_title_latin1(*p, ustrp, lenp, 's');
051a06d4 2675 }
3a4c58c9
KW
2676 }
2677 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2678 if (flags) {
a62b247b 2679 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2680 result = toUPPER_LC(c);
051a06d4
KW
2681 }
2682 else {
a62b247b 2683 return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2684 ustrp, lenp, 's');
051a06d4
KW
2685 }
2686 }
4a4088c4 2687 else { /* UTF-8, ord above 255 */
b9992569 2688 result = CALL_TITLE_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2689
2690 if (flags) {
357aadde 2691 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4
KW
2692 }
2693 return result;
2694 }
2695
4a4088c4 2696 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2697 if (UTF8_IS_INVARIANT(result)) {
2698 *ustrp = (U8) result;
2699 *lenp = 1;
2700 }
2701 else {
62cb07ea
KW
2702 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2703 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2704 *lenp = 2;
3a4c58c9
KW
2705 }
2706
051a06d4 2707 return result;
a0ed51b3
LW
2708}
2709
d3e79532 2710/*
87cea99e 2711=for apidoc to_utf8_lower
d3e79532 2712
1f607577 2713Instead use L</toLOWER_utf8>.
a1433954 2714
d3e79532
JH
2715=cut */
2716
051a06d4 2717/* Not currently externally documented, and subject to change:
31f05a37
KW
2718 * <flags> is set iff iff the rules from the current underlying locale are to
2719 * be used.
2720 */
051a06d4 2721
2104c8d9 2722UV
31f05a37 2723Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2724{
051a06d4
KW
2725 UV result;
2726
051a06d4 2727 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 2728
780fcc9f
KW
2729 if (flags) {
2730 /* Treat a UTF-8 locale as not being in locale at all */
2731 if (IN_UTF8_CTYPE_LOCALE) {
2732 flags = FALSE;
2733 }
2734 else {
2735 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2736 }
31f05a37
KW
2737 }
2738
968c5e6a 2739 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2740 if (flags) {
2741 result = toLOWER_LC(*p);
2742 }
2743 else {
81c6c7ce 2744 return to_lower_latin1(*p, ustrp, lenp);
051a06d4 2745 }
968c5e6a
KW
2746 }
2747 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2748 if (flags) {
a62b247b 2749 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2750 result = toLOWER_LC(c);
051a06d4
KW
2751 }
2752 else {
a62b247b 2753 return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2754 ustrp, lenp);
051a06d4 2755 }
968c5e6a 2756 }
4a4088c4 2757 else { /* UTF-8, ord above 255 */
b9992569 2758 result = CALL_LOWER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2759
2760 if (flags) {
357aadde 2761 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 2762 }
968c5e6a 2763
051a06d4
KW
2764 return result;
2765 }
2766
4a4088c4 2767 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2768 if (UTF8_IS_INVARIANT(result)) {
2769 *ustrp = (U8) result;
2770 *lenp = 1;
2771 }
2772 else {
62cb07ea
KW
2773 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2774 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
2775 *lenp = 2;
2776 }
2777
051a06d4 2778 return result;
b4e400f9
JH
2779}
2780
d3e79532 2781/*
87cea99e 2782=for apidoc to_utf8_fold
d3e79532 2783
1f607577 2784Instead use L</toFOLD_utf8>.
a1433954 2785
d3e79532
JH
2786=cut */
2787
051a06d4
KW
2788/* Not currently externally documented, and subject to change,
2789 * in <flags>
31f05a37
KW
2790 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2791 * locale are to be used.
051a06d4
KW
2792 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
2793 * otherwise simple folds
a0270393
KW
2794 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2795 * prohibited
445bf929 2796 */
36bb2ab6 2797
b4e400f9 2798UV
445bf929 2799Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
b4e400f9 2800{
051a06d4
KW
2801 UV result;
2802
36bb2ab6 2803 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 2804
a0270393
KW
2805 /* These are mutually exclusive */
2806 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2807
50ba90ff
KW
2808 assert(p != ustrp); /* Otherwise overwrites */
2809
780fcc9f
KW
2810 if (flags & FOLD_FLAGS_LOCALE) {
2811 /* Treat a UTF-8 locale as not being in locale at all */
2812 if (IN_UTF8_CTYPE_LOCALE) {
2813 flags &= ~FOLD_FLAGS_LOCALE;
2814 }
2815 else {
2816 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2817 }
31f05a37
KW
2818 }
2819
a1dde8de 2820 if (UTF8_IS_INVARIANT(*p)) {
051a06d4 2821 if (flags & FOLD_FLAGS_LOCALE) {
d22b930b 2822 result = toFOLD_LC(*p);
051a06d4
KW
2823 }
2824 else {
81c6c7ce 2825 return _to_fold_latin1(*p, ustrp, lenp,
1ca267a5 2826 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2827 }
a1dde8de
KW
2828 }
2829 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2830 if (flags & FOLD_FLAGS_LOCALE) {
a62b247b 2831 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2832 result = toFOLD_LC(c);
051a06d4
KW
2833 }
2834 else {
a62b247b 2835 return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
51910141 2836 ustrp, lenp,
1ca267a5 2837 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2838 }
a1dde8de 2839 }
4a4088c4 2840 else { /* UTF-8, ord above 255 */
b9992569 2841 result = CALL_FOLD_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
a1dde8de 2842
1ca267a5
KW
2843 if (flags & FOLD_FLAGS_LOCALE) {
2844
76f2ffcd 2845# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
0766489e
KW
2846 const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
2847
2848# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
2849# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
76f2ffcd
KW
2850
2851 const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
76f2ffcd 2852
538e84ed
KW
2853 /* Special case these two characters, as what normally gets
2854 * returned under locale doesn't work */
76f2ffcd
KW
2855 if (UTF8SKIP(p) == cap_sharp_s_len
2856 && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
1ca267a5 2857 {
ab0b796c
KW
2858 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2859 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2860 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
2861 "resolved to \"\\x{17F}\\x{17F}\".");
1ca267a5
KW
2862 goto return_long_s;
2863 }
0766489e
KW
2864 else
2865#endif
2866 if (UTF8SKIP(p) == long_s_t_len
76f2ffcd 2867 && memEQ((char *) p, LONG_S_T, long_s_t_len))
9fc2026f 2868 {
ab0b796c
KW
2869 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2870 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2871 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
2872 "resolved to \"\\x{FB06}\".");
9fc2026f
KW
2873 goto return_ligature_st;
2874 }
74894415
KW
2875
2876#if UNICODE_MAJOR_VERSION == 3 \
2877 && UNICODE_DOT_VERSION == 0 \
2878 && UNICODE_DOT_DOT_VERSION == 1
2879# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
2880
2881 /* And special case this on this Unicode version only, for the same
2882 * reaons the other two are special cased. They would cross the
2883 * 255/256 boundary which is forbidden under /l, and so the code
2884 * wouldn't catch that they are equivalent (which they are only in
2885 * this release) */
2886 else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
2887 && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
2888 {
2889 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2890 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2891 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
2892 "resolved to \"\\x{0131}\".");
2893 goto return_dotless_i;
2894 }
2895#endif
2896
357aadde 2897 return check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 2898 }
a0270393
KW
2899 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
2900 return result;
2901 }
2902 else {
4a4088c4 2903 /* This is called when changing the case of a UTF-8-encoded
9fc2026f
KW
2904 * character above the ASCII range, and the result should not
2905 * contain an ASCII character. */
a0270393
KW
2906
2907 UV original; /* To store the first code point of <p> */
2908
2909 /* Look at every character in the result; if any cross the
2910 * boundary, the whole thing is disallowed */
2911 U8* s = ustrp;
2912 U8* e = ustrp + *lenp;
2913 while (s < e) {
2914 if (isASCII(*s)) {
2915 /* Crossed, have to return the original */
2916 original = valid_utf8_to_uvchr(p, lenp);
1ca267a5 2917
9fc2026f 2918 /* But in these instances, there is an alternative we can
1ca267a5 2919 * return that is valid */
0766489e
KW
2920 if (original == LATIN_SMALL_LETTER_SHARP_S
2921#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
2922 || original == LATIN_CAPITAL_LETTER_SHARP_S
2923#endif
2924 ) {
1ca267a5
KW
2925 goto return_long_s;
2926 }
9fc2026f
KW
2927 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
2928 goto return_ligature_st;
2929 }
74894415
KW
2930#if UNICODE_MAJOR_VERSION == 3 \
2931 && UNICODE_DOT_VERSION == 0 \
2932 && UNICODE_DOT_DOT_VERSION == 1
2933
2934 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
2935 goto return_dotless_i;
2936 }
2937#endif
a0270393
KW
2938 Copy(p, ustrp, *lenp, char);
2939 return original;
2940 }
2941 s += UTF8SKIP(s);
2942 }
051a06d4 2943
a0270393
KW
2944 /* Here, no characters crossed, result is ok as-is */
2945 return result;
2946 }
051a06d4
KW
2947 }
2948
4a4088c4 2949 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2950 if (UTF8_IS_INVARIANT(result)) {
2951 *ustrp = (U8) result;
2952 *lenp = 1;
2953 }
2954 else {
62cb07ea
KW
2955 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2956 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
2957 *lenp = 2;
2958 }
2959
051a06d4 2960 return result;
1ca267a5
KW
2961
2962 return_long_s:
2963 /* Certain folds to 'ss' are prohibited by the options, but they do allow
2964 * folds to a string of two of these characters. By returning this
2965 * instead, then, e.g.,
2966 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
2967 * works. */
2968
2969 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2970 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2971 ustrp, *lenp, U8);
2972 return LATIN_SMALL_LETTER_LONG_S;
9fc2026f
KW
2973
2974 return_ligature_st:
2975 /* Two folds to 'st' are prohibited by the options; instead we pick one and
2976 * have the other one fold to it */
2977
2978 *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
2979 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
2980 return LATIN_SMALL_LIGATURE_ST;
74894415
KW
2981
2982#if UNICODE_MAJOR_VERSION == 3 \
2983 && UNICODE_DOT_VERSION == 0 \
2984 && UNICODE_DOT_DOT_VERSION == 1
2985
2986 return_dotless_i:
2987 *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
2988 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
2989 return LATIN_SMALL_LETTER_DOTLESS_I;
2990
2991#endif
2992
a0ed51b3
LW
2993}
2994
711a919c 2995/* Note:
f90a9a02 2996 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
711a919c
TS
2997 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
2998 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
2999 */
c4a5db0c 3000
a0ed51b3 3001SV*
7fc63493 3002Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 3003{
c4a5db0c
KW
3004 PERL_ARGS_ASSERT_SWASH_INIT;
3005
3006 /* Returns a copy of a swash initiated by the called function. This is the
3007 * public interface, and returning a copy prevents others from doing
3008 * mischief on the original */
3009
5d3d13d1 3010 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
c4a5db0c
KW
3011}
3012
3013SV*
5d3d13d1 3014Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
c4a5db0c 3015{
2c1f00b9
YO
3016
3017 /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
3018 * use the following define */
3019
3020#define CORE_SWASH_INIT_RETURN(x) \
3021 PL_curpm= old_PL_curpm; \
3022 return x
3023
c4a5db0c 3024 /* Initialize and return a swash, creating it if necessary. It does this
87367d5f
KW
3025 * by calling utf8_heavy.pl in the general case. The returned value may be
3026 * the swash's inversion list instead if the input parameters allow it.
3027 * Which is returned should be immaterial to callers, as the only
923b6d4e
KW
3028 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
3029 * and swash_to_invlist() handle both these transparently.
c4a5db0c
KW
3030 *
3031 * This interface should only be used by functions that won't destroy or
3032 * adversely change the swash, as doing so affects all other uses of the
3033 * swash in the program; the general public should use 'Perl_swash_init'
3034 * instead.
3035 *
3036 * pkg is the name of the package that <name> should be in.
3037 * name is the name of the swash to find. Typically it is a Unicode
3038 * property name, including user-defined ones
3039 * listsv is a string to initialize the swash with. It must be of the form
3040 * documented as the subroutine return value in
3041 * L<perlunicode/User-Defined Character Properties>
3042 * minbits is the number of bits required to represent each data element.
3043 * It is '1' for binary properties.
3044 * none I (khw) do not understand this one, but it is used only in tr///.
9a53f6cf 3045 * invlist is an inversion list to initialize the swash with (or NULL)
83199d38
KW
3046 * flags_p if non-NULL is the address of various input and output flag bits
3047 * to the routine, as follows: ('I' means is input to the routine;
3048 * 'O' means output from the routine. Only flags marked O are
3049 * meaningful on return.)
3050 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
3051 * came from a user-defined property. (I O)
5d3d13d1
KW
3052 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
3053 * when the swash cannot be located, to simply return NULL. (I)
87367d5f
KW
3054 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
3055 * return of an inversion list instead of a swash hash if this routine
3056 * thinks that would result in faster execution of swash_fetch() later
3057 * on. (I)
9a53f6cf
KW
3058 *
3059 * Thus there are three possible inputs to find the swash: <name>,
3060 * <listsv>, and <invlist>. At least one must be specified. The result
3061 * will be the union of the specified ones, although <listsv>'s various
aabbdbda
KW
3062 * actions can intersect, etc. what <name> gives. To avoid going out to
3063 * disk at all, <invlist> should specify completely what the swash should
3064 * have, and <listsv> should be &PL_sv_undef and <name> should be "".
9a53f6cf
KW
3065 *
3066 * <invlist> is only valid for binary properties */
c4a5db0c 3067
2c1f00b9
YO
3068 PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
3069
c4a5db0c 3070 SV* retval = &PL_sv_undef;
83199d38 3071 HV* swash_hv = NULL;
87367d5f
KW
3072 const int invlist_swash_boundary =
3073 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
3074 ? 512 /* Based on some benchmarking, but not extensive, see commit
3075 message */
3076 : -1; /* Never return just an inversion list */
9a53f6cf
KW
3077
3078 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
3079 assert(! invlist || minbits == 1);
3080
2c1f00b9
YO
3081 PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
3082 that triggered the swash init and the swash init perl logic itself.
3083 See perl #122747 */
3084
9a53f6cf
KW
3085 /* If data was passed in to go out to utf8_heavy to find the swash of, do
3086 * so */
3087 if (listsv != &PL_sv_undef || strNE(name, "")) {
69794297
KW
3088 dSP;
3089 const size_t pkg_len = strlen(pkg);
3090 const size_t name_len = strlen(name);
3091 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
3092 SV* errsv_save;
3093 GV *method;
3094
3095 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
3096
3097 PUSHSTACKi(PERLSI_MAGIC);
ce3b816e 3098 ENTER;
69794297 3099 SAVEHINTS();
2782061f 3100 save_re_context();
650f067c
JL
3101 /* We might get here via a subroutine signature which uses a utf8
3102 * parameter name, at which point PL_subname will have been set
3103 * but not yet used. */
3104 save_item(PL_subname);
69794297
KW
3105 if (PL_parser && PL_parser->error_count)
3106 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
3107 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
4a4088c4 3108 if (!method) { /* demand load UTF-8 */
69794297 3109 ENTER;
db2c6cb3
FC
3110 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3111 GvSV(PL_errgv) = NULL;
1a419e6b 3112#ifndef NO_TAINT_SUPPORT
69794297
KW
3113 /* It is assumed that callers of this routine are not passing in
3114 * any user derived data. */
2782061f
DM
3115 /* Need to do this after save_re_context() as it will set
3116 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
3117 * in Perl_magic_get). Even line to create errsv_save can turn on
3118 * PL_tainted. */
284167a5
S
3119 SAVEBOOL(TAINT_get);
3120 TAINT_NOT;
3121#endif
69794297
KW
3122 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
3123 NULL);
eed484f9 3124 {
db2c6cb3
FC
3125 /* Not ERRSV, as there is no need to vivify a scalar we are
3126 about to discard. */
3127 SV * const errsv = GvSV(PL_errgv);
3128 if (!SvTRUE(errsv)) {
3129 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3130 SvREFCNT_dec(errsv);
3131 }
eed484f9 3132 }
69794297
KW
3133 LEAVE;
3134 }
3135 SPAGAIN;
3136 PUSHMARK(SP);
3137 EXTEND(SP,5);
3138 mPUSHp(pkg, pkg_len);
3139 mPUSHp(name, name_len);
3140 PUSHs(listsv);
3141 mPUSHi(minbits);
3142 mPUSHi(none);
3143 PUTBACK;
db2c6cb3
FC
3144 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3145 GvSV(PL_errgv) = NULL;
69794297
KW
3146 /* If we already have a pointer to the method, no need to use
3147 * call_method() to repeat the lookup. */
c41800a8
KW
3148 if (method
3149 ? call_sv(MUTABLE_SV(method), G_SCALAR)
69794297
KW
3150 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
3151 {
3152 retval = *PL_stack_sp--;
3153 SvREFCNT_inc(retval);
3154 }
eed484f9 3155 {
db2c6cb3
FC
3156 /* Not ERRSV. See above. */
3157 SV * const errsv = GvSV(PL_errgv);
3158 if (!SvTRUE(errsv)) {
3159 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3160 SvREFCNT_dec(errsv);
3161 }
eed484f9 3162 }
ce3b816e 3163 LEAVE;
69794297
KW
3164 POPSTACK;
3165 if (IN_PERL_COMPILETIME) {
3166 CopHINTS_set(PL_curcop, PL_hints);
3167 }
3168 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
d95e4a00 3169 if (SvPOK(retval)) {
69794297
KW
3170
3171 /* If caller wants to handle missing properties, let them */
5d3d13d1 3172 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
2c1f00b9 3173 CORE_SWASH_INIT_RETURN(NULL);
69794297
KW
3174 }
3175 Perl_croak(aTHX_
3176 "Can't find Unicode property definition \"%"SVf"\"",
3177 SVfARG(retval));
a25b5927 3178 NOT_REACHED; /* NOTREACHED */
d95e4a00 3179 }
69794297 3180 }
9a53f6cf 3181 } /* End of calling the module to find the swash */
36eb48b4 3182
83199d38
KW
3183 /* If this operation fetched a swash, and we will need it later, get it */
3184 if (retval != &PL_sv_undef
3185 && (minbits == 1 || (flags_p
3186 && ! (*flags_p
3187 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
3188 {
3189 swash_hv = MUTABLE_HV(SvRV(retval));
3190
3191 /* If we don't already know that there is a user-defined component to
3192 * this swash, and the user has indicated they wish to know if there is
3193 * one (by passing <flags_p>), find out */
3194 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
3195 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
3196 if (user_defined && SvUV(*user_defined)) {
3197 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
3198 }
3199 }
3200 }
3201
36eb48b4
KW
3202 /* Make sure there is an inversion list for binary properties */
3203 if (minbits == 1) {
3204 SV** swash_invlistsvp = NULL;
3205 SV* swash_invlist = NULL;
9a53f6cf 3206 bool invlist_in_swash_is_valid = FALSE;
02c85471
FC
3207 bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
3208 an unclaimed reference count */
36eb48b4 3209
9a53f6cf 3210 /* If this operation fetched a swash, get its already existing
83199d38 3211 * inversion list, or create one for it */
36eb48b4 3212
83199d38 3213 if (swash_hv) {
5c9f4bd2 3214 swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
9a53f6cf
KW
3215 if (swash_invlistsvp) {
3216 swash_invlist = *swash_invlistsvp;
3217 invlist_in_swash_is_valid = TRUE;
3218 }
3219 else {
36eb48b4 3220 swash_invlist = _swash_to_invlist(retval);
02c85471 3221 swash_invlist_unclaimed = TRUE;
9a53f6cf
KW
3222 }
3223 }
3224
3225 /* If an inversion list was passed in, have to include it */
3226 if (invlist) {
3227
3228 /* Any fetched swash will by now have an inversion list in it;
3229 * otherwise <swash_invlist> will be NULL, indicating that we
3230 * didn't fetch a swash */
3231 if (swash_invlist) {
3232
3233 /* Add the passed-in inversion list, which invalidates the one
3234 * already stored in the swash */
3235 invlist_in_swash_is_valid = FALSE;
3236 _invlist_union(invlist, swash_invlist, &swash_invlist);
3237 }
3238 else {
3239
87367d5f
KW
3240 /* Here, there is no swash already. Set up a minimal one, if
3241 * we are going to return a swash */
3242 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
971d486f 3243 swash_hv = newHV();
4aca0fe6 3244 retval = newRV_noinc(MUTABLE_SV(swash_hv));
87367d5f 3245 }
9a53f6cf
KW
3246 swash_invlist = invlist;
3247 }
9a53f6cf
KW
3248 }
3249
3250 /* Here, we have computed the union of all the passed-in data. It may
3251 * be that there was an inversion list in the swash which didn't get
538e84ed 3252 * touched; otherwise save the computed one */
87367d5f
KW
3253 if (! invlist_in_swash_is_valid
3254 && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
3255 {
5c9f4bd2 3256 if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
69794297
KW
3257 {
3258 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3259 }
cc34d8c5
FC
3260 /* We just stole a reference count. */
3261 if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
3262 else SvREFCNT_inc_simple_void_NN(swash_invlist);
9a53f6cf 3263 }
87367d5f 3264
dbfdbd26
KW
3265 SvREADONLY_on(swash_invlist);
3266
c41800a8 3267 /* Use the inversion list stand-alone if small enough */
87367d5f
KW
3268 if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
3269 SvREFCNT_dec(retval);
02c85471
FC
3270 if (!swash_invlist_unclaimed)
3271 SvREFCNT_inc_simple_void_NN(swash_invlist);
3272 retval = newRV_noinc(swash_invlist);
87367d5f 3273 }
36eb48b4
KW
3274 }
3275
2c1f00b9
YO
3276 CORE_SWASH_INIT_RETURN(retval);
3277#undef CORE_SWASH_INIT_RETURN
a0ed51b3
LW
3278}
3279
035d37be
JH
3280
3281/* This API is wrong for special case conversions since we may need to
3282 * return several Unicode characters for a single Unicode character
3283 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
3284 * the lower-level routine, and it is similarly broken for returning
38684baa 3285 * multiple values. --jhi
b9992569 3286 * For those, you should use S__to_utf8_case() instead */
b0e3252e 3287/* Now SWASHGET is recasted into S_swatch_get in this file. */
680c470c
TS
3288
3289/* Note:
3290 * Returns the value of property/mapping C<swash> for the first character
3291 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
4a4088c4 3292 * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
3d0f8846 3293 * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
af2af982
KW
3294 *
3295 * A "swash" is a hash which contains initially the keys/values set up by
3296 * SWASHNEW. The purpose is to be able to completely represent a Unicode
3297 * property for all possible code points. Things are stored in a compact form
3298 * (see utf8_heavy.pl) so that calculation is required to find the actual
3299 * property value for a given code point. As code points are looked up, new
3300 * key/value pairs are added to the hash, so that the calculation doesn't have
3301 * to ever be re-done. Further, each calculation is done, not just for the
3302 * desired one, but for a whole block of code points adjacent to that one.
3303 * For binary properties on ASCII machines, the block is usually for 64 code
3304 * points, starting with a code point evenly divisible by 64. Thus if the
3305 * property value for code point 257 is requested, the code goes out and
3306 * calculates the property values for all 64 code points between 256 and 319,
3307 * and stores these as a single 64-bit long bit vector, called a "swatch",
3308 * under the key for code point 256. The key is the UTF-8 encoding for code
3309 * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
3310 * for a code point is 13 bytes, the key will be 12 bytes long. If the value
3311 * for code point 258 is then requested, this code realizes that it would be
3312 * stored under the key for 256, and would find that value and extract the
3313 * relevant bit, offset from 256.
3314 *
3315 * Non-binary properties are stored in as many bits as necessary to represent
3316 * their values (32 currently, though the code is more general than that), not
fc273927 3317 * as single bits, but the principle is the same: the value for each key is a
af2af982
KW
3318 * vector that encompasses the property values for all code points whose UTF-8
3319 * representations are represented by the key. That is, for all code points
3320 * whose UTF-8 representations are length N bytes, and the key is the first N-1
3321 * bytes of that.
680c470c 3322 */
a0ed51b3 3323UV
680c470c 3324Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 3325{
ef8f7699 3326 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
3327 U32 klen;
3328 U32 off;
9b56a019 3329 STRLEN slen = 0;
7d85a32c 3330 STRLEN needents;
cfd0369c 3331 const U8 *tmps = NULL;
979f2922 3332 SV *swatch;
08fb1ac5 3333 const U8 c = *ptr;
3568d838 3334
7918f24d
NC
3335 PERL_ARGS_ASSERT_SWASH_FETCH;
3336
87367d5f
KW
3337 /* If it really isn't a hash, it isn't really swash; must be an inversion
3338 * list */
3339 if (SvTYPE(hv) != SVt_PVHV) {
3340 return _invlist_contains_cp((SV*)hv,
3341 (do_utf8)
3342 ? valid_utf8_to_uvchr(ptr, NULL)
3343 : c);
3344 }
3345
08fb1ac5
KW
3346 /* We store the values in a "swatch" which is a vec() value in a swash
3347 * hash. Code points 0-255 are a single vec() stored with key length
3348 * (klen) 0. All other code points have a UTF-8 representation
3349 * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
3350 * share 0xAA..0xYY, which is the key in the hash to that vec. So the key
3351 * length for them is the length of the encoded char - 1. ptr[klen] is the
3352 * final byte in the sequence representing the character */
3353 if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
3354 klen = 0;
3355 needents = 256;
3356 off = c;
3568d838 3357 }
08fb1ac5
KW
3358 else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
3359 klen = 0;
3360 needents = 256;
a62b247b 3361 off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
979f2922
TS
3362 }
3363 else {
08fb1ac5
KW
3364 klen = UTF8SKIP(ptr) - 1;
3365
3366 /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
3367 * the vec is the final byte in the sequence. (In EBCDIC this is
3368 * converted to I8 to get consecutive values.) To help you visualize
3369 * all this:
3370 * Straight 1047 After final byte
3371 * UTF-8 UTF-EBCDIC I8 transform
3372 * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
3373 * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
3374 * ...
3375 * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
3376 * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
3377 * ...
3378 * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
3379 * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
3380 * ...
3381 * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
3382 * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
3383 * ...
3384 * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
3385 * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
3386 *
3387 * (There are no discontinuities in the elided (...) entries.)
3388 * The UTF-8 key for these 33 code points is '\xD0' (which also is the
3389 * key for the next 31, up through U+043F, whose UTF-8 final byte is
3390 * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
3391 * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
3392 * index into the vec() swatch (after subtracting 0x80, which we
3393 * actually do with an '&').
3394 * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
3395 * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
3396 * dicontinuities which go away by transforming it into I8, and we
3397 * effectively subtract 0xA0 to get the index. */
979f2922 3398 needents = (1 << UTF_ACCUMULATION_SHIFT);
bc3632a8 3399 off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
979f2922 3400 }
7d85a32c 3401
a0ed51b3 3402 /*
4a4088c4 3403 * This single-entry cache saves about 1/3 of the UTF-8 overhead in test
a0ed51b3
LW
3404 * suite. (That is, only 7-8% overall over just a hash cache. Still,
3405 * it's nothing to sniff at.) Pity we usually come through at least
3406 * two function calls to get here...
3407 *
3408 * NB: this code assumes that swatches are never modified, once generated!
3409 */
3410
3568d838 3411 if (hv == PL_last_swash_hv &&
a0ed51b3 3412 klen == PL_last_swash_klen &&
27da23d5 3413 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
3414 {
3415 tmps = PL_last_swash_tmps;
3416 slen = PL_last_swash_slen;
3417 }
3418 else {
3419 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 3420 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 3421
b0e3252e 3422 /* If not cached, generate it via swatch_get */
979f2922 3423 if (!svp || !SvPOK(*svp)
08fb1ac5
KW
3424 || !(tmps = (const U8*)SvPV_const(*svp, slen)))
3425 {
3426 if (klen) {
3427 const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
3428 swatch = swatch_get(swash,
3429 code_point & ~((UV)needents - 1),
3430 needents);
3431 }
3432 else { /* For the first 256 code points, the swatch has a key of
3433 length 0 */
3434 swatch = swatch_get(swash, 0, needents);
3435 }
979f2922 3436
923e4eb5 3437 if (IN_PERL_COMPILETIME)
623e6609 3438 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 3439
979f2922 3440 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 3441
979f2922
TS
3442 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
3443 || (slen << 3) < needents)
5637ef5b
NC
3444 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
3445 "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
3446 svp, tmps, (UV)slen, (UV)needents);
a0ed51b3
LW
3447 }
3448
3449 PL_last_swash_hv = hv;
16d8f38a 3450 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 3451 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
3452 /* FIXME change interpvar.h? */
3453 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
3454 PL_last_swash_slen = slen;
3455 if (klen)
3456 Copy(ptr, PL_last_swash_key, klen, U8);
3457 }
3458
9faf8d75 3459 switch ((int)((slen << 3) / needents)) {
a0ed51b3 3460 case 1:
e7aca353 3461 return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
a0ed51b3 3462 case 8:
e7aca353 3463 return ((UV) tmps[off]);
a0ed51b3
LW
3464 case 16:
3465 off <<= 1;
e7aca353
JH
3466 return
3467 ((UV) tmps[off ] << 8) +
3468 ((UV) tmps[off + 1]);
a0ed51b3
LW
3469 case 32:
3470 off <<= 2;
e7aca353
JH
3471 return
3472 ((UV) tmps[off ] << 24) +
3473 ((UV) tmps[off + 1] << 16) +
3474 ((UV) tmps[off + 2] << 8) +
3475 ((UV) tmps[off + 3]);
a0ed51b3 3476 }
5637ef5b
NC
3477 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
3478 "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
670f1322 3479 NORETURN_FUNCTION_END;
a0ed51b3 3480}
2b9d42f0 3481
319009ee
KW
3482/* Read a single line of the main body of the swash input text. These are of
3483 * the form:
3484 * 0053 0056 0073
3485 * where each number is hex. The first two numbers form the minimum and
3486 * maximum of a range, and the third is the value associated with the range.
3487 * Not all swashes should have a third number
3488 *
3489 * On input: l points to the beginning of the line to be examined; it points
3490 * to somewhere in the string of the whole input text, and is
3491 * terminated by a \n or the null string terminator.
3492 * lend points to the null terminator of that string
3493 * wants_value is non-zero if the swash expects a third number
3494 * typestr is the name of the swash's mapping, like 'ToLower'
3495 * On output: *min, *max, and *val are set to the values read from the line.
3496 * returns a pointer just beyond the line examined. If there was no
3497 * valid min number on the line, returns lend+1
3498 */
3499
3500STATIC U8*
3501S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3502 const bool wants_value, const U8* const typestr)
3503{
3504 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
3505 STRLEN numlen; /* Length of the number */
02470786
KW
3506 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3507 | PERL_SCAN_DISALLOW_PREFIX
3508 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3509
3510 /* nl points to the next \n in the scan */
3511 U8* const nl = (U8*)memchr(l, '\n', lend - l);
3512
95543e92
KW
3513 PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
3514
319009ee
KW
3515 /* Get the first number on the line: the range minimum */
3516 numlen = lend - l;
3517 *min = grok_hex((char *)l, &numlen, &flags, NULL);
c88850db 3518 *max = *min; /* So can never return without setting max */
319009ee
KW
3519 if (numlen) /* If found a hex number, position past it */
3520 l += numlen;
3521 else if (nl) { /* Else, go handle next line, if any */
3522 return nl + 1; /* 1 is length of "\n" */
3523 }
3524 else { /* Else, no next line */
3525 return lend + 1; /* to LIST's end at which \n is not found */
3526 }
3527
3528 /* The max range value follows, separated by a BLANK */
3529 if (isBLANK(*l)) {
3530 ++l;
02470786
KW
3531 flags = PERL_SCAN_SILENT_ILLDIGIT
3532 | PERL_SCAN_DISALLOW_PREFIX
3533 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3534 numlen = lend - l;
3535 *max = grok_hex((char *)l, &numlen, &flags, NULL);
3536 if (numlen)
3537 l += numlen;
3538 else /* If no value here, it is a single element range */
3539 *max = *min;
3540
3541 /* Non-binary tables have a third entry: what the first element of the
24303724 3542 * range maps to. The map for those currently read here is in hex */
319009ee
KW
3543 if (wants_value) {
3544 if (isBLANK(*l)) {
3545 ++l;
f2a7d0fc
KW
3546 flags = PERL_SCAN_SILENT_ILLDIGIT
3547 | PERL_SCAN_DISALLOW_PREFIX
3548 | PERL_SCAN_SILENT_NON_PORTABLE;
3549 numlen = lend - l;
3550 *val = grok_hex((char *)l, &numlen, &flags, NULL);
3551 if (numlen)
3552 l += numlen;
3553 else
3554 *val = 0;
319009ee
KW
3555 }
3556 else {
3557 *val = 0;
3558 if (typeto) {
dcbac5bb 3559 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
3560 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
3561 typestr, l);
3562 }
3563 }
3564 }
3565 else
3566 *val = 0; /* bits == 1, then any val should be ignored */
3567 }
3568 else { /* Nothing following range min, should be single element with no
3569 mapping expected */
319009ee
KW
3570 if (wants_value) {
3571 *val = 0;
3572 if (typeto) {
dcbac5bb 3573 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
3574 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
3575 }
3576 }
3577 else
3578 *val = 0; /* bits == 1, then val should be ignored */
3579 }
3580
3581 /* Position to next line if any, or EOF */
3582 if (nl)
3583 l = nl + 1;
3584 else
3585 l = lend;
3586
3587 return l;
3588}
3589
979f2922
TS
3590/* Note:
3591 * Returns a swatch (a bit vector string) for a code point sequence
3592 * that starts from the value C<start> and comprises the number C<span>.
3593 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3594 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
3595 */
3596STATIC SV*
b0e3252e 3597S_swatch_get(pTHX_ SV* swash, UV start, UV span)
979f2922
TS
3598{
3599 SV *swatch;
77f9f126 3600 U8 *l, *lend, *x, *xend, *s, *send;
979f2922 3601 STRLEN lcur, xcur, scur;
ef8f7699 3602 HV *const hv = MUTABLE_HV(SvRV(swash));
5c9f4bd2 3603 SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
36eb48b4 3604
88d45d28
KW
3605 SV** listsvp = NULL; /* The string containing the main body of the table */
3606 SV** extssvp = NULL;
3607 SV** invert_it_svp = NULL;
3608 U8* typestr = NULL;
786861f5
KW
3609 STRLEN bits;
3610 STRLEN octets; /* if bits == 1, then octets == 0 */
3611 UV none;
3612 UV end = start + span;
972dd592 3613
36eb48b4 3614 if (invlistsvp == NULL) {
786861f5
KW
3615 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3616 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3617 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3618 extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3619 listsvp = hv_fetchs(hv, "LIST", FALSE);
3620 invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3621
3622 bits = SvUV(*bitssvp);
3623 none = SvUV(*nonesvp);
3624 typestr = (U8*)SvPV_nolen(*typesvp);
3625 }
36eb48b4
KW
3626 else {
3627 bits = 1;
3628 none = 0;
3629 }
786861f5 3630 octets = bits >> 3; /* if bits == 1, then octets == 0 */
979f2922 3631
b0e3252e 3632 PERL_ARGS_ASSERT_SWATCH_GET;
7918f24d 3633
979f2922 3634 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
b0e3252e 3635 Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
660a4616 3636 (UV)bits);
979f2922
TS
3637 }
3638
84ea5ef6
KW
3639 /* If overflowed, use the max possible */
3640 if (end < start) {
3641 end = UV_MAX;
3642 span = end - start;
3643 }
3644
979f2922 3645 /* create and initialize $swatch */
979f2922 3646 scur = octets ? (span * octets) : (span + 7) / 8;
e524fe40
NC
3647 swatch = newSV(scur);
3648 SvPOK_on(swatch);
979f2922
TS
3649 s = (U8*)SvPVX(swatch);
3650 if (octets && none) {
0bd48802 3651 const U8* const e = s + scur;
979f2922
TS
3652 while (s < e) {
3653 if (bits == 8)
3654 *s++ = (U8)(none & 0xff);
3655 else if (bits == 16) {
3656 *s++ = (U8)((none >> 8) & 0xff);
3657 *s++ = (U8)( none & 0xff);
3658 }
3659 else if (bits == 32) {
3660 *s++ = (U8)((none >> 24) & 0xff);
3661 *s++ = (U8)((none >> 16) & 0xff);
3662 *s++ = (U8)((none >> 8) & 0xff);
3663 *s++ = (U8)( none & 0xff);
3664 }
3665 }
3666 *s = '\0';
3667 }
3668 else {
3669 (void)memzero((U8*)s, scur + 1);
3670 }
3671 SvCUR_set(swatch, scur);
3672 s = (U8*)SvPVX(swatch);
3673
36eb48b4
KW
3674 if (invlistsvp) { /* If has an inversion list set up use that */
3675 _invlist_populate_swatch(*invlistsvp, start, end, s);
3676 return swatch;
3677 }
3678
3679 /* read $swash->{LIST} */
979f2922
TS
3680 l = (U8*)SvPV(*listsvp, lcur);
3681 lend = l + lcur;
3682 while (l < lend) {
8ed25d53 3683 UV min, max, val, upper;
95543e92
KW
3684 l = swash_scan_list_line(l, lend, &min, &max, &val,
3685 cBOOL(octets), typestr);
319009ee 3686 if (l > lend) {
979f2922
TS
3687 break;
3688 }
3689
972dd592 3690 /* If looking for something beyond this range, go try the next one */
979f2922
TS
3691 if (max < start)
3692 continue;
3693
8ed25d53
KW
3694 /* <end> is generally 1 beyond where we want to set things, but at the
3695 * platform's infinity, where we can't go any higher, we want to
3696 * include the code point at <end> */
3697 upper = (max < end)
3698 ? max
3699 : (max != UV_MAX || end != UV_MAX)
3700 ? end - 1
3701 : end;
3702
979f2922 3703 if (octets) {
35da51f7 3704 UV key;
979f2922
TS
3705 if (min < start) {
3706 if (!none || val < none) {
3707 val += start - min;
3708 }
3709 min = start;
3710 }
8ed25d53 3711 for (key = min; key <= upper; key++) {
979f2922 3712 STRLEN offset;
979f2922
TS
3713 /* offset must be non-negative (start <= min <= key < end) */
3714 offset = octets * (key - start);
3715 if (bits == 8)
3716 s[offset] = (U8)(val & 0xff);
3717 else if (bits == 16) {
3718 s[offset ] = (U8)((val >> 8) & 0xff);
3719 s[offset + 1] = (U8)( val & 0xff);
3720 }
3721 else if (bits == 32) {
3722 s[offset ] = (U8)((val >> 24) & 0xff);
3723 s[offset + 1] = (U8)((val >> 16) & 0xff);
3724 s[offset + 2] = (U8)((val >> 8) & 0xff);
3725 s[offset + 3] = (U8)( val & 0xff);
3726 }
3727
3728 if (!none || val < none)
3729 ++val;
3730 }
3731 }
711a919c 3732 else { /* bits == 1, then val should be ignored */
35da51f7 3733 UV key;
979f2922
TS
3734 if (min < start)