This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/utf8.t: Partially refactor to use table data
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
a0ed51b3 35
806547a7 36static const char malformed_text[] = "Malformed UTF-8 character";
27da23d5 37static const char unees[] =
806547a7 38 "Malformed UTF-8 character (unexpected end of string)";
760c7c2f 39static const char cp_above_legal_max[] =
147e3846 40 "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf;
760c7c2f 41
114d9c4d 42#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
901b21bf 43
48ef279e 44/*
ccfc67b7 45=head1 Unicode Support
7fefc6c1 46These are various utility functions for manipulating UTF8-encoded
72d33970 47strings. For the uninitiated, this is a method of representing arbitrary
61296642 48Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
49characters in the ASCII range are unmodified, and a zero byte never appears
50within non-zero characters.
166f8a29 51
eaf7a4d2
CS
52=cut
53*/
54
55/*
378516de 56=for apidoc uvoffuni_to_utf8_flags
eebe1485 57
a27992cc 58THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af
KW
59Instead, B<Almost all code should use L</uvchr_to_utf8> or
60L</uvchr_to_utf8_flags>>.
a27992cc 61
de69f3af
KW
62This function is like them, but the input is a strict Unicode
63(as opposed to native) code point. Only in very rare circumstances should code
64not be using the native code point.
949cf498 65
efa9cd84 66For details, see the description for L</uvchr_to_utf8_flags>.
949cf498 67
eebe1485
SC
68=cut
69*/
70
8ee1cdcb
KW
71#define HANDLE_UNICODE_SURROGATE(uv, flags) \
72 STMT_START { \
73 if (flags & UNICODE_WARN_SURROGATE) { \
74 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
147e3846 75 "UTF-16 surrogate U+%04" UVXf, uv); \
8ee1cdcb
KW
76 } \
77 if (flags & UNICODE_DISALLOW_SURROGATE) { \
78 return NULL; \
79 } \
80 } STMT_END;
81
82#define HANDLE_UNICODE_NONCHAR(uv, flags) \
83 STMT_START { \
84 if (flags & UNICODE_WARN_NONCHAR) { \
85 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
147e3846 86 "Unicode non-character U+%04" UVXf " is not " \
8ee1cdcb
KW
87 "recommended for open interchange", uv); \
88 } \
89 if (flags & UNICODE_DISALLOW_NONCHAR) { \
90 return NULL; \
91 } \
92 } STMT_END;
93
ba6ed43c
KW
94/* Use shorter names internally in this file */
95#define SHIFT UTF_ACCUMULATION_SHIFT
96#undef MARK
97#define MARK UTF_CONTINUATION_MARK
98#define MASK UTF_CONTINUATION_MASK
99
dfe13c55 100U8 *
378516de 101Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 102{
378516de 103 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
7918f24d 104
2d1545e5 105 if (OFFUNI_IS_INVARIANT(uv)) {
4c8cd605 106 *d++ = LATIN1_TO_NATIVE(uv);
d9432125
KW
107 return d;
108 }
facc1dc2 109
3ea68d71 110 if (uv <= MAX_UTF8_TWO_BYTE) {
facc1dc2
KW
111 *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
112 *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
3ea68d71
KW
113 return d;
114 }
d9432125 115
ba6ed43c
KW
116 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
117 * below, the 16 is for start bytes E0-EF (which are all the possible ones
118 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
119 * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
120 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
121 * 0x800-0xFFFF on ASCII */
122 if (uv < (16 * (1U << (2 * SHIFT)))) {
123 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
124 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
125 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
126
127#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
128 aren't tested here */
129 /* The most likely code points in this range are below the surrogates.
130 * Do an extra test to quickly exclude those. */
131 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
132 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
133 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
134 {
8ee1cdcb
KW
135 HANDLE_UNICODE_NONCHAR(uv, flags);
136 }
137 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
138 HANDLE_UNICODE_SURROGATE(uv, flags);
760c7c2f 139 }
ba6ed43c
KW
140 }
141#endif
142 return d;
143 }
144
145 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
146 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
147 * happen starting with 4-byte characters on ASCII platforms. We unify the
148 * code for these with EBCDIC, even though some of them require 5-bytes on
149 * those, because khw believes the code saving is worth the very slight
150 * performance hit on these high EBCDIC code points. */
151
152 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
a5bf80e0
KW
153 if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
154 && ckWARN_d(WARN_DEPRECATED))
155 {
156 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
157 cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
158 }
159 if ( (flags & UNICODE_WARN_SUPER)
160 || ( UNICODE_IS_ABOVE_31_BIT(uv)
161 && (flags & UNICODE_WARN_ABOVE_31_BIT)))
162 {
163 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
164
165 /* Choose the more dire applicable warning */
166 (UNICODE_IS_ABOVE_31_BIT(uv))
147e3846
KW
167 ? "Code point 0x%" UVXf " is not Unicode, and not portable"
168 : "Code point 0x%" UVXf " is not Unicode, may not be portable",
a5bf80e0
KW
169 uv);
170 }
171 if (flags & UNICODE_DISALLOW_SUPER
172 || ( UNICODE_IS_ABOVE_31_BIT(uv)
173 && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
174 {
175 return NULL;
176 }
177 }
ba6ed43c
KW
178 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
179 HANDLE_UNICODE_NONCHAR(uv, flags);
507b9800 180 }
d9432125 181
ba6ed43c
KW
182 /* Test for and handle 4-byte result. In the test immediately below, the
183 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
184 * characters). The 3 is for 3 continuation bytes; these each contribute
185 * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
186 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
187 * 0x1_0000-0x1F_FFFF on ASCII */
188 if (uv < (8 * (1U << (3 * SHIFT)))) {
189 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
190 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
191 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
192 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
193
194#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
195 characters. The end-plane non-characters for EBCDIC were
196 handled just above */
197 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
198 HANDLE_UNICODE_NONCHAR(uv, flags);
d528804a 199 }
ba6ed43c
KW
200 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
201 HANDLE_UNICODE_SURROGATE(uv, flags);
202 }
203#endif
204
205 return d;
206 }
207
208 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
209 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
210 * format. The unrolled version above turns out to not save all that much
211 * time, and at these high code points (well above the legal Unicode range
212 * on ASCII platforms, and well above anything in common use in EBCDIC),
213 * khw believes that less code outweighs slight performance gains. */
214
d9432125 215 {
5aaebcb3 216 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
217 U8 *p = d+len-1;
218 while (p > d) {
4c8cd605 219 *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
220 uv >>= UTF_ACCUMULATION_SHIFT;
221 }
4c8cd605 222 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
223 return d+len;
224 }
a0ed51b3 225}
a5bf80e0 226
646ca15d 227/*
07693fe6
KW
228=for apidoc uvchr_to_utf8
229
bcb1a2d4 230Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 231of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
232C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
233the byte after the end of the new character. In other words,
07693fe6
KW
234
235 d = uvchr_to_utf8(d, uv);
236
237is the recommended wide native character-aware way of saying
238
239 *(d++) = uv;
240
760c7c2f
KW
241This function accepts any UV as input, but very high code points (above
242C<IV_MAX> on the platform) will raise a deprecation warning. This is
243typically 0x7FFF_FFFF in a 32-bit word.
244
245It is possible to forbid or warn on non-Unicode code points, or those that may
246be problematic by using L</uvchr_to_utf8_flags>.
de69f3af 247
07693fe6
KW
248=cut
249*/
250
de69f3af
KW
251/* This is also a macro */
252PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
253
07693fe6
KW
254U8 *
255Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
256{
de69f3af 257 return uvchr_to_utf8(d, uv);
07693fe6
KW
258}
259
de69f3af
KW
260/*
261=for apidoc uvchr_to_utf8_flags
262
263Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 264of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
265C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
266the byte after the end of the new character. In other words,
de69f3af
KW
267
268 d = uvchr_to_utf8_flags(d, uv, flags);
269
270or, in most cases,
271
272 d = uvchr_to_utf8_flags(d, uv, 0);
273
274This is the Unicode-aware way of saying
275
276 *(d++) = uv;
277
760c7c2f
KW
278If C<flags> is 0, this function accepts any UV as input, but very high code
279points (above C<IV_MAX> for the platform) will raise a deprecation warning.
280This is typically 0x7FFF_FFFF in a 32-bit word.
281
282Specifying C<flags> can further restrict what is allowed and not warned on, as
283follows:
de69f3af 284
796b6530 285If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
7ee537e6
KW
286the function will raise a warning, provided UTF8 warnings are enabled. If
287instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
288NULL. If both flags are set, the function will both warn and return NULL.
de69f3af 289
760c7c2f
KW
290Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
291affect how the function handles a Unicode non-character.
93e6dbd6 292
760c7c2f
KW
293And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
294affect the handling of code points that are above the Unicode maximum of
2950x10FFFF. Languages other than Perl may not be able to accept files that
296contain these.
93e6dbd6
KW
297
298The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
299the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
ecc1615f
KW
300three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
301allowed inputs to the strict UTF-8 traditionally defined by Unicode.
302Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
303C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
304above-Unicode and surrogate flags, but not the non-character ones, as
305defined in
306L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
307See L<perlunicode/Noncharacter code points>.
93e6dbd6 308
ab8e6d41
KW
309Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
310so using them is more problematic than other above-Unicode code points. Perl
311invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
312likely that non-Perl languages will not be able to read files that contain
313these that written by the perl interpreter; nor would Perl understand files
314written by something that uses a different extension. For these reasons, there
315is a separate set of flags that can warn and/or disallow these extremely high
316code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
317C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags. These
318are entirely independent from the deprecation warning for code points above
319C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
320code point that needs more than 31 bits to represent. When that happens,
321effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
32232-bit machines. (Of course C<UNICODE_DISALLOW_SUPER> will treat all
ab8e6d41
KW
323above-Unicode code points, including these, as malformations; and
324C<UNICODE_WARN_SUPER> warns on these.)
325
326On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
327extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
328than on ASCII. Prior to that, code points 2**31 and higher were simply
329unrepresentable, and a different, incompatible method was used to represent
330code points between 2**30 and 2**31 - 1. The flags C<UNICODE_WARN_ABOVE_31_BIT>
331and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
332platforms, warning and disallowing 2**31 and higher.
de69f3af 333
de69f3af
KW
334=cut
335*/
336
337/* This is also a macro */
338PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
339
07693fe6
KW
340U8 *
341Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
342{
de69f3af 343 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
344}
345
83dc0f42
KW
346PERL_STATIC_INLINE bool
347S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
348{
349 /* Returns TRUE if the first code point represented by the Perl-extended-
350 * UTF-8-encoded string starting at 's', and looking no further than 'e -
351 * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
352 *
353 * The function handles the case where the input bytes do not include all
354 * the ones necessary to represent a full character. That is, they may be
355 * the intial bytes of the representation of a code point, but possibly
356 * the final ones necessary for the complete representation may be beyond
357 * 'e - 1'.
358 *
359 * The function assumes that the sequence is well-formed UTF-8 as far as it
360 * goes, and is for a UTF-8 variant code point. If the sequence is
361 * incomplete, the function returns FALSE if there is any well-formed
362 * UTF-8 byte sequence that can complete it in such a way that a code point
363 * < 2**31 is produced; otherwise it returns TRUE.
364 *
365 * Getting this exactly right is slightly tricky, and has to be done in
366 * several places in this file, so is centralized here. It is based on the
367 * following table:
368 *
369 * U+7FFFFFFF (2 ** 31 - 1)
370 * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
371 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
372 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
373 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
374 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
375 * U+80000000 (2 ** 31):
376 * ASCII: \xFE\x82\x80\x80\x80\x80\x80
377 * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
378 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
379 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
380 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
381 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
382 */
383
384#ifdef EBCDIC
385
b343c774
KW
386 /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
387 const U8 * const prefix = (U8 *) "\x41\x41\x41\x41\x41\x41\x42";
83dc0f42
KW
388 const STRLEN prefix_len = sizeof(prefix) - 1;
389 const STRLEN len = e - s;
f880f78a 390 const STRLEN cmp_len = MIN(prefix_len, len - 1);
83dc0f42
KW
391
392#else
393
394 PERL_UNUSED_ARG(e);
395
396#endif
397
398 PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
399
400 assert(! UTF8_IS_INVARIANT(*s));
401
402#ifndef EBCDIC
403
404 /* Technically, a start byte of FE can be for a code point that fits into
405 * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
406 * malformation. */
407 return (*s >= 0xFE);
408
409#else
410
411 /* On the EBCDIC code pages we handle, only 0xFE can mean a 32-bit or
412 * larger code point (0xFF is an invariant). For 0xFE, we need at least 2
413 * bytes, and maybe up through 8 bytes, to be sure if the value is above 31
414 * bits. */
415 if (*s != 0xFE || len == 1) {
416 return FALSE;
417 }
418
419 /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
420 * \x41 and \x42. */
421 return cBOOL(memGT(s + 1, prefix, cmp_len));
422
423#endif
424
425}
426
12a4bed3
KW
427PERL_STATIC_INLINE bool
428S_does_utf8_overflow(const U8 * const s, const U8 * e)
429{
430 const U8 *x;
431 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
432
433 /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
434 * platform, that is if it represents a code point larger than the highest
435 * representable code point. (For ASCII platforms, we could use memcmp()
436 * because we don't have to convert each byte to I8, but it's very rare
437 * input indeed that would approach overflow, so the loop below will likely
438 * only get executed once.
439 *
440 * 'e' must not be beyond a full character. If it is less than a full
441 * character, the function returns FALSE if there is any input beyond 'e'
442 * that could result in a non-overflowing code point */
443
444 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
b0b342d4
KW
445 assert(s <= e && s + UTF8SKIP(s) >= e);
446
447#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
448
449 /* On 32 bit ASCII machines, many overlongs that start with FF don't
450 * overflow */
451
452 if (isFF_OVERLONG(s, e - s)) {
453 const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84";
454 return memGE(s, max_32_bit_overlong,
bb93890c 455 MIN(e - s, sizeof(max_32_bit_overlong) - 1));
b0b342d4
KW
456 }
457
458#endif
12a4bed3
KW
459
460 for (x = s; x < e; x++, y++) {
461
462 /* If this byte is larger than the corresponding highest UTF-8 byte, it
463 * overflows */
464 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
465 return TRUE;
466 }
467
468 /* If not the same as this byte, it must be smaller, doesn't overflow */
469 if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
470 return FALSE;
471 }
472 }
473
474 /* Got to the end and all bytes are the same. If the input is a whole
475 * character, it doesn't overflow. And if it is a partial character,
476 * there's not enough information to tell, so assume doesn't overflow */
477 return FALSE;
478}
479
480PERL_STATIC_INLINE bool
481S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
482{
483 /* Overlongs can occur whenever the number of continuation bytes
484 * changes. That means whenever the number of leading 1 bits in a start
485 * byte increases from the next lower start byte. That happens for start
486 * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following
487 * illegal start bytes have already been excluded, so don't need to be
488 * tested here;
489 * ASCII platforms: C0, C1
490 * EBCDIC platforms C0, C1, C2, C3, C4, E0
491 *
492 * At least a second byte is required to determine if other sequences will
493 * be an overlong. */
494
495 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
496 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
497
498 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
499 assert(len > 1 && UTF8_IS_START(*s));
500
501 /* Each platform has overlongs after the start bytes given above (expressed
502 * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
503 * the logic is the same, except the E0 overlong has already been excluded
504 * on EBCDIC platforms. The values below were found by manually
505 * inspecting the UTF-8 patterns. See the tables in utf8.h and
506 * utfebcdic.h. */
507
508# ifdef EBCDIC
509# define F0_ABOVE_OVERLONG 0xB0
510# define F8_ABOVE_OVERLONG 0xA8
511# define FC_ABOVE_OVERLONG 0xA4
512# define FE_ABOVE_OVERLONG 0xA2
513# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
514 /* I8(0xfe) is FF */
515# else
516
517 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
518 return TRUE;
519 }
520
521# define F0_ABOVE_OVERLONG 0x90
522# define F8_ABOVE_OVERLONG 0x88
523# define FC_ABOVE_OVERLONG 0x84
524# define FE_ABOVE_OVERLONG 0x82
525# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
526# endif
527
528
529 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
530 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
531 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
532 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
533 {
534 return TRUE;
535 }
536
b0b342d4
KW
537 /* Check for the FF overlong */
538 return isFF_OVERLONG(s, len);
539}
540
541PERL_STATIC_INLINE bool
542S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
543{
544 PERL_ARGS_ASSERT_ISFF_OVERLONG;
12a4bed3
KW
545
546 /* Check for the FF overlong. This happens only if all these bytes match;
547 * what comes after them doesn't matter. See tables in utf8.h,
b0b342d4 548 * utfebcdic.h. */
12a4bed3 549
b0b342d4
KW
550 return len >= sizeof(FF_OVERLONG_PREFIX) - 1
551 && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
552 sizeof(FF_OVERLONG_PREFIX) - 1));
12a4bed3
KW
553}
554
555#undef F0_ABOVE_OVERLONG
556#undef F8_ABOVE_OVERLONG
557#undef FC_ABOVE_OVERLONG
558#undef FE_ABOVE_OVERLONG
559#undef FF_OVERLONG_PREFIX
560
35f8c9bd 561STRLEN
edc2c47a 562Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
35f8c9bd 563{
2b479609 564 STRLEN len;
12a4bed3 565 const U8 *x;
35f8c9bd 566
2b479609
KW
567 /* A helper function that should not be called directly.
568 *
569 * This function returns non-zero if the string beginning at 's' and
570 * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
571 * code point; otherwise it returns 0. The examination stops after the
572 * first code point in 's' is validated, not looking at the rest of the
573 * input. If 'e' is such that there are not enough bytes to represent a
574 * complete code point, this function will return non-zero anyway, if the
575 * bytes it does have are well-formed UTF-8 as far as they go, and aren't
576 * excluded by 'flags'.
577 *
578 * A non-zero return gives the number of bytes required to represent the
579 * code point. Be aware that if the input is for a partial character, the
580 * return will be larger than 'e - s'.
581 *
582 * This function assumes that the code point represented is UTF-8 variant.
583 * The caller should have excluded this possibility before calling this
584 * function.
585 *
586 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
587 * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
588 * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
589 * disallowed by the flags. If the input is only for a partial character,
590 * the function will return non-zero if there is any sequence of
591 * well-formed UTF-8 that, when appended to the input sequence, could
592 * result in an allowed code point; otherwise it returns 0. Non characters
593 * cannot be determined based on partial character input. But many of the
594 * other excluded types can be determined with just the first one or two
595 * bytes.
596 *
597 */
598
599 PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
600
601 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
602 |UTF8_DISALLOW_ABOVE_31_BIT)));
603 assert(! UTF8_IS_INVARIANT(*s));
35f8c9bd 604
2b479609 605 /* A variant char must begin with a start byte */
35f8c9bd
KW
606 if (UNLIKELY(! UTF8_IS_START(*s))) {
607 return 0;
608 }
609
edc2c47a
KW
610 /* Examine a maximum of a single whole code point */
611 if (e - s > UTF8SKIP(s)) {
612 e = s + UTF8SKIP(s);
613 }
614
2b479609
KW
615 len = e - s;
616
617 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
618 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
35f8c9bd 619
2b479609
KW
620 /* The code below is derived from this table. Keep in mind that legal
621 * continuation bytes range between \x80..\xBF for UTF-8, and
622 * \xA0..\xBF for I8. Anything above those aren't continuation bytes.
623 * Hence, we don't have to test the upper edge because if any of those
624 * are encountered, the sequence is malformed, and will fail elsewhere
625 * in this function.
626 * UTF-8 UTF-EBCDIC I8
627 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
628 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
629 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
630 *
631 */
632
633#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
634# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
19794540 635# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
2b479609 636
19794540
KW
637# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
638 /* B6 and B7 */ \
639 && ((s1) & 0xFE ) == 0xB6)
2b479609
KW
640#else
641# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
19794540
KW
642# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
643# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
2b479609
KW
644#endif
645
646 if ( (flags & UTF8_DISALLOW_SUPER)
647 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) {
648 return 0; /* Above Unicode */
649 }
650
651 if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT)
652 && UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
653 {
654 return 0; /* Above 31 bits */
655 }
656
657 if (len > 1) {
658 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
659
660 if ( (flags & UTF8_DISALLOW_SUPER)
19794540 661 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
2b479609
KW
662 {
663 return 0; /* Above Unicode */
664 }
665
666 if ( (flags & UTF8_DISALLOW_SURROGATE)
19794540 667 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
2b479609
KW
668 {
669 return 0; /* Surrogate */
670 }
671
672 if ( (flags & UTF8_DISALLOW_NONCHAR)
673 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
674 {
675 return 0; /* Noncharacter code point */
676 }
677 }
678 }
679
680 /* Make sure that all that follows are continuation bytes */
35f8c9bd
KW
681 for (x = s + 1; x < e; x++) {
682 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
683 return 0;
684 }
685 }
686
af13dd8a 687 /* Here is syntactically valid. Next, make sure this isn't the start of an
12a4bed3
KW
688 * overlong. */
689 if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
690 return 0;
af13dd8a
KW
691 }
692
12a4bed3
KW
693 /* And finally, that the code point represented fits in a word on this
694 * platform */
695 if (does_utf8_overflow(s, e)) {
696 return 0;
35f8c9bd
KW
697 }
698
2b479609 699 return UTF8SKIP(s);
35f8c9bd
KW
700}
701
7cf8d05d
KW
702STATIC char *
703S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
704{
705 /* Returns a mortalized C string that is a displayable copy of the 'len'
706 * bytes starting at 's', each in a \xXY format. */
707
708 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
709 trailing NUL */
710 const U8 * const e = s + len;
711 char * output;
712 char * d;
713
714 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
715
716 Newx(output, output_len, char);
717 SAVEFREEPV(output);
718
719 d = output;
720 for (; s < e; s++) {
721 const unsigned high_nibble = (*s & 0xF0) >> 4;
722 const unsigned low_nibble = (*s & 0x0F);
723
724 *d++ = '\\';
725 *d++ = 'x';
726
727 if (high_nibble < 10) {
728 *d++ = high_nibble + '0';
729 }
730 else {
731 *d++ = high_nibble - 10 + 'a';
732 }
733
734 if (low_nibble < 10) {
735 *d++ = low_nibble + '0';
736 }
737 else {
738 *d++ = low_nibble - 10 + 'a';
739 }
740 }
741
742 *d = '\0';
743 return output;
744}
745
806547a7 746PERL_STATIC_INLINE char *
7cf8d05d
KW
747S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
748
749 /* How many bytes to print */
3cc6a05e 750 STRLEN print_len,
7cf8d05d
KW
751
752 /* Which one is the non-continuation */
753 const STRLEN non_cont_byte_pos,
754
755 /* How many bytes should there be? */
756 const STRLEN expect_len)
806547a7
KW
757{
758 /* Return the malformation warning text for an unexpected continuation
759 * byte. */
760
7cf8d05d 761 const char * const where = (non_cont_byte_pos == 1)
806547a7 762 ? "immediately"
7cf8d05d
KW
763 : Perl_form(aTHX_ "%d bytes",
764 (int) non_cont_byte_pos);
3cc6a05e 765 unsigned int i;
806547a7
KW
766
767 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
768
7cf8d05d
KW
769 /* We don't need to pass this parameter, but since it has already been
770 * calculated, it's likely faster to pass it; verify under DEBUGGING */
771 assert(expect_len == UTF8SKIP(s));
772
3cc6a05e
KW
773 /* It is possible that utf8n_to_uvchr() was called incorrectly, with a
774 * length that is larger than is actually available in the buffer. If we
775 * print all the bytes based on that length, we will read past the buffer
776 * end. Often, the strings are NUL terminated, so to lower the chances of
777 * this happening, print the malformed bytes only up through any NUL. */
778 for (i = 1; i < print_len; i++) {
779 if (*(s + i) == '\0') {
780 print_len = i + 1; /* +1 gets the NUL printed */
781 break;
782 }
783 }
784
7cf8d05d
KW
785 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
786 " %s after start byte 0x%02x; need %d bytes, got %d)",
787 malformed_text,
788 _byte_dump_string(s, print_len),
789 *(s + non_cont_byte_pos),
790 where,
791 *s,
792 (int) expect_len,
793 (int) non_cont_byte_pos);
806547a7
KW
794}
795
35f8c9bd
KW
796/*
797
de69f3af 798=for apidoc utf8n_to_uvchr
378516de
KW
799
800THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af 801Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
67e989fb 802
9041c2e3 803Bottom level UTF-8 decode routine.
de69f3af 804Returns the native code point value of the first character in the string C<s>,
746afd53
KW
805which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
806C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
807the length, in bytes, of that character.
949cf498
KW
808
809The value of C<flags> determines the behavior when C<s> does not point to a
2b5e7bc2
KW
810well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
811causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
812is the next possible position in C<s> that could begin a non-malformed
813character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
814is raised. Some UTF-8 input sequences may contain multiple malformations.
815This function tries to find every possible one in each call, so multiple
816warnings can be raised for each sequence.
949cf498
KW
817
818Various ALLOW flags can be set in C<flags> to allow (and not warn on)
819individual types of malformations, such as the sequence being overlong (that
820is, when there is a shorter sequence that can express the same code point;
821overlong sequences are expressly forbidden in the UTF-8 standard due to
822potential security issues). Another malformation example is the first byte of
823a character not being a legal first byte. See F<utf8.h> for the list of such
524080c4
KW
824flags. For allowed 0 length strings, this function returns 0; for allowed
825overlong sequences, the computed code point is returned; for all other allowed
826malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
827determinable reasonable value.
949cf498 828
796b6530 829The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
830flags) malformation is found. If this flag is set, the routine assumes that
831the caller will raise a warning, and this function will silently just set
d088425d
KW
832C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
833
75200dff 834Note that this API requires disambiguation between successful decoding a C<NUL>
796b6530 835character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
111fa700
KW
836in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
837be set to 1. To disambiguate, upon a zero return, see if the first byte of
838C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
f9380377 839error. Or you can use C<L</utf8n_to_uvchr_error>>.
949cf498
KW
840
841Certain code points are considered problematic. These are Unicode surrogates,
746afd53 842Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 843By default these are considered regular code points, but certain situations
ecc1615f
KW
844warrant special handling for them, which can be specified using the C<flags>
845parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
846three classes are treated as malformations and handled as such. The flags
847C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
848C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
849disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
850restricts the allowed inputs to the strict UTF-8 traditionally defined by
851Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
852definition given by
853L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
854The difference between traditional strictness and C9 strictness is that the
855latter does not forbid non-character code points. (They are still discouraged,
856however.) For more discussion see L<perlunicode/Noncharacter code points>.
857
858The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
859C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
796b6530
KW
860C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
861raised for their respective categories, but otherwise the code points are
862considered valid (not malformations). To get a category to both be treated as
863a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 864(But note that warnings are not raised if lexically disabled nor if
796b6530 865C<UTF8_CHECK_ONLY> is also specified.)
949cf498 866
760c7c2f
KW
867It is now deprecated to have very high code points (above C<IV_MAX> on the
868platforms) and this function will raise a deprecation warning for these (unless
d5944cab 869such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1)
760c7c2f 870in a 32-bit word.
ab8e6d41
KW
871
872Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
873so using them is more problematic than other above-Unicode code points. Perl
874invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
875likely that non-Perl languages will not be able to read files that contain
aff2be59 876these; nor would Perl understand files
ab8e6d41
KW
877written by something that uses a different extension. For these reasons, there
878is a separate set of flags that can warn and/or disallow these extremely high
879code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
880C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags. These
881are entirely independent from the deprecation warning for code points above
882C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
883code point that needs more than 31 bits to represent. When that happens,
884effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
88532-bit machines. (Of course C<UTF8_DISALLOW_SUPER> will treat all
ab8e6d41
KW
886above-Unicode code points, including these, as malformations; and
887C<UTF8_WARN_SUPER> warns on these.)
888
889On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
890extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
891than on ASCII. Prior to that, code points 2**31 and higher were simply
892unrepresentable, and a different, incompatible method was used to represent
893code points between 2**30 and 2**31 - 1. The flags C<UTF8_WARN_ABOVE_31_BIT>
894and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
895platforms, warning and disallowing 2**31 and higher.
949cf498
KW
896
897All other code points corresponding to Unicode characters, including private
898use and those yet to be assigned, are never considered malformed and never
899warn.
67e989fb 900
37607a96 901=cut
f9380377
KW
902
903Also implemented as a macro in utf8.h
904*/
905
906UV
907Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
908 STRLEN curlen,
909 STRLEN *retlen,
910 const U32 flags)
911{
912 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
913
914 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
915}
916
917/*
918
919=for apidoc utf8n_to_uvchr_error
920
921THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
922Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
923
924This function is for code that needs to know what the precise malformation(s)
925are when an error is found.
926
927It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
928all the others, C<errors>. If this parameter is 0, this function behaves
929identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
930to a C<U32> variable, which this function sets to indicate any errors found.
931Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
932C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
933of these bits will be set if a malformation is found, even if the input
934C<flags> parameter indicates that the given malformation is allowed; the
935exceptions are noted:
936
937=over 4
938
939=item C<UTF8_GOT_ABOVE_31_BIT>
940
941The code point represented by the input UTF-8 sequence occupies more than 31
942bits.
943This bit is set only if the input C<flags> parameter contains either the
944C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
945
946=item C<UTF8_GOT_CONTINUATION>
947
948The input sequence was malformed in that the first byte was a a UTF-8
949continuation byte.
950
951=item C<UTF8_GOT_EMPTY>
952
953The input C<curlen> parameter was 0.
954
955=item C<UTF8_GOT_LONG>
956
957The input sequence was malformed in that there is some other sequence that
958evaluates to the same code point, but that sequence is shorter than this one.
959
960=item C<UTF8_GOT_NONCHAR>
961
962The code point represented by the input UTF-8 sequence is for a Unicode
963non-character code point.
964This bit is set only if the input C<flags> parameter contains either the
965C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
966
967=item C<UTF8_GOT_NON_CONTINUATION>
968
969The input sequence was malformed in that a non-continuation type byte was found
970in a position where only a continuation type one should be.
971
972=item C<UTF8_GOT_OVERFLOW>
973
974The input sequence was malformed in that it is for a code point that is not
975representable in the number of bits available in a UV on the current platform.
976
977=item C<UTF8_GOT_SHORT>
978
979The input sequence was malformed in that C<curlen> is smaller than required for
980a complete sequence. In other words, the input is for a partial character
981sequence.
982
983=item C<UTF8_GOT_SUPER>
984
985The input sequence was malformed in that it is for a non-Unicode code point;
986that is, one above the legal Unicode maximum.
987This bit is set only if the input C<flags> parameter contains either the
988C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
989
990=item C<UTF8_GOT_SURROGATE>
991
992The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
993code point.
994This bit is set only if the input C<flags> parameter contains either the
995C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
996
997=back
998
999=cut
37607a96 1000*/
67e989fb 1001
a0ed51b3 1002UV
f9380377
KW
1003Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
1004 STRLEN curlen,
1005 STRLEN *retlen,
1006 const U32 flags,
1007 U32 * errors)
a0ed51b3 1008{
d4c19fe8 1009 const U8 * const s0 = s;
2b5e7bc2
KW
1010 U8 * send = NULL; /* (initialized to silence compilers' wrong
1011 warning) */
1012 U32 possible_problems = 0; /* A bit is set here for each potential problem
1013 found as we go along */
eb83ed87 1014 UV uv = *s;
2b5e7bc2
KW
1015 STRLEN expectlen = 0; /* How long should this sequence be?
1016 (initialized to silence compilers' wrong
1017 warning) */
f9380377
KW
1018 U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
1019 this gets set and discarded */
a0dbb045 1020
2b5e7bc2
KW
1021 /* The below are used only if there is both an overlong malformation and a
1022 * too short one. Otherwise the first two are set to 's0' and 'send', and
1023 * the third not used at all */
1024 U8 * adjusted_s0 = (U8 *) s0;
5ec712b1
KW
1025 U8 * adjusted_send = NULL; /* (Initialized to silence compilers' wrong
1026 warning) */
2b5e7bc2 1027 UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
7918f24d 1028
f9380377
KW
1029 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1030
1031 if (errors) {
1032 *errors = 0;
1033 }
1034 else {
1035 errors = &discard_errors;
1036 }
a0dbb045 1037
eb83ed87
KW
1038 /* The order of malformation tests here is important. We should consume as
1039 * few bytes as possible in order to not skip any valid character. This is
1040 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1041 * http://unicode.org/reports/tr36 for more discussion as to why. For
1042 * example, once we've done a UTF8SKIP, we can tell the expected number of
1043 * bytes, and could fail right off the bat if the input parameters indicate
1044 * that there are too few available. But it could be that just that first
1045 * byte is garbled, and the intended character occupies fewer bytes. If we
1046 * blindly assumed that the first byte is correct, and skipped based on
1047 * that number, we could skip over a valid input character. So instead, we
1048 * always examine the sequence byte-by-byte.
1049 *
1050 * We also should not consume too few bytes, otherwise someone could inject
1051 * things. For example, an input could be deliberately designed to
1052 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 1053 * returning to the caller C<*retlen> pointing to the very next byte (one
eb83ed87
KW
1054 * which is actually part of of the overflowing sequence), that could look
1055 * legitimate to the caller, which could discard the initial partial
2b5e7bc2
KW
1056 * sequence and process the rest, inappropriately.
1057 *
1058 * Some possible input sequences are malformed in more than one way. This
1059 * function goes to lengths to try to find all of them. This is necessary
1060 * for correctness, as the inputs may allow one malformation but not
1061 * another, and if we abandon searching for others after finding the
1062 * allowed one, we could allow in something that shouldn't have been.
1063 */
eb83ed87 1064
b5b9af04 1065 if (UNLIKELY(curlen == 0)) {
2b5e7bc2
KW
1066 possible_problems |= UTF8_GOT_EMPTY;
1067 curlen = 0;
1068 uv = 0; /* XXX It could be argued that this should be
1069 UNICODE_REPLACEMENT? */
1070 goto ready_to_handle_errors;
0c443dc2
JH
1071 }
1072
eb83ed87
KW
1073 expectlen = UTF8SKIP(s);
1074
1075 /* A well-formed UTF-8 character, as the vast majority of calls to this
1076 * function will be for, has this expected length. For efficiency, set
1077 * things up here to return it. It will be overriden only in those rare
1078 * cases where a malformation is found */
1079 if (retlen) {
1080 *retlen = expectlen;
1081 }
1082
1083 /* An invariant is trivially well-formed */
1d72bdf6 1084 if (UTF8_IS_INVARIANT(uv)) {
de69f3af 1085 return uv;
a0ed51b3 1086 }
67e989fb 1087
eb83ed87 1088 /* A continuation character can't start a valid sequence */
b5b9af04 1089 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
2b5e7bc2
KW
1090 possible_problems |= UTF8_GOT_CONTINUATION;
1091 curlen = 1;
1092 uv = UNICODE_REPLACEMENT;
1093 goto ready_to_handle_errors;
ba210ebe 1094 }
9041c2e3 1095
dcd27b3c
KW
1096 /* Here is not a continuation byte, nor an invariant. The only thing left
1097 * is a start byte (possibly for an overlong) */
1098
534752c1
KW
1099 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1100 * that indicate the number of bytes in the character's whole UTF-8
1101 * sequence, leaving just the bits that are part of the value. */
1102 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
ba210ebe 1103
eb83ed87
KW
1104 /* Now, loop through the remaining bytes in the character's sequence,
1105 * accumulating each into the working value as we go. Be sure to not look
1106 * past the end of the input string */
b3057643
KW
1107 send = adjusted_send = (U8*) s0 + ((expectlen <= curlen)
1108 ? expectlen
1109 : curlen);
eb83ed87 1110 for (s = s0 + 1; s < send; s++) {
b5b9af04 1111 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
8850bf83 1112 uv = UTF8_ACCUMULATE(uv, *s);
2b5e7bc2
KW
1113 continue;
1114 }
1115
1116 /* Here, found a non-continuation before processing all expected bytes.
1117 * This byte indicates the beginning of a new character, so quit, even
1118 * if allowing this malformation. */
1119 curlen = s - s0; /* Save how many bytes we actually got */
1120 possible_problems |= UTF8_GOT_NON_CONTINUATION;
1121 goto finish_short;
eb83ed87
KW
1122 } /* End of loop through the character's bytes */
1123
1124 /* Save how many bytes were actually in the character */
1125 curlen = s - s0;
1126
2b5e7bc2
KW
1127 /* Did we get all the continuation bytes that were expected? Note that we
1128 * know this result even without executing the loop above. But we had to
1129 * do the loop to see if there are unexpected non-continuations. */
1130 if (UNLIKELY(curlen < expectlen)) {
1131 possible_problems |= UTF8_GOT_SHORT;
2f8f112e 1132
2b5e7bc2
KW
1133 finish_short:
1134 uv_so_far = uv;
1135 uv = UNICODE_REPLACEMENT;
eb83ed87
KW
1136 }
1137
2b5e7bc2
KW
1138 /* Note that there are two types of too-short malformation. One is when
1139 * there is actual wrong data before the normal termination of the
1140 * sequence. The other is that the sequence wasn't complete before the end
1141 * of the data we are allowed to look at, based on the input 'curlen'.
1142 * This means that we were passed data for a partial character, but it is
1143 * valid as far as we saw. The other is definitely invalid. This
1144 * distinction could be important to a caller, so the two types are kept
1145 * separate. */
1146
1147 /* Check for overflow */
1148 if (UNLIKELY(does_utf8_overflow(s0, send))) {
1149 possible_problems |= UTF8_GOT_OVERFLOW;
1150 uv = UNICODE_REPLACEMENT;
eb83ed87 1151 }
eb83ed87 1152
2b5e7bc2
KW
1153 /* Check for overlong. If no problems so far, 'uv' is the correct code
1154 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1155 * we must look at the UTF-8 byte sequence itself to see if it is for an
1156 * overlong */
1157 if ( ( LIKELY(! possible_problems)
1158 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1159 || ( UNLIKELY( possible_problems)
1160 && ( UNLIKELY(! UTF8_IS_START(*s0))
1161 || ( curlen > 1
1162 && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
1163 send - s0))))))
2f8f112e 1164 {
2b5e7bc2
KW
1165 possible_problems |= UTF8_GOT_LONG;
1166
1167 /* A convenience macro that matches either of the too-short conditions.
1168 * */
1169# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1170
1171 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1172 UV min_uv = uv_so_far;
1173 STRLEN i;
1174
1175 /* Here, the input is both overlong and is missing some trailing
1176 * bytes. There is no single code point it could be for, but there
1177 * may be enough information present to determine if what we have
1178 * so far is for an unallowed code point, such as for a surrogate.
1179 * The code below has the intelligence to determine this, but just
1180 * for non-overlong UTF-8 sequences. What we do here is calculate
1181 * the smallest code point the input could represent if there were
1182 * no too short malformation. Then we compute and save the UTF-8
1183 * for that, which is what the code below looks at instead of the
1184 * raw input. It turns out that the smallest such code point is
1185 * all we need. */
1186 for (i = curlen; i < expectlen; i++) {
1187 min_uv = UTF8_ACCUMULATE(min_uv,
1188 I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
1189 }
1190
1191 Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8);
1192 SAVEFREEPV((U8 *) adjusted_s0); /* Needed because we may not get
1193 to free it ourselves if
1194 warnings are made fatal */
1195 adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1196 }
eb83ed87
KW
1197 }
1198
2b5e7bc2
KW
1199 /* Now check that the input isn't for a problematic code point not allowed
1200 * by the input parameters. */
1201 /* isn't problematic if < this */
1202 if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
1203 || ( UNLIKELY(possible_problems)
1204 && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
760c7c2f
KW
1205 && ((flags & ( UTF8_DISALLOW_NONCHAR
1206 |UTF8_DISALLOW_SURROGATE
1207 |UTF8_DISALLOW_SUPER
1208 |UTF8_DISALLOW_ABOVE_31_BIT
1209 |UTF8_WARN_NONCHAR
1210 |UTF8_WARN_SURROGATE
1211 |UTF8_WARN_SUPER
1212 |UTF8_WARN_ABOVE_31_BIT))
2b5e7bc2
KW
1213 /* In case of a malformation, 'uv' is not valid, and has
1214 * been changed to something in the Unicode range.
1215 * Currently we don't output a deprecation message if there
1216 * is already a malformation, so we don't have to special
1217 * case the test immediately below */
760c7c2f
KW
1218 || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1219 && ckWARN_d(WARN_DEPRECATED))))
eb83ed87 1220 {
2b5e7bc2
KW
1221 /* If there were no malformations, or the only malformation is an
1222 * overlong, 'uv' is valid */
1223 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1224 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1225 possible_problems |= UTF8_GOT_SURROGATE;
1226 }
1227 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
1228 possible_problems |= UTF8_GOT_SUPER;
1229 }
1230 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1231 possible_problems |= UTF8_GOT_NONCHAR;
1232 }
1233 }
1234 else { /* Otherwise, need to look at the source UTF-8, possibly
1235 adjusted to be non-overlong */
1236
1237 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1238 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
ea5ced44 1239 {
2b5e7bc2
KW
1240 possible_problems |= UTF8_GOT_SUPER;
1241 }
1242 else if (curlen > 1) {
1243 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
1244 NATIVE_UTF8_TO_I8(*adjusted_s0),
1245 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
ea5ced44 1246 {
2b5e7bc2 1247 possible_problems |= UTF8_GOT_SUPER;
ea5ced44 1248 }
2b5e7bc2
KW
1249 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
1250 NATIVE_UTF8_TO_I8(*adjusted_s0),
1251 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1252 {
1253 possible_problems |= UTF8_GOT_SURROGATE;
ea5ced44
KW
1254 }
1255 }
c0236afe 1256
2b5e7bc2
KW
1257 /* We need a complete well-formed UTF-8 character to discern
1258 * non-characters, so can't look for them here */
1259 }
1260 }
949cf498 1261
2b5e7bc2
KW
1262 ready_to_handle_errors:
1263
1264 /* At this point:
1265 * curlen contains the number of bytes in the sequence that
1266 * this call should advance the input by.
1267 * possible_problems' is 0 if there weren't any problems; otherwise a bit
1268 * is set in it for each potential problem found.
1269 * uv contains the code point the input sequence
1270 * represents; or if there is a problem that prevents
1271 * a well-defined value from being computed, it is
1272 * some subsitute value, typically the REPLACEMENT
1273 * CHARACTER.
1274 * s0 points to the first byte of the character
1275 * send points to just after where that (potentially
1276 * partial) character ends
1277 * adjusted_s0 normally is the same as s0, but in case of an
1278 * overlong for which the UTF-8 matters below, it is
1279 * the first byte of the shortest form representation
1280 * of the input.
1281 * adjusted_send normally is the same as 'send', but if adjusted_s0
1282 * is set to something other than s0, this points one
1283 * beyond its end
1284 */
eb83ed87 1285
2b5e7bc2
KW
1286 if (UNLIKELY(possible_problems)) {
1287 bool disallowed = FALSE;
1288 const U32 orig_problems = possible_problems;
1289
1290 while (possible_problems) { /* Handle each possible problem */
1291 UV pack_warn = 0;
1292 char * message = NULL;
1293
1294 /* Each 'if' clause handles one problem. They are ordered so that
1295 * the first ones' messages will be displayed before the later
1296 * ones; this is kinda in decreasing severity order */
1297 if (possible_problems & UTF8_GOT_OVERFLOW) {
1298
1299 /* Overflow means also got a super and above 31 bits, but we
1300 * handle all three cases here */
1301 possible_problems
1302 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
f9380377
KW
1303 *errors |= UTF8_GOT_OVERFLOW;
1304
1305 /* But the API says we flag all errors found */
1306 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1307 *errors |= UTF8_GOT_SUPER;
1308 }
1309 if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
1310 *errors |= UTF8_GOT_ABOVE_31_BIT;
1311 }
2b5e7bc2
KW
1312
1313 disallowed = TRUE;
1314
1315 /* The warnings code explicitly says it doesn't handle the case
1316 * of packWARN2 and two categories which have parent-child
1317 * relationship. Even if it works now to raise the warning if
1318 * either is enabled, it wouldn't necessarily do so in the
1319 * future. We output (only) the most dire warning*/
1320 if (! (flags & UTF8_CHECK_ONLY)) {
1321 if (ckWARN_d(WARN_UTF8)) {
1322 pack_warn = packWARN(WARN_UTF8);
1323 }
1324 else if (ckWARN_d(WARN_NON_UNICODE)) {
1325 pack_warn = packWARN(WARN_NON_UNICODE);
1326 }
1327 if (pack_warn) {
1328 message = Perl_form(aTHX_ "%s: %s (overflows)",
1329 malformed_text,
1330 _byte_dump_string(s0, send - s0));
1331 }
1332 }
1333 }
1334 else if (possible_problems & UTF8_GOT_EMPTY) {
1335 possible_problems &= ~UTF8_GOT_EMPTY;
f9380377 1336 *errors |= UTF8_GOT_EMPTY;
2b5e7bc2
KW
1337
1338 if (! (flags & UTF8_ALLOW_EMPTY)) {
1339 disallowed = TRUE;
1340 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1341 pack_warn = packWARN(WARN_UTF8);
1342 message = Perl_form(aTHX_ "%s (empty string)",
1343 malformed_text);
1344 }
1345 }
1346 }
1347 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1348 possible_problems &= ~UTF8_GOT_CONTINUATION;
f9380377 1349 *errors |= UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1350
1351 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1352 disallowed = TRUE;
1353 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1354 pack_warn = packWARN(WARN_UTF8);
1355 message = Perl_form(aTHX_
1356 "%s: %s (unexpected continuation byte 0x%02x,"
1357 " with no preceding start byte)",
1358 malformed_text,
1359 _byte_dump_string(s0, 1), *s0);
1360 }
1361 }
1362 }
1363 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1364 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
f9380377 1365 *errors |= UTF8_GOT_NON_CONTINUATION;
2b5e7bc2
KW
1366
1367 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1368 disallowed = TRUE;
1369 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1370 pack_warn = packWARN(WARN_UTF8);
1371 message = Perl_form(aTHX_ "%s",
1372 unexpected_non_continuation_text(s0,
1373 send - s0,
1374 s - s0,
1375 (int) expectlen));
1376 }
1377 }
1378 }
1379 else if (possible_problems & UTF8_GOT_SHORT) {
1380 possible_problems &= ~UTF8_GOT_SHORT;
f9380377 1381 *errors |= UTF8_GOT_SHORT;
2b5e7bc2
KW
1382
1383 if (! (flags & UTF8_ALLOW_SHORT)) {
1384 disallowed = TRUE;
1385 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1386 pack_warn = packWARN(WARN_UTF8);
1387 message = Perl_form(aTHX_
9a6c9c81 1388 "%s: %s (too short; %d byte%s available, need %d)",
2b5e7bc2
KW
1389 malformed_text,
1390 _byte_dump_string(s0, send - s0),
1391 (int)curlen,
1392 curlen == 1 ? "" : "s",
1393 (int)expectlen);
1394 }
1395 }
ba210ebe 1396
2b5e7bc2
KW
1397 }
1398 else if (possible_problems & UTF8_GOT_LONG) {
1399 possible_problems &= ~UTF8_GOT_LONG;
f9380377 1400 *errors |= UTF8_GOT_LONG;
2b5e7bc2
KW
1401
1402 if (! (flags & UTF8_ALLOW_LONG)) {
1403 disallowed = TRUE;
1404
1405 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1406 pack_warn = packWARN(WARN_UTF8);
1407
1408 /* These error types cause 'uv' to be something that
1409 * isn't what was intended, so can't use it in the
1410 * message. The other error types either can't
1411 * generate an overlong, or else the 'uv' is valid */
1412 if (orig_problems &
1413 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1414 {
1415 message = Perl_form(aTHX_
1416 "%s: %s (any UTF-8 sequence that starts"
1417 " with \"%s\" is overlong which can and"
1418 " should be represented with a"
1419 " different, shorter sequence)",
1420 malformed_text,
1421 _byte_dump_string(s0, send - s0),
1422 _byte_dump_string(s0, curlen));
1423 }
1424 else {
1425 U8 tmpbuf[UTF8_MAXBYTES+1];
1426 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
1427 uv, 0);
1428 message = Perl_form(aTHX_
1429 "%s: %s (overlong; instead use %s to represent"
147e3846 1430 " U+%0*" UVXf ")",
2b5e7bc2
KW
1431 malformed_text,
1432 _byte_dump_string(s0, send - s0),
1433 _byte_dump_string(tmpbuf, e - tmpbuf),
1434 ((uv < 256) ? 2 : 4), /* Field width of 2 for
1435 small code points */
1436 uv);
1437 }
1438 }
1439 }
1440 }
1441 else if (possible_problems & UTF8_GOT_SURROGATE) {
1442 possible_problems &= ~UTF8_GOT_SURROGATE;
1443
f9380377
KW
1444 if (flags & UTF8_WARN_SURROGATE) {
1445 *errors |= UTF8_GOT_SURROGATE;
1446
1447 if ( ! (flags & UTF8_CHECK_ONLY)
1448 && ckWARN_d(WARN_SURROGATE))
1449 {
2b5e7bc2
KW
1450 pack_warn = packWARN(WARN_SURROGATE);
1451
1452 /* These are the only errors that can occur with a
1453 * surrogate when the 'uv' isn't valid */
1454 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1455 message = Perl_form(aTHX_
1456 "UTF-16 surrogate (any UTF-8 sequence that"
1457 " starts with \"%s\" is for a surrogate)",
1458 _byte_dump_string(s0, curlen));
1459 }
1460 else {
1461 message = Perl_form(aTHX_
147e3846 1462 "UTF-16 surrogate U+%04" UVXf, uv);
2b5e7bc2 1463 }
f9380377 1464 }
2b5e7bc2 1465 }
ba210ebe 1466
2b5e7bc2
KW
1467 if (flags & UTF8_DISALLOW_SURROGATE) {
1468 disallowed = TRUE;
f9380377 1469 *errors |= UTF8_GOT_SURROGATE;
2b5e7bc2
KW
1470 }
1471 }
1472 else if (possible_problems & UTF8_GOT_SUPER) {
1473 possible_problems &= ~UTF8_GOT_SUPER;
949cf498 1474
f9380377
KW
1475 if (flags & UTF8_WARN_SUPER) {
1476 *errors |= UTF8_GOT_SUPER;
1477
1478 if ( ! (flags & UTF8_CHECK_ONLY)
1479 && ckWARN_d(WARN_NON_UNICODE))
1480 {
2b5e7bc2
KW
1481 pack_warn = packWARN(WARN_NON_UNICODE);
1482
1483 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1484 message = Perl_form(aTHX_
1485 "Any UTF-8 sequence that starts with"
1486 " \"%s\" is for a non-Unicode code point,"
1487 " may not be portable",
1488 _byte_dump_string(s0, curlen));
1489 }
1490 else {
1491 message = Perl_form(aTHX_
147e3846 1492 "Code point 0x%04" UVXf " is not"
2b5e7bc2
KW
1493 " Unicode, may not be portable",
1494 uv);
1495 }
f9380377 1496 }
2b5e7bc2 1497 }
ba210ebe 1498
2b5e7bc2
KW
1499 /* The maximum code point ever specified by a standard was
1500 * 2**31 - 1. Anything larger than that is a Perl extension
1501 * that very well may not be understood by other applications
1502 * (including earlier perl versions on EBCDIC platforms). We
1503 * test for these after the regular SUPER ones, and before
1504 * possibly bailing out, so that the slightly more dire warning
1505 * will override the regular one. */
1506 if ( (flags & (UTF8_WARN_ABOVE_31_BIT
1507 |UTF8_WARN_SUPER
1508 |UTF8_DISALLOW_ABOVE_31_BIT))
1509 && ( ( UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
1510 && UNLIKELY(is_utf8_cp_above_31_bits(
1511 adjusted_s0,
1512 adjusted_send)))
1513 || ( LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
1514 && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv)))))
1515 {
1516 if ( ! (flags & UTF8_CHECK_ONLY)
1517 && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
1518 && ckWARN_d(WARN_UTF8))
1519 {
1520 pack_warn = packWARN(WARN_UTF8);
1521
1522 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1523 message = Perl_form(aTHX_
1524 "Any UTF-8 sequence that starts with"
1525 " \"%s\" is for a non-Unicode code"
1526 " point, and is not portable",
1527 _byte_dump_string(s0, curlen));
1528 }
1529 else {
1530 message = Perl_form(aTHX_
147e3846 1531 "Code point 0x%" UVXf " is not Unicode,"
2b5e7bc2
KW
1532 " and not portable",
1533 uv);
1534 }
1535 }
1536
f9380377
KW
1537 if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
1538 *errors |= UTF8_GOT_ABOVE_31_BIT;
1539
1540 if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
1541 disallowed = TRUE;
1542 }
2b5e7bc2
KW
1543 }
1544 }
eb83ed87 1545
2b5e7bc2 1546 if (flags & UTF8_DISALLOW_SUPER) {
f9380377 1547 *errors |= UTF8_GOT_SUPER;
2b5e7bc2
KW
1548 disallowed = TRUE;
1549 }
eb83ed87 1550
2b5e7bc2
KW
1551 /* The deprecated warning overrides any non-deprecated one. If
1552 * there are other problems, a deprecation message is not
1553 * really helpful, so don't bother to raise it in that case.
1554 * This also keeps the code from having to handle the case
1555 * where 'uv' is not valid. */
1556 if ( ! (orig_problems
1557 & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1558 && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1559 && ckWARN_d(WARN_DEPRECATED))
1560 {
1561 message = Perl_form(aTHX_ cp_above_legal_max,
1562 uv, MAX_NON_DEPRECATED_CP);
1563 pack_warn = packWARN(WARN_DEPRECATED);
1564 }
1565 }
1566 else if (possible_problems & UTF8_GOT_NONCHAR) {
1567 possible_problems &= ~UTF8_GOT_NONCHAR;
ba210ebe 1568
f9380377
KW
1569 if (flags & UTF8_WARN_NONCHAR) {
1570 *errors |= UTF8_GOT_NONCHAR;
1571
1572 if ( ! (flags & UTF8_CHECK_ONLY)
1573 && ckWARN_d(WARN_NONCHAR))
1574 {
2b5e7bc2
KW
1575 /* The code above should have guaranteed that we don't
1576 * get here with errors other than overlong */
1577 assert (! (orig_problems
1578 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
1579
1580 pack_warn = packWARN(WARN_NONCHAR);
1581 message = Perl_form(aTHX_ "Unicode non-character"
147e3846 1582 " U+%04" UVXf " is not recommended"
2b5e7bc2 1583 " for open interchange", uv);
f9380377 1584 }
2b5e7bc2 1585 }
5b311467 1586
2b5e7bc2
KW
1587 if (flags & UTF8_DISALLOW_NONCHAR) {
1588 disallowed = TRUE;
f9380377 1589 *errors |= UTF8_GOT_NONCHAR;
2b5e7bc2
KW
1590 }
1591 } /* End of looking through the possible flags */
1592
1593 /* Display the message (if any) for the problem being handled in
1594 * this iteration of the loop */
1595 if (message) {
1596 if (PL_op)
1597 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
1598 OP_DESC(PL_op));
1599 else
1600 Perl_warner(aTHX_ pack_warn, "%s", message);
1601 }
1602 } /* End of 'while (possible_problems) {' */
a0dbb045 1603
2b5e7bc2
KW
1604 /* Since there was a possible problem, the returned length may need to
1605 * be changed from the one stored at the beginning of this function.
1606 * Instead of trying to figure out if that's needed, just do it. */
1607 if (retlen) {
1608 *retlen = curlen;
1609 }
a0dbb045 1610
2b5e7bc2
KW
1611 if (disallowed) {
1612 if (flags & UTF8_CHECK_ONLY && retlen) {
1613 *retlen = ((STRLEN) -1);
1614 }
1615 return 0;
1616 }
eb83ed87 1617 }
ba210ebe 1618
2b5e7bc2 1619 return UNI_TO_NATIVE(uv);
a0ed51b3
LW
1620}
1621
8e84507e 1622/*
ec5f19d0
KW
1623=for apidoc utf8_to_uvchr_buf
1624
1625Returns the native code point of the first character in the string C<s> which
1626is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 1627C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 1628
524080c4
KW
1629If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1630enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 1631C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 1632(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 1633C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 1634the next possible position in C<s> that could begin a non-malformed character.
de69f3af 1635See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 1636returned.
ec5f19d0 1637
760c7c2f
KW
1638Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1639unless those are turned off.
1640
ec5f19d0 1641=cut
52be2536
KW
1642
1643Also implemented as a macro in utf8.h
1644
ec5f19d0
KW
1645*/
1646
1647
1648UV
1649Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1650{
ec5f19d0
KW
1651 assert(s < send);
1652
1653 return utf8n_to_uvchr(s, send - s, retlen,
1654 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1655}
1656
52be2536
KW
1657/* This is marked as deprecated
1658 *
ec5f19d0
KW
1659=for apidoc utf8_to_uvuni_buf
1660
de69f3af
KW
1661Only in very rare circumstances should code need to be dealing in Unicode
1662(as opposed to native) code points. In those few cases, use
1663C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
4f83cdcd
KW
1664
1665Returns the Unicode (not-native) code point of the first character in the
1666string C<s> which
ec5f19d0
KW
1667is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1668C<retlen> will be set to the length, in bytes, of that character.
1669
524080c4
KW
1670If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1671enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1672NULL) to -1. If those warnings are off, the computed value if well-defined (or
1673the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1674is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1675next possible position in C<s> that could begin a non-malformed character.
de69f3af 1676See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0 1677
760c7c2f
KW
1678Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1679unless those are turned off.
1680
ec5f19d0
KW
1681=cut
1682*/
1683
1684UV
1685Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1686{
1687 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1688
1689 assert(send > s);
1690
5962d97e
KW
1691 /* Call the low level routine, asking for checks */
1692 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
ec5f19d0
KW
1693}
1694
b76347f2 1695/*
87cea99e 1696=for apidoc utf8_length
b76347f2
JH
1697
1698Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
1699Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
1700up past C<e>, croaks.
b76347f2
JH
1701
1702=cut
1703*/
1704
1705STRLEN
35a4481c 1706Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
1707{
1708 STRLEN len = 0;
1709
7918f24d
NC
1710 PERL_ARGS_ASSERT_UTF8_LENGTH;
1711
8850bf83
JH
1712 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1713 * the bitops (especially ~) can create illegal UTF-8.
1714 * In other words: in Perl UTF-8 is not just for Unicode. */
1715
a3b680e6
AL
1716 if (e < s)
1717 goto warn_and_return;
b76347f2 1718 while (s < e) {
4cbf4130 1719 s += UTF8SKIP(s);
8e91ec7f
AV
1720 len++;
1721 }
1722
1723 if (e != s) {
1724 len--;
1725 warn_and_return:
9b387841
NC
1726 if (PL_op)
1727 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1728 "%s in %s", unees, OP_DESC(PL_op));
1729 else
61a12c31 1730 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
1731 }
1732
1733 return len;
1734}
1735
b06226ff 1736/*
fed3ba5d
NC
1737=for apidoc bytes_cmp_utf8
1738
a1433954 1739Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
1740sequence of characters (stored as UTF-8)
1741in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
1742equal, -1 or -2 if the first string is less than the second string, +1 or +2
1743if the first string is greater than the second string.
1744
1745-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
1746longer string. -2 or +2 is returned if
1747there was a difference between characters
fed3ba5d
NC
1748within the strings.
1749
1750=cut
1751*/
1752
1753int
1754Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1755{
1756 const U8 *const bend = b + blen;
1757 const U8 *const uend = u + ulen;
1758
1759 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
1760
1761 while (b < bend && u < uend) {
1762 U8 c = *u++;
1763 if (!UTF8_IS_INVARIANT(c)) {
1764 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1765 if (u < uend) {
1766 U8 c1 = *u++;
1767 if (UTF8_IS_CONTINUATION(c1)) {
a62b247b 1768 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
fed3ba5d 1769 } else {
2b5e7bc2 1770 /* diag_listed_as: Malformed UTF-8 character%s */
fed3ba5d 1771 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
806547a7 1772 "%s %s%s",
7cf8d05d 1773 unexpected_non_continuation_text(u - 1, 2, 1, 2),
806547a7
KW
1774 PL_op ? " in " : "",
1775 PL_op ? OP_DESC(PL_op) : "");
fed3ba5d
NC
1776 return -2;
1777 }
1778 } else {
1779 if (PL_op)
1780 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1781 "%s in %s", unees, OP_DESC(PL_op));
1782 else
61a12c31 1783 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
1784 return -2; /* Really want to return undef :-) */
1785 }
1786 } else {
1787 return -2;
1788 }
1789 }
1790 if (*b != c) {
1791 return *b < c ? -2 : +2;
1792 }
1793 ++b;
1794 }
1795
1796 if (b == bend && u == uend)
1797 return 0;
1798
1799 return b < bend ? +1 : -1;
1800}
1801
1802/*
87cea99e 1803=for apidoc utf8_to_bytes
6940069f 1804
2bbc8d55 1805Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954
KW
1806Unlike L</bytes_to_utf8>, this over-writes the original string, and
1807updates C<len> to contain the new length.
67e989fb 1808Returns zero on failure, setting C<len> to -1.
6940069f 1809
a1433954 1810If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 1811
6940069f
GS
1812=cut
1813*/
1814
1815U8 *
37607a96 1816Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 1817{
d4c19fe8
AL
1818 U8 * const save = s;
1819 U8 * const send = s + *len;
6940069f 1820 U8 *d;
246fae53 1821
7918f24d 1822 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 1823 PERL_UNUSED_CONTEXT;
7918f24d 1824
1e54db1a 1825 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 1826 while (s < send) {
d59937ca
KW
1827 if (! UTF8_IS_INVARIANT(*s)) {
1828 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1829 *len = ((STRLEN) -1);
1830 return 0;
1831 }
1832 s++;
dcad2880 1833 }
d59937ca 1834 s++;
246fae53 1835 }
dcad2880
JH
1836
1837 d = s = save;
6940069f 1838 while (s < send) {
80e0b38f
KW
1839 U8 c = *s++;
1840 if (! UTF8_IS_INVARIANT(c)) {
1841 /* Then it is two-byte encoded */
a62b247b 1842 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
80e0b38f
KW
1843 s++;
1844 }
1845 *d++ = c;
6940069f
GS
1846 }
1847 *d = '\0';
246fae53 1848 *len = d - save;
6940069f
GS
1849 return save;
1850}
1851
1852/*
87cea99e 1853=for apidoc bytes_from_utf8
f9a63242 1854
2bbc8d55 1855Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954 1856Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
1857the newly-created string, and updates C<len> to contain the new
1858length. Returns the original string if no conversion occurs, C<len>
72d33970 1859is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55 18600 if C<s> is converted or consisted entirely of characters that are invariant
4a4088c4 1861in UTF-8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 1862
37607a96
PK
1863=cut
1864*/
f9a63242
JH
1865
1866U8 *
e1ec3a88 1867Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 1868{
f9a63242 1869 U8 *d;
e1ec3a88
AL
1870 const U8 *start = s;
1871 const U8 *send;
f9a63242
JH
1872 I32 count = 0;
1873
7918f24d 1874 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 1875 PERL_UNUSED_CONTEXT;
f9a63242 1876 if (!*is_utf8)
73d840c0 1877 return (U8 *)start;
f9a63242 1878
1e54db1a 1879 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 1880 for (send = s + *len; s < send;) {
d59937ca
KW
1881 if (! UTF8_IS_INVARIANT(*s)) {
1882 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
73d840c0 1883 return (U8 *)start;
d59937ca
KW
1884 }
1885 count++;
1886 s++;
db42d148 1887 }
d59937ca 1888 s++;
f9a63242
JH
1889 }
1890
35da51f7 1891 *is_utf8 = FALSE;
f9a63242 1892
212542aa 1893 Newx(d, (*len) - count + 1, U8);
ef9edfd0 1894 s = start; start = d;
f9a63242
JH
1895 while (s < send) {
1896 U8 c = *s++;
1a91c45d 1897 if (! UTF8_IS_INVARIANT(c)) {
c4d5f83a 1898 /* Then it is two-byte encoded */
a62b247b 1899 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
1a91c45d 1900 s++;
c4d5f83a
NIS
1901 }
1902 *d++ = c;
f9a63242
JH
1903 }
1904 *d = '\0';
1905 *len = d - start;
73d840c0 1906 return (U8 *)start;
f9a63242
JH
1907}
1908
1909/*
87cea99e 1910=for apidoc bytes_to_utf8
6940069f 1911
ff97e5cf
KW
1912Converts a string C<s> of length C<len> bytes from the native encoding into
1913UTF-8.
6662521e 1914Returns a pointer to the newly-created string, and sets C<len> to
ff97e5cf 1915reflect the new length in bytes.
6940069f 1916
75200dff 1917A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
1918
1919If you want to convert to UTF-8 from encodings other than
1920the native (Latin1 or EBCDIC),
a1433954 1921see L</sv_recode_to_utf8>().
c9ada85f 1922
497711e7 1923=cut
6940069f
GS
1924*/
1925
c682ebef
FC
1926/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1927 likewise need duplication. */
1928
6940069f 1929U8*
35a4481c 1930Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 1931{
35a4481c 1932 const U8 * const send = s + (*len);
6940069f
GS
1933 U8 *d;
1934 U8 *dst;
7918f24d
NC
1935
1936 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 1937 PERL_UNUSED_CONTEXT;
6940069f 1938
212542aa 1939 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
1940 dst = d;
1941
1942 while (s < send) {
55d09dc8
KW
1943 append_utf8_from_native_byte(*s, &d);
1944 s++;
6940069f
GS
1945 }
1946 *d = '\0';
6662521e 1947 *len = d-dst;
6940069f
GS
1948 return dst;
1949}
1950
a0ed51b3 1951/*
dea0fc0b 1952 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
1953 *
1954 * Destination must be pre-extended to 3/2 source. Do not use in-place.
1955 * We optimize for native, for obvious reasons. */
1956
1957U8*
dea0fc0b 1958Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 1959{
dea0fc0b
JH
1960 U8* pend;
1961 U8* dstart = d;
1962
7918f24d
NC
1963 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1964
dea0fc0b 1965 if (bytelen & 1)
147e3846 1966 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf, (UV)bytelen);
dea0fc0b
JH
1967
1968 pend = p + bytelen;
1969
a0ed51b3 1970 while (p < pend) {
dea0fc0b
JH
1971 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1972 p += 2;
2d1545e5 1973 if (OFFUNI_IS_INVARIANT(uv)) {
56d37426 1974 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
1975 continue;
1976 }
56d37426
KW
1977 if (uv <= MAX_UTF8_TWO_BYTE) {
1978 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1979 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
1980 continue;
1981 }
46956fad
KW
1982#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1983#define LAST_HIGH_SURROGATE 0xDBFF
1984#define FIRST_LOW_SURROGATE 0xDC00
1985#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
e23c50db
KW
1986
1987 /* This assumes that most uses will be in the first Unicode plane, not
1988 * needing surrogates */
1989 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
1990 && uv <= UNICODE_SURROGATE_LAST))
1991 {
1992 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
1993 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1994 }
1995 else {
01ea242b 1996 UV low = (p[0] << 8) + p[1];
e23c50db
KW
1997 if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
1998 || UNLIKELY(low > LAST_LOW_SURROGATE))
1999 {
01ea242b 2000 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db
KW
2001 }
2002 p += 2;
46956fad
KW
2003 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
2004 + (low - FIRST_LOW_SURROGATE) + 0x10000;
01ea242b 2005 }
a0ed51b3 2006 }
56d37426
KW
2007#ifdef EBCDIC
2008 d = uvoffuni_to_utf8_flags(d, uv, 0);
2009#else
a0ed51b3 2010 if (uv < 0x10000) {
eb160463
GS
2011 *d++ = (U8)(( uv >> 12) | 0xe0);
2012 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2013 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2014 continue;
2015 }
2016 else {
eb160463
GS
2017 *d++ = (U8)(( uv >> 18) | 0xf0);
2018 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
2019 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2020 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2021 continue;
2022 }
56d37426 2023#endif
a0ed51b3 2024 }
dea0fc0b 2025 *newlen = d - dstart;
a0ed51b3
LW
2026 return d;
2027}
2028
2029/* Note: this one is slightly destructive of the source. */
2030
2031U8*
dea0fc0b 2032Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
2033{
2034 U8* s = (U8*)p;
d4c19fe8 2035 U8* const send = s + bytelen;
7918f24d
NC
2036
2037 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2038
e0ea5e2d 2039 if (bytelen & 1)
147e3846 2040 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
e0ea5e2d
NC
2041 (UV)bytelen);
2042
a0ed51b3 2043 while (s < send) {
d4c19fe8 2044 const U8 tmp = s[0];
a0ed51b3
LW
2045 s[0] = s[1];
2046 s[1] = tmp;
2047 s += 2;
2048 }
dea0fc0b 2049 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
2050}
2051
922e8cb4
KW
2052bool
2053Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2054{
2055 U8 tmpbuf[UTF8_MAXBYTES+1];
2056 uvchr_to_utf8(tmpbuf, c);
2057 return _is_utf8_FOO(classnum, tmpbuf);
2058}
2059
f9ae8fb6
JD
2060/* Internal function so we can deprecate the external one, and call
2061 this one from other deprecated functions in this file */
2062
f2645549
KW
2063bool
2064Perl__is_utf8_idstart(pTHX_ const U8 *p)
61b19385 2065{
f2645549 2066 PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
61b19385
KW
2067
2068 if (*p == '_')
2069 return TRUE;
f25ce844 2070 return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
61b19385
KW
2071}
2072
5092f92a 2073bool
eba68aa0
KW
2074Perl__is_uni_perl_idcont(pTHX_ UV c)
2075{
2076 U8 tmpbuf[UTF8_MAXBYTES+1];
2077 uvchr_to_utf8(tmpbuf, c);
2078 return _is_utf8_perl_idcont(tmpbuf);
2079}
2080
2081bool
f91dcd13
KW
2082Perl__is_uni_perl_idstart(pTHX_ UV c)
2083{
2084 U8 tmpbuf[UTF8_MAXBYTES+1];
2085 uvchr_to_utf8(tmpbuf, c);
2086 return _is_utf8_perl_idstart(tmpbuf);
2087}
2088
3a4c58c9
KW
2089UV
2090Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
2091{
2092 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2093 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
2094 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2095 * either "SS" or "Ss". Which one to use is passed into the routine in
2096 * 'S_or_s' to avoid a test */
2097
2098 UV converted = toUPPER_LATIN1_MOD(c);
2099
2100 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2101
2102 assert(S_or_s == 'S' || S_or_s == 's');
2103
6f2d5cbc 2104 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 2105 characters in this range */
3a4c58c9
KW
2106 *p = (U8) converted;
2107 *lenp = 1;
2108 return converted;
2109 }
2110
2111 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2112 * which it maps to one of them, so as to only have to have one check for
2113 * it in the main case */
2114 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2115 switch (c) {
2116 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2117 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2118 break;
2119 case MICRO_SIGN:
2120 converted = GREEK_CAPITAL_LETTER_MU;
2121 break;
79e064b9
KW
2122#if UNICODE_MAJOR_VERSION > 2 \
2123 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2124 && UNICODE_DOT_DOT_VERSION >= 8)
3a4c58c9
KW
2125 case LATIN_SMALL_LETTER_SHARP_S:
2126 *(p)++ = 'S';
2127 *p = S_or_s;
2128 *lenp = 2;
2129 return 'S';
79e064b9 2130#endif
3a4c58c9
KW
2131 default:
2132 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
e5964223 2133 NOT_REACHED; /* NOTREACHED */
3a4c58c9
KW
2134 }
2135 }
2136
2137 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2138 *p = UTF8_TWO_BYTE_LO(converted);
2139 *lenp = 2;
2140
2141 return converted;
2142}
2143
50bda2c3
KW
2144/* Call the function to convert a UTF-8 encoded character to the specified case.
2145 * Note that there may be more than one character in the result.
2146 * INP is a pointer to the first byte of the input character
2147 * OUTP will be set to the first byte of the string of changed characters. It
2148 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2149 * LENP will be set to the length in bytes of the string of changed characters
2150 *
2151 * The functions return the ordinal of the first character in the string of OUTP */
b9992569
KW
2152#define CALL_UPPER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
2153#define CALL_TITLE_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
2154#define CALL_LOWER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
50bda2c3 2155
b9992569
KW
2156/* This additionally has the input parameter 'specials', which if non-zero will
2157 * cause this to use the specials hash for folding (meaning get full case
50bda2c3 2158 * folding); otherwise, when zero, this implies a simple case fold */
b9992569 2159#define CALL_FOLD_CASE(uv, s, d, lenp, specials) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
c3fd2246 2160
84afefe6
JH
2161UV
2162Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2163{
a1433954
KW
2164 /* Convert the Unicode character whose ordinal is <c> to its uppercase
2165 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2166 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
2167 * the changed version may be longer than the original character.
2168 *
2169 * The ordinal of the first character of the changed version is returned
2170 * (but note, as explained above, that there may be more.) */
2171
7918f24d
NC
2172 PERL_ARGS_ASSERT_TO_UNI_UPPER;
2173
3a4c58c9
KW
2174 if (c < 256) {
2175 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2176 }
2177
0ebc6274 2178 uvchr_to_utf8(p, c);
b9992569 2179 return CALL_UPPER_CASE(c, p, p, lenp);
a0ed51b3
LW
2180}
2181
84afefe6
JH
2182UV
2183Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2184{
7918f24d
NC
2185 PERL_ARGS_ASSERT_TO_UNI_TITLE;
2186
3a4c58c9
KW
2187 if (c < 256) {
2188 return _to_upper_title_latin1((U8) c, p, lenp, 's');
2189 }
2190
0ebc6274 2191 uvchr_to_utf8(p, c);
b9992569 2192 return CALL_TITLE_CASE(c, p, p, lenp);
a0ed51b3
LW
2193}
2194
afc16117 2195STATIC U8
81611534 2196S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp)
afc16117
KW
2197{
2198 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2199 * those, converting the result to UTF-8. Since the result is always just
a1433954 2200 * one character, we allow <p> to be NULL */
afc16117
KW
2201
2202 U8 converted = toLOWER_LATIN1(c);
2203
2204 if (p != NULL) {
6f2d5cbc 2205 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
2206 *p = converted;
2207 *lenp = 1;
2208 }
2209 else {
430c9760
KW
2210 /* Result is known to always be < 256, so can use the EIGHT_BIT
2211 * macros */
2212 *p = UTF8_EIGHT_BIT_HI(converted);
2213 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
afc16117
KW
2214 *lenp = 2;
2215 }
2216 }
2217 return converted;
2218}
2219
84afefe6
JH
2220UV
2221Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2222{
7918f24d
NC
2223 PERL_ARGS_ASSERT_TO_UNI_LOWER;
2224
afc16117
KW
2225 if (c < 256) {
2226 return to_lower_latin1((U8) c, p, lenp);
bca00c02
KW
2227 }
2228
afc16117 2229 uvchr_to_utf8(p, c);
b9992569 2230 return CALL_LOWER_CASE(c, p, p, lenp);
a0ed51b3
LW
2231}
2232
84afefe6 2233UV
51910141 2234Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 2235{
51910141 2236 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 2237 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 2238 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
2239 *
2240 * Not to be used for locale folds
51910141 2241 */
f673fad4 2242
a1dde8de
KW
2243 UV converted;
2244
2245 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
81611534 2246 PERL_UNUSED_CONTEXT;
a1dde8de 2247
1ca267a5
KW
2248 assert (! (flags & FOLD_FLAGS_LOCALE));
2249
659a7c2d 2250 if (UNLIKELY(c == MICRO_SIGN)) {
a1dde8de
KW
2251 converted = GREEK_SMALL_LETTER_MU;
2252 }
9b63e895
KW
2253#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
2254 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
2255 || UNICODE_DOT_DOT_VERSION > 0)
659a7c2d
KW
2256 else if ( (flags & FOLD_FLAGS_FULL)
2257 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
2258 {
1ca267a5
KW
2259 /* If can't cross 127/128 boundary, can't return "ss"; instead return
2260 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
2261 * under those circumstances. */
2262 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
2263 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2264 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2265 p, *lenp, U8);
2266 return LATIN_SMALL_LETTER_LONG_S;
2267 }
2268 else {
4f489194
KW
2269 *(p)++ = 's';
2270 *p = 's';
2271 *lenp = 2;
2272 return 's';
1ca267a5 2273 }
a1dde8de 2274 }
9b63e895 2275#endif
a1dde8de
KW
2276 else { /* In this range the fold of all other characters is their lower
2277 case */
2278 converted = toLOWER_LATIN1(c);
2279 }
2280
6f2d5cbc 2281 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
2282 *p = (U8) converted;
2283 *lenp = 1;
2284 }
2285 else {
2286 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2287 *p = UTF8_TWO_BYTE_LO(converted);
2288 *lenp = 2;
2289 }
2290
2291 return converted;
2292}
2293
2294UV
31f05a37 2295Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 2296{
4b593389 2297
a0270393
KW
2298 /* Not currently externally documented, and subject to change
2299 * <flags> bits meanings:
2300 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
2301 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2302 * locale are to be used.
a0270393
KW
2303 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2304 */
4b593389 2305
36bb2ab6 2306 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 2307
780fcc9f
KW
2308 if (flags & FOLD_FLAGS_LOCALE) {
2309 /* Treat a UTF-8 locale as not being in locale at all */
2310 if (IN_UTF8_CTYPE_LOCALE) {
2311 flags &= ~FOLD_FLAGS_LOCALE;
2312 }
2313 else {
2314 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
e7b7ac46 2315 goto needs_full_generality;
780fcc9f 2316 }
31f05a37
KW
2317 }
2318
a1dde8de 2319 if (c < 256) {
e7b7ac46 2320 return _to_fold_latin1((U8) c, p, lenp,
31f05a37 2321 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
2322 }
2323
2f306ab9 2324 /* Here, above 255. If no special needs, just use the macro */
a0270393
KW
2325 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
2326 uvchr_to_utf8(p, c);
b9992569 2327 return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
a0270393
KW
2328 }
2329 else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
2330 the special flags. */
2331 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
2332
2333 needs_full_generality:
a0270393 2334 uvchr_to_utf8(utf8_c, c);
445bf929 2335 return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
a0270393 2336 }
84afefe6
JH
2337}
2338
26483009 2339PERL_STATIC_INLINE bool
5141f98e 2340S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
f25ce844 2341 const char *const swashname, SV* const invlist)
bde6a22d 2342{
ea317ccb
KW
2343 /* returns a boolean giving whether or not the UTF8-encoded character that
2344 * starts at <p> is in the swash indicated by <swashname>. <swash>
2345 * contains a pointer to where the swash indicated by <swashname>
2346 * is to be stored; which this routine will do, so that future calls will
f25ce844
KW
2347 * look at <*swash> and only generate a swash if it is not null. <invlist>
2348 * is NULL or an inversion list that defines the swash. If not null, it
2349 * saves time during initialization of the swash.
ea317ccb
KW
2350 *
2351 * Note that it is assumed that the buffer length of <p> is enough to
2352 * contain all the bytes that comprise the character. Thus, <*p> should
2353 * have been checked before this call for mal-formedness enough to assure
2354 * that. */
2355
7918f24d
NC
2356 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2357
492a624f 2358 /* The API should have included a length for the UTF-8 character in <p>,
28123549 2359 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
2360 * as far as there being enough bytes available in it to accommodate the
2361 * character without reading beyond the end, and pass that number on to the
2362 * validating routine */
6302f837 2363 if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
28123549
KW
2364 if (ckWARN_d(WARN_UTF8)) {
2365 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
9816f121 2366 "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
28123549
KW
2367 if (ckWARN(WARN_UTF8)) { /* This will output details as to the
2368 what the malformation is */
2369 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
2370 }
2371 }
2372 return FALSE;
2373 }
87367d5f
KW
2374 if (!*swash) {
2375 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
f25ce844
KW
2376 *swash = _core_swash_init("utf8",
2377
2378 /* Only use the name if there is no inversion
2379 * list; otherwise will go out to disk */
2380 (invlist) ? "" : swashname,
2381
2382 &PL_sv_undef, 1, 0, invlist, &flags);
87367d5f 2383 }
28123549 2384
bde6a22d
NC
2385 return swash_fetch(*swash, p, TRUE) != 0;
2386}
2387
2388bool
922e8cb4
KW
2389Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
2390{
922e8cb4
KW
2391 PERL_ARGS_ASSERT__IS_UTF8_FOO;
2392
2393 assert(classnum < _FIRST_NON_SWASH_CC);
2394
f25ce844
KW
2395 return is_utf8_common(p,
2396 &PL_utf8_swash_ptrs[classnum],
2397 swash_property_names[classnum],
2398 PL_XPosix_ptrs[classnum]);
922e8cb4
KW
2399}
2400
2401bool
f2645549 2402Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
a0ed51b3 2403{
f2645549 2404 SV* invlist = NULL;
7918f24d 2405
f2645549 2406 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
7918f24d 2407
f2645549
KW
2408 if (! PL_utf8_perl_idstart) {
2409 invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
2410 }
60071a22 2411 return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
82686b01
JH
2412}
2413
2414bool
f2645549 2415Perl__is_utf8_xidstart(pTHX_ const U8 *p)
c11ff943 2416{
f2645549 2417 PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
c11ff943
KW
2418
2419 if (*p == '_')
2420 return TRUE;
f25ce844 2421 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
c11ff943
KW
2422}
2423
2424bool
eba68aa0
KW
2425Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
2426{
b24b43f7 2427 SV* invlist = NULL;
eba68aa0
KW
2428
2429 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
2430
b24b43f7
KW
2431 if (! PL_utf8_perl_idcont) {
2432 invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
2433 }
60071a22 2434 return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
eba68aa0
KW
2435}
2436
eba68aa0 2437bool
f2645549 2438Perl__is_utf8_idcont(pTHX_ const U8 *p)
82686b01 2439{
f2645549 2440 PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
7918f24d 2441
f25ce844 2442 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
a0ed51b3
LW
2443}
2444
2445bool
f2645549 2446Perl__is_utf8_xidcont(pTHX_ const U8 *p)
c11ff943 2447{
f2645549 2448 PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
c11ff943 2449
f25ce844 2450 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
c11ff943
KW
2451}
2452
2453bool
7dbf68d2
KW
2454Perl__is_utf8_mark(pTHX_ const U8 *p)
2455{
7dbf68d2
KW
2456 PERL_ARGS_ASSERT__IS_UTF8_MARK;
2457
f25ce844 2458 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
7dbf68d2
KW
2459}
2460
6b5c0936 2461/*
87cea99e 2462=for apidoc to_utf8_case
6b5c0936 2463
9da1e7cb
KW
2464Instead use the appropriate one of L</toUPPER_utf8>,
2465L</toTITLE_utf8>,
2466L</toLOWER_utf8>,
2467or L</toFOLD_utf8>.
2468
6fae5207 2469C<p> contains the pointer to the UTF-8 string encoding
a1433954
KW
2470the character that is being converted. This routine assumes that the character
2471at C<p> is well-formed.
6b5c0936 2472
6fae5207
KW
2473C<ustrp> is a pointer to the character buffer to put the
2474conversion result to. C<lenp> is a pointer to the length
6b5c0936
JH
2475of the result.
2476
6fae5207 2477C<swashp> is a pointer to the swash to use.
6b5c0936 2478
a1433954 2479Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
796b6530 2480and loaded by C<SWASHNEW>, using F<lib/utf8_heavy.pl>. C<special> (usually,
0134edef 2481but not always, a multicharacter mapping), is tried first.
6b5c0936 2482
4a8240a3
KW
2483C<special> is a string, normally C<NULL> or C<"">. C<NULL> means to not use
2484any special mappings; C<""> means to use the special mappings. Values other
2485than these two are treated as the name of the hash containing the special
2486mappings, like C<"utf8::ToSpecLower">.
6b5c0936 2487
796b6530
KW
2488C<normal> is a string like C<"ToLower"> which means the swash
2489C<%utf8::ToLower>.
0134edef 2490
760c7c2f
KW
2491Code points above the platform's C<IV_MAX> will raise a deprecation warning,
2492unless those are turned off.
2493
0134edef 2494=cut */
6b5c0936 2495
2104c8d9 2496UV
9a957fbc
AL
2497Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
2498 SV **swashp, const char *normal, const char *special)
a0ed51b3 2499{
b9992569
KW
2500 PERL_ARGS_ASSERT_TO_UTF8_CASE;
2501
2502 return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special);
2503}
2504
2505 /* change namve uv1 to 'from' */
6a4a25f4 2506STATIC UV
b9992569
KW
2507S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
2508 SV **swashp, const char *normal, const char *special)
2509{
0134edef 2510 STRLEN len = 0;
7918f24d 2511
b9992569 2512 PERL_ARGS_ASSERT__TO_UTF8_CASE;
7918f24d 2513
36eaa811
KW
2514 /* For code points that don't change case, we already know that the output
2515 * of this function is the unchanged input, so we can skip doing look-ups
2516 * for them. Unfortunately the case-changing code points are scattered
2517 * around. But there are some long consecutive ranges where there are no
2518 * case changing code points. By adding tests, we can eliminate the lookup
2519 * for all the ones in such ranges. This is currently done here only for
2520 * just a few cases where the scripts are in common use in modern commerce
2521 * (and scripts adjacent to those which can be included without additional
2522 * tests). */
2523
2524 if (uv1 >= 0x0590) {
2525 /* This keeps from needing further processing the code points most
2526 * likely to be used in the following non-cased scripts: Hebrew,
2527 * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
2528 * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
2529 * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
2530 if (uv1 < 0x10A0) {
2531 goto cases_to_self;
2532 }
2533
2534 /* The following largish code point ranges also don't have case
2535 * changes, but khw didn't think they warranted extra tests to speed
2536 * them up (which would slightly slow down everything else above them):
2537 * 1100..139F Hangul Jamo, Ethiopic
2538 * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
2539 * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
2540 * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
2541 * Combining Diacritical Marks Extended, Balinese,
2542 * Sundanese, Batak, Lepcha, Ol Chiki
2543 * 2000..206F General Punctuation
2544 */
2545
2546 if (uv1 >= 0x2D30) {
2547
2548 /* This keeps the from needing further processing the code points
2549 * most likely to be used in the following non-cased major scripts:
2550 * CJK, Katakana, Hiragana, plus some less-likely scripts.
2551 *
2552 * (0x2D30 above might have to be changed to 2F00 in the unlikely
2553 * event that Unicode eventually allocates the unused block as of
2554 * v8.0 2FE0..2FEF to code points that are cased. khw has verified
2555 * that the test suite will start having failures to alert you
2556 * should that happen) */
2557 if (uv1 < 0xA640) {
2558 goto cases_to_self;
2559 }
2560
2561 if (uv1 >= 0xAC00) {
2562 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
5af9bc97
KW
2563 if (ckWARN_d(WARN_SURROGATE)) {
2564 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2565 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
147e3846 2566 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04" UVXf, desc, uv1);
5af9bc97
KW
2567 }
2568 goto cases_to_self;
2569 }
36eaa811
KW
2570
2571 /* AC00..FAFF Catches Hangul syllables and private use, plus
2572 * some others */
2573 if (uv1 < 0xFB00) {
2574 goto cases_to_self;
2575
2576 }
2577
5af9bc97
KW
2578 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
2579 if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
2580 && ckWARN_d(WARN_DEPRECATED))
2581 {
2582 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2583 cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
2584 }
2585 if (ckWARN_d(WARN_NON_UNICODE)) {
2586 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2587 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
147e3846 2588 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04" UVXf, desc, uv1);
5af9bc97
KW
2589 }
2590 goto cases_to_self;
2591 }
3bfc1e70
KW
2592#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
2593 if (UNLIKELY(uv1
2594 > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
2595 {
2596
2597 /* As of this writing, this means we avoid swash creation
2598 * for anything beyond low Plane 1 */
2599 goto cases_to_self;
2600 }
2601#endif
36eaa811
KW
2602 }
2603 }
9ae3ac1a 2604
36eaa811
KW
2605 /* Note that non-characters are perfectly legal, so no warning should
2606 * be given. There are so few of them, that it isn't worth the extra
2607 * tests to avoid swash creation */
9ae3ac1a
KW
2608 }
2609
0134edef 2610 if (!*swashp) /* load on-demand */
5ab9d2ef 2611 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
0134edef 2612
a6f87d8c 2613 if (special) {
0134edef 2614 /* It might be "special" (sometimes, but not always,
2a37f04d 2615 * a multicharacter mapping) */
4a8240a3 2616 HV *hv = NULL;
b08cf34e
JH
2617 SV **svp;
2618
4a8240a3
KW
2619 /* If passed in the specials name, use that; otherwise use any
2620 * given in the swash */
2621 if (*special != '\0') {
2622 hv = get_hv(special, 0);
2623 }
2624 else {
2625 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
2626 if (svp) {
2627 hv = MUTABLE_HV(SvRV(*svp));
2628 }
2629 }
2630
176fe009 2631 if (hv
5f560d8a 2632 && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
176fe009
KW
2633 && (*svp))
2634 {
cfd0369c 2635 const char *s;
47654450 2636
cfd0369c 2637 s = SvPV_const(*svp, len);
47654450 2638 if (len == 1)
f4cd282c 2639 /* EIGHTBIT */
c80e42f3 2640 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2a37f04d 2641 else {
d2dcd0fb 2642 Copy(s, ustrp, len, U8);
29e98929 2643 }
983ffd37 2644 }
0134edef
JH
2645 }
2646
2647 if (!len && *swashp) {
4a4088c4 2648 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
d4c19fe8 2649
0134edef
JH
2650 if (uv2) {
2651 /* It was "normal" (a single character mapping). */
f4cd282c 2652 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2a37f04d
JH
2653 }
2654 }
1feea2c7 2655
cbe07460
KW
2656 if (len) {
2657 if (lenp) {
2658 *lenp = len;
2659 }
2660 return valid_utf8_to_uvchr(ustrp, 0);
2661 }
2662
2663 /* Here, there was no mapping defined, which means that the code point maps
2664 * to itself. Return the inputs */
e24dfe9c 2665 cases_to_self:
bfdf22ec 2666 len = UTF8SKIP(p);
ca9fab46
KW
2667 if (p != ustrp) { /* Don't copy onto itself */
2668 Copy(p, ustrp, len, U8);
2669 }
0134edef 2670
2a37f04d
JH
2671 if (lenp)
2672 *lenp = len;
2673
f4cd282c 2674 return uv1;
cbe07460 2675
a0ed51b3
LW
2676}
2677
051a06d4 2678STATIC UV
357aadde 2679S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
051a06d4 2680{
4a4088c4 2681 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
2682 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
2683 * result contains a character that crosses the 255/256 boundary, disallow
2684 * the change, and return the original code point. See L<perlfunc/lc> for
2685 * why;
051a06d4 2686 *
a1433954
KW
2687 * p points to the original string whose case was changed; assumed
2688 * by this routine to be well-formed
051a06d4
KW
2689 * result the code point of the first character in the changed-case string
2690 * ustrp points to the changed-case string (<result> represents its first char)
2691 * lenp points to the length of <ustrp> */
2692
2693 UV original; /* To store the first code point of <p> */
2694
2695 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2696
a4f12ed7 2697 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
2698
2699 /* We know immediately if the first character in the string crosses the
2700 * boundary, so can skip */
2701 if (result > 255) {
2702
2703 /* Look at every character in the result; if any cross the
2704 * boundary, the whole thing is disallowed */
2705 U8* s = ustrp + UTF8SKIP(ustrp);
2706 U8* e = ustrp + *lenp;
2707 while (s < e) {
a4f12ed7 2708 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
2709 goto bad_crossing;
2710 }
2711 s += UTF8SKIP(s);
2712 }
2713
613abc6d
KW
2714 /* Here, no characters crossed, result is ok as-is, but we warn. */
2715 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
051a06d4
KW
2716 return result;
2717 }
2718
7b52d656 2719 bad_crossing:
051a06d4
KW
2720
2721 /* Failed, have to return the original */
4b88fb76 2722 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
2723
2724 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2725 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
147e3846
KW
2726 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8 locale; "
2727 "resolved to \"\\x{%" UVXf "}\".",
357aadde 2728 OP_DESC(PL_op),
ab0b796c
KW
2729 original,
2730 original);
051a06d4
KW
2731 Copy(p, ustrp, *lenp, char);
2732 return original;
2733}
2734
d3e79532 2735/*
87cea99e 2736=for apidoc to_utf8_upper
d3e79532 2737
1f607577 2738Instead use L</toUPPER_utf8>.
a1433954 2739
d3e79532
JH
2740=cut */
2741
051a06d4 2742/* Not currently externally documented, and subject to change:
31f05a37
KW
2743 * <flags> is set iff iff the rules from the current underlying locale are to
2744 * be used. */
051a06d4 2745
2104c8d9 2746UV
31f05a37 2747Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2748{
051a06d4
KW
2749 UV result;
2750
2751 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 2752
780fcc9f
KW
2753 if (flags) {
2754 /* Treat a UTF-8 locale as not being in locale at all */
2755 if (IN_UTF8_CTYPE_LOCALE) {
2756 flags = FALSE;
2757 }
2758 else {
2759 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2760 }
31f05a37
KW
2761 }
2762
3a4c58c9 2763 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2764 if (flags) {
2765 result = toUPPER_LC(*p);
2766 }
2767 else {
81c6c7ce 2768 return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
051a06d4 2769 }
3a4c58c9
KW
2770 }
2771 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2772 if (flags) {
a62b247b 2773 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2774 result = toUPPER_LC(c);
051a06d4
KW
2775 }
2776 else {
a62b247b 2777 return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2778 ustrp, lenp, 'S');
051a06d4
KW
2779 }
2780 }
4a4088c4 2781 else { /* UTF-8, ord above 255 */
b9992569 2782 result = CALL_UPPER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2783
2784 if (flags) {
357aadde 2785 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4
KW
2786 }
2787 return result;
2788 }
2789
4a4088c4 2790 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2791 if (UTF8_IS_INVARIANT(result)) {
2792 *ustrp = (U8) result;
2793 *lenp = 1;
2794 }
2795 else {
62cb07ea
KW
2796 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2797 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2798 *lenp = 2;
3a4c58c9 2799 }
baa60164 2800
051a06d4 2801 return result;
983ffd37 2802}
a0ed51b3 2803
d3e79532 2804/*
87cea99e 2805=for apidoc to_utf8_title
d3e79532 2806
1f607577 2807Instead use L</toTITLE_utf8>.
a1433954 2808
d3e79532
JH
2809=cut */
2810
051a06d4 2811/* Not currently externally documented, and subject to change:
31f05a37
KW
2812 * <flags> is set iff the rules from the current underlying locale are to be
2813 * used. Since titlecase is not defined in POSIX, for other than a
2814 * UTF-8 locale, uppercase is used instead for code points < 256.
445bf929 2815 */
051a06d4 2816
983ffd37 2817UV
31f05a37 2818Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
983ffd37 2819{
051a06d4
KW
2820 UV result;
2821
2822 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 2823
780fcc9f
KW
2824 if (flags) {
2825 /* Treat a UTF-8 locale as not being in locale at all */
2826 if (IN_UTF8_CTYPE_LOCALE) {
2827 flags = FALSE;
2828 }
2829 else {
2830 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2831 }
31f05a37
KW
2832 }
2833
3a4c58c9 2834 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2835 if (flags) {
2836 result = toUPPER_LC(*p);
2837 }
2838 else {
81c6c7ce 2839 return _to_upper_title_latin1(*p, ustrp, lenp, 's');
051a06d4 2840 }
3a4c58c9
KW
2841 }
2842 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2843 if (flags) {
a62b247b 2844 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2845 result = toUPPER_LC(c);
051a06d4
KW
2846 }
2847 else {
a62b247b 2848 return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2849 ustrp, lenp, 's');
051a06d4
KW
2850 }
2851 }
4a4088c4 2852 else { /* UTF-8, ord above 255 */
b9992569 2853 result = CALL_TITLE_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2854
2855 if (flags) {
357aadde 2856 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4
KW
2857 }
2858 return result;
2859 }
2860
4a4088c4 2861 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2862 if (UTF8_IS_INVARIANT(result)) {
2863 *ustrp = (U8) result;
2864 *lenp = 1;
2865 }
2866 else {
62cb07ea
KW
2867 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2868 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2869 *lenp = 2;
3a4c58c9
KW
2870 }
2871
051a06d4 2872 return result;
a0ed51b3
LW
2873}
2874
d3e79532 2875/*
87cea99e 2876=for apidoc to_utf8_lower
d3e79532 2877
1f607577 2878Instead use L</toLOWER_utf8>.
a1433954 2879
d3e79532
JH
2880=cut */
2881
051a06d4 2882/* Not currently externally documented, and subject to change:
31f05a37
KW
2883 * <flags> is set iff iff the rules from the current underlying locale are to
2884 * be used.
2885 */
051a06d4 2886
2104c8d9 2887UV
31f05a37 2888Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2889{
051a06d4
KW
2890 UV result;
2891
051a06d4 2892 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 2893
780fcc9f
KW
2894 if (flags) {
2895 /* Treat a UTF-8 locale as not being in locale at all */
2896 if (IN_UTF8_CTYPE_LOCALE) {
2897 flags = FALSE;
2898 }
2899 else {
2900 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2901 }
31f05a37
KW
2902 }
2903
968c5e6a 2904 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2905 if (flags) {
2906 result = toLOWER_LC(*p);
2907 }
2908 else {
81c6c7ce 2909 return to_lower_latin1(*p, ustrp, lenp);
051a06d4 2910 }
968c5e6a
KW
2911 }
2912 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2913 if (flags) {
a62b247b 2914 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2915 result = toLOWER_LC(c);
051a06d4
KW
2916 }
2917 else {
a62b247b 2918 return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2919 ustrp, lenp);
051a06d4 2920 }
968c5e6a 2921 }
4a4088c4 2922 else { /* UTF-8, ord above 255 */
b9992569 2923 result = CALL_LOWER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2924
2925 if (flags) {
357aadde 2926 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 2927 }
968c5e6a 2928
051a06d4
KW
2929 return result;
2930 }
2931
4a4088c4 2932 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2933 if (UTF8_IS_INVARIANT(result)) {
2934 *ustrp = (U8) result;
2935 *lenp = 1;
2936 }
2937 else {
62cb07ea
KW
2938 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2939 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
2940 *lenp = 2;
2941 }
2942
051a06d4 2943 return result;
b4e400f9
JH
2944}
2945
d3e79532 2946/*
87cea99e 2947=for apidoc to_utf8_fold
d3e79532 2948
1f607577 2949Instead use L</toFOLD_utf8>.
a1433954 2950
d3e79532
JH
2951=cut */
2952
051a06d4
KW
2953/* Not currently externally documented, and subject to change,
2954 * in <flags>
31f05a37
KW
2955 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2956 * locale are to be used.
051a06d4
KW
2957 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
2958 * otherwise simple folds
a0270393
KW
2959 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2960 * prohibited
445bf929 2961 */
36bb2ab6 2962
b4e400f9 2963UV
445bf929 2964Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
b4e400f9 2965{
051a06d4
KW
2966 UV result;
2967
36bb2ab6 2968 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 2969
a0270393
KW
2970 /* These are mutually exclusive */
2971 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2972
50ba90ff
KW
2973 assert(p != ustrp); /* Otherwise overwrites */
2974
780fcc9f
KW
2975 if (flags & FOLD_FLAGS_LOCALE) {
2976 /* Treat a UTF-8 locale as not being in locale at all */
2977 if (IN_UTF8_CTYPE_LOCALE) {
2978 flags &= ~FOLD_FLAGS_LOCALE;
2979 }
2980 else {
2981 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2982 }
31f05a37
KW
2983 }
2984
a1dde8de 2985 if (UTF8_IS_INVARIANT(*p)) {
051a06d4 2986 if (flags & FOLD_FLAGS_LOCALE) {
d22b930b 2987 result = toFOLD_LC(*p);
051a06d4
KW
2988 }
2989 else {
81c6c7ce 2990 return _to_fold_latin1(*p, ustrp, lenp,
1ca267a5 2991 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2992 }
a1dde8de
KW
2993 }
2994 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2995 if (flags & FOLD_FLAGS_LOCALE) {
a62b247b 2996 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2997 result = toFOLD_LC(c);
051a06d4
KW
2998 }
2999 else {
a62b247b 3000 return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
51910141 3001 ustrp, lenp,
1ca267a5 3002 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 3003 }
a1dde8de 3004 }
4a4088c4 3005 else { /* UTF-8, ord above 255 */
b9992569 3006 result = CALL_FOLD_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
a1dde8de 3007
1ca267a5
KW
3008 if (flags & FOLD_FLAGS_LOCALE) {
3009
76f2ffcd 3010# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
0766489e
KW
3011 const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
3012
3013# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3014# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
76f2ffcd
KW
3015
3016 const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
76f2ffcd 3017
538e84ed
KW
3018 /* Special case these two characters, as what normally gets
3019 * returned under locale doesn't work */
76f2ffcd
KW
3020 if (UTF8SKIP(p) == cap_sharp_s_len
3021 && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
1ca267a5 3022 {
ab0b796c
KW
3023 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3024 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3025 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3026 "resolved to \"\\x{17F}\\x{17F}\".");
1ca267a5
KW
3027 goto return_long_s;
3028 }
0766489e
KW
3029 else
3030#endif
3031 if (UTF8SKIP(p) == long_s_t_len
76f2ffcd 3032 && memEQ((char *) p, LONG_S_T, long_s_t_len))
9fc2026f 3033 {
ab0b796c
KW
3034 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3035 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3036 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3037 "resolved to \"\\x{FB06}\".");
9fc2026f
KW
3038 goto return_ligature_st;
3039 }
74894415
KW
3040
3041#if UNICODE_MAJOR_VERSION == 3 \
3042 && UNICODE_DOT_VERSION == 0 \
3043 && UNICODE_DOT_DOT_VERSION == 1
3044# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3045
3046 /* And special case this on this Unicode version only, for the same
3047 * reaons the other two are special cased. They would cross the
3048 * 255/256 boundary which is forbidden under /l, and so the code
3049 * wouldn't catch that they are equivalent (which they are only in
3050 * this release) */
3051 else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
3052 && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
3053 {
3054 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3055 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3056 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3057 "resolved to \"\\x{0131}\".");
3058 goto return_dotless_i;
3059 }
3060#endif
3061
357aadde 3062 return check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 3063 }
a0270393
KW
3064 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3065 return result;
3066 }
3067 else {
4a4088c4 3068 /* This is called when changing the case of a UTF-8-encoded
9fc2026f
KW
3069 * character above the ASCII range, and the result should not
3070 * contain an ASCII character. */
a0270393
KW
3071
3072 UV original; /* To store the first code point of <p> */
3073
3074 /* Look at every character in the result; if any cross the
3075 * boundary, the whole thing is disallowed */
3076 U8* s = ustrp;
3077 U8* e = ustrp + *lenp;
3078 while (s < e) {
3079 if (isASCII(*s)) {
3080 /* Crossed, have to return the original */
3081 original = valid_utf8_to_uvchr(p, lenp);
1ca267a5 3082
9fc2026f 3083 /* But in these instances, there is an alternative we can
1ca267a5 3084 * return that is valid */
0766489e
KW
3085 if (original == LATIN_SMALL_LETTER_SHARP_S
3086#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3087 || original == LATIN_CAPITAL_LETTER_SHARP_S
3088#endif
3089 ) {
1ca267a5
KW
3090 goto return_long_s;
3091 }
9fc2026f
KW
3092 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3093 goto return_ligature_st;
3094 }
74894415
KW
3095#if UNICODE_MAJOR_VERSION == 3 \
3096 && UNICODE_DOT_VERSION == 0 \
3097 && UNICODE_DOT_DOT_VERSION == 1
3098
3099 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3100 goto return_dotless_i;
3101 }
3102#endif
a0270393
KW
3103 Copy(p, ustrp, *lenp, char);
3104 return original;
3105 }
3106 s += UTF8SKIP(s);
3107 }
051a06d4 3108
a0270393
KW
3109 /* Here, no characters crossed, result is ok as-is */
3110 return result;
3111 }
051a06d4
KW
3112 }
3113
4a4088c4 3114 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
3115 if (UTF8_IS_INVARIANT(result)) {
3116 *ustrp = (U8) result;
3117 *lenp = 1;
3118 }
3119 else {
62cb07ea
KW
3120 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3121 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
3122 *lenp = 2;
3123 }
3124
051a06d4 3125 return result;
1ca267a5
KW
3126
3127 return_long_s:
3128 /* Certain folds to 'ss' are prohibited by the options, but they do allow
3129 * folds to a string of two of these characters. By returning this
3130 * instead, then, e.g.,
3131 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3132 * works. */
3133
3134 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
3135 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3136 ustrp, *lenp, U8);
3137 return LATIN_SMALL_LETTER_LONG_S;
9fc2026f
KW
3138
3139 return_ligature_st:
3140 /* Two folds to 'st' are prohibited by the options; instead we pick one and
3141 * have the other one fold to it */
3142
3143 *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
3144 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
3145 return LATIN_SMALL_LIGATURE_ST;
74894415
KW
3146
3147#if UNICODE_MAJOR_VERSION == 3 \
3148 && UNICODE_DOT_VERSION == 0 \
3149 && UNICODE_DOT_DOT_VERSION == 1
3150
3151 return_dotless_i:
3152 *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
3153 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
3154 return LATIN_SMALL_LETTER_DOTLESS_I;
3155
3156#endif
3157
a0ed51b3
LW
3158}
3159
711a919c 3160/* Note:
f90a9a02 3161 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
711a919c
ST
3162 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
3163 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
3164 */
c4a5db0c 3165
a0ed51b3 3166SV*
7fc63493 3167Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 3168{
c4a5db0c
KW
3169 PERL_ARGS_ASSERT_SWASH_INIT;
3170
3171 /* Returns a copy of a swash initiated by the called function. This is the
3172 * public interface, and returning a copy prevents others from doing
3173 * mischief on the original */
3174
5d3d13d1 3175 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
c4a5db0c
KW
3176}
3177
3178SV*
5d3d13d1 3179Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
c4a5db0c 3180{
2c1f00b9
YO
3181
3182 /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
3183 * use the following define */
3184
3185#define CORE_SWASH_INIT_RETURN(x) \
3186 PL_curpm= old_PL_curpm; \
3187 return x
3188
c4a5db0c 3189 /* Initialize and return a swash, creating it if necessary. It does this
87367d5f
KW
3190 * by calling utf8_heavy.pl in the general case. The returned value may be
3191 * the swash's inversion list instead if the input parameters allow it.
3192 * Which is returned should be immaterial to callers, as the only
923b6d4e
KW
3193 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
3194 * and swash_to_invlist() handle both these transparently.
c4a5db0c
KW
3195 *
3196 * This interface should only be used by functions that won't destroy or
3197 * adversely change the swash, as doing so affects all other uses of the
3198 * swash in the program; the general public should use 'Perl_swash_init'
3199 * instead.
3200 *
3201 * pkg is the name of the package that <name> should be in.
3202 * name is the name of the swash to find. Typically it is a Unicode
3203 * property name, including user-defined ones
3204 * listsv is a string to initialize the swash with. It must be of the form
3205 * documented as the subroutine return value in
3206 * L<perlunicode/User-Defined Character Properties>
3207 * minbits is the number of bits required to represent each data element.
3208 * It is '1' for binary properties.
3209 * none I (khw) do not understand this one, but it is used only in tr///.
9a53f6cf 3210 * invlist is an inversion list to initialize the swash with (or NULL)
83199d38
KW
3211 * flags_p if non-NULL is the address of various input and output flag bits
3212 * to the routine, as follows: ('I' means is input to the routine;
3213 * 'O' means output from the routine. Only flags marked O are
3214 * meaningful on return.)
3215 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
3216 * came from a user-defined property. (I O)
5d3d13d1
KW
3217 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
3218 * when the swash cannot be located, to simply return NULL. (I)
87367d5f
KW
3219 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
3220 * return of an inversion list instead of a swash hash if this routine
3221 * thinks that would result in faster execution of swash_fetch() later
3222 * on. (I)
9a53f6cf
KW
3223 *
3224 * Thus there are three possible inputs to find the swash: <name>,
3225 * <listsv>, and <invlist>. At least one must be specified. The result
3226 * will be the union of the specified ones, although <listsv>'s various
aabbdbda
KW
3227 * actions can intersect, etc. what <name> gives. To avoid going out to
3228 * disk at all, <invlist> should specify completely what the swash should
3229 * have, and <listsv> should be &PL_sv_undef and <name> should be "".
9a53f6cf
KW
3230 *
3231 * <invlist> is only valid for binary properties */
c4a5db0c 3232
2c1f00b9
YO
3233 PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
3234
c4a5db0c 3235 SV* retval = &PL_sv_undef;
83199d38 3236 HV* swash_hv = NULL;
87367d5f
KW
3237 const int invlist_swash_boundary =
3238 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
3239 ? 512 /* Based on some benchmarking, but not extensive, see commit
3240 message */
3241 : -1; /* Never return just an inversion list */
9a53f6cf
KW
3242
3243 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
3244 assert(! invlist || minbits == 1);
3245
2c1f00b9
YO
3246 PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
3247 that triggered the swash init and the swash init perl logic itself.
3248 See perl #122747 */
3249
9a53f6cf
KW
3250 /* If data was passed in to go out to utf8_heavy to find the swash of, do
3251 * so */
3252 if (listsv != &PL_sv_undef || strNE(name, "")) {
69794297
KW
3253 dSP;
3254 const size_t pkg_len = strlen(pkg);
3255 const size_t name_len = strlen(name);
3256 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
3257 SV* errsv_save;
3258 GV *method;
3259
3260 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
3261
3262 PUSHSTACKi(PERLSI_MAGIC);
ce3b816e 3263 ENTER;
69794297 3264 SAVEHINTS();
2782061f 3265 save_re_context();
650f067c
JL
3266 /* We might get here via a subroutine signature which uses a utf8
3267 * parameter name, at which point PL_subname will have been set
3268 * but not yet used. */
3269 save_item(PL_subname);
69794297
KW
3270 if (PL_parser && PL_parser->error_count)
3271 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
3272 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
4a4088c4 3273 if (!method) { /* demand load UTF-8 */
69794297 3274 ENTER;
db2c6cb3
FC
3275 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3276 GvSV(PL_errgv) = NULL;
1a419e6b 3277#ifndef NO_TAINT_SUPPORT
69794297
KW
3278 /* It is assumed that callers of this routine are not passing in
3279 * any user derived data. */
2782061f
DM
3280 /* Need to do this after save_re_context() as it will set
3281 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
3282 * in Perl_magic_get). Even line to create errsv_save can turn on
3283 * PL_tainted. */
284167a5
SM
3284 SAVEBOOL(TAINT_get);
3285 TAINT_NOT;
3286#endif
69794297
KW
3287 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
3288 NULL);
eed484f9 3289 {
db2c6cb3
FC
3290 /* Not ERRSV, as there is no need to vivify a scalar we are
3291 about to discard. */
3292 SV * const errsv = GvSV(PL_errgv);
3293 if (!SvTRUE(errsv)) {
3294 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3295 SvREFCNT_dec(errsv);
3296 }
eed484f9 3297 }
69794297
KW
3298 LEAVE;
3299 }
3300 SPAGAIN;
3301 PUSHMARK(SP);
3302 EXTEND(SP,5);
3303 mPUSHp(pkg, pkg_len);
3304 mPUSHp(name, name_len);
3305 PUSHs(listsv);
3306 mPUSHi(minbits);
3307 mPUSHi(none);
3308 PUTBACK;
db2c6cb3
FC
3309 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3310 GvSV(PL_errgv) = NULL;
69794297
KW
3311 /* If we already have a pointer to the method, no need to use
3312 * call_method() to repeat the lookup. */
c41800a8
KW
3313 if (method
3314 ? call_sv(MUTABLE_SV(method), G_SCALAR)
69794297
KW
3315 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
3316 {
3317 retval = *PL_stack_sp--;
3318 SvREFCNT_inc(retval);
3319 }
eed484f9 3320 {
db2c6cb3
FC
3321 /* Not ERRSV. See above. */
3322 SV * const errsv = GvSV(PL_errgv);
3323 if (!SvTRUE(errsv)) {
3324 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3325 SvREFCNT_dec(errsv);
3326 }
eed484f9 3327 }
ce3b816e 3328 LEAVE;
69794297
KW
3329 POPSTACK;
3330 if (IN_PERL_COMPILETIME) {
3331 CopHINTS_set(PL_curcop, PL_hints);
3332 }
3333 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
d95e4a00 3334 if (SvPOK(retval)) {
69794297
KW
3335
3336 /* If caller wants to handle missing properties, let them */
5d3d13d1 3337 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
2c1f00b9 3338 CORE_SWASH_INIT_RETURN(NULL);
69794297
KW
3339 }
3340 Perl_croak(aTHX_
147e3846 3341 "Can't find Unicode property definition \"%" SVf "\"",
69794297 3342 SVfARG(retval));
a25b5927 3343 NOT_REACHED; /* NOTREACHED */
d95e4a00 3344 }
69794297 3345 }
9a53f6cf 3346 } /* End of calling the module to find the swash */
36eb48b4 3347
83199d38
KW
3348 /* If this operation fetched a swash, and we will need it later, get it */
3349 if (retval != &PL_sv_undef
3350 && (minbits == 1 || (flags_p
3351 && ! (*flags_p
3352 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
3353 {
3354 swash_hv = MUTABLE_HV(SvRV(retval));
3355
3356 /* If we don't already know that there is a user-defined component to
3357 * this swash, and the user has indicated they wish to know if there is
3358 * one (by passing <flags_p>), find out */
3359 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
3360 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
3361 if (user_defined && SvUV(*user_defined)) {
3362 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
3363 }
3364 }
3365 }
3366
36eb48b4
KW
3367 /* Make sure there is an inversion list for binary properties */
3368 if (minbits == 1) {
3369 SV** swash_invlistsvp = NULL;
3370 SV* swash_invlist = NULL;
9a53f6cf 3371 bool invlist_in_swash_is_valid = FALSE;
02c85471
FC
3372 bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
3373 an unclaimed reference count */
36eb48b4 3374
9a53f6cf 3375 /* If this operation fetched a swash, get its already existing
83199d38 3376 * inversion list, or create one for it */
36eb48b4 3377
83199d38 3378 if (swash_hv) {
5c9f4bd2 3379 swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
9a53f6cf
KW
3380 if (swash_invlistsvp) {
3381 swash_invlist = *swash_invlistsvp;
3382 invlist_in_swash_is_valid = TRUE;
3383 }
3384 else {
36eb48b4 3385 swash_invlist = _swash_to_invlist(retval);
02c85471 3386 swash_invlist_unclaimed = TRUE;
9a53f6cf
KW
3387 }
3388 }
3389
3390 /* If an inversion list was passed in, have to include it */
3391 if (invlist) {
3392
3393 /* Any fetched swash will by now have an inversion list in it;
3394 * otherwise <swash_invlist> will be NULL, indicating that we
3395 * didn't fetch a swash */
3396 if (swash_invlist) {
3397
3398 /* Add the passed-in inversion list, which invalidates the one
3399 * already stored in the swash */
3400 invlist_in_swash_is_valid = FALSE;
eee4c920 3401 SvREADONLY_off(swash_invlist); /* Turned on again below */
9a53f6cf
KW
3402 _invlist_union(invlist, swash_invlist, &swash_invlist);
3403 }
3404 else {
3405
87367d5f
KW
3406 /* Here, there is no swash already. Set up a minimal one, if
3407 * we are going to return a swash */
3408 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
971d486f 3409 swash_hv = newHV();
4aca0fe6 3410 retval = newRV_noinc(MUTABLE_SV(swash_hv));
87367d5f 3411 }
9a53f6cf
KW
3412 swash_invlist = invlist;
3413 }
9a53f6cf
KW
3414 }
3415
3416 /* Here, we have computed the union of all the passed-in data. It may
3417 * be that there was an inversion list in the swash which didn't get
538e84ed 3418 * touched; otherwise save the computed one */
87367d5f
KW
3419 if (! invlist_in_swash_is_valid
3420 && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
3421 {
5c9f4bd2 3422 if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
69794297
KW
3423 {
3424 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3425 }
cc34d8c5
FC
3426 /* We just stole a reference count. */
3427 if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
3428 else SvREFCNT_inc_simple_void_NN(swash_invlist);
9a53f6cf 3429 }
87367d5f 3430
ee3222e3 3431 /* The result is immutable. Forbid attempts to change it. */
dbfdbd26
KW
3432 SvREADONLY_on(swash_invlist);
3433
c41800a8 3434 /* Use the inversion list stand-alone if small enough */
87367d5f
KW
3435 if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
3436 SvREFCNT_dec(retval);
02c85471
FC
3437 if (!swash_invlist_unclaimed)
3438 SvREFCNT_inc_simple_void_NN(swash_invlist);
3439 retval = newRV_noinc(swash_invlist);
87367d5f 3440 }
36eb48b4
KW
3441 }
3442
2c1f00b9
YO
3443 CORE_SWASH_INIT_RETURN(retval);
3444#undef CORE_SWASH_INIT_RETURN
a0ed51b3
LW
3445}
3446
035d37be
JH
3447
3448/* This API is wrong for special case conversions since we may need to
3449 * return several Unicode characters for a single Unicode character
3450 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
3451 * the lower-level routine, and it is similarly broken for returning
38684baa 3452 * multiple values. --jhi
b9992569 3453 * For those, you should use S__to_utf8_case() instead */
b0e3252e 3454/* Now SWASHGET is recasted into S_swatch_get in this file. */
680c470c
ST
3455
3456/* Note:
3457 * Returns the value of property/mapping C<swash> for the first character
3458 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
4a4088c4 3459 * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
3d0f8846 3460 * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
af2af982
KW
3461 *
3462 * A "swash" is a hash which contains initially the keys/values set up by
3463 * SWASHNEW. The purpose is to be able to completely represent a Unicode
3464 * property for all possible code points. Things are stored in a compact form
3465 * (see utf8_heavy.pl) so that calculation is required to find the actual
3466 * property value for a given code point. As code points are looked up, new
3467 * key/value pairs are added to the hash, so that the calculation doesn't have
3468 * to ever be re-done. Further, each calculation is done, not just for the
3469 * desired one, but for a whole block of code points adjacent to that one.
3470 * For binary properties on ASCII machines, the block is usually for 64 code
3471 * points, starting with a code point evenly divisible by 64. Thus if the
3472 * property value for code point 257 is requested, the code goes out and
3473 * calculates the property values for all 64 code points between 256 and 319,
3474 * and stores these as a single 64-bit long bit vector, called a "swatch",
3475 * under the key for code point 256. The key is the UTF-8 encoding for code
3476 * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
3477 * for a code point is 13 bytes, the key will be 12 bytes long. If the value
3478 * for code point 258 is then requested, this code realizes that it would be
3479 * stored under the key for 256, and would find that value and extract the
3480 * relevant bit, offset from 256.
3481 *
3482 * Non-binary properties are stored in as many bits as necessary to represent
3483 * their values (32 currently, though the code is more general than that), not
fc273927 3484 * as single bits, but the principle is the same: the value for each key is a
af2af982
KW
3485 * vector that encompasses the property values for all code points whose UTF-8
3486 * representations are represented by the key. That is, for all code points
3487 * whose UTF-8 representations are length N bytes, and the key is the first N-1
3488 * bytes of that.
680c470c 3489 */
a0ed51b3 3490UV
680c470c 3491Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 3492{
ef8f7699 3493 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
3494 U32 klen;
3495 U32 off;
9b56a019 3496 STRLEN slen = 0;
7d85a32c 3497 STRLEN needents;
cfd0369c 3498 const U8 *tmps = NULL;
979f2922 3499 SV *swatch;
08fb1ac5 3500 const U8 c = *ptr;
3568d838 3501
7918f24d
NC
3502 PERL_ARGS_ASSERT_SWASH_FETCH;
3503
87367d5f
KW
3504 /* If it really isn't a hash, it isn't really swash; must be an inversion
3505 * list */
3506 if (SvTYPE(hv) != SVt_PVHV) {
3507 return _invlist_contains_cp((SV*)hv,
3508 (do_utf8)
3509 ? valid_utf8_to_uvchr(ptr, NULL)
3510 : c);
3511 }
3512
08fb1ac5
KW
3513 /* We store the values in a "swatch" which is a vec() value in a swash
3514 * hash. Code points 0-255 are a single vec() stored with key length
3515 * (klen) 0. All other code points have a UTF-8 representation
3516 * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
3517 * share 0xAA..0xYY, which is the key in the hash to that vec. So the key
3518 * length for them is the length of the encoded char - 1. ptr[klen] is the
3519 * final byte in the sequence representing the character */
3520 if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
3521 klen = 0;
3522 needents = 256;
3523 off = c;
3568d838 3524 }
08fb1ac5
KW
3525 else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
3526 klen = 0;
3527 needents = 256;
a62b247b 3528 off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
979f2922
ST
3529 }
3530 else {
08fb1ac5
KW
3531 klen = UTF8SKIP(ptr) - 1;
3532
3533 /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
3534 * the vec is the final byte in the sequence. (In EBCDIC this is
3535 * converted to I8 to get consecutive values.) To help you visualize
3536 * all this:
3537 * Straight 1047 After final byte
3538 * UTF-8 UTF-EBCDIC I8 transform
3539 * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
3540 * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
3541 * ...
3542 * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
3543 * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
3544 * ...
3545 * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
3546 * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
3547 * ...
3548 * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
3549 * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
3550 * ...
3551 * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
3552 * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
3553 *
3554 * (There are no discontinuities in the elided (...) entries.)
3555 * The UTF-8 key for these 33 code points is '\xD0' (which also is the
3556 * key for the next 31, up through U+043F, whose UTF-8 final byte is
3557 * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
3558 * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
3559 * index into the vec() swatch (after subtracting 0x80, which we
3560 * actually do with an '&').
3561 * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
3562 * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
3563 * dicontinuities which go away by transforming it into I8, and we
3564 * effectively subtract 0xA0 to get the index. */
979f2922 3565 needents = (1 << UTF_ACCUMULATION_SHIFT);
bc3632a8 3566 off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
979f2922 3567 }
7d85a32c 3568
a0ed51b3 3569 /*
4a4088c4 3570 * This single-entry cache saves about 1/3 of the UTF-8 overhead in test
a0ed51b3
LW
3571 * suite. (That is, only 7-8% overall over just a hash cache. Still,
3572 * it's nothing to sniff at.) Pity we usually come through at least
3573 * two function calls to get here...
3574 *
3575 * NB: this code assumes that swatches are never modified, once generated!
3576 */
3577
3568d838 3578 if (hv == PL_last_swash_hv &&
a0ed51b3 3579 klen == PL_last_swash_klen &&
27da23d5 3580 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
3581 {
3582 tmps = PL_last_swash_tmps;
3583 slen = PL_last_swash_slen;
3584 }
3585 else {
3586 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 3587 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 3588
b0e3252e 3589 /* If not cached, generate it via swatch_get */
979f2922 3590 if (!svp || !SvPOK(*svp)
08fb1ac5
KW
3591 || !(tmps = (const U8*)SvPV_const(*svp, slen)))
3592 {
3593 if (klen) {
3594 const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
3595 swatch = swatch_get(swash,
3596 code_point & ~((UV)needents - 1),
3597 needents);
3598 }
3599 else { /* For the first 256 code points, the swatch has a key of
3600 length 0 */
3601 swatch = swatch_get(swash, 0, needents);
3602 }
979f2922 3603
923e4eb5 3604 if (IN_PERL_COMPILETIME)
623e6609 3605 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 3606
979f2922 3607 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 3608
979f2922
ST
3609 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
3610 || (slen << 3) < needents)
5637ef5b 3611 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
147e3846 3612 "svp=%p, tmps=%p, slen=%" UVuf ", needents=%" UVuf,
5637ef5b 3613 svp, tmps, (UV)slen, (UV)needents);
a0ed51b3
LW
3614 }
3615
3616 PL_last_swash_hv = hv;
16d8f38a 3617 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 3618 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
3619 /* FIXME change interpvar.h? */
3620 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
3621 PL_last_swash_slen = slen;
3622 if (klen)
3623 Copy(ptr, PL_last_swash_key, klen, U8);
3624 }
3625
9faf8d75 3626 switch ((int)((slen << 3) / needents)) {
a0ed51b3 3627 case 1:
e7aca353 3628 return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
a0ed51b3 3629 case 8:
e7aca353 3630 return ((UV) tmps[off]);
a0ed51b3
LW
3631 case 16:
3632 off <<= 1;
e7aca353
JH
3633 return
3634 ((UV) tmps[off ] << 8) +
3635 ((UV) tmps[off + 1]);
a0ed51b3
LW
3636 case 32:
3637 off <<= 2;
e7aca353
JH
3638 return
3639 ((UV) tmps[off ] << 24) +
3640 ((UV) tmps[off + 1] << 16) +
3641 ((UV) tmps[off + 2] << 8) +
3642 ((UV) tmps[off + 3]);
a0ed51b3 3643 }
5637ef5b 3644 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
147e3846 3645 "slen=%" UVuf ", needents=%" UVuf, (UV)slen, (UV)needents);
670f1322 3646 NORETURN_FUNCTION_END;
a0ed51b3 3647}
2b9d42f0 3648
319009ee
KW
3649/* Read a single line of the main body of the swash input text. These are of
3650 * the form:
3651 * 0053 0056 0073
3652 * where each number is hex. The first two numbers form the minimum and
3653 * maximum of a range, and the third is the value associated with the range.
3654 * Not all swashes should have a third number
3655 *
3656 * On input: l points to the beginning of the line to be examined; it points
3657 * to somewhere in the string of the whole input text, and is
3658 * terminated by a \n or the null string terminator.
3659 * lend points to the null terminator of that string
3660 * wants_value is non-zero if the swash expects a third number
3661 * typestr is the name of the swash's mapping, like 'ToLower'
3662 * On output: *min, *max, and *val are set to the values read from the line.
3663 * returns a pointer just beyond the line examined. If there was no
3664 * valid min number on the line, returns lend+1
3665 */
3666
3667STATIC U8*
3668S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3669 const bool wants_value, const U8* const typestr)
3670{
3671 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
3672 STRLEN numlen; /* Length of the number */
02470786
KW
3673 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3674 | PERL_SCAN_DISALLOW_PREFIX
3675 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3676
3677 /* nl points to the next \n in the scan */
3678 U8* const nl = (U8*)memchr(l, '\n', lend - l);
3679
95543e92
KW
3680 PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
3681
319009ee
KW
3682 /* Get the first number on the line: the range minimum */
3683 numlen = lend - l;
3684 *min = grok_hex((char *)l, &numlen, &flags, NULL);
c88850db 3685 *max = *min; /* So can never return without setting max */
319009ee
KW
3686 if (numlen) /* If found a hex number, position past it */
3687 l += numlen;
3688 else if (nl) { /* Else, go handle next line, if any */
3689 return nl + 1; /* 1 is length of "\n" */
3690 }
3691 else { /* Else, no next line */
3692 return lend + 1; /* to LIST's end at which \n is not found */
3693 }
3694
3695 /* The max range value follows, separated by a BLANK */
3696 if (isBLANK(*l)) {
3697 ++l;
02470786
KW
3698 flags = PERL_SCAN_SILENT_ILLDIGIT
3699 | PERL_SCAN_DISALLOW_PREFIX
3700 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3701 numlen = lend - l;
3702 *max = grok_hex((char *)l, &numlen, &flags, NULL);
3703 if (numlen)