This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/utf8.t: Correct to uppercase in print
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
a0ed51b3 35
806547a7 36static const char malformed_text[] = "Malformed UTF-8 character";
27da23d5 37static const char unees[] =
806547a7 38 "Malformed UTF-8 character (unexpected end of string)";
760c7c2f 39static const char cp_above_legal_max[] =
2d212e86 40 "Use of code point 0x%"UVXf" is deprecated; the permissible max is 0x%"UVXf"";
760c7c2f 41
114d9c4d 42#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
901b21bf 43
48ef279e 44/*
ccfc67b7 45=head1 Unicode Support
7fefc6c1 46These are various utility functions for manipulating UTF8-encoded
72d33970 47strings. For the uninitiated, this is a method of representing arbitrary
61296642 48Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
49characters in the ASCII range are unmodified, and a zero byte never appears
50within non-zero characters.
166f8a29 51
eaf7a4d2
CS
52=cut
53*/
54
55/*
378516de 56=for apidoc uvoffuni_to_utf8_flags
eebe1485 57
a27992cc 58THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af
KW
59Instead, B<Almost all code should use L</uvchr_to_utf8> or
60L</uvchr_to_utf8_flags>>.
a27992cc 61
de69f3af
KW
62This function is like them, but the input is a strict Unicode
63(as opposed to native) code point. Only in very rare circumstances should code
64not be using the native code point.
949cf498 65
efa9cd84 66For details, see the description for L</uvchr_to_utf8_flags>.
949cf498 67
eebe1485
SC
68=cut
69*/
70
8ee1cdcb
KW
71#define HANDLE_UNICODE_SURROGATE(uv, flags) \
72 STMT_START { \
73 if (flags & UNICODE_WARN_SURROGATE) { \
74 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
75 "UTF-16 surrogate U+%04"UVXf, uv); \
76 } \
77 if (flags & UNICODE_DISALLOW_SURROGATE) { \
78 return NULL; \
79 } \
80 } STMT_END;
81
82#define HANDLE_UNICODE_NONCHAR(uv, flags) \
83 STMT_START { \
84 if (flags & UNICODE_WARN_NONCHAR) { \
85 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
86 "Unicode non-character U+%04"UVXf" is not " \
87 "recommended for open interchange", uv); \
88 } \
89 if (flags & UNICODE_DISALLOW_NONCHAR) { \
90 return NULL; \
91 } \
92 } STMT_END;
93
ba6ed43c
KW
94/* Use shorter names internally in this file */
95#define SHIFT UTF_ACCUMULATION_SHIFT
96#undef MARK
97#define MARK UTF_CONTINUATION_MARK
98#define MASK UTF_CONTINUATION_MASK
99
dfe13c55 100U8 *
378516de 101Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 102{
378516de 103 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
7918f24d 104
2d1545e5 105 if (OFFUNI_IS_INVARIANT(uv)) {
4c8cd605 106 *d++ = LATIN1_TO_NATIVE(uv);
d9432125
KW
107 return d;
108 }
facc1dc2 109
3ea68d71 110 if (uv <= MAX_UTF8_TWO_BYTE) {
facc1dc2
KW
111 *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
112 *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
3ea68d71
KW
113 return d;
114 }
d9432125 115
ba6ed43c
KW
116 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
117 * below, the 16 is for start bytes E0-EF (which are all the possible ones
118 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
119 * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
120 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
121 * 0x800-0xFFFF on ASCII */
122 if (uv < (16 * (1U << (2 * SHIFT)))) {
123 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
124 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
125 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
126
127#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
128 aren't tested here */
129 /* The most likely code points in this range are below the surrogates.
130 * Do an extra test to quickly exclude those. */
131 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
132 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
133 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
134 {
8ee1cdcb
KW
135 HANDLE_UNICODE_NONCHAR(uv, flags);
136 }
137 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
138 HANDLE_UNICODE_SURROGATE(uv, flags);
760c7c2f 139 }
ba6ed43c
KW
140 }
141#endif
142 return d;
143 }
144
145 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
146 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
147 * happen starting with 4-byte characters on ASCII platforms. We unify the
148 * code for these with EBCDIC, even though some of them require 5-bytes on
149 * those, because khw believes the code saving is worth the very slight
150 * performance hit on these high EBCDIC code points. */
151
152 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
a5bf80e0
KW
153 if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
154 && ckWARN_d(WARN_DEPRECATED))
155 {
156 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
157 cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
158 }
159 if ( (flags & UNICODE_WARN_SUPER)
160 || ( UNICODE_IS_ABOVE_31_BIT(uv)
161 && (flags & UNICODE_WARN_ABOVE_31_BIT)))
162 {
163 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
164
165 /* Choose the more dire applicable warning */
166 (UNICODE_IS_ABOVE_31_BIT(uv))
167 ? "Code point 0x%"UVXf" is not Unicode, and not portable"
168 : "Code point 0x%"UVXf" is not Unicode, may not be portable",
169 uv);
170 }
171 if (flags & UNICODE_DISALLOW_SUPER
172 || ( UNICODE_IS_ABOVE_31_BIT(uv)
173 && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
174 {
175 return NULL;
176 }
177 }
ba6ed43c
KW
178 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
179 HANDLE_UNICODE_NONCHAR(uv, flags);
507b9800 180 }
d9432125 181
ba6ed43c
KW
182 /* Test for and handle 4-byte result. In the test immediately below, the
183 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
184 * characters). The 3 is for 3 continuation bytes; these each contribute
185 * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
186 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
187 * 0x1_0000-0x1F_FFFF on ASCII */
188 if (uv < (8 * (1U << (3 * SHIFT)))) {
189 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
190 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
191 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
192 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
193
194#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
195 characters. The end-plane non-characters for EBCDIC were
196 handled just above */
197 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
198 HANDLE_UNICODE_NONCHAR(uv, flags);
d528804a 199 }
ba6ed43c
KW
200 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
201 HANDLE_UNICODE_SURROGATE(uv, flags);
202 }
203#endif
204
205 return d;
206 }
207
208 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
209 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
210 * format. The unrolled version above turns out to not save all that much
211 * time, and at these high code points (well above the legal Unicode range
212 * on ASCII platforms, and well above anything in common use in EBCDIC),
213 * khw believes that less code outweighs slight performance gains. */
214
d9432125 215 {
5aaebcb3 216 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
217 U8 *p = d+len-1;
218 while (p > d) {
4c8cd605 219 *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
220 uv >>= UTF_ACCUMULATION_SHIFT;
221 }
4c8cd605 222 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
223 return d+len;
224 }
a0ed51b3 225}
a5bf80e0 226
646ca15d 227/*
07693fe6
KW
228=for apidoc uvchr_to_utf8
229
bcb1a2d4 230Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 231of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
232C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
233the byte after the end of the new character. In other words,
07693fe6
KW
234
235 d = uvchr_to_utf8(d, uv);
236
237is the recommended wide native character-aware way of saying
238
239 *(d++) = uv;
240
760c7c2f
KW
241This function accepts any UV as input, but very high code points (above
242C<IV_MAX> on the platform) will raise a deprecation warning. This is
243typically 0x7FFF_FFFF in a 32-bit word.
244
245It is possible to forbid or warn on non-Unicode code points, or those that may
246be problematic by using L</uvchr_to_utf8_flags>.
de69f3af 247
07693fe6
KW
248=cut
249*/
250
de69f3af
KW
251/* This is also a macro */
252PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
253
07693fe6
KW
254U8 *
255Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
256{
de69f3af 257 return uvchr_to_utf8(d, uv);
07693fe6
KW
258}
259
de69f3af
KW
260/*
261=for apidoc uvchr_to_utf8_flags
262
263Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 264of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
265C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
266the byte after the end of the new character. In other words,
de69f3af
KW
267
268 d = uvchr_to_utf8_flags(d, uv, flags);
269
270or, in most cases,
271
272 d = uvchr_to_utf8_flags(d, uv, 0);
273
274This is the Unicode-aware way of saying
275
276 *(d++) = uv;
277
760c7c2f
KW
278If C<flags> is 0, this function accepts any UV as input, but very high code
279points (above C<IV_MAX> for the platform) will raise a deprecation warning.
280This is typically 0x7FFF_FFFF in a 32-bit word.
281
282Specifying C<flags> can further restrict what is allowed and not warned on, as
283follows:
de69f3af 284
796b6530 285If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
7ee537e6
KW
286the function will raise a warning, provided UTF8 warnings are enabled. If
287instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
288NULL. If both flags are set, the function will both warn and return NULL.
de69f3af 289
760c7c2f
KW
290Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
291affect how the function handles a Unicode non-character.
93e6dbd6 292
760c7c2f
KW
293And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
294affect the handling of code points that are above the Unicode maximum of
2950x10FFFF. Languages other than Perl may not be able to accept files that
296contain these.
93e6dbd6
KW
297
298The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
299the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
ecc1615f
KW
300three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
301allowed inputs to the strict UTF-8 traditionally defined by Unicode.
302Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
303C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
304above-Unicode and surrogate flags, but not the non-character ones, as
305defined in
306L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
307See L<perlunicode/Noncharacter code points>.
93e6dbd6 308
ab8e6d41
KW
309Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
310so using them is more problematic than other above-Unicode code points. Perl
311invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
312likely that non-Perl languages will not be able to read files that contain
313these that written by the perl interpreter; nor would Perl understand files
314written by something that uses a different extension. For these reasons, there
315is a separate set of flags that can warn and/or disallow these extremely high
316code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
317C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags. These
318are entirely independent from the deprecation warning for code points above
319C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
320code point that needs more than 31 bits to represent. When that happens,
321effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
32232-bit machines. (Of course C<UNICODE_DISALLOW_SUPER> will treat all
ab8e6d41
KW
323above-Unicode code points, including these, as malformations; and
324C<UNICODE_WARN_SUPER> warns on these.)
325
326On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
327extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
328than on ASCII. Prior to that, code points 2**31 and higher were simply
329unrepresentable, and a different, incompatible method was used to represent
330code points between 2**30 and 2**31 - 1. The flags C<UNICODE_WARN_ABOVE_31_BIT>
331and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
332platforms, warning and disallowing 2**31 and higher.
de69f3af 333
de69f3af
KW
334=cut
335*/
336
337/* This is also a macro */
338PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
339
07693fe6
KW
340U8 *
341Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
342{
de69f3af 343 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
344}
345
83dc0f42
KW
346PERL_STATIC_INLINE bool
347S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
348{
349 /* Returns TRUE if the first code point represented by the Perl-extended-
350 * UTF-8-encoded string starting at 's', and looking no further than 'e -
351 * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
352 *
353 * The function handles the case where the input bytes do not include all
354 * the ones necessary to represent a full character. That is, they may be
355 * the intial bytes of the representation of a code point, but possibly
356 * the final ones necessary for the complete representation may be beyond
357 * 'e - 1'.
358 *
359 * The function assumes that the sequence is well-formed UTF-8 as far as it
360 * goes, and is for a UTF-8 variant code point. If the sequence is
361 * incomplete, the function returns FALSE if there is any well-formed
362 * UTF-8 byte sequence that can complete it in such a way that a code point
363 * < 2**31 is produced; otherwise it returns TRUE.
364 *
365 * Getting this exactly right is slightly tricky, and has to be done in
366 * several places in this file, so is centralized here. It is based on the
367 * following table:
368 *
369 * U+7FFFFFFF (2 ** 31 - 1)
370 * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
371 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
372 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
373 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
374 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
375 * U+80000000 (2 ** 31):
376 * ASCII: \xFE\x82\x80\x80\x80\x80\x80
377 * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
378 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
379 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
380 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
381 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
382 */
383
384#ifdef EBCDIC
385
b343c774
KW
386 /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
387 const U8 * const prefix = (U8 *) "\x41\x41\x41\x41\x41\x41\x42";
83dc0f42
KW
388 const STRLEN prefix_len = sizeof(prefix) - 1;
389 const STRLEN len = e - s;
f880f78a 390 const STRLEN cmp_len = MIN(prefix_len, len - 1);
83dc0f42
KW
391
392#else
393
394 PERL_UNUSED_ARG(e);
395
396#endif
397
398 PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
399
400 assert(! UTF8_IS_INVARIANT(*s));
401
402#ifndef EBCDIC
403
404 /* Technically, a start byte of FE can be for a code point that fits into
405 * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
406 * malformation. */
407 return (*s >= 0xFE);
408
409#else
410
411 /* On the EBCDIC code pages we handle, only 0xFE can mean a 32-bit or
412 * larger code point (0xFF is an invariant). For 0xFE, we need at least 2
413 * bytes, and maybe up through 8 bytes, to be sure if the value is above 31
414 * bits. */
415 if (*s != 0xFE || len == 1) {
416 return FALSE;
417 }
418
419 /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
420 * \x41 and \x42. */
421 return cBOOL(memGT(s + 1, prefix, cmp_len));
422
423#endif
424
425}
426
12a4bed3
KW
427PERL_STATIC_INLINE bool
428S_does_utf8_overflow(const U8 * const s, const U8 * e)
429{
430 const U8 *x;
431 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
432
433 /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
434 * platform, that is if it represents a code point larger than the highest
435 * representable code point. (For ASCII platforms, we could use memcmp()
436 * because we don't have to convert each byte to I8, but it's very rare
437 * input indeed that would approach overflow, so the loop below will likely
438 * only get executed once.
439 *
440 * 'e' must not be beyond a full character. If it is less than a full
441 * character, the function returns FALSE if there is any input beyond 'e'
442 * that could result in a non-overflowing code point */
443
444 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
445 assert(s + UTF8SKIP(s) >= e);
446
447 for (x = s; x < e; x++, y++) {
448
449 /* If this byte is larger than the corresponding highest UTF-8 byte, it
450 * overflows */
451 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
452 return TRUE;
453 }
454
455 /* If not the same as this byte, it must be smaller, doesn't overflow */
456 if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
457 return FALSE;
458 }
459 }
460
461 /* Got to the end and all bytes are the same. If the input is a whole
462 * character, it doesn't overflow. And if it is a partial character,
463 * there's not enough information to tell, so assume doesn't overflow */
464 return FALSE;
465}
466
467PERL_STATIC_INLINE bool
468S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
469{
470 /* Overlongs can occur whenever the number of continuation bytes
471 * changes. That means whenever the number of leading 1 bits in a start
472 * byte increases from the next lower start byte. That happens for start
473 * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following
474 * illegal start bytes have already been excluded, so don't need to be
475 * tested here;
476 * ASCII platforms: C0, C1
477 * EBCDIC platforms C0, C1, C2, C3, C4, E0
478 *
479 * At least a second byte is required to determine if other sequences will
480 * be an overlong. */
481
482 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
483 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
484
485 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
486 assert(len > 1 && UTF8_IS_START(*s));
487
488 /* Each platform has overlongs after the start bytes given above (expressed
489 * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
490 * the logic is the same, except the E0 overlong has already been excluded
491 * on EBCDIC platforms. The values below were found by manually
492 * inspecting the UTF-8 patterns. See the tables in utf8.h and
493 * utfebcdic.h. */
494
495# ifdef EBCDIC
496# define F0_ABOVE_OVERLONG 0xB0
497# define F8_ABOVE_OVERLONG 0xA8
498# define FC_ABOVE_OVERLONG 0xA4
499# define FE_ABOVE_OVERLONG 0xA2
500# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
501 /* I8(0xfe) is FF */
502# else
503
504 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
505 return TRUE;
506 }
507
508# define F0_ABOVE_OVERLONG 0x90
509# define F8_ABOVE_OVERLONG 0x88
510# define FC_ABOVE_OVERLONG 0x84
511# define FE_ABOVE_OVERLONG 0x82
512# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
513# endif
514
515
516 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
517 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
518 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
519 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
520 {
521 return TRUE;
522 }
523
524# if defined(UV_IS_QUAD) || defined(EBCDIC)
525
526 /* Check for the FF overlong. This happens only if all these bytes match;
527 * what comes after them doesn't matter. See tables in utf8.h,
528 * utfebcdic.h. (Can't happen on ASCII 32-bit platforms, as overflows
529 * instead.) */
530
531 if ( len >= sizeof(FF_OVERLONG_PREFIX) - 1
532 && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
533 sizeof(FF_OVERLONG_PREFIX) - 1)))
534 {
535 return TRUE;
536 }
537
210ad863
KW
538#else
539
540 PERL_UNUSED_ARG(len);
541
12a4bed3
KW
542#endif
543
544 return FALSE;
545}
546
547#undef F0_ABOVE_OVERLONG
548#undef F8_ABOVE_OVERLONG
549#undef FC_ABOVE_OVERLONG
550#undef FE_ABOVE_OVERLONG
551#undef FF_OVERLONG_PREFIX
552
35f8c9bd 553STRLEN
edc2c47a 554Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
35f8c9bd 555{
2b479609 556 STRLEN len;
12a4bed3 557 const U8 *x;
35f8c9bd 558
2b479609
KW
559 /* A helper function that should not be called directly.
560 *
561 * This function returns non-zero if the string beginning at 's' and
562 * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
563 * code point; otherwise it returns 0. The examination stops after the
564 * first code point in 's' is validated, not looking at the rest of the
565 * input. If 'e' is such that there are not enough bytes to represent a
566 * complete code point, this function will return non-zero anyway, if the
567 * bytes it does have are well-formed UTF-8 as far as they go, and aren't
568 * excluded by 'flags'.
569 *
570 * A non-zero return gives the number of bytes required to represent the
571 * code point. Be aware that if the input is for a partial character, the
572 * return will be larger than 'e - s'.
573 *
574 * This function assumes that the code point represented is UTF-8 variant.
575 * The caller should have excluded this possibility before calling this
576 * function.
577 *
578 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
579 * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
580 * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
581 * disallowed by the flags. If the input is only for a partial character,
582 * the function will return non-zero if there is any sequence of
583 * well-formed UTF-8 that, when appended to the input sequence, could
584 * result in an allowed code point; otherwise it returns 0. Non characters
585 * cannot be determined based on partial character input. But many of the
586 * other excluded types can be determined with just the first one or two
587 * bytes.
588 *
589 */
590
591 PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
592
593 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
594 |UTF8_DISALLOW_ABOVE_31_BIT)));
595 assert(! UTF8_IS_INVARIANT(*s));
35f8c9bd 596
2b479609 597 /* A variant char must begin with a start byte */
35f8c9bd
KW
598 if (UNLIKELY(! UTF8_IS_START(*s))) {
599 return 0;
600 }
601
edc2c47a
KW
602 /* Examine a maximum of a single whole code point */
603 if (e - s > UTF8SKIP(s)) {
604 e = s + UTF8SKIP(s);
605 }
606
2b479609
KW
607 len = e - s;
608
609 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
610 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
35f8c9bd 611
2b479609
KW
612 /* The code below is derived from this table. Keep in mind that legal
613 * continuation bytes range between \x80..\xBF for UTF-8, and
614 * \xA0..\xBF for I8. Anything above those aren't continuation bytes.
615 * Hence, we don't have to test the upper edge because if any of those
616 * are encountered, the sequence is malformed, and will fail elsewhere
617 * in this function.
618 * UTF-8 UTF-EBCDIC I8
619 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
620 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
621 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
622 *
623 */
624
625#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
626# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
19794540 627# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
2b479609 628
19794540
KW
629# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
630 /* B6 and B7 */ \
631 && ((s1) & 0xFE ) == 0xB6)
2b479609
KW
632#else
633# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
19794540
KW
634# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
635# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
2b479609
KW
636#endif
637
638 if ( (flags & UTF8_DISALLOW_SUPER)
639 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) {
640 return 0; /* Above Unicode */
641 }
642
643 if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT)
644 && UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
645 {
646 return 0; /* Above 31 bits */
647 }
648
649 if (len > 1) {
650 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
651
652 if ( (flags & UTF8_DISALLOW_SUPER)
19794540 653 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
2b479609
KW
654 {
655 return 0; /* Above Unicode */
656 }
657
658 if ( (flags & UTF8_DISALLOW_SURROGATE)
19794540 659 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
2b479609
KW
660 {
661 return 0; /* Surrogate */
662 }
663
664 if ( (flags & UTF8_DISALLOW_NONCHAR)
665 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
666 {
667 return 0; /* Noncharacter code point */
668 }
669 }
670 }
671
672 /* Make sure that all that follows are continuation bytes */
35f8c9bd
KW
673 for (x = s + 1; x < e; x++) {
674 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
675 return 0;
676 }
677 }
678
af13dd8a 679 /* Here is syntactically valid. Next, make sure this isn't the start of an
12a4bed3
KW
680 * overlong. */
681 if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
682 return 0;
af13dd8a
KW
683 }
684
12a4bed3
KW
685 /* And finally, that the code point represented fits in a word on this
686 * platform */
687 if (does_utf8_overflow(s, e)) {
688 return 0;
35f8c9bd
KW
689 }
690
2b479609 691 return UTF8SKIP(s);
35f8c9bd
KW
692}
693
7cf8d05d
KW
694STATIC char *
695S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
696{
697 /* Returns a mortalized C string that is a displayable copy of the 'len'
698 * bytes starting at 's', each in a \xXY format. */
699
700 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
701 trailing NUL */
702 const U8 * const e = s + len;
703 char * output;
704 char * d;
705
706 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
707
708 Newx(output, output_len, char);
709 SAVEFREEPV(output);
710
711 d = output;
712 for (; s < e; s++) {
713 const unsigned high_nibble = (*s & 0xF0) >> 4;
714 const unsigned low_nibble = (*s & 0x0F);
715
716 *d++ = '\\';
717 *d++ = 'x';
718
719 if (high_nibble < 10) {
720 *d++ = high_nibble + '0';
721 }
722 else {
723 *d++ = high_nibble - 10 + 'a';
724 }
725
726 if (low_nibble < 10) {
727 *d++ = low_nibble + '0';
728 }
729 else {
730 *d++ = low_nibble - 10 + 'a';
731 }
732 }
733
734 *d = '\0';
735 return output;
736}
737
806547a7 738PERL_STATIC_INLINE char *
7cf8d05d
KW
739S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
740
741 /* How many bytes to print */
3cc6a05e 742 STRLEN print_len,
7cf8d05d
KW
743
744 /* Which one is the non-continuation */
745 const STRLEN non_cont_byte_pos,
746
747 /* How many bytes should there be? */
748 const STRLEN expect_len)
806547a7
KW
749{
750 /* Return the malformation warning text for an unexpected continuation
751 * byte. */
752
7cf8d05d 753 const char * const where = (non_cont_byte_pos == 1)
806547a7 754 ? "immediately"
7cf8d05d
KW
755 : Perl_form(aTHX_ "%d bytes",
756 (int) non_cont_byte_pos);
3cc6a05e 757 unsigned int i;
806547a7
KW
758
759 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
760
7cf8d05d
KW
761 /* We don't need to pass this parameter, but since it has already been
762 * calculated, it's likely faster to pass it; verify under DEBUGGING */
763 assert(expect_len == UTF8SKIP(s));
764
3cc6a05e
KW
765 /* It is possible that utf8n_to_uvchr() was called incorrectly, with a
766 * length that is larger than is actually available in the buffer. If we
767 * print all the bytes based on that length, we will read past the buffer
768 * end. Often, the strings are NUL terminated, so to lower the chances of
769 * this happening, print the malformed bytes only up through any NUL. */
770 for (i = 1; i < print_len; i++) {
771 if (*(s + i) == '\0') {
772 print_len = i + 1; /* +1 gets the NUL printed */
773 break;
774 }
775 }
776
7cf8d05d
KW
777 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
778 " %s after start byte 0x%02x; need %d bytes, got %d)",
779 malformed_text,
780 _byte_dump_string(s, print_len),
781 *(s + non_cont_byte_pos),
782 where,
783 *s,
784 (int) expect_len,
785 (int) non_cont_byte_pos);
806547a7
KW
786}
787
35f8c9bd
KW
788/*
789
de69f3af 790=for apidoc utf8n_to_uvchr
378516de
KW
791
792THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af 793Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
67e989fb 794
9041c2e3 795Bottom level UTF-8 decode routine.
de69f3af 796Returns the native code point value of the first character in the string C<s>,
746afd53
KW
797which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
798C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
799the length, in bytes, of that character.
949cf498
KW
800
801The value of C<flags> determines the behavior when C<s> does not point to a
2b5e7bc2
KW
802well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
803causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
804is the next possible position in C<s> that could begin a non-malformed
805character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
806is raised. Some UTF-8 input sequences may contain multiple malformations.
807This function tries to find every possible one in each call, so multiple
808warnings can be raised for each sequence.
949cf498
KW
809
810Various ALLOW flags can be set in C<flags> to allow (and not warn on)
811individual types of malformations, such as the sequence being overlong (that
812is, when there is a shorter sequence that can express the same code point;
813overlong sequences are expressly forbidden in the UTF-8 standard due to
814potential security issues). Another malformation example is the first byte of
815a character not being a legal first byte. See F<utf8.h> for the list of such
524080c4
KW
816flags. For allowed 0 length strings, this function returns 0; for allowed
817overlong sequences, the computed code point is returned; for all other allowed
818malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
819determinable reasonable value.
949cf498 820
796b6530 821The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
822flags) malformation is found. If this flag is set, the routine assumes that
823the caller will raise a warning, and this function will silently just set
d088425d
KW
824C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
825
75200dff 826Note that this API requires disambiguation between successful decoding a C<NUL>
796b6530 827character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
111fa700
KW
828in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
829be set to 1. To disambiguate, upon a zero return, see if the first byte of
830C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
f9380377 831error. Or you can use C<L</utf8n_to_uvchr_error>>.
949cf498
KW
832
833Certain code points are considered problematic. These are Unicode surrogates,
746afd53 834Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 835By default these are considered regular code points, but certain situations
ecc1615f
KW
836warrant special handling for them, which can be specified using the C<flags>
837parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
838three classes are treated as malformations and handled as such. The flags
839C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
840C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
841disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
842restricts the allowed inputs to the strict UTF-8 traditionally defined by
843Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
844definition given by
845L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
846The difference between traditional strictness and C9 strictness is that the
847latter does not forbid non-character code points. (They are still discouraged,
848however.) For more discussion see L<perlunicode/Noncharacter code points>.
849
850The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
851C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
796b6530
KW
852C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
853raised for their respective categories, but otherwise the code points are
854considered valid (not malformations). To get a category to both be treated as
855a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 856(But note that warnings are not raised if lexically disabled nor if
796b6530 857C<UTF8_CHECK_ONLY> is also specified.)
949cf498 858
760c7c2f
KW
859It is now deprecated to have very high code points (above C<IV_MAX> on the
860platforms) and this function will raise a deprecation warning for these (unless
d5944cab 861such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1)
760c7c2f 862in a 32-bit word.
ab8e6d41
KW
863
864Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
865so using them is more problematic than other above-Unicode code points. Perl
866invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
867likely that non-Perl languages will not be able to read files that contain
aff2be59 868these; nor would Perl understand files
ab8e6d41
KW
869written by something that uses a different extension. For these reasons, there
870is a separate set of flags that can warn and/or disallow these extremely high
871code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
872C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags. These
873are entirely independent from the deprecation warning for code points above
874C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
875code point that needs more than 31 bits to represent. When that happens,
876effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
87732-bit machines. (Of course C<UTF8_DISALLOW_SUPER> will treat all
ab8e6d41
KW
878above-Unicode code points, including these, as malformations; and
879C<UTF8_WARN_SUPER> warns on these.)
880
881On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
882extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
883than on ASCII. Prior to that, code points 2**31 and higher were simply
884unrepresentable, and a different, incompatible method was used to represent
885code points between 2**30 and 2**31 - 1. The flags C<UTF8_WARN_ABOVE_31_BIT>
886and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
887platforms, warning and disallowing 2**31 and higher.
949cf498
KW
888
889All other code points corresponding to Unicode characters, including private
890use and those yet to be assigned, are never considered malformed and never
891warn.
67e989fb 892
37607a96 893=cut
f9380377
KW
894
895Also implemented as a macro in utf8.h
896*/
897
898UV
899Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
900 STRLEN curlen,
901 STRLEN *retlen,
902 const U32 flags)
903{
904 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
905
906 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
907}
908
909/*
910
911=for apidoc utf8n_to_uvchr_error
912
913THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
914Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
915
916This function is for code that needs to know what the precise malformation(s)
917are when an error is found.
918
919It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
920all the others, C<errors>. If this parameter is 0, this function behaves
921identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
922to a C<U32> variable, which this function sets to indicate any errors found.
923Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
924C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
925of these bits will be set if a malformation is found, even if the input
926C<flags> parameter indicates that the given malformation is allowed; the
927exceptions are noted:
928
929=over 4
930
931=item C<UTF8_GOT_ABOVE_31_BIT>
932
933The code point represented by the input UTF-8 sequence occupies more than 31
934bits.
935This bit is set only if the input C<flags> parameter contains either the
936C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
937
938=item C<UTF8_GOT_CONTINUATION>
939
940The input sequence was malformed in that the first byte was a a UTF-8
941continuation byte.
942
943=item C<UTF8_GOT_EMPTY>
944
945The input C<curlen> parameter was 0.
946
947=item C<UTF8_GOT_LONG>
948
949The input sequence was malformed in that there is some other sequence that
950evaluates to the same code point, but that sequence is shorter than this one.
951
952=item C<UTF8_GOT_NONCHAR>
953
954The code point represented by the input UTF-8 sequence is for a Unicode
955non-character code point.
956This bit is set only if the input C<flags> parameter contains either the
957C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
958
959=item C<UTF8_GOT_NON_CONTINUATION>
960
961The input sequence was malformed in that a non-continuation type byte was found
962in a position where only a continuation type one should be.
963
964=item C<UTF8_GOT_OVERFLOW>
965
966The input sequence was malformed in that it is for a code point that is not
967representable in the number of bits available in a UV on the current platform.
968
969=item C<UTF8_GOT_SHORT>
970
971The input sequence was malformed in that C<curlen> is smaller than required for
972a complete sequence. In other words, the input is for a partial character
973sequence.
974
975=item C<UTF8_GOT_SUPER>
976
977The input sequence was malformed in that it is for a non-Unicode code point;
978that is, one above the legal Unicode maximum.
979This bit is set only if the input C<flags> parameter contains either the
980C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
981
982=item C<UTF8_GOT_SURROGATE>
983
984The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
985code point.
986This bit is set only if the input C<flags> parameter contains either the
987C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
988
989=back
990
991=cut
37607a96 992*/
67e989fb 993
a0ed51b3 994UV
f9380377
KW
995Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
996 STRLEN curlen,
997 STRLEN *retlen,
998 const U32 flags,
999 U32 * errors)
a0ed51b3 1000{
d4c19fe8 1001 const U8 * const s0 = s;
2b5e7bc2
KW
1002 U8 * send = NULL; /* (initialized to silence compilers' wrong
1003 warning) */
1004 U32 possible_problems = 0; /* A bit is set here for each potential problem
1005 found as we go along */
eb83ed87 1006 UV uv = *s;
2b5e7bc2
KW
1007 STRLEN expectlen = 0; /* How long should this sequence be?
1008 (initialized to silence compilers' wrong
1009 warning) */
f9380377
KW
1010 U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
1011 this gets set and discarded */
a0dbb045 1012
2b5e7bc2
KW
1013 /* The below are used only if there is both an overlong malformation and a
1014 * too short one. Otherwise the first two are set to 's0' and 'send', and
1015 * the third not used at all */
1016 U8 * adjusted_s0 = (U8 *) s0;
5ec712b1
KW
1017 U8 * adjusted_send = NULL; /* (Initialized to silence compilers' wrong
1018 warning) */
2b5e7bc2 1019 UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
7918f24d 1020
f9380377
KW
1021 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1022
1023 if (errors) {
1024 *errors = 0;
1025 }
1026 else {
1027 errors = &discard_errors;
1028 }
a0dbb045 1029
eb83ed87
KW
1030 /* The order of malformation tests here is important. We should consume as
1031 * few bytes as possible in order to not skip any valid character. This is
1032 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1033 * http://unicode.org/reports/tr36 for more discussion as to why. For
1034 * example, once we've done a UTF8SKIP, we can tell the expected number of
1035 * bytes, and could fail right off the bat if the input parameters indicate
1036 * that there are too few available. But it could be that just that first
1037 * byte is garbled, and the intended character occupies fewer bytes. If we
1038 * blindly assumed that the first byte is correct, and skipped based on
1039 * that number, we could skip over a valid input character. So instead, we
1040 * always examine the sequence byte-by-byte.
1041 *
1042 * We also should not consume too few bytes, otherwise someone could inject
1043 * things. For example, an input could be deliberately designed to
1044 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 1045 * returning to the caller C<*retlen> pointing to the very next byte (one
eb83ed87
KW
1046 * which is actually part of of the overflowing sequence), that could look
1047 * legitimate to the caller, which could discard the initial partial
2b5e7bc2
KW
1048 * sequence and process the rest, inappropriately.
1049 *
1050 * Some possible input sequences are malformed in more than one way. This
1051 * function goes to lengths to try to find all of them. This is necessary
1052 * for correctness, as the inputs may allow one malformation but not
1053 * another, and if we abandon searching for others after finding the
1054 * allowed one, we could allow in something that shouldn't have been.
1055 */
eb83ed87 1056
b5b9af04 1057 if (UNLIKELY(curlen == 0)) {
2b5e7bc2
KW
1058 possible_problems |= UTF8_GOT_EMPTY;
1059 curlen = 0;
1060 uv = 0; /* XXX It could be argued that this should be
1061 UNICODE_REPLACEMENT? */
1062 goto ready_to_handle_errors;
0c443dc2
JH
1063 }
1064
eb83ed87
KW
1065 expectlen = UTF8SKIP(s);
1066
1067 /* A well-formed UTF-8 character, as the vast majority of calls to this
1068 * function will be for, has this expected length. For efficiency, set
1069 * things up here to return it. It will be overriden only in those rare
1070 * cases where a malformation is found */
1071 if (retlen) {
1072 *retlen = expectlen;
1073 }
1074
1075 /* An invariant is trivially well-formed */
1d72bdf6 1076 if (UTF8_IS_INVARIANT(uv)) {
de69f3af 1077 return uv;
a0ed51b3 1078 }
67e989fb 1079
eb83ed87 1080 /* A continuation character can't start a valid sequence */
b5b9af04 1081 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
2b5e7bc2
KW
1082 possible_problems |= UTF8_GOT_CONTINUATION;
1083 curlen = 1;
1084 uv = UNICODE_REPLACEMENT;
1085 goto ready_to_handle_errors;
ba210ebe 1086 }
9041c2e3 1087
dcd27b3c
KW
1088 /* Here is not a continuation byte, nor an invariant. The only thing left
1089 * is a start byte (possibly for an overlong) */
1090
534752c1
KW
1091 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1092 * that indicate the number of bytes in the character's whole UTF-8
1093 * sequence, leaving just the bits that are part of the value. */
1094 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
ba210ebe 1095
eb83ed87
KW
1096 /* Now, loop through the remaining bytes in the character's sequence,
1097 * accumulating each into the working value as we go. Be sure to not look
1098 * past the end of the input string */
b3057643
KW
1099 send = adjusted_send = (U8*) s0 + ((expectlen <= curlen)
1100 ? expectlen
1101 : curlen);
eb83ed87 1102 for (s = s0 + 1; s < send; s++) {
b5b9af04 1103 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
8850bf83 1104 uv = UTF8_ACCUMULATE(uv, *s);
2b5e7bc2
KW
1105 continue;
1106 }
1107
1108 /* Here, found a non-continuation before processing all expected bytes.
1109 * This byte indicates the beginning of a new character, so quit, even
1110 * if allowing this malformation. */
1111 curlen = s - s0; /* Save how many bytes we actually got */
1112 possible_problems |= UTF8_GOT_NON_CONTINUATION;
1113 goto finish_short;
eb83ed87
KW
1114 } /* End of loop through the character's bytes */
1115
1116 /* Save how many bytes were actually in the character */
1117 curlen = s - s0;
1118
2b5e7bc2
KW
1119 /* Did we get all the continuation bytes that were expected? Note that we
1120 * know this result even without executing the loop above. But we had to
1121 * do the loop to see if there are unexpected non-continuations. */
1122 if (UNLIKELY(curlen < expectlen)) {
1123 possible_problems |= UTF8_GOT_SHORT;
2f8f112e 1124
2b5e7bc2
KW
1125 finish_short:
1126 uv_so_far = uv;
1127 uv = UNICODE_REPLACEMENT;
eb83ed87
KW
1128 }
1129
2b5e7bc2
KW
1130 /* Note that there are two types of too-short malformation. One is when
1131 * there is actual wrong data before the normal termination of the
1132 * sequence. The other is that the sequence wasn't complete before the end
1133 * of the data we are allowed to look at, based on the input 'curlen'.
1134 * This means that we were passed data for a partial character, but it is
1135 * valid as far as we saw. The other is definitely invalid. This
1136 * distinction could be important to a caller, so the two types are kept
1137 * separate. */
1138
1139 /* Check for overflow */
1140 if (UNLIKELY(does_utf8_overflow(s0, send))) {
1141 possible_problems |= UTF8_GOT_OVERFLOW;
1142 uv = UNICODE_REPLACEMENT;
eb83ed87 1143 }
eb83ed87 1144
2b5e7bc2
KW
1145 /* Check for overlong. If no problems so far, 'uv' is the correct code
1146 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1147 * we must look at the UTF-8 byte sequence itself to see if it is for an
1148 * overlong */
1149 if ( ( LIKELY(! possible_problems)
1150 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1151 || ( UNLIKELY( possible_problems)
1152 && ( UNLIKELY(! UTF8_IS_START(*s0))
1153 || ( curlen > 1
1154 && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
1155 send - s0))))))
2f8f112e 1156 {
2b5e7bc2
KW
1157 possible_problems |= UTF8_GOT_LONG;
1158
1159 /* A convenience macro that matches either of the too-short conditions.
1160 * */
1161# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1162
1163 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1164 UV min_uv = uv_so_far;
1165 STRLEN i;
1166
1167 /* Here, the input is both overlong and is missing some trailing
1168 * bytes. There is no single code point it could be for, but there
1169 * may be enough information present to determine if what we have
1170 * so far is for an unallowed code point, such as for a surrogate.
1171 * The code below has the intelligence to determine this, but just
1172 * for non-overlong UTF-8 sequences. What we do here is calculate
1173 * the smallest code point the input could represent if there were
1174 * no too short malformation. Then we compute and save the UTF-8
1175 * for that, which is what the code below looks at instead of the
1176 * raw input. It turns out that the smallest such code point is
1177 * all we need. */
1178 for (i = curlen; i < expectlen; i++) {
1179 min_uv = UTF8_ACCUMULATE(min_uv,
1180 I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
1181 }
1182
1183 Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8);
1184 SAVEFREEPV((U8 *) adjusted_s0); /* Needed because we may not get
1185 to free it ourselves if
1186 warnings are made fatal */
1187 adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1188 }
eb83ed87
KW
1189 }
1190
2b5e7bc2
KW
1191 /* Now check that the input isn't for a problematic code point not allowed
1192 * by the input parameters. */
1193 /* isn't problematic if < this */
1194 if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
1195 || ( UNLIKELY(possible_problems)
1196 && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
760c7c2f
KW
1197 && ((flags & ( UTF8_DISALLOW_NONCHAR
1198 |UTF8_DISALLOW_SURROGATE
1199 |UTF8_DISALLOW_SUPER
1200 |UTF8_DISALLOW_ABOVE_31_BIT
1201 |UTF8_WARN_NONCHAR
1202 |UTF8_WARN_SURROGATE
1203 |UTF8_WARN_SUPER
1204 |UTF8_WARN_ABOVE_31_BIT))
2b5e7bc2
KW
1205 /* In case of a malformation, 'uv' is not valid, and has
1206 * been changed to something in the Unicode range.
1207 * Currently we don't output a deprecation message if there
1208 * is already a malformation, so we don't have to special
1209 * case the test immediately below */
760c7c2f
KW
1210 || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1211 && ckWARN_d(WARN_DEPRECATED))))
eb83ed87 1212 {
2b5e7bc2
KW
1213 /* If there were no malformations, or the only malformation is an
1214 * overlong, 'uv' is valid */
1215 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1216 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1217 possible_problems |= UTF8_GOT_SURROGATE;
1218 }
1219 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
1220 possible_problems |= UTF8_GOT_SUPER;
1221 }
1222 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1223 possible_problems |= UTF8_GOT_NONCHAR;
1224 }
1225 }
1226 else { /* Otherwise, need to look at the source UTF-8, possibly
1227 adjusted to be non-overlong */
1228
1229 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1230 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
ea5ced44 1231 {
2b5e7bc2
KW
1232 possible_problems |= UTF8_GOT_SUPER;
1233 }
1234 else if (curlen > 1) {
1235 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
1236 NATIVE_UTF8_TO_I8(*adjusted_s0),
1237 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
ea5ced44 1238 {
2b5e7bc2 1239 possible_problems |= UTF8_GOT_SUPER;
ea5ced44 1240 }
2b5e7bc2
KW
1241 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
1242 NATIVE_UTF8_TO_I8(*adjusted_s0),
1243 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1244 {
1245 possible_problems |= UTF8_GOT_SURROGATE;
ea5ced44
KW
1246 }
1247 }
c0236afe 1248
2b5e7bc2
KW
1249 /* We need a complete well-formed UTF-8 character to discern
1250 * non-characters, so can't look for them here */
1251 }
1252 }
949cf498 1253
2b5e7bc2
KW
1254 ready_to_handle_errors:
1255
1256 /* At this point:
1257 * curlen contains the number of bytes in the sequence that
1258 * this call should advance the input by.
1259 * possible_problems' is 0 if there weren't any problems; otherwise a bit
1260 * is set in it for each potential problem found.
1261 * uv contains the code point the input sequence
1262 * represents; or if there is a problem that prevents
1263 * a well-defined value from being computed, it is
1264 * some subsitute value, typically the REPLACEMENT
1265 * CHARACTER.
1266 * s0 points to the first byte of the character
1267 * send points to just after where that (potentially
1268 * partial) character ends
1269 * adjusted_s0 normally is the same as s0, but in case of an
1270 * overlong for which the UTF-8 matters below, it is
1271 * the first byte of the shortest form representation
1272 * of the input.
1273 * adjusted_send normally is the same as 'send', but if adjusted_s0
1274 * is set to something other than s0, this points one
1275 * beyond its end
1276 */
eb83ed87 1277
2b5e7bc2
KW
1278 if (UNLIKELY(possible_problems)) {
1279 bool disallowed = FALSE;
1280 const U32 orig_problems = possible_problems;
1281
1282 while (possible_problems) { /* Handle each possible problem */
1283 UV pack_warn = 0;
1284 char * message = NULL;
1285
1286 /* Each 'if' clause handles one problem. They are ordered so that
1287 * the first ones' messages will be displayed before the later
1288 * ones; this is kinda in decreasing severity order */
1289 if (possible_problems & UTF8_GOT_OVERFLOW) {
1290
1291 /* Overflow means also got a super and above 31 bits, but we
1292 * handle all three cases here */
1293 possible_problems
1294 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
f9380377
KW
1295 *errors |= UTF8_GOT_OVERFLOW;
1296
1297 /* But the API says we flag all errors found */
1298 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1299 *errors |= UTF8_GOT_SUPER;
1300 }
1301 if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
1302 *errors |= UTF8_GOT_ABOVE_31_BIT;
1303 }
2b5e7bc2
KW
1304
1305 disallowed = TRUE;
1306
1307 /* The warnings code explicitly says it doesn't handle the case
1308 * of packWARN2 and two categories which have parent-child
1309 * relationship. Even if it works now to raise the warning if
1310 * either is enabled, it wouldn't necessarily do so in the
1311 * future. We output (only) the most dire warning*/
1312 if (! (flags & UTF8_CHECK_ONLY)) {
1313 if (ckWARN_d(WARN_UTF8)) {
1314 pack_warn = packWARN(WARN_UTF8);
1315 }
1316 else if (ckWARN_d(WARN_NON_UNICODE)) {
1317 pack_warn = packWARN(WARN_NON_UNICODE);
1318 }
1319 if (pack_warn) {
1320 message = Perl_form(aTHX_ "%s: %s (overflows)",
1321 malformed_text,
1322 _byte_dump_string(s0, send - s0));
1323 }
1324 }
1325 }
1326 else if (possible_problems & UTF8_GOT_EMPTY) {
1327 possible_problems &= ~UTF8_GOT_EMPTY;
f9380377 1328 *errors |= UTF8_GOT_EMPTY;
2b5e7bc2
KW
1329
1330 if (! (flags & UTF8_ALLOW_EMPTY)) {
1331 disallowed = TRUE;
1332 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1333 pack_warn = packWARN(WARN_UTF8);
1334 message = Perl_form(aTHX_ "%s (empty string)",
1335 malformed_text);
1336 }
1337 }
1338 }
1339 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1340 possible_problems &= ~UTF8_GOT_CONTINUATION;
f9380377 1341 *errors |= UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1342
1343 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1344 disallowed = TRUE;
1345 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1346 pack_warn = packWARN(WARN_UTF8);
1347 message = Perl_form(aTHX_
1348 "%s: %s (unexpected continuation byte 0x%02x,"
1349 " with no preceding start byte)",
1350 malformed_text,
1351 _byte_dump_string(s0, 1), *s0);
1352 }
1353 }
1354 }
1355 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1356 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
f9380377 1357 *errors |= UTF8_GOT_NON_CONTINUATION;
2b5e7bc2
KW
1358
1359 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1360 disallowed = TRUE;
1361 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1362 pack_warn = packWARN(WARN_UTF8);
1363 message = Perl_form(aTHX_ "%s",
1364 unexpected_non_continuation_text(s0,
1365 send - s0,
1366 s - s0,
1367 (int) expectlen));
1368 }
1369 }
1370 }
1371 else if (possible_problems & UTF8_GOT_SHORT) {
1372 possible_problems &= ~UTF8_GOT_SHORT;
f9380377 1373 *errors |= UTF8_GOT_SHORT;
2b5e7bc2
KW
1374
1375 if (! (flags & UTF8_ALLOW_SHORT)) {
1376 disallowed = TRUE;
1377 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1378 pack_warn = packWARN(WARN_UTF8);
1379 message = Perl_form(aTHX_
1380 "%s: %s (too short; got %d byte%s, need %d)",
1381 malformed_text,
1382 _byte_dump_string(s0, send - s0),
1383 (int)curlen,
1384 curlen == 1 ? "" : "s",
1385 (int)expectlen);
1386 }
1387 }
ba210ebe 1388
2b5e7bc2
KW
1389 }
1390 else if (possible_problems & UTF8_GOT_LONG) {
1391 possible_problems &= ~UTF8_GOT_LONG;
f9380377 1392 *errors |= UTF8_GOT_LONG;
2b5e7bc2
KW
1393
1394 if (! (flags & UTF8_ALLOW_LONG)) {
1395 disallowed = TRUE;
1396
1397 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1398 pack_warn = packWARN(WARN_UTF8);
1399
1400 /* These error types cause 'uv' to be something that
1401 * isn't what was intended, so can't use it in the
1402 * message. The other error types either can't
1403 * generate an overlong, or else the 'uv' is valid */
1404 if (orig_problems &
1405 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1406 {
1407 message = Perl_form(aTHX_
1408 "%s: %s (any UTF-8 sequence that starts"
1409 " with \"%s\" is overlong which can and"
1410 " should be represented with a"
1411 " different, shorter sequence)",
1412 malformed_text,
1413 _byte_dump_string(s0, send - s0),
1414 _byte_dump_string(s0, curlen));
1415 }
1416 else {
1417 U8 tmpbuf[UTF8_MAXBYTES+1];
1418 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
1419 uv, 0);
1420 message = Perl_form(aTHX_
1421 "%s: %s (overlong; instead use %s to represent"
1422 " U+%0*"UVXf")",
1423 malformed_text,
1424 _byte_dump_string(s0, send - s0),
1425 _byte_dump_string(tmpbuf, e - tmpbuf),
1426 ((uv < 256) ? 2 : 4), /* Field width of 2 for
1427 small code points */
1428 uv);
1429 }
1430 }
1431 }
1432 }
1433 else if (possible_problems & UTF8_GOT_SURROGATE) {
1434 possible_problems &= ~UTF8_GOT_SURROGATE;
1435
f9380377
KW
1436 if (flags & UTF8_WARN_SURROGATE) {
1437 *errors |= UTF8_GOT_SURROGATE;
1438
1439 if ( ! (flags & UTF8_CHECK_ONLY)
1440 && ckWARN_d(WARN_SURROGATE))
1441 {
2b5e7bc2
KW
1442 pack_warn = packWARN(WARN_SURROGATE);
1443
1444 /* These are the only errors that can occur with a
1445 * surrogate when the 'uv' isn't valid */
1446 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1447 message = Perl_form(aTHX_
1448 "UTF-16 surrogate (any UTF-8 sequence that"
1449 " starts with \"%s\" is for a surrogate)",
1450 _byte_dump_string(s0, curlen));
1451 }
1452 else {
1453 message = Perl_form(aTHX_
1454 "UTF-16 surrogate U+%04"UVXf"", uv);
1455 }
f9380377 1456 }
2b5e7bc2 1457 }
ba210ebe 1458
2b5e7bc2
KW
1459 if (flags & UTF8_DISALLOW_SURROGATE) {
1460 disallowed = TRUE;
f9380377 1461 *errors |= UTF8_GOT_SURROGATE;
2b5e7bc2
KW
1462 }
1463 }
1464 else if (possible_problems & UTF8_GOT_SUPER) {
1465 possible_problems &= ~UTF8_GOT_SUPER;
949cf498 1466
f9380377
KW
1467 if (flags & UTF8_WARN_SUPER) {
1468 *errors |= UTF8_GOT_SUPER;
1469
1470 if ( ! (flags & UTF8_CHECK_ONLY)
1471 && ckWARN_d(WARN_NON_UNICODE))
1472 {
2b5e7bc2
KW
1473 pack_warn = packWARN(WARN_NON_UNICODE);
1474
1475 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1476 message = Perl_form(aTHX_
1477 "Any UTF-8 sequence that starts with"
1478 " \"%s\" is for a non-Unicode code point,"
1479 " may not be portable",
1480 _byte_dump_string(s0, curlen));
1481 }
1482 else {
1483 message = Perl_form(aTHX_
1484 "Code point 0x%04"UVXf" is not"
1485 " Unicode, may not be portable",
1486 uv);
1487 }
f9380377 1488 }
2b5e7bc2 1489 }
ba210ebe 1490
2b5e7bc2
KW
1491 /* The maximum code point ever specified by a standard was
1492 * 2**31 - 1. Anything larger than that is a Perl extension
1493 * that very well may not be understood by other applications
1494 * (including earlier perl versions on EBCDIC platforms). We
1495 * test for these after the regular SUPER ones, and before
1496 * possibly bailing out, so that the slightly more dire warning
1497 * will override the regular one. */
1498 if ( (flags & (UTF8_WARN_ABOVE_31_BIT
1499 |UTF8_WARN_SUPER
1500 |UTF8_DISALLOW_ABOVE_31_BIT))
1501 && ( ( UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
1502 && UNLIKELY(is_utf8_cp_above_31_bits(
1503 adjusted_s0,
1504 adjusted_send)))
1505 || ( LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
1506 && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv)))))
1507 {
1508 if ( ! (flags & UTF8_CHECK_ONLY)
1509 && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
1510 && ckWARN_d(WARN_UTF8))
1511 {
1512 pack_warn = packWARN(WARN_UTF8);
1513
1514 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1515 message = Perl_form(aTHX_
1516 "Any UTF-8 sequence that starts with"
1517 " \"%s\" is for a non-Unicode code"
1518 " point, and is not portable",
1519 _byte_dump_string(s0, curlen));
1520 }
1521 else {
1522 message = Perl_form(aTHX_
1523 "Code point 0x%"UVXf" is not Unicode,"
1524 " and not portable",
1525 uv);
1526 }
1527 }
1528
f9380377
KW
1529 if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
1530 *errors |= UTF8_GOT_ABOVE_31_BIT;
1531
1532 if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
1533 disallowed = TRUE;
1534 }
2b5e7bc2
KW
1535 }
1536 }
eb83ed87 1537
2b5e7bc2 1538 if (flags & UTF8_DISALLOW_SUPER) {
f9380377 1539 *errors |= UTF8_GOT_SUPER;
2b5e7bc2
KW
1540 disallowed = TRUE;
1541 }
eb83ed87 1542
2b5e7bc2
KW
1543 /* The deprecated warning overrides any non-deprecated one. If
1544 * there are other problems, a deprecation message is not
1545 * really helpful, so don't bother to raise it in that case.
1546 * This also keeps the code from having to handle the case
1547 * where 'uv' is not valid. */
1548 if ( ! (orig_problems
1549 & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1550 && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1551 && ckWARN_d(WARN_DEPRECATED))
1552 {
1553 message = Perl_form(aTHX_ cp_above_legal_max,
1554 uv, MAX_NON_DEPRECATED_CP);
1555 pack_warn = packWARN(WARN_DEPRECATED);
1556 }
1557 }
1558 else if (possible_problems & UTF8_GOT_NONCHAR) {
1559 possible_problems &= ~UTF8_GOT_NONCHAR;
ba210ebe 1560
f9380377
KW
1561 if (flags & UTF8_WARN_NONCHAR) {
1562 *errors |= UTF8_GOT_NONCHAR;
1563
1564 if ( ! (flags & UTF8_CHECK_ONLY)
1565 && ckWARN_d(WARN_NONCHAR))
1566 {
2b5e7bc2
KW
1567 /* The code above should have guaranteed that we don't
1568 * get here with errors other than overlong */
1569 assert (! (orig_problems
1570 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
1571
1572 pack_warn = packWARN(WARN_NONCHAR);
1573 message = Perl_form(aTHX_ "Unicode non-character"
1574 " U+%04"UVXf" is not recommended"
1575 " for open interchange", uv);
f9380377 1576 }
2b5e7bc2 1577 }
5b311467 1578
2b5e7bc2
KW
1579 if (flags & UTF8_DISALLOW_NONCHAR) {
1580 disallowed = TRUE;
f9380377 1581 *errors |= UTF8_GOT_NONCHAR;
2b5e7bc2
KW
1582 }
1583 } /* End of looking through the possible flags */
1584
1585 /* Display the message (if any) for the problem being handled in
1586 * this iteration of the loop */
1587 if (message) {
1588 if (PL_op)
1589 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
1590 OP_DESC(PL_op));
1591 else
1592 Perl_warner(aTHX_ pack_warn, "%s", message);
1593 }
1594 } /* End of 'while (possible_problems) {' */
a0dbb045 1595
2b5e7bc2
KW
1596 /* Since there was a possible problem, the returned length may need to
1597 * be changed from the one stored at the beginning of this function.
1598 * Instead of trying to figure out if that's needed, just do it. */
1599 if (retlen) {
1600 *retlen = curlen;
1601 }
a0dbb045 1602
2b5e7bc2
KW
1603 if (disallowed) {
1604 if (flags & UTF8_CHECK_ONLY && retlen) {
1605 *retlen = ((STRLEN) -1);
1606 }
1607 return 0;
1608 }
eb83ed87 1609 }
ba210ebe 1610
2b5e7bc2 1611 return UNI_TO_NATIVE(uv);
a0ed51b3
LW
1612}
1613
8e84507e 1614/*
ec5f19d0
KW
1615=for apidoc utf8_to_uvchr_buf
1616
1617Returns the native code point of the first character in the string C<s> which
1618is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 1619C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 1620
524080c4
KW
1621If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1622enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 1623C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 1624(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 1625C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 1626the next possible position in C<s> that could begin a non-malformed character.
de69f3af 1627See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 1628returned.
ec5f19d0 1629
760c7c2f
KW
1630Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1631unless those are turned off.
1632
ec5f19d0 1633=cut
52be2536
KW
1634
1635Also implemented as a macro in utf8.h
1636
ec5f19d0
KW
1637*/
1638
1639
1640UV
1641Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1642{
ec5f19d0
KW
1643 assert(s < send);
1644
1645 return utf8n_to_uvchr(s, send - s, retlen,
1646 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1647}
1648
52be2536
KW
1649/* This is marked as deprecated
1650 *
ec5f19d0
KW
1651=for apidoc utf8_to_uvuni_buf
1652
de69f3af
KW
1653Only in very rare circumstances should code need to be dealing in Unicode
1654(as opposed to native) code points. In those few cases, use
1655C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
4f83cdcd
KW
1656
1657Returns the Unicode (not-native) code point of the first character in the
1658string C<s> which
ec5f19d0
KW
1659is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1660C<retlen> will be set to the length, in bytes, of that character.
1661
524080c4
KW
1662If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1663enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1664NULL) to -1. If those warnings are off, the computed value if well-defined (or
1665the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1666is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1667next possible position in C<s> that could begin a non-malformed character.
de69f3af 1668See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0 1669
760c7c2f
KW
1670Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1671unless those are turned off.
1672
ec5f19d0
KW
1673=cut
1674*/
1675
1676UV
1677Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1678{
1679 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1680
1681 assert(send > s);
1682
5962d97e
KW
1683 /* Call the low level routine, asking for checks */
1684 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
ec5f19d0
KW
1685}
1686
b76347f2 1687/*
87cea99e 1688=for apidoc utf8_length
b76347f2
JH
1689
1690Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
1691Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
1692up past C<e>, croaks.
b76347f2
JH
1693
1694=cut
1695*/
1696
1697STRLEN
35a4481c 1698Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
1699{
1700 STRLEN len = 0;
1701
7918f24d
NC
1702 PERL_ARGS_ASSERT_UTF8_LENGTH;
1703
8850bf83
JH
1704 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1705 * the bitops (especially ~) can create illegal UTF-8.
1706 * In other words: in Perl UTF-8 is not just for Unicode. */
1707
a3b680e6
AL
1708 if (e < s)
1709 goto warn_and_return;
b76347f2 1710 while (s < e) {
4cbf4130 1711 s += UTF8SKIP(s);
8e91ec7f
AV
1712 len++;
1713 }
1714
1715 if (e != s) {
1716 len--;
1717 warn_and_return:
9b387841
NC
1718 if (PL_op)
1719 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1720 "%s in %s", unees, OP_DESC(PL_op));
1721 else
61a12c31 1722 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
1723 }
1724
1725 return len;
1726}
1727
b06226ff 1728/*
fed3ba5d
NC
1729=for apidoc bytes_cmp_utf8
1730
a1433954 1731Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
1732sequence of characters (stored as UTF-8)
1733in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
1734equal, -1 or -2 if the first string is less than the second string, +1 or +2
1735if the first string is greater than the second string.
1736
1737-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
1738longer string. -2 or +2 is returned if
1739there was a difference between characters
fed3ba5d
NC
1740within the strings.
1741
1742=cut
1743*/
1744
1745int
1746Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1747{
1748 const U8 *const bend = b + blen;
1749 const U8 *const uend = u + ulen;
1750
1751 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
1752
1753 while (b < bend && u < uend) {
1754 U8 c = *u++;
1755 if (!UTF8_IS_INVARIANT(c)) {
1756 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1757 if (u < uend) {
1758 U8 c1 = *u++;
1759 if (UTF8_IS_CONTINUATION(c1)) {
a62b247b 1760 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
fed3ba5d 1761 } else {
2b5e7bc2 1762 /* diag_listed_as: Malformed UTF-8 character%s */
fed3ba5d 1763 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
806547a7 1764 "%s %s%s",
7cf8d05d 1765 unexpected_non_continuation_text(u - 1, 2, 1, 2),
806547a7
KW
1766 PL_op ? " in " : "",
1767 PL_op ? OP_DESC(PL_op) : "");
fed3ba5d
NC
1768 return -2;
1769 }
1770 } else {
1771 if (PL_op)
1772 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1773 "%s in %s", unees, OP_DESC(PL_op));
1774 else
61a12c31 1775 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
1776 return -2; /* Really want to return undef :-) */
1777 }
1778 } else {
1779 return -2;
1780 }
1781 }
1782 if (*b != c) {
1783 return *b < c ? -2 : +2;
1784 }
1785 ++b;
1786 }
1787
1788 if (b == bend && u == uend)
1789 return 0;
1790
1791 return b < bend ? +1 : -1;
1792}
1793
1794/*
87cea99e 1795=for apidoc utf8_to_bytes
6940069f 1796
2bbc8d55 1797Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954
KW
1798Unlike L</bytes_to_utf8>, this over-writes the original string, and
1799updates C<len> to contain the new length.
67e989fb 1800Returns zero on failure, setting C<len> to -1.
6940069f 1801
a1433954 1802If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 1803
6940069f
GS
1804=cut
1805*/
1806
1807U8 *
37607a96 1808Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 1809{
d4c19fe8
AL
1810 U8 * const save = s;
1811 U8 * const send = s + *len;
6940069f 1812 U8 *d;
246fae53 1813
7918f24d 1814 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 1815 PERL_UNUSED_CONTEXT;
7918f24d 1816
1e54db1a 1817 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 1818 while (s < send) {
d59937ca
KW
1819 if (! UTF8_IS_INVARIANT(*s)) {
1820 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1821 *len = ((STRLEN) -1);
1822 return 0;
1823 }
1824 s++;
dcad2880 1825 }
d59937ca 1826 s++;
246fae53 1827 }
dcad2880
JH
1828
1829 d = s = save;
6940069f 1830 while (s < send) {
80e0b38f
KW
1831 U8 c = *s++;
1832 if (! UTF8_IS_INVARIANT(c)) {
1833 /* Then it is two-byte encoded */
a62b247b 1834 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
80e0b38f
KW
1835 s++;
1836 }
1837 *d++ = c;
6940069f
GS
1838 }
1839 *d = '\0';
246fae53 1840 *len = d - save;
6940069f
GS
1841 return save;
1842}
1843
1844/*
87cea99e 1845=for apidoc bytes_from_utf8
f9a63242 1846
2bbc8d55 1847Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954 1848Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
1849the newly-created string, and updates C<len> to contain the new
1850length. Returns the original string if no conversion occurs, C<len>
72d33970 1851is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55 18520 if C<s> is converted or consisted entirely of characters that are invariant
4a4088c4 1853in UTF-8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 1854
37607a96
PK
1855=cut
1856*/
f9a63242
JH
1857
1858U8 *
e1ec3a88 1859Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 1860{
f9a63242 1861 U8 *d;
e1ec3a88
AL
1862 const U8 *start = s;
1863 const U8 *send;
f9a63242
JH
1864 I32 count = 0;
1865
7918f24d 1866 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 1867 PERL_UNUSED_CONTEXT;
f9a63242 1868 if (!*is_utf8)
73d840c0 1869 return (U8 *)start;
f9a63242 1870
1e54db1a 1871 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 1872 for (send = s + *len; s < send;) {
d59937ca
KW
1873 if (! UTF8_IS_INVARIANT(*s)) {
1874 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
73d840c0 1875 return (U8 *)start;
d59937ca
KW
1876 }
1877 count++;
1878 s++;
db42d148 1879 }
d59937ca 1880 s++;
f9a63242
JH
1881 }
1882
35da51f7 1883 *is_utf8 = FALSE;
f9a63242 1884
212542aa 1885 Newx(d, (*len) - count + 1, U8);
ef9edfd0 1886 s = start; start = d;
f9a63242
JH
1887 while (s < send) {
1888 U8 c = *s++;
1a91c45d 1889 if (! UTF8_IS_INVARIANT(c)) {
c4d5f83a 1890 /* Then it is two-byte encoded */
a62b247b 1891 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
1a91c45d 1892 s++;
c4d5f83a
NIS
1893 }
1894 *d++ = c;
f9a63242
JH
1895 }
1896 *d = '\0';
1897 *len = d - start;
73d840c0 1898 return (U8 *)start;
f9a63242
JH
1899}
1900
1901/*
87cea99e 1902=for apidoc bytes_to_utf8
6940069f 1903
ff97e5cf
KW
1904Converts a string C<s> of length C<len> bytes from the native encoding into
1905UTF-8.
6662521e 1906Returns a pointer to the newly-created string, and sets C<len> to
ff97e5cf 1907reflect the new length in bytes.
6940069f 1908
75200dff 1909A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
1910
1911If you want to convert to UTF-8 from encodings other than
1912the native (Latin1 or EBCDIC),
a1433954 1913see L</sv_recode_to_utf8>().
c9ada85f 1914
497711e7 1915=cut
6940069f
GS
1916*/
1917
c682ebef
FC
1918/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1919 likewise need duplication. */
1920
6940069f 1921U8*
35a4481c 1922Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 1923{
35a4481c 1924 const U8 * const send = s + (*len);
6940069f
GS
1925 U8 *d;
1926 U8 *dst;
7918f24d
NC
1927
1928 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 1929 PERL_UNUSED_CONTEXT;
6940069f 1930
212542aa 1931 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
1932 dst = d;
1933
1934 while (s < send) {
55d09dc8
KW
1935 append_utf8_from_native_byte(*s, &d);
1936 s++;
6940069f
GS
1937 }
1938 *d = '\0';
6662521e 1939 *len = d-dst;
6940069f
GS
1940 return dst;
1941}
1942
a0ed51b3 1943/*
dea0fc0b 1944 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
1945 *
1946 * Destination must be pre-extended to 3/2 source. Do not use in-place.
1947 * We optimize for native, for obvious reasons. */
1948
1949U8*
dea0fc0b 1950Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 1951{
dea0fc0b
JH
1952 U8* pend;
1953 U8* dstart = d;
1954
7918f24d
NC
1955 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1956
dea0fc0b 1957 if (bytelen & 1)
f5992bc4 1958 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
1959
1960 pend = p + bytelen;
1961
a0ed51b3 1962 while (p < pend) {
dea0fc0b
JH
1963 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1964 p += 2;
2d1545e5 1965 if (OFFUNI_IS_INVARIANT(uv)) {
56d37426 1966 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
1967 continue;
1968 }
56d37426
KW
1969 if (uv <= MAX_UTF8_TWO_BYTE) {
1970 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1971 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
1972 continue;
1973 }
46956fad
KW
1974#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1975#define LAST_HIGH_SURROGATE 0xDBFF
1976#define FIRST_LOW_SURROGATE 0xDC00
1977#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
e23c50db
KW
1978
1979 /* This assumes that most uses will be in the first Unicode plane, not
1980 * needing surrogates */
1981 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
1982 && uv <= UNICODE_SURROGATE_LAST))
1983 {
1984 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
1985 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1986 }
1987 else {
01ea242b 1988 UV low = (p[0] << 8) + p[1];
e23c50db
KW
1989 if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
1990 || UNLIKELY(low > LAST_LOW_SURROGATE))
1991 {
01ea242b 1992 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db
KW
1993 }
1994 p += 2;
46956fad
KW
1995 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
1996 + (low - FIRST_LOW_SURROGATE) + 0x10000;
01ea242b 1997 }
a0ed51b3 1998 }
56d37426
KW
1999#ifdef EBCDIC
2000 d = uvoffuni_to_utf8_flags(d, uv, 0);
2001#else
a0ed51b3 2002 if (uv < 0x10000) {
eb160463
GS
2003 *d++ = (U8)(( uv >> 12) | 0xe0);
2004 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2005 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2006 continue;
2007 }
2008 else {
eb160463
GS
2009 *d++ = (U8)(( uv >> 18) | 0xf0);
2010 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
2011 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2012 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2013 continue;
2014 }
56d37426 2015#endif
a0ed51b3 2016 }
dea0fc0b 2017 *newlen = d - dstart;
a0ed51b3
LW
2018 return d;
2019}
2020
2021/* Note: this one is slightly destructive of the source. */
2022
2023U8*
dea0fc0b 2024Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
2025{
2026 U8* s = (U8*)p;
d4c19fe8 2027 U8* const send = s + bytelen;
7918f24d
NC
2028
2029 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2030
e0ea5e2d
NC
2031 if (bytelen & 1)
2032 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
2033 (UV)bytelen);
2034
a0ed51b3 2035 while (s < send) {
d4c19fe8 2036 const U8 tmp = s[0];
a0ed51b3
LW
2037 s[0] = s[1];
2038 s[1] = tmp;
2039 s += 2;
2040 }
dea0fc0b 2041 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
2042}
2043
922e8cb4
KW
2044bool
2045Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2046{
2047 U8 tmpbuf[UTF8_MAXBYTES+1];
2048 uvchr_to_utf8(tmpbuf, c);
2049 return _is_utf8_FOO(classnum, tmpbuf);
2050}
2051
f9ae8fb6
JD
2052/* Internal function so we can deprecate the external one, and call
2053 this one from other deprecated functions in this file */
2054
f2645549
KW
2055bool
2056Perl__is_utf8_idstart(pTHX_ const U8 *p)
61b19385 2057{
f2645549 2058 PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
61b19385
KW
2059
2060 if (*p == '_')
2061 return TRUE;
f25ce844 2062 return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
61b19385
KW
2063}
2064
5092f92a 2065bool
eba68aa0
KW
2066Perl__is_uni_perl_idcont(pTHX_ UV c)
2067{
2068 U8 tmpbuf[UTF8_MAXBYTES+1];
2069 uvchr_to_utf8(tmpbuf, c);
2070 return _is_utf8_perl_idcont(tmpbuf);
2071}
2072
2073bool
f91dcd13
KW
2074Perl__is_uni_perl_idstart(pTHX_ UV c)
2075{
2076 U8 tmpbuf[UTF8_MAXBYTES+1];
2077 uvchr_to_utf8(tmpbuf, c);
2078 return _is_utf8_perl_idstart(tmpbuf);
2079}
2080
3a4c58c9
KW
2081UV
2082Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
2083{
2084 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2085 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
2086 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2087 * either "SS" or "Ss". Which one to use is passed into the routine in
2088 * 'S_or_s' to avoid a test */
2089
2090 UV converted = toUPPER_LATIN1_MOD(c);
2091
2092 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2093
2094 assert(S_or_s == 'S' || S_or_s == 's');
2095
6f2d5cbc 2096 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 2097 characters in this range */
3a4c58c9
KW
2098 *p = (U8) converted;
2099 *lenp = 1;
2100 return converted;
2101 }
2102
2103 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2104 * which it maps to one of them, so as to only have to have one check for
2105 * it in the main case */
2106 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2107 switch (c) {
2108 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2109 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2110 break;
2111 case MICRO_SIGN:
2112 converted = GREEK_CAPITAL_LETTER_MU;
2113 break;
79e064b9
KW
2114#if UNICODE_MAJOR_VERSION > 2 \
2115 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2116 && UNICODE_DOT_DOT_VERSION >= 8)
3a4c58c9
KW
2117 case LATIN_SMALL_LETTER_SHARP_S:
2118 *(p)++ = 'S';
2119 *p = S_or_s;
2120 *lenp = 2;
2121 return 'S';
79e064b9 2122#endif
3a4c58c9
KW
2123 default:
2124 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
e5964223 2125 NOT_REACHED; /* NOTREACHED */
3a4c58c9
KW
2126 }
2127 }
2128
2129 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2130 *p = UTF8_TWO_BYTE_LO(converted);
2131 *lenp = 2;
2132
2133 return converted;
2134}
2135
50bda2c3
KW
2136/* Call the function to convert a UTF-8 encoded character to the specified case.
2137 * Note that there may be more than one character in the result.
2138 * INP is a pointer to the first byte of the input character
2139 * OUTP will be set to the first byte of the string of changed characters. It
2140 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2141 * LENP will be set to the length in bytes of the string of changed characters
2142 *
2143 * The functions return the ordinal of the first character in the string of OUTP */
b9992569
KW
2144#define CALL_UPPER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
2145#define CALL_TITLE_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
2146#define CALL_LOWER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
50bda2c3 2147
b9992569
KW
2148/* This additionally has the input parameter 'specials', which if non-zero will
2149 * cause this to use the specials hash for folding (meaning get full case
50bda2c3 2150 * folding); otherwise, when zero, this implies a simple case fold */
b9992569 2151#define CALL_FOLD_CASE(uv, s, d, lenp, specials) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
c3fd2246 2152
84afefe6
JH
2153UV
2154Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2155{
a1433954
KW
2156 /* Convert the Unicode character whose ordinal is <c> to its uppercase
2157 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2158 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
2159 * the changed version may be longer than the original character.
2160 *
2161 * The ordinal of the first character of the changed version is returned
2162 * (but note, as explained above, that there may be more.) */
2163
7918f24d
NC
2164 PERL_ARGS_ASSERT_TO_UNI_UPPER;
2165
3a4c58c9
KW
2166 if (c < 256) {
2167 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2168 }
2169
0ebc6274 2170 uvchr_to_utf8(p, c);
b9992569 2171 return CALL_UPPER_CASE(c, p, p, lenp);
a0ed51b3
LW
2172}
2173
84afefe6
JH
2174UV
2175Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2176{
7918f24d
NC
2177 PERL_ARGS_ASSERT_TO_UNI_TITLE;
2178
3a4c58c9
KW
2179 if (c < 256) {
2180 return _to_upper_title_latin1((U8) c, p, lenp, 's');
2181 }
2182
0ebc6274 2183 uvchr_to_utf8(p, c);
b9992569 2184 return CALL_TITLE_CASE(c, p, p, lenp);
a0ed51b3
LW
2185}
2186
afc16117 2187STATIC U8
81611534 2188S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp)
afc16117
KW
2189{
2190 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2191 * those, converting the result to UTF-8. Since the result is always just
a1433954 2192 * one character, we allow <p> to be NULL */
afc16117
KW
2193
2194 U8 converted = toLOWER_LATIN1(c);
2195
2196 if (p != NULL) {
6f2d5cbc 2197 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
2198 *p = converted;
2199 *lenp = 1;
2200 }
2201 else {
430c9760
KW
2202 /* Result is known to always be < 256, so can use the EIGHT_BIT
2203 * macros */
2204 *p = UTF8_EIGHT_BIT_HI(converted);
2205 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
afc16117
KW
2206 *lenp = 2;
2207 }
2208 }
2209 return converted;
2210}
2211
84afefe6
JH
2212UV
2213Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2214{
7918f24d
NC
2215 PERL_ARGS_ASSERT_TO_UNI_LOWER;
2216
afc16117
KW
2217 if (c < 256) {
2218 return to_lower_latin1((U8) c, p, lenp);
bca00c02
KW
2219 }
2220
afc16117 2221 uvchr_to_utf8(p, c);
b9992569 2222 return CALL_LOWER_CASE(c, p, p, lenp);
a0ed51b3
LW
2223}
2224
84afefe6 2225UV
51910141 2226Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 2227{
51910141 2228 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 2229 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 2230 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
2231 *
2232 * Not to be used for locale folds
51910141 2233 */
f673fad4 2234
a1dde8de
KW
2235 UV converted;
2236
2237 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
81611534 2238 PERL_UNUSED_CONTEXT;
a1dde8de 2239
1ca267a5
KW
2240 assert (! (flags & FOLD_FLAGS_LOCALE));
2241
659a7c2d 2242 if (UNLIKELY(c == MICRO_SIGN)) {
a1dde8de
KW
2243 converted = GREEK_SMALL_LETTER_MU;
2244 }
9b63e895
KW
2245#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
2246 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
2247 || UNICODE_DOT_DOT_VERSION > 0)
659a7c2d
KW
2248 else if ( (flags & FOLD_FLAGS_FULL)
2249 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
2250 {
1ca267a5
KW
2251 /* If can't cross 127/128 boundary, can't return "ss"; instead return
2252 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
2253 * under those circumstances. */
2254 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
2255 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2256 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2257 p, *lenp, U8);
2258 return LATIN_SMALL_LETTER_LONG_S;
2259 }
2260 else {
4f489194
KW
2261 *(p)++ = 's';
2262 *p = 's';
2263 *lenp = 2;
2264 return 's';
1ca267a5 2265 }
a1dde8de 2266 }
9b63e895 2267#endif
a1dde8de
KW
2268 else { /* In this range the fold of all other characters is their lower
2269 case */
2270 converted = toLOWER_LATIN1(c);
2271 }
2272
6f2d5cbc 2273 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
2274 *p = (U8) converted;
2275 *lenp = 1;
2276 }
2277 else {
2278 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2279 *p = UTF8_TWO_BYTE_LO(converted);
2280 *lenp = 2;
2281 }
2282
2283 return converted;
2284}
2285
2286UV
31f05a37 2287Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 2288{
4b593389 2289
a0270393
KW
2290 /* Not currently externally documented, and subject to change
2291 * <flags> bits meanings:
2292 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
2293 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2294 * locale are to be used.
a0270393
KW
2295 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2296 */
4b593389 2297
36bb2ab6 2298 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 2299
780fcc9f
KW
2300 if (flags & FOLD_FLAGS_LOCALE) {
2301 /* Treat a UTF-8 locale as not being in locale at all */
2302 if (IN_UTF8_CTYPE_LOCALE) {
2303 flags &= ~FOLD_FLAGS_LOCALE;
2304 }
2305 else {
2306 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
e7b7ac46 2307 goto needs_full_generality;
780fcc9f 2308 }
31f05a37
KW
2309 }
2310
a1dde8de 2311 if (c < 256) {
e7b7ac46 2312 return _to_fold_latin1((U8) c, p, lenp,
31f05a37 2313 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
2314 }
2315
2f306ab9 2316 /* Here, above 255. If no special needs, just use the macro */
a0270393
KW
2317 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
2318 uvchr_to_utf8(p, c);
b9992569 2319 return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
a0270393
KW
2320 }
2321 else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
2322 the special flags. */
2323 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
2324
2325 needs_full_generality:
a0270393 2326 uvchr_to_utf8(utf8_c, c);
445bf929 2327 return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
a0270393 2328 }
84afefe6
JH
2329}
2330
26483009 2331PERL_STATIC_INLINE bool
5141f98e 2332S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
f25ce844 2333 const char *const swashname, SV* const invlist)
bde6a22d 2334{
ea317ccb
KW
2335 /* returns a boolean giving whether or not the UTF8-encoded character that
2336 * starts at <p> is in the swash indicated by <swashname>. <swash>
2337 * contains a pointer to where the swash indicated by <swashname>
2338 * is to be stored; which this routine will do, so that future calls will
f25ce844
KW
2339 * look at <*swash> and only generate a swash if it is not null. <invlist>
2340 * is NULL or an inversion list that defines the swash. If not null, it
2341 * saves time during initialization of the swash.
ea317ccb
KW
2342 *
2343 * Note that it is assumed that the buffer length of <p> is enough to
2344 * contain all the bytes that comprise the character. Thus, <*p> should
2345 * have been checked before this call for mal-formedness enough to assure
2346 * that. */
2347
7918f24d
NC
2348 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2349
492a624f 2350 /* The API should have included a length for the UTF-8 character in <p>,
28123549 2351 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
2352 * as far as there being enough bytes available in it to accommodate the
2353 * character without reading beyond the end, and pass that number on to the
2354 * validating routine */
6302f837 2355 if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
28123549
KW
2356 if (ckWARN_d(WARN_UTF8)) {
2357 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
9816f121 2358 "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
28123549
KW
2359 if (ckWARN(WARN_UTF8)) { /* This will output details as to the
2360 what the malformation is */
2361 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
2362 }
2363 }
2364 return FALSE;
2365 }
87367d5f
KW
2366 if (!*swash) {
2367 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
f25ce844
KW
2368 *swash = _core_swash_init("utf8",
2369
2370 /* Only use the name if there is no inversion
2371 * list; otherwise will go out to disk */
2372 (invlist) ? "" : swashname,
2373
2374 &PL_sv_undef, 1, 0, invlist, &flags);
87367d5f 2375 }
28123549 2376
bde6a22d
NC
2377 return swash_fetch(*swash, p, TRUE) != 0;
2378}
2379
2380bool
922e8cb4
KW
2381Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
2382{
922e8cb4
KW
2383 PERL_ARGS_ASSERT__IS_UTF8_FOO;
2384
2385 assert(classnum < _FIRST_NON_SWASH_CC);
2386
f25ce844
KW
2387 return is_utf8_common(p,
2388 &PL_utf8_swash_ptrs[classnum],
2389 swash_property_names[classnum],
2390 PL_XPosix_ptrs[classnum]);
922e8cb4
KW
2391}
2392
2393bool
f2645549 2394Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
a0ed51b3 2395{
f2645549 2396 SV* invlist = NULL;
7918f24d 2397
f2645549 2398 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
7918f24d 2399
f2645549
KW
2400 if (! PL_utf8_perl_idstart) {
2401 invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
2402 }
60071a22 2403 return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
82686b01
JH
2404}
2405
2406bool
f2645549 2407Perl__is_utf8_xidstart(pTHX_ const U8 *p)
c11ff943 2408{
f2645549 2409 PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
c11ff943
KW
2410
2411 if (*p == '_')
2412 return TRUE;
f25ce844 2413 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
c11ff943
KW
2414}
2415
2416bool
eba68aa0
KW
2417Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
2418{
b24b43f7 2419 SV* invlist = NULL;
eba68aa0
KW
2420
2421 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
2422
b24b43f7
KW
2423 if (! PL_utf8_perl_idcont) {
2424 invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
2425 }
60071a22 2426 return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
eba68aa0
KW
2427}
2428
eba68aa0 2429bool
f2645549 2430Perl__is_utf8_idcont(pTHX_ const U8 *p)
82686b01 2431{
f2645549 2432 PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
7918f24d 2433
f25ce844 2434 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
a0ed51b3
LW
2435}
2436
2437bool
f2645549 2438Perl__is_utf8_xidcont(pTHX_ const U8 *p)
c11ff943 2439{
f2645549 2440 PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
c11ff943 2441
f25ce844 2442 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
c11ff943
KW
2443}
2444
2445bool
7dbf68d2
KW
2446Perl__is_utf8_mark(pTHX_ const U8 *p)
2447{
7dbf68d2
KW
2448 PERL_ARGS_ASSERT__IS_UTF8_MARK;
2449
f25ce844 2450 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
7dbf68d2
KW
2451}
2452
6b5c0936 2453/*
87cea99e 2454=for apidoc to_utf8_case
6b5c0936 2455
9da1e7cb
KW
2456Instead use the appropriate one of L</toUPPER_utf8>,
2457L</toTITLE_utf8>,
2458L</toLOWER_utf8>,
2459or L</toFOLD_utf8>.
2460
6fae5207 2461C<p> contains the pointer to the UTF-8 string encoding
a1433954
KW
2462the character that is being converted. This routine assumes that the character
2463at C<p> is well-formed.
6b5c0936 2464
6fae5207
KW
2465C<ustrp> is a pointer to the character buffer to put the
2466conversion result to. C<lenp> is a pointer to the length
6b5c0936
JH
2467of the result.
2468
6fae5207 2469C<swashp> is a pointer to the swash to use.
6b5c0936 2470
a1433954 2471Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
796b6530 2472and loaded by C<SWASHNEW>, using F<lib/utf8_heavy.pl>. C<special> (usually,
0134edef 2473but not always, a multicharacter mapping), is tried first.
6b5c0936 2474
4a8240a3
KW
2475C<special> is a string, normally C<NULL> or C<"">. C<NULL> means to not use
2476any special mappings; C<""> means to use the special mappings. Values other
2477than these two are treated as the name of the hash containing the special
2478mappings, like C<"utf8::ToSpecLower">.
6b5c0936 2479
796b6530
KW
2480C<normal> is a string like C<"ToLower"> which means the swash
2481C<%utf8::ToLower>.
0134edef 2482
760c7c2f
KW
2483Code points above the platform's C<IV_MAX> will raise a deprecation warning,
2484unless those are turned off.
2485
0134edef 2486=cut */
6b5c0936 2487
2104c8d9 2488UV
9a957fbc
AL
2489Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
2490 SV **swashp, const char *normal, const char *special)
a0ed51b3 2491{
b9992569
KW
2492 PERL_ARGS_ASSERT_TO_UTF8_CASE;
2493
2494 return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special);
2495}
2496
2497 /* change namve uv1 to 'from' */
6a4a25f4 2498STATIC UV
b9992569
KW
2499S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
2500 SV **swashp, const char *normal, const char *special)
2501{
0134edef 2502 STRLEN len = 0;
7918f24d 2503
b9992569 2504 PERL_ARGS_ASSERT__TO_UTF8_CASE;
7918f24d 2505
36eaa811
KW
2506 /* For code points that don't change case, we already know that the output
2507 * of this function is the unchanged input, so we can skip doing look-ups
2508 * for them. Unfortunately the case-changing code points are scattered
2509 * around. But there are some long consecutive ranges where there are no
2510 * case changing code points. By adding tests, we can eliminate the lookup
2511 * for all the ones in such ranges. This is currently done here only for
2512 * just a few cases where the scripts are in common use in modern commerce
2513 * (and scripts adjacent to those which can be included without additional
2514 * tests). */
2515
2516 if (uv1 >= 0x0590) {
2517 /* This keeps from needing further processing the code points most
2518 * likely to be used in the following non-cased scripts: Hebrew,
2519 * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
2520 * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
2521 * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
2522 if (uv1 < 0x10A0) {
2523 goto cases_to_self;
2524 }
2525
2526 /* The following largish code point ranges also don't have case
2527 * changes, but khw didn't think they warranted extra tests to speed
2528 * them up (which would slightly slow down everything else above them):
2529 * 1100..139F Hangul Jamo, Ethiopic
2530 * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
2531 * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
2532 * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
2533 * Combining Diacritical Marks Extended, Balinese,
2534 * Sundanese, Batak, Lepcha, Ol Chiki
2535 * 2000..206F General Punctuation
2536 */
2537
2538 if (uv1 >= 0x2D30) {
2539
2540 /* This keeps the from needing further processing the code points
2541 * most likely to be used in the following non-cased major scripts:
2542 * CJK, Katakana, Hiragana, plus some less-likely scripts.
2543 *
2544 * (0x2D30 above might have to be changed to 2F00 in the unlikely
2545 * event that Unicode eventually allocates the unused block as of
2546 * v8.0 2FE0..2FEF to code points that are cased. khw has verified
2547 * that the test suite will start having failures to alert you
2548 * should that happen) */
2549 if (uv1 < 0xA640) {
2550 goto cases_to_self;
2551 }
2552
2553 if (uv1 >= 0xAC00) {
2554 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
5af9bc97
KW
2555 if (ckWARN_d(WARN_SURROGATE)) {
2556 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2557 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
2558 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
2559 }
2560 goto cases_to_self;
2561 }
36eaa811
KW
2562
2563 /* AC00..FAFF Catches Hangul syllables and private use, plus
2564 * some others */
2565 if (uv1 < 0xFB00) {
2566 goto cases_to_self;
2567
2568 }
2569
5af9bc97
KW
2570 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
2571 if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
2572 && ckWARN_d(WARN_DEPRECATED))
2573 {
2574 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2575 cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
2576 }
2577 if (ckWARN_d(WARN_NON_UNICODE)) {
2578 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2579 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2580 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
2581 }
2582 goto cases_to_self;
2583 }
3bfc1e70
KW
2584#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
2585 if (UNLIKELY(uv1
2586 > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
2587 {
2588
2589 /* As of this writing, this means we avoid swash creation
2590 * for anything beyond low Plane 1 */
2591 goto cases_to_self;
2592 }
2593#endif
36eaa811
KW
2594 }
2595 }
9ae3ac1a 2596
36eaa811
KW
2597 /* Note that non-characters are perfectly legal, so no warning should
2598 * be given. There are so few of them, that it isn't worth the extra
2599 * tests to avoid swash creation */
9ae3ac1a
KW
2600 }
2601
0134edef 2602 if (!*swashp) /* load on-demand */
5ab9d2ef 2603 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
0134edef 2604
a6f87d8c 2605 if (special) {
0134edef 2606 /* It might be "special" (sometimes, but not always,
2a37f04d 2607 * a multicharacter mapping) */
4a8240a3 2608 HV *hv = NULL;
b08cf34e
JH
2609 SV **svp;
2610
4a8240a3
KW
2611 /* If passed in the specials name, use that; otherwise use any
2612 * given in the swash */
2613 if (*special != '\0') {
2614 hv = get_hv(special, 0);
2615 }
2616 else {
2617 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
2618 if (svp) {
2619 hv = MUTABLE_HV(SvRV(*svp));
2620 }
2621 }
2622
176fe009 2623 if (hv
5f560d8a 2624 && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
176fe009
KW
2625 && (*svp))
2626 {
cfd0369c 2627 const char *s;
47654450 2628
cfd0369c 2629 s = SvPV_const(*svp, len);
47654450 2630 if (len == 1)
f4cd282c 2631 /* EIGHTBIT */
c80e42f3 2632 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2a37f04d 2633 else {
d2dcd0fb 2634 Copy(s, ustrp, len, U8);
29e98929 2635 }
983ffd37 2636 }
0134edef
JH
2637 }
2638
2639 if (!len && *swashp) {
4a4088c4 2640 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
d4c19fe8 2641
0134edef
JH
2642 if (uv2) {
2643 /* It was "normal" (a single character mapping). */
f4cd282c 2644 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2a37f04d
JH
2645 }
2646 }
1feea2c7 2647
cbe07460
KW
2648 if (len) {
2649 if (lenp) {
2650 *lenp = len;
2651 }
2652 return valid_utf8_to_uvchr(ustrp, 0);
2653 }
2654
2655 /* Here, there was no mapping defined, which means that the code point maps
2656 * to itself. Return the inputs */
e24dfe9c 2657 cases_to_self:
bfdf22ec 2658 len = UTF8SKIP(p);
ca9fab46
KW
2659 if (p != ustrp) { /* Don't copy onto itself */
2660 Copy(p, ustrp, len, U8);
2661 }
0134edef 2662
2a37f04d
JH
2663 if (lenp)
2664 *lenp = len;
2665
f4cd282c 2666 return uv1;
cbe07460 2667
a0ed51b3
LW
2668}
2669
051a06d4 2670STATIC UV
357aadde 2671S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
051a06d4 2672{
4a4088c4 2673 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
2674 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
2675 * result contains a character that crosses the 255/256 boundary, disallow
2676 * the change, and return the original code point. See L<perlfunc/lc> for
2677 * why;
051a06d4 2678 *
a1433954
KW
2679 * p points to the original string whose case was changed; assumed
2680 * by this routine to be well-formed
051a06d4
KW
2681 * result the code point of the first character in the changed-case string
2682 * ustrp points to the changed-case string (<result> represents its first char)
2683 * lenp points to the length of <ustrp> */
2684
2685 UV original; /* To store the first code point of <p> */
2686
2687 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2688
a4f12ed7 2689 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
2690
2691 /* We know immediately if the first character in the string crosses the
2692 * boundary, so can skip */
2693 if (result > 255) {
2694
2695 /* Look at every character in the result; if any cross the
2696 * boundary, the whole thing is disallowed */
2697 U8* s = ustrp + UTF8SKIP(ustrp);
2698 U8* e = ustrp + *lenp;
2699 while (s < e) {
a4f12ed7 2700 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
2701 goto bad_crossing;
2702 }
2703 s += UTF8SKIP(s);
2704 }
2705
613abc6d
KW
2706 /* Here, no characters crossed, result is ok as-is, but we warn. */
2707 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
051a06d4
KW
2708 return result;
2709 }
2710
7b52d656 2711 bad_crossing:
051a06d4
KW
2712
2713 /* Failed, have to return the original */
4b88fb76 2714 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
2715
2716 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2717 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2718 "Can't do %s(\"\\x{%"UVXf"}\") on non-UTF-8 locale; "
2719 "resolved to \"\\x{%"UVXf"}\".",
357aadde 2720 OP_DESC(PL_op),
ab0b796c
KW
2721 original,
2722 original);
051a06d4
KW
2723 Copy(p, ustrp, *lenp, char);
2724 return original;
2725}
2726
d3e79532 2727/*
87cea99e 2728=for apidoc to_utf8_upper
d3e79532 2729
1f607577 2730Instead use L</toUPPER_utf8>.
a1433954 2731
d3e79532
JH
2732=cut */
2733
051a06d4 2734/* Not currently externally documented, and subject to change:
31f05a37
KW
2735 * <flags> is set iff iff the rules from the current underlying locale are to
2736 * be used. */
051a06d4 2737
2104c8d9 2738UV
31f05a37 2739Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2740{
051a06d4
KW
2741 UV result;
2742
2743 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 2744
780fcc9f
KW
2745 if (flags) {
2746 /* Treat a UTF-8 locale as not being in locale at all */
2747 if (IN_UTF8_CTYPE_LOCALE) {
2748 flags = FALSE;
2749 }
2750 else {
2751 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2752 }
31f05a37
KW
2753 }
2754
3a4c58c9 2755 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2756 if (flags) {
2757 result = toUPPER_LC(*p);
2758 }
2759 else {
81c6c7ce 2760 return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
051a06d4 2761 }
3a4c58c9
KW
2762 }
2763 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2764 if (flags) {
a62b247b 2765 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2766 result = toUPPER_LC(c);
051a06d4
KW
2767 }
2768 else {
a62b247b 2769 return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2770 ustrp, lenp, 'S');
051a06d4
KW
2771 }
2772 }
4a4088c4 2773 else { /* UTF-8, ord above 255 */
b9992569 2774 result = CALL_UPPER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2775
2776 if (flags) {
357aadde 2777 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4
KW
2778 }
2779 return result;
2780 }
2781
4a4088c4 2782 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2783 if (UTF8_IS_INVARIANT(result)) {
2784 *ustrp = (U8) result;
2785 *lenp = 1;
2786 }
2787 else {
62cb07ea
KW
2788 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2789 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2790 *lenp = 2;
3a4c58c9 2791 }
baa60164 2792
051a06d4 2793 return result;
983ffd37 2794}
a0ed51b3 2795
d3e79532 2796/*
87cea99e 2797=for apidoc to_utf8_title
d3e79532 2798
1f607577 2799Instead use L</toTITLE_utf8>.
a1433954 2800
d3e79532
JH
2801=cut */
2802
051a06d4 2803/* Not currently externally documented, and subject to change:
31f05a37
KW
2804 * <flags> is set iff the rules from the current underlying locale are to be
2805 * used. Since titlecase is not defined in POSIX, for other than a
2806 * UTF-8 locale, uppercase is used instead for code points < 256.
445bf929 2807 */
051a06d4 2808
983ffd37 2809UV
31f05a37 2810Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
983ffd37 2811{
051a06d4
KW
2812 UV result;
2813
2814 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 2815
780fcc9f
KW
2816 if (flags) {
2817 /* Treat a UTF-8 locale as not being in locale at all */
2818 if (IN_UTF8_CTYPE_LOCALE) {
2819 flags = FALSE;
2820 }
2821 else {
2822 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2823 }
31f05a37
KW
2824 }
2825
3a4c58c9 2826 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2827 if (flags) {
2828 result = toUPPER_LC(*p);
2829 }
2830 else {
81c6c7ce 2831 return _to_upper_title_latin1(*p, ustrp, lenp, 's');
051a06d4 2832 }
3a4c58c9
KW
2833 }
2834 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2835 if (flags) {
a62b247b 2836 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2837 result = toUPPER_LC(c);
051a06d4
KW
2838 }
2839 else {
a62b247b 2840 return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2841 ustrp, lenp, 's');
051a06d4
KW
2842 }
2843 }
4a4088c4 2844 else { /* UTF-8, ord above 255 */
b9992569 2845 result = CALL_TITLE_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2846
2847 if (flags) {
357aadde 2848 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4
KW
2849 }
2850 return result;
2851 }
2852
4a4088c4 2853 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2854 if (UTF8_IS_INVARIANT(result)) {
2855 *ustrp = (U8) result;
2856 *lenp = 1;
2857 }
2858 else {
62cb07ea
KW
2859 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2860 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2861 *lenp = 2;
3a4c58c9
KW
2862 }
2863
051a06d4 2864 return result;
a0ed51b3
LW
2865}
2866
d3e79532 2867/*
87cea99e 2868=for apidoc to_utf8_lower
d3e79532 2869
1f607577 2870Instead use L</toLOWER_utf8>.
a1433954 2871
d3e79532
JH
2872=cut */
2873
051a06d4 2874/* Not currently externally documented, and subject to change:
31f05a37
KW
2875 * <flags> is set iff iff the rules from the current underlying locale are to
2876 * be used.
2877 */
051a06d4 2878
2104c8d9 2879UV
31f05a37 2880Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2881{
051a06d4
KW
2882 UV result;
2883
051a06d4 2884 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 2885
780fcc9f
KW
2886 if (flags) {
2887 /* Treat a UTF-8 locale as not being in locale at all */
2888 if (IN_UTF8_CTYPE_LOCALE) {
2889 flags = FALSE;
2890 }
2891 else {
2892 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2893 }
31f05a37
KW
2894 }
2895
968c5e6a 2896 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2897 if (flags) {
2898 result = toLOWER_LC(*p);
2899 }
2900 else {
81c6c7ce 2901 return to_lower_latin1(*p, ustrp, lenp);
051a06d4 2902 }
968c5e6a
KW
2903 }
2904 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2905 if (flags) {
a62b247b 2906 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2907 result = toLOWER_LC(c);
051a06d4
KW
2908 }
2909 else {
a62b247b 2910 return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2911 ustrp, lenp);
051a06d4 2912 }
968c5e6a 2913 }
4a4088c4 2914 else { /* UTF-8, ord above 255 */
b9992569 2915 result = CALL_LOWER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2916
2917 if (flags) {
357aadde 2918 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 2919 }
968c5e6a 2920
051a06d4
KW
2921 return result;
2922 }
2923
4a4088c4 2924 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2925 if (UTF8_IS_INVARIANT(result)) {
2926 *ustrp = (U8) result;
2927 *lenp = 1;
2928 }
2929 else {
62cb07ea
KW
2930 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2931 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
2932 *lenp = 2;
2933 }
2934
051a06d4 2935 return result;
b4e400f9
JH
2936}
2937
d3e79532 2938/*
87cea99e 2939=for apidoc to_utf8_fold
d3e79532 2940
1f607577 2941Instead use L</toFOLD_utf8>.
a1433954 2942
d3e79532
JH
2943=cut */
2944
051a06d4
KW
2945/* Not currently externally documented, and subject to change,
2946 * in <flags>
31f05a37
KW
2947 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2948 * locale are to be used.
051a06d4
KW
2949 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
2950 * otherwise simple folds
a0270393
KW
2951 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2952 * prohibited
445bf929 2953 */
36bb2ab6 2954
b4e400f9 2955UV
445bf929 2956Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
b4e400f9 2957{
051a06d4
KW
2958 UV result;
2959
36bb2ab6 2960 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 2961
a0270393
KW
2962 /* These are mutually exclusive */
2963 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2964
50ba90ff
KW
2965 assert(p != ustrp); /* Otherwise overwrites */
2966
780fcc9f
KW
2967 if (flags & FOLD_FLAGS_LOCALE) {
2968 /* Treat a UTF-8 locale as not being in locale at all */
2969 if (IN_UTF8_CTYPE_LOCALE) {
2970 flags &= ~FOLD_FLAGS_LOCALE;
2971 }
2972 else {
2973 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2974 }
31f05a37
KW
2975 }
2976
a1dde8de 2977 if (UTF8_IS_INVARIANT(*p)) {
051a06d4 2978 if (flags & FOLD_FLAGS_LOCALE) {
d22b930b 2979 result = toFOLD_LC(*p);
051a06d4
KW
2980 }
2981 else {
81c6c7ce 2982 return _to_fold_latin1(*p, ustrp, lenp,
1ca267a5 2983 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2984 }
a1dde8de
KW
2985 }
2986 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2987 if (flags & FOLD_FLAGS_LOCALE) {
a62b247b 2988 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2989 result = toFOLD_LC(c);
051a06d4
KW
2990 }
2991 else {
a62b247b 2992 return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
51910141 2993 ustrp, lenp,
1ca267a5 2994 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2995 }
a1dde8de 2996 }
4a4088c4 2997 else { /* UTF-8, ord above 255 */
b9992569 2998 result = CALL_FOLD_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
a1dde8de 2999
1ca267a5
KW
3000 if (flags & FOLD_FLAGS_LOCALE) {
3001
76f2ffcd 3002# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
0766489e
KW
3003 const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
3004
3005# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3006# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
76f2ffcd
KW
3007
3008 const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
76f2ffcd 3009
538e84ed
KW
3010 /* Special case these two characters, as what normally gets
3011 * returned under locale doesn't work */
76f2ffcd
KW
3012 if (UTF8SKIP(p) == cap_sharp_s_len
3013 && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
1ca267a5 3014 {
ab0b796c
KW
3015 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3016 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3017 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3018 "resolved to \"\\x{17F}\\x{17F}\".");
1ca267a5
KW
3019 goto return_long_s;
3020 }
0766489e
KW
3021 else
3022#endif
3023 if (UTF8SKIP(p) == long_s_t_len
76f2ffcd 3024 && memEQ((char *) p, LONG_S_T, long_s_t_len))
9fc2026f 3025 {
ab0b796c
KW
3026 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3027 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3028 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3029 "resolved to \"\\x{FB06}\".");
9fc2026f
KW
3030 goto return_ligature_st;
3031 }
74894415
KW
3032
3033#if UNICODE_MAJOR_VERSION == 3 \
3034 && UNICODE_DOT_VERSION == 0 \
3035 && UNICODE_DOT_DOT_VERSION == 1
3036# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3037
3038 /* And special case this on this Unicode version only, for the same
3039 * reaons the other two are special cased. They would cross the
3040 * 255/256 boundary which is forbidden under /l, and so the code
3041 * wouldn't catch that they are equivalent (which they are only in
3042 * this release) */
3043 else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
3044 && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
3045 {
3046 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3047 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3048 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3049 "resolved to \"\\x{0131}\".");
3050 goto return_dotless_i;
3051 }
3052#endif
3053
357aadde 3054 return check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 3055 }
a0270393
KW
3056 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3057 return result;
3058 }
3059 else {
4a4088c4 3060 /* This is called when changing the case of a UTF-8-encoded
9fc2026f
KW
3061 * character above the ASCII range, and the result should not
3062 * contain an ASCII character. */
a0270393
KW
3063
3064 UV original; /* To store the first code point of <p> */
3065
3066 /* Look at every character in the result; if any cross the
3067 * boundary, the whole thing is disallowed */
3068 U8* s = ustrp;
3069 U8* e = ustrp + *lenp;
3070 while (s < e) {
3071 if (isASCII(*s)) {
3072 /* Crossed, have to return the original */
3073 original = valid_utf8_to_uvchr(p, lenp);
1ca267a5 3074
9fc2026f 3075 /* But in these instances, there is an alternative we can
1ca267a5 3076 * return that is valid */
0766489e
KW
3077 if (original == LATIN_SMALL_LETTER_SHARP_S
3078#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3079 || original == LATIN_CAPITAL_LETTER_SHARP_S
3080#endif
3081 ) {
1ca267a5
KW
3082 goto return_long_s;
3083 }
9fc2026f
KW
3084 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3085 goto return_ligature_st;
3086 }
74894415
KW
3087#if UNICODE_MAJOR_VERSION == 3 \
3088 && UNICODE_DOT_VERSION == 0 \
3089 && UNICODE_DOT_DOT_VERSION == 1
3090
3091 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3092 goto return_dotless_i;
3093 }
3094#endif
a0270393
KW
3095 Copy(p, ustrp, *lenp, char);
3096 return original;
3097 }
3098 s += UTF8SKIP(s);
3099 }
051a06d4 3100
a0270393
KW
3101 /* Here, no characters crossed, result is ok as-is */
3102 return result;
3103 }
051a06d4
KW
3104 }
3105
4a4088c4 3106 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
3107 if (UTF8_IS_INVARIANT(result)) {
3108 *ustrp = (U8) result;
3109 *lenp = 1;
3110 }
3111 else {
62cb07ea
KW
3112 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3113 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
3114 *lenp = 2;
3115 }
3116
051a06d4 3117 return result;
1ca267a5
KW
3118
3119 return_long_s:
3120 /* Certain folds to 'ss' are prohibited by the options, but they do allow
3121 * folds to a string of two of these characters. By returning this
3122 * instead, then, e.g.,
3123 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3124 * works. */
3125
3126 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
3127 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3128 ustrp, *lenp, U8);
3129 return LATIN_SMALL_LETTER_LONG_S;
9fc2026f
KW
3130
3131 return_ligature_st:
3132 /* Two folds to 'st' are prohibited by the options; instead we pick one and
3133 * have the other one fold to it */
3134
3135 *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
3136 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
3137 return LATIN_SMALL_LIGATURE_ST;
74894415
KW
3138
3139#if UNICODE_MAJOR_VERSION == 3 \
3140 && UNICODE_DOT_VERSION == 0 \
3141 && UNICODE_DOT_DOT_VERSION == 1
3142
3143 return_dotless_i:
3144 *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
3145 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
3146 return LATIN_SMALL_LETTER_DOTLESS_I;
3147
3148#endif
3149
a0ed51b3
LW
3150}
3151
711a919c 3152/* Note:
f90a9a02 3153 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
711a919c
ST
3154 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
3155 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
3156 */
c4a5db0c 3157
a0ed51b3 3158SV*
7fc63493 3159Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 3160{
c4a5db0c
KW
3161 PERL_ARGS_ASSERT_SWASH_INIT;
3162
3163 /* Returns a copy of a swash initiated by the called function. This is the
3164 * public interface, and returning a copy prevents others from doing
3165 * mischief on the original */
3166
5d3d13d1 3167 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
c4a5db0c
KW
3168}
3169
3170SV*
5d3d13d1 3171Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
c4a5db0c 3172{
2c1f00b9
YO
3173
3174 /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
3175 * use the following define */
3176
3177#define CORE_SWASH_INIT_RETURN(x) \
3178 PL_curpm= old_PL_curpm; \
3179 return x
3180
c4a5db0c 3181 /* Initialize and return a swash, creating it if necessary. It does this
87367d5f
KW
3182 * by calling utf8_heavy.pl in the general case. The returned value may be
3183 * the swash's inversion list instead if the input parameters allow it.
3184 * Which is returned should be immaterial to callers, as the only
923b6d4e
KW
3185 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
3186 * and swash_to_invlist() handle both these transparently.
c4a5db0c
KW
3187 *
3188 * This interface should only be used by functions that won't destroy or
3189 * adversely change the swash, as doing so affects all other uses of the
3190 * swash in the program; the general public should use 'Perl_swash_init'
3191 * instead.
3192 *
3193 * pkg is the name of the package that <name> should be in.
3194 * name is the name of the swash to find. Typically it is a Unicode
3195 * property name, including user-defined ones
3196 * listsv is a string to initialize the swash with. It must be of the form
3197 * documented as the subroutine return value in
3198 * L<perlunicode/User-Defined Character Properties>
3199 * minbits is the number of bits required to represent each data element.
3200 * It is '1' for binary properties.
3201 * none I (khw) do not understand this one, but it is used only in tr///.
9a53f6cf 3202 * invlist is an inversion list to initialize the swash with (or NULL)
83199d38
KW
3203 * flags_p if non-NULL is the address of various input and output flag bits
3204 * to the routine, as follows: ('I' means is input to the routine;
3205 * 'O' means output from the routine. Only flags marked O are
3206 * meaningful on return.)
3207 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
3208 * came from a user-defined property. (I O)
5d3d13d1
KW
3209 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
3210 * when the swash cannot be located, to simply return NULL. (I)
87367d5f
KW
3211 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
3212 * return of an inversion list instead of a swash hash if this routine
3213 * thinks that would result in faster execution of swash_fetch() later
3214 * on. (I)
9a53f6cf
KW
3215 *
3216 * Thus there are three possible inputs to find the swash: <name>,
3217 * <listsv>, and <invlist>. At least one must be specified. The result
3218 * will be the union of the specified ones, although <listsv>'s various
aabbdbda
KW
3219 * actions can intersect, etc. what <name> gives. To avoid going out to
3220 * disk at all, <invlist> should specify completely what the swash should
3221 * have, and <listsv> should be &PL_sv_undef and <name> should be "".
9a53f6cf
KW
3222 *
3223 * <invlist> is only valid for binary properties */
c4a5db0c 3224
2c1f00b9
YO
3225 PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
3226
c4a5db0c 3227 SV* retval = &PL_sv_undef;
83199d38 3228 HV* swash_hv = NULL;
87367d5f
KW
3229 const int invlist_swash_boundary =
3230 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
3231 ? 512 /* Based on some benchmarking, but not extensive, see commit
3232 message */
3233 : -1; /* Never return just an inversion list */
9a53f6cf
KW
3234
3235 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
3236 assert(! invlist || minbits == 1);
3237
2c1f00b9
YO
3238 PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
3239 that triggered the swash init and the swash init perl logic itself.
3240 See perl #122747 */
3241
9a53f6cf
KW
3242 /* If data was passed in to go out to utf8_heavy to find the swash of, do
3243 * so */
3244 if (listsv != &PL_sv_undef || strNE(name, "")) {
69794297
KW
3245 dSP;
3246 const size_t pkg_len = strlen(pkg);
3247 const size_t name_len = strlen(name);
3248 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
3249 SV* errsv_save;
3250 GV *method;
3251
3252 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
3253
3254 PUSHSTACKi(PERLSI_MAGIC);
ce3b816e 3255 ENTER;
69794297 3256 SAVEHINTS();
2782061f 3257 save_re_context();
650f067c
JL
3258 /* We might get here via a subroutine signature which uses a utf8
3259 * parameter name, at which point PL_subname will have been set
3260 * but not yet used. */
3261 save_item(PL_subname);
69794297
KW
3262 if (PL_parser && PL_parser->error_count)
3263 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
3264 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
4a4088c4 3265 if (!method) { /* demand load UTF-8 */
69794297 3266 ENTER;
db2c6cb3
FC
3267 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3268 GvSV(PL_errgv) = NULL;
1a419e6b 3269#ifndef NO_TAINT_SUPPORT
69794297
KW
3270 /* It is assumed that callers of this routine are not passing in
3271 * any user derived data. */
2782061f
DM
3272 /* Need to do this after save_re_context() as it will set
3273 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
3274 * in Perl_magic_get). Even line to create errsv_save can turn on
3275 * PL_tainted. */
284167a5
SM
3276 SAVEBOOL(TAINT_get);
3277 TAINT_NOT;
3278#endif
69794297
KW
3279 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
3280 NULL);
eed484f9 3281 {
db2c6cb3
FC
3282 /* Not ERRSV, as there is no need to vivify a scalar we are
3283 about to discard. */
3284 SV * const errsv = GvSV(PL_errgv);
3285 if (!SvTRUE(errsv)) {
3286 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3287 SvREFCNT_dec(errsv);
3288 }
eed484f9 3289 }
69794297
KW
3290 LEAVE;
3291 }
3292 SPAGAIN;
3293 PUSHMARK(SP);
3294 EXTEND(SP,5);
3295 mPUSHp(pkg, pkg_len);
3296 mPUSHp(name, name_len);
3297 PUSHs(listsv);
3298 mPUSHi(minbits);
3299 mPUSHi(none);
3300 PUTBACK;
db2c6cb3
FC
3301 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3302 GvSV(PL_errgv) = NULL;
69794297
KW
3303 /* If we already have a pointer to the method, no need to use
3304 * call_method() to repeat the lookup. */
c41800a8
KW
3305 if (method
3306 ? call_sv(MUTABLE_SV(method), G_SCALAR)
69794297
KW
3307 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
3308 {
3309 retval = *PL_stack_sp--;
3310 SvREFCNT_inc(retval);
3311 }
eed484f9 3312 {
db2c6cb3
FC
3313 /* Not ERRSV. See above. */
3314 SV * const errsv = GvSV(PL_errgv);
3315 if (!SvTRUE(errsv)) {
3316 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3317 SvREFCNT_dec(errsv);
3318 }
eed484f9 3319 }
ce3b816e 3320 LEAVE;
69794297
KW
3321 POPSTACK;
3322 if (IN_PERL_COMPILETIME) {
3323 CopHINTS_set(PL_curcop, PL_hints);
3324 }
3325 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
d95e4a00 3326 if (SvPOK(retval)) {
69794297
KW
3327
3328 /* If caller wants to handle missing properties, let them */
5d3d13d1 3329 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
2c1f00b9 3330 CORE_SWASH_INIT_RETURN(NULL);
69794297
KW
3331 }
3332 Perl_croak(aTHX_
3333 "Can't find Unicode property definition \"%"SVf"\"",
3334 SVfARG(retval));
a25b5927 3335 NOT_REACHED; /* NOTREACHED */
d95e4a00 3336 }
69794297 3337 }
9a53f6cf 3338 } /* End of calling the module to find the swash */
36eb48b4 3339
83199d38
KW
3340 /* If this operation fetched a swash, and we will need it later, get it */
3341 if (retval != &PL_sv_undef
3342 && (minbits == 1 || (flags_p
3343 && ! (*flags_p
3344 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
3345 {
3346 swash_hv = MUTABLE_HV(SvRV(retval));
3347
3348 /* If we don't already know that there is a user-defined component to
3349 * this swash, and the user has indicated they wish to know if there is
3350 * one (by passing <flags_p>), find out */
3351 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
3352 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
3353 if (user_defined && SvUV(*user_defined)) {
3354 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
3355 }
3356 }
3357 }
3358
36eb48b4
KW
3359 /* Make sure there is an inversion list for binary properties */
3360 if (minbits == 1) {
3361 SV** swash_invlistsvp = NULL;
3362 SV* swash_invlist = NULL;
9a53f6cf 3363 bool invlist_in_swash_is_valid = FALSE;
02c85471
FC
3364 bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
3365 an unclaimed reference count */
36eb48b4 3366
9a53f6cf 3367 /* If this operation fetched a swash, get its already existing
83199d38 3368 * inversion list, or create one for it */
36eb48b4 3369
83199d38 3370 if (swash_hv) {
5c9f4bd2 3371 swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
9a53f6cf
KW
3372 if (swash_invlistsvp) {
3373 swash_invlist = *swash_invlistsvp;
3374 invlist_in_swash_is_valid = TRUE;
3375 }
3376 else {
36eb48b4 3377 swash_invlist = _swash_to_invlist(retval);
02c85471 3378 swash_invlist_unclaimed = TRUE;
9a53f6cf
KW
3379 }
3380 }
3381
3382 /* If an inversion list was passed in, have to include it */
3383 if (invlist) {
3384
3385 /* Any fetched swash will by now have an inversion list in it;
3386 * otherwise <swash_invlist> will be NULL, indicating that we
3387 * didn't fetch a swash */
3388 if (swash_invlist) {
3389
3390 /* Add the passed-in inversion list, which invalidates the one
3391 * already stored in the swash */
3392 invlist_in_swash_is_valid = FALSE;
3393 _invlist_union(invlist, swash_invlist, &swash_invlist);
3394 }
3395 else {
3396
87367d5f
KW
3397 /* Here, there is no swash already. Set up a minimal one, if
3398 * we are going to return a swash */
3399 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
971d486f 3400 swash_hv = newHV();
4aca0fe6 3401 retval = newRV_noinc(MUTABLE_SV(swash_hv));
87367d5f 3402 }
9a53f6cf
KW
3403 swash_invlist = invlist;
3404 }
9a53f6cf
KW
3405 }
3406
3407 /* Here, we have computed the union of all the passed-in data. It may
3408 * be that there was an inversion list in the swash which didn't get
538e84ed 3409 * touched; otherwise save the computed one */
87367d5f
KW
3410 if (! invlist_in_swash_is_valid
3411 && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
3412 {
5c9f4bd2 3413 if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
69794297
KW
3414 {
3415 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3416 }
cc34d8c5
FC
3417 /* We just stole a reference count. */
3418 if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
3419 else SvREFCNT_inc_simple_void_NN(swash_invlist);
9a53f6cf 3420 }
87367d5f 3421
dbfdbd26
KW
3422 SvREADONLY_on(swash_invlist);
3423
c41800a8 3424 /* Use the inversion list stand-alone if small enough */
87367d5f
KW
3425 if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
3426 SvREFCNT_dec(retval);
02c85471
FC
3427 if (!swash_invlist_unclaimed)
3428 SvREFCNT_inc_simple_void_NN(swash_invlist);
3429 retval = newRV_noinc(swash_invlist);
87367d5f 3430 }
36eb48b4
KW
3431 }
3432
2c1f00b9
YO
3433 CORE_SWASH_INIT_RETURN(retval);
3434#undef CORE_SWASH_INIT_RETURN
a0ed51b3
LW
3435}
3436
035d37be
JH
3437
3438/* This API is wrong for special case conversions since we may need to
3439 * return several Unicode characters for a single Unicode character
3440 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
3441 * the lower-level routine, and it is similarly broken for returning
38684baa 3442 * multiple values. --jhi
b9992569 3443 * For those, you should use S__to_utf8_case() instead */
b0e3252e 3444/* Now SWASHGET is recasted into S_swatch_get in this file. */
680c470c
ST
3445
3446/* Note:
3447 * Returns the value of property/mapping C<swash> for the first character
3448 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
4a4088c4 3449 * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
3d0f8846 3450 * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
af2af982
KW
3451 *
3452 * A "swash" is a hash which contains initially the keys/values set up by
3453 * SWASHNEW. The purpose is to be able to completely represent a Unicode
3454 * property for all possible code points. Things are stored in a compact form
3455 * (see utf8_heavy.pl) so that calculation is required to find the actual
3456 * property value for a given code point. As code points are looked up, new
3457 * key/value pairs are added to the hash, so that the calculation doesn't have
3458 * to ever be re-done. Further, each calculation is done, not just for the
3459 * desired one, but for a whole block of code points adjacent to that one.
3460 * For binary properties on ASCII machines, the block is usually for 64 code
3461 * points, starting with a code point evenly divisible by 64. Thus if the
3462 * property value for code point 257 is requested, the code goes out and
3463 * calculates the property values for all 64 code points between 256 and 319,
3464 * and stores these as a single 64-bit long bit vector, called a "swatch",
3465 * under the key for code point 256. The key is the UTF-8 encoding for code
3466 * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
3467 * for a code point is 13 bytes, the key will be 12 bytes long. If the value
3468 * for code point 258 is then requested, this code realizes that it would be
3469 * stored under the key for 256, and would find that value and extract the
3470 * relevant bit, offset from 256.
3471 *
3472 * Non-binary properties are stored in as many bits as necessary to represent
3473 * their values (32 currently, though the code is more general than that), not
fc273927 3474 * as single bits, but the principle is the same: the value for each key is a
af2af982
KW
3475 * vector that encompasses the property values for all code points whose UTF-8
3476 * representations are represented by the key. That is, for all code points
3477 * whose UTF-8 representations are length N bytes, and the key is the first N-1
3478 * bytes of that.
680c470c 3479 */
a0ed51b3 3480UV
680c470c 3481Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 3482{
ef8f7699 3483 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
3484 U32 klen;
3485 U32 off;
9b56a019 3486 STRLEN slen = 0;
7d85a32c 3487 STRLEN needents;
cfd0369c 3488 const U8 *tmps = NULL;
979f2922 3489 SV *swatch;
08fb1ac5 3490 const U8 c = *ptr;
3568d838 3491
7918f24d
NC
3492 PERL_ARGS_ASSERT_SWASH_FETCH;
3493
87367d5f
KW
3494 /* If it really isn't a hash, it isn't really swash; must be an inversion
3495 * list */
3496 if (SvTYPE(hv) != SVt_PVHV) {
3497 return _invlist_contains_cp((SV*)hv,
3498 (do_utf8)
3499 ? valid_utf8_to_uvchr(ptr, NULL)
3500 : c);
3501 }
3502
08fb1ac5
KW
3503 /* We store the values in a "swatch" which is a vec() value in a swash
3504 * hash. Code points 0-255 are a single vec() stored with key length
3505 * (klen) 0. All other code points have a UTF-8 representation
3506 * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
3507 * share 0xAA..0xYY, which is the key in the hash to that vec. So the key
3508 * length for them is the length of the encoded char - 1. ptr[klen] is the
3509 * final byte in the sequence representing the character */
3510 if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
3511 klen = 0;
3512 needents = 256;
3513 off = c;
3568d838 3514 }
08fb1ac5
KW
3515 else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
3516 klen = 0;
3517 needents = 256;
a62b247b 3518 off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
979f2922
ST
3519 }
3520 else {
08fb1ac5
KW
3521 klen = UTF8SKIP(ptr) - 1;
3522
3523 /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
3524 * the vec is the final byte in the sequence. (In EBCDIC this is
3525 * converted to I8 to get consecutive values.) To help you visualize
3526 * all this:
3527 * Straight 1047 After final byte
3528 * UTF-8 UTF-EBCDIC I8 transform
3529 * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
3530 * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
3531 * ...
3532 * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
3533 * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
3534 * ...
3535 * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
3536 * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
3537 * ...
3538 * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
3539 * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
3540 * ...
3541 * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
3542 * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
3543 *
3544 * (There are no discontinuities in the elided (...) entries.)
3545 * The UTF-8 key for these 33 code points is '\xD0' (which also is the
3546 * key for the next 31, up through U+043F, whose UTF-8 final byte is
3547 * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
3548 * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
3549 * index into the vec() swatch (after subtracting 0x80, which we
3550 * actually do with an '&').
3551 * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
3552 * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
3553 * dicontinuities which go away by transforming it into I8, and we
3554 * effectively subtract 0xA0 to get the index. */
979f2922 3555 needents = (1 << UTF_ACCUMULATION_SHIFT);
bc3632a8 3556 off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
979f2922 3557 }
7d85a32c 3558
a0ed51b3 3559 /*
4a4088c4 3560 * This single-entry cache saves about 1/3 of the UTF-8 overhead in test
a0ed51b3
LW
3561 * suite. (That is, only 7-8% overall over just a hash cache. Still,
3562 * it's nothing to sniff at.) Pity we usually come through at least
3563 * two function calls to get here...
3564 *
3565 * NB: this code assumes that swatches are never modified, once generated!
3566 */
3567
3568d838 3568 if (hv == PL_last_swash_hv &&
a0ed51b3 3569 klen == PL_last_swash_klen &&
27da23d5 3570 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
3571 {
3572 tmps = PL_last_swash_tmps;
3573 slen = PL_last_swash_slen;
3574 }
3575 else {
3576 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 3577 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 3578
b0e3252e 3579 /* If not cached, generate it via swatch_get */
979f2922 3580 if (!svp || !SvPOK(*svp)
08fb1ac5
KW
3581 || !(tmps = (const U8*)SvPV_const(*svp, slen)))
3582 {
3583 if (klen) {
3584 const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
3585 swatch = swatch_get(swash,
3586 code_point & ~((UV)needents - 1),
3587 needents);
3588 }
3589 else { /* For the first 256 code points, the swatch has a key of
3590 length 0 */
3591 swatch = swatch_get(swash, 0, needents);
3592 }
979f2922 3593
923e4eb5 3594 if (IN_PERL_COMPILETIME)
623e6609 3595 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 3596
979f2922 3597 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 3598
979f2922
ST
3599 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
3600 || (slen << 3) < needents)
5637ef5b
NC
3601 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
3602 "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
3603 svp, tmps, (UV)slen, (UV)needents);
a0ed51b3
LW
3604 }
3605
3606 PL_last_swash_hv = hv;
16d8f38a 3607 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 3608 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
3609 /* FIXME change interpvar.h? */
3610 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
3611 PL_last_swash_slen = slen;
3612 if (klen)
3613 Copy(ptr, PL_last_swash_key, klen, U8);
3614 }
3615
9faf8d75 3616 switch ((int)((slen << 3) / needents)) {
a0ed51b3 3617 case 1:
e7aca353 3618 return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
a0ed51b3 3619 case 8:
e7aca353 3620 return ((UV) tmps[off]);
a0ed51b3
LW
3621 case 16:
3622 off <<= 1;
e7aca353
JH
3623 return
3624 ((UV) tmps[off ] << 8) +
3625 ((UV) tmps[off + 1]);
a0ed51b3
LW
3626 case 32:
3627 off <<= 2;
e7aca353
JH
3628 return
3629 ((UV) tmps[off ] << 24) +
3630 ((UV) tmps[off + 1] << 16) +
3631 ((UV) tmps[off + 2] << 8) +
3632 ((UV) tmps[off + 3]);
a0ed51b3 3633 }
5637ef5b
NC
3634 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
3635 "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
670f1322 3636 NORETURN_FUNCTION_END;
a0ed51b3 3637}
2b9d42f0 3638
319009ee
KW
3639/* Read a single line of the main body of the swash input text. These are of
3640 * the form:
3641 * 0053 0056 0073
3642 * where each number is hex. The first two numbers form the minimum and
3643 * maximum of a range, and the third is the value associated with the range.
3644 * Not all swashes should have a third number
3645 *
3646 * On input: l points to the beginning of the line to be examined; it points
3647 * to somewhere in the string of the whole input text, and is
3648 * terminated by a \n or the null string terminator.
3649 * lend points to the null terminator of that string
3650 * wants_value is non-zero if the swash expects a third number
3651 * typestr is the name of the swash's mapping, like 'ToLower'
3652 * On output: *min, *max, and *val are set to the values read from the line.
3653 * returns a pointer just beyond the line examined. If there was no
3654 * valid min number on the line, returns lend+1
3655 */
3656
3657STATIC U8*
3658S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3659 const bool wants_value, const U8* const typestr)
3660{
3661 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
3662 STRLEN numlen; /* Length of the number */
02470786
KW
3663 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3664 | PERL_SCAN_DISALLOW_PREFIX
3665 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3666
3667 /* nl points to the next \n in the scan */
3668 U8* const nl = (U8*)memchr(l, '\n', lend - l);
3669
95543e92
KW
3670 PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
3671
319009ee
KW
3672 /* Get the first number on the line: the range minimum */
3673 numlen = lend - l;
3674 *min = grok_hex((char *)l, &numlen, &flags, NULL);
c88850db 3675 *max = *min; /* So can never return without setting max */
319009ee
KW
3676 if (numlen) /* If found a hex number, position past it */
3677 l += numlen;
3678 else if (nl) { /* Else, go handle next line, if any */
3679 return nl + 1; /* 1 is length of "\n" */
3680 }
3681 else { /* Else, no next line */
3682 return lend + 1; /* to LIST's end at which \n is not found */
3683 }
3684
3685 /* The max range value follows, separated by a BLANK */
3686 if (isBLANK(*l)) {
3687 ++l;
02470786
KW
3688 flags = PERL_SCAN_SILENT_ILLDIGIT
3689 | PERL_SCAN_DISALLOW_PREFIX
3690 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3691 numlen = lend - l;
3692 *max = grok_hex((char *)l, &numlen, &flags, NULL);
3693 if (numlen)
3694 l += numlen;
3695 else /* If no value here, it is a single element range */
3696 *max = *min;
3697
3698 /* Non-binary tables have a third entry: what the first element of the
24303724 3699 * range maps to. The map for those currently read here is in hex */
319009ee
KW
3700 if (wants_value) {
3701 if (isBLANK(*l)) {
3702 ++l;
f2a7d0fc
KW
3703 flags = PERL_SCAN_SILENT_ILLDIGIT
3704 | PERL_SCAN_DISALLOW_PREFIX
3705 | PERL_SCAN_SILENT_NON_PORTABLE;
3706 numlen = lend - l;
3707 *val = grok_hex((char *)l, &numlen, &flags, NULL);
3708 if (numlen)
3709 l += numlen;
3710 else
3711 *val = 0;
319009ee
KW
3712 }
3713 else {
3714 *val = 0;
3715 if (typeto) {
dcbac5bb 3716 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
3717 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
3718 typestr, l);
3719 }
3720 }
3721 }
3722 else
3723 *val = 0; /* bits == 1, then any val should be ignored */
3724 }
3725 else { /* Nothing following range min, should be single element with no
3726 mapping expected */
319009ee
KW
3727 if (wants_value) {
3728 *val = 0;
3729 if (typeto) {
dcbac5bb 3730 /* diag_listed_a