This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlapi: Fix clause that should have been removed earlier
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
a0ed51b3 35
806547a7 36static const char malformed_text[] = "Malformed UTF-8 character";
27da23d5 37static const char unees[] =
806547a7 38 "Malformed UTF-8 character (unexpected end of string)";
760c7c2f 39static const char cp_above_legal_max[] =
2d212e86 40 "Use of code point 0x%"UVXf" is deprecated; the permissible max is 0x%"UVXf"";
760c7c2f 41
114d9c4d 42#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
901b21bf 43
48ef279e 44/*
ccfc67b7 45=head1 Unicode Support
7fefc6c1 46These are various utility functions for manipulating UTF8-encoded
72d33970 47strings. For the uninitiated, this is a method of representing arbitrary
61296642 48Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
49characters in the ASCII range are unmodified, and a zero byte never appears
50within non-zero characters.
166f8a29 51
eaf7a4d2
CS
52=cut
53*/
54
55/*
378516de 56=for apidoc uvoffuni_to_utf8_flags
eebe1485 57
a27992cc 58THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af
KW
59Instead, B<Almost all code should use L</uvchr_to_utf8> or
60L</uvchr_to_utf8_flags>>.
a27992cc 61
de69f3af
KW
62This function is like them, but the input is a strict Unicode
63(as opposed to native) code point. Only in very rare circumstances should code
64not be using the native code point.
949cf498 65
efa9cd84 66For details, see the description for L</uvchr_to_utf8_flags>.
949cf498 67
eebe1485
SC
68=cut
69*/
70
8ee1cdcb
KW
71#define HANDLE_UNICODE_SURROGATE(uv, flags) \
72 STMT_START { \
73 if (flags & UNICODE_WARN_SURROGATE) { \
74 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
75 "UTF-16 surrogate U+%04"UVXf, uv); \
76 } \
77 if (flags & UNICODE_DISALLOW_SURROGATE) { \
78 return NULL; \
79 } \
80 } STMT_END;
81
82#define HANDLE_UNICODE_NONCHAR(uv, flags) \
83 STMT_START { \
84 if (flags & UNICODE_WARN_NONCHAR) { \
85 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
86 "Unicode non-character U+%04"UVXf" is not " \
87 "recommended for open interchange", uv); \
88 } \
89 if (flags & UNICODE_DISALLOW_NONCHAR) { \
90 return NULL; \
91 } \
92 } STMT_END;
93
ba6ed43c
KW
94/* Use shorter names internally in this file */
95#define SHIFT UTF_ACCUMULATION_SHIFT
96#undef MARK
97#define MARK UTF_CONTINUATION_MARK
98#define MASK UTF_CONTINUATION_MASK
99
dfe13c55 100U8 *
378516de 101Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 102{
378516de 103 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
7918f24d 104
2d1545e5 105 if (OFFUNI_IS_INVARIANT(uv)) {
4c8cd605 106 *d++ = LATIN1_TO_NATIVE(uv);
d9432125
KW
107 return d;
108 }
facc1dc2 109
3ea68d71 110 if (uv <= MAX_UTF8_TWO_BYTE) {
facc1dc2
KW
111 *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
112 *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
3ea68d71
KW
113 return d;
114 }
d9432125 115
ba6ed43c
KW
116 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
117 * below, the 16 is for start bytes E0-EF (which are all the possible ones
118 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
119 * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
120 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
121 * 0x800-0xFFFF on ASCII */
122 if (uv < (16 * (1U << (2 * SHIFT)))) {
123 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
124 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
125 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
126
127#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
128 aren't tested here */
129 /* The most likely code points in this range are below the surrogates.
130 * Do an extra test to quickly exclude those. */
131 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
132 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
133 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
134 {
8ee1cdcb
KW
135 HANDLE_UNICODE_NONCHAR(uv, flags);
136 }
137 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
138 HANDLE_UNICODE_SURROGATE(uv, flags);
760c7c2f 139 }
ba6ed43c
KW
140 }
141#endif
142 return d;
143 }
144
145 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
146 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
147 * happen starting with 4-byte characters on ASCII platforms. We unify the
148 * code for these with EBCDIC, even though some of them require 5-bytes on
149 * those, because khw believes the code saving is worth the very slight
150 * performance hit on these high EBCDIC code points. */
151
152 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
a5bf80e0
KW
153 if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
154 && ckWARN_d(WARN_DEPRECATED))
155 {
156 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
157 cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
158 }
159 if ( (flags & UNICODE_WARN_SUPER)
160 || ( UNICODE_IS_ABOVE_31_BIT(uv)
161 && (flags & UNICODE_WARN_ABOVE_31_BIT)))
162 {
163 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
164
165 /* Choose the more dire applicable warning */
166 (UNICODE_IS_ABOVE_31_BIT(uv))
167 ? "Code point 0x%"UVXf" is not Unicode, and not portable"
168 : "Code point 0x%"UVXf" is not Unicode, may not be portable",
169 uv);
170 }
171 if (flags & UNICODE_DISALLOW_SUPER
172 || ( UNICODE_IS_ABOVE_31_BIT(uv)
173 && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
174 {
175 return NULL;
176 }
177 }
ba6ed43c
KW
178 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
179 HANDLE_UNICODE_NONCHAR(uv, flags);
507b9800 180 }
d9432125 181
ba6ed43c
KW
182 /* Test for and handle 4-byte result. In the test immediately below, the
183 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
184 * characters). The 3 is for 3 continuation bytes; these each contribute
185 * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
186 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
187 * 0x1_0000-0x1F_FFFF on ASCII */
188 if (uv < (8 * (1U << (3 * SHIFT)))) {
189 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
190 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
191 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
192 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
193
194#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
195 characters. The end-plane non-characters for EBCDIC were
196 handled just above */
197 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
198 HANDLE_UNICODE_NONCHAR(uv, flags);
d528804a 199 }
ba6ed43c
KW
200 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
201 HANDLE_UNICODE_SURROGATE(uv, flags);
202 }
203#endif
204
205 return d;
206 }
207
208 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
209 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
210 * format. The unrolled version above turns out to not save all that much
211 * time, and at these high code points (well above the legal Unicode range
212 * on ASCII platforms, and well above anything in common use in EBCDIC),
213 * khw believes that less code outweighs slight performance gains. */
214
d9432125 215 {
5aaebcb3 216 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
217 U8 *p = d+len-1;
218 while (p > d) {
4c8cd605 219 *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
220 uv >>= UTF_ACCUMULATION_SHIFT;
221 }
4c8cd605 222 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
223 return d+len;
224 }
a0ed51b3 225}
a5bf80e0 226
646ca15d 227/*
07693fe6
KW
228=for apidoc uvchr_to_utf8
229
bcb1a2d4 230Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 231of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
232C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
233the byte after the end of the new character. In other words,
07693fe6
KW
234
235 d = uvchr_to_utf8(d, uv);
236
237is the recommended wide native character-aware way of saying
238
239 *(d++) = uv;
240
760c7c2f
KW
241This function accepts any UV as input, but very high code points (above
242C<IV_MAX> on the platform) will raise a deprecation warning. This is
243typically 0x7FFF_FFFF in a 32-bit word.
244
245It is possible to forbid or warn on non-Unicode code points, or those that may
246be problematic by using L</uvchr_to_utf8_flags>.
de69f3af 247
07693fe6
KW
248=cut
249*/
250
de69f3af
KW
251/* This is also a macro */
252PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
253
07693fe6
KW
254U8 *
255Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
256{
de69f3af 257 return uvchr_to_utf8(d, uv);
07693fe6
KW
258}
259
de69f3af
KW
260/*
261=for apidoc uvchr_to_utf8_flags
262
263Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 264of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
265C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
266the byte after the end of the new character. In other words,
de69f3af
KW
267
268 d = uvchr_to_utf8_flags(d, uv, flags);
269
270or, in most cases,
271
272 d = uvchr_to_utf8_flags(d, uv, 0);
273
274This is the Unicode-aware way of saying
275
276 *(d++) = uv;
277
760c7c2f
KW
278If C<flags> is 0, this function accepts any UV as input, but very high code
279points (above C<IV_MAX> for the platform) will raise a deprecation warning.
280This is typically 0x7FFF_FFFF in a 32-bit word.
281
282Specifying C<flags> can further restrict what is allowed and not warned on, as
283follows:
de69f3af 284
796b6530 285If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
7ee537e6
KW
286the function will raise a warning, provided UTF8 warnings are enabled. If
287instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
288NULL. If both flags are set, the function will both warn and return NULL.
de69f3af 289
760c7c2f
KW
290Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
291affect how the function handles a Unicode non-character.
93e6dbd6 292
760c7c2f
KW
293And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
294affect the handling of code points that are above the Unicode maximum of
2950x10FFFF. Languages other than Perl may not be able to accept files that
296contain these.
93e6dbd6
KW
297
298The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
299the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
ecc1615f
KW
300three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
301allowed inputs to the strict UTF-8 traditionally defined by Unicode.
302Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
303C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
304above-Unicode and surrogate flags, but not the non-character ones, as
305defined in
306L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
307See L<perlunicode/Noncharacter code points>.
93e6dbd6 308
ab8e6d41
KW
309Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
310so using them is more problematic than other above-Unicode code points. Perl
311invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
312likely that non-Perl languages will not be able to read files that contain
313these that written by the perl interpreter; nor would Perl understand files
314written by something that uses a different extension. For these reasons, there
315is a separate set of flags that can warn and/or disallow these extremely high
316code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
317C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags. These
318are entirely independent from the deprecation warning for code points above
319C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
320code point that needs more than 31 bits to represent. When that happens,
321effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
32232-bit machines. (Of course C<UNICODE_DISALLOW_SUPER> will treat all
ab8e6d41
KW
323above-Unicode code points, including these, as malformations; and
324C<UNICODE_WARN_SUPER> warns on these.)
325
326On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
327extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
328than on ASCII. Prior to that, code points 2**31 and higher were simply
329unrepresentable, and a different, incompatible method was used to represent
330code points between 2**30 and 2**31 - 1. The flags C<UNICODE_WARN_ABOVE_31_BIT>
331and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
332platforms, warning and disallowing 2**31 and higher.
de69f3af 333
de69f3af
KW
334=cut
335*/
336
337/* This is also a macro */
338PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
339
07693fe6
KW
340U8 *
341Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
342{
de69f3af 343 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
344}
345
83dc0f42
KW
346PERL_STATIC_INLINE bool
347S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
348{
349 /* Returns TRUE if the first code point represented by the Perl-extended-
350 * UTF-8-encoded string starting at 's', and looking no further than 'e -
351 * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
352 *
353 * The function handles the case where the input bytes do not include all
354 * the ones necessary to represent a full character. That is, they may be
355 * the intial bytes of the representation of a code point, but possibly
356 * the final ones necessary for the complete representation may be beyond
357 * 'e - 1'.
358 *
359 * The function assumes that the sequence is well-formed UTF-8 as far as it
360 * goes, and is for a UTF-8 variant code point. If the sequence is
361 * incomplete, the function returns FALSE if there is any well-formed
362 * UTF-8 byte sequence that can complete it in such a way that a code point
363 * < 2**31 is produced; otherwise it returns TRUE.
364 *
365 * Getting this exactly right is slightly tricky, and has to be done in
366 * several places in this file, so is centralized here. It is based on the
367 * following table:
368 *
369 * U+7FFFFFFF (2 ** 31 - 1)
370 * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
371 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
372 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
373 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
374 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
375 * U+80000000 (2 ** 31):
376 * ASCII: \xFE\x82\x80\x80\x80\x80\x80
377 * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
378 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
379 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
380 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
381 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
382 */
383
384#ifdef EBCDIC
385
b343c774
KW
386 /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
387 const U8 * const prefix = (U8 *) "\x41\x41\x41\x41\x41\x41\x42";
83dc0f42
KW
388 const STRLEN prefix_len = sizeof(prefix) - 1;
389 const STRLEN len = e - s;
f880f78a 390 const STRLEN cmp_len = MIN(prefix_len, len - 1);
83dc0f42
KW
391
392#else
393
394 PERL_UNUSED_ARG(e);
395
396#endif
397
398 PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
399
400 assert(! UTF8_IS_INVARIANT(*s));
401
402#ifndef EBCDIC
403
404 /* Technically, a start byte of FE can be for a code point that fits into
405 * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
406 * malformation. */
407 return (*s >= 0xFE);
408
409#else
410
411 /* On the EBCDIC code pages we handle, only 0xFE can mean a 32-bit or
412 * larger code point (0xFF is an invariant). For 0xFE, we need at least 2
413 * bytes, and maybe up through 8 bytes, to be sure if the value is above 31
414 * bits. */
415 if (*s != 0xFE || len == 1) {
416 return FALSE;
417 }
418
419 /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
420 * \x41 and \x42. */
421 return cBOOL(memGT(s + 1, prefix, cmp_len));
422
423#endif
424
425}
426
12a4bed3
KW
427PERL_STATIC_INLINE bool
428S_does_utf8_overflow(const U8 * const s, const U8 * e)
429{
430 const U8 *x;
431 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
432
433 /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
434 * platform, that is if it represents a code point larger than the highest
435 * representable code point. (For ASCII platforms, we could use memcmp()
436 * because we don't have to convert each byte to I8, but it's very rare
437 * input indeed that would approach overflow, so the loop below will likely
438 * only get executed once.
439 *
440 * 'e' must not be beyond a full character. If it is less than a full
441 * character, the function returns FALSE if there is any input beyond 'e'
442 * that could result in a non-overflowing code point */
443
444 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
445 assert(s + UTF8SKIP(s) >= e);
446
447 for (x = s; x < e; x++, y++) {
448
449 /* If this byte is larger than the corresponding highest UTF-8 byte, it
450 * overflows */
451 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
452 return TRUE;
453 }
454
455 /* If not the same as this byte, it must be smaller, doesn't overflow */
456 if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
457 return FALSE;
458 }
459 }
460
461 /* Got to the end and all bytes are the same. If the input is a whole
462 * character, it doesn't overflow. And if it is a partial character,
463 * there's not enough information to tell, so assume doesn't overflow */
464 return FALSE;
465}
466
467PERL_STATIC_INLINE bool
468S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
469{
470 /* Overlongs can occur whenever the number of continuation bytes
471 * changes. That means whenever the number of leading 1 bits in a start
472 * byte increases from the next lower start byte. That happens for start
473 * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following
474 * illegal start bytes have already been excluded, so don't need to be
475 * tested here;
476 * ASCII platforms: C0, C1
477 * EBCDIC platforms C0, C1, C2, C3, C4, E0
478 *
479 * At least a second byte is required to determine if other sequences will
480 * be an overlong. */
481
482 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
483 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
484
485 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
486 assert(len > 1 && UTF8_IS_START(*s));
487
488 /* Each platform has overlongs after the start bytes given above (expressed
489 * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
490 * the logic is the same, except the E0 overlong has already been excluded
491 * on EBCDIC platforms. The values below were found by manually
492 * inspecting the UTF-8 patterns. See the tables in utf8.h and
493 * utfebcdic.h. */
494
495# ifdef EBCDIC
496# define F0_ABOVE_OVERLONG 0xB0
497# define F8_ABOVE_OVERLONG 0xA8
498# define FC_ABOVE_OVERLONG 0xA4
499# define FE_ABOVE_OVERLONG 0xA2
500# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
501 /* I8(0xfe) is FF */
502# else
503
504 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
505 return TRUE;
506 }
507
508# define F0_ABOVE_OVERLONG 0x90
509# define F8_ABOVE_OVERLONG 0x88
510# define FC_ABOVE_OVERLONG 0x84
511# define FE_ABOVE_OVERLONG 0x82
512# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
513# endif
514
515
516 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
517 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
518 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
519 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
520 {
521 return TRUE;
522 }
523
524# if defined(UV_IS_QUAD) || defined(EBCDIC)
525
526 /* Check for the FF overlong. This happens only if all these bytes match;
527 * what comes after them doesn't matter. See tables in utf8.h,
528 * utfebcdic.h. (Can't happen on ASCII 32-bit platforms, as overflows
529 * instead.) */
530
531 if ( len >= sizeof(FF_OVERLONG_PREFIX) - 1
532 && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
533 sizeof(FF_OVERLONG_PREFIX) - 1)))
534 {
535 return TRUE;
536 }
537
538#endif
539
540 return FALSE;
541}
542
543#undef F0_ABOVE_OVERLONG
544#undef F8_ABOVE_OVERLONG
545#undef FC_ABOVE_OVERLONG
546#undef FE_ABOVE_OVERLONG
547#undef FF_OVERLONG_PREFIX
548
35f8c9bd 549STRLEN
edc2c47a 550Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
35f8c9bd 551{
2b479609 552 STRLEN len;
12a4bed3 553 const U8 *x;
35f8c9bd 554
2b479609
KW
555 /* A helper function that should not be called directly.
556 *
557 * This function returns non-zero if the string beginning at 's' and
558 * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
559 * code point; otherwise it returns 0. The examination stops after the
560 * first code point in 's' is validated, not looking at the rest of the
561 * input. If 'e' is such that there are not enough bytes to represent a
562 * complete code point, this function will return non-zero anyway, if the
563 * bytes it does have are well-formed UTF-8 as far as they go, and aren't
564 * excluded by 'flags'.
565 *
566 * A non-zero return gives the number of bytes required to represent the
567 * code point. Be aware that if the input is for a partial character, the
568 * return will be larger than 'e - s'.
569 *
570 * This function assumes that the code point represented is UTF-8 variant.
571 * The caller should have excluded this possibility before calling this
572 * function.
573 *
574 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
575 * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
576 * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
577 * disallowed by the flags. If the input is only for a partial character,
578 * the function will return non-zero if there is any sequence of
579 * well-formed UTF-8 that, when appended to the input sequence, could
580 * result in an allowed code point; otherwise it returns 0. Non characters
581 * cannot be determined based on partial character input. But many of the
582 * other excluded types can be determined with just the first one or two
583 * bytes.
584 *
585 */
586
587 PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
588
589 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
590 |UTF8_DISALLOW_ABOVE_31_BIT)));
591 assert(! UTF8_IS_INVARIANT(*s));
35f8c9bd 592
2b479609 593 /* A variant char must begin with a start byte */
35f8c9bd
KW
594 if (UNLIKELY(! UTF8_IS_START(*s))) {
595 return 0;
596 }
597
edc2c47a
KW
598 /* Examine a maximum of a single whole code point */
599 if (e - s > UTF8SKIP(s)) {
600 e = s + UTF8SKIP(s);
601 }
602
2b479609
KW
603 len = e - s;
604
605 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
606 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
35f8c9bd 607
2b479609
KW
608 /* The code below is derived from this table. Keep in mind that legal
609 * continuation bytes range between \x80..\xBF for UTF-8, and
610 * \xA0..\xBF for I8. Anything above those aren't continuation bytes.
611 * Hence, we don't have to test the upper edge because if any of those
612 * are encountered, the sequence is malformed, and will fail elsewhere
613 * in this function.
614 * UTF-8 UTF-EBCDIC I8
615 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
616 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
617 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
618 *
619 */
620
621#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
622# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
19794540 623# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
2b479609 624
19794540
KW
625# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
626 /* B6 and B7 */ \
627 && ((s1) & 0xFE ) == 0xB6)
2b479609
KW
628#else
629# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
19794540
KW
630# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
631# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
2b479609
KW
632#endif
633
634 if ( (flags & UTF8_DISALLOW_SUPER)
635 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) {
636 return 0; /* Above Unicode */
637 }
638
639 if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT)
640 && UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
641 {
642 return 0; /* Above 31 bits */
643 }
644
645 if (len > 1) {
646 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
647
648 if ( (flags & UTF8_DISALLOW_SUPER)
19794540 649 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
2b479609
KW
650 {
651 return 0; /* Above Unicode */
652 }
653
654 if ( (flags & UTF8_DISALLOW_SURROGATE)
19794540 655 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
2b479609
KW
656 {
657 return 0; /* Surrogate */
658 }
659
660 if ( (flags & UTF8_DISALLOW_NONCHAR)
661 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
662 {
663 return 0; /* Noncharacter code point */
664 }
665 }
666 }
667
668 /* Make sure that all that follows are continuation bytes */
35f8c9bd
KW
669 for (x = s + 1; x < e; x++) {
670 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
671 return 0;
672 }
673 }
674
af13dd8a 675 /* Here is syntactically valid. Next, make sure this isn't the start of an
12a4bed3
KW
676 * overlong. */
677 if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
678 return 0;
af13dd8a
KW
679 }
680
12a4bed3
KW
681 /* And finally, that the code point represented fits in a word on this
682 * platform */
683 if (does_utf8_overflow(s, e)) {
684 return 0;
35f8c9bd
KW
685 }
686
2b479609 687 return UTF8SKIP(s);
35f8c9bd
KW
688}
689
7cf8d05d
KW
690STATIC char *
691S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
692{
693 /* Returns a mortalized C string that is a displayable copy of the 'len'
694 * bytes starting at 's', each in a \xXY format. */
695
696 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
697 trailing NUL */
698 const U8 * const e = s + len;
699 char * output;
700 char * d;
701
702 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
703
704 Newx(output, output_len, char);
705 SAVEFREEPV(output);
706
707 d = output;
708 for (; s < e; s++) {
709 const unsigned high_nibble = (*s & 0xF0) >> 4;
710 const unsigned low_nibble = (*s & 0x0F);
711
712 *d++ = '\\';
713 *d++ = 'x';
714
715 if (high_nibble < 10) {
716 *d++ = high_nibble + '0';
717 }
718 else {
719 *d++ = high_nibble - 10 + 'a';
720 }
721
722 if (low_nibble < 10) {
723 *d++ = low_nibble + '0';
724 }
725 else {
726 *d++ = low_nibble - 10 + 'a';
727 }
728 }
729
730 *d = '\0';
731 return output;
732}
733
806547a7 734PERL_STATIC_INLINE char *
7cf8d05d
KW
735S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
736
737 /* How many bytes to print */
738 const STRLEN print_len,
739
740 /* Which one is the non-continuation */
741 const STRLEN non_cont_byte_pos,
742
743 /* How many bytes should there be? */
744 const STRLEN expect_len)
806547a7
KW
745{
746 /* Return the malformation warning text for an unexpected continuation
747 * byte. */
748
7cf8d05d 749 const char * const where = (non_cont_byte_pos == 1)
806547a7 750 ? "immediately"
7cf8d05d
KW
751 : Perl_form(aTHX_ "%d bytes",
752 (int) non_cont_byte_pos);
806547a7
KW
753
754 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
755
7cf8d05d
KW
756 /* We don't need to pass this parameter, but since it has already been
757 * calculated, it's likely faster to pass it; verify under DEBUGGING */
758 assert(expect_len == UTF8SKIP(s));
759
760 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
761 " %s after start byte 0x%02x; need %d bytes, got %d)",
762 malformed_text,
763 _byte_dump_string(s, print_len),
764 *(s + non_cont_byte_pos),
765 where,
766 *s,
767 (int) expect_len,
768 (int) non_cont_byte_pos);
806547a7
KW
769}
770
35f8c9bd
KW
771/*
772
de69f3af 773=for apidoc utf8n_to_uvchr
378516de
KW
774
775THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af 776Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
67e989fb 777
9041c2e3 778Bottom level UTF-8 decode routine.
de69f3af 779Returns the native code point value of the first character in the string C<s>,
746afd53
KW
780which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
781C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
782the length, in bytes, of that character.
949cf498
KW
783
784The value of C<flags> determines the behavior when C<s> does not point to a
2b5e7bc2
KW
785well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
786causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
787is the next possible position in C<s> that could begin a non-malformed
788character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
789is raised. Some UTF-8 input sequences may contain multiple malformations.
790This function tries to find every possible one in each call, so multiple
791warnings can be raised for each sequence.
949cf498
KW
792
793Various ALLOW flags can be set in C<flags> to allow (and not warn on)
794individual types of malformations, such as the sequence being overlong (that
795is, when there is a shorter sequence that can express the same code point;
796overlong sequences are expressly forbidden in the UTF-8 standard due to
797potential security issues). Another malformation example is the first byte of
798a character not being a legal first byte. See F<utf8.h> for the list of such
524080c4
KW
799flags. For allowed 0 length strings, this function returns 0; for allowed
800overlong sequences, the computed code point is returned; for all other allowed
801malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
802determinable reasonable value.
949cf498 803
796b6530 804The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
805flags) malformation is found. If this flag is set, the routine assumes that
806the caller will raise a warning, and this function will silently just set
d088425d
KW
807C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
808
75200dff 809Note that this API requires disambiguation between successful decoding a C<NUL>
796b6530 810character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
111fa700
KW
811in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
812be set to 1. To disambiguate, upon a zero return, see if the first byte of
813C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
f9380377 814error. Or you can use C<L</utf8n_to_uvchr_error>>.
949cf498
KW
815
816Certain code points are considered problematic. These are Unicode surrogates,
746afd53 817Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 818By default these are considered regular code points, but certain situations
ecc1615f
KW
819warrant special handling for them, which can be specified using the C<flags>
820parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
821three classes are treated as malformations and handled as such. The flags
822C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
823C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
824disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
825restricts the allowed inputs to the strict UTF-8 traditionally defined by
826Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
827definition given by
828L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
829The difference between traditional strictness and C9 strictness is that the
830latter does not forbid non-character code points. (They are still discouraged,
831however.) For more discussion see L<perlunicode/Noncharacter code points>.
832
833The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
834C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
796b6530
KW
835C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
836raised for their respective categories, but otherwise the code points are
837considered valid (not malformations). To get a category to both be treated as
838a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 839(But note that warnings are not raised if lexically disabled nor if
796b6530 840C<UTF8_CHECK_ONLY> is also specified.)
949cf498 841
760c7c2f
KW
842It is now deprecated to have very high code points (above C<IV_MAX> on the
843platforms) and this function will raise a deprecation warning for these (unless
844such warnings are turned off). This value, is typically 0x7FFF_FFFF (2**31 -1)
845in a 32-bit word.
ab8e6d41
KW
846
847Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
848so using them is more problematic than other above-Unicode code points. Perl
849invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
850likely that non-Perl languages will not be able to read files that contain
aff2be59 851these; nor would Perl understand files
ab8e6d41
KW
852written by something that uses a different extension. For these reasons, there
853is a separate set of flags that can warn and/or disallow these extremely high
854code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
855C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags. These
856are entirely independent from the deprecation warning for code points above
857C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
858code point that needs more than 31 bits to represent. When that happens,
859effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
86032-bit machines. (Of course C<UTF8_DISALLOW_SUPER> will treat all
ab8e6d41
KW
861above-Unicode code points, including these, as malformations; and
862C<UTF8_WARN_SUPER> warns on these.)
863
864On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
865extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
866than on ASCII. Prior to that, code points 2**31 and higher were simply
867unrepresentable, and a different, incompatible method was used to represent
868code points between 2**30 and 2**31 - 1. The flags C<UTF8_WARN_ABOVE_31_BIT>
869and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
870platforms, warning and disallowing 2**31 and higher.
949cf498
KW
871
872All other code points corresponding to Unicode characters, including private
873use and those yet to be assigned, are never considered malformed and never
874warn.
67e989fb 875
37607a96 876=cut
f9380377
KW
877
878Also implemented as a macro in utf8.h
879*/
880
881UV
882Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
883 STRLEN curlen,
884 STRLEN *retlen,
885 const U32 flags)
886{
887 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
888
889 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
890}
891
892/*
893
894=for apidoc utf8n_to_uvchr_error
895
896THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
897Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
898
899This function is for code that needs to know what the precise malformation(s)
900are when an error is found.
901
902It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
903all the others, C<errors>. If this parameter is 0, this function behaves
904identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
905to a C<U32> variable, which this function sets to indicate any errors found.
906Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
907C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
908of these bits will be set if a malformation is found, even if the input
909C<flags> parameter indicates that the given malformation is allowed; the
910exceptions are noted:
911
912=over 4
913
914=item C<UTF8_GOT_ABOVE_31_BIT>
915
916The code point represented by the input UTF-8 sequence occupies more than 31
917bits.
918This bit is set only if the input C<flags> parameter contains either the
919C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
920
921=item C<UTF8_GOT_CONTINUATION>
922
923The input sequence was malformed in that the first byte was a a UTF-8
924continuation byte.
925
926=item C<UTF8_GOT_EMPTY>
927
928The input C<curlen> parameter was 0.
929
930=item C<UTF8_GOT_LONG>
931
932The input sequence was malformed in that there is some other sequence that
933evaluates to the same code point, but that sequence is shorter than this one.
934
935=item C<UTF8_GOT_NONCHAR>
936
937The code point represented by the input UTF-8 sequence is for a Unicode
938non-character code point.
939This bit is set only if the input C<flags> parameter contains either the
940C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
941
942=item C<UTF8_GOT_NON_CONTINUATION>
943
944The input sequence was malformed in that a non-continuation type byte was found
945in a position where only a continuation type one should be.
946
947=item C<UTF8_GOT_OVERFLOW>
948
949The input sequence was malformed in that it is for a code point that is not
950representable in the number of bits available in a UV on the current platform.
951
952=item C<UTF8_GOT_SHORT>
953
954The input sequence was malformed in that C<curlen> is smaller than required for
955a complete sequence. In other words, the input is for a partial character
956sequence.
957
958=item C<UTF8_GOT_SUPER>
959
960The input sequence was malformed in that it is for a non-Unicode code point;
961that is, one above the legal Unicode maximum.
962This bit is set only if the input C<flags> parameter contains either the
963C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
964
965=item C<UTF8_GOT_SURROGATE>
966
967The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
968code point.
969This bit is set only if the input C<flags> parameter contains either the
970C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
971
972=back
973
974=cut
37607a96 975*/
67e989fb 976
a0ed51b3 977UV
f9380377
KW
978Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
979 STRLEN curlen,
980 STRLEN *retlen,
981 const U32 flags,
982 U32 * errors)
a0ed51b3 983{
d4c19fe8 984 const U8 * const s0 = s;
2b5e7bc2
KW
985 U8 * send = NULL; /* (initialized to silence compilers' wrong
986 warning) */
987 U32 possible_problems = 0; /* A bit is set here for each potential problem
988 found as we go along */
eb83ed87 989 UV uv = *s;
2b5e7bc2
KW
990 STRLEN expectlen = 0; /* How long should this sequence be?
991 (initialized to silence compilers' wrong
992 warning) */
f9380377
KW
993 U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
994 this gets set and discarded */
a0dbb045 995
2b5e7bc2
KW
996 /* The below are used only if there is both an overlong malformation and a
997 * too short one. Otherwise the first two are set to 's0' and 'send', and
998 * the third not used at all */
999 U8 * adjusted_s0 = (U8 *) s0;
1000 U8 * adjusted_send;
1001 UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
7918f24d 1002
f9380377
KW
1003 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1004
1005 if (errors) {
1006 *errors = 0;
1007 }
1008 else {
1009 errors = &discard_errors;
1010 }
a0dbb045 1011
eb83ed87
KW
1012 /* The order of malformation tests here is important. We should consume as
1013 * few bytes as possible in order to not skip any valid character. This is
1014 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1015 * http://unicode.org/reports/tr36 for more discussion as to why. For
1016 * example, once we've done a UTF8SKIP, we can tell the expected number of
1017 * bytes, and could fail right off the bat if the input parameters indicate
1018 * that there are too few available. But it could be that just that first
1019 * byte is garbled, and the intended character occupies fewer bytes. If we
1020 * blindly assumed that the first byte is correct, and skipped based on
1021 * that number, we could skip over a valid input character. So instead, we
1022 * always examine the sequence byte-by-byte.
1023 *
1024 * We also should not consume too few bytes, otherwise someone could inject
1025 * things. For example, an input could be deliberately designed to
1026 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 1027 * returning to the caller C<*retlen> pointing to the very next byte (one
eb83ed87
KW
1028 * which is actually part of of the overflowing sequence), that could look
1029 * legitimate to the caller, which could discard the initial partial
2b5e7bc2
KW
1030 * sequence and process the rest, inappropriately.
1031 *
1032 * Some possible input sequences are malformed in more than one way. This
1033 * function goes to lengths to try to find all of them. This is necessary
1034 * for correctness, as the inputs may allow one malformation but not
1035 * another, and if we abandon searching for others after finding the
1036 * allowed one, we could allow in something that shouldn't have been.
1037 */
eb83ed87 1038
b5b9af04 1039 if (UNLIKELY(curlen == 0)) {
2b5e7bc2
KW
1040 possible_problems |= UTF8_GOT_EMPTY;
1041 curlen = 0;
1042 uv = 0; /* XXX It could be argued that this should be
1043 UNICODE_REPLACEMENT? */
1044 goto ready_to_handle_errors;
0c443dc2
JH
1045 }
1046
eb83ed87
KW
1047 expectlen = UTF8SKIP(s);
1048
1049 /* A well-formed UTF-8 character, as the vast majority of calls to this
1050 * function will be for, has this expected length. For efficiency, set
1051 * things up here to return it. It will be overriden only in those rare
1052 * cases where a malformation is found */
1053 if (retlen) {
1054 *retlen = expectlen;
1055 }
1056
1057 /* An invariant is trivially well-formed */
1d72bdf6 1058 if (UTF8_IS_INVARIANT(uv)) {
de69f3af 1059 return uv;
a0ed51b3 1060 }
67e989fb 1061
eb83ed87 1062 /* A continuation character can't start a valid sequence */
b5b9af04 1063 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
2b5e7bc2
KW
1064 possible_problems |= UTF8_GOT_CONTINUATION;
1065 curlen = 1;
1066 uv = UNICODE_REPLACEMENT;
1067 goto ready_to_handle_errors;
ba210ebe 1068 }
9041c2e3 1069
dcd27b3c
KW
1070 /* Here is not a continuation byte, nor an invariant. The only thing left
1071 * is a start byte (possibly for an overlong) */
1072
534752c1
KW
1073 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1074 * that indicate the number of bytes in the character's whole UTF-8
1075 * sequence, leaving just the bits that are part of the value. */
1076 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
ba210ebe 1077
eb83ed87
KW
1078 /* Now, loop through the remaining bytes in the character's sequence,
1079 * accumulating each into the working value as we go. Be sure to not look
1080 * past the end of the input string */
2b5e7bc2
KW
1081 send = adjusted_send
1082 = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
eb83ed87 1083 for (s = s0 + 1; s < send; s++) {
b5b9af04 1084 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
8850bf83 1085 uv = UTF8_ACCUMULATE(uv, *s);
2b5e7bc2
KW
1086 continue;
1087 }
1088
1089 /* Here, found a non-continuation before processing all expected bytes.
1090 * This byte indicates the beginning of a new character, so quit, even
1091 * if allowing this malformation. */
1092 curlen = s - s0; /* Save how many bytes we actually got */
1093 possible_problems |= UTF8_GOT_NON_CONTINUATION;
1094 goto finish_short;
eb83ed87
KW
1095 } /* End of loop through the character's bytes */
1096
1097 /* Save how many bytes were actually in the character */
1098 curlen = s - s0;
1099
2b5e7bc2
KW
1100 /* Did we get all the continuation bytes that were expected? Note that we
1101 * know this result even without executing the loop above. But we had to
1102 * do the loop to see if there are unexpected non-continuations. */
1103 if (UNLIKELY(curlen < expectlen)) {
1104 possible_problems |= UTF8_GOT_SHORT;
2f8f112e 1105
2b5e7bc2
KW
1106 finish_short:
1107 uv_so_far = uv;
1108 uv = UNICODE_REPLACEMENT;
eb83ed87
KW
1109 }
1110
2b5e7bc2
KW
1111 /* Note that there are two types of too-short malformation. One is when
1112 * there is actual wrong data before the normal termination of the
1113 * sequence. The other is that the sequence wasn't complete before the end
1114 * of the data we are allowed to look at, based on the input 'curlen'.
1115 * This means that we were passed data for a partial character, but it is
1116 * valid as far as we saw. The other is definitely invalid. This
1117 * distinction could be important to a caller, so the two types are kept
1118 * separate. */
1119
1120 /* Check for overflow */
1121 if (UNLIKELY(does_utf8_overflow(s0, send))) {
1122 possible_problems |= UTF8_GOT_OVERFLOW;
1123 uv = UNICODE_REPLACEMENT;
eb83ed87 1124 }
eb83ed87 1125
2b5e7bc2
KW
1126 /* Check for overlong. If no problems so far, 'uv' is the correct code
1127 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1128 * we must look at the UTF-8 byte sequence itself to see if it is for an
1129 * overlong */
1130 if ( ( LIKELY(! possible_problems)
1131 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1132 || ( UNLIKELY( possible_problems)
1133 && ( UNLIKELY(! UTF8_IS_START(*s0))
1134 || ( curlen > 1
1135 && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
1136 send - s0))))))
2f8f112e 1137 {
2b5e7bc2
KW
1138 possible_problems |= UTF8_GOT_LONG;
1139
1140 /* A convenience macro that matches either of the too-short conditions.
1141 * */
1142# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1143
1144 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1145 UV min_uv = uv_so_far;
1146 STRLEN i;
1147
1148 /* Here, the input is both overlong and is missing some trailing
1149 * bytes. There is no single code point it could be for, but there
1150 * may be enough information present to determine if what we have
1151 * so far is for an unallowed code point, such as for a surrogate.
1152 * The code below has the intelligence to determine this, but just
1153 * for non-overlong UTF-8 sequences. What we do here is calculate
1154 * the smallest code point the input could represent if there were
1155 * no too short malformation. Then we compute and save the UTF-8
1156 * for that, which is what the code below looks at instead of the
1157 * raw input. It turns out that the smallest such code point is
1158 * all we need. */
1159 for (i = curlen; i < expectlen; i++) {
1160 min_uv = UTF8_ACCUMULATE(min_uv,
1161 I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
1162 }
1163
1164 Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8);
1165 SAVEFREEPV((U8 *) adjusted_s0); /* Needed because we may not get
1166 to free it ourselves if
1167 warnings are made fatal */
1168 adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1169 }
eb83ed87
KW
1170 }
1171
2b5e7bc2
KW
1172 /* Now check that the input isn't for a problematic code point not allowed
1173 * by the input parameters. */
1174 /* isn't problematic if < this */
1175 if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
1176 || ( UNLIKELY(possible_problems)
1177 && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
760c7c2f
KW
1178 && ((flags & ( UTF8_DISALLOW_NONCHAR
1179 |UTF8_DISALLOW_SURROGATE
1180 |UTF8_DISALLOW_SUPER
1181 |UTF8_DISALLOW_ABOVE_31_BIT
1182 |UTF8_WARN_NONCHAR
1183 |UTF8_WARN_SURROGATE
1184 |UTF8_WARN_SUPER
1185 |UTF8_WARN_ABOVE_31_BIT))
2b5e7bc2
KW
1186 /* In case of a malformation, 'uv' is not valid, and has
1187 * been changed to something in the Unicode range.
1188 * Currently we don't output a deprecation message if there
1189 * is already a malformation, so we don't have to special
1190 * case the test immediately below */
760c7c2f
KW
1191 || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1192 && ckWARN_d(WARN_DEPRECATED))))
eb83ed87 1193 {
2b5e7bc2
KW
1194 /* If there were no malformations, or the only malformation is an
1195 * overlong, 'uv' is valid */
1196 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1197 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1198 possible_problems |= UTF8_GOT_SURROGATE;
1199 }
1200 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
1201 possible_problems |= UTF8_GOT_SUPER;
1202 }
1203 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1204 possible_problems |= UTF8_GOT_NONCHAR;
1205 }
1206 }
1207 else { /* Otherwise, need to look at the source UTF-8, possibly
1208 adjusted to be non-overlong */
1209
1210 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1211 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
ea5ced44 1212 {
2b5e7bc2
KW
1213 possible_problems |= UTF8_GOT_SUPER;
1214 }
1215 else if (curlen > 1) {
1216 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
1217 NATIVE_UTF8_TO_I8(*adjusted_s0),
1218 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
ea5ced44 1219 {
2b5e7bc2 1220 possible_problems |= UTF8_GOT_SUPER;
ea5ced44 1221 }
2b5e7bc2
KW
1222 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
1223 NATIVE_UTF8_TO_I8(*adjusted_s0),
1224 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1225 {
1226 possible_problems |= UTF8_GOT_SURROGATE;
ea5ced44
KW
1227 }
1228 }
c0236afe 1229
2b5e7bc2
KW
1230 /* We need a complete well-formed UTF-8 character to discern
1231 * non-characters, so can't look for them here */
1232 }
1233 }
949cf498 1234
2b5e7bc2
KW
1235 ready_to_handle_errors:
1236
1237 /* At this point:
1238 * curlen contains the number of bytes in the sequence that
1239 * this call should advance the input by.
1240 * possible_problems' is 0 if there weren't any problems; otherwise a bit
1241 * is set in it for each potential problem found.
1242 * uv contains the code point the input sequence
1243 * represents; or if there is a problem that prevents
1244 * a well-defined value from being computed, it is
1245 * some subsitute value, typically the REPLACEMENT
1246 * CHARACTER.
1247 * s0 points to the first byte of the character
1248 * send points to just after where that (potentially
1249 * partial) character ends
1250 * adjusted_s0 normally is the same as s0, but in case of an
1251 * overlong for which the UTF-8 matters below, it is
1252 * the first byte of the shortest form representation
1253 * of the input.
1254 * adjusted_send normally is the same as 'send', but if adjusted_s0
1255 * is set to something other than s0, this points one
1256 * beyond its end
1257 */
eb83ed87 1258
2b5e7bc2
KW
1259 if (UNLIKELY(possible_problems)) {
1260 bool disallowed = FALSE;
1261 const U32 orig_problems = possible_problems;
1262
1263 while (possible_problems) { /* Handle each possible problem */
1264 UV pack_warn = 0;
1265 char * message = NULL;
1266
1267 /* Each 'if' clause handles one problem. They are ordered so that
1268 * the first ones' messages will be displayed before the later
1269 * ones; this is kinda in decreasing severity order */
1270 if (possible_problems & UTF8_GOT_OVERFLOW) {
1271
1272 /* Overflow means also got a super and above 31 bits, but we
1273 * handle all three cases here */
1274 possible_problems
1275 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
f9380377
KW
1276 *errors |= UTF8_GOT_OVERFLOW;
1277
1278 /* But the API says we flag all errors found */
1279 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1280 *errors |= UTF8_GOT_SUPER;
1281 }
1282 if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
1283 *errors |= UTF8_GOT_ABOVE_31_BIT;
1284 }
2b5e7bc2
KW
1285
1286 disallowed = TRUE;
1287
1288 /* The warnings code explicitly says it doesn't handle the case
1289 * of packWARN2 and two categories which have parent-child
1290 * relationship. Even if it works now to raise the warning if
1291 * either is enabled, it wouldn't necessarily do so in the
1292 * future. We output (only) the most dire warning*/
1293 if (! (flags & UTF8_CHECK_ONLY)) {
1294 if (ckWARN_d(WARN_UTF8)) {
1295 pack_warn = packWARN(WARN_UTF8);
1296 }
1297 else if (ckWARN_d(WARN_NON_UNICODE)) {
1298 pack_warn = packWARN(WARN_NON_UNICODE);
1299 }
1300 if (pack_warn) {
1301 message = Perl_form(aTHX_ "%s: %s (overflows)",
1302 malformed_text,
1303 _byte_dump_string(s0, send - s0));
1304 }
1305 }
1306 }
1307 else if (possible_problems & UTF8_GOT_EMPTY) {
1308 possible_problems &= ~UTF8_GOT_EMPTY;
f9380377 1309 *errors |= UTF8_GOT_EMPTY;
2b5e7bc2
KW
1310
1311 if (! (flags & UTF8_ALLOW_EMPTY)) {
1312 disallowed = TRUE;
1313 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1314 pack_warn = packWARN(WARN_UTF8);
1315 message = Perl_form(aTHX_ "%s (empty string)",
1316 malformed_text);
1317 }
1318 }
1319 }
1320 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1321 possible_problems &= ~UTF8_GOT_CONTINUATION;
f9380377 1322 *errors |= UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1323
1324 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1325 disallowed = TRUE;
1326 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1327 pack_warn = packWARN(WARN_UTF8);
1328 message = Perl_form(aTHX_
1329 "%s: %s (unexpected continuation byte 0x%02x,"
1330 " with no preceding start byte)",
1331 malformed_text,
1332 _byte_dump_string(s0, 1), *s0);
1333 }
1334 }
1335 }
1336 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1337 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
f9380377 1338 *errors |= UTF8_GOT_NON_CONTINUATION;
2b5e7bc2
KW
1339
1340 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1341 disallowed = TRUE;
1342 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1343 pack_warn = packWARN(WARN_UTF8);
1344 message = Perl_form(aTHX_ "%s",
1345 unexpected_non_continuation_text(s0,
1346 send - s0,
1347 s - s0,
1348 (int) expectlen));
1349 }
1350 }
1351 }
1352 else if (possible_problems & UTF8_GOT_SHORT) {
1353 possible_problems &= ~UTF8_GOT_SHORT;
f9380377 1354 *errors |= UTF8_GOT_SHORT;
2b5e7bc2
KW
1355
1356 if (! (flags & UTF8_ALLOW_SHORT)) {
1357 disallowed = TRUE;
1358 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1359 pack_warn = packWARN(WARN_UTF8);
1360 message = Perl_form(aTHX_
1361 "%s: %s (too short; got %d byte%s, need %d)",
1362 malformed_text,
1363 _byte_dump_string(s0, send - s0),
1364 (int)curlen,
1365 curlen == 1 ? "" : "s",
1366 (int)expectlen);
1367 }
1368 }
ba210ebe 1369
2b5e7bc2
KW
1370 }
1371 else if (possible_problems & UTF8_GOT_LONG) {
1372 possible_problems &= ~UTF8_GOT_LONG;
f9380377 1373 *errors |= UTF8_GOT_LONG;
2b5e7bc2
KW
1374
1375 if (! (flags & UTF8_ALLOW_LONG)) {
1376 disallowed = TRUE;
1377
1378 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1379 pack_warn = packWARN(WARN_UTF8);
1380
1381 /* These error types cause 'uv' to be something that
1382 * isn't what was intended, so can't use it in the
1383 * message. The other error types either can't
1384 * generate an overlong, or else the 'uv' is valid */
1385 if (orig_problems &
1386 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1387 {
1388 message = Perl_form(aTHX_
1389 "%s: %s (any UTF-8 sequence that starts"
1390 " with \"%s\" is overlong which can and"
1391 " should be represented with a"
1392 " different, shorter sequence)",
1393 malformed_text,
1394 _byte_dump_string(s0, send - s0),
1395 _byte_dump_string(s0, curlen));
1396 }
1397 else {
1398 U8 tmpbuf[UTF8_MAXBYTES+1];
1399 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
1400 uv, 0);
1401 message = Perl_form(aTHX_
1402 "%s: %s (overlong; instead use %s to represent"
1403 " U+%0*"UVXf")",
1404 malformed_text,
1405 _byte_dump_string(s0, send - s0),
1406 _byte_dump_string(tmpbuf, e - tmpbuf),
1407 ((uv < 256) ? 2 : 4), /* Field width of 2 for
1408 small code points */
1409 uv);
1410 }
1411 }
1412 }
1413 }
1414 else if (possible_problems & UTF8_GOT_SURROGATE) {
1415 possible_problems &= ~UTF8_GOT_SURROGATE;
1416
f9380377
KW
1417 if (flags & UTF8_WARN_SURROGATE) {
1418 *errors |= UTF8_GOT_SURROGATE;
1419
1420 if ( ! (flags & UTF8_CHECK_ONLY)
1421 && ckWARN_d(WARN_SURROGATE))
1422 {
2b5e7bc2
KW
1423 pack_warn = packWARN(WARN_SURROGATE);
1424
1425 /* These are the only errors that can occur with a
1426 * surrogate when the 'uv' isn't valid */
1427 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1428 message = Perl_form(aTHX_
1429 "UTF-16 surrogate (any UTF-8 sequence that"
1430 " starts with \"%s\" is for a surrogate)",
1431 _byte_dump_string(s0, curlen));
1432 }
1433 else {
1434 message = Perl_form(aTHX_
1435 "UTF-16 surrogate U+%04"UVXf"", uv);
1436 }
f9380377 1437 }
2b5e7bc2 1438 }
ba210ebe 1439
2b5e7bc2
KW
1440 if (flags & UTF8_DISALLOW_SURROGATE) {
1441 disallowed = TRUE;
f9380377 1442 *errors |= UTF8_GOT_SURROGATE;
2b5e7bc2
KW
1443 }
1444 }
1445 else if (possible_problems & UTF8_GOT_SUPER) {
1446 possible_problems &= ~UTF8_GOT_SUPER;
949cf498 1447
f9380377
KW
1448 if (flags & UTF8_WARN_SUPER) {
1449 *errors |= UTF8_GOT_SUPER;
1450
1451 if ( ! (flags & UTF8_CHECK_ONLY)
1452 && ckWARN_d(WARN_NON_UNICODE))
1453 {
2b5e7bc2
KW
1454 pack_warn = packWARN(WARN_NON_UNICODE);
1455
1456 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1457 message = Perl_form(aTHX_
1458 "Any UTF-8 sequence that starts with"
1459 " \"%s\" is for a non-Unicode code point,"
1460 " may not be portable",
1461 _byte_dump_string(s0, curlen));
1462 }
1463 else {
1464 message = Perl_form(aTHX_
1465 "Code point 0x%04"UVXf" is not"
1466 " Unicode, may not be portable",
1467 uv);
1468 }
f9380377 1469 }
2b5e7bc2 1470 }
ba210ebe 1471
2b5e7bc2
KW
1472 /* The maximum code point ever specified by a standard was
1473 * 2**31 - 1. Anything larger than that is a Perl extension
1474 * that very well may not be understood by other applications
1475 * (including earlier perl versions on EBCDIC platforms). We
1476 * test for these after the regular SUPER ones, and before
1477 * possibly bailing out, so that the slightly more dire warning
1478 * will override the regular one. */
1479 if ( (flags & (UTF8_WARN_ABOVE_31_BIT
1480 |UTF8_WARN_SUPER
1481 |UTF8_DISALLOW_ABOVE_31_BIT))
1482 && ( ( UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
1483 && UNLIKELY(is_utf8_cp_above_31_bits(
1484 adjusted_s0,
1485 adjusted_send)))
1486 || ( LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
1487 && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv)))))
1488 {
1489 if ( ! (flags & UTF8_CHECK_ONLY)
1490 && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
1491 && ckWARN_d(WARN_UTF8))
1492 {
1493 pack_warn = packWARN(WARN_UTF8);
1494
1495 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1496 message = Perl_form(aTHX_
1497 "Any UTF-8 sequence that starts with"
1498 " \"%s\" is for a non-Unicode code"
1499 " point, and is not portable",
1500 _byte_dump_string(s0, curlen));
1501 }
1502 else {
1503 message = Perl_form(aTHX_
1504 "Code point 0x%"UVXf" is not Unicode,"
1505 " and not portable",
1506 uv);
1507 }
1508 }
1509
f9380377
KW
1510 if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
1511 *errors |= UTF8_GOT_ABOVE_31_BIT;
1512
1513 if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
1514 disallowed = TRUE;
1515 }
2b5e7bc2
KW
1516 }
1517 }
eb83ed87 1518
2b5e7bc2 1519 if (flags & UTF8_DISALLOW_SUPER) {
f9380377 1520 *errors |= UTF8_GOT_SUPER;
2b5e7bc2
KW
1521 disallowed = TRUE;
1522 }
eb83ed87 1523
2b5e7bc2
KW
1524 /* The deprecated warning overrides any non-deprecated one. If
1525 * there are other problems, a deprecation message is not
1526 * really helpful, so don't bother to raise it in that case.
1527 * This also keeps the code from having to handle the case
1528 * where 'uv' is not valid. */
1529 if ( ! (orig_problems
1530 & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1531 && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1532 && ckWARN_d(WARN_DEPRECATED))
1533 {
1534 message = Perl_form(aTHX_ cp_above_legal_max,
1535 uv, MAX_NON_DEPRECATED_CP);
1536 pack_warn = packWARN(WARN_DEPRECATED);
1537 }
1538 }
1539 else if (possible_problems & UTF8_GOT_NONCHAR) {
1540 possible_problems &= ~UTF8_GOT_NONCHAR;
ba210ebe 1541
f9380377
KW
1542 if (flags & UTF8_WARN_NONCHAR) {
1543 *errors |= UTF8_GOT_NONCHAR;
1544
1545 if ( ! (flags & UTF8_CHECK_ONLY)
1546 && ckWARN_d(WARN_NONCHAR))
1547 {
2b5e7bc2
KW
1548 /* The code above should have guaranteed that we don't
1549 * get here with errors other than overlong */
1550 assert (! (orig_problems
1551 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
1552
1553 pack_warn = packWARN(WARN_NONCHAR);
1554 message = Perl_form(aTHX_ "Unicode non-character"
1555 " U+%04"UVXf" is not recommended"
1556 " for open interchange", uv);
f9380377 1557 }
2b5e7bc2 1558 }
5b311467 1559
2b5e7bc2
KW
1560 if (flags & UTF8_DISALLOW_NONCHAR) {
1561 disallowed = TRUE;
f9380377 1562 *errors |= UTF8_GOT_NONCHAR;
2b5e7bc2
KW
1563 }
1564 } /* End of looking through the possible flags */
1565
1566 /* Display the message (if any) for the problem being handled in
1567 * this iteration of the loop */
1568 if (message) {
1569 if (PL_op)
1570 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
1571 OP_DESC(PL_op));
1572 else
1573 Perl_warner(aTHX_ pack_warn, "%s", message);
1574 }
1575 } /* End of 'while (possible_problems) {' */
a0dbb045 1576
2b5e7bc2
KW
1577 /* Since there was a possible problem, the returned length may need to
1578 * be changed from the one stored at the beginning of this function.
1579 * Instead of trying to figure out if that's needed, just do it. */
1580 if (retlen) {
1581 *retlen = curlen;
1582 }
a0dbb045 1583
2b5e7bc2
KW
1584 if (disallowed) {
1585 if (flags & UTF8_CHECK_ONLY && retlen) {
1586 *retlen = ((STRLEN) -1);
1587 }
1588 return 0;
1589 }
eb83ed87 1590 }
ba210ebe 1591
2b5e7bc2 1592 return UNI_TO_NATIVE(uv);
a0ed51b3
LW
1593}
1594
8e84507e 1595/*
ec5f19d0
KW
1596=for apidoc utf8_to_uvchr_buf
1597
1598Returns the native code point of the first character in the string C<s> which
1599is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 1600C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 1601
524080c4
KW
1602If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1603enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 1604C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 1605(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 1606C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 1607the next possible position in C<s> that could begin a non-malformed character.
de69f3af 1608See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 1609returned.
ec5f19d0 1610
760c7c2f
KW
1611Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1612unless those are turned off.
1613
ec5f19d0 1614=cut
52be2536
KW
1615
1616Also implemented as a macro in utf8.h
1617
ec5f19d0
KW
1618*/
1619
1620
1621UV
1622Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1623{
ec5f19d0
KW
1624 assert(s < send);
1625
1626 return utf8n_to_uvchr(s, send - s, retlen,
1627 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1628}
1629
52be2536
KW
1630/* This is marked as deprecated
1631 *
ec5f19d0
KW
1632=for apidoc utf8_to_uvuni_buf
1633
de69f3af
KW
1634Only in very rare circumstances should code need to be dealing in Unicode
1635(as opposed to native) code points. In those few cases, use
1636C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
4f83cdcd
KW
1637
1638Returns the Unicode (not-native) code point of the first character in the
1639string C<s> which
ec5f19d0
KW
1640is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1641C<retlen> will be set to the length, in bytes, of that character.
1642
524080c4
KW
1643If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1644enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1645NULL) to -1. If those warnings are off, the computed value if well-defined (or
1646the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1647is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1648next possible position in C<s> that could begin a non-malformed character.
de69f3af 1649See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0 1650
760c7c2f
KW
1651Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1652unless those are turned off.
1653
ec5f19d0
KW
1654=cut
1655*/
1656
1657UV
1658Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1659{
1660 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1661
1662 assert(send > s);
1663
5962d97e
KW
1664 /* Call the low level routine, asking for checks */
1665 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
ec5f19d0
KW
1666}
1667
b76347f2 1668/*
87cea99e 1669=for apidoc utf8_length
b76347f2
JH
1670
1671Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
1672Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
1673up past C<e>, croaks.
b76347f2
JH
1674
1675=cut
1676*/
1677
1678STRLEN
35a4481c 1679Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
1680{
1681 STRLEN len = 0;
1682
7918f24d
NC
1683 PERL_ARGS_ASSERT_UTF8_LENGTH;
1684
8850bf83
JH
1685 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1686 * the bitops (especially ~) can create illegal UTF-8.
1687 * In other words: in Perl UTF-8 is not just for Unicode. */
1688
a3b680e6
AL
1689 if (e < s)
1690 goto warn_and_return;
b76347f2 1691 while (s < e) {
4cbf4130 1692 s += UTF8SKIP(s);
8e91ec7f
AV
1693 len++;
1694 }
1695
1696 if (e != s) {
1697 len--;
1698 warn_and_return:
9b387841
NC
1699 if (PL_op)
1700 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1701 "%s in %s", unees, OP_DESC(PL_op));
1702 else
61a12c31 1703 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
1704 }
1705
1706 return len;
1707}
1708
b06226ff 1709/*
fed3ba5d
NC
1710=for apidoc bytes_cmp_utf8
1711
a1433954 1712Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
1713sequence of characters (stored as UTF-8)
1714in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
1715equal, -1 or -2 if the first string is less than the second string, +1 or +2
1716if the first string is greater than the second string.
1717
1718-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
1719longer string. -2 or +2 is returned if
1720there was a difference between characters
fed3ba5d
NC
1721within the strings.
1722
1723=cut
1724*/
1725
1726int
1727Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1728{
1729 const U8 *const bend = b + blen;
1730 const U8 *const uend = u + ulen;
1731
1732 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
1733
1734 while (b < bend && u < uend) {
1735 U8 c = *u++;
1736 if (!UTF8_IS_INVARIANT(c)) {
1737 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1738 if (u < uend) {
1739 U8 c1 = *u++;
1740 if (UTF8_IS_CONTINUATION(c1)) {
a62b247b 1741 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
fed3ba5d 1742 } else {
2b5e7bc2 1743 /* diag_listed_as: Malformed UTF-8 character%s */
fed3ba5d 1744 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
806547a7 1745 "%s %s%s",
7cf8d05d 1746 unexpected_non_continuation_text(u - 1, 2, 1, 2),
806547a7
KW
1747 PL_op ? " in " : "",
1748 PL_op ? OP_DESC(PL_op) : "");
fed3ba5d
NC
1749 return -2;
1750 }
1751 } else {
1752 if (PL_op)
1753 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1754 "%s in %s", unees, OP_DESC(PL_op));
1755 else
61a12c31 1756 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
1757 return -2; /* Really want to return undef :-) */
1758 }
1759 } else {
1760 return -2;
1761 }
1762 }
1763 if (*b != c) {
1764 return *b < c ? -2 : +2;
1765 }
1766 ++b;
1767 }
1768
1769 if (b == bend && u == uend)
1770 return 0;
1771
1772 return b < bend ? +1 : -1;
1773}
1774
1775/*
87cea99e 1776=for apidoc utf8_to_bytes
6940069f 1777
2bbc8d55 1778Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954
KW
1779Unlike L</bytes_to_utf8>, this over-writes the original string, and
1780updates C<len> to contain the new length.
67e989fb 1781Returns zero on failure, setting C<len> to -1.
6940069f 1782
a1433954 1783If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 1784
6940069f
GS
1785=cut
1786*/
1787
1788U8 *
37607a96 1789Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 1790{
d4c19fe8
AL
1791 U8 * const save = s;
1792 U8 * const send = s + *len;
6940069f 1793 U8 *d;
246fae53 1794
7918f24d 1795 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 1796 PERL_UNUSED_CONTEXT;
7918f24d 1797
1e54db1a 1798 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 1799 while (s < send) {
d59937ca
KW
1800 if (! UTF8_IS_INVARIANT(*s)) {
1801 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1802 *len = ((STRLEN) -1);
1803 return 0;
1804 }
1805 s++;
dcad2880 1806 }
d59937ca 1807 s++;
246fae53 1808 }
dcad2880
JH
1809
1810 d = s = save;
6940069f 1811 while (s < send) {
80e0b38f
KW
1812 U8 c = *s++;
1813 if (! UTF8_IS_INVARIANT(c)) {
1814 /* Then it is two-byte encoded */
a62b247b 1815 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
80e0b38f
KW
1816 s++;
1817 }
1818 *d++ = c;
6940069f
GS
1819 }
1820 *d = '\0';
246fae53 1821 *len = d - save;
6940069f
GS
1822 return save;
1823}
1824
1825/*
87cea99e 1826=for apidoc bytes_from_utf8
f9a63242 1827
2bbc8d55 1828Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954 1829Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
1830the newly-created string, and updates C<len> to contain the new
1831length. Returns the original string if no conversion occurs, C<len>
72d33970 1832is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55 18330 if C<s> is converted or consisted entirely of characters that are invariant
4a4088c4 1834in UTF-8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 1835
37607a96
PK
1836=cut
1837*/
f9a63242
JH
1838
1839U8 *
e1ec3a88 1840Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 1841{
f9a63242 1842 U8 *d;
e1ec3a88
AL
1843 const U8 *start = s;
1844 const U8 *send;
f9a63242
JH
1845 I32 count = 0;
1846
7918f24d 1847 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 1848 PERL_UNUSED_CONTEXT;
f9a63242 1849 if (!*is_utf8)
73d840c0 1850 return (U8 *)start;
f9a63242 1851
1e54db1a 1852 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 1853 for (send = s + *len; s < send;) {
d59937ca
KW
1854 if (! UTF8_IS_INVARIANT(*s)) {
1855 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
73d840c0 1856 return (U8 *)start;
d59937ca
KW
1857 }
1858 count++;
1859 s++;
db42d148 1860 }
d59937ca 1861 s++;
f9a63242
JH
1862 }
1863
35da51f7 1864 *is_utf8 = FALSE;
f9a63242 1865
212542aa 1866 Newx(d, (*len) - count + 1, U8);
ef9edfd0 1867 s = start; start = d;
f9a63242
JH
1868 while (s < send) {
1869 U8 c = *s++;
1a91c45d 1870 if (! UTF8_IS_INVARIANT(c)) {
c4d5f83a 1871 /* Then it is two-byte encoded */
a62b247b 1872 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
1a91c45d 1873 s++;
c4d5f83a
NIS
1874 }
1875 *d++ = c;
f9a63242
JH
1876 }
1877 *d = '\0';
1878 *len = d - start;
73d840c0 1879 return (U8 *)start;
f9a63242
JH
1880}
1881
1882/*
87cea99e 1883=for apidoc bytes_to_utf8
6940069f 1884
ff97e5cf
KW
1885Converts a string C<s> of length C<len> bytes from the native encoding into
1886UTF-8.
6662521e 1887Returns a pointer to the newly-created string, and sets C<len> to
ff97e5cf 1888reflect the new length in bytes.
6940069f 1889
75200dff 1890A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
1891
1892If you want to convert to UTF-8 from encodings other than
1893the native (Latin1 or EBCDIC),
a1433954 1894see L</sv_recode_to_utf8>().
c9ada85f 1895
497711e7 1896=cut
6940069f
GS
1897*/
1898
c682ebef
FC
1899/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1900 likewise need duplication. */
1901
6940069f 1902U8*
35a4481c 1903Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 1904{
35a4481c 1905 const U8 * const send = s + (*len);
6940069f
GS
1906 U8 *d;
1907 U8 *dst;
7918f24d
NC
1908
1909 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 1910 PERL_UNUSED_CONTEXT;
6940069f 1911
212542aa 1912 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
1913 dst = d;
1914
1915 while (s < send) {
55d09dc8
KW
1916 append_utf8_from_native_byte(*s, &d);
1917 s++;
6940069f
GS
1918 }
1919 *d = '\0';
6662521e 1920 *len = d-dst;
6940069f
GS
1921 return dst;
1922}
1923
a0ed51b3 1924/*
dea0fc0b 1925 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
1926 *
1927 * Destination must be pre-extended to 3/2 source. Do not use in-place.
1928 * We optimize for native, for obvious reasons. */
1929
1930U8*
dea0fc0b 1931Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 1932{
dea0fc0b
JH
1933 U8* pend;
1934 U8* dstart = d;
1935
7918f24d
NC
1936 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1937
dea0fc0b 1938 if (bytelen & 1)
f5992bc4 1939 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
1940
1941 pend = p + bytelen;
1942
a0ed51b3 1943 while (p < pend) {
dea0fc0b
JH
1944 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1945 p += 2;
2d1545e5 1946 if (OFFUNI_IS_INVARIANT(uv)) {
56d37426 1947 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
1948 continue;
1949 }
56d37426
KW
1950 if (uv <= MAX_UTF8_TWO_BYTE) {
1951 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1952 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
1953 continue;
1954 }
46956fad
KW
1955#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1956#define LAST_HIGH_SURROGATE 0xDBFF
1957#define FIRST_LOW_SURROGATE 0xDC00
1958#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
e23c50db
KW
1959
1960 /* This assumes that most uses will be in the first Unicode plane, not
1961 * needing surrogates */
1962 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
1963 && uv <= UNICODE_SURROGATE_LAST))
1964 {
1965 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
1966 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1967 }
1968 else {
01ea242b 1969 UV low = (p[0] << 8) + p[1];
e23c50db
KW
1970 if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
1971 || UNLIKELY(low > LAST_LOW_SURROGATE))
1972 {
01ea242b 1973 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db
KW
1974 }
1975 p += 2;
46956fad
KW
1976 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
1977 + (low - FIRST_LOW_SURROGATE) + 0x10000;
01ea242b 1978 }
a0ed51b3 1979 }
56d37426
KW
1980#ifdef EBCDIC
1981 d = uvoffuni_to_utf8_flags(d, uv, 0);
1982#else
a0ed51b3 1983 if (uv < 0x10000) {
eb160463
GS
1984 *d++ = (U8)(( uv >> 12) | 0xe0);
1985 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1986 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1987 continue;
1988 }
1989 else {
eb160463
GS
1990 *d++ = (U8)(( uv >> 18) | 0xf0);
1991 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1992 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1993 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1994 continue;
1995 }
56d37426 1996#endif
a0ed51b3 1997 }
dea0fc0b 1998 *newlen = d - dstart;
a0ed51b3
LW
1999 return d;
2000}
2001
2002/* Note: this one is slightly destructive of the source. */
2003
2004U8*
dea0fc0b 2005Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
2006{
2007 U8* s = (U8*)p;
d4c19fe8 2008 U8* const send = s + bytelen;
7918f24d
NC
2009
2010 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2011
e0ea5e2d
NC
2012 if (bytelen & 1)
2013 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
2014 (UV)bytelen);
2015
a0ed51b3 2016 while (s < send) {
d4c19fe8 2017 const U8 tmp = s[0];
a0ed51b3
LW
2018 s[0] = s[1];
2019 s[1] = tmp;
2020 s += 2;
2021 }
dea0fc0b 2022 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
2023}
2024
922e8cb4
KW
2025bool
2026Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2027{
2028 U8 tmpbuf[UTF8_MAXBYTES+1];
2029 uvchr_to_utf8(tmpbuf, c);
2030 return _is_utf8_FOO(classnum, tmpbuf);
2031}
2032
f9ae8fb6
JD
2033/* Internal function so we can deprecate the external one, and call
2034 this one from other deprecated functions in this file */
2035
f2645549
KW
2036bool
2037Perl__is_utf8_idstart(pTHX_ const U8 *p)
61b19385 2038{
f2645549 2039 PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
61b19385
KW
2040
2041 if (*p == '_')
2042 return TRUE;
f25ce844 2043 return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
61b19385
KW
2044}
2045
5092f92a 2046bool
eba68aa0
KW
2047Perl__is_uni_perl_idcont(pTHX_ UV c)
2048{
2049 U8 tmpbuf[UTF8_MAXBYTES+1];
2050 uvchr_to_utf8(tmpbuf, c);
2051 return _is_utf8_perl_idcont(tmpbuf);
2052}
2053
2054bool
f91dcd13
KW
2055Perl__is_uni_perl_idstart(pTHX_ UV c)
2056{
2057 U8 tmpbuf[UTF8_MAXBYTES+1];
2058 uvchr_to_utf8(tmpbuf, c);
2059 return _is_utf8_perl_idstart(tmpbuf);
2060}
2061
3a4c58c9
KW
2062UV
2063Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
2064{
2065 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2066 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
2067 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2068 * either "SS" or "Ss". Which one to use is passed into the routine in
2069 * 'S_or_s' to avoid a test */
2070
2071 UV converted = toUPPER_LATIN1_MOD(c);
2072
2073 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2074
2075 assert(S_or_s == 'S' || S_or_s == 's');
2076
6f2d5cbc 2077 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 2078 characters in this range */
3a4c58c9
KW
2079 *p = (U8) converted;
2080 *lenp = 1;
2081 return converted;
2082 }
2083
2084 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2085 * which it maps to one of them, so as to only have to have one check for
2086 * it in the main case */
2087 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2088 switch (c) {
2089 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2090 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2091 break;
2092 case MICRO_SIGN:
2093 converted = GREEK_CAPITAL_LETTER_MU;
2094 break;
79e064b9
KW
2095#if UNICODE_MAJOR_VERSION > 2 \
2096 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2097 && UNICODE_DOT_DOT_VERSION >= 8)
3a4c58c9
KW
2098 case LATIN_SMALL_LETTER_SHARP_S:
2099 *(p)++ = 'S';
2100 *p = S_or_s;
2101 *lenp = 2;
2102 return 'S';
79e064b9 2103#endif
3a4c58c9
KW
2104 default:
2105 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
e5964223 2106 NOT_REACHED; /* NOTREACHED */
3a4c58c9
KW
2107 }
2108 }
2109
2110 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2111 *p = UTF8_TWO_BYTE_LO(converted);
2112 *lenp = 2;
2113
2114 return converted;
2115}
2116
50bda2c3
KW
2117/* Call the function to convert a UTF-8 encoded character to the specified case.
2118 * Note that there may be more than one character in the result.
2119 * INP is a pointer to the first byte of the input character
2120 * OUTP will be set to the first byte of the string of changed characters. It
2121 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2122 * LENP will be set to the length in bytes of the string of changed characters
2123 *
2124 * The functions return the ordinal of the first character in the string of OUTP */
b9992569
KW
2125#define CALL_UPPER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
2126#define CALL_TITLE_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
2127#define CALL_LOWER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
50bda2c3 2128
b9992569
KW
2129/* This additionally has the input parameter 'specials', which if non-zero will
2130 * cause this to use the specials hash for folding (meaning get full case
50bda2c3 2131 * folding); otherwise, when zero, this implies a simple case fold */
b9992569 2132#define CALL_FOLD_CASE(uv, s, d, lenp, specials) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
c3fd2246 2133
84afefe6
JH
2134UV
2135Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2136{
a1433954
KW
2137 /* Convert the Unicode character whose ordinal is <c> to its uppercase
2138 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2139 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
2140 * the changed version may be longer than the original character.
2141 *
2142 * The ordinal of the first character of the changed version is returned
2143 * (but note, as explained above, that there may be more.) */
2144
7918f24d
NC
2145 PERL_ARGS_ASSERT_TO_UNI_UPPER;
2146
3a4c58c9
KW
2147 if (c < 256) {
2148 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2149 }
2150
0ebc6274 2151 uvchr_to_utf8(p, c);
b9992569 2152 return CALL_UPPER_CASE(c, p, p, lenp);
a0ed51b3
LW
2153}
2154
84afefe6
JH
2155UV
2156Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2157{
7918f24d
NC
2158 PERL_ARGS_ASSERT_TO_UNI_TITLE;
2159
3a4c58c9
KW
2160 if (c < 256) {
2161 return _to_upper_title_latin1((U8) c, p, lenp, 's');
2162 }
2163
0ebc6274 2164 uvchr_to_utf8(p, c);
b9992569 2165 return CALL_TITLE_CASE(c, p, p, lenp);
a0ed51b3
LW
2166}
2167
afc16117 2168STATIC U8
81611534 2169S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp)
afc16117
KW
2170{
2171 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2172 * those, converting the result to UTF-8. Since the result is always just
a1433954 2173 * one character, we allow <p> to be NULL */
afc16117
KW
2174
2175 U8 converted = toLOWER_LATIN1(c);
2176
2177 if (p != NULL) {
6f2d5cbc 2178 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
2179 *p = converted;
2180 *lenp = 1;
2181 }
2182 else {
430c9760
KW
2183 /* Result is known to always be < 256, so can use the EIGHT_BIT
2184 * macros */
2185 *p = UTF8_EIGHT_BIT_HI(converted);
2186 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
afc16117
KW
2187 *lenp = 2;
2188 }
2189 }
2190 return converted;
2191}
2192
84afefe6
JH
2193UV
2194Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2195{
7918f24d
NC
2196 PERL_ARGS_ASSERT_TO_UNI_LOWER;
2197
afc16117
KW
2198 if (c < 256) {
2199 return to_lower_latin1((U8) c, p, lenp);
bca00c02
KW
2200 }
2201
afc16117 2202 uvchr_to_utf8(p, c);
b9992569 2203 return CALL_LOWER_CASE(c, p, p, lenp);
a0ed51b3
LW
2204}
2205
84afefe6 2206UV
51910141 2207Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 2208{
51910141 2209 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 2210 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 2211 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
2212 *
2213 * Not to be used for locale folds
51910141 2214 */
f673fad4 2215
a1dde8de
KW
2216 UV converted;
2217
2218 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
81611534 2219 PERL_UNUSED_CONTEXT;
a1dde8de 2220
1ca267a5
KW
2221 assert (! (flags & FOLD_FLAGS_LOCALE));
2222
659a7c2d 2223 if (UNLIKELY(c == MICRO_SIGN)) {
a1dde8de
KW
2224 converted = GREEK_SMALL_LETTER_MU;
2225 }
9b63e895
KW
2226#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
2227 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
2228 || UNICODE_DOT_DOT_VERSION > 0)
659a7c2d
KW
2229 else if ( (flags & FOLD_FLAGS_FULL)
2230 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
2231 {
1ca267a5
KW
2232 /* If can't cross 127/128 boundary, can't return "ss"; instead return
2233 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
2234 * under those circumstances. */
2235 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
2236 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2237 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2238 p, *lenp, U8);
2239 return LATIN_SMALL_LETTER_LONG_S;
2240 }
2241 else {
4f489194
KW
2242 *(p)++ = 's';
2243 *p = 's';
2244 *lenp = 2;
2245 return 's';
1ca267a5 2246 }
a1dde8de 2247 }
9b63e895 2248#endif
a1dde8de
KW
2249 else { /* In this range the fold of all other characters is their lower
2250 case */
2251 converted = toLOWER_LATIN1(c);
2252 }
2253
6f2d5cbc 2254 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
2255 *p = (U8) converted;
2256 *lenp = 1;
2257 }
2258 else {
2259 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2260 *p = UTF8_TWO_BYTE_LO(converted);
2261 *lenp = 2;
2262 }
2263
2264 return converted;
2265}
2266
2267UV
31f05a37 2268Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 2269{
4b593389 2270
a0270393
KW
2271 /* Not currently externally documented, and subject to change
2272 * <flags> bits meanings:
2273 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
2274 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2275 * locale are to be used.
a0270393
KW
2276 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2277 */
4b593389 2278
36bb2ab6 2279 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 2280
780fcc9f
KW
2281 if (flags & FOLD_FLAGS_LOCALE) {
2282 /* Treat a UTF-8 locale as not being in locale at all */
2283 if (IN_UTF8_CTYPE_LOCALE) {
2284 flags &= ~FOLD_FLAGS_LOCALE;
2285 }
2286 else {
2287 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
e7b7ac46 2288 goto needs_full_generality;
780fcc9f 2289 }
31f05a37
KW
2290 }
2291
a1dde8de 2292 if (c < 256) {
e7b7ac46 2293 return _to_fold_latin1((U8) c, p, lenp,
31f05a37 2294 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
2295 }
2296
2f306ab9 2297 /* Here, above 255. If no special needs, just use the macro */
a0270393
KW
2298 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
2299 uvchr_to_utf8(p, c);
b9992569 2300 return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
a0270393
KW
2301 }
2302 else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
2303 the special flags. */
2304 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
2305
2306 needs_full_generality:
a0270393 2307 uvchr_to_utf8(utf8_c, c);
445bf929 2308 return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
a0270393 2309 }
84afefe6
JH
2310}
2311
26483009 2312PERL_STATIC_INLINE bool
5141f98e 2313S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
f25ce844 2314 const char *const swashname, SV* const invlist)
bde6a22d 2315{
ea317ccb
KW
2316 /* returns a boolean giving whether or not the UTF8-encoded character that
2317 * starts at <p> is in the swash indicated by <swashname>. <swash>
2318 * contains a pointer to where the swash indicated by <swashname>
2319 * is to be stored; which this routine will do, so that future calls will
f25ce844
KW
2320 * look at <*swash> and only generate a swash if it is not null. <invlist>
2321 * is NULL or an inversion list that defines the swash. If not null, it
2322 * saves time during initialization of the swash.
ea317ccb
KW
2323 *
2324 * Note that it is assumed that the buffer length of <p> is enough to
2325 * contain all the bytes that comprise the character. Thus, <*p> should
2326 * have been checked before this call for mal-formedness enough to assure
2327 * that. */
2328
7918f24d
NC
2329 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2330
492a624f 2331 /* The API should have included a length for the UTF-8 character in <p>,
28123549 2332 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
2333 * as far as there being enough bytes available in it to accommodate the
2334 * character without reading beyond the end, and pass that number on to the
2335 * validating routine */
6302f837 2336 if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
28123549
KW
2337 if (ckWARN_d(WARN_UTF8)) {
2338 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
9816f121 2339 "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
28123549
KW
2340 if (ckWARN(WARN_UTF8)) { /* This will output details as to the
2341 what the malformation is */
2342 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
2343 }
2344 }
2345 return FALSE;
2346 }
87367d5f
KW
2347 if (!*swash) {
2348 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
f25ce844
KW
2349 *swash = _core_swash_init("utf8",
2350
2351 /* Only use the name if there is no inversion
2352 * list; otherwise will go out to disk */
2353 (invlist) ? "" : swashname,
2354
2355 &PL_sv_undef, 1, 0, invlist, &flags);
87367d5f 2356 }
28123549 2357
bde6a22d
NC
2358 return swash_fetch(*swash, p, TRUE) != 0;
2359}
2360
2361bool
922e8cb4
KW
2362Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
2363{
922e8cb4
KW
2364 PERL_ARGS_ASSERT__IS_UTF8_FOO;
2365
2366 assert(classnum < _FIRST_NON_SWASH_CC);
2367
f25ce844
KW
2368 return is_utf8_common(p,
2369 &PL_utf8_swash_ptrs[classnum],
2370 swash_property_names[classnum],
2371 PL_XPosix_ptrs[classnum]);
922e8cb4
KW
2372}
2373
2374bool
f2645549 2375Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
a0ed51b3 2376{
f2645549 2377 SV* invlist = NULL;
7918f24d 2378
f2645549 2379 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
7918f24d 2380
f2645549
KW
2381 if (! PL_utf8_perl_idstart) {
2382 invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
2383 }
60071a22 2384 return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
82686b01
JH
2385}
2386
2387bool
f2645549 2388Perl__is_utf8_xidstart(pTHX_ const U8 *p)
c11ff943 2389{
f2645549 2390 PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
c11ff943
KW
2391
2392 if (*p == '_')
2393 return TRUE;
f25ce844 2394 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
c11ff943
KW
2395}
2396
2397bool
eba68aa0
KW
2398Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
2399{
b24b43f7 2400 SV* invlist = NULL;
eba68aa0
KW
2401
2402 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
2403
b24b43f7
KW
2404 if (! PL_utf8_perl_idcont) {
2405 invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
2406 }
60071a22 2407 return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
eba68aa0
KW
2408}
2409
eba68aa0 2410bool
f2645549 2411Perl__is_utf8_idcont(pTHX_ const U8 *p)
82686b01 2412{
f2645549 2413 PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
7918f24d 2414
f25ce844 2415 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
a0ed51b3
LW
2416}
2417
2418bool
f2645549 2419Perl__is_utf8_xidcont(pTHX_ const U8 *p)
c11ff943 2420{
f2645549 2421 PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
c11ff943 2422
f25ce844 2423 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
c11ff943
KW
2424}
2425
2426bool
7dbf68d2
KW
2427Perl__is_utf8_mark(pTHX_ const U8 *p)
2428{
7dbf68d2
KW
2429 PERL_ARGS_ASSERT__IS_UTF8_MARK;
2430
f25ce844 2431 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
7dbf68d2
KW
2432}
2433
6b5c0936 2434/*
87cea99e 2435=for apidoc to_utf8_case
6b5c0936 2436
9da1e7cb
KW
2437Instead use the appropriate one of L</toUPPER_utf8>,
2438L</toTITLE_utf8>,
2439L</toLOWER_utf8>,
2440or L</toFOLD_utf8>.
2441
6fae5207 2442C<p> contains the pointer to the UTF-8 string encoding
a1433954
KW
2443the character that is being converted. This routine assumes that the character
2444at C<p> is well-formed.
6b5c0936 2445
6fae5207
KW
2446C<ustrp> is a pointer to the character buffer to put the
2447conversion result to. C<lenp> is a pointer to the length
6b5c0936
JH
2448of the result.
2449
6fae5207 2450C<swashp> is a pointer to the swash to use.
6b5c0936 2451
a1433954 2452Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
796b6530 2453and loaded by C<SWASHNEW>, using F<lib/utf8_heavy.pl>. C<special> (usually,
0134edef 2454but not always, a multicharacter mapping), is tried first.
6b5c0936 2455
4a8240a3
KW
2456C<special> is a string, normally C<NULL> or C<"">. C<NULL> means to not use
2457any special mappings; C<""> means to use the special mappings. Values other
2458than these two are treated as the name of the hash containing the special
2459mappings, like C<"utf8::ToSpecLower">.
6b5c0936 2460
796b6530
KW
2461C<normal> is a string like C<"ToLower"> which means the swash
2462C<%utf8::ToLower>.
0134edef 2463
760c7c2f
KW
2464Code points above the platform's C<IV_MAX> will raise a deprecation warning,
2465unless those are turned off.
2466
0134edef 2467=cut */
6b5c0936 2468
2104c8d9 2469UV
9a957fbc
AL
2470Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
2471 SV **swashp, const char *normal, const char *special)
a0ed51b3 2472{
b9992569
KW
2473 PERL_ARGS_ASSERT_TO_UTF8_CASE;
2474
2475 return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special);
2476}
2477
2478 /* change namve uv1 to 'from' */
6a4a25f4 2479STATIC UV
b9992569
KW
2480S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
2481 SV **swashp, const char *normal, const char *special)
2482{
0134edef 2483 STRLEN len = 0;
7918f24d 2484
b9992569 2485 PERL_ARGS_ASSERT__TO_UTF8_CASE;
7918f24d 2486
36eaa811
KW
2487 /* For code points that don't change case, we already know that the output
2488 * of this function is the unchanged input, so we can skip doing look-ups
2489 * for them. Unfortunately the case-changing code points are scattered
2490 * around. But there are some long consecutive ranges where there are no
2491 * case changing code points. By adding tests, we can eliminate the lookup
2492 * for all the ones in such ranges. This is currently done here only for
2493 * just a few cases where the scripts are in common use in modern commerce
2494 * (and scripts adjacent to those which can be included without additional
2495 * tests). */
2496
2497 if (uv1 >= 0x0590) {
2498 /* This keeps from needing further processing the code points most
2499 * likely to be used in the following non-cased scripts: Hebrew,
2500 * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
2501 * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
2502 * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
2503 if (uv1 < 0x10A0) {
2504 goto cases_to_self;
2505 }
2506
2507 /* The following largish code point ranges also don't have case
2508 * changes, but khw didn't think they warranted extra tests to speed
2509 * them up (which would slightly slow down everything else above them):
2510 * 1100..139F Hangul Jamo, Ethiopic
2511 * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
2512 * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
2513 * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
2514 * Combining Diacritical Marks Extended, Balinese,
2515 * Sundanese, Batak, Lepcha, Ol Chiki
2516 * 2000..206F General Punctuation
2517 */
2518
2519 if (uv1 >= 0x2D30) {
2520
2521 /* This keeps the from needing further processing the code points
2522 * most likely to be used in the following non-cased major scripts:
2523 * CJK, Katakana, Hiragana, plus some less-likely scripts.
2524 *
2525 * (0x2D30 above might have to be changed to 2F00 in the unlikely
2526 * event that Unicode eventually allocates the unused block as of
2527 * v8.0 2FE0..2FEF to code points that are cased. khw has verified
2528 * that the test suite will start having failures to alert you
2529 * should that happen) */
2530 if (uv1 < 0xA640) {
2531 goto cases_to_self;
2532 }
2533
2534 if (uv1 >= 0xAC00) {
2535 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
5af9bc97
KW
2536 if (ckWARN_d(WARN_SURROGATE)) {
2537 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2538 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
2539 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
2540 }
2541 goto cases_to_self;
2542 }
36eaa811
KW
2543
2544 /* AC00..FAFF Catches Hangul syllables and private use, plus
2545 * some others */
2546 if (uv1 < 0xFB00) {
2547 goto cases_to_self;
2548
2549 }
2550
5af9bc97
KW
2551 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
2552 if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
2553 && ckWARN_d(WARN_DEPRECATED))
2554 {
2555 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2556 cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
2557 }
2558 if (ckWARN_d(WARN_NON_UNICODE)) {
2559 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2560 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2561 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
2562 }
2563 goto cases_to_self;
2564 }
3bfc1e70
KW
2565#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
2566 if (UNLIKELY(uv1
2567 > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
2568 {
2569
2570 /* As of this writing, this means we avoid swash creation
2571 * for anything beyond low Plane 1 */
2572 goto cases_to_self;
2573 }
2574#endif
36eaa811
KW
2575 }
2576 }
9ae3ac1a 2577
36eaa811
KW
2578 /* Note that non-characters are perfectly legal, so no warning should
2579 * be given. There are so few of them, that it isn't worth the extra
2580 * tests to avoid swash creation */
9ae3ac1a
KW
2581 }
2582
0134edef 2583 if (!*swashp) /* load on-demand */
5ab9d2ef 2584 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
0134edef 2585
a6f87d8c 2586 if (special) {
0134edef 2587 /* It might be "special" (sometimes, but not always,
2a37f04d 2588 * a multicharacter mapping) */
4a8240a3 2589 HV *hv = NULL;
b08cf34e
JH
2590 SV **svp;
2591
4a8240a3
KW
2592 /* If passed in the specials name, use that; otherwise use any
2593 * given in the swash */
2594 if (*special != '\0') {
2595 hv = get_hv(special, 0);
2596 }
2597 else {
2598 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
2599 if (svp) {
2600 hv = MUTABLE_HV(SvRV(*svp));
2601 }
2602 }
2603
176fe009 2604 if (hv
5f560d8a 2605 && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
176fe009
KW
2606 && (*svp))
2607 {
cfd0369c 2608 const char *s;
47654450 2609
cfd0369c 2610 s = SvPV_const(*svp, len);
47654450 2611 if (len == 1)
f4cd282c 2612 /* EIGHTBIT */
c80e42f3 2613 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2a37f04d 2614 else {
d2dcd0fb 2615 Copy(s, ustrp, len, U8);
29e98929 2616 }
983ffd37 2617 }
0134edef
JH
2618 }
2619
2620 if (!len && *swashp) {
4a4088c4 2621 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
d4c19fe8 2622
0134edef
JH
2623 if (uv2) {
2624 /* It was "normal" (a single character mapping). */
f4cd282c 2625 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2a37f04d
JH
2626 }
2627 }
1feea2c7 2628
cbe07460
KW
2629 if (len) {
2630 if (lenp) {
2631 *lenp = len;
2632 }
2633 return valid_utf8_to_uvchr(ustrp, 0);
2634 }
2635
2636 /* Here, there was no mapping defined, which means that the code point maps
2637 * to itself. Return the inputs */
e24dfe9c 2638 cases_to_self:
bfdf22ec 2639 len = UTF8SKIP(p);
ca9fab46
KW
2640 if (p != ustrp) { /* Don't copy onto itself */
2641 Copy(p, ustrp, len, U8);
2642 }
0134edef 2643
2a37f04d
JH
2644 if (lenp)
2645 *lenp = len;
2646
f4cd282c 2647 return uv1;
cbe07460 2648
a0ed51b3
LW
2649}
2650
051a06d4 2651STATIC UV
357aadde 2652S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
051a06d4 2653{
4a4088c4 2654 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
2655 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
2656 * result contains a character that crosses the 255/256 boundary, disallow
2657 * the change, and return the original code point. See L<perlfunc/lc> for
2658 * why;
051a06d4 2659 *
a1433954
KW
2660 * p points to the original string whose case was changed; assumed
2661 * by this routine to be well-formed
051a06d4
KW
2662 * result the code point of the first character in the changed-case string
2663 * ustrp points to the changed-case string (<result> represents its first char)
2664 * lenp points to the length of <ustrp> */
2665
2666 UV original; /* To store the first code point of <p> */
2667
2668 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2669
a4f12ed7 2670 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
2671
2672 /* We know immediately if the first character in the string crosses the
2673 * boundary, so can skip */
2674 if (result > 255) {
2675
2676 /* Look at every character in the result; if any cross the
2677 * boundary, the whole thing is disallowed */
2678 U8* s = ustrp + UTF8SKIP(ustrp);
2679 U8* e = ustrp + *lenp;
2680 while (s < e) {
a4f12ed7 2681 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
2682 goto bad_crossing;
2683 }
2684 s += UTF8SKIP(s);
2685 }
2686
613abc6d
KW
2687 /* Here, no characters crossed, result is ok as-is, but we warn. */
2688 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
051a06d4
KW
2689 return result;
2690 }
2691
7b52d656 2692 bad_crossing:
051a06d4
KW
2693
2694 /* Failed, have to return the original */
4b88fb76 2695 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
2696
2697 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2698 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2699 "Can't do %s(\"\\x{%"UVXf"}\") on non-UTF-8 locale; "
2700 "resolved to \"\\x{%"UVXf"}\".",
357aadde 2701 OP_DESC(PL_op),
ab0b796c
KW
2702 original,
2703 original);
051a06d4
KW
2704 Copy(p, ustrp, *lenp, char);
2705 return original;
2706}
2707
d3e79532 2708/*
87cea99e 2709=for apidoc to_utf8_upper
d3e79532 2710
1f607577 2711Instead use L</toUPPER_utf8>.
a1433954 2712
d3e79532
JH
2713=cut */
2714
051a06d4 2715/* Not currently externally documented, and subject to change:
31f05a37
KW
2716 * <flags> is set iff iff the rules from the current underlying locale are to
2717 * be used. */
051a06d4 2718
2104c8d9 2719UV
31f05a37 2720Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2721{
051a06d4
KW
2722 UV result;
2723
2724 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 2725
780fcc9f
KW
2726 if (flags) {
2727 /* Treat a UTF-8 locale as not being in locale at all */
2728 if (IN_UTF8_CTYPE_LOCALE) {
2729 flags = FALSE;
2730 }
2731 else {
2732 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2733 }
31f05a37
KW
2734 }
2735
3a4c58c9 2736 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2737 if (flags) {
2738 result = toUPPER_LC(*p);
2739 }
2740 else {
81c6c7ce 2741 return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
051a06d4 2742 }
3a4c58c9
KW
2743 }
2744 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2745 if (flags) {
a62b247b 2746 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2747 result = toUPPER_LC(c);
051a06d4
KW
2748 }
2749 else {
a62b247b 2750 return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2751 ustrp, lenp, 'S');
051a06d4
KW
2752 }
2753 }
4a4088c4 2754 else { /* UTF-8, ord above 255 */
b9992569 2755 result = CALL_UPPER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2756
2757 if (flags) {
357aadde 2758 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4
KW
2759 }
2760 return result;
2761 }
2762
4a4088c4 2763 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2764 if (UTF8_IS_INVARIANT(result)) {
2765 *ustrp = (U8) result;
2766 *lenp = 1;
2767 }
2768 else {
62cb07ea
KW
2769 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2770 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2771 *lenp = 2;
3a4c58c9 2772 }
baa60164 2773
051a06d4 2774 return result;
983ffd37 2775}
a0ed51b3 2776
d3e79532 2777/*
87cea99e 2778=for apidoc to_utf8_title
d3e79532 2779
1f607577 2780Instead use L</toTITLE_utf8>.
a1433954 2781
d3e79532
JH
2782=cut */
2783
051a06d4 2784/* Not currently externally documented, and subject to change:
31f05a37
KW
2785 * <flags> is set iff the rules from the current underlying locale are to be
2786 * used. Since titlecase is not defined in POSIX, for other than a
2787 * UTF-8 locale, uppercase is used instead for code points < 256.
445bf929 2788 */
051a06d4 2789
983ffd37 2790UV
31f05a37 2791Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
983ffd37 2792{
051a06d4
KW
2793 UV result;
2794
2795 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 2796
780fcc9f
KW
2797 if (flags) {
2798 /* Treat a UTF-8 locale as not being in locale at all */
2799 if (IN_UTF8_CTYPE_LOCALE) {
2800 flags = FALSE;
2801 }
2802 else {
2803 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2804 }
31f05a37
KW
2805 }
2806
3a4c58c9 2807 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2808 if (flags) {
2809 result = toUPPER_LC(*p);
2810 }
2811 else {
81c6c7ce 2812 return _to_upper_title_latin1(*p, ustrp, lenp, 's');
051a06d4 2813 }
3a4c58c9
KW
2814 }
2815 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2816 if (flags) {
a62b247b 2817 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2818 result = toUPPER_LC(c);
051a06d4
KW
2819 }
2820 else {
a62b247b 2821 return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2822 ustrp, lenp, 's');
051a06d4
KW
2823 }
2824 }
4a4088c4 2825 else { /* UTF-8, ord above 255 */
b9992569 2826 result = CALL_TITLE_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2827
2828 if (flags) {
357aadde 2829 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4
KW
2830 }
2831 return result;
2832 }
2833
4a4088c4 2834 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2835 if (UTF8_IS_INVARIANT(result)) {
2836 *ustrp = (U8) result;
2837 *lenp = 1;
2838 }
2839 else {
62cb07ea
KW
2840 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2841 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2842 *lenp = 2;
3a4c58c9
KW
2843 }
2844
051a06d4 2845 return result;
a0ed51b3
LW
2846}
2847
d3e79532 2848/*
87cea99e 2849=for apidoc to_utf8_lower
d3e79532 2850
1f607577 2851Instead use L</toLOWER_utf8>.
a1433954 2852
d3e79532
JH
2853=cut */
2854
051a06d4 2855/* Not currently externally documented, and subject to change:
31f05a37
KW
2856 * <flags> is set iff iff the rules from the current underlying locale are to
2857 * be used.
2858 */
051a06d4 2859
2104c8d9 2860UV
31f05a37 2861Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2862{
051a06d4
KW
2863 UV result;
2864
051a06d4 2865 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 2866
780fcc9f
KW
2867 if (flags) {
2868 /* Treat a UTF-8 locale as not being in locale at all */
2869 if (IN_UTF8_CTYPE_LOCALE) {
2870 flags = FALSE;
2871 }
2872 else {
2873 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2874 }
31f05a37
KW
2875 }
2876
968c5e6a 2877 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2878 if (flags) {
2879 result = toLOWER_LC(*p);
2880 }
2881 else {
81c6c7ce 2882 return to_lower_latin1(*p, ustrp, lenp);
051a06d4 2883 }
968c5e6a
KW
2884 }
2885 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2886 if (flags) {
a62b247b 2887 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2888 result = toLOWER_LC(c);
051a06d4
KW
2889 }
2890 else {
a62b247b 2891 return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2892 ustrp, lenp);
051a06d4 2893 }
968c5e6a 2894 }
4a4088c4 2895 else { /* UTF-8, ord above 255 */
b9992569 2896 result = CALL_LOWER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2897
2898 if (flags) {
357aadde 2899 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 2900 }
968c5e6a 2901
051a06d4
KW
2902 return result;
2903 }
2904
4a4088c4 2905 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2906 if (UTF8_IS_INVARIANT(result)) {
2907 *ustrp = (U8) result;
2908 *lenp = 1;
2909 }
2910 else {
62cb07ea
KW
2911 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2912 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
2913 *lenp = 2;
2914 }
2915
051a06d4 2916 return result;
b4e400f9
JH
2917}
2918
d3e79532 2919/*
87cea99e 2920=for apidoc to_utf8_fold
d3e79532 2921
1f607577 2922Instead use L</toFOLD_utf8>.
a1433954 2923
d3e79532
JH
2924=cut */
2925
051a06d4
KW
2926/* Not currently externally documented, and subject to change,
2927 * in <flags>
31f05a37
KW
2928 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2929 * locale are to be used.
051a06d4
KW
2930 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
2931 * otherwise simple folds
a0270393
KW
2932 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2933 * prohibited
445bf929 2934 */
36bb2ab6 2935
b4e400f9 2936UV
445bf929 2937Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
b4e400f9 2938{
051a06d4
KW
2939 UV result;
2940
36bb2ab6 2941 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 2942
a0270393
KW
2943 /* These are mutually exclusive */
2944 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2945
50ba90ff
KW
2946 assert(p != ustrp); /* Otherwise overwrites */
2947
780fcc9f
KW
2948 if (flags & FOLD_FLAGS_LOCALE) {
2949 /* Treat a UTF-8 locale as not being in locale at all */
2950 if (IN_UTF8_CTYPE_LOCALE) {
2951 flags &= ~FOLD_FLAGS_LOCALE;
2952 }
2953 else {
2954 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2955 }
31f05a37
KW
2956 }
2957
a1dde8de 2958 if (UTF8_IS_INVARIANT(*p)) {
051a06d4 2959 if (flags & FOLD_FLAGS_LOCALE) {
d22b930b 2960 result = toFOLD_LC(*p);
051a06d4
KW
2961 }
2962 else {
81c6c7ce 2963 return _to_fold_latin1(*p, ustrp, lenp,
1ca267a5 2964 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2965 }
a1dde8de
KW
2966 }
2967 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2968 if (flags & FOLD_FLAGS_LOCALE) {
a62b247b 2969 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2970 result = toFOLD_LC(c);
051a06d4
KW
2971 }
2972 else {
a62b247b 2973 return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
51910141 2974 ustrp, lenp,
1ca267a5 2975 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2976 }
a1dde8de 2977 }
4a4088c4 2978 else { /* UTF-8, ord above 255 */
b9992569 2979 result = CALL_FOLD_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
a1dde8de 2980
1ca267a5
KW
2981 if (flags & FOLD_FLAGS_LOCALE) {
2982
76f2ffcd 2983# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
0766489e
KW
2984 const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
2985
2986# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
2987# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
76f2ffcd
KW
2988
2989 const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
76f2ffcd 2990
538e84ed
KW
2991 /* Special case these two characters, as what normally gets
2992 * returned under locale doesn't work */
76f2ffcd
KW
2993 if (UTF8SKIP(p) == cap_sharp_s_len
2994 && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
1ca267a5 2995 {
ab0b796c
KW
2996 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2997 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2998 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
2999 "resolved to \"\\x{17F}\\x{17F}\".");
1ca267a5
KW
3000 goto return_long_s;
3001 }
0766489e
KW
3002 else
3003#endif
3004 if (UTF8SKIP(p) == long_s_t_len
76f2ffcd 3005 && memEQ((char *) p, LONG_S_T, long_s_t_len))
9fc2026f 3006 {
ab0b796c
KW
3007 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3008 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3009 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3010 "resolved to \"\\x{FB06}\".");
9fc2026f
KW
3011 goto return_ligature_st;
3012 }
74894415
KW
3013
3014#if UNICODE_MAJOR_VERSION == 3 \
3015 && UNICODE_DOT_VERSION == 0 \
3016 && UNICODE_DOT_DOT_VERSION == 1
3017# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3018
3019 /* And special case this on this Unicode version only, for the same
3020 * reaons the other two are special cased. They would cross the
3021 * 255/256 boundary which is forbidden under /l, and so the code
3022 * wouldn't catch that they are equivalent (which they are only in
3023 * this release) */
3024 else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
3025 && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
3026 {
3027 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3028 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3029 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3030 "resolved to \"\\x{0131}\".");
3031 goto return_dotless_i;
3032 }
3033#endif
3034
357aadde 3035 return check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 3036 }
a0270393
KW
3037 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3038 return result;
3039 }
3040 else {
4a4088c4 3041 /* This is called when changing the case of a UTF-8-encoded
9fc2026f
KW
3042 * character above the ASCII range, and the result should not
3043 * contain an ASCII character. */
a0270393
KW
3044
3045 UV original; /* To store the first code point of <p> */
3046
3047 /* Look at every character in the result; if any cross the
3048 * boundary, the whole thing is disallowed */
3049 U8* s = ustrp;
3050 U8* e = ustrp + *lenp;
3051 while (s < e) {
3052 if (isASCII(*s)) {
3053 /* Crossed, have to return the original */
3054 original = valid_utf8_to_uvchr(p, lenp);
1ca267a5 3055
9fc2026f 3056 /* But in these instances, there is an alternative we can
1ca267a5 3057 * return that is valid */
0766489e
KW
3058 if (original == LATIN_SMALL_LETTER_SHARP_S
3059#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3060 || original == LATIN_CAPITAL_LETTER_SHARP_S
3061#endif
3062 ) {
1ca267a5
KW
3063 goto return_long_s;
3064 }
9fc2026f
KW
3065 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3066 goto return_ligature_st;
3067 }
74894415
KW
3068#if UNICODE_MAJOR_VERSION == 3 \
3069 && UNICODE_DOT_VERSION == 0 \
3070 && UNICODE_DOT_DOT_VERSION == 1
3071
3072 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3073 goto return_dotless_i;
3074 }
3075#endif
a0270393
KW
3076 Copy(p, ustrp, *lenp, char);
3077 return original;
3078 }
3079 s += UTF8SKIP(s);
3080 }
051a06d4 3081
a0270393
KW
3082 /* Here, no characters crossed, result is ok as-is */
3083 return result;
3084 }
051a06d4
KW
3085 }
3086
4a4088c4 3087 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
3088 if (UTF8_IS_INVARIANT(result)) {
3089 *ustrp = (U8) result;
3090 *lenp = 1;
3091 }
3092 else {
62cb07ea
KW
3093 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3094 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
3095 *lenp = 2;
3096 }
3097
051a06d4 3098 return result;
1ca267a5
KW
3099
3100 return_long_s:
3101 /* Certain folds to 'ss' are prohibited by the options, but they do allow
3102 * folds to a string of two of these characters. By returning this
3103 * instead, then, e.g.,
3104 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3105 * works. */
3106
3107 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
3108 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3109 ustrp, *lenp, U8);
3110 return LATIN_SMALL_LETTER_LONG_S;
9fc2026f
KW
3111
3112 return_ligature_st:
3113 /* Two folds to 'st' are prohibited by the options; instead we pick one and
3114 * have the other one fold to it */
3115
3116 *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
3117 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
3118 return LATIN_SMALL_LIGATURE_ST;
74894415
KW
3119
3120#if UNICODE_MAJOR_VERSION == 3 \
3121 && UNICODE_DOT_VERSION == 0 \
3122 && UNICODE_DOT_DOT_VERSION == 1
3123
3124 return_dotless_i:
3125 *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
3126 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
3127 return LATIN_SMALL_LETTER_DOTLESS_I;
3128
3129#endif
3130
a0ed51b3
LW
3131}
3132
711a919c 3133/* Note:
f90a9a02 3134 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
711a919c
TS
3135 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
3136 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
3137 */
c4a5db0c 3138
a0ed51b3 3139SV*
7fc63493 3140Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 3141{
c4a5db0c
KW
3142 PERL_ARGS_ASSERT_SWASH_INIT;
3143
3144 /* Returns a copy of a swash initiated by the called function. This is the
3145 * public interface, and returning a copy prevents others from doing
3146 * mischief on the original */
3147
5d3d13d1 3148 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
c4a5db0c
KW
3149}
3150
3151SV*
5d3d13d1 3152Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
c4a5db0c 3153{
2c1f00b9
YO
3154
3155 /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
3156 * use the following define */
3157
3158#define CORE_SWASH_INIT_RETURN(x) \
3159 PL_curpm= old_PL_curpm; \
3160 return x
3161
c4a5db0c 3162 /* Initialize and return a swash, creating it if necessary. It does this
87367d5f
KW
3163 * by calling utf8_heavy.pl in the general case. The returned value may be
3164 * the swash's inversion list instead if the input parameters allow it.
3165 * Which is returned should be immaterial to callers, as the only
923b6d4e
KW
3166 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
3167 * and swash_to_invlist() handle both these transparently.
c4a5db0c
KW
3168 *
3169 * This interface should only be used by functions that won't destroy or
3170 * adversely change the swash, as doing so affects all other uses of the
3171 * swash in the program; the general public should use 'Perl_swash_init'
3172 * instead.
3173 *
3174 * pkg is the name of the package that <name> should be in.
3175 * name is the name of the swash to find. Typically it is a Unicode
3176 * property name, including user-defined ones
3177 * listsv is a string to initialize the swash with. It must be of the form
3178 * documented as the subroutine return value in
3179 * L<perlunicode/User-Defined Character Properties>
3180 * minbits is the number of bits required to represent each data element.
3181 * It is '1' for binary properties.
3182 * none I (khw) do not understand this one, but it is used only in tr///.
9a53f6cf 3183 * invlist is an inversion list to initialize the swash with (or NULL)
83199d38
KW
3184 * flags_p if non-NULL is the address of various input and output flag bits
3185 * to the routine, as follows: ('I' means is input to the routine;
3186 * 'O' means output from the routine. Only flags marked O are
3187 * meaningful on return.)
3188 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
3189 * came from a user-defined property. (I O)
5d3d13d1
KW
3190 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
3191 * when the swash cannot be located, to simply return NULL. (I)
87367d5f
KW
3192 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
3193 * return of an inversion list instead of a swash hash if this routine
3194 * thinks that would result in faster execution of swash_fetch() later
3195 * on. (I)
9a53f6cf
KW
3196 *
3197 * Thus there are three possible inputs to find the swash: <name>,
3198 * <listsv>, and <invlist>. At least one must be specified. The result
3199 * will be the union of the specified ones, although <listsv>'s various
aabbdbda
KW
3200 * actions can intersect, etc. what <name> gives. To avoid going out to
3201 * disk at all, <invlist> should specify completely what the swash should
3202 * have, and <listsv> should be &PL_sv_undef and <name> should be "".
9a53f6cf
KW
3203 *
3204 * <invlist> is only valid for binary properties */
c4a5db0c 3205
2c1f00b9
YO
3206 PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
3207
c4a5db0c 3208 SV* retval = &PL_sv_undef;
83199d38 3209 HV* swash_hv = NULL;
87367d5f
KW
3210 const int invlist_swash_boundary =
3211 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
3212 ? 512 /* Based on some benchmarking, but not extensive, see commit
3213 message */
3214 : -1; /* Never return just an inversion list */
9a53f6cf
KW
3215
3216 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
3217 assert(! invlist || minbits == 1);
3218
2c1f00b9
YO
3219 PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
3220 that triggered the swash init and the swash init perl logic itself.
3221 See perl #122747 */
3222
9a53f6cf
KW
3223 /* If data was passed in to go out to utf8_heavy to find the swash of, do
3224 * so */
3225 if (listsv != &PL_sv_undef || strNE(name, "")) {
69794297
KW
3226 dSP;
3227 const size_t pkg_len = strlen(pkg);
3228 const size_t name_len = strlen(name);
3229 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
3230 SV* errsv_save;
3231 GV *method;
3232
3233 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
3234
3235 PUSHSTACKi(PERLSI_MAGIC);
ce3b816e 3236 ENTER;
69794297 3237 SAVEHINTS();
2782061f 3238 save_re_context();
650f067c
JL
3239 /* We might get here via a subroutine signature which uses a utf8
3240 * parameter name, at which point PL_subname will have been set
3241 * but not yet used. */
3242 save_item(PL_subname);
69794297
KW
3243 if (PL_parser && PL_parser->error_count)
3244 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
3245 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
4a4088c4 3246 if (!method) { /* demand load UTF-8 */
69794297 3247 ENTER;
db2c6cb3
FC
3248 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3249 GvSV(PL_errgv) = NULL;
1a419e6b 3250#ifndef NO_TAINT_SUPPORT
69794297
KW
3251 /* It is assumed that callers of this routine are not passing in
3252 * any user derived data. */
2782061f
DM
3253 /* Need to do this after save_re_context() as it will set
3254 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
3255 * in Perl_magic_get). Even line to create errsv_save can turn on
3256 * PL_tainted. */
284167a5
S
3257 SAVEBOOL(TAINT_get);
3258 TAINT_NOT;
3259#endif
69794297
KW
3260 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
3261 NULL);
eed484f9 3262 {
db2c6cb3
FC
3263 /* Not ERRSV, as there is no need to vivify a scalar we are
3264 about to discard. */
3265 SV * const errsv = GvSV(PL_errgv);
3266 if (!SvTRUE(errsv)) {
3267 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3268 SvREFCNT_dec(errsv);
3269 }
eed484f9 3270 }
69794297
KW
3271 LEAVE;
3272 }
3273 SPAGAIN;
3274 PUSHMARK(SP);
3275 EXTEND(SP,5);
3276 mPUSHp(pkg, pkg_len);
3277 mPUSHp(name, name_len);
3278 PUSHs(listsv);
3279 mPUSHi(minbits);
3280 mPUSHi(none);
3281 PUTBACK;
db2c6cb3
FC
3282 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3283 GvSV(PL_errgv) = NULL;
69794297
KW
3284 /* If we already have a pointer to the method, no need to use
3285 * call_method() to repeat the lookup. */
c41800a8
KW
3286 if (method
3287 ? call_sv(MUTABLE_SV(method), G_SCALAR)
69794297
KW
3288 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
3289 {
3290 retval = *PL_stack_sp--;
3291 SvREFCNT_inc(retval);
3292 }
eed484f9 3293 {
db2c6cb3
FC
3294 /* Not ERRSV. See above. */
3295 SV * const errsv = GvSV(PL_errgv);
3296 if (!SvTRUE(errsv)) {
3297 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3298 SvREFCNT_dec(errsv);
3299 }
eed484f9 3300 }
ce3b816e 3301 LEAVE;
69794297
KW
3302 POPSTACK;
3303 if (IN_PERL_COMPILETIME) {
3304 CopHINTS_set(PL_curcop, PL_hints);
3305 }
3306 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
d95e4a00 3307 if (SvPOK(retval)) {
69794297
KW
3308
3309 /* If caller wants to handle missing properties, let them */
5d3d13d1 3310 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
2c1f00b9 3311 CORE_SWASH_INIT_RETURN(NULL);
69794297
KW
3312 }
3313 Perl_croak(aTHX_
3314 "Can't find Unicode property definition \"%"SVf"\"",
3315 SVfARG(retval));
a25b5927 3316 NOT_REACHED; /* NOTREACHED */
d95e4a00 3317 }
69794297 3318 }
9a53f6cf 3319 } /* End of calling the module to find the swash */
36eb48b4 3320
83199d38
KW
3321 /* If this operation fetched a swash, and we will need it later, get it */
3322 if (retval != &PL_sv_undef
3323 && (minbits == 1 || (flags_p
3324 && ! (*flags_p
3325 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
3326 {
3327 swash_hv = MUTABLE_HV(SvRV(retval));
3328
3329 /* If we don't already know that there is a user-defined component to
3330 * this swash, and the user has indicated they wish to know if there is
3331 * one (by passing <flags_p>), find out */
3332 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
3333 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
3334 if (user_defined && SvUV(*user_defined)) {
3335 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
3336 }
3337 }
3338 }
3339
36eb48b4
KW
3340 /* Make sure there is an inversion list for binary properties */
3341 if (minbits == 1) {
3342 SV** swash_invlistsvp = NULL;
3343 SV* swash_invlist = NULL;
9a53f6cf 3344 bool invlist_in_swash_is_valid = FALSE;
02c85471
FC
3345 bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
3346 an unclaimed reference count */
36eb48b4 3347
9a53f6cf 3348 /* If this operation fetched a swash, get its already existing
83199d38 3349 * inversion list, or create one for it */
36eb48b4 3350
83199d38 3351 if (swash_hv) {
5c9f4bd2 3352 swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
9a53f6cf
KW
3353 if (swash_invlistsvp) {
3354 swash_invlist = *swash_invlistsvp;
3355 invlist_in_swash_is_valid = TRUE;
3356 }
3357 else {
36eb48b4 3358 swash_invlist = _swash_to_invlist(retval);
02c85471 3359 swash_invlist_unclaimed = TRUE;
9a53f6cf
KW
3360 }
3361 }
3362
3363 /* If an inversion list was passed in, have to include it */
3364 if (invlist) {
3365
3366 /* Any fetched swash will by now have an inversion list in it;
3367 * otherwise <swash_invlist> will be NULL, indicating that we
3368 * didn't fetch a swash */
3369 if (swash_invlist) {
3370
3371 /* Add the passed-in inversion list, which invalidates the one
3372 * already stored in the swash */
3373 invlist_in_swash_is_valid = FALSE;
3374 _invlist_union(invlist, swash_invlist, &swash_invlist);
3375 }
3376 else {
3377
87367d5f
KW
3378 /* Here, there is no swash already. Set up a minimal one, if
3379 * we are going to return a swash */
3380 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
971d486f 3381 swash_hv = newHV();
4aca0fe6 3382 retval = newRV_noinc(MUTABLE_SV(swash_hv));
87367d5f 3383 }
9a53f6cf
KW
3384 swash_invlist = invlist;
3385 }
9a53f6cf
KW
3386 }
3387
3388 /* Here, we have computed the union of all the passed-in data. It may
3389 * be that there was an inversion list in the swash which didn't get
538e84ed 3390 * touched; otherwise save the computed one */
87367d5f
KW
3391 if (! invlist_in_swash_is_valid
3392 && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
3393 {
5c9f4bd2 3394 if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
69794297
KW
3395 {
3396 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3397 }
cc34d8c5
FC
3398 /* We just stole a reference count. */
3399 if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
3400 else SvREFCNT_inc_simple_void_NN(swash_invlist);
9a53f6cf 3401 }
87367d5f 3402
dbfdbd26
KW
3403 SvREADONLY_on(swash_invlist);
3404
c41800a8 3405 /* Use the inversion list stand-alone if small enough */
87367d5f
KW
3406 if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
3407 SvREFCNT_dec(retval);
02c85471
FC
3408 if (!swash_invlist_unclaimed)
3409 SvREFCNT_inc_simple_void_NN(swash_invlist);
3410 retval = newRV_noinc(swash_invlist);
87367d5f 3411 }
36eb48b4
KW
3412 }
3413
2c1f00b9
YO
3414 CORE_SWASH_INIT_RETURN(retval);
3415#undef CORE_SWASH_INIT_RETURN
a0ed51b3
LW
3416}
3417
035d37be
JH
3418
3419/* This API is wrong for special case conversions since we may need to
3420 * return several Unicode characters for a single Unicode character
3421 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
3422 * the lower-level routine, and it is similarly broken for returning
38684baa 3423 * multiple values. --jhi
b9992569 3424 * For those, you should use S__to_utf8_case() instead */
b0e3252e 3425/* Now SWASHGET is recasted into S_swatch_get in this file. */
680c470c
TS
3426
3427/* Note:
3428 * Returns the value of property/mapping C<swash> for the first character
3429 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
4a4088c4 3430 * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
3d0f8846 3431 * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
af2af982
KW
3432 *
3433 * A "swash" is a hash which contains initially the keys/values set up by
3434 * SWASHNEW. The purpose is to be able to completely represent a Unicode
3435 * property for all possible code points. Things are stored in a compact form
3436 * (see utf8_heavy.pl) so that calculation is required to find the actual
3437 * property value for a given code point. As code points are looked up, new
3438 * key/value pairs are added to the hash, so that the calculation doesn't have
3439 * to ever be re-done. Further, each calculation is done, not just for the
3440 * desired one, but for a whole block of code points adjacent to that one.
3441 * For binary properties on ASCII machines, the block is usually for 64 code
3442 * points, starting with a code point evenly divisible by 64. Thus if the
3443 * property value for code point 257 is requested, the code goes out and
3444 * calculates the property values for all 64 code points between 256 and 319,
3445 * and stores these as a single 64-bit long bit vector, called a "swatch",
3446 * under the key for code point 256. The key is the UTF-8 encoding for code
3447 * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
3448 * for a code point is 13 bytes, the key will be 12 bytes long. If the value
3449 * for code point 258 is then requested, this code realizes that it would be
3450 * stored under the key for 256, and would find that value and extract the
3451 * relevant bit, offset from 256.
3452 *
3453 * Non-binary properties are stored in as many bits as necessary to represent
3454 * their values (32 currently, though the code is more general than that), not
fc273927 3455 * as single bits, but the principle is the same: the value for each key is a
af2af982
KW
3456 * vector that encompasses the property values for all code points whose UTF-8
3457 * representations are represented by the key. That is, for all code points
3458 * whose UTF-8 representations are length N bytes, and the key is the first N-1
3459 * bytes of that.
680c470c 3460 */
a0ed51b3 3461UV
680c470c 3462Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 3463{
ef8f7699 3464 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
3465 U32 klen;
3466 U32 off;
9b56a019 3467 STRLEN slen = 0;
7d85a32c 3468 STRLEN needents;
cfd0369c 3469 const U8 *tmps = NULL;
979f2922 3470 SV *swatch;
08fb1ac5 3471 const U8 c = *ptr;
3568d838 3472
7918f24d
NC
3473 PERL_ARGS_ASSERT_SWASH_FETCH;
3474
87367d5f
KW
3475 /* If it really isn't a hash, it isn't really swash; must be an inversion
3476 * list */
3477 if (SvTYPE(hv) != SVt_PVHV) {
3478 return _invlist_contains_cp((SV*)hv,
3479 (do_utf8)
3480 ? valid_utf8_to_uvchr(ptr, NULL)
3481 : c);
3482 }
3483
08fb1ac5
KW
3484 /* We store the values in a "swatch" which is a vec() value in a swash
3485 * hash. Code points 0-255 are a single vec() stored with key length
3486 * (klen) 0. All other code points have a UTF-8 representation
3487 * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
3488 * share 0xAA..0xYY, which is the key in the hash to that vec. So the key
3489 * length for them is the length of the encoded char - 1. ptr[klen] is the
3490 * final byte in the sequence representing the character */
3491 if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
3492 klen = 0;
3493 needents = 256;
3494 off = c;
3568d838 3495 }
08fb1ac5
KW
3496 else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
3497 klen = 0;
3498 needents = 256;
a62b247b 3499 off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
979f2922
TS
3500 }
3501 else {
08fb1ac5
KW
3502 klen = UTF8SKIP(ptr) - 1;
3503
3504 /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
3505 * the vec is the final byte in the sequence. (In EBCDIC this is
3506 * converted to I8 to get consecutive values.) To help you visualize
3507 * all this:
3508 * Straight 1047 After final byte
3509 * UTF-8 UTF-EBCDIC I8 transform
3510 * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
3511 * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
3512 * ...
3513 * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
3514 * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
3515 * ...
3516 * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
3517 * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
3518 * ...
3519 * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
3520 * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
3521 * ...
3522 * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
3523 * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
3524 *
3525 * (There are no discontinuities in the elided (...) entries.)
3526 * The UTF-8 key for these 33 code points is '\xD0' (which also is the
3527 * key for the next 31, up through U+043F, whose UTF-8 final byte is
3528 * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
3529 * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
3530 * index into the vec() swatch (after subtracting 0x80, which we
3531 * actually do with an '&').
3532 * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
3533 * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
3534 * dicontinuities which go away by transforming it into I8, and we
3535 * effectively subtract 0xA0 to get the index. */
979f2922 3536 needents = (1 << UTF_ACCUMULATION_SHIFT);
bc3632a8 3537 off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
979f2922 3538 }
7d85a32c 3539
a0ed51b3 3540 /*
4a4088c4 3541 * This single-entry cache saves about 1/3 of the UTF-8 overhead in test
a0ed51b3
LW
3542 * suite. (That is, only 7-8% overall over just a hash cache. Still,
3543 * it's nothing to sniff at.) Pity we usually come through at least
3544 * two function calls to get here...
3545 *
3546 * NB: this code assumes that swatches are never modified, once generated!
3547 */
3548
3568d838 3549 if (hv == PL_last_swash_hv &&
a0ed51b3 3550 klen == PL_last_swash_klen &&
27da23d5 3551 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
3552 {
3553 tmps = PL_last_swash_tmps;
3554 slen = PL_last_swash_slen;
3555 }
3556 else {
3557 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 3558 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 3559
b0e3252e 3560 /* If not cached, generate it via swatch_get */
979f2922 3561 if (!svp || !SvPOK(*svp)
08fb1ac5
KW
3562 || !(tmps = (const U8*)SvPV_const(*svp, slen)))
3563 {
3564 if (klen) {
3565 const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
3566 swatch = swatch_get(swash,
3567 code_point & ~((UV)needents - 1),
3568 needents);
3569 }
3570 else { /* For the first 256 code points, the swatch has a key of
3571 length 0 */
3572 swatch = swatch_get(swash, 0, needents);
3573 }
979f2922 3574
923e4eb5 3575 if (IN_PERL_COMPILETIME)
623e6609 3576 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 3577
979f2922 3578 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 3579
979f2922
TS
3580 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
3581 || (slen << 3) < needents)
5637ef5b
NC
3582 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
3583 "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
3584 svp, tmps, (UV)slen, (UV)needents);
a0ed51b3
LW
3585 }
3586
3587 PL_last_swash_hv = hv;
16d8f38a 3588 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 3589 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
3590 /* FIXME change interpvar.h? */
3591 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
3592 PL_last_swash_slen = slen;
3593 if (klen)
3594 Copy(ptr, PL_last_swash_key, klen, U8);
3595 }
3596
9faf8d75 3597 switch ((int)((slen << 3) / needents)) {
a0ed51b3 3598 case 1:
e7aca353 3599 return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
a0ed51b3 3600 case 8:
e7aca353 3601 return ((UV) tmps[off]);
a0ed51b3
LW
3602 case 16:
3603 off <<= 1;
e7aca353
JH
3604 return
3605 ((UV) tmps[off ] << 8) +
3606 ((UV) tmps[off + 1]);
a0ed51b3
LW
3607 case 32:
3608 off <<= 2;
e7aca353
JH
3609 return
3610 ((UV) tmps[off ] << 24) +
3611 ((UV) tmps[off + 1] << 16) +
3612 ((UV) tmps[off + 2] << 8) +
3613 ((UV) tmps[off + 3]);
a0ed51b3 3614 }
5637ef5b
NC
3615 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
3616 "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
670f1322 3617 NORETURN_FUNCTION_END;
a0ed51b3 3618}
2b9d42f0 3619
319009ee
KW
3620/* Read a single line of the main body of the swash input text. These are of
3621 * the form:
3622 * 0053 0056 0073
3623 * where each number is hex. The first two numbers form the minimum and
3624 * maximum of a range, and the third is the value associated with the range.
3625 * Not all swashes should have a third number
3626 *
3627 * On input: l points to the beginning of the line to be examined; it points
3628 * to somewhere in the string of the whole input text, and is
3629 * terminated by a \n or the null string terminator.
3630 * lend points to the null terminator of that string
3631 * wants_value is non-zero if the swash expects a third number
3632 * typestr is the name of the swash's mapping, like 'ToLower'
3633 * On output: *min, *max, and *val are set to the values read from the line.
3634 * returns a pointer just beyond the line examined. If there was no
3635 * valid min number on the line, returns lend+1
3636 */
3637
3638STATIC U8*
3639S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3640 const bool wants_value, const U8* const typestr)
3641{
3642 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
3643 STRLEN numlen; /* Length of the number */
02470786
KW
3644 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3645 | PERL_SCAN_DISALLOW_PREFIX
3646 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3647
3648 /* nl points to the next \n in the scan */
3649 U8* const nl = (U8*)memchr(l, '\n', lend - l);
3650
95543e92
KW
3651 PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
3652
319009ee
KW
3653 /* Get the first number on the line: the range minimum */
3654 numlen = lend - l;
3655 *min = grok_hex((char *)l, &numlen, &flags, NULL);
c88850db 3656 *max = *min; /* So can never return without setting max */
319009ee
KW
3657 if (numlen) /* If found a hex number, position past it */
3658 l += numlen;
3659 else if (nl) { /* Else, go handle next line, if any */
3660 return nl + 1; /* 1 is length of "\n" */
3661 }
3662 else { /* Else, no next line */
3663 return lend + 1; /* to LIST's end at which \n is not found */
3664 }
3665
3666 /* The max range value follows, separated by a BLANK */
3667 if (isBLANK(*l)) {
3668 ++l;
02470786
KW
3669 flags = PERL_SCAN_SILENT_ILLDIGIT
3670 | PERL_SCAN_DISALLOW_PREFIX
3671 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3672 numlen = lend - l;
3673 *max = grok_hex((char *)l, &numlen, &flags, NULL);
3674 if (numlen)
3675 l += numlen;
3676 else /* If no value here, it is a single element range */
3677 *max = *min;
3678
3679 /* Non-binary tables have a third entry: what the first element of the
24303724 3680 * range maps to. The map for those currently read here is in hex */
319009ee
KW
3681 if (wants_value) {
3682 if (isBLANK(*l)) {
3683 ++l;
f2a7d0fc
KW
3684 flags = PERL_SCAN_SILENT_ILLDIGIT
3685 | PERL_SCAN_DISALLOW_PREFIX
3686 | PERL_SCAN_SILENT_NON_PORTABLE;
3687 numlen = lend - l;
3688 *val = grok_hex((char *)l, &numlen, &flags, NULL);
3689 if (numlen)
3690 l += numlen;
3691 else
3692 *val = 0;
319009ee
KW
3693 }
3694 else {
3695 *val = 0;
3696 if (typeto) {
dcbac5bb 3697 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
3698 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
3699 typestr, l);
3700 }
3701 }
3702 }
3703 else
3704 *val = 0; /* bits == 1, then any val should be ignored */
3705 }
3706 else { /* Nothing following range min, should be single element with no
3707 mapping expected */
319009ee
KW
3708 if (wants_value) {
3709 *val = 0;
3710 if (typeto) {
dcbac5bb 3711 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
3712 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
3713 }
3714 }
3715 else
3716 *val = 0; /* bits == 1, then val should be ignored */
3717 }
3718
3719 /* Position to next line if any, or EOF */
3720 if (nl)
3721 l = nl + 1;
3722 else
3723 l = lend;
3724
3725 return l;
3726}
3727
979f2922
TS
3728/* Note:
3729 * Returns a swatch (a bit vector string) for a code point sequence
3730 * that starts from the value C<start> and comprises the number C<span>.
3731 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3732 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
3733 */
3734STATIC SV*
b0e3252e 3735S_swatch_get(pTHX_ SV* swash, UV start, UV span)
979f2922
TS
3736{