This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlapi: Add explanation for why certain macros don't exist.
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
a0ed51b3 35
806547a7 36static const char malformed_text[] = "Malformed UTF-8 character";
27da23d5 37static const char unees[] =
806547a7 38 "Malformed UTF-8 character (unexpected end of string)";
760c7c2f 39static const char cp_above_legal_max[] =
147e3846 40 "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf;
760c7c2f 41
114d9c4d 42#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
901b21bf 43
48ef279e 44/*
ccfc67b7 45=head1 Unicode Support
7fefc6c1 46These are various utility functions for manipulating UTF8-encoded
72d33970 47strings. For the uninitiated, this is a method of representing arbitrary
61296642 48Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
49characters in the ASCII range are unmodified, and a zero byte never appears
50within non-zero characters.
166f8a29 51
eaf7a4d2
CS
52=cut
53*/
54
55/*
378516de 56=for apidoc uvoffuni_to_utf8_flags
eebe1485 57
a27992cc 58THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af
KW
59Instead, B<Almost all code should use L</uvchr_to_utf8> or
60L</uvchr_to_utf8_flags>>.
a27992cc 61
de69f3af
KW
62This function is like them, but the input is a strict Unicode
63(as opposed to native) code point. Only in very rare circumstances should code
64not be using the native code point.
949cf498 65
efa9cd84 66For details, see the description for L</uvchr_to_utf8_flags>.
949cf498 67
eebe1485
SC
68=cut
69*/
70
8ee1cdcb
KW
71#define HANDLE_UNICODE_SURROGATE(uv, flags) \
72 STMT_START { \
73 if (flags & UNICODE_WARN_SURROGATE) { \
74 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
147e3846 75 "UTF-16 surrogate U+%04" UVXf, uv); \
8ee1cdcb
KW
76 } \
77 if (flags & UNICODE_DISALLOW_SURROGATE) { \
78 return NULL; \
79 } \
80 } STMT_END;
81
82#define HANDLE_UNICODE_NONCHAR(uv, flags) \
83 STMT_START { \
84 if (flags & UNICODE_WARN_NONCHAR) { \
85 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
147e3846 86 "Unicode non-character U+%04" UVXf " is not " \
8ee1cdcb
KW
87 "recommended for open interchange", uv); \
88 } \
89 if (flags & UNICODE_DISALLOW_NONCHAR) { \
90 return NULL; \
91 } \
92 } STMT_END;
93
ba6ed43c
KW
94/* Use shorter names internally in this file */
95#define SHIFT UTF_ACCUMULATION_SHIFT
96#undef MARK
97#define MARK UTF_CONTINUATION_MARK
98#define MASK UTF_CONTINUATION_MASK
99
dfe13c55 100U8 *
378516de 101Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 102{
378516de 103 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
7918f24d 104
2d1545e5 105 if (OFFUNI_IS_INVARIANT(uv)) {
4c8cd605 106 *d++ = LATIN1_TO_NATIVE(uv);
d9432125
KW
107 return d;
108 }
facc1dc2 109
3ea68d71 110 if (uv <= MAX_UTF8_TWO_BYTE) {
facc1dc2
KW
111 *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
112 *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
3ea68d71
KW
113 return d;
114 }
d9432125 115
ba6ed43c
KW
116 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
117 * below, the 16 is for start bytes E0-EF (which are all the possible ones
118 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
119 * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
120 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
121 * 0x800-0xFFFF on ASCII */
122 if (uv < (16 * (1U << (2 * SHIFT)))) {
123 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
124 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
125 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
126
127#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
128 aren't tested here */
129 /* The most likely code points in this range are below the surrogates.
130 * Do an extra test to quickly exclude those. */
131 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
132 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
133 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
134 {
8ee1cdcb
KW
135 HANDLE_UNICODE_NONCHAR(uv, flags);
136 }
137 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
138 HANDLE_UNICODE_SURROGATE(uv, flags);
760c7c2f 139 }
ba6ed43c
KW
140 }
141#endif
142 return d;
143 }
144
145 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
146 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
147 * happen starting with 4-byte characters on ASCII platforms. We unify the
148 * code for these with EBCDIC, even though some of them require 5-bytes on
149 * those, because khw believes the code saving is worth the very slight
150 * performance hit on these high EBCDIC code points. */
151
152 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
a5bf80e0
KW
153 if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
154 && ckWARN_d(WARN_DEPRECATED))
155 {
156 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
157 cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
158 }
159 if ( (flags & UNICODE_WARN_SUPER)
160 || ( UNICODE_IS_ABOVE_31_BIT(uv)
161 && (flags & UNICODE_WARN_ABOVE_31_BIT)))
162 {
163 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
164
165 /* Choose the more dire applicable warning */
166 (UNICODE_IS_ABOVE_31_BIT(uv))
147e3846
KW
167 ? "Code point 0x%" UVXf " is not Unicode, and not portable"
168 : "Code point 0x%" UVXf " is not Unicode, may not be portable",
a5bf80e0
KW
169 uv);
170 }
171 if (flags & UNICODE_DISALLOW_SUPER
172 || ( UNICODE_IS_ABOVE_31_BIT(uv)
173 && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
174 {
175 return NULL;
176 }
177 }
ba6ed43c
KW
178 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
179 HANDLE_UNICODE_NONCHAR(uv, flags);
507b9800 180 }
d9432125 181
ba6ed43c
KW
182 /* Test for and handle 4-byte result. In the test immediately below, the
183 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
184 * characters). The 3 is for 3 continuation bytes; these each contribute
185 * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
186 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
187 * 0x1_0000-0x1F_FFFF on ASCII */
188 if (uv < (8 * (1U << (3 * SHIFT)))) {
189 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
190 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
191 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
192 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
193
194#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
195 characters. The end-plane non-characters for EBCDIC were
196 handled just above */
197 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
198 HANDLE_UNICODE_NONCHAR(uv, flags);
d528804a 199 }
ba6ed43c
KW
200 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
201 HANDLE_UNICODE_SURROGATE(uv, flags);
202 }
203#endif
204
205 return d;
206 }
207
208 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
209 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
210 * format. The unrolled version above turns out to not save all that much
211 * time, and at these high code points (well above the legal Unicode range
212 * on ASCII platforms, and well above anything in common use in EBCDIC),
213 * khw believes that less code outweighs slight performance gains. */
214
d9432125 215 {
5aaebcb3 216 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
217 U8 *p = d+len-1;
218 while (p > d) {
4c8cd605 219 *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
220 uv >>= UTF_ACCUMULATION_SHIFT;
221 }
4c8cd605 222 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
223 return d+len;
224 }
a0ed51b3 225}
a5bf80e0 226
646ca15d 227/*
07693fe6
KW
228=for apidoc uvchr_to_utf8
229
bcb1a2d4 230Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 231of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
232C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
233the byte after the end of the new character. In other words,
07693fe6
KW
234
235 d = uvchr_to_utf8(d, uv);
236
237is the recommended wide native character-aware way of saying
238
239 *(d++) = uv;
240
760c7c2f
KW
241This function accepts any UV as input, but very high code points (above
242C<IV_MAX> on the platform) will raise a deprecation warning. This is
243typically 0x7FFF_FFFF in a 32-bit word.
244
245It is possible to forbid or warn on non-Unicode code points, or those that may
246be problematic by using L</uvchr_to_utf8_flags>.
de69f3af 247
07693fe6
KW
248=cut
249*/
250
de69f3af
KW
251/* This is also a macro */
252PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
253
07693fe6
KW
254U8 *
255Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
256{
de69f3af 257 return uvchr_to_utf8(d, uv);
07693fe6
KW
258}
259
de69f3af
KW
260/*
261=for apidoc uvchr_to_utf8_flags
262
263Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 264of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
265C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
266the byte after the end of the new character. In other words,
de69f3af
KW
267
268 d = uvchr_to_utf8_flags(d, uv, flags);
269
270or, in most cases,
271
272 d = uvchr_to_utf8_flags(d, uv, 0);
273
274This is the Unicode-aware way of saying
275
276 *(d++) = uv;
277
760c7c2f
KW
278If C<flags> is 0, this function accepts any UV as input, but very high code
279points (above C<IV_MAX> for the platform) will raise a deprecation warning.
280This is typically 0x7FFF_FFFF in a 32-bit word.
281
282Specifying C<flags> can further restrict what is allowed and not warned on, as
283follows:
de69f3af 284
796b6530 285If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
7ee537e6
KW
286the function will raise a warning, provided UTF8 warnings are enabled. If
287instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
288NULL. If both flags are set, the function will both warn and return NULL.
de69f3af 289
760c7c2f
KW
290Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
291affect how the function handles a Unicode non-character.
93e6dbd6 292
760c7c2f
KW
293And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
294affect the handling of code points that are above the Unicode maximum of
2950x10FFFF. Languages other than Perl may not be able to accept files that
296contain these.
93e6dbd6
KW
297
298The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
299the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
ecc1615f
KW
300three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
301allowed inputs to the strict UTF-8 traditionally defined by Unicode.
302Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
303C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
304above-Unicode and surrogate flags, but not the non-character ones, as
305defined in
306L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
307See L<perlunicode/Noncharacter code points>.
93e6dbd6 308
ab8e6d41
KW
309Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
310so using them is more problematic than other above-Unicode code points. Perl
311invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
312likely that non-Perl languages will not be able to read files that contain
313these that written by the perl interpreter; nor would Perl understand files
314written by something that uses a different extension. For these reasons, there
315is a separate set of flags that can warn and/or disallow these extremely high
316code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
317C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags. These
318are entirely independent from the deprecation warning for code points above
319C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
320code point that needs more than 31 bits to represent. When that happens,
321effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
32232-bit machines. (Of course C<UNICODE_DISALLOW_SUPER> will treat all
ab8e6d41
KW
323above-Unicode code points, including these, as malformations; and
324C<UNICODE_WARN_SUPER> warns on these.)
325
326On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
327extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
328than on ASCII. Prior to that, code points 2**31 and higher were simply
329unrepresentable, and a different, incompatible method was used to represent
330code points between 2**30 and 2**31 - 1. The flags C<UNICODE_WARN_ABOVE_31_BIT>
331and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
332platforms, warning and disallowing 2**31 and higher.
de69f3af 333
de69f3af
KW
334=cut
335*/
336
337/* This is also a macro */
338PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
339
07693fe6
KW
340U8 *
341Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
342{
de69f3af 343 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
344}
345
83dc0f42
KW
346PERL_STATIC_INLINE bool
347S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
348{
349 /* Returns TRUE if the first code point represented by the Perl-extended-
350 * UTF-8-encoded string starting at 's', and looking no further than 'e -
351 * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
352 *
353 * The function handles the case where the input bytes do not include all
354 * the ones necessary to represent a full character. That is, they may be
355 * the intial bytes of the representation of a code point, but possibly
356 * the final ones necessary for the complete representation may be beyond
357 * 'e - 1'.
358 *
359 * The function assumes that the sequence is well-formed UTF-8 as far as it
360 * goes, and is for a UTF-8 variant code point. If the sequence is
361 * incomplete, the function returns FALSE if there is any well-formed
362 * UTF-8 byte sequence that can complete it in such a way that a code point
363 * < 2**31 is produced; otherwise it returns TRUE.
364 *
365 * Getting this exactly right is slightly tricky, and has to be done in
366 * several places in this file, so is centralized here. It is based on the
367 * following table:
368 *
369 * U+7FFFFFFF (2 ** 31 - 1)
370 * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
371 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
372 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
373 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
374 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
375 * U+80000000 (2 ** 31):
376 * ASCII: \xFE\x82\x80\x80\x80\x80\x80
377 * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
378 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
379 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
380 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
381 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
382 */
383
384#ifdef EBCDIC
385
37086697
KW
386 /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
387 const U8 prefix[] = "\x41\x41\x41\x41\x41\x41\x42";
83dc0f42
KW
388 const STRLEN prefix_len = sizeof(prefix) - 1;
389 const STRLEN len = e - s;
f880f78a 390 const STRLEN cmp_len = MIN(prefix_len, len - 1);
83dc0f42
KW
391
392#else
393
394 PERL_UNUSED_ARG(e);
395
396#endif
397
398 PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
399
400 assert(! UTF8_IS_INVARIANT(*s));
401
402#ifndef EBCDIC
403
404 /* Technically, a start byte of FE can be for a code point that fits into
405 * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
406 * malformation. */
407 return (*s >= 0xFE);
408
409#else
410
411 /* On the EBCDIC code pages we handle, only 0xFE can mean a 32-bit or
412 * larger code point (0xFF is an invariant). For 0xFE, we need at least 2
413 * bytes, and maybe up through 8 bytes, to be sure if the value is above 31
414 * bits. */
415 if (*s != 0xFE || len == 1) {
416 return FALSE;
417 }
418
419 /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
420 * \x41 and \x42. */
421 return cBOOL(memGT(s + 1, prefix, cmp_len));
422
423#endif
424
425}
426
12a4bed3
KW
427PERL_STATIC_INLINE bool
428S_does_utf8_overflow(const U8 * const s, const U8 * e)
429{
430 const U8 *x;
431 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
432
c551bb01
KW
433#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
434
435 const STRLEN len = e - s;
436
437#endif
438
12a4bed3
KW
439 /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
440 * platform, that is if it represents a code point larger than the highest
441 * representable code point. (For ASCII platforms, we could use memcmp()
442 * because we don't have to convert each byte to I8, but it's very rare
443 * input indeed that would approach overflow, so the loop below will likely
444 * only get executed once.
445 *
446 * 'e' must not be beyond a full character. If it is less than a full
447 * character, the function returns FALSE if there is any input beyond 'e'
448 * that could result in a non-overflowing code point */
449
450 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
b0b342d4
KW
451 assert(s <= e && s + UTF8SKIP(s) >= e);
452
453#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
454
455 /* On 32 bit ASCII machines, many overlongs that start with FF don't
456 * overflow */
457
c551bb01 458 if (isFF_OVERLONG(s, len)) {
b0b342d4
KW
459 const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84";
460 return memGE(s, max_32_bit_overlong,
c551bb01 461 MIN(len, sizeof(max_32_bit_overlong) - 1));
b0b342d4
KW
462 }
463
464#endif
12a4bed3
KW
465
466 for (x = s; x < e; x++, y++) {
467
468 /* If this byte is larger than the corresponding highest UTF-8 byte, it
469 * overflows */
470 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
471 return TRUE;
472 }
473
474 /* If not the same as this byte, it must be smaller, doesn't overflow */
475 if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
476 return FALSE;
477 }
478 }
479
480 /* Got to the end and all bytes are the same. If the input is a whole
481 * character, it doesn't overflow. And if it is a partial character,
482 * there's not enough information to tell, so assume doesn't overflow */
483 return FALSE;
484}
485
486PERL_STATIC_INLINE bool
487S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
488{
489 /* Overlongs can occur whenever the number of continuation bytes
490 * changes. That means whenever the number of leading 1 bits in a start
491 * byte increases from the next lower start byte. That happens for start
492 * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following
493 * illegal start bytes have already been excluded, so don't need to be
494 * tested here;
495 * ASCII platforms: C0, C1
496 * EBCDIC platforms C0, C1, C2, C3, C4, E0
497 *
498 * At least a second byte is required to determine if other sequences will
499 * be an overlong. */
500
501 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
502 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
503
504 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
505 assert(len > 1 && UTF8_IS_START(*s));
506
507 /* Each platform has overlongs after the start bytes given above (expressed
508 * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
509 * the logic is the same, except the E0 overlong has already been excluded
510 * on EBCDIC platforms. The values below were found by manually
511 * inspecting the UTF-8 patterns. See the tables in utf8.h and
512 * utfebcdic.h. */
513
514# ifdef EBCDIC
515# define F0_ABOVE_OVERLONG 0xB0
516# define F8_ABOVE_OVERLONG 0xA8
517# define FC_ABOVE_OVERLONG 0xA4
518# define FE_ABOVE_OVERLONG 0xA2
519# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
520 /* I8(0xfe) is FF */
521# else
522
523 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
524 return TRUE;
525 }
526
527# define F0_ABOVE_OVERLONG 0x90
528# define F8_ABOVE_OVERLONG 0x88
529# define FC_ABOVE_OVERLONG 0x84
530# define FE_ABOVE_OVERLONG 0x82
531# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
532# endif
533
534
535 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
536 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
537 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
538 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
539 {
540 return TRUE;
541 }
542
b0b342d4
KW
543 /* Check for the FF overlong */
544 return isFF_OVERLONG(s, len);
545}
546
547PERL_STATIC_INLINE bool
548S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
549{
550 PERL_ARGS_ASSERT_ISFF_OVERLONG;
12a4bed3
KW
551
552 /* Check for the FF overlong. This happens only if all these bytes match;
553 * what comes after them doesn't matter. See tables in utf8.h,
b0b342d4 554 * utfebcdic.h. */
12a4bed3 555
b0b342d4
KW
556 return len >= sizeof(FF_OVERLONG_PREFIX) - 1
557 && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
558 sizeof(FF_OVERLONG_PREFIX) - 1));
12a4bed3
KW
559}
560
561#undef F0_ABOVE_OVERLONG
562#undef F8_ABOVE_OVERLONG
563#undef FC_ABOVE_OVERLONG
564#undef FE_ABOVE_OVERLONG
565#undef FF_OVERLONG_PREFIX
566
35f8c9bd 567STRLEN
edc2c47a 568Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
35f8c9bd 569{
2b479609 570 STRLEN len;
12a4bed3 571 const U8 *x;
35f8c9bd 572
2b479609
KW
573 /* A helper function that should not be called directly.
574 *
575 * This function returns non-zero if the string beginning at 's' and
576 * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
577 * code point; otherwise it returns 0. The examination stops after the
578 * first code point in 's' is validated, not looking at the rest of the
579 * input. If 'e' is such that there are not enough bytes to represent a
580 * complete code point, this function will return non-zero anyway, if the
581 * bytes it does have are well-formed UTF-8 as far as they go, and aren't
582 * excluded by 'flags'.
583 *
584 * A non-zero return gives the number of bytes required to represent the
585 * code point. Be aware that if the input is for a partial character, the
586 * return will be larger than 'e - s'.
587 *
588 * This function assumes that the code point represented is UTF-8 variant.
589 * The caller should have excluded this possibility before calling this
590 * function.
591 *
592 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
593 * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
594 * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
595 * disallowed by the flags. If the input is only for a partial character,
596 * the function will return non-zero if there is any sequence of
597 * well-formed UTF-8 that, when appended to the input sequence, could
598 * result in an allowed code point; otherwise it returns 0. Non characters
599 * cannot be determined based on partial character input. But many of the
600 * other excluded types can be determined with just the first one or two
601 * bytes.
602 *
603 */
604
605 PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
606
607 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
608 |UTF8_DISALLOW_ABOVE_31_BIT)));
609 assert(! UTF8_IS_INVARIANT(*s));
35f8c9bd 610
2b479609 611 /* A variant char must begin with a start byte */
35f8c9bd
KW
612 if (UNLIKELY(! UTF8_IS_START(*s))) {
613 return 0;
614 }
615
edc2c47a
KW
616 /* Examine a maximum of a single whole code point */
617 if (e - s > UTF8SKIP(s)) {
618 e = s + UTF8SKIP(s);
619 }
620
2b479609
KW
621 len = e - s;
622
623 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
624 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
35f8c9bd 625
2b479609
KW
626 /* The code below is derived from this table. Keep in mind that legal
627 * continuation bytes range between \x80..\xBF for UTF-8, and
628 * \xA0..\xBF for I8. Anything above those aren't continuation bytes.
629 * Hence, we don't have to test the upper edge because if any of those
630 * are encountered, the sequence is malformed, and will fail elsewhere
631 * in this function.
632 * UTF-8 UTF-EBCDIC I8
633 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
634 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
635 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
636 *
637 */
638
639#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
640# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
19794540 641# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
2b479609 642
19794540
KW
643# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
644 /* B6 and B7 */ \
645 && ((s1) & 0xFE ) == 0xB6)
2b479609
KW
646#else
647# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
19794540
KW
648# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
649# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
2b479609
KW
650#endif
651
652 if ( (flags & UTF8_DISALLOW_SUPER)
653 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) {
654 return 0; /* Above Unicode */
655 }
656
657 if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT)
658 && UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
659 {
660 return 0; /* Above 31 bits */
661 }
662
663 if (len > 1) {
664 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
665
666 if ( (flags & UTF8_DISALLOW_SUPER)
19794540 667 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
2b479609
KW
668 {
669 return 0; /* Above Unicode */
670 }
671
672 if ( (flags & UTF8_DISALLOW_SURROGATE)
19794540 673 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
2b479609
KW
674 {
675 return 0; /* Surrogate */
676 }
677
678 if ( (flags & UTF8_DISALLOW_NONCHAR)
679 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
680 {
681 return 0; /* Noncharacter code point */
682 }
683 }
684 }
685
686 /* Make sure that all that follows are continuation bytes */
35f8c9bd
KW
687 for (x = s + 1; x < e; x++) {
688 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
689 return 0;
690 }
691 }
692
af13dd8a 693 /* Here is syntactically valid. Next, make sure this isn't the start of an
12a4bed3
KW
694 * overlong. */
695 if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
696 return 0;
af13dd8a
KW
697 }
698
12a4bed3
KW
699 /* And finally, that the code point represented fits in a word on this
700 * platform */
701 if (does_utf8_overflow(s, e)) {
702 return 0;
35f8c9bd
KW
703 }
704
2b479609 705 return UTF8SKIP(s);
35f8c9bd
KW
706}
707
7cf8d05d
KW
708STATIC char *
709S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
710{
711 /* Returns a mortalized C string that is a displayable copy of the 'len'
712 * bytes starting at 's', each in a \xXY format. */
713
714 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
715 trailing NUL */
716 const U8 * const e = s + len;
717 char * output;
718 char * d;
719
720 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
721
722 Newx(output, output_len, char);
723 SAVEFREEPV(output);
724
725 d = output;
726 for (; s < e; s++) {
727 const unsigned high_nibble = (*s & 0xF0) >> 4;
728 const unsigned low_nibble = (*s & 0x0F);
729
730 *d++ = '\\';
731 *d++ = 'x';
732
733 if (high_nibble < 10) {
734 *d++ = high_nibble + '0';
735 }
736 else {
737 *d++ = high_nibble - 10 + 'a';
738 }
739
740 if (low_nibble < 10) {
741 *d++ = low_nibble + '0';
742 }
743 else {
744 *d++ = low_nibble - 10 + 'a';
745 }
746 }
747
748 *d = '\0';
749 return output;
750}
751
806547a7 752PERL_STATIC_INLINE char *
7cf8d05d
KW
753S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
754
755 /* How many bytes to print */
3cc6a05e 756 STRLEN print_len,
7cf8d05d
KW
757
758 /* Which one is the non-continuation */
759 const STRLEN non_cont_byte_pos,
760
761 /* How many bytes should there be? */
762 const STRLEN expect_len)
806547a7
KW
763{
764 /* Return the malformation warning text for an unexpected continuation
765 * byte. */
766
7cf8d05d 767 const char * const where = (non_cont_byte_pos == 1)
806547a7 768 ? "immediately"
7cf8d05d
KW
769 : Perl_form(aTHX_ "%d bytes",
770 (int) non_cont_byte_pos);
3cc6a05e 771 unsigned int i;
806547a7
KW
772
773 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
774
7cf8d05d
KW
775 /* We don't need to pass this parameter, but since it has already been
776 * calculated, it's likely faster to pass it; verify under DEBUGGING */
777 assert(expect_len == UTF8SKIP(s));
778
3cc6a05e
KW
779 /* It is possible that utf8n_to_uvchr() was called incorrectly, with a
780 * length that is larger than is actually available in the buffer. If we
781 * print all the bytes based on that length, we will read past the buffer
782 * end. Often, the strings are NUL terminated, so to lower the chances of
783 * this happening, print the malformed bytes only up through any NUL. */
784 for (i = 1; i < print_len; i++) {
785 if (*(s + i) == '\0') {
786 print_len = i + 1; /* +1 gets the NUL printed */
787 break;
788 }
789 }
790
7cf8d05d
KW
791 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
792 " %s after start byte 0x%02x; need %d bytes, got %d)",
793 malformed_text,
794 _byte_dump_string(s, print_len),
795 *(s + non_cont_byte_pos),
796 where,
797 *s,
798 (int) expect_len,
799 (int) non_cont_byte_pos);
806547a7
KW
800}
801
35f8c9bd
KW
802/*
803
de69f3af 804=for apidoc utf8n_to_uvchr
378516de
KW
805
806THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af 807Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
67e989fb 808
9041c2e3 809Bottom level UTF-8 decode routine.
de69f3af 810Returns the native code point value of the first character in the string C<s>,
746afd53
KW
811which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
812C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
813the length, in bytes, of that character.
949cf498
KW
814
815The value of C<flags> determines the behavior when C<s> does not point to a
2b5e7bc2
KW
816well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
817causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
818is the next possible position in C<s> that could begin a non-malformed
819character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
820is raised. Some UTF-8 input sequences may contain multiple malformations.
821This function tries to find every possible one in each call, so multiple
822warnings can be raised for each sequence.
949cf498
KW
823
824Various ALLOW flags can be set in C<flags> to allow (and not warn on)
825individual types of malformations, such as the sequence being overlong (that
826is, when there is a shorter sequence that can express the same code point;
827overlong sequences are expressly forbidden in the UTF-8 standard due to
828potential security issues). Another malformation example is the first byte of
829a character not being a legal first byte. See F<utf8.h> for the list of such
524080c4
KW
830flags. For allowed 0 length strings, this function returns 0; for allowed
831overlong sequences, the computed code point is returned; for all other allowed
832malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
833determinable reasonable value.
949cf498 834
796b6530 835The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
836flags) malformation is found. If this flag is set, the routine assumes that
837the caller will raise a warning, and this function will silently just set
d088425d
KW
838C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
839
75200dff 840Note that this API requires disambiguation between successful decoding a C<NUL>
796b6530 841character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
111fa700
KW
842in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
843be set to 1. To disambiguate, upon a zero return, see if the first byte of
844C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
f9380377 845error. Or you can use C<L</utf8n_to_uvchr_error>>.
949cf498
KW
846
847Certain code points are considered problematic. These are Unicode surrogates,
746afd53 848Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 849By default these are considered regular code points, but certain situations
ecc1615f
KW
850warrant special handling for them, which can be specified using the C<flags>
851parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
852three classes are treated as malformations and handled as such. The flags
853C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
854C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
855disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
856restricts the allowed inputs to the strict UTF-8 traditionally defined by
857Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
858definition given by
859L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
860The difference between traditional strictness and C9 strictness is that the
861latter does not forbid non-character code points. (They are still discouraged,
862however.) For more discussion see L<perlunicode/Noncharacter code points>.
863
864The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
865C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
796b6530
KW
866C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
867raised for their respective categories, but otherwise the code points are
868considered valid (not malformations). To get a category to both be treated as
869a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 870(But note that warnings are not raised if lexically disabled nor if
796b6530 871C<UTF8_CHECK_ONLY> is also specified.)
949cf498 872
760c7c2f
KW
873It is now deprecated to have very high code points (above C<IV_MAX> on the
874platforms) and this function will raise a deprecation warning for these (unless
d5944cab 875such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1)
760c7c2f 876in a 32-bit word.
ab8e6d41
KW
877
878Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
879so using them is more problematic than other above-Unicode code points. Perl
880invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
881likely that non-Perl languages will not be able to read files that contain
aff2be59 882these; nor would Perl understand files
ab8e6d41
KW
883written by something that uses a different extension. For these reasons, there
884is a separate set of flags that can warn and/or disallow these extremely high
885code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
886C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags. These
887are entirely independent from the deprecation warning for code points above
888C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
889code point that needs more than 31 bits to represent. When that happens,
890effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
89132-bit machines. (Of course C<UTF8_DISALLOW_SUPER> will treat all
ab8e6d41
KW
892above-Unicode code points, including these, as malformations; and
893C<UTF8_WARN_SUPER> warns on these.)
894
895On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
896extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
897than on ASCII. Prior to that, code points 2**31 and higher were simply
898unrepresentable, and a different, incompatible method was used to represent
899code points between 2**30 and 2**31 - 1. The flags C<UTF8_WARN_ABOVE_31_BIT>
900and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
901platforms, warning and disallowing 2**31 and higher.
949cf498
KW
902
903All other code points corresponding to Unicode characters, including private
904use and those yet to be assigned, are never considered malformed and never
905warn.
67e989fb 906
37607a96 907=cut
f9380377
KW
908
909Also implemented as a macro in utf8.h
910*/
911
912UV
913Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
914 STRLEN curlen,
915 STRLEN *retlen,
916 const U32 flags)
917{
918 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
919
920 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
921}
922
923/*
924
925=for apidoc utf8n_to_uvchr_error
926
927THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
928Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
929
930This function is for code that needs to know what the precise malformation(s)
931are when an error is found.
932
933It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
934all the others, C<errors>. If this parameter is 0, this function behaves
935identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
936to a C<U32> variable, which this function sets to indicate any errors found.
937Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
938C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
939of these bits will be set if a malformation is found, even if the input
7a65503b 940C<flags> parameter indicates that the given malformation is allowed; those
f9380377
KW
941exceptions are noted:
942
943=over 4
944
945=item C<UTF8_GOT_ABOVE_31_BIT>
946
947The code point represented by the input UTF-8 sequence occupies more than 31
948bits.
949This bit is set only if the input C<flags> parameter contains either the
950C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
951
952=item C<UTF8_GOT_CONTINUATION>
953
954The input sequence was malformed in that the first byte was a a UTF-8
955continuation byte.
956
957=item C<UTF8_GOT_EMPTY>
958
959The input C<curlen> parameter was 0.
960
961=item C<UTF8_GOT_LONG>
962
963The input sequence was malformed in that there is some other sequence that
964evaluates to the same code point, but that sequence is shorter than this one.
965
966=item C<UTF8_GOT_NONCHAR>
967
968The code point represented by the input UTF-8 sequence is for a Unicode
969non-character code point.
970This bit is set only if the input C<flags> parameter contains either the
971C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
972
973=item C<UTF8_GOT_NON_CONTINUATION>
974
975The input sequence was malformed in that a non-continuation type byte was found
976in a position where only a continuation type one should be.
977
978=item C<UTF8_GOT_OVERFLOW>
979
980The input sequence was malformed in that it is for a code point that is not
981representable in the number of bits available in a UV on the current platform.
982
983=item C<UTF8_GOT_SHORT>
984
985The input sequence was malformed in that C<curlen> is smaller than required for
986a complete sequence. In other words, the input is for a partial character
987sequence.
988
989=item C<UTF8_GOT_SUPER>
990
991The input sequence was malformed in that it is for a non-Unicode code point;
992that is, one above the legal Unicode maximum.
993This bit is set only if the input C<flags> parameter contains either the
994C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
995
996=item C<UTF8_GOT_SURROGATE>
997
998The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
999code point.
1000This bit is set only if the input C<flags> parameter contains either the
1001C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1002
1003=back
1004
1005=cut
37607a96 1006*/
67e989fb 1007
a0ed51b3 1008UV
f9380377
KW
1009Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
1010 STRLEN curlen,
1011 STRLEN *retlen,
1012 const U32 flags,
1013 U32 * errors)
a0ed51b3 1014{
d4c19fe8 1015 const U8 * const s0 = s;
2b5e7bc2
KW
1016 U8 * send = NULL; /* (initialized to silence compilers' wrong
1017 warning) */
1018 U32 possible_problems = 0; /* A bit is set here for each potential problem
1019 found as we go along */
eb83ed87 1020 UV uv = *s;
2b5e7bc2
KW
1021 STRLEN expectlen = 0; /* How long should this sequence be?
1022 (initialized to silence compilers' wrong
1023 warning) */
e308b348 1024 STRLEN avail_len = 0; /* When input is too short, gives what that is */
f9380377
KW
1025 U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
1026 this gets set and discarded */
a0dbb045 1027
2b5e7bc2
KW
1028 /* The below are used only if there is both an overlong malformation and a
1029 * too short one. Otherwise the first two are set to 's0' and 'send', and
1030 * the third not used at all */
1031 U8 * adjusted_s0 = (U8 *) s0;
5ec712b1
KW
1032 U8 * adjusted_send = NULL; /* (Initialized to silence compilers' wrong
1033 warning) */
2b5e7bc2 1034 UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
7918f24d 1035
f9380377
KW
1036 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1037
1038 if (errors) {
1039 *errors = 0;
1040 }
1041 else {
1042 errors = &discard_errors;
1043 }
a0dbb045 1044
eb83ed87
KW
1045 /* The order of malformation tests here is important. We should consume as
1046 * few bytes as possible in order to not skip any valid character. This is
1047 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1048 * http://unicode.org/reports/tr36 for more discussion as to why. For
1049 * example, once we've done a UTF8SKIP, we can tell the expected number of
1050 * bytes, and could fail right off the bat if the input parameters indicate
1051 * that there are too few available. But it could be that just that first
1052 * byte is garbled, and the intended character occupies fewer bytes. If we
1053 * blindly assumed that the first byte is correct, and skipped based on
1054 * that number, we could skip over a valid input character. So instead, we
1055 * always examine the sequence byte-by-byte.
1056 *
1057 * We also should not consume too few bytes, otherwise someone could inject
1058 * things. For example, an input could be deliberately designed to
1059 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 1060 * returning to the caller C<*retlen> pointing to the very next byte (one
eb83ed87
KW
1061 * which is actually part of of the overflowing sequence), that could look
1062 * legitimate to the caller, which could discard the initial partial
2b5e7bc2
KW
1063 * sequence and process the rest, inappropriately.
1064 *
1065 * Some possible input sequences are malformed in more than one way. This
1066 * function goes to lengths to try to find all of them. This is necessary
1067 * for correctness, as the inputs may allow one malformation but not
1068 * another, and if we abandon searching for others after finding the
1069 * allowed one, we could allow in something that shouldn't have been.
1070 */
eb83ed87 1071
b5b9af04 1072 if (UNLIKELY(curlen == 0)) {
2b5e7bc2
KW
1073 possible_problems |= UTF8_GOT_EMPTY;
1074 curlen = 0;
1075 uv = 0; /* XXX It could be argued that this should be
1076 UNICODE_REPLACEMENT? */
1077 goto ready_to_handle_errors;
0c443dc2
JH
1078 }
1079
eb83ed87
KW
1080 expectlen = UTF8SKIP(s);
1081
1082 /* A well-formed UTF-8 character, as the vast majority of calls to this
1083 * function will be for, has this expected length. For efficiency, set
1084 * things up here to return it. It will be overriden only in those rare
1085 * cases where a malformation is found */
1086 if (retlen) {
1087 *retlen = expectlen;
1088 }
1089
1090 /* An invariant is trivially well-formed */
1d72bdf6 1091 if (UTF8_IS_INVARIANT(uv)) {
de69f3af 1092 return uv;
a0ed51b3 1093 }
67e989fb 1094
eb83ed87 1095 /* A continuation character can't start a valid sequence */
b5b9af04 1096 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
2b5e7bc2
KW
1097 possible_problems |= UTF8_GOT_CONTINUATION;
1098 curlen = 1;
1099 uv = UNICODE_REPLACEMENT;
1100 goto ready_to_handle_errors;
ba210ebe 1101 }
9041c2e3 1102
dcd27b3c
KW
1103 /* Here is not a continuation byte, nor an invariant. The only thing left
1104 * is a start byte (possibly for an overlong) */
1105
534752c1
KW
1106 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1107 * that indicate the number of bytes in the character's whole UTF-8
1108 * sequence, leaving just the bits that are part of the value. */
1109 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
ba210ebe 1110
e308b348
KW
1111 /* Setup the loop end point, making sure to not look past the end of the
1112 * input string, and flag it as too short if the size isn't big enough. */
1113 send = (U8*) s0;
1114 if (UNLIKELY(curlen < expectlen)) {
1115 possible_problems |= UTF8_GOT_SHORT;
1116 avail_len = curlen;
1117 send += curlen;
1118 }
1119 else {
1120 send += expectlen;
1121 }
1122 adjusted_send = send;
1123
eb83ed87 1124 /* Now, loop through the remaining bytes in the character's sequence,
e308b348 1125 * accumulating each into the working value as we go. */
eb83ed87 1126 for (s = s0 + 1; s < send; s++) {
b5b9af04 1127 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
8850bf83 1128 uv = UTF8_ACCUMULATE(uv, *s);
2b5e7bc2
KW
1129 continue;
1130 }
1131
1132 /* Here, found a non-continuation before processing all expected bytes.
1133 * This byte indicates the beginning of a new character, so quit, even
1134 * if allowing this malformation. */
2b5e7bc2 1135 possible_problems |= UTF8_GOT_NON_CONTINUATION;
e308b348 1136 break;
eb83ed87
KW
1137 } /* End of loop through the character's bytes */
1138
1139 /* Save how many bytes were actually in the character */
1140 curlen = s - s0;
1141
e308b348
KW
1142 /* A convenience macro that matches either of the too-short conditions. */
1143# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
2f8f112e 1144
e308b348 1145 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
2b5e7bc2
KW
1146 uv_so_far = uv;
1147 uv = UNICODE_REPLACEMENT;
eb83ed87
KW
1148 }
1149
2b5e7bc2
KW
1150 /* Note that there are two types of too-short malformation. One is when
1151 * there is actual wrong data before the normal termination of the
1152 * sequence. The other is that the sequence wasn't complete before the end
1153 * of the data we are allowed to look at, based on the input 'curlen'.
1154 * This means that we were passed data for a partial character, but it is
1155 * valid as far as we saw. The other is definitely invalid. This
1156 * distinction could be important to a caller, so the two types are kept
1157 * separate. */
1158
1159 /* Check for overflow */
1160 if (UNLIKELY(does_utf8_overflow(s0, send))) {
1161 possible_problems |= UTF8_GOT_OVERFLOW;
1162 uv = UNICODE_REPLACEMENT;
eb83ed87 1163 }
eb83ed87 1164
2b5e7bc2
KW
1165 /* Check for overlong. If no problems so far, 'uv' is the correct code
1166 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1167 * we must look at the UTF-8 byte sequence itself to see if it is for an
1168 * overlong */
1169 if ( ( LIKELY(! possible_problems)
1170 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1171 || ( UNLIKELY( possible_problems)
1172 && ( UNLIKELY(! UTF8_IS_START(*s0))
1173 || ( curlen > 1
1174 && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
1175 send - s0))))))
2f8f112e 1176 {
2b5e7bc2
KW
1177 possible_problems |= UTF8_GOT_LONG;
1178
2b5e7bc2
KW
1179 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1180 UV min_uv = uv_so_far;
1181 STRLEN i;
1182
1183 /* Here, the input is both overlong and is missing some trailing
1184 * bytes. There is no single code point it could be for, but there
1185 * may be enough information present to determine if what we have
1186 * so far is for an unallowed code point, such as for a surrogate.
1187 * The code below has the intelligence to determine this, but just
1188 * for non-overlong UTF-8 sequences. What we do here is calculate
1189 * the smallest code point the input could represent if there were
1190 * no too short malformation. Then we compute and save the UTF-8
1191 * for that, which is what the code below looks at instead of the
1192 * raw input. It turns out that the smallest such code point is
1193 * all we need. */
1194 for (i = curlen; i < expectlen; i++) {
1195 min_uv = UTF8_ACCUMULATE(min_uv,
1196 I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
1197 }
1198
1199 Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8);
1200 SAVEFREEPV((U8 *) adjusted_s0); /* Needed because we may not get
1201 to free it ourselves if
1202 warnings are made fatal */
1203 adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1204 }
eb83ed87
KW
1205 }
1206
2b5e7bc2
KW
1207 /* Now check that the input isn't for a problematic code point not allowed
1208 * by the input parameters. */
1209 /* isn't problematic if < this */
1210 if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
1211 || ( UNLIKELY(possible_problems)
1212 && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
760c7c2f
KW
1213 && ((flags & ( UTF8_DISALLOW_NONCHAR
1214 |UTF8_DISALLOW_SURROGATE
1215 |UTF8_DISALLOW_SUPER
1216 |UTF8_DISALLOW_ABOVE_31_BIT
1217 |UTF8_WARN_NONCHAR
1218 |UTF8_WARN_SURROGATE
1219 |UTF8_WARN_SUPER
1220 |UTF8_WARN_ABOVE_31_BIT))
2b5e7bc2
KW
1221 /* In case of a malformation, 'uv' is not valid, and has
1222 * been changed to something in the Unicode range.
1223 * Currently we don't output a deprecation message if there
1224 * is already a malformation, so we don't have to special
1225 * case the test immediately below */
760c7c2f
KW
1226 || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1227 && ckWARN_d(WARN_DEPRECATED))))
eb83ed87 1228 {
2b5e7bc2
KW
1229 /* If there were no malformations, or the only malformation is an
1230 * overlong, 'uv' is valid */
1231 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1232 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1233 possible_problems |= UTF8_GOT_SURROGATE;
1234 }
1235 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
1236 possible_problems |= UTF8_GOT_SUPER;
1237 }
1238 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1239 possible_problems |= UTF8_GOT_NONCHAR;
1240 }
1241 }
1242 else { /* Otherwise, need to look at the source UTF-8, possibly
1243 adjusted to be non-overlong */
1244
1245 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1246 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
ea5ced44 1247 {
2b5e7bc2
KW
1248 possible_problems |= UTF8_GOT_SUPER;
1249 }
1250 else if (curlen > 1) {
1251 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
1252 NATIVE_UTF8_TO_I8(*adjusted_s0),
1253 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
ea5ced44 1254 {
2b5e7bc2 1255 possible_problems |= UTF8_GOT_SUPER;
ea5ced44 1256 }
2b5e7bc2
KW
1257 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
1258 NATIVE_UTF8_TO_I8(*adjusted_s0),
1259 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1260 {
1261 possible_problems |= UTF8_GOT_SURROGATE;
ea5ced44
KW
1262 }
1263 }
c0236afe 1264
2b5e7bc2
KW
1265 /* We need a complete well-formed UTF-8 character to discern
1266 * non-characters, so can't look for them here */
1267 }
1268 }
949cf498 1269
2b5e7bc2
KW
1270 ready_to_handle_errors:
1271
1272 /* At this point:
1273 * curlen contains the number of bytes in the sequence that
1274 * this call should advance the input by.
e308b348
KW
1275 * avail_len gives the available number of bytes passed in, but
1276 * only if this is less than the expected number of
1277 * bytes, based on the code point's start byte.
2b5e7bc2
KW
1278 * possible_problems' is 0 if there weren't any problems; otherwise a bit
1279 * is set in it for each potential problem found.
1280 * uv contains the code point the input sequence
1281 * represents; or if there is a problem that prevents
1282 * a well-defined value from being computed, it is
1283 * some subsitute value, typically the REPLACEMENT
1284 * CHARACTER.
1285 * s0 points to the first byte of the character
1286 * send points to just after where that (potentially
1287 * partial) character ends
1288 * adjusted_s0 normally is the same as s0, but in case of an
1289 * overlong for which the UTF-8 matters below, it is
1290 * the first byte of the shortest form representation
1291 * of the input.
1292 * adjusted_send normally is the same as 'send', but if adjusted_s0
1293 * is set to something other than s0, this points one
1294 * beyond its end
1295 */
eb83ed87 1296
2b5e7bc2
KW
1297 if (UNLIKELY(possible_problems)) {
1298 bool disallowed = FALSE;
1299 const U32 orig_problems = possible_problems;
1300
1301 while (possible_problems) { /* Handle each possible problem */
1302 UV pack_warn = 0;
1303 char * message = NULL;
1304
1305 /* Each 'if' clause handles one problem. They are ordered so that
1306 * the first ones' messages will be displayed before the later
1307 * ones; this is kinda in decreasing severity order */
1308 if (possible_problems & UTF8_GOT_OVERFLOW) {
1309
1310 /* Overflow means also got a super and above 31 bits, but we
1311 * handle all three cases here */
1312 possible_problems
1313 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
f9380377
KW
1314 *errors |= UTF8_GOT_OVERFLOW;
1315
1316 /* But the API says we flag all errors found */
1317 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1318 *errors |= UTF8_GOT_SUPER;
1319 }
1320 if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
1321 *errors |= UTF8_GOT_ABOVE_31_BIT;
1322 }
2b5e7bc2
KW
1323
1324 disallowed = TRUE;
1325
1326 /* The warnings code explicitly says it doesn't handle the case
1327 * of packWARN2 and two categories which have parent-child
1328 * relationship. Even if it works now to raise the warning if
1329 * either is enabled, it wouldn't necessarily do so in the
1330 * future. We output (only) the most dire warning*/
1331 if (! (flags & UTF8_CHECK_ONLY)) {
1332 if (ckWARN_d(WARN_UTF8)) {
1333 pack_warn = packWARN(WARN_UTF8);
1334 }
1335 else if (ckWARN_d(WARN_NON_UNICODE)) {
1336 pack_warn = packWARN(WARN_NON_UNICODE);
1337 }
1338 if (pack_warn) {
1339 message = Perl_form(aTHX_ "%s: %s (overflows)",
1340 malformed_text,
1341 _byte_dump_string(s0, send - s0));
1342 }
1343 }
1344 }
1345 else if (possible_problems & UTF8_GOT_EMPTY) {
1346 possible_problems &= ~UTF8_GOT_EMPTY;
f9380377 1347 *errors |= UTF8_GOT_EMPTY;
2b5e7bc2
KW
1348
1349 if (! (flags & UTF8_ALLOW_EMPTY)) {
1350 disallowed = TRUE;
1351 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1352 pack_warn = packWARN(WARN_UTF8);
1353 message = Perl_form(aTHX_ "%s (empty string)",
1354 malformed_text);
1355 }
1356 }
1357 }
1358 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1359 possible_problems &= ~UTF8_GOT_CONTINUATION;
f9380377 1360 *errors |= UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1361
1362 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1363 disallowed = TRUE;
1364 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1365 pack_warn = packWARN(WARN_UTF8);
1366 message = Perl_form(aTHX_
1367 "%s: %s (unexpected continuation byte 0x%02x,"
1368 " with no preceding start byte)",
1369 malformed_text,
1370 _byte_dump_string(s0, 1), *s0);
1371 }
1372 }
1373 }
2b5e7bc2
KW
1374 else if (possible_problems & UTF8_GOT_SHORT) {
1375 possible_problems &= ~UTF8_GOT_SHORT;
f9380377 1376 *errors |= UTF8_GOT_SHORT;
2b5e7bc2
KW
1377
1378 if (! (flags & UTF8_ALLOW_SHORT)) {
1379 disallowed = TRUE;
1380 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1381 pack_warn = packWARN(WARN_UTF8);
1382 message = Perl_form(aTHX_
9a6c9c81 1383 "%s: %s (too short; %d byte%s available, need %d)",
2b5e7bc2
KW
1384 malformed_text,
1385 _byte_dump_string(s0, send - s0),
e308b348
KW
1386 (int)avail_len,
1387 avail_len == 1 ? "" : "s",
2b5e7bc2
KW
1388 (int)expectlen);
1389 }
1390 }
ba210ebe 1391
2b5e7bc2 1392 }
e308b348
KW
1393 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1394 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1395 *errors |= UTF8_GOT_NON_CONTINUATION;
1396
1397 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1398 disallowed = TRUE;
1399 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1400 pack_warn = packWARN(WARN_UTF8);
1401 message = Perl_form(aTHX_ "%s",
1402 unexpected_non_continuation_text(s0,
1403 send - s0,
1404 s - s0,
1405 (int) expectlen));
1406 }
1407 }
1408 }
2b5e7bc2
KW
1409 else if (possible_problems & UTF8_GOT_LONG) {
1410 possible_problems &= ~UTF8_GOT_LONG;
f9380377 1411 *errors |= UTF8_GOT_LONG;
2b5e7bc2
KW
1412
1413 if (! (flags & UTF8_ALLOW_LONG)) {
1414 disallowed = TRUE;
1415
1416 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1417 pack_warn = packWARN(WARN_UTF8);
1418
1419 /* These error types cause 'uv' to be something that
1420 * isn't what was intended, so can't use it in the
1421 * message. The other error types either can't
1422 * generate an overlong, or else the 'uv' is valid */
1423 if (orig_problems &
1424 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1425 {
1426 message = Perl_form(aTHX_
1427 "%s: %s (any UTF-8 sequence that starts"
1428 " with \"%s\" is overlong which can and"
1429 " should be represented with a"
1430 " different, shorter sequence)",
1431 malformed_text,
1432 _byte_dump_string(s0, send - s0),
1433 _byte_dump_string(s0, curlen));
1434 }
1435 else {
1436 U8 tmpbuf[UTF8_MAXBYTES+1];
1437 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
1438 uv, 0);
1439 message = Perl_form(aTHX_
1440 "%s: %s (overlong; instead use %s to represent"
147e3846 1441 " U+%0*" UVXf ")",
2b5e7bc2
KW
1442 malformed_text,
1443 _byte_dump_string(s0, send - s0),
1444 _byte_dump_string(tmpbuf, e - tmpbuf),
1445 ((uv < 256) ? 2 : 4), /* Field width of 2 for
1446 small code points */
1447 uv);
1448 }
1449 }
1450 }
1451 }
1452 else if (possible_problems & UTF8_GOT_SURROGATE) {
1453 possible_problems &= ~UTF8_GOT_SURROGATE;
1454
f9380377
KW
1455 if (flags & UTF8_WARN_SURROGATE) {
1456 *errors |= UTF8_GOT_SURROGATE;
1457
1458 if ( ! (flags & UTF8_CHECK_ONLY)
1459 && ckWARN_d(WARN_SURROGATE))
1460 {
2b5e7bc2
KW
1461 pack_warn = packWARN(WARN_SURROGATE);
1462
1463 /* These are the only errors that can occur with a
1464 * surrogate when the 'uv' isn't valid */
1465 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1466 message = Perl_form(aTHX_
1467 "UTF-16 surrogate (any UTF-8 sequence that"
1468 " starts with \"%s\" is for a surrogate)",
1469 _byte_dump_string(s0, curlen));
1470 }
1471 else {
1472 message = Perl_form(aTHX_
147e3846 1473 "UTF-16 surrogate U+%04" UVXf, uv);
2b5e7bc2 1474 }
f9380377 1475 }
2b5e7bc2 1476 }
ba210ebe 1477
2b5e7bc2
KW
1478 if (flags & UTF8_DISALLOW_SURROGATE) {
1479 disallowed = TRUE;
f9380377 1480 *errors |= UTF8_GOT_SURROGATE;
2b5e7bc2
KW
1481 }
1482 }
1483 else if (possible_problems & UTF8_GOT_SUPER) {
1484 possible_problems &= ~UTF8_GOT_SUPER;
949cf498 1485
f9380377
KW
1486 if (flags & UTF8_WARN_SUPER) {
1487 *errors |= UTF8_GOT_SUPER;
1488
1489 if ( ! (flags & UTF8_CHECK_ONLY)
1490 && ckWARN_d(WARN_NON_UNICODE))
1491 {
2b5e7bc2
KW
1492 pack_warn = packWARN(WARN_NON_UNICODE);
1493
1494 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1495 message = Perl_form(aTHX_
1496 "Any UTF-8 sequence that starts with"
1497 " \"%s\" is for a non-Unicode code point,"
1498 " may not be portable",
1499 _byte_dump_string(s0, curlen));
1500 }
1501 else {
1502 message = Perl_form(aTHX_
147e3846 1503 "Code point 0x%04" UVXf " is not"
2b5e7bc2
KW
1504 " Unicode, may not be portable",
1505 uv);
1506 }
f9380377 1507 }
2b5e7bc2 1508 }
ba210ebe 1509
2b5e7bc2
KW
1510 /* The maximum code point ever specified by a standard was
1511 * 2**31 - 1. Anything larger than that is a Perl extension
1512 * that very well may not be understood by other applications
1513 * (including earlier perl versions on EBCDIC platforms). We
1514 * test for these after the regular SUPER ones, and before
1515 * possibly bailing out, so that the slightly more dire warning
1516 * will override the regular one. */
1517 if ( (flags & (UTF8_WARN_ABOVE_31_BIT
1518 |UTF8_WARN_SUPER
1519 |UTF8_DISALLOW_ABOVE_31_BIT))
1520 && ( ( UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
1521 && UNLIKELY(is_utf8_cp_above_31_bits(
1522 adjusted_s0,
1523 adjusted_send)))
1524 || ( LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
1525 && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv)))))
1526 {
1527 if ( ! (flags & UTF8_CHECK_ONLY)
1528 && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
1529 && ckWARN_d(WARN_UTF8))
1530 {
1531 pack_warn = packWARN(WARN_UTF8);
1532
1533 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1534 message = Perl_form(aTHX_
1535 "Any UTF-8 sequence that starts with"
1536 " \"%s\" is for a non-Unicode code"
1537 " point, and is not portable",
1538 _byte_dump_string(s0, curlen));
1539 }
1540 else {
1541 message = Perl_form(aTHX_
147e3846 1542 "Code point 0x%" UVXf " is not Unicode,"
2b5e7bc2
KW
1543 " and not portable",
1544 uv);
1545 }
1546 }
1547
f9380377
KW
1548 if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
1549 *errors |= UTF8_GOT_ABOVE_31_BIT;
1550
1551 if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
1552 disallowed = TRUE;
1553 }
2b5e7bc2
KW
1554 }
1555 }
eb83ed87 1556
2b5e7bc2 1557 if (flags & UTF8_DISALLOW_SUPER) {
f9380377 1558 *errors |= UTF8_GOT_SUPER;
2b5e7bc2
KW
1559 disallowed = TRUE;
1560 }
eb83ed87 1561
2b5e7bc2
KW
1562 /* The deprecated warning overrides any non-deprecated one. If
1563 * there are other problems, a deprecation message is not
1564 * really helpful, so don't bother to raise it in that case.
1565 * This also keeps the code from having to handle the case
1566 * where 'uv' is not valid. */
1567 if ( ! (orig_problems
1568 & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1569 && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1570 && ckWARN_d(WARN_DEPRECATED))
1571 {
1572 message = Perl_form(aTHX_ cp_above_legal_max,
1573 uv, MAX_NON_DEPRECATED_CP);
1574 pack_warn = packWARN(WARN_DEPRECATED);
1575 }
1576 }
1577 else if (possible_problems & UTF8_GOT_NONCHAR) {
1578 possible_problems &= ~UTF8_GOT_NONCHAR;
ba210ebe 1579
f9380377
KW
1580 if (flags & UTF8_WARN_NONCHAR) {
1581 *errors |= UTF8_GOT_NONCHAR;
1582
1583 if ( ! (flags & UTF8_CHECK_ONLY)
1584 && ckWARN_d(WARN_NONCHAR))
1585 {
2b5e7bc2
KW
1586 /* The code above should have guaranteed that we don't
1587 * get here with errors other than overlong */
1588 assert (! (orig_problems
1589 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
1590
1591 pack_warn = packWARN(WARN_NONCHAR);
1592 message = Perl_form(aTHX_ "Unicode non-character"
147e3846 1593 " U+%04" UVXf " is not recommended"
2b5e7bc2 1594 " for open interchange", uv);
f9380377 1595 }
2b5e7bc2 1596 }
5b311467 1597
2b5e7bc2
KW
1598 if (flags & UTF8_DISALLOW_NONCHAR) {
1599 disallowed = TRUE;
f9380377 1600 *errors |= UTF8_GOT_NONCHAR;
2b5e7bc2
KW
1601 }
1602 } /* End of looking through the possible flags */
1603
1604 /* Display the message (if any) for the problem being handled in
1605 * this iteration of the loop */
1606 if (message) {
1607 if (PL_op)
1608 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
1609 OP_DESC(PL_op));
1610 else
1611 Perl_warner(aTHX_ pack_warn, "%s", message);
1612 }
1613 } /* End of 'while (possible_problems) {' */
a0dbb045 1614
2b5e7bc2
KW
1615 /* Since there was a possible problem, the returned length may need to
1616 * be changed from the one stored at the beginning of this function.
1617 * Instead of trying to figure out if that's needed, just do it. */
1618 if (retlen) {
1619 *retlen = curlen;
1620 }
a0dbb045 1621
2b5e7bc2
KW
1622 if (disallowed) {
1623 if (flags & UTF8_CHECK_ONLY && retlen) {
1624 *retlen = ((STRLEN) -1);
1625 }
1626 return 0;
1627 }
eb83ed87 1628 }
ba210ebe 1629
2b5e7bc2 1630 return UNI_TO_NATIVE(uv);
a0ed51b3
LW
1631}
1632
8e84507e 1633/*
ec5f19d0
KW
1634=for apidoc utf8_to_uvchr_buf
1635
1636Returns the native code point of the first character in the string C<s> which
1637is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 1638C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 1639
524080c4
KW
1640If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1641enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 1642C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 1643(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 1644C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 1645the next possible position in C<s> that could begin a non-malformed character.
de69f3af 1646See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 1647returned.
ec5f19d0 1648
760c7c2f
KW
1649Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1650unless those are turned off.
1651
ec5f19d0 1652=cut
52be2536
KW
1653
1654Also implemented as a macro in utf8.h
1655
ec5f19d0
KW
1656*/
1657
1658
1659UV
1660Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1661{
ec5f19d0
KW
1662 assert(s < send);
1663
1664 return utf8n_to_uvchr(s, send - s, retlen,
1665 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1666}
1667
52be2536
KW
1668/* This is marked as deprecated
1669 *
ec5f19d0
KW
1670=for apidoc utf8_to_uvuni_buf
1671
de69f3af
KW
1672Only in very rare circumstances should code need to be dealing in Unicode
1673(as opposed to native) code points. In those few cases, use
1674C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
4f83cdcd
KW
1675
1676Returns the Unicode (not-native) code point of the first character in the
1677string C<s> which
ec5f19d0
KW
1678is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1679C<retlen> will be set to the length, in bytes, of that character.
1680
524080c4
KW
1681If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1682enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1683NULL) to -1. If those warnings are off, the computed value if well-defined (or
1684the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1685is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1686next possible position in C<s> that could begin a non-malformed character.
de69f3af 1687See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0 1688
760c7c2f
KW
1689Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1690unless those are turned off.
1691
ec5f19d0
KW
1692=cut
1693*/
1694
1695UV
1696Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1697{
1698 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1699
1700 assert(send > s);
1701
5962d97e
KW
1702 /* Call the low level routine, asking for checks */
1703 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
ec5f19d0
KW
1704}
1705
b76347f2 1706/*
87cea99e 1707=for apidoc utf8_length
b76347f2
JH
1708
1709Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
1710Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
1711up past C<e>, croaks.
b76347f2
JH
1712
1713=cut
1714*/
1715
1716STRLEN
35a4481c 1717Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
1718{
1719 STRLEN len = 0;
1720
7918f24d
NC
1721 PERL_ARGS_ASSERT_UTF8_LENGTH;
1722
8850bf83
JH
1723 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1724 * the bitops (especially ~) can create illegal UTF-8.
1725 * In other words: in Perl UTF-8 is not just for Unicode. */
1726
a3b680e6
AL
1727 if (e < s)
1728 goto warn_and_return;
b76347f2 1729 while (s < e) {
4cbf4130 1730 s += UTF8SKIP(s);
8e91ec7f
AV
1731 len++;
1732 }
1733
1734 if (e != s) {
1735 len--;
1736 warn_and_return:
9b387841
NC
1737 if (PL_op)
1738 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1739 "%s in %s", unees, OP_DESC(PL_op));
1740 else
61a12c31 1741 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
1742 }
1743
1744 return len;
1745}
1746
b06226ff 1747/*
fed3ba5d
NC
1748=for apidoc bytes_cmp_utf8
1749
a1433954 1750Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
1751sequence of characters (stored as UTF-8)
1752in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
1753equal, -1 or -2 if the first string is less than the second string, +1 or +2
1754if the first string is greater than the second string.
1755
1756-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
1757longer string. -2 or +2 is returned if
1758there was a difference between characters
fed3ba5d
NC
1759within the strings.
1760
1761=cut
1762*/
1763
1764int
1765Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1766{
1767 const U8 *const bend = b + blen;
1768 const U8 *const uend = u + ulen;
1769
1770 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
1771
1772 while (b < bend && u < uend) {
1773 U8 c = *u++;
1774 if (!UTF8_IS_INVARIANT(c)) {
1775 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1776 if (u < uend) {
1777 U8 c1 = *u++;
1778 if (UTF8_IS_CONTINUATION(c1)) {
a62b247b 1779 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
fed3ba5d 1780 } else {
2b5e7bc2 1781 /* diag_listed_as: Malformed UTF-8 character%s */
fed3ba5d 1782 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
806547a7 1783 "%s %s%s",
7cf8d05d 1784 unexpected_non_continuation_text(u - 1, 2, 1, 2),
806547a7
KW
1785 PL_op ? " in " : "",
1786 PL_op ? OP_DESC(PL_op) : "");
fed3ba5d
NC
1787 return -2;
1788 }
1789 } else {
1790 if (PL_op)
1791 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1792 "%s in %s", unees, OP_DESC(PL_op));
1793 else
61a12c31 1794 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
1795 return -2; /* Really want to return undef :-) */
1796 }
1797 } else {
1798 return -2;
1799 }
1800 }
1801 if (*b != c) {
1802 return *b < c ? -2 : +2;
1803 }
1804 ++b;
1805 }
1806
1807 if (b == bend && u == uend)
1808 return 0;
1809
1810 return b < bend ? +1 : -1;
1811}
1812
1813/*
87cea99e 1814=for apidoc utf8_to_bytes
6940069f 1815
2bbc8d55 1816Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954
KW
1817Unlike L</bytes_to_utf8>, this over-writes the original string, and
1818updates C<len> to contain the new length.
67e989fb 1819Returns zero on failure, setting C<len> to -1.
6940069f 1820
a1433954 1821If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 1822
6940069f
GS
1823=cut
1824*/
1825
1826U8 *
37607a96 1827Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 1828{
d4c19fe8
AL
1829 U8 * const save = s;
1830 U8 * const send = s + *len;
6940069f 1831 U8 *d;
246fae53 1832
7918f24d 1833 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 1834 PERL_UNUSED_CONTEXT;
7918f24d 1835
1e54db1a 1836 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 1837 while (s < send) {
d59937ca
KW
1838 if (! UTF8_IS_INVARIANT(*s)) {
1839 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1840 *len = ((STRLEN) -1);
1841 return 0;
1842 }
1843 s++;
dcad2880 1844 }
d59937ca 1845 s++;
246fae53 1846 }
dcad2880
JH
1847
1848 d = s = save;
6940069f 1849 while (s < send) {
80e0b38f
KW
1850 U8 c = *s++;
1851 if (! UTF8_IS_INVARIANT(c)) {
1852 /* Then it is two-byte encoded */
a62b247b 1853 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
80e0b38f
KW
1854 s++;
1855 }
1856 *d++ = c;
6940069f
GS
1857 }
1858 *d = '\0';
246fae53 1859 *len = d - save;
6940069f
GS
1860 return save;
1861}
1862
1863/*
87cea99e 1864=for apidoc bytes_from_utf8
f9a63242 1865
2bbc8d55 1866Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954 1867Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
1868the newly-created string, and updates C<len> to contain the new
1869length. Returns the original string if no conversion occurs, C<len>
72d33970 1870is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55 18710 if C<s> is converted or consisted entirely of characters that are invariant
4a4088c4 1872in UTF-8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 1873
37607a96
PK
1874=cut
1875*/
f9a63242
JH
1876
1877U8 *
e1ec3a88 1878Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 1879{
f9a63242 1880 U8 *d;
e1ec3a88
AL
1881 const U8 *start = s;
1882 const U8 *send;
f9a63242
JH
1883 I32 count = 0;
1884
7918f24d 1885 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 1886 PERL_UNUSED_CONTEXT;
f9a63242 1887 if (!*is_utf8)
73d840c0 1888 return (U8 *)start;
f9a63242 1889
1e54db1a 1890 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 1891 for (send = s + *len; s < send;) {
d59937ca
KW
1892 if (! UTF8_IS_INVARIANT(*s)) {
1893 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
73d840c0 1894 return (U8 *)start;
d59937ca
KW
1895 }
1896 count++;
1897 s++;
db42d148 1898 }
d59937ca 1899 s++;
f9a63242
JH
1900 }
1901
35da51f7 1902 *is_utf8 = FALSE;
f9a63242 1903
212542aa 1904 Newx(d, (*len) - count + 1, U8);
ef9edfd0 1905 s = start; start = d;
f9a63242
JH
1906 while (s < send) {
1907 U8 c = *s++;
1a91c45d 1908 if (! UTF8_IS_INVARIANT(c)) {
c4d5f83a 1909 /* Then it is two-byte encoded */
a62b247b 1910 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
1a91c45d 1911 s++;
c4d5f83a
NIS
1912 }
1913 *d++ = c;
f9a63242
JH
1914 }
1915 *d = '\0';
1916 *len = d - start;
73d840c0 1917 return (U8 *)start;
f9a63242
JH
1918}
1919
1920/*
87cea99e 1921=for apidoc bytes_to_utf8
6940069f 1922
ff97e5cf
KW
1923Converts a string C<s> of length C<len> bytes from the native encoding into
1924UTF-8.
6662521e 1925Returns a pointer to the newly-created string, and sets C<len> to
ff97e5cf 1926reflect the new length in bytes.
6940069f 1927
75200dff 1928A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
1929
1930If you want to convert to UTF-8 from encodings other than
1931the native (Latin1 or EBCDIC),
a1433954 1932see L</sv_recode_to_utf8>().
c9ada85f 1933
497711e7 1934=cut
6940069f
GS
1935*/
1936
c682ebef
FC
1937/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1938 likewise need duplication. */
1939
6940069f 1940U8*
35a4481c 1941Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 1942{
35a4481c 1943 const U8 * const send = s + (*len);
6940069f
GS
1944 U8 *d;
1945 U8 *dst;
7918f24d
NC
1946
1947 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 1948 PERL_UNUSED_CONTEXT;
6940069f 1949
212542aa 1950 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
1951 dst = d;
1952
1953 while (s < send) {
55d09dc8
KW
1954 append_utf8_from_native_byte(*s, &d);
1955 s++;
6940069f
GS
1956 }
1957 *d = '\0';
6662521e 1958 *len = d-dst;
6940069f
GS
1959 return dst;
1960}
1961
a0ed51b3 1962/*
dea0fc0b 1963 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
1964 *
1965 * Destination must be pre-extended to 3/2 source. Do not use in-place.
1966 * We optimize for native, for obvious reasons. */
1967
1968U8*
dea0fc0b 1969Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 1970{
dea0fc0b
JH
1971 U8* pend;
1972 U8* dstart = d;
1973
7918f24d
NC
1974 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1975
dea0fc0b 1976 if (bytelen & 1)
147e3846 1977 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf, (UV)bytelen);
dea0fc0b
JH
1978
1979 pend = p + bytelen;
1980
a0ed51b3 1981 while (p < pend) {
dea0fc0b
JH
1982 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1983 p += 2;
2d1545e5 1984 if (OFFUNI_IS_INVARIANT(uv)) {
56d37426 1985 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
1986 continue;
1987 }
56d37426
KW
1988 if (uv <= MAX_UTF8_TWO_BYTE) {
1989 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1990 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
1991 continue;
1992 }
46956fad
KW
1993#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1994#define LAST_HIGH_SURROGATE 0xDBFF
1995#define FIRST_LOW_SURROGATE 0xDC00
1996#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
e23c50db
KW
1997
1998 /* This assumes that most uses will be in the first Unicode plane, not
1999 * needing surrogates */
2000 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
2001 && uv <= UNICODE_SURROGATE_LAST))
2002 {
2003 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2004 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2005 }
2006 else {
01ea242b 2007 UV low = (p[0] << 8) + p[1];
e23c50db
KW
2008 if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
2009 || UNLIKELY(low > LAST_LOW_SURROGATE))
2010 {
01ea242b 2011 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db
KW
2012 }
2013 p += 2;
46956fad
KW
2014 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
2015 + (low - FIRST_LOW_SURROGATE) + 0x10000;
01ea242b 2016 }
a0ed51b3 2017 }
56d37426
KW
2018#ifdef EBCDIC
2019 d = uvoffuni_to_utf8_flags(d, uv, 0);
2020#else
a0ed51b3 2021 if (uv < 0x10000) {
eb160463
GS
2022 *d++ = (U8)(( uv >> 12) | 0xe0);
2023 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2024 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2025 continue;
2026 }
2027 else {
eb160463
GS
2028 *d++ = (U8)(( uv >> 18) | 0xf0);
2029 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
2030 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2031 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2032 continue;
2033 }
56d37426 2034#endif
a0ed51b3 2035 }
dea0fc0b 2036 *newlen = d - dstart;
a0ed51b3
LW
2037 return d;
2038}
2039
2040/* Note: this one is slightly destructive of the source. */
2041
2042U8*
dea0fc0b 2043Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
2044{
2045 U8* s = (U8*)p;
d4c19fe8 2046 U8* const send = s + bytelen;
7918f24d
NC
2047
2048 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2049
e0ea5e2d 2050 if (bytelen & 1)
147e3846 2051 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
e0ea5e2d
NC
2052 (UV)bytelen);
2053
a0ed51b3 2054 while (s < send) {
d4c19fe8 2055 const U8 tmp = s[0];
a0ed51b3
LW
2056 s[0] = s[1];
2057 s[1] = tmp;
2058 s += 2;
2059 }
dea0fc0b 2060 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
2061}
2062
922e8cb4
KW
2063bool
2064Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2065{
2066 U8 tmpbuf[UTF8_MAXBYTES+1];
2067 uvchr_to_utf8(tmpbuf, c);
2068 return _is_utf8_FOO(classnum, tmpbuf);
2069}
2070
f9ae8fb6
JD
2071/* Internal function so we can deprecate the external one, and call
2072 this one from other deprecated functions in this file */
2073
f2645549
KW
2074bool
2075Perl__is_utf8_idstart(pTHX_ const U8 *p)
61b19385 2076{
f2645549 2077 PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
61b19385
KW
2078
2079 if (*p == '_')
2080 return TRUE;
f25ce844 2081 return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
61b19385
KW
2082}
2083
5092f92a 2084bool
eba68aa0
KW
2085Perl__is_uni_perl_idcont(pTHX_ UV c)
2086{
2087 U8 tmpbuf[UTF8_MAXBYTES+1];
2088 uvchr_to_utf8(tmpbuf, c);
2089 return _is_utf8_perl_idcont(tmpbuf);
2090}
2091
2092bool
f91dcd13
KW
2093Perl__is_uni_perl_idstart(pTHX_ UV c)
2094{
2095 U8 tmpbuf[UTF8_MAXBYTES+1];
2096 uvchr_to_utf8(tmpbuf, c);
2097 return _is_utf8_perl_idstart(tmpbuf);
2098}
2099
3a4c58c9
KW
2100UV
2101Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
2102{
2103 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2104 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
2105 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2106 * either "SS" or "Ss". Which one to use is passed into the routine in
2107 * 'S_or_s' to avoid a test */
2108
2109 UV converted = toUPPER_LATIN1_MOD(c);
2110
2111 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2112
2113 assert(S_or_s == 'S' || S_or_s == 's');
2114
6f2d5cbc 2115 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 2116 characters in this range */
3a4c58c9
KW
2117 *p = (U8) converted;
2118 *lenp = 1;
2119 return converted;
2120 }
2121
2122 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2123 * which it maps to one of them, so as to only have to have one check for
2124 * it in the main case */
2125 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2126 switch (c) {
2127 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2128 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2129 break;
2130 case MICRO_SIGN:
2131 converted = GREEK_CAPITAL_LETTER_MU;
2132 break;
79e064b9
KW
2133#if UNICODE_MAJOR_VERSION > 2 \
2134 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2135 && UNICODE_DOT_DOT_VERSION >= 8)
3a4c58c9
KW
2136 case LATIN_SMALL_LETTER_SHARP_S:
2137 *(p)++ = 'S';
2138 *p = S_or_s;
2139 *lenp = 2;
2140 return 'S';
79e064b9 2141#endif
3a4c58c9
KW
2142 default:
2143 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
e5964223 2144 NOT_REACHED; /* NOTREACHED */
3a4c58c9
KW
2145 }
2146 }
2147
2148 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2149 *p = UTF8_TWO_BYTE_LO(converted);
2150 *lenp = 2;
2151
2152 return converted;
2153}
2154
50bda2c3
KW
2155/* Call the function to convert a UTF-8 encoded character to the specified case.
2156 * Note that there may be more than one character in the result.
2157 * INP is a pointer to the first byte of the input character
2158 * OUTP will be set to the first byte of the string of changed characters. It
2159 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2160 * LENP will be set to the length in bytes of the string of changed characters
2161 *
2162 * The functions return the ordinal of the first character in the string of OUTP */
b9992569
KW
2163#define CALL_UPPER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
2164#define CALL_TITLE_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
2165#define CALL_LOWER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
50bda2c3 2166
b9992569
KW
2167/* This additionally has the input parameter 'specials', which if non-zero will
2168 * cause this to use the specials hash for folding (meaning get full case
50bda2c3 2169 * folding); otherwise, when zero, this implies a simple case fold */
b9992569 2170#define CALL_FOLD_CASE(uv, s, d, lenp, specials) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
c3fd2246 2171
84afefe6
JH
2172UV
2173Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2174{
a1433954
KW
2175 /* Convert the Unicode character whose ordinal is <c> to its uppercase
2176 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2177 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
2178 * the changed version may be longer than the original character.
2179 *
2180 * The ordinal of the first character of the changed version is returned
2181 * (but note, as explained above, that there may be more.) */
2182
7918f24d
NC
2183 PERL_ARGS_ASSERT_TO_UNI_UPPER;
2184
3a4c58c9
KW
2185 if (c < 256) {
2186 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2187 }
2188
0ebc6274 2189 uvchr_to_utf8(p, c);
b9992569 2190 return CALL_UPPER_CASE(c, p, p, lenp);
a0ed51b3
LW
2191}
2192
84afefe6
JH
2193UV
2194Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2195{
7918f24d
NC
2196 PERL_ARGS_ASSERT_TO_UNI_TITLE;
2197
3a4c58c9
KW
2198 if (c < 256) {
2199 return _to_upper_title_latin1((U8) c, p, lenp, 's');
2200 }
2201
0ebc6274 2202 uvchr_to_utf8(p, c);
b9992569 2203 return CALL_TITLE_CASE(c, p, p, lenp);
a0ed51b3
LW
2204}
2205
afc16117 2206STATIC U8
81611534 2207S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp)
afc16117
KW
2208{
2209 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2210 * those, converting the result to UTF-8. Since the result is always just
a1433954 2211 * one character, we allow <p> to be NULL */
afc16117
KW
2212
2213 U8 converted = toLOWER_LATIN1(c);
2214
2215 if (p != NULL) {
6f2d5cbc 2216 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
2217 *p = converted;
2218 *lenp = 1;
2219 }
2220 else {
430c9760
KW
2221 /* Result is known to always be < 256, so can use the EIGHT_BIT
2222 * macros */
2223 *p = UTF8_EIGHT_BIT_HI(converted);
2224 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
afc16117
KW
2225 *lenp = 2;
2226 }
2227 }
2228 return converted;
2229}
2230
84afefe6
JH
2231UV
2232Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2233{
7918f24d
NC
2234 PERL_ARGS_ASSERT_TO_UNI_LOWER;
2235
afc16117
KW
2236 if (c < 256) {
2237 return to_lower_latin1((U8) c, p, lenp);
bca00c02
KW
2238 }
2239
afc16117 2240 uvchr_to_utf8(p, c);
b9992569 2241 return CALL_LOWER_CASE(c, p, p, lenp);
a0ed51b3
LW
2242}
2243
84afefe6 2244UV
51910141 2245Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 2246{
51910141 2247 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 2248 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 2249 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
2250 *
2251 * Not to be used for locale folds
51910141 2252 */
f673fad4 2253
a1dde8de
KW
2254 UV converted;
2255
2256 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
81611534 2257 PERL_UNUSED_CONTEXT;
a1dde8de 2258
1ca267a5
KW
2259 assert (! (flags & FOLD_FLAGS_LOCALE));
2260
659a7c2d 2261 if (UNLIKELY(c == MICRO_SIGN)) {
a1dde8de
KW
2262 converted = GREEK_SMALL_LETTER_MU;
2263 }
9b63e895
KW
2264#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
2265 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
2266 || UNICODE_DOT_DOT_VERSION > 0)
659a7c2d
KW
2267 else if ( (flags & FOLD_FLAGS_FULL)
2268 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
2269 {
1ca267a5
KW
2270 /* If can't cross 127/128 boundary, can't return "ss"; instead return
2271 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
2272 * under those circumstances. */
2273 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
2274 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2275 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2276 p, *lenp, U8);
2277 return LATIN_SMALL_LETTER_LONG_S;
2278 }
2279 else {
4f489194
KW
2280 *(p)++ = 's';
2281 *p = 's';
2282 *lenp = 2;
2283 return 's';
1ca267a5 2284 }
a1dde8de 2285 }
9b63e895 2286#endif
a1dde8de
KW
2287 else { /* In this range the fold of all other characters is their lower
2288 case */
2289 converted = toLOWER_LATIN1(c);
2290 }
2291
6f2d5cbc 2292 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
2293 *p = (U8) converted;
2294 *lenp = 1;
2295 }
2296 else {
2297 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2298 *p = UTF8_TWO_BYTE_LO(converted);
2299 *lenp = 2;
2300 }
2301
2302 return converted;
2303}
2304
2305UV
31f05a37 2306Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 2307{
4b593389 2308
a0270393
KW
2309 /* Not currently externally documented, and subject to change
2310 * <flags> bits meanings:
2311 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
2312 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2313 * locale are to be used.
a0270393
KW
2314 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2315 */
4b593389 2316
36bb2ab6 2317 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 2318
780fcc9f
KW
2319 if (flags & FOLD_FLAGS_LOCALE) {
2320 /* Treat a UTF-8 locale as not being in locale at all */
2321 if (IN_UTF8_CTYPE_LOCALE) {
2322 flags &= ~FOLD_FLAGS_LOCALE;
2323 }
2324 else {
2325 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
e7b7ac46 2326 goto needs_full_generality;
780fcc9f 2327 }
31f05a37
KW
2328 }
2329
a1dde8de 2330 if (c < 256) {
e7b7ac46 2331 return _to_fold_latin1((U8) c, p, lenp,
31f05a37 2332 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
2333 }
2334
2f306ab9 2335 /* Here, above 255. If no special needs, just use the macro */
a0270393
KW
2336 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
2337 uvchr_to_utf8(p, c);
b9992569 2338 return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
a0270393
KW
2339 }
2340 else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
2341 the special flags. */
2342 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
2343
2344 needs_full_generality:
a0270393 2345 uvchr_to_utf8(utf8_c, c);
445bf929 2346 return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
a0270393 2347 }
84afefe6
JH
2348}
2349
26483009 2350PERL_STATIC_INLINE bool
5141f98e 2351S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
f25ce844 2352 const char *const swashname, SV* const invlist)
bde6a22d 2353{
ea317ccb
KW
2354 /* returns a boolean giving whether or not the UTF8-encoded character that
2355 * starts at <p> is in the swash indicated by <swashname>. <swash>
2356 * contains a pointer to where the swash indicated by <swashname>
2357 * is to be stored; which this routine will do, so that future calls will
f25ce844
KW
2358 * look at <*swash> and only generate a swash if it is not null. <invlist>
2359 * is NULL or an inversion list that defines the swash. If not null, it
2360 * saves time during initialization of the swash.
ea317ccb
KW
2361 *
2362 * Note that it is assumed that the buffer length of <p> is enough to
2363 * contain all the bytes that comprise the character. Thus, <*p> should
2364 * have been checked before this call for mal-formedness enough to assure
2365 * that. */
2366
7918f24d
NC
2367 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2368
492a624f 2369 /* The API should have included a length for the UTF-8 character in <p>,
28123549 2370 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
2371 * as far as there being enough bytes available in it to accommodate the
2372 * character without reading beyond the end, and pass that number on to the
2373 * validating routine */
6302f837 2374 if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
28123549
KW
2375 if (ckWARN_d(WARN_UTF8)) {
2376 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
9816f121 2377 "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
28123549
KW
2378 if (ckWARN(WARN_UTF8)) { /* This will output details as to the
2379 what the malformation is */
2380 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
2381 }
2382 }
2383 return FALSE;
2384 }
87367d5f
KW
2385 if (!*swash) {
2386 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
f25ce844
KW
2387 *swash = _core_swash_init("utf8",
2388
2389 /* Only use the name if there is no inversion
2390 * list; otherwise will go out to disk */
2391 (invlist) ? "" : swashname,
2392
2393 &PL_sv_undef, 1, 0, invlist, &flags);
87367d5f 2394 }
28123549 2395
bde6a22d
NC
2396 return swash_fetch(*swash, p, TRUE) != 0;
2397}
2398
2399bool
922e8cb4
KW
2400Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
2401{
922e8cb4
KW
2402 PERL_ARGS_ASSERT__IS_UTF8_FOO;
2403
2404 assert(classnum < _FIRST_NON_SWASH_CC);
2405
f25ce844
KW
2406 return is_utf8_common(p,
2407 &PL_utf8_swash_ptrs[classnum],
2408 swash_property_names[classnum],
2409 PL_XPosix_ptrs[classnum]);
922e8cb4
KW
2410}
2411
2412bool
f2645549 2413Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
a0ed51b3 2414{
f2645549 2415 SV* invlist = NULL;
7918f24d 2416
f2645549 2417 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
7918f24d 2418
f2645549
KW
2419 if (! PL_utf8_perl_idstart) {
2420 invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
2421 }
60071a22 2422 return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
82686b01
JH
2423}
2424
2425bool
f2645549 2426Perl__is_utf8_xidstart(pTHX_ const U8 *p)
c11ff943 2427{
f2645549 2428 PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
c11ff943
KW
2429
2430 if (*p == '_')
2431 return TRUE;
f25ce844 2432 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
c11ff943
KW
2433}
2434
2435bool
eba68aa0
KW
2436Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
2437{
b24b43f7 2438 SV* invlist = NULL;
eba68aa0
KW
2439
2440 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
2441
b24b43f7
KW
2442 if (! PL_utf8_perl_idcont) {
2443 invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
2444 }
60071a22 2445 return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
eba68aa0
KW
2446}
2447
eba68aa0 2448bool
f2645549 2449Perl__is_utf8_idcont(pTHX_ const U8 *p)
82686b01 2450{
f2645549 2451 PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
7918f24d 2452
f25ce844 2453 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
a0ed51b3
LW
2454}
2455
2456bool
f2645549 2457Perl__is_utf8_xidcont(pTHX_ const U8 *p)
c11ff943 2458{
f2645549 2459 PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
c11ff943 2460
f25ce844 2461 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
c11ff943
KW
2462}
2463
2464bool
7dbf68d2
KW
2465Perl__is_utf8_mark(pTHX_ const U8 *p)
2466{
7dbf68d2
KW
2467 PERL_ARGS_ASSERT__IS_UTF8_MARK;
2468
f25ce844 2469 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
7dbf68d2
KW
2470}
2471
6b5c0936 2472/*
87cea99e 2473=for apidoc to_utf8_case
6b5c0936 2474
9da1e7cb
KW
2475Instead use the appropriate one of L</toUPPER_utf8>,
2476L</toTITLE_utf8>,
2477L</toLOWER_utf8>,
2478or L</toFOLD_utf8>.
2479
6fae5207 2480C<p> contains the pointer to the UTF-8 string encoding
a1433954
KW
2481the character that is being converted. This routine assumes that the character
2482at C<p> is well-formed.
6b5c0936 2483
6fae5207
KW
2484C<ustrp> is a pointer to the character buffer to put the
2485conversion result to. C<lenp> is a pointer to the length
6b5c0936
JH
2486of the result.
2487
6fae5207 2488C<swashp> is a pointer to the swash to use.
6b5c0936 2489
a1433954 2490Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
796b6530 2491and loaded by C<SWASHNEW>, using F<lib/utf8_heavy.pl>. C<special> (usually,
0134edef 2492but not always, a multicharacter mapping), is tried first.
6b5c0936 2493
4a8240a3
KW
2494C<special> is a string, normally C<NULL> or C<"">. C<NULL> means to not use
2495any special mappings; C<""> means to use the special mappings. Values other
2496than these two are treated as the name of the hash containing the special
2497mappings, like C<"utf8::ToSpecLower">.
6b5c0936 2498
796b6530
KW
2499C<normal> is a string like C<"ToLower"> which means the swash
2500C<%utf8::ToLower>.
0134edef 2501
760c7c2f
KW
2502Code points above the platform's C<IV_MAX> will raise a deprecation warning,
2503unless those are turned off.
2504
0134edef 2505=cut */
6b5c0936 2506
2104c8d9 2507UV
9a957fbc
AL
2508Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
2509 SV **swashp, const char *normal, const char *special)
a0ed51b3 2510{
b9992569
KW
2511 PERL_ARGS_ASSERT_TO_UTF8_CASE;
2512
2513 return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special);
2514}
2515
2516 /* change namve uv1 to 'from' */
6a4a25f4 2517STATIC UV
b9992569
KW
2518S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
2519 SV **swashp, const char *normal, const char *special)
2520{
0134edef 2521 STRLEN len = 0;
7918f24d 2522
b9992569 2523 PERL_ARGS_ASSERT__TO_UTF8_CASE;
7918f24d 2524
36eaa811
KW
2525 /* For code points that don't change case, we already know that the output
2526 * of this function is the unchanged input, so we can skip doing look-ups
2527 * for them. Unfortunately the case-changing code points are scattered
2528 * around. But there are some long consecutive ranges where there are no
2529 * case changing code points. By adding tests, we can eliminate the lookup
2530 * for all the ones in such ranges. This is currently done here only for
2531 * just a few cases where the scripts are in common use in modern commerce
2532 * (and scripts adjacent to those which can be included without additional
2533 * tests). */
2534
2535 if (uv1 >= 0x0590) {
2536 /* This keeps from needing further processing the code points most
2537 * likely to be used in the following non-cased scripts: Hebrew,
2538 * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
2539 * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
2540 * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
2541 if (uv1 < 0x10A0) {
2542 goto cases_to_self;
2543 }
2544
2545 /* The following largish code point ranges also don't have case
2546 * changes, but khw didn't think they warranted extra tests to speed
2547 * them up (which would slightly slow down everything else above them):
2548 * 1100..139F Hangul Jamo, Ethiopic
2549 * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
2550 * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
2551 * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
2552 * Combining Diacritical Marks Extended, Balinese,
2553 * Sundanese, Batak, Lepcha, Ol Chiki
2554 * 2000..206F General Punctuation
2555 */
2556
2557 if (uv1 >= 0x2D30) {
2558
2559 /* This keeps the from needing further processing the code points
2560 * most likely to be used in the following non-cased major scripts:
2561 * CJK, Katakana, Hiragana, plus some less-likely scripts.
2562 *
2563 * (0x2D30 above might have to be changed to 2F00 in the unlikely
2564 * event that Unicode eventually allocates the unused block as of
2565 * v8.0 2FE0..2FEF to code points that are cased. khw has verified
2566 * that the test suite will start having failures to alert you
2567 * should that happen) */
2568 if (uv1 < 0xA640) {
2569 goto cases_to_self;
2570 }
2571
2572 if (uv1 >= 0xAC00) {
2573 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
5af9bc97
KW
2574 if (ckWARN_d(WARN_SURROGATE)) {
2575 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2576 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
147e3846 2577 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04" UVXf, desc, uv1);
5af9bc97
KW
2578 }
2579 goto cases_to_self;
2580 }
36eaa811
KW
2581
2582 /* AC00..FAFF Catches Hangul syllables and private use, plus
2583 * some others */
2584 if (uv1 < 0xFB00) {
2585 goto cases_to_self;
2586
2587 }
2588
5af9bc97
KW
2589 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
2590 if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
2591 && ckWARN_d(WARN_DEPRECATED))
2592 {
2593 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2594 cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
2595 }
2596 if (ckWARN_d(WARN_NON_UNICODE)) {
2597 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2598 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
147e3846 2599 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04" UVXf, desc, uv1);
5af9bc97
KW
2600 }
2601 goto cases_to_self;
2602 }
3bfc1e70
KW
2603#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
2604 if (UNLIKELY(uv1
2605 > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
2606 {
2607
2608 /* As of this writing, this means we avoid swash creation
2609 * for anything beyond low Plane 1 */
2610 goto cases_to_self;
2611 }
2612#endif
36eaa811
KW
2613 }
2614 }
9ae3ac1a 2615
36eaa811
KW
2616 /* Note that non-characters are perfectly legal, so no warning should
2617 * be given. There are so few of them, that it isn't worth the extra
2618 * tests to avoid swash creation */
9ae3ac1a
KW
2619 }
2620
0134edef 2621 if (!*swashp) /* load on-demand */
5ab9d2ef 2622 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
0134edef 2623
a6f87d8c 2624 if (special) {
0134edef 2625 /* It might be "special" (sometimes, but not always,
2a37f04d 2626 * a multicharacter mapping) */
4a8240a3 2627 HV *hv = NULL;
b08cf34e
JH
2628 SV **svp;
2629
4a8240a3
KW
2630 /* If passed in the specials name, use that; otherwise use any
2631 * given in the swash */
2632 if (*special != '\0') {
2633 hv = get_hv(special, 0);
2634 }
2635 else {
2636 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
2637 if (svp) {
2638 hv = MUTABLE_HV(SvRV(*svp));
2639 }
2640 }
2641
176fe009 2642 if (hv
5f560d8a 2643 && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
176fe009
KW
2644 && (*svp))
2645 {
cfd0369c 2646 const char *s;
47654450 2647
cfd0369c 2648 s = SvPV_const(*svp, len);
47654450 2649 if (len == 1)
f4cd282c 2650 /* EIGHTBIT */
c80e42f3 2651 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2a37f04d 2652 else {
d2dcd0fb 2653 Copy(s, ustrp, len, U8);
29e98929 2654 }
983ffd37 2655 }
0134edef
JH
2656 }
2657
2658 if (!len && *swashp) {
4a4088c4 2659 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
d4c19fe8 2660
0134edef
JH
2661 if (uv2) {
2662 /* It was "normal" (a single character mapping). */
f4cd282c 2663 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2a37f04d
JH
2664 }
2665 }
1feea2c7 2666
cbe07460
KW
2667 if (len) {
2668 if (lenp) {
2669 *lenp = len;
2670 }
2671 return valid_utf8_to_uvchr(ustrp, 0);
2672 }
2673
2674 /* Here, there was no mapping defined, which means that the code point maps
2675 * to itself. Return the inputs */
e24dfe9c 2676 cases_to_self:
bfdf22ec 2677 len = UTF8SKIP(p);
ca9fab46
KW
2678 if (p != ustrp) { /* Don't copy onto itself */
2679 Copy(p, ustrp, len, U8);
2680 }
0134edef 2681
2a37f04d
JH
2682 if (lenp)
2683 *lenp = len;
2684
f4cd282c 2685 return uv1;
cbe07460 2686
a0ed51b3
LW
2687}
2688
051a06d4 2689STATIC UV
357aadde 2690S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
051a06d4 2691{
4a4088c4 2692 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
2693 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
2694 * result contains a character that crosses the 255/256 boundary, disallow
2695 * the change, and return the original code point. See L<perlfunc/lc> for
2696 * why;
051a06d4 2697 *
a1433954
KW
2698 * p points to the original string whose case was changed; assumed
2699 * by this routine to be well-formed
051a06d4
KW
2700 * result the code point of the first character in the changed-case string
2701 * ustrp points to the changed-case string (<result> represents its first char)
2702 * lenp points to the length of <ustrp> */
2703
2704 UV original; /* To store the first code point of <p> */
2705
2706 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2707
a4f12ed7 2708 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
2709
2710 /* We know immediately if the first character in the string crosses the
2711 * boundary, so can skip */
2712 if (result > 255) {
2713
2714 /* Look at every character in the result; if any cross the
2715 * boundary, the whole thing is disallowed */
2716 U8* s = ustrp + UTF8SKIP(ustrp);
2717 U8* e = ustrp + *lenp;
2718 while (s < e) {
a4f12ed7 2719 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
2720 goto bad_crossing;
2721 }
2722 s += UTF8SKIP(s);
2723 }
2724
613abc6d
KW
2725 /* Here, no characters crossed, result is ok as-is, but we warn. */
2726 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
051a06d4
KW
2727 return result;
2728 }
2729
7b52d656 2730 bad_crossing:
051a06d4
KW
2731
2732 /* Failed, have to return the original */
4b88fb76 2733 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
2734
2735 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2736 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
147e3846
KW
2737 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8 locale; "
2738 "resolved to \"\\x{%" UVXf "}\".",
357aadde 2739 OP_DESC(PL_op),
ab0b796c
KW
2740 original,
2741 original);
051a06d4
KW
2742 Copy(p, ustrp, *lenp, char);
2743 return original;
2744}
2745
d3e79532 2746/*
87cea99e 2747=for apidoc to_utf8_upper
d3e79532 2748
1f607577 2749Instead use L</toUPPER_utf8>.
a1433954 2750
d3e79532
JH
2751=cut */
2752
051a06d4 2753/* Not currently externally documented, and subject to change:
31f05a37
KW
2754 * <flags> is set iff iff the rules from the current underlying locale are to
2755 * be used. */
051a06d4 2756
2104c8d9 2757UV
31f05a37 2758Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2759{
051a06d4
KW
2760 UV result;
2761
2762 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 2763
780fcc9f
KW
2764 if (flags) {
2765 /* Treat a UTF-8 locale as not being in locale at all */
2766 if (IN_UTF8_CTYPE_LOCALE) {
2767 flags = FALSE;
2768 }
2769 else {
2770 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2771 }
31f05a37
KW
2772 }
2773
3a4c58c9 2774 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2775 if (flags) {
2776 result = toUPPER_LC(*p);
2777 }
2778 else {
81c6c7ce 2779 return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
051a06d4 2780 }
3a4c58c9
KW
2781 }
2782 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2783 if (flags) {
a62b247b 2784 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2785 result = toUPPER_LC(c);
051a06d4
KW
2786 }
2787 else {
a62b247b 2788 return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2789 ustrp, lenp, 'S');
051a06d4
KW
2790 }
2791 }
4a4088c4 2792 else { /* UTF-8, ord above 255 */
b9992569 2793 result = CALL_UPPER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2794
2795 if (flags) {
357aadde 2796 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4
KW
2797 }
2798 return result;
2799 }
2800
4a4088c4 2801 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2802 if (UTF8_IS_INVARIANT(result)) {
2803 *ustrp = (U8) result;
2804 *lenp = 1;
2805 }
2806 else {
62cb07ea
KW
2807 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2808 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2809 *lenp = 2;
3a4c58c9 2810 }
baa60164 2811
051a06d4 2812 return result;
983ffd37 2813}
a0ed51b3 2814
d3e79532 2815/*
87cea99e 2816=for apidoc to_utf8_title
d3e79532 2817
1f607577 2818Instead use L</toTITLE_utf8>.
a1433954 2819
d3e79532
JH
2820=cut */
2821
051a06d4 2822/* Not currently externally documented, and subject to change:
31f05a37
KW
2823 * <flags> is set iff the rules from the current underlying locale are to be
2824 * used. Since titlecase is not defined in POSIX, for other than a
2825 * UTF-8 locale, uppercase is used instead for code points < 256.
445bf929 2826 */
051a06d4 2827
983ffd37 2828UV
31f05a37 2829Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
983ffd37 2830{
051a06d4
KW
2831 UV result;
2832
2833 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 2834
780fcc9f
KW
2835 if (flags) {
2836 /* Treat a UTF-8 locale as not being in locale at all */
2837 if (IN_UTF8_CTYPE_LOCALE) {
2838 flags = FALSE;
2839 }
2840 else {
2841 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2842 }
31f05a37
KW
2843 }
2844
3a4c58c9 2845 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2846 if (flags) {
2847 result = toUPPER_LC(*p);
2848 }
2849 else {
81c6c7ce 2850 return _to_upper_title_latin1(*p, ustrp, lenp, 's');
051a06d4 2851 }
3a4c58c9
KW
2852 }
2853 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2854 if (flags) {
a62b247b 2855 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2856 result = toUPPER_LC(c);
051a06d4
KW
2857 }
2858 else {
a62b247b 2859 return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2860 ustrp, lenp, 's');
051a06d4
KW
2861 }
2862 }
4a4088c4 2863 else { /* UTF-8, ord above 255 */
b9992569 2864 result = CALL_TITLE_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2865
2866 if (flags) {
357aadde 2867 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4
KW
2868 }
2869 return result;
2870 }
2871
4a4088c4 2872 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2873 if (UTF8_IS_INVARIANT(result)) {
2874 *ustrp = (U8) result;
2875 *lenp = 1;
2876 }
2877 else {
62cb07ea
KW
2878 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2879 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2880 *lenp = 2;
3a4c58c9
KW
2881 }
2882
051a06d4 2883 return result;
a0ed51b3
LW
2884}
2885
d3e79532 2886/*
87cea99e 2887=for apidoc to_utf8_lower
d3e79532 2888
1f607577 2889Instead use L</toLOWER_utf8>.
a1433954 2890
d3e79532
JH
2891=cut */
2892
051a06d4 2893/* Not currently externally documented, and subject to change:
31f05a37
KW
2894 * <flags> is set iff iff the rules from the current underlying locale are to
2895 * be used.
2896 */
051a06d4 2897
2104c8d9 2898UV
31f05a37 2899Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2900{
051a06d4
KW
2901 UV result;
2902
051a06d4 2903 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 2904
780fcc9f
KW
2905 if (flags) {
2906 /* Treat a UTF-8 locale as not being in locale at all */
2907 if (IN_UTF8_CTYPE_LOCALE) {
2908 flags = FALSE;
2909 }
2910 else {
2911 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2912 }
31f05a37
KW
2913 }
2914
968c5e6a 2915 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2916 if (flags) {
2917 result = toLOWER_LC(*p);
2918 }
2919 else {
81c6c7ce 2920 return to_lower_latin1(*p, ustrp, lenp);
051a06d4 2921 }
968c5e6a
KW
2922 }
2923 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2924 if (flags) {
a62b247b 2925 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2926 result = toLOWER_LC(c);
051a06d4
KW
2927 }
2928 else {
a62b247b 2929 return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2930 ustrp, lenp);
051a06d4 2931 }
968c5e6a 2932 }
4a4088c4 2933 else { /* UTF-8, ord above 255 */
b9992569 2934 result = CALL_LOWER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2935
2936 if (flags) {
357aadde 2937 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 2938 }
968c5e6a 2939
051a06d4
KW
2940 return result;
2941 }
2942
4a4088c4 2943 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2944 if (UTF8_IS_INVARIANT(result)) {
2945 *ustrp = (U8) result;
2946 *lenp = 1;
2947 }
2948 else {
62cb07ea
KW
2949 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2950 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
2951 *lenp = 2;
2952 }
2953
051a06d4 2954 return result;
b4e400f9
JH
2955}
2956
d3e79532 2957/*
87cea99e 2958=for apidoc to_utf8_fold
d3e79532 2959
1f607577 2960Instead use L</toFOLD_utf8>.
a1433954 2961
d3e79532
JH
2962=cut */
2963
051a06d4
KW
2964/* Not currently externally documented, and subject to change,
2965 * in <flags>
31f05a37
KW
2966 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2967 * locale are to be used.
051a06d4
KW
2968 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
2969 * otherwise simple folds
a0270393
KW
2970 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2971 * prohibited
445bf929 2972 */
36bb2ab6 2973
b4e400f9 2974UV
445bf929 2975Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
b4e400f9 2976{
051a06d4
KW
2977 UV result;
2978
36bb2ab6 2979 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 2980
a0270393
KW
2981 /* These are mutually exclusive */
2982 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2983
50ba90ff
KW
2984 assert(p != ustrp); /* Otherwise overwrites */
2985
780fcc9f
KW
2986 if (flags & FOLD_FLAGS_LOCALE) {
2987 /* Treat a UTF-8 locale as not being in locale at all */
2988 if (IN_UTF8_CTYPE_LOCALE) {
2989 flags &= ~FOLD_FLAGS_LOCALE;
2990 }
2991 else {
2992 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2993 }
31f05a37
KW
2994 }
2995
a1dde8de 2996 if (UTF8_IS_INVARIANT(*p)) {
051a06d4 2997 if (flags & FOLD_FLAGS_LOCALE) {
d22b930b 2998 result = toFOLD_LC(*p);
051a06d4
KW
2999 }
3000 else {
81c6c7ce 3001 return _to_fold_latin1(*p, ustrp, lenp,
1ca267a5 3002 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 3003 }
a1dde8de
KW
3004 }
3005 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 3006 if (flags & FOLD_FLAGS_LOCALE) {
a62b247b 3007 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 3008 result = toFOLD_LC(c);
051a06d4
KW
3009 }
3010 else {
a62b247b 3011 return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
51910141 3012 ustrp, lenp,
1ca267a5 3013 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 3014 }
a1dde8de 3015 }
4a4088c4 3016 else { /* UTF-8, ord above 255 */
b9992569 3017 result = CALL_FOLD_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
a1dde8de 3018
1ca267a5
KW
3019 if (flags & FOLD_FLAGS_LOCALE) {
3020
76f2ffcd 3021# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
0766489e
KW
3022 const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
3023
3024# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3025# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
76f2ffcd
KW
3026
3027 const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
76f2ffcd 3028
538e84ed
KW
3029 /* Special case these two characters, as what normally gets
3030 * returned under locale doesn't work */
76f2ffcd
KW
3031 if (UTF8SKIP(p) == cap_sharp_s_len
3032 && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
1ca267a5 3033 {
ab0b796c
KW
3034 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3035 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3036 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3037 "resolved to \"\\x{17F}\\x{17F}\".");
1ca267a5
KW
3038 goto return_long_s;
3039 }
0766489e
KW
3040 else
3041#endif
3042 if (UTF8SKIP(p) == long_s_t_len
76f2ffcd 3043 && memEQ((char *) p, LONG_S_T, long_s_t_len))
9fc2026f 3044 {
ab0b796c
KW
3045 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3046 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3047 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3048 "resolved to \"\\x{FB06}\".");
9fc2026f
KW
3049 goto return_ligature_st;
3050 }
74894415
KW
3051
3052#if UNICODE_MAJOR_VERSION == 3 \
3053 && UNICODE_DOT_VERSION == 0 \
3054 && UNICODE_DOT_DOT_VERSION == 1
3055# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3056
3057 /* And special case this on this Unicode version only, for the same
3058 * reaons the other two are special cased. They would cross the
3059 * 255/256 boundary which is forbidden under /l, and so the code
3060 * wouldn't catch that they are equivalent (which they are only in
3061 * this release) */
3062 else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
3063 && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
3064 {
3065 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3066 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3067 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3068 "resolved to \"\\x{0131}\".");
3069 goto return_dotless_i;
3070 }
3071#endif
3072
357aadde 3073 return check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 3074 }
a0270393
KW
3075 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3076 return result;
3077 }
3078 else {
4a4088c4 3079 /* This is called when changing the case of a UTF-8-encoded
9fc2026f
KW
3080 * character above the ASCII range, and the result should not
3081 * contain an ASCII character. */
a0270393
KW
3082
3083 UV original; /* To store the first code point of <p> */
3084
3085 /* Look at every character in the result; if any cross the
3086 * boundary, the whole thing is disallowed */
3087 U8* s = ustrp;
3088 U8* e = ustrp + *lenp;
3089 while (s < e) {
3090 if (isASCII(*s)) {
3091 /* Crossed, have to return the original */
3092 original = valid_utf8_to_uvchr(p, lenp);
1ca267a5 3093
9fc2026f 3094 /* But in these instances, there is an alternative we can
1ca267a5 3095 * return that is valid */
0766489e
KW
3096 if (original == LATIN_SMALL_LETTER_SHARP_S
3097#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3098 || original == LATIN_CAPITAL_LETTER_SHARP_S
3099#endif
3100 ) {
1ca267a5
KW
3101 goto return_long_s;
3102 }
9fc2026f
KW
3103 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3104 goto return_ligature_st;
3105 }
74894415
KW
3106#if UNICODE_MAJOR_VERSION == 3 \
3107 && UNICODE_DOT_VERSION == 0 \
3108 && UNICODE_DOT_DOT_VERSION == 1
3109
3110 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3111 goto return_dotless_i;
3112 }
3113#endif
a0270393
KW
3114 Copy(p, ustrp, *lenp, char);
3115 return original;
3116 }
3117 s += UTF8SKIP(s);
3118 }
051a06d4 3119
a0270393
KW
3120 /* Here, no characters crossed, result is ok as-is */
3121 return result;
3122 }
051a06d4
KW
3123 }
3124
4a4088c4 3125 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
3126 if (UTF8_IS_INVARIANT(result)) {
3127 *ustrp = (U8) result;
3128 *lenp = 1;
3129 }
3130 else {
62cb07ea
KW
3131 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3132 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
3133 *lenp = 2;
3134 }
3135
051a06d4 3136 return result;
1ca267a5
KW
3137
3138 return_long_s:
3139 /* Certain folds to 'ss' are prohibited by the options, but they do allow
3140 * folds to a string of two of these characters. By returning this
3141 * instead, then, e.g.,
3142 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3143 * works. */
3144
3145 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
3146 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3147 ustrp, *lenp, U8);
3148 return LATIN_SMALL_LETTER_LONG_S;
9fc2026f
KW
3149
3150 return_ligature_st:
3151 /* Two folds to 'st' are prohibited by the options; instead we pick one and
3152 * have the other one fold to it */
3153
3154 *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
3155 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
3156 return LATIN_SMALL_LIGATURE_ST;
74894415
KW
3157
3158#if UNICODE_MAJOR_VERSION == 3 \
3159 && UNICODE_DOT_VERSION == 0 \
3160 && UNICODE_DOT_DOT_VERSION == 1
3161
3162 return_dotless_i:
3163 *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
3164 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
3165 return LATIN_SMALL_LETTER_DOTLESS_I;
3166
3167#endif
3168
a0ed51b3
LW
3169}
3170
711a919c 3171/* Note:
f90a9a02 3172 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
711a919c
ST
3173 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
3174 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
3175 */
c4a5db0c 3176
a0ed51b3 3177SV*
7fc63493 3178Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 3179{
c4a5db0c
KW
3180 PERL_ARGS_ASSERT_SWASH_INIT;
3181
3182 /* Returns a copy of a swash initiated by the called function. This is the
3183 * public interface, and returning a copy prevents others from doing
3184 * mischief on the original */
3185
5d3d13d1 3186 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
c4a5db0c
KW
3187}
3188
3189SV*
5d3d13d1 3190Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
c4a5db0c 3191{
2c1f00b9
YO
3192
3193 /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
3194 * use the following define */
3195
3196#define CORE_SWASH_INIT_RETURN(x) \
3197 PL_curpm= old_PL_curpm; \
3198 return x
3199
c4a5db0c 3200 /* Initialize and return a swash, creating it if necessary. It does this
87367d5f
KW
3201 * by calling utf8_heavy.pl in the general case. The returned value may be
3202 * the swash's inversion list instead if the input parameters allow it.
3203 * Which is returned should be immaterial to callers, as the only
923b6d4e
KW
3204 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
3205 * and swash_to_invlist() handle both these transparently.
c4a5db0c
KW
3206 *
3207 * This interface should only be used by functions that won't destroy or
3208 * adversely change the swash, as doing so affects all other uses of the
3209 * swash in the program; the general public should use 'Perl_swash_init'
3210 * instead.
3211 *
3212 * pkg is the name of the package that <name> should be in.
3213 * name is the name of the swash to find. Typically it is a Unicode
3214 * property name, including user-defined ones
3215 * listsv is a string to initialize the swash with. It must be of the form
3216 * documented as the subroutine return value in
3217 * L<perlunicode/User-Defined Character Properties>
3218 * minbits is the number of bits required to represent each data element.
3219 * It is '1' for binary properties.
3220 * none I (khw) do not understand this one, but it is used only in tr///.
9a53f6cf 3221 * invlist is an inversion list to initialize the swash with (or NULL)
83199d38
KW
3222 * flags_p if non-NULL is the address of various input and output flag bits
3223 * to the routine, as follows: ('I' means is input to the routine;
3224 * 'O' means output from the routine. Only flags marked O are
3225 * meaningful on return.)
3226 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
3227 * came from a user-defined property. (I O)
5d3d13d1
KW
3228 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
3229 * when the swash cannot be located, to simply return NULL. (I)
87367d5f
KW
3230 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
3231 * return of an inversion list instead of a swash hash if this routine
3232 * thinks that would result in faster execution of swash_fetch() later
3233 * on. (I)
9a53f6cf
KW
3234 *
3235 * Thus there are three possible inputs to find the swash: <name>,
3236 * <listsv>, and <invlist>. At least one must be specified. The result
3237 * will be the union of the specified ones, although <listsv>'s various
aabbdbda
KW
3238 * actions can intersect, etc. what <name> gives. To avoid going out to
3239 * disk at all, <invlist> should specify completely what the swash should
3240 * have, and <listsv> should be &PL_sv_undef and <name> should be "".
9a53f6cf
KW
3241 *
3242 * <invlist> is only valid for binary properties */
c4a5db0c 3243
2c1f00b9
YO
3244 PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
3245
c4a5db0c 3246 SV* retval = &PL_sv_undef;
83199d38 3247 HV* swash_hv = NULL;
87367d5f
KW
3248 const int invlist_swash_boundary =
3249 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
3250 ? 512 /* Based on some benchmarking, but not extensive, see commit
3251 message */
3252 : -1; /* Never return just an inversion list */
9a53f6cf
KW
3253
3254 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
3255 assert(! invlist || minbits == 1);
3256
2c1f00b9
YO
3257 PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
3258 that triggered the swash init and the swash init perl logic itself.
3259 See perl #122747 */
3260
9a53f6cf
KW
3261 /* If data was passed in to go out to utf8_heavy to find the swash of, do
3262 * so */
3263 if (listsv != &PL_sv_undef || strNE(name, "")) {
69794297
KW
3264 dSP;
3265 const size_t pkg_len = strlen(pkg);
3266 const size_t name_len = strlen(name);
3267 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
3268 SV* errsv_save;
3269 GV *method;
3270
3271 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
3272
3273 PUSHSTACKi(PERLSI_MAGIC);
ce3b816e 3274 ENTER;
69794297 3275 SAVEHINTS();
2782061f 3276 save_re_context();
650f067c
JL
3277 /* We might get here via a subroutine signature which uses a utf8
3278 * parameter name, at which point PL_subname will have been set
3279 * but not yet used. */
3280 save_item(PL_subname);
69794297
KW
3281 if (PL_parser && PL_parser->error_count)
3282 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
3283 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
4a4088c4 3284 if (!method) { /* demand load UTF-8 */
69794297 3285 ENTER;
db2c6cb3
FC
3286 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3287 GvSV(PL_errgv) = NULL;
1a419e6b 3288#ifndef NO_TAINT_SUPPORT
69794297
KW
3289 /* It is assumed that callers of this routine are not passing in
3290 * any user derived data. */
2782061f
DM
3291 /* Need to do this after save_re_context() as it will set
3292 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
3293 * in Perl_magic_get). Even line to create errsv_save can turn on
3294 * PL_tainted. */
284167a5
SM
3295 SAVEBOOL(TAINT_get);
3296 TAINT_NOT;
3297#endif
69794297
KW
3298 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
3299 NULL);
eed484f9 3300 {
db2c6cb3
FC
3301 /* Not ERRSV, as there is no need to vivify a scalar we are
3302 about to discard. */
3303 SV * const errsv = GvSV(PL_errgv);
3304 if (!SvTRUE(errsv)) {
3305 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3306 SvREFCNT_dec(errsv);
3307 }
eed484f9 3308 }
69794297
KW
3309 LEAVE;
3310 }
3311 SPAGAIN;
3312 PUSHMARK(SP);
3313 EXTEND(SP,5);
3314 mPUSHp(pkg, pkg_len);
3315 mPUSHp(name, name_len);
3316 PUSHs(listsv);
3317 mPUSHi(minbits);
3318 mPUSHi(none);
3319 PUTBACK;
db2c6cb3
FC
3320 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3321 GvSV(PL_errgv) = NULL;
69794297
KW
3322 /* If we already have a pointer to the method, no need to use
3323 * call_method() to repeat the lookup. */
c41800a8
KW
3324 if (method
3325 ? call_sv(MUTABLE_SV(method), G_SCALAR)
69794297
KW
3326 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
3327 {
3328 retval = *PL_stack_sp--;
3329 SvREFCNT_inc(retval);
3330 }
eed484f9 3331 {
db2c6cb3
FC
3332 /* Not ERRSV. See above. */
3333 SV * const errsv = GvSV(PL_errgv);
3334 if (!SvTRUE(errsv)) {
3335 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3336 SvREFCNT_dec(errsv);
3337 }
eed484f9 3338 }
ce3b816e 3339 LEAVE;
69794297
KW
3340 POPSTACK;
3341 if (IN_PERL_COMPILETIME) {
3342 CopHINTS_set(PL_curcop, PL_hints);
3343 }
3344 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
d95e4a00 3345 if (SvPOK(retval)) {
69794297
KW
3346
3347 /* If caller wants to handle missing properties, let them */
5d3d13d1 3348 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
2c1f00b9 3349 CORE_SWASH_INIT_RETURN(NULL);
69794297
KW
3350 }
3351 Perl_croak(aTHX_
147e3846 3352 "Can't find Unicode property definition \"%" SVf "\"",
69794297 3353 SVfARG(retval));
a25b5927 3354 NOT_REACHED; /* NOTREACHED */
d95e4a00 3355 }
69794297 3356 }
9a53f6cf 3357 } /* End of calling the module to find the swash */
36eb48b4 3358
83199d38
KW
3359 /* If this operation fetched a swash, and we will need it later, get it */
3360 if (retval != &PL_sv_undef
3361 && (minbits == 1 || (flags_p
3362 && ! (*flags_p
3363 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
3364 {
3365 swash_hv = MUTABLE_HV(SvRV(retval));
3366
3367 /* If we don't already know that there is a user-defined component to
3368 * this swash, and the user has indicated they wish to know if there is
3369 * one (by passing <flags_p>), find out */
3370 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
3371 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
3372 if (user_defined && SvUV(*user_defined)) {
3373 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
3374 }
3375 }
3376 }
3377
36eb48b4
KW
3378 /* Make sure there is an inversion list for binary properties */
3379 if (minbits == 1) {
3380 SV** swash_invlistsvp = NULL;
3381 SV* swash_invlist = NULL;
9a53f6cf 3382 bool invlist_in_swash_is_valid = FALSE;
02c85471
FC
3383 bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
3384 an unclaimed reference count */
36eb48b4 3385
9a53f6cf 3386 /* If this operation fetched a swash, get its already existing
83199d38 3387 * inversion list, or create one for it */
36eb48b4 3388
83199d38 3389 if (swash_hv) {
5c9f4bd2 3390 swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
9a53f6cf
KW
3391 if (swash_invlistsvp) {
3392 swash_invlist = *swash_invlistsvp;
3393 invlist_in_swash_is_valid = TRUE;
3394 }
3395 else {
36eb48b4 3396 swash_invlist = _swash_to_invlist(retval);
02c85471 3397 swash_invlist_unclaimed = TRUE;
9a53f6cf
KW
3398 }
3399 }
3400
3401 /* If an inversion list was passed in, have to include it */
3402 if (invlist) {
3403
3404 /* Any fetched swash will by now have an inversion list in it;
3405 * otherwise <swash_invlist> will be NULL, indicating that we
3406 * didn't fetch a swash */
3407 if (swash_invlist) {
3408
3409 /* Add the passed-in inversion list, which invalidates the one
3410 * already stored in the swash */
3411 invlist_in_swash_is_valid = FALSE;
eee4c920 3412 SvREADONLY_off(swash_invlist); /* Turned on again below */
9a53f6cf
KW
3413 _invlist_union(invlist, swash_invlist, &swash_invlist);
3414 }
3415 else {
3416
87367d5f
KW
3417 /* Here, there is no swash already. Set up a minimal one, if
3418 * we are going to return a swash */
3419 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
971d486f 3420 swash_hv = newHV();
4aca0fe6 3421 retval = newRV_noinc(MUTABLE_SV(swash_hv));
87367d5f 3422 }
9a53f6cf
KW
3423 swash_invlist = invlist;
3424 }
9a53f6cf
KW
3425 }
3426
3427 /* Here, we have computed the union of all the passed-in data. It may
3428 * be that there was an inversion list in the swash which didn't get
538e84ed 3429 * touched; otherwise save the computed one */
87367d5f
KW
3430 if (! invlist_in_swash_is_valid
3431 && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
3432 {
5c9f4bd2 3433 if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
69794297
KW
3434 {
3435 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3436 }
cc34d8c5
FC
3437 /* We just stole a reference count. */
3438 if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
3439 else SvREFCNT_inc_simple_void_NN(swash_invlist);
9a53f6cf 3440 }
87367d5f 3441
ee3222e3 3442 /* The result is immutable. Forbid attempts to change it. */
dbfdbd26
KW
3443 SvREADONLY_on(swash_invlist);
3444
c41800a8 3445 /* Use the inversion list stand-alone if small enough */
87367d5f
KW
3446 if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
3447 SvREFCNT_dec(retval);
02c85471
FC
3448 if (!swash_invlist_unclaimed)
3449 SvREFCNT_inc_simple_void_NN(swash_invlist);
3450 retval = newRV_noinc(swash_invlist);
87367d5f 3451 }
36eb48b4
KW
3452 }
3453
2c1f00b9
YO
3454 CORE_SWASH_INIT_RETURN(retval);
3455#undef CORE_SWASH_INIT_RETURN
a0ed51b3
LW
3456}
3457
035d37be
JH
3458
3459/* This API is wrong for special case conversions since we may need to
3460 * return several Unicode characters for a single Unicode character
3461 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
3462 * the lower-level routine, and it is similarly broken for returning
38684baa 3463 * multiple values. --jhi
b9992569 3464 * For those, you should use S__to_utf8_case() instead */
b0e3252e 3465/* Now SWASHGET is recasted into S_swatch_get in this file. */
680c470c
ST
3466
3467/* Note:
3468 * Returns the value of property/mapping C<swash> for the first character
3469 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
4a4088c4 3470 * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
3d0f8846 3471 * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
af2af982
KW
3472 *
3473 * A "swash" is a hash which contains initially the keys/values set up by
3474 * SWASHNEW. The purpose is to be able to completely represent a Unicode
3475 * property for all possible code points. Things are stored in a compact form
3476 * (see utf8_heavy.pl) so that calculation is required to find the actual
3477 * property value for a given code point. As code points are looked up, new
3478 * key/value pairs are added to the hash, so that the calculation doesn't have
3479 * to ever be re-done. Further, each calculation is done, not just for the
3480 * desired one, but for a whole block of code points adjacent to that one.
3481 * For binary properties on ASCII machines, the block is usually for 64 code
3482 * points, starting with a code point evenly divisible by 64. Thus if the
3483 * property value for code point 257 is requested, the code goes out and
3484 * calculates the property values for all 64 code points between 256 and 319,
3485 * and stores these as a single 64-bit long bit vector, called a "swatch",
3486 * under the key for code point 256. The key is the UTF-8 encoding for code
3487 * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
3488 * for a code point is 13 bytes, the key will be 12 bytes long. If the value
3489 * for code point 258 is then requested, this code realizes that it would be
3490 * stored under the key for 256, and would find that value and extract the
3491 * relevant bit, offset from 256.
3492 *
3493 * Non-binary properties are stored in as many bits as necessary to represent
3494 * their values (32 currently, though the code is more general than that), not
fc273927 3495 * as single bits, but the principle is the same: the value for each key is a
af2af982
KW
3496 * vector that encompasses the property values for all code points whose UTF-8
3497 * representations are represented by the key. That is, for all code points
3498 * whose UTF-8 representations are length N bytes, and the key is the first N-1
3499 * bytes of that.
680c470c 3500 */
a0ed51b3 3501UV
680c470c 3502Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 3503{
ef8f7699 3504 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
3505 U32 klen;
3506 U32 off;
9b56a019 3507 STRLEN slen = 0;
7d85a32c 3508 STRLEN needents;
cfd0369c 3509 const U8 *tmps = NULL;
979f2922 3510 SV *swatch;
08fb1ac5 3511 const U8 c = *ptr;
3568d838 3512
7918f24d
NC
3513 PERL_ARGS_ASSERT_SWASH_FETCH;
3514
87367d5f
KW
3515 /* If it really isn't a hash, it isn't really swash; must be an inversion
3516 * list */
3517 if (SvTYPE(hv) != SVt_PVHV) {
3518 return _invlist_contains_cp((SV*)hv,
3519 (do_utf8)
3520 ? valid_utf8_to_uvchr(ptr, NULL)
3521 : c);
3522 }
3523
08fb1ac5
KW
3524 /* We store the values in a "swatch" which is a vec() value in a swash
3525 * hash. Code points 0-255 are a single vec() stored with key length
3526 * (klen) 0. All other code points have a UTF-8 representation
3527 * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
3528 * share 0xAA..0xYY, which is the key in the hash to that vec. So the key
3529 * length for them is the length of the encoded char - 1. ptr[klen] is the
3530 * final byte in the sequence representing the character */
3531 if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
3532 klen = 0;
3533 needents = 256;
3534 off = c;
3568d838 3535 }
08fb1ac5
KW
3536 else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
3537 klen = 0;
3538 needents = 256;
a62b247b 3539 off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
979f2922
ST
3540 }
3541 else {
08fb1ac5
KW
3542 klen = UTF8SKIP(ptr) - 1;
3543
3544 /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
3545 * the vec is the final byte in the sequence. (In EBCDIC this is
3546 * converted to I8 to get consecutive values.) To help you visualize
3547 * all this:
3548 * Straight 1047 After final byte
3549 * UTF-8 UTF-EBCDIC I8 transform
3550 * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
3551 * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
3552 * ...
3553 * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
3554 * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
3555 * ...
3556 * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
3557 * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
3558 * ...
3559 * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
3560 * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
3561 * ...
3562 * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
3563 * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
3564 *
3565 * (There are no discontinuities in the elided (...) entries.)
3566 * The UTF-8 key for these 33 code points is '\xD0' (which also is the
3567 * key for the next 31, up through U+043F, whose UTF-8 final byte is
3568 * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
3569 * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
3570 * index into the vec() swatch (after subtracting 0x80, which we
3571 * actually do with an '&').
3572 * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
3573 * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
3574 * dicontinuities which go away by transforming it into I8, and we
3575 * effectively subtract 0xA0 to get the index. */
979f2922 3576 needents = (1 << UTF_ACCUMULATION_SHIFT);
bc3632a8 3577 off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
979f2922 3578 }
7d85a32c 3579
a0ed51b3 3580 /*
4a4088c4 3581 * This single-entry cache saves about 1/3 of the UTF-8 overhead in test
a0ed51b3
LW
3582 * suite. (That is, only 7-8% overall over just a hash cache. Still,
3583 * it's nothing to sniff at.) Pity we usually come through at least
3584 * two function calls to get here...
3585 *
3586 * NB: this code assumes that swatches are never modified, once generated!
3587 */
3588
3568d838 3589 if (hv == PL_last_swash_hv &&
a0ed51b3 3590 klen == PL_last_swash_klen &&
27da23d5 3591 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
3592 {
3593 tmps = PL_last_swash_tmps;
3594 slen = PL_last_swash_slen;
3595 }
3596 else {
3597 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 3598 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 3599
b0e3252e 3600 /* If not cached, generate it via swatch_get */
979f2922 3601 if (!svp || !SvPOK(*svp)
08fb1ac5
KW
3602 || !(tmps = (const U8*)SvPV_const(*svp, slen)))
3603 {
3604 if (klen) {
3605 const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
3606 swatch = swatch_get(swash,
3607 code_point & ~((UV)needents - 1),
3608 needents);
3609 }
3610 else { /* For the first 256 code points, the swatch has a key of
3611 length 0 */
3612 swatch = swatch_get(swash, 0, needents);
3613 }
979f2922 3614
923e4eb5 3615 if (IN_PERL_COMPILETIME)
623e6609 3616 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 3617
979f2922 3618 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 3619
979f2922
ST
3620 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
3621 || (slen << 3) < needents)
5637ef5b 3622 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
147e3846 3623 "svp=%p, tmps=%p, slen=%" UVuf ", needents=%" UVuf,
5637ef5b 3624 svp, tmps, (UV)slen, (UV)needents);
a0ed51b3
LW
3625 }
3626
3627 PL_last_swash_hv = hv;
16d8f38a 3628 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 3629 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
3630 /* FIXME change interpvar.h? */
3631 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
3632 PL_last_swash_slen = slen;
3633 if (klen)
3634 Copy(ptr, PL_last_swash_key, klen, U8);
3635 }
3636
9faf8d75 3637 switch ((int)((slen << 3) / needents)) {
a0ed51b3 3638 case 1:
e7aca353 3639 return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
a0ed51b3 3640 case 8:
e7aca353 3641 return ((UV) tmps[off]);
a0ed51b3
LW
3642 case 16:
3643 off <<= 1;
e7aca353
JH
3644 return
3645 ((UV) tmps[off ] << 8) +
3646 ((UV) tmps[off + 1]);
a0ed51b3
LW
3647 case 32:
3648 off <<= 2;
e7aca353
JH
3649 return
3650 ((UV) tmps[off ] << 24) +
3651 ((UV) tmps[off + 1] << 16) +
3652 ((UV) tmps[off + 2] << 8) +
3653 ((UV) tmps[off + 3]);
a0ed51b3 3654 }
5637ef5b 3655 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
147e3846 3656 "slen=%" UVuf ", needents=%" UVuf, (UV)slen, (UV)needents);
670f1322 3657 NORETURN_FUNCTION_END;
a0ed51b3 3658}
2b9d42f0 3659
319009ee
KW
3660/* Read a single line of the main body of the swash input text. These are of
3661 * the form:
3662 * 0053 0056 0073
3663 * where each number is hex. The first two numbers form the minimum and
3664 * maximum of a range, and the third is the value associated with the range.
3665 * Not all swashes should have a third number
3666 *
3667 * On input: l points to the beginning of the line to be examined; it points
3668 * to somewhere in the string of the whole input text, and is
3669 * terminated by a \n or the null string terminator.
3670 * lend points to the null terminator of that string
3671 * wants_value is non-zero if the swash expects a third number
3672 * typestr is the name of the swash's mapping, like 'ToLower'
3673 * On output: *min, *max, and *val are set to the values read from the line.
3674 * returns a pointer just beyond the line examined. If there was no
3675 * valid min number on the line, returns lend+1
3676 */
3677
3678STATIC U8*
3679S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3680 const bool wants_value, const U8* const typestr)
3681{
3682 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
3683 STRLEN numlen; /* Length of the number */
02470786
KW
3684 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3685 | PERL_SCAN_DISALLOW_PREFIX
3686 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3687
3688 /* nl points to the next \n in the scan */
3689 U8* const nl = (U8*)memchr(l, '\n', lend - l);
3690
95543e92
KW
3691 PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
3692
319009ee
KW
3693 /* Get the first number on the line: the range minimum */