This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8.c: Consolidate duplicated string constants
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
a0ed51b3 35
806547a7 36static const char malformed_text[] = "Malformed UTF-8 character";
27da23d5 37static const char unees[] =
806547a7 38 "Malformed UTF-8 character (unexpected end of string)";
760c7c2f 39static const char cp_above_legal_max[] =
76513bdc 40 "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf ". This will be fatal in Perl 5.28";
760c7c2f 41
114d9c4d 42#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
901b21bf 43
48ef279e 44/*
ccfc67b7 45=head1 Unicode Support
7fefc6c1 46These are various utility functions for manipulating UTF8-encoded
72d33970 47strings. For the uninitiated, this is a method of representing arbitrary
61296642 48Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
49characters in the ASCII range are unmodified, and a zero byte never appears
50within non-zero characters.
166f8a29 51
eaf7a4d2
CS
52=cut
53*/
54
9cbfb8ab
KW
55void
56Perl__force_out_malformed_utf8_message(pTHX_
57 const U8 *const p, /* First byte in UTF-8 sequence */
58 const U8 * const e, /* Final byte in sequence (may include
59 multiple chars */
60 const U32 flags, /* Flags to pass to utf8n_to_uvchr(),
61 usually 0, or some DISALLOW flags */
62 const bool die_here) /* If TRUE, this function does not return */
63{
64 /* This core-only function is to be called when a malformed UTF-8 character
65 * is found, in order to output the detailed information about the
66 * malformation before dieing. The reason it exists is for the occasions
67 * when such a malformation is fatal, but warnings might be turned off, so
68 * that normally they would not be actually output. This ensures that they
69 * do get output. Because a sequence may be malformed in more than one
70 * way, multiple messages may be generated, so we can't make them fatal, as
71 * that would cause the first one to die.
72 *
73 * Instead we pretend -W was passed to perl, then die afterwards. The
74 * flexibility is here to return to the caller so they can finish up and
75 * die themselves */
76 U32 errors;
77
78 PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
79
80 ENTER;
c15a80f3 81 SAVEI8(PL_dowarn);
9cbfb8ab
KW
82 SAVESPTR(PL_curcop);
83
84 PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
85 if (PL_curcop) {
86 PL_curcop->cop_warnings = pWARN_ALL;
87 }
88
89 (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
90
91 LEAVE;
92
93 if (! errors) {
94 Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
95 " be called only when there are errors found");
96 }
97
98 if (die_here) {
99 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
100 }
101}
102
eaf7a4d2 103/*
378516de 104=for apidoc uvoffuni_to_utf8_flags
eebe1485 105
a27992cc 106THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af
KW
107Instead, B<Almost all code should use L</uvchr_to_utf8> or
108L</uvchr_to_utf8_flags>>.
a27992cc 109
de69f3af
KW
110This function is like them, but the input is a strict Unicode
111(as opposed to native) code point. Only in very rare circumstances should code
112not be using the native code point.
949cf498 113
efa9cd84 114For details, see the description for L</uvchr_to_utf8_flags>.
949cf498 115
eebe1485
SC
116=cut
117*/
118
c94c2f39
KW
119/* All these formats take a single UV code point argument */
120const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
121const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
122 " is not recommended for open interchange";
123const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
124 " may not be portable";
125const char above_31_bit_cp_format[] = "Code point 0x%" UVXf " is not"
126 " Unicode, and not portable";
127
8ee1cdcb
KW
128#define HANDLE_UNICODE_SURROGATE(uv, flags) \
129 STMT_START { \
130 if (flags & UNICODE_WARN_SURROGATE) { \
131 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
c94c2f39 132 surrogate_cp_format, uv); \
8ee1cdcb
KW
133 } \
134 if (flags & UNICODE_DISALLOW_SURROGATE) { \
135 return NULL; \
136 } \
137 } STMT_END;
138
139#define HANDLE_UNICODE_NONCHAR(uv, flags) \
140 STMT_START { \
141 if (flags & UNICODE_WARN_NONCHAR) { \
142 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
c94c2f39 143 nonchar_cp_format, uv); \
8ee1cdcb
KW
144 } \
145 if (flags & UNICODE_DISALLOW_NONCHAR) { \
146 return NULL; \
147 } \
148 } STMT_END;
149
ba6ed43c
KW
150/* Use shorter names internally in this file */
151#define SHIFT UTF_ACCUMULATION_SHIFT
152#undef MARK
153#define MARK UTF_CONTINUATION_MARK
154#define MASK UTF_CONTINUATION_MASK
155
dfe13c55 156U8 *
4b31b634 157Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
a0ed51b3 158{
378516de 159 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
7918f24d 160
2d1545e5 161 if (OFFUNI_IS_INVARIANT(uv)) {
4c8cd605 162 *d++ = LATIN1_TO_NATIVE(uv);
d9432125
KW
163 return d;
164 }
facc1dc2 165
3ea68d71 166 if (uv <= MAX_UTF8_TWO_BYTE) {
facc1dc2
KW
167 *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
168 *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
3ea68d71
KW
169 return d;
170 }
d9432125 171
ba6ed43c
KW
172 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
173 * below, the 16 is for start bytes E0-EF (which are all the possible ones
174 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
175 * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
176 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
177 * 0x800-0xFFFF on ASCII */
178 if (uv < (16 * (1U << (2 * SHIFT)))) {
179 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
180 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
181 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
182
183#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
184 aren't tested here */
185 /* The most likely code points in this range are below the surrogates.
186 * Do an extra test to quickly exclude those. */
187 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
188 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
189 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
190 {
8ee1cdcb
KW
191 HANDLE_UNICODE_NONCHAR(uv, flags);
192 }
193 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
194 HANDLE_UNICODE_SURROGATE(uv, flags);
760c7c2f 195 }
ba6ed43c
KW
196 }
197#endif
198 return d;
199 }
200
201 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
202 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
203 * happen starting with 4-byte characters on ASCII platforms. We unify the
204 * code for these with EBCDIC, even though some of them require 5-bytes on
205 * those, because khw believes the code saving is worth the very slight
206 * performance hit on these high EBCDIC code points. */
207
208 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
76513bdc
KW
209 if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
210 && ckWARN_d(WARN_DEPRECATED))
211 {
212 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
213 cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
a5bf80e0
KW
214 }
215 if ( (flags & UNICODE_WARN_SUPER)
216 || ( UNICODE_IS_ABOVE_31_BIT(uv)
217 && (flags & UNICODE_WARN_ABOVE_31_BIT)))
218 {
219 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
220
221 /* Choose the more dire applicable warning */
222 (UNICODE_IS_ABOVE_31_BIT(uv))
c94c2f39
KW
223 ? above_31_bit_cp_format
224 : super_cp_format,
a5bf80e0
KW
225 uv);
226 }
227 if (flags & UNICODE_DISALLOW_SUPER
228 || ( UNICODE_IS_ABOVE_31_BIT(uv)
229 && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
230 {
231 return NULL;
232 }
233 }
ba6ed43c
KW
234 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
235 HANDLE_UNICODE_NONCHAR(uv, flags);
507b9800 236 }
d9432125 237
ba6ed43c
KW
238 /* Test for and handle 4-byte result. In the test immediately below, the
239 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
240 * characters). The 3 is for 3 continuation bytes; these each contribute
241 * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
242 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
243 * 0x1_0000-0x1F_FFFF on ASCII */
244 if (uv < (8 * (1U << (3 * SHIFT)))) {
245 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
246 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
247 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
248 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
249
250#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
251 characters. The end-plane non-characters for EBCDIC were
252 handled just above */
253 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
254 HANDLE_UNICODE_NONCHAR(uv, flags);
d528804a 255 }
ba6ed43c
KW
256 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
257 HANDLE_UNICODE_SURROGATE(uv, flags);
258 }
259#endif
260
261 return d;
262 }
263
264 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
265 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
266 * format. The unrolled version above turns out to not save all that much
267 * time, and at these high code points (well above the legal Unicode range
268 * on ASCII platforms, and well above anything in common use in EBCDIC),
269 * khw believes that less code outweighs slight performance gains. */
270
d9432125 271 {
5aaebcb3 272 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
273 U8 *p = d+len-1;
274 while (p > d) {
4c8cd605 275 *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
276 uv >>= UTF_ACCUMULATION_SHIFT;
277 }
4c8cd605 278 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
279 return d+len;
280 }
a0ed51b3 281}
a5bf80e0 282
646ca15d 283/*
07693fe6
KW
284=for apidoc uvchr_to_utf8
285
bcb1a2d4 286Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 287of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
288C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
289the byte after the end of the new character. In other words,
07693fe6
KW
290
291 d = uvchr_to_utf8(d, uv);
292
293is the recommended wide native character-aware way of saying
294
295 *(d++) = uv;
296
760c7c2f
KW
297This function accepts any UV as input, but very high code points (above
298C<IV_MAX> on the platform) will raise a deprecation warning. This is
299typically 0x7FFF_FFFF in a 32-bit word.
300
301It is possible to forbid or warn on non-Unicode code points, or those that may
302be problematic by using L</uvchr_to_utf8_flags>.
de69f3af 303
07693fe6
KW
304=cut
305*/
306
de69f3af
KW
307/* This is also a macro */
308PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
309
07693fe6
KW
310U8 *
311Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
312{
de69f3af 313 return uvchr_to_utf8(d, uv);
07693fe6
KW
314}
315
de69f3af
KW
316/*
317=for apidoc uvchr_to_utf8_flags
318
319Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 320of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
321C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
322the byte after the end of the new character. In other words,
de69f3af
KW
323
324 d = uvchr_to_utf8_flags(d, uv, flags);
325
326or, in most cases,
327
328 d = uvchr_to_utf8_flags(d, uv, 0);
329
330This is the Unicode-aware way of saying
331
332 *(d++) = uv;
333
760c7c2f
KW
334If C<flags> is 0, this function accepts any UV as input, but very high code
335points (above C<IV_MAX> for the platform) will raise a deprecation warning.
336This is typically 0x7FFF_FFFF in a 32-bit word.
337
338Specifying C<flags> can further restrict what is allowed and not warned on, as
339follows:
de69f3af 340
796b6530 341If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
7ee537e6
KW
342the function will raise a warning, provided UTF8 warnings are enabled. If
343instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
344NULL. If both flags are set, the function will both warn and return NULL.
de69f3af 345
760c7c2f
KW
346Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
347affect how the function handles a Unicode non-character.
93e6dbd6 348
760c7c2f
KW
349And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
350affect the handling of code points that are above the Unicode maximum of
3510x10FFFF. Languages other than Perl may not be able to accept files that
352contain these.
93e6dbd6
KW
353
354The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
355the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
ecc1615f
KW
356three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
357allowed inputs to the strict UTF-8 traditionally defined by Unicode.
358Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
359C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
360above-Unicode and surrogate flags, but not the non-character ones, as
361defined in
362L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
363See L<perlunicode/Noncharacter code points>.
93e6dbd6 364
ab8e6d41
KW
365Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
366so using them is more problematic than other above-Unicode code points. Perl
367invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
368likely that non-Perl languages will not be able to read files that contain
369these that written by the perl interpreter; nor would Perl understand files
370written by something that uses a different extension. For these reasons, there
371is a separate set of flags that can warn and/or disallow these extremely high
372code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
373C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags. These
374are entirely independent from the deprecation warning for code points above
375C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
376code point that needs more than 31 bits to represent. When that happens,
377effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
37832-bit machines. (Of course C<UNICODE_DISALLOW_SUPER> will treat all
ab8e6d41
KW
379above-Unicode code points, including these, as malformations; and
380C<UNICODE_WARN_SUPER> warns on these.)
381
382On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
383extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
384than on ASCII. Prior to that, code points 2**31 and higher were simply
385unrepresentable, and a different, incompatible method was used to represent
386code points between 2**30 and 2**31 - 1. The flags C<UNICODE_WARN_ABOVE_31_BIT>
387and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
388platforms, warning and disallowing 2**31 and higher.
de69f3af 389
de69f3af
KW
390=cut
391*/
392
393/* This is also a macro */
394PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
395
07693fe6
KW
396U8 *
397Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
398{
de69f3af 399 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
400}
401
83dc0f42
KW
402PERL_STATIC_INLINE bool
403S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
404{
405 /* Returns TRUE if the first code point represented by the Perl-extended-
406 * UTF-8-encoded string starting at 's', and looking no further than 'e -
407 * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
408 *
409 * The function handles the case where the input bytes do not include all
410 * the ones necessary to represent a full character. That is, they may be
411 * the intial bytes of the representation of a code point, but possibly
412 * the final ones necessary for the complete representation may be beyond
413 * 'e - 1'.
414 *
415 * The function assumes that the sequence is well-formed UTF-8 as far as it
416 * goes, and is for a UTF-8 variant code point. If the sequence is
417 * incomplete, the function returns FALSE if there is any well-formed
418 * UTF-8 byte sequence that can complete it in such a way that a code point
419 * < 2**31 is produced; otherwise it returns TRUE.
420 *
421 * Getting this exactly right is slightly tricky, and has to be done in
422 * several places in this file, so is centralized here. It is based on the
423 * following table:
424 *
425 * U+7FFFFFFF (2 ** 31 - 1)
426 * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
427 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
428 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
429 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
430 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
431 * U+80000000 (2 ** 31):
432 * ASCII: \xFE\x82\x80\x80\x80\x80\x80
433 * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
434 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
435 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
436 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
437 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
438 */
439
440#ifdef EBCDIC
441
37086697
KW
442 /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
443 const U8 prefix[] = "\x41\x41\x41\x41\x41\x41\x42";
83dc0f42
KW
444 const STRLEN prefix_len = sizeof(prefix) - 1;
445 const STRLEN len = e - s;
f880f78a 446 const STRLEN cmp_len = MIN(prefix_len, len - 1);
83dc0f42
KW
447
448#else
449
450 PERL_UNUSED_ARG(e);
451
452#endif
453
454 PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
455
456 assert(! UTF8_IS_INVARIANT(*s));
457
458#ifndef EBCDIC
459
460 /* Technically, a start byte of FE can be for a code point that fits into
461 * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
462 * malformation. */
463 return (*s >= 0xFE);
464
465#else
466
467 /* On the EBCDIC code pages we handle, only 0xFE can mean a 32-bit or
468 * larger code point (0xFF is an invariant). For 0xFE, we need at least 2
469 * bytes, and maybe up through 8 bytes, to be sure if the value is above 31
470 * bits. */
471 if (*s != 0xFE || len == 1) {
472 return FALSE;
473 }
474
475 /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
476 * \x41 and \x42. */
477 return cBOOL(memGT(s + 1, prefix, cmp_len));
478
479#endif
480
481}
482
c179379d
KW
483/* Anything larger than this will overflow the word if it were converted into a UV */
484#if defined(UV_IS_QUAD)
485# ifdef EBCDIC /* Actually is I8 */
486# define HIGHEST_REPRESENTABLE_UTF8 \
487 "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
488# else
489# define HIGHEST_REPRESENTABLE_UTF8 \
490 "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
491# endif
492#else /* 32-bit */
493# ifdef EBCDIC
494# define HIGHEST_REPRESENTABLE_UTF8 \
495 "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
496# else
497# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
498# endif
499#endif
500
12a4bed3
KW
501PERL_STATIC_INLINE bool
502S_does_utf8_overflow(const U8 * const s, const U8 * e)
503{
504 const U8 *x;
505 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
506
c551bb01
KW
507#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
508
509 const STRLEN len = e - s;
510
511#endif
512
12a4bed3
KW
513 /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
514 * platform, that is if it represents a code point larger than the highest
515 * representable code point. (For ASCII platforms, we could use memcmp()
516 * because we don't have to convert each byte to I8, but it's very rare
517 * input indeed that would approach overflow, so the loop below will likely
518 * only get executed once.
519 *
520 * 'e' must not be beyond a full character. If it is less than a full
521 * character, the function returns FALSE if there is any input beyond 'e'
522 * that could result in a non-overflowing code point */
523
524 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
b0b342d4
KW
525 assert(s <= e && s + UTF8SKIP(s) >= e);
526
527#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
528
529 /* On 32 bit ASCII machines, many overlongs that start with FF don't
530 * overflow */
531
c551bb01 532 if (isFF_OVERLONG(s, len)) {
b0b342d4
KW
533 const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84";
534 return memGE(s, max_32_bit_overlong,
c551bb01 535 MIN(len, sizeof(max_32_bit_overlong) - 1));
b0b342d4
KW
536 }
537
538#endif
12a4bed3
KW
539
540 for (x = s; x < e; x++, y++) {
541
542 /* If this byte is larger than the corresponding highest UTF-8 byte, it
543 * overflows */
544 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
545 return TRUE;
546 }
547
548 /* If not the same as this byte, it must be smaller, doesn't overflow */
549 if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
550 return FALSE;
551 }
552 }
553
554 /* Got to the end and all bytes are the same. If the input is a whole
555 * character, it doesn't overflow. And if it is a partial character,
556 * there's not enough information to tell, so assume doesn't overflow */
557 return FALSE;
558}
559
560PERL_STATIC_INLINE bool
561S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
562{
563 /* Overlongs can occur whenever the number of continuation bytes
564 * changes. That means whenever the number of leading 1 bits in a start
565 * byte increases from the next lower start byte. That happens for start
566 * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following
567 * illegal start bytes have already been excluded, so don't need to be
568 * tested here;
569 * ASCII platforms: C0, C1
570 * EBCDIC platforms C0, C1, C2, C3, C4, E0
571 *
572 * At least a second byte is required to determine if other sequences will
573 * be an overlong. */
574
575 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
576 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
577
578 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
579 assert(len > 1 && UTF8_IS_START(*s));
580
581 /* Each platform has overlongs after the start bytes given above (expressed
582 * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
583 * the logic is the same, except the E0 overlong has already been excluded
584 * on EBCDIC platforms. The values below were found by manually
585 * inspecting the UTF-8 patterns. See the tables in utf8.h and
586 * utfebcdic.h. */
587
588# ifdef EBCDIC
589# define F0_ABOVE_OVERLONG 0xB0
590# define F8_ABOVE_OVERLONG 0xA8
591# define FC_ABOVE_OVERLONG 0xA4
592# define FE_ABOVE_OVERLONG 0xA2
593# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
594 /* I8(0xfe) is FF */
595# else
596
597 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
598 return TRUE;
599 }
600
601# define F0_ABOVE_OVERLONG 0x90
602# define F8_ABOVE_OVERLONG 0x88
603# define FC_ABOVE_OVERLONG 0x84
604# define FE_ABOVE_OVERLONG 0x82
605# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
606# endif
607
608
609 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
610 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
611 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
612 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
613 {
614 return TRUE;
615 }
616
b0b342d4
KW
617 /* Check for the FF overlong */
618 return isFF_OVERLONG(s, len);
619}
620
621PERL_STATIC_INLINE bool
622S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
623{
624 PERL_ARGS_ASSERT_ISFF_OVERLONG;
12a4bed3
KW
625
626 /* Check for the FF overlong. This happens only if all these bytes match;
627 * what comes after them doesn't matter. See tables in utf8.h,
b0b342d4 628 * utfebcdic.h. */
12a4bed3 629
b0b342d4
KW
630 return len >= sizeof(FF_OVERLONG_PREFIX) - 1
631 && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
632 sizeof(FF_OVERLONG_PREFIX) - 1));
12a4bed3
KW
633}
634
635#undef F0_ABOVE_OVERLONG
636#undef F8_ABOVE_OVERLONG
637#undef FC_ABOVE_OVERLONG
638#undef FE_ABOVE_OVERLONG
639#undef FF_OVERLONG_PREFIX
640
35f8c9bd 641STRLEN
edc2c47a 642Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
35f8c9bd 643{
2b479609 644 STRLEN len;
12a4bed3 645 const U8 *x;
35f8c9bd 646
2b479609
KW
647 /* A helper function that should not be called directly.
648 *
649 * This function returns non-zero if the string beginning at 's' and
650 * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
651 * code point; otherwise it returns 0. The examination stops after the
652 * first code point in 's' is validated, not looking at the rest of the
653 * input. If 'e' is such that there are not enough bytes to represent a
654 * complete code point, this function will return non-zero anyway, if the
655 * bytes it does have are well-formed UTF-8 as far as they go, and aren't
656 * excluded by 'flags'.
657 *
658 * A non-zero return gives the number of bytes required to represent the
659 * code point. Be aware that if the input is for a partial character, the
660 * return will be larger than 'e - s'.
661 *
662 * This function assumes that the code point represented is UTF-8 variant.
663 * The caller should have excluded this possibility before calling this
664 * function.
665 *
666 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
667 * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
668 * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
669 * disallowed by the flags. If the input is only for a partial character,
670 * the function will return non-zero if there is any sequence of
671 * well-formed UTF-8 that, when appended to the input sequence, could
672 * result in an allowed code point; otherwise it returns 0. Non characters
673 * cannot be determined based on partial character input. But many of the
674 * other excluded types can be determined with just the first one or two
675 * bytes.
676 *
677 */
678
679 PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
680
681 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
682 |UTF8_DISALLOW_ABOVE_31_BIT)));
683 assert(! UTF8_IS_INVARIANT(*s));
35f8c9bd 684
2b479609 685 /* A variant char must begin with a start byte */
35f8c9bd
KW
686 if (UNLIKELY(! UTF8_IS_START(*s))) {
687 return 0;
688 }
689
edc2c47a
KW
690 /* Examine a maximum of a single whole code point */
691 if (e - s > UTF8SKIP(s)) {
692 e = s + UTF8SKIP(s);
693 }
694
2b479609
KW
695 len = e - s;
696
697 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
698 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
35f8c9bd 699
2b479609
KW
700 /* The code below is derived from this table. Keep in mind that legal
701 * continuation bytes range between \x80..\xBF for UTF-8, and
702 * \xA0..\xBF for I8. Anything above those aren't continuation bytes.
703 * Hence, we don't have to test the upper edge because if any of those
704 * are encountered, the sequence is malformed, and will fail elsewhere
705 * in this function.
706 * UTF-8 UTF-EBCDIC I8
707 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
708 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
709 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
710 *
711 */
712
713#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
714# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
19794540 715# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
2b479609 716
19794540
KW
717# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
718 /* B6 and B7 */ \
719 && ((s1) & 0xFE ) == 0xB6)
2b479609
KW
720#else
721# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
19794540
KW
722# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
723# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
2b479609
KW
724#endif
725
726 if ( (flags & UTF8_DISALLOW_SUPER)
ddb65933
KW
727 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
728 {
2b479609
KW
729 return 0; /* Above Unicode */
730 }
731
732 if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT)
733 && UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
734 {
735 return 0; /* Above 31 bits */
736 }
737
738 if (len > 1) {
739 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
740
741 if ( (flags & UTF8_DISALLOW_SUPER)
19794540 742 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
2b479609
KW
743 {
744 return 0; /* Above Unicode */
745 }
746
747 if ( (flags & UTF8_DISALLOW_SURROGATE)
19794540 748 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
2b479609
KW
749 {
750 return 0; /* Surrogate */
751 }
752
753 if ( (flags & UTF8_DISALLOW_NONCHAR)
754 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
755 {
756 return 0; /* Noncharacter code point */
757 }
758 }
759 }
760
761 /* Make sure that all that follows are continuation bytes */
35f8c9bd
KW
762 for (x = s + 1; x < e; x++) {
763 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
764 return 0;
765 }
766 }
767
af13dd8a 768 /* Here is syntactically valid. Next, make sure this isn't the start of an
12a4bed3
KW
769 * overlong. */
770 if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
771 return 0;
af13dd8a
KW
772 }
773
12a4bed3
KW
774 /* And finally, that the code point represented fits in a word on this
775 * platform */
776 if (does_utf8_overflow(s, e)) {
777 return 0;
35f8c9bd
KW
778 }
779
2b479609 780 return UTF8SKIP(s);
35f8c9bd
KW
781}
782
7e2f38b2
KW
783char *
784Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
7cf8d05d
KW
785{
786 /* Returns a mortalized C string that is a displayable copy of the 'len'
7e2f38b2
KW
787 * bytes starting at 's'. 'format' gives how to display each byte.
788 * Currently, there are only two formats, so it is currently a bool:
789 * 0 \xab
790 * 1 ab (that is a space between two hex digit bytes)
791 */
7cf8d05d
KW
792
793 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
794 trailing NUL */
795 const U8 * const e = s + len;
796 char * output;
797 char * d;
798
799 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
800
801 Newx(output, output_len, char);
802 SAVEFREEPV(output);
803
804 d = output;
805 for (; s < e; s++) {
806 const unsigned high_nibble = (*s & 0xF0) >> 4;
807 const unsigned low_nibble = (*s & 0x0F);
808
7e2f38b2
KW
809 if (format) {
810 *d++ = ' ';
811 }
812 else {
813 *d++ = '\\';
814 *d++ = 'x';
815 }
7cf8d05d
KW
816
817 if (high_nibble < 10) {
818 *d++ = high_nibble + '0';
819 }
820 else {
821 *d++ = high_nibble - 10 + 'a';
822 }
823
824 if (low_nibble < 10) {
825 *d++ = low_nibble + '0';
826 }
827 else {
828 *d++ = low_nibble - 10 + 'a';
829 }
830 }
831
832 *d = '\0';
833 return output;
834}
835
806547a7 836PERL_STATIC_INLINE char *
7cf8d05d
KW
837S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
838
839 /* How many bytes to print */
3cc6a05e 840 STRLEN print_len,
7cf8d05d
KW
841
842 /* Which one is the non-continuation */
843 const STRLEN non_cont_byte_pos,
844
845 /* How many bytes should there be? */
846 const STRLEN expect_len)
806547a7
KW
847{
848 /* Return the malformation warning text for an unexpected continuation
849 * byte. */
850
7cf8d05d 851 const char * const where = (non_cont_byte_pos == 1)
806547a7 852 ? "immediately"
7cf8d05d
KW
853 : Perl_form(aTHX_ "%d bytes",
854 (int) non_cont_byte_pos);
806547a7
KW
855
856 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
857
7cf8d05d
KW
858 /* We don't need to pass this parameter, but since it has already been
859 * calculated, it's likely faster to pass it; verify under DEBUGGING */
860 assert(expect_len == UTF8SKIP(s));
861
862 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
863 " %s after start byte 0x%02x; need %d bytes, got %d)",
864 malformed_text,
7e2f38b2 865 _byte_dump_string(s, print_len, 0),
7cf8d05d
KW
866 *(s + non_cont_byte_pos),
867 where,
868 *s,
869 (int) expect_len,
870 (int) non_cont_byte_pos);
806547a7
KW
871}
872
35f8c9bd
KW
873/*
874
de69f3af 875=for apidoc utf8n_to_uvchr
378516de
KW
876
877THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af 878Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
67e989fb 879
9041c2e3 880Bottom level UTF-8 decode routine.
de69f3af 881Returns the native code point value of the first character in the string C<s>,
746afd53
KW
882which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
883C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
884the length, in bytes, of that character.
949cf498
KW
885
886The value of C<flags> determines the behavior when C<s> does not point to a
2b5e7bc2
KW
887well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
888causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
889is the next possible position in C<s> that could begin a non-malformed
890character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
891is raised. Some UTF-8 input sequences may contain multiple malformations.
892This function tries to find every possible one in each call, so multiple
893warnings can be raised for each sequence.
949cf498
KW
894
895Various ALLOW flags can be set in C<flags> to allow (and not warn on)
896individual types of malformations, such as the sequence being overlong (that
897is, when there is a shorter sequence that can express the same code point;
898overlong sequences are expressly forbidden in the UTF-8 standard due to
899potential security issues). Another malformation example is the first byte of
900a character not being a legal first byte. See F<utf8.h> for the list of such
94953955
KW
901flags. Even if allowed, this function generally returns the Unicode
902REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
903F<utf8.h> to override this behavior for the overlong malformations, but don't
904do that except for very specialized purposes.
949cf498 905
796b6530 906The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
907flags) malformation is found. If this flag is set, the routine assumes that
908the caller will raise a warning, and this function will silently just set
d088425d
KW
909C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
910
75200dff 911Note that this API requires disambiguation between successful decoding a C<NUL>
796b6530 912character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
111fa700
KW
913in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
914be set to 1. To disambiguate, upon a zero return, see if the first byte of
915C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
f9380377 916error. Or you can use C<L</utf8n_to_uvchr_error>>.
949cf498
KW
917
918Certain code points are considered problematic. These are Unicode surrogates,
746afd53 919Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 920By default these are considered regular code points, but certain situations
ecc1615f
KW
921warrant special handling for them, which can be specified using the C<flags>
922parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
923three classes are treated as malformations and handled as such. The flags
924C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
925C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
926disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
927restricts the allowed inputs to the strict UTF-8 traditionally defined by
928Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
929definition given by
930L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
931The difference between traditional strictness and C9 strictness is that the
932latter does not forbid non-character code points. (They are still discouraged,
933however.) For more discussion see L<perlunicode/Noncharacter code points>.
934
935The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
936C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
796b6530
KW
937C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
938raised for their respective categories, but otherwise the code points are
939considered valid (not malformations). To get a category to both be treated as
940a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 941(But note that warnings are not raised if lexically disabled nor if
796b6530 942C<UTF8_CHECK_ONLY> is also specified.)
949cf498 943
760c7c2f
KW
944It is now deprecated to have very high code points (above C<IV_MAX> on the
945platforms) and this function will raise a deprecation warning for these (unless
d5944cab 946such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1)
760c7c2f 947in a 32-bit word.
ab8e6d41
KW
948
949Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
950so using them is more problematic than other above-Unicode code points. Perl
951invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
952likely that non-Perl languages will not be able to read files that contain
aff2be59 953these; nor would Perl understand files
ab8e6d41
KW
954written by something that uses a different extension. For these reasons, there
955is a separate set of flags that can warn and/or disallow these extremely high
956code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
957C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags. These
958are entirely independent from the deprecation warning for code points above
959C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
960code point that needs more than 31 bits to represent. When that happens,
961effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
96232-bit machines. (Of course C<UTF8_DISALLOW_SUPER> will treat all
ab8e6d41
KW
963above-Unicode code points, including these, as malformations; and
964C<UTF8_WARN_SUPER> warns on these.)
965
966On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
967extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
968than on ASCII. Prior to that, code points 2**31 and higher were simply
969unrepresentable, and a different, incompatible method was used to represent
970code points between 2**30 and 2**31 - 1. The flags C<UTF8_WARN_ABOVE_31_BIT>
971and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
972platforms, warning and disallowing 2**31 and higher.
949cf498
KW
973
974All other code points corresponding to Unicode characters, including private
975use and those yet to be assigned, are never considered malformed and never
976warn.
67e989fb 977
37607a96 978=cut
f9380377
KW
979
980Also implemented as a macro in utf8.h
981*/
982
983UV
984Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
985 STRLEN curlen,
986 STRLEN *retlen,
987 const U32 flags)
988{
989 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
990
991 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
992}
993
994/*
995
996=for apidoc utf8n_to_uvchr_error
997
998THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
999Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
1000
1001This function is for code that needs to know what the precise malformation(s)
1002are when an error is found.
1003
1004It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1005all the others, C<errors>. If this parameter is 0, this function behaves
1006identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
1007to a C<U32> variable, which this function sets to indicate any errors found.
1008Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
1009C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
1010of these bits will be set if a malformation is found, even if the input
7a65503b 1011C<flags> parameter indicates that the given malformation is allowed; those
f9380377
KW
1012exceptions are noted:
1013
1014=over 4
1015
1016=item C<UTF8_GOT_ABOVE_31_BIT>
1017
1018The code point represented by the input UTF-8 sequence occupies more than 31
1019bits.
1020This bit is set only if the input C<flags> parameter contains either the
1021C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
1022
1023=item C<UTF8_GOT_CONTINUATION>
1024
1025The input sequence was malformed in that the first byte was a a UTF-8
1026continuation byte.
1027
1028=item C<UTF8_GOT_EMPTY>
1029
1030The input C<curlen> parameter was 0.
1031
1032=item C<UTF8_GOT_LONG>
1033
1034The input sequence was malformed in that there is some other sequence that
1035evaluates to the same code point, but that sequence is shorter than this one.
1036
fecaf136
KW
1037Until Unicode 3.1, it was legal for programs to accept this malformation, but
1038it was discovered that this created security issues.
1039
f9380377
KW
1040=item C<UTF8_GOT_NONCHAR>
1041
1042The code point represented by the input UTF-8 sequence is for a Unicode
1043non-character code point.
1044This bit is set only if the input C<flags> parameter contains either the
1045C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1046
1047=item C<UTF8_GOT_NON_CONTINUATION>
1048
1049The input sequence was malformed in that a non-continuation type byte was found
1050in a position where only a continuation type one should be.
1051
1052=item C<UTF8_GOT_OVERFLOW>
1053
1054The input sequence was malformed in that it is for a code point that is not
1055representable in the number of bits available in a UV on the current platform.
1056
1057=item C<UTF8_GOT_SHORT>
1058
1059The input sequence was malformed in that C<curlen> is smaller than required for
1060a complete sequence. In other words, the input is for a partial character
1061sequence.
1062
1063=item C<UTF8_GOT_SUPER>
1064
1065The input sequence was malformed in that it is for a non-Unicode code point;
1066that is, one above the legal Unicode maximum.
1067This bit is set only if the input C<flags> parameter contains either the
1068C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1069
1070=item C<UTF8_GOT_SURROGATE>
1071
1072The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
1073code point.
1074This bit is set only if the input C<flags> parameter contains either the
1075C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1076
1077=back
1078
133551d8
KW
1079To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1080flag to suppress any warnings, and then examine the C<*errors> return.
1081
f9380377 1082=cut
37607a96 1083*/
67e989fb 1084
a0ed51b3 1085UV
f9380377
KW
1086Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
1087 STRLEN curlen,
1088 STRLEN *retlen,
1089 const U32 flags,
1090 U32 * errors)
a0ed51b3 1091{
d4c19fe8 1092 const U8 * const s0 = s;
2b5e7bc2
KW
1093 U8 * send = NULL; /* (initialized to silence compilers' wrong
1094 warning) */
1095 U32 possible_problems = 0; /* A bit is set here for each potential problem
1096 found as we go along */
eb83ed87 1097 UV uv = *s;
2b5e7bc2
KW
1098 STRLEN expectlen = 0; /* How long should this sequence be?
1099 (initialized to silence compilers' wrong
1100 warning) */
e308b348 1101 STRLEN avail_len = 0; /* When input is too short, gives what that is */
f9380377
KW
1102 U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
1103 this gets set and discarded */
a0dbb045 1104
2b5e7bc2
KW
1105 /* The below are used only if there is both an overlong malformation and a
1106 * too short one. Otherwise the first two are set to 's0' and 'send', and
1107 * the third not used at all */
1108 U8 * adjusted_s0 = (U8 *) s0;
5ec712b1
KW
1109 U8 * adjusted_send = NULL; /* (Initialized to silence compilers' wrong
1110 warning) */
e9f2c446
KW
1111 U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1112 routine; see [perl #130921] */
2b5e7bc2 1113 UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
7918f24d 1114
f9380377
KW
1115 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1116
1117 if (errors) {
1118 *errors = 0;
1119 }
1120 else {
1121 errors = &discard_errors;
1122 }
a0dbb045 1123
eb83ed87
KW
1124 /* The order of malformation tests here is important. We should consume as
1125 * few bytes as possible in order to not skip any valid character. This is
1126 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1127 * http://unicode.org/reports/tr36 for more discussion as to why. For
1128 * example, once we've done a UTF8SKIP, we can tell the expected number of
1129 * bytes, and could fail right off the bat if the input parameters indicate
1130 * that there are too few available. But it could be that just that first
1131 * byte is garbled, and the intended character occupies fewer bytes. If we
1132 * blindly assumed that the first byte is correct, and skipped based on
1133 * that number, we could skip over a valid input character. So instead, we
1134 * always examine the sequence byte-by-byte.
1135 *
1136 * We also should not consume too few bytes, otherwise someone could inject
1137 * things. For example, an input could be deliberately designed to
1138 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 1139 * returning to the caller C<*retlen> pointing to the very next byte (one
eb83ed87
KW
1140 * which is actually part of of the overflowing sequence), that could look
1141 * legitimate to the caller, which could discard the initial partial
2b5e7bc2
KW
1142 * sequence and process the rest, inappropriately.
1143 *
1144 * Some possible input sequences are malformed in more than one way. This
1145 * function goes to lengths to try to find all of them. This is necessary
1146 * for correctness, as the inputs may allow one malformation but not
1147 * another, and if we abandon searching for others after finding the
1148 * allowed one, we could allow in something that shouldn't have been.
1149 */
eb83ed87 1150
b5b9af04 1151 if (UNLIKELY(curlen == 0)) {
2b5e7bc2
KW
1152 possible_problems |= UTF8_GOT_EMPTY;
1153 curlen = 0;
5a48568d 1154 uv = UNICODE_REPLACEMENT;
2b5e7bc2 1155 goto ready_to_handle_errors;
0c443dc2
JH
1156 }
1157
eb83ed87
KW
1158 expectlen = UTF8SKIP(s);
1159
1160 /* A well-formed UTF-8 character, as the vast majority of calls to this
1161 * function will be for, has this expected length. For efficiency, set
1162 * things up here to return it. It will be overriden only in those rare
1163 * cases where a malformation is found */
1164 if (retlen) {
1165 *retlen = expectlen;
1166 }
1167
1168 /* An invariant is trivially well-formed */
1d72bdf6 1169 if (UTF8_IS_INVARIANT(uv)) {
de69f3af 1170 return uv;
a0ed51b3 1171 }
67e989fb 1172
eb83ed87 1173 /* A continuation character can't start a valid sequence */
b5b9af04 1174 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
2b5e7bc2
KW
1175 possible_problems |= UTF8_GOT_CONTINUATION;
1176 curlen = 1;
1177 uv = UNICODE_REPLACEMENT;
1178 goto ready_to_handle_errors;
ba210ebe 1179 }
9041c2e3 1180
dcd27b3c 1181 /* Here is not a continuation byte, nor an invariant. The only thing left
ddb65933
KW
1182 * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
1183 * because it excludes start bytes like \xC0 that always lead to
1184 * overlongs.) */
dcd27b3c 1185
534752c1
KW
1186 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1187 * that indicate the number of bytes in the character's whole UTF-8
1188 * sequence, leaving just the bits that are part of the value. */
1189 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
ba210ebe 1190
e308b348
KW
1191 /* Setup the loop end point, making sure to not look past the end of the
1192 * input string, and flag it as too short if the size isn't big enough. */
1193 send = (U8*) s0;
1194 if (UNLIKELY(curlen < expectlen)) {
1195 possible_problems |= UTF8_GOT_SHORT;
1196 avail_len = curlen;
1197 send += curlen;
1198 }
1199 else {
1200 send += expectlen;
1201 }
1202 adjusted_send = send;
1203
eb83ed87 1204 /* Now, loop through the remaining bytes in the character's sequence,
e308b348 1205 * accumulating each into the working value as we go. */
eb83ed87 1206 for (s = s0 + 1; s < send; s++) {
b5b9af04 1207 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
8850bf83 1208 uv = UTF8_ACCUMULATE(uv, *s);
2b5e7bc2
KW
1209 continue;
1210 }
1211
1212 /* Here, found a non-continuation before processing all expected bytes.
1213 * This byte indicates the beginning of a new character, so quit, even
1214 * if allowing this malformation. */
2b5e7bc2 1215 possible_problems |= UTF8_GOT_NON_CONTINUATION;
e308b348 1216 break;
eb83ed87
KW
1217 } /* End of loop through the character's bytes */
1218
1219 /* Save how many bytes were actually in the character */
1220 curlen = s - s0;
1221
2b5e7bc2
KW
1222 /* Note that there are two types of too-short malformation. One is when
1223 * there is actual wrong data before the normal termination of the
1224 * sequence. The other is that the sequence wasn't complete before the end
1225 * of the data we are allowed to look at, based on the input 'curlen'.
1226 * This means that we were passed data for a partial character, but it is
1227 * valid as far as we saw. The other is definitely invalid. This
1228 * distinction could be important to a caller, so the two types are kept
15b010f0
KW
1229 * separate.
1230 *
1231 * A convenience macro that matches either of the too-short conditions. */
1232# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1233
1234 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1235 uv_so_far = uv;
1236 uv = UNICODE_REPLACEMENT;
1237 }
2b5e7bc2
KW
1238
1239 /* Check for overflow */
1240 if (UNLIKELY(does_utf8_overflow(s0, send))) {
1241 possible_problems |= UTF8_GOT_OVERFLOW;
1242 uv = UNICODE_REPLACEMENT;
eb83ed87 1243 }
eb83ed87 1244
2b5e7bc2
KW
1245 /* Check for overlong. If no problems so far, 'uv' is the correct code
1246 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1247 * we must look at the UTF-8 byte sequence itself to see if it is for an
1248 * overlong */
1249 if ( ( LIKELY(! possible_problems)
1250 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1251 || ( UNLIKELY( possible_problems)
1252 && ( UNLIKELY(! UTF8_IS_START(*s0))
1253 || ( curlen > 1
1254 && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
1255 send - s0))))))
2f8f112e 1256 {
2b5e7bc2
KW
1257 possible_problems |= UTF8_GOT_LONG;
1258
abc28b54
KW
1259 if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
1260 /* The calculation in the 'true' branch of this 'if'
1261 * below won't work if overflows, and isn't needed
1262 * anyway. Further below we handle all overflow
1263 * cases */
1264 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1265 {
2b5e7bc2
KW
1266 UV min_uv = uv_so_far;
1267 STRLEN i;
1268
1269 /* Here, the input is both overlong and is missing some trailing
1270 * bytes. There is no single code point it could be for, but there
1271 * may be enough information present to determine if what we have
1272 * so far is for an unallowed code point, such as for a surrogate.
1273 * The code below has the intelligence to determine this, but just
1274 * for non-overlong UTF-8 sequences. What we do here is calculate
1275 * the smallest code point the input could represent if there were
1276 * no too short malformation. Then we compute and save the UTF-8
1277 * for that, which is what the code below looks at instead of the
1278 * raw input. It turns out that the smallest such code point is
1279 * all we need. */
1280 for (i = curlen; i < expectlen; i++) {
1281 min_uv = UTF8_ACCUMULATE(min_uv,
1282 I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
1283 }
1284
e9f2c446 1285 adjusted_s0 = temp_char_buf;
2b5e7bc2
KW
1286 adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1287 }
eb83ed87
KW
1288 }
1289
2b5e7bc2
KW
1290 /* Now check that the input isn't for a problematic code point not allowed
1291 * by the input parameters. */
1292 /* isn't problematic if < this */
1293 if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
1294 || ( UNLIKELY(possible_problems)
d60baaa7
KW
1295
1296 /* if overflow, we know without looking further
1297 * precisely which of the problematic types it is,
1298 * and we deal with those in the overflow handling
1299 * code */
1300 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
2b5e7bc2 1301 && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
760c7c2f
KW
1302 && ((flags & ( UTF8_DISALLOW_NONCHAR
1303 |UTF8_DISALLOW_SURROGATE
1304 |UTF8_DISALLOW_SUPER
1305 |UTF8_DISALLOW_ABOVE_31_BIT
1306 |UTF8_WARN_NONCHAR
1307 |UTF8_WARN_SURROGATE
1308 |UTF8_WARN_SUPER
1309 |UTF8_WARN_ABOVE_31_BIT))
2b5e7bc2
KW
1310 /* In case of a malformation, 'uv' is not valid, and has
1311 * been changed to something in the Unicode range.
1312 * Currently we don't output a deprecation message if there
1313 * is already a malformation, so we don't have to special
1314 * case the test immediately below */
760c7c2f
KW
1315 || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1316 && ckWARN_d(WARN_DEPRECATED))))
eb83ed87 1317 {
2b5e7bc2
KW
1318 /* If there were no malformations, or the only malformation is an
1319 * overlong, 'uv' is valid */
1320 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1321 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1322 possible_problems |= UTF8_GOT_SURROGATE;
1323 }
1324 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
1325 possible_problems |= UTF8_GOT_SUPER;
1326 }
1327 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1328 possible_problems |= UTF8_GOT_NONCHAR;
1329 }
1330 }
1331 else { /* Otherwise, need to look at the source UTF-8, possibly
1332 adjusted to be non-overlong */
1333
1334 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1335 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
ea5ced44 1336 {
2b5e7bc2
KW
1337 possible_problems |= UTF8_GOT_SUPER;
1338 }
1339 else if (curlen > 1) {
1340 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
1341 NATIVE_UTF8_TO_I8(*adjusted_s0),
1342 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
ea5ced44 1343 {
2b5e7bc2 1344 possible_problems |= UTF8_GOT_SUPER;
ea5ced44 1345 }
2b5e7bc2
KW
1346 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
1347 NATIVE_UTF8_TO_I8(*adjusted_s0),
1348 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1349 {
1350 possible_problems |= UTF8_GOT_SURROGATE;
ea5ced44
KW
1351 }
1352 }
c0236afe 1353
2b5e7bc2
KW
1354 /* We need a complete well-formed UTF-8 character to discern
1355 * non-characters, so can't look for them here */
1356 }
1357 }
949cf498 1358
2b5e7bc2
KW
1359 ready_to_handle_errors:
1360
1361 /* At this point:
1362 * curlen contains the number of bytes in the sequence that
1363 * this call should advance the input by.
e308b348
KW
1364 * avail_len gives the available number of bytes passed in, but
1365 * only if this is less than the expected number of
1366 * bytes, based on the code point's start byte.
2b5e7bc2
KW
1367 * possible_problems' is 0 if there weren't any problems; otherwise a bit
1368 * is set in it for each potential problem found.
1369 * uv contains the code point the input sequence
1370 * represents; or if there is a problem that prevents
1371 * a well-defined value from being computed, it is
1372 * some subsitute value, typically the REPLACEMENT
1373 * CHARACTER.
1374 * s0 points to the first byte of the character
1375 * send points to just after where that (potentially
1376 * partial) character ends
1377 * adjusted_s0 normally is the same as s0, but in case of an
1378 * overlong for which the UTF-8 matters below, it is
1379 * the first byte of the shortest form representation
1380 * of the input.
1381 * adjusted_send normally is the same as 'send', but if adjusted_s0
1382 * is set to something other than s0, this points one
1383 * beyond its end
1384 */
eb83ed87 1385
2b5e7bc2
KW
1386 if (UNLIKELY(possible_problems)) {
1387 bool disallowed = FALSE;
1388 const U32 orig_problems = possible_problems;
1389
1390 while (possible_problems) { /* Handle each possible problem */
1391 UV pack_warn = 0;
1392 char * message = NULL;
1393
1394 /* Each 'if' clause handles one problem. They are ordered so that
1395 * the first ones' messages will be displayed before the later
1396 * ones; this is kinda in decreasing severity order */
1397 if (possible_problems & UTF8_GOT_OVERFLOW) {
1398
1399 /* Overflow means also got a super and above 31 bits, but we
1400 * handle all three cases here */
1401 possible_problems
1402 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
f9380377
KW
1403 *errors |= UTF8_GOT_OVERFLOW;
1404
1405 /* But the API says we flag all errors found */
1406 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1407 *errors |= UTF8_GOT_SUPER;
1408 }
ddb65933
KW
1409 if (flags
1410 & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT))
1411 {
f9380377
KW
1412 *errors |= UTF8_GOT_ABOVE_31_BIT;
1413 }
2b5e7bc2 1414
d60baaa7
KW
1415 /* Disallow if any of the three categories say to */
1416 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
1417 || (flags & ( UTF8_DISALLOW_SUPER
1418 |UTF8_DISALLOW_ABOVE_31_BIT)))
1419 {
1420 disallowed = TRUE;
1421 }
1422
1423
1424 /* Likewise, warn if any say to, plus if deprecation warnings
1425 * are on, because this code point is above IV_MAX */
1426 if ( ckWARN_d(WARN_DEPRECATED)
1427 || ! (flags & UTF8_ALLOW_OVERFLOW)
1428 || (flags & (UTF8_WARN_SUPER|UTF8_WARN_ABOVE_31_BIT)))
1429 {
2b5e7bc2 1430
ddb65933
KW
1431 /* The warnings code explicitly says it doesn't handle the
1432 * case of packWARN2 and two categories which have
1433 * parent-child relationship. Even if it works now to
1434 * raise the warning if either is enabled, it wouldn't
1435 * necessarily do so in the future. We output (only) the
1436 * most dire warning*/
1437 if (! (flags & UTF8_CHECK_ONLY)) {
1438 if (ckWARN_d(WARN_UTF8)) {
1439 pack_warn = packWARN(WARN_UTF8);
1440 }
1441 else if (ckWARN_d(WARN_NON_UNICODE)) {
1442 pack_warn = packWARN(WARN_NON_UNICODE);
1443 }
1444 if (pack_warn) {
1445 message = Perl_form(aTHX_ "%s: %s (overflows)",
1446 malformed_text,
05b9033b 1447 _byte_dump_string(s0, curlen, 0));
ddb65933 1448 }
2b5e7bc2
KW
1449 }
1450 }
1451 }
1452 else if (possible_problems & UTF8_GOT_EMPTY) {
1453 possible_problems &= ~UTF8_GOT_EMPTY;
f9380377 1454 *errors |= UTF8_GOT_EMPTY;
2b5e7bc2
KW
1455
1456 if (! (flags & UTF8_ALLOW_EMPTY)) {
d1f8d421
KW
1457
1458 /* This so-called malformation is now treated as a bug in
1459 * the caller. If you have nothing to decode, skip calling
1460 * this function */
1461 assert(0);
1462
2b5e7bc2
KW
1463 disallowed = TRUE;
1464 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1465 pack_warn = packWARN(WARN_UTF8);
1466 message = Perl_form(aTHX_ "%s (empty string)",
1467 malformed_text);
1468 }
1469 }
1470 }
1471 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1472 possible_problems &= ~UTF8_GOT_CONTINUATION;
f9380377 1473 *errors |= UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1474
1475 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1476 disallowed = TRUE;
1477 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1478 pack_warn = packWARN(WARN_UTF8);
1479 message = Perl_form(aTHX_
1480 "%s: %s (unexpected continuation byte 0x%02x,"
1481 " with no preceding start byte)",
1482 malformed_text,
7e2f38b2 1483 _byte_dump_string(s0, 1, 0), *s0);
2b5e7bc2
KW
1484 }
1485 }
1486 }
2b5e7bc2
KW
1487 else if (possible_problems & UTF8_GOT_SHORT) {
1488 possible_problems &= ~UTF8_GOT_SHORT;
f9380377 1489 *errors |= UTF8_GOT_SHORT;
2b5e7bc2
KW
1490
1491 if (! (flags & UTF8_ALLOW_SHORT)) {
1492 disallowed = TRUE;
1493 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1494 pack_warn = packWARN(WARN_UTF8);
1495 message = Perl_form(aTHX_
9a6c9c81 1496 "%s: %s (too short; %d byte%s available, need %d)",
2b5e7bc2 1497 malformed_text,
7e2f38b2 1498 _byte_dump_string(s0, send - s0, 0),
e308b348
KW
1499 (int)avail_len,
1500 avail_len == 1 ? "" : "s",
2b5e7bc2
KW
1501 (int)expectlen);
1502 }
1503 }
ba210ebe 1504
2b5e7bc2 1505 }
e308b348
KW
1506 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1507 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1508 *errors |= UTF8_GOT_NON_CONTINUATION;
1509
1510 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1511 disallowed = TRUE;
1512 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
99a765e9
KW
1513
1514 /* If we don't know for sure that the input length is
1515 * valid, avoid as much as possible reading past the
1516 * end of the buffer */
1517 int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
1518 ? s - s0
1519 : send - s0;
e308b348
KW
1520 pack_warn = packWARN(WARN_UTF8);
1521 message = Perl_form(aTHX_ "%s",
1522 unexpected_non_continuation_text(s0,
99a765e9 1523 printlen,
e308b348
KW
1524 s - s0,
1525 (int) expectlen));
1526 }
1527 }
1528 }
2b5e7bc2
KW
1529 else if (possible_problems & UTF8_GOT_LONG) {
1530 possible_problems &= ~UTF8_GOT_LONG;
f9380377 1531 *errors |= UTF8_GOT_LONG;
2b5e7bc2 1532
94953955
KW
1533 if (flags & UTF8_ALLOW_LONG) {
1534
1535 /* We don't allow the actual overlong value, unless the
1536 * special extra bit is also set */
1537 if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
1538 & ~UTF8_ALLOW_LONG)))
1539 {
1540 uv = UNICODE_REPLACEMENT;
1541 }
1542 }
1543 else {
2b5e7bc2
KW
1544 disallowed = TRUE;
1545
1546 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1547 pack_warn = packWARN(WARN_UTF8);
1548
1549 /* These error types cause 'uv' to be something that
1550 * isn't what was intended, so can't use it in the
1551 * message. The other error types either can't
1552 * generate an overlong, or else the 'uv' is valid */
1553 if (orig_problems &
1554 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1555 {
1556 message = Perl_form(aTHX_
1557 "%s: %s (any UTF-8 sequence that starts"
1558 " with \"%s\" is overlong which can and"
1559 " should be represented with a"
1560 " different, shorter sequence)",
1561 malformed_text,
7e2f38b2
KW
1562 _byte_dump_string(s0, send - s0, 0),
1563 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
1564 }
1565 else {
1566 U8 tmpbuf[UTF8_MAXBYTES+1];
1567 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
1568 uv, 0);
1569 message = Perl_form(aTHX_
1570 "%s: %s (overlong; instead use %s to represent"
147e3846 1571 " U+%0*" UVXf ")",
2b5e7bc2 1572 malformed_text,
05b9033b 1573 _byte_dump_string(s0, curlen, 0),
7e2f38b2 1574 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
2b5e7bc2
KW
1575 ((uv < 256) ? 2 : 4), /* Field width of 2 for
1576 small code points */
1577 uv);
1578 }
1579 }
1580 }
1581 }
1582 else if (possible_problems & UTF8_GOT_SURROGATE) {
1583 possible_problems &= ~UTF8_GOT_SURROGATE;
1584
f9380377
KW
1585 if (flags & UTF8_WARN_SURROGATE) {
1586 *errors |= UTF8_GOT_SURROGATE;
1587
1588 if ( ! (flags & UTF8_CHECK_ONLY)
1589 && ckWARN_d(WARN_SURROGATE))
1590 {
2b5e7bc2
KW
1591 pack_warn = packWARN(WARN_SURROGATE);
1592
1593 /* These are the only errors that can occur with a
1594 * surrogate when the 'uv' isn't valid */
1595 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1596 message = Perl_form(aTHX_
1597 "UTF-16 surrogate (any UTF-8 sequence that"
1598 " starts with \"%s\" is for a surrogate)",
7e2f38b2 1599 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
1600 }
1601 else {
c94c2f39 1602 message = Perl_form(aTHX_ surrogate_cp_format, uv);
2b5e7bc2 1603 }
f9380377 1604 }
2b5e7bc2 1605 }
ba210ebe 1606
2b5e7bc2
KW
1607 if (flags & UTF8_DISALLOW_SURROGATE) {
1608 disallowed = TRUE;
f9380377 1609 *errors |= UTF8_GOT_SURROGATE;
2b5e7bc2
KW
1610 }
1611 }
1612 else if (possible_problems & UTF8_GOT_SUPER) {
1613 possible_problems &= ~UTF8_GOT_SUPER;
949cf498 1614
f9380377
KW
1615 if (flags & UTF8_WARN_SUPER) {
1616 *errors |= UTF8_GOT_SUPER;
1617
1618 if ( ! (flags & UTF8_CHECK_ONLY)
1619 && ckWARN_d(WARN_NON_UNICODE))
1620 {
2b5e7bc2
KW
1621 pack_warn = packWARN(WARN_NON_UNICODE);
1622
1623 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1624 message = Perl_form(aTHX_
1625 "Any UTF-8 sequence that starts with"
1626 " \"%s\" is for a non-Unicode code point,"
1627 " may not be portable",
7e2f38b2 1628 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
1629 }
1630 else {
c94c2f39 1631 message = Perl_form(aTHX_ super_cp_format, uv);
2b5e7bc2 1632 }
f9380377 1633 }
2b5e7bc2 1634 }
ba210ebe 1635
2b5e7bc2
KW
1636 /* The maximum code point ever specified by a standard was
1637 * 2**31 - 1. Anything larger than that is a Perl extension
1638 * that very well may not be understood by other applications
1639 * (including earlier perl versions on EBCDIC platforms). We
1640 * test for these after the regular SUPER ones, and before
1641 * possibly bailing out, so that the slightly more dire warning
1642 * will override the regular one. */
1643 if ( (flags & (UTF8_WARN_ABOVE_31_BIT
1644 |UTF8_WARN_SUPER
1645 |UTF8_DISALLOW_ABOVE_31_BIT))
1646 && ( ( UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
1647 && UNLIKELY(is_utf8_cp_above_31_bits(
1648 adjusted_s0,
1649 adjusted_send)))
1650 || ( LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
1651 && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv)))))
1652 {
1653 if ( ! (flags & UTF8_CHECK_ONLY)
1654 && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
1655 && ckWARN_d(WARN_UTF8))
1656 {
1657 pack_warn = packWARN(WARN_UTF8);
1658
1659 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1660 message = Perl_form(aTHX_
1661 "Any UTF-8 sequence that starts with"
1662 " \"%s\" is for a non-Unicode code"
1663 " point, and is not portable",
7e2f38b2 1664 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
1665 }
1666 else {
1667 message = Perl_form(aTHX_
c94c2f39 1668 above_31_bit_cp_format, uv);
2b5e7bc2
KW
1669 }
1670 }
1671
ddb65933
KW
1672 if (flags & ( UTF8_WARN_ABOVE_31_BIT
1673 |UTF8_DISALLOW_ABOVE_31_BIT))
1674 {
f9380377
KW
1675 *errors |= UTF8_GOT_ABOVE_31_BIT;
1676
1677 if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
1678 disallowed = TRUE;
1679 }
2b5e7bc2
KW
1680 }
1681 }
eb83ed87 1682
2b5e7bc2 1683 if (flags & UTF8_DISALLOW_SUPER) {
f9380377 1684 *errors |= UTF8_GOT_SUPER;
2b5e7bc2
KW
1685 disallowed = TRUE;
1686 }
eb83ed87 1687
2b5e7bc2
KW
1688 /* The deprecated warning overrides any non-deprecated one. If
1689 * there are other problems, a deprecation message is not
1690 * really helpful, so don't bother to raise it in that case.
1691 * This also keeps the code from having to handle the case
1692 * where 'uv' is not valid. */
1693 if ( ! (orig_problems
1694 & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
76513bdc
KW
1695 && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1696 && ckWARN_d(WARN_DEPRECATED))
1697 {
1698 message = Perl_form(aTHX_ cp_above_legal_max,
1699 uv, MAX_NON_DEPRECATED_CP);
1700 pack_warn = packWARN(WARN_DEPRECATED);
2b5e7bc2
KW
1701 }
1702 }
1703 else if (possible_problems & UTF8_GOT_NONCHAR) {
1704 possible_problems &= ~UTF8_GOT_NONCHAR;
ba210ebe 1705
f9380377
KW
1706 if (flags & UTF8_WARN_NONCHAR) {
1707 *errors |= UTF8_GOT_NONCHAR;
1708
1709 if ( ! (flags & UTF8_CHECK_ONLY)
1710 && ckWARN_d(WARN_NONCHAR))
1711 {
2b5e7bc2
KW
1712 /* The code above should have guaranteed that we don't
1713 * get here with errors other than overlong */
1714 assert (! (orig_problems
1715 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
1716
1717 pack_warn = packWARN(WARN_NONCHAR);
c94c2f39 1718 message = Perl_form(aTHX_ nonchar_cp_format, uv);
f9380377 1719 }
2b5e7bc2 1720 }
5b311467 1721
2b5e7bc2
KW
1722 if (flags & UTF8_DISALLOW_NONCHAR) {
1723 disallowed = TRUE;
f9380377 1724 *errors |= UTF8_GOT_NONCHAR;
2b5e7bc2
KW
1725 }
1726 } /* End of looking through the possible flags */
1727
1728 /* Display the message (if any) for the problem being handled in
1729 * this iteration of the loop */
1730 if (message) {
1731 if (PL_op)
1732 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
1733 OP_DESC(PL_op));
1734 else
1735 Perl_warner(aTHX_ pack_warn, "%s", message);
1736 }
ddb65933 1737 } /* End of 'while (possible_problems)' */
a0dbb045 1738
2b5e7bc2
KW
1739 /* Since there was a possible problem, the returned length may need to
1740 * be changed from the one stored at the beginning of this function.
1741 * Instead of trying to figure out if that's needed, just do it. */
1742 if (retlen) {
1743 *retlen = curlen;
1744 }
a0dbb045 1745
2b5e7bc2
KW
1746 if (disallowed) {
1747 if (flags & UTF8_CHECK_ONLY && retlen) {
1748 *retlen = ((STRLEN) -1);
1749 }
1750 return 0;
1751 }
eb83ed87 1752 }
ba210ebe 1753
2b5e7bc2 1754 return UNI_TO_NATIVE(uv);
a0ed51b3
LW
1755}
1756
8e84507e 1757/*
ec5f19d0
KW
1758=for apidoc utf8_to_uvchr_buf
1759
1760Returns the native code point of the first character in the string C<s> which
1761is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 1762C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 1763
524080c4
KW
1764If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1765enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 1766C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 1767(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 1768C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 1769the next possible position in C<s> that could begin a non-malformed character.
de69f3af 1770See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 1771returned.
ec5f19d0 1772
760c7c2f
KW
1773Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1774unless those are turned off.
1775
ec5f19d0 1776=cut
52be2536
KW
1777
1778Also implemented as a macro in utf8.h
1779
ec5f19d0
KW
1780*/
1781
1782
1783UV
1784Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1785{
7f974d7e
KW
1786 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
1787
ec5f19d0
KW
1788 assert(s < send);
1789
1790 return utf8n_to_uvchr(s, send - s, retlen,
ddb65933 1791 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
ec5f19d0
KW
1792}
1793
52be2536
KW
1794/* This is marked as deprecated
1795 *
ec5f19d0
KW
1796=for apidoc utf8_to_uvuni_buf
1797
de69f3af
KW
1798Only in very rare circumstances should code need to be dealing in Unicode
1799(as opposed to native) code points. In those few cases, use
1800C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
4f83cdcd
KW
1801
1802Returns the Unicode (not-native) code point of the first character in the
1803string C<s> which
ec5f19d0
KW
1804is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1805C<retlen> will be set to the length, in bytes, of that character.
1806
524080c4
KW
1807If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1808enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1809NULL) to -1. If those warnings are off, the computed value if well-defined (or
1810the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1811is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1812next possible position in C<s> that could begin a non-malformed character.
de69f3af 1813See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0 1814
760c7c2f
KW
1815Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1816unless those are turned off.
1817
ec5f19d0
KW
1818=cut
1819*/
1820
1821UV
1822Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1823{
1824 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1825
1826 assert(send > s);
1827
5962d97e
KW
1828 /* Call the low level routine, asking for checks */
1829 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
ec5f19d0
KW
1830}
1831
b76347f2 1832/*
87cea99e 1833=for apidoc utf8_length
b76347f2
JH
1834
1835Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
1836Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
1837up past C<e>, croaks.
b76347f2
JH
1838
1839=cut
1840*/
1841
1842STRLEN
35a4481c 1843Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
1844{
1845 STRLEN len = 0;
1846
7918f24d
NC
1847 PERL_ARGS_ASSERT_UTF8_LENGTH;
1848
8850bf83
JH
1849 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1850 * the bitops (especially ~) can create illegal UTF-8.
1851 * In other words: in Perl UTF-8 is not just for Unicode. */
1852
a3b680e6
AL
1853 if (e < s)
1854 goto warn_and_return;
b76347f2 1855 while (s < e) {
4cbf4130 1856 s += UTF8SKIP(s);
8e91ec7f
AV
1857 len++;
1858 }
1859
1860 if (e != s) {
1861 len--;
1862 warn_and_return:
9b387841
NC
1863 if (PL_op)
1864 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1865 "%s in %s", unees, OP_DESC(PL_op));
1866 else
61a12c31 1867 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
1868 }
1869
1870 return len;
1871}
1872
b06226ff 1873/*
fed3ba5d
NC
1874=for apidoc bytes_cmp_utf8
1875
a1433954 1876Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
1877sequence of characters (stored as UTF-8)
1878in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
1879equal, -1 or -2 if the first string is less than the second string, +1 or +2
1880if the first string is greater than the second string.
1881
1882-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
1883longer string. -2 or +2 is returned if
1884there was a difference between characters
fed3ba5d
NC
1885within the strings.
1886
1887=cut
1888*/
1889
1890int
1891Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1892{
1893 const U8 *const bend = b + blen;
1894 const U8 *const uend = u + ulen;
1895
1896 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
1897
1898 while (b < bend && u < uend) {
1899 U8 c = *u++;
1900 if (!UTF8_IS_INVARIANT(c)) {
1901 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1902 if (u < uend) {
1903 U8 c1 = *u++;
1904 if (UTF8_IS_CONTINUATION(c1)) {
a62b247b 1905 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
fed3ba5d 1906 } else {
2b5e7bc2 1907 /* diag_listed_as: Malformed UTF-8 character%s */
fed3ba5d 1908 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
806547a7 1909 "%s %s%s",
1d5030e1 1910 unexpected_non_continuation_text(u - 2, 2, 1, 2),
806547a7
KW
1911 PL_op ? " in " : "",
1912 PL_op ? OP_DESC(PL_op) : "");
fed3ba5d
NC
1913 return -2;
1914 }
1915 } else {
1916 if (PL_op)
1917 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1918 "%s in %s", unees, OP_DESC(PL_op));
1919 else
61a12c31 1920 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
1921 return -2; /* Really want to return undef :-) */
1922 }
1923 } else {
1924 return -2;
1925 }
1926 }
1927 if (*b != c) {
1928 return *b < c ? -2 : +2;
1929 }
1930 ++b;
1931 }
1932
1933 if (b == bend && u == uend)
1934 return 0;
1935
1936 return b < bend ? +1 : -1;
1937}
1938
1939/*
87cea99e 1940=for apidoc utf8_to_bytes
6940069f 1941
3bc0c78c 1942Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
a1433954 1943Unlike L</bytes_to_utf8>, this over-writes the original string, and
09af0336 1944updates C<*lenp> to contain the new length.
3bc0c78c
KW
1945Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
1946
1947Upon successful return, the number of variants in the string can be computed by
23b37b12
KW
1948having saved the value of C<*lenp> before the call, and subtracting the
1949after-call value of C<*lenp> from it.
6940069f 1950
a1433954 1951If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 1952
6940069f
GS
1953=cut
1954*/
1955
1956U8 *
09af0336 1957Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
6940069f 1958{
9fe0d3c2 1959 U8 * first_variant;
246fae53 1960
7918f24d 1961 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 1962 PERL_UNUSED_CONTEXT;
7918f24d 1963
9fe0d3c2 1964 /* This is a no-op if no variants at all in the input */
09af0336 1965 if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
9fe0d3c2
KW
1966 return s;
1967 }
1968
1969 {
3c5aa262 1970 U8 * const save = s;
09af0336 1971 U8 * const send = s + *lenp;
3c5aa262
KW
1972 U8 * d;
1973
1974 /* Nothing before the first variant needs to be changed, so start the real
1975 * work there */
1976 s = first_variant;
1977 while (s < send) {
1978 if (! UTF8_IS_INVARIANT(*s)) {
1979 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
09af0336 1980 *lenp = ((STRLEN) -1);
3c5aa262
KW
1981 return 0;
1982 }
1983 s++;
d59937ca
KW
1984 }
1985 s++;
dcad2880 1986 }
dcad2880 1987
3c5aa262
KW
1988 /* Is downgradable, so do it */
1989 d = s = first_variant;
1990 while (s < send) {
1991 U8 c = *s++;
1992 if (! UVCHR_IS_INVARIANT(c)) {
1993 /* Then it is two-byte encoded */
1994 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
1995 s++;
1996 }
1997 *d++ = c;
1998 }
1999 *d = '\0';
09af0336 2000 *lenp = d - save;
3c5aa262
KW
2001
2002 return save;
9fe0d3c2 2003 }
6940069f
GS
2004}
2005
2006/*
87cea99e 2007=for apidoc bytes_from_utf8
f9a63242 2008
09af0336 2009Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
41ae6089 2010byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
4f3d592d
KW
2011actually encoded in UTF-8.
2012
2013Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2014the input string.
2015
41ae6089
KW
2016Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2017not expressible in native byte encoding. In these cases, C<*is_utf8p> and
09af0336 2018C<*lenp> are unchanged, and the return value is the original C<s>.
4f3d592d 2019
41ae6089 2020Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
4f3d592d 2021newly created string containing a downgraded copy of C<s>, and whose length is
23b37b12 2022returned in C<*lenp>, updated. The new string is C<NUL>-terminated.
f9a63242 2023
3bc0c78c 2024Upon successful return, the number of variants in the string can be computed by
23b37b12
KW
2025having saved the value of C<*lenp> before the call, and subtracting the
2026after-call value of C<*lenp> from it.
3bc0c78c 2027
37607a96 2028=cut
976c1b08
KW
2029
2030There is a macro that avoids this function call, but this is retained for
2031anyone who calls it with the Perl_ prefix */
f9a63242
JH
2032
2033U8 *
41ae6089 2034Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
f9a63242 2035{
7918f24d 2036 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 2037 PERL_UNUSED_CONTEXT;
f9a63242 2038
976c1b08
KW
2039 return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2040}
2041
2042/*
2043No = here because currently externally undocumented
2044for apidoc bytes_from_utf8_loc
2045
2046Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where
2047to store the location of the first character in C<"s"> that cannot be
2048converted to non-UTF8.
2049
2050If that parameter is C<NULL>, this function behaves identically to
2051C<bytes_from_utf8>.
2052
2053Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2054C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2055
2056Otherwise, the function returns a newly created C<NUL>-terminated string
2057containing the non-UTF8 equivalent of the convertible first portion of
2058C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>.
2059If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2060and C<*first_non_downgradable> is set to C<NULL>.
2061
2062Otherwise, C<*first_non_downgradable> set to point to the first byte of the
2063first character in the original string that wasn't converted. C<*is_utf8p> is
2064unchanged. Note that the new string may have length 0.
2065
2066Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2067C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2068converts as many characters in it as possible stopping at the first one it
385b74be 2069finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is
976c1b08
KW
2070set to point to that. The function returns the portion that could be converted
2071in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2072not including the terminating C<NUL>. If the very first character in the
2073original could not be converted, C<*lenp> will be 0, and the new string will
2074contain just a single C<NUL>. If the entire input string was converted,
2075C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2076
2077Upon successful return, the number of variants in the converted portion of the
2078string can be computed by having saved the value of C<*lenp> before the call,
2079and subtracting the after-call value of C<*lenp> from it.
2080
2081=cut
2082
2083
2084*/
2085
2086U8 *
2087Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2088{
2089 U8 *d;
2090 const U8 *original = s;
2091 U8 *converted_start;
2092 const U8 *send = s + *lenp;
f9a63242 2093
976c1b08 2094 PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
170a1c22 2095
976c1b08
KW
2096 if (! *is_utf8p) {
2097 if (first_unconverted) {
2098 *first_unconverted = NULL;
2099 }
2100
2101 return (U8 *) original;
2102 }
2103
2104 Newx(d, (*lenp) + 1, U8);
2105
2106 converted_start = d;
7299a045
KW
2107 while (s < send) {
2108 U8 c = *s++;
2109 if (! UTF8_IS_INVARIANT(c)) {
976c1b08
KW
2110
2111 /* Then it is multi-byte encoded. If the code point is above 0xFF,
2112 * have to stop now */
2113 if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2114 if (first_unconverted) {
2115 *first_unconverted = s - 1;
2116 goto finish_and_return;
2117 }
2118 else {
2119 Safefree(converted_start);
2120 return (U8 *) original;
2121 }
2122 }
2123
7299a045
KW
2124 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2125 s++;
38af28cf 2126 }
7299a045
KW
2127 *d++ = c;
2128 }
170a1c22 2129
976c1b08
KW
2130 /* Here, converted the whole of the input */
2131 *is_utf8p = FALSE;
2132 if (first_unconverted) {
2133 *first_unconverted = NULL;
170a1c22 2134 }
976c1b08
KW
2135
2136 finish_and_return:
2137 *d = '\0';
2138 *lenp = d - converted_start;
2139
2140 /* Trim unused space */
2141 Renew(converted_start, *lenp + 1, U8);
2142
2143 return converted_start;
f9a63242
JH
2144}
2145
2146/*
87cea99e 2147=for apidoc bytes_to_utf8
6940069f 2148
09af0336 2149Converts a string C<s> of length C<*lenp> bytes from the native encoding into
ff97e5cf 2150UTF-8.
09af0336 2151Returns a pointer to the newly-created string, and sets C<*lenp> to
ff97e5cf 2152reflect the new length in bytes.
6940069f 2153
3bc0c78c 2154Upon successful return, the number of variants in the string can be computed by
23b37b12 2155having saved the value of C<*lenp> before the call, and subtracting it from the
3bc0c78c
KW
2156after-call value of C<*lenp>.
2157
75200dff 2158A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
2159
2160If you want to convert to UTF-8 from encodings other than
2161the native (Latin1 or EBCDIC),
a1433954 2162see L</sv_recode_to_utf8>().
c9ada85f 2163
497711e7 2164=cut
6940069f
GS
2165*/
2166
2167U8*
09af0336 2168Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
6940069f 2169{
09af0336 2170 const U8 * const send = s + (*lenp);
6940069f
GS
2171 U8 *d;
2172 U8 *dst;
7918f24d
NC
2173
2174 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 2175 PERL_UNUSED_CONTEXT;
6940069f 2176
09af0336 2177 Newx(d, (*lenp) * 2 + 1, U8);
6940069f
GS
2178 dst = d;
2179
2180 while (s < send) {
55d09dc8
KW
2181 append_utf8_from_native_byte(*s, &d);
2182 s++;
6940069f
GS
2183 }
2184 *d = '\0';
09af0336 2185 *lenp = d-dst;
6940069f
GS
2186 return dst;
2187}
2188
a0ed51b3 2189/*
dea0fc0b 2190 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
2191 *
2192 * Destination must be pre-extended to 3/2 source. Do not use in-place.
2193 * We optimize for native, for obvious reasons. */
2194
2195U8*
dea0fc0b 2196Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 2197{
dea0fc0b
JH
2198 U8* pend;
2199 U8* dstart = d;
2200
7918f24d
NC
2201 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2202
dea0fc0b 2203 if (bytelen & 1)
147e3846 2204 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf, (UV)bytelen);
dea0fc0b
JH
2205
2206 pend = p + bytelen;
2207
a0ed51b3 2208 while (p < pend) {
dea0fc0b
JH
2209 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
2210 p += 2;
2d1545e5 2211 if (OFFUNI_IS_INVARIANT(uv)) {
56d37426 2212 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
2213 continue;
2214 }
56d37426
KW
2215 if (uv <= MAX_UTF8_TWO_BYTE) {
2216 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
2217 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
2218 continue;
2219 }
46956fad
KW
2220#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2221#define LAST_HIGH_SURROGATE 0xDBFF
2222#define FIRST_LOW_SURROGATE 0xDC00
2223#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
e23c50db
KW
2224
2225 /* This assumes that most uses will be in the first Unicode plane, not
2226 * needing surrogates */
2227 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
2228 && uv <= UNICODE_SURROGATE_LAST))
2229 {
2230 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2231 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2232 }
2233 else {
01ea242b 2234 UV low = (p[0] << 8) + p[1];
e23c50db
KW
2235 if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
2236 || UNLIKELY(low > LAST_LOW_SURROGATE))
2237 {
01ea242b 2238 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db
KW
2239 }
2240 p += 2;
46956fad
KW
2241 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
2242 + (low - FIRST_LOW_SURROGATE) + 0x10000;
01ea242b 2243 }
a0ed51b3 2244 }
56d37426
KW
2245#ifdef EBCDIC
2246 d = uvoffuni_to_utf8_flags(d, uv, 0);
2247#else
a0ed51b3 2248 if (uv < 0x10000) {
eb160463
GS
2249 *d++ = (U8)(( uv >> 12) | 0xe0);
2250 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2251 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2252 continue;
2253 }
2254 else {
eb160463
GS
2255 *d++ = (U8)(( uv >> 18) | 0xf0);
2256 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
2257 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2258 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2259 continue;
2260 }
56d37426 2261#endif
a0ed51b3 2262 }
dea0fc0b 2263 *newlen = d - dstart;
a0ed51b3
LW
2264 return d;
2265}
2266
2267/* Note: this one is slightly destructive of the source. */
2268
2269U8*
dea0fc0b 2270Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
2271{
2272 U8* s = (U8*)p;
d4c19fe8 2273 U8* const send = s + bytelen;
7918f24d
NC
2274
2275 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2276
e0ea5e2d 2277 if (bytelen & 1)
147e3846 2278 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
e0ea5e2d
NC
2279 (UV)bytelen);
2280
a0ed51b3 2281 while (s < send) {
d4c19fe8 2282 const U8 tmp = s[0];
a0ed51b3
LW
2283 s[0] = s[1];
2284 s[1] = tmp;
2285 s += 2;
2286 }
dea0fc0b 2287 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
2288}
2289
922e8cb4
KW
2290bool
2291Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2292{
2293 U8 tmpbuf[UTF8_MAXBYTES+1];
2294 uvchr_to_utf8(tmpbuf, c);
da8c1a98 2295 return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf));
922e8cb4
KW
2296}
2297
f9ae8fb6
JD
2298/* Internal function so we can deprecate the external one, and call
2299 this one from other deprecated functions in this file */
2300
f2645549
KW
2301bool
2302Perl__is_utf8_idstart(pTHX_ const U8 *p)
61b19385 2303{
f2645549 2304 PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
61b19385
KW
2305
2306 if (*p == '_')
2307 return TRUE;
f25ce844 2308 return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
61b19385
KW
2309}
2310
5092f92a 2311bool
eba68aa0
KW
2312Perl__is_uni_perl_idcont(pTHX_ UV c)
2313{
2314 U8 tmpbuf[UTF8_MAXBYTES+1];
2315 uvchr_to_utf8(tmpbuf, c);
da8c1a98 2316 return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
eba68aa0
KW
2317}
2318
2319bool
f91dcd13
KW
2320Perl__is_uni_perl_idstart(pTHX_ UV c)
2321{
2322 U8 tmpbuf[UTF8_MAXBYTES+1];
2323 uvchr_to_utf8(tmpbuf, c);
da8c1a98 2324 return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
f91dcd13
KW
2325}
2326
3a4c58c9
KW
2327UV
2328Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
2329{
2330 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2331 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
2332 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2333 * either "SS" or "Ss". Which one to use is passed into the routine in
2334 * 'S_or_s' to avoid a test */
2335
2336 UV converted = toUPPER_LATIN1_MOD(c);
2337
2338 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2339
2340 assert(S_or_s == 'S' || S_or_s == 's');
2341
6f2d5cbc 2342 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 2343 characters in this range */
3a4c58c9
KW
2344 *p = (U8) converted;
2345 *lenp = 1;
2346 return converted;
2347 }
2348
2349 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2350 * which it maps to one of them, so as to only have to have one check for
2351 * it in the main case */
2352 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2353 switch (c) {
2354 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2355 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2356 break;
2357 case MICRO_SIGN:
2358 converted = GREEK_CAPITAL_LETTER_MU;
2359 break;
79e064b9
KW
2360#if UNICODE_MAJOR_VERSION > 2 \
2361 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2362 && UNICODE_DOT_DOT_VERSION >= 8)
3a4c58c9
KW
2363 case LATIN_SMALL_LETTER_SHARP_S:
2364 *(p)++ = 'S';
2365 *p = S_or_s;
2366 *lenp = 2;
2367 return 'S';
79e064b9 2368#endif
3a4c58c9
KW
2369 default:
2370 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
e5964223 2371 NOT_REACHED; /* NOTREACHED */
3a4c58c9
KW
2372 }
2373 }
2374
2375 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2376 *p = UTF8_TWO_BYTE_LO(converted);
2377 *lenp = 2;
2378
2379 return converted;
2380}
2381
50bda2c3
KW
2382/* Call the function to convert a UTF-8 encoded character to the specified case.
2383 * Note that there may be more than one character in the result.
2384 * INP is a pointer to the first byte of the input character
2385 * OUTP will be set to the first byte of the string of changed characters. It
2386 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2387 * LENP will be set to the length in bytes of the string of changed characters
2388 *
2389 * The functions return the ordinal of the first character in the string of OUTP */
b9992569
KW
2390#define CALL_UPPER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
2391#define CALL_TITLE_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
2392#define CALL_LOWER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
50bda2c3 2393
b9992569
KW
2394/* This additionally has the input parameter 'specials', which if non-zero will
2395 * cause this to use the specials hash for folding (meaning get full case
50bda2c3 2396 * folding); otherwise, when zero, this implies a simple case fold */
b9992569 2397#define CALL_FOLD_CASE(uv, s, d, lenp, specials) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
c3fd2246 2398
84afefe6
JH
2399UV
2400Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2401{
a1433954
KW
2402 /* Convert the Unicode character whose ordinal is <c> to its uppercase
2403 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2404 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
2405 * the changed version may be longer than the original character.
2406 *
2407 * The ordinal of the first character of the changed version is returned
2408 * (but note, as explained above, that there may be more.) */
2409
7918f24d
NC
2410 PERL_ARGS_ASSERT_TO_UNI_UPPER;
2411
3a4c58c9
KW
2412 if (c < 256) {
2413 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2414 }
2415
0ebc6274 2416 uvchr_to_utf8(p, c);
b9992569 2417 return CALL_UPPER_CASE(c, p, p, lenp);
a0ed51b3
LW
2418}
2419
84afefe6
JH
2420UV
2421Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2422{
7918f24d
NC
2423 PERL_ARGS_ASSERT_TO_UNI_TITLE;
2424
3a4c58c9
KW
2425 if (c < 256) {
2426 return _to_upper_title_latin1((U8) c, p, lenp, 's');
2427 }
2428
0ebc6274 2429 uvchr_to_utf8(p, c);
b9992569 2430 return CALL_TITLE_CASE(c, p, p, lenp);
a0ed51b3
LW
2431}
2432
afc16117 2433STATIC U8
eaf412bf 2434S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
afc16117
KW
2435{
2436 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2437 * those, converting the result to UTF-8. Since the result is always just
a1433954 2438 * one character, we allow <p> to be NULL */
afc16117
KW
2439
2440 U8 converted = toLOWER_LATIN1(c);
2441
eaf412bf
KW
2442 PERL_UNUSED_ARG(dummy);
2443
afc16117 2444 if (p != NULL) {
6f2d5cbc 2445 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
2446 *p = converted;
2447 *lenp = 1;
2448 }
2449 else {
430c9760
KW
2450 /* Result is known to always be < 256, so can use the EIGHT_BIT
2451 * macros */
2452 *p = UTF8_EIGHT_BIT_HI(converted);
2453 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
afc16117
KW
2454 *lenp = 2;
2455 }
2456 }
2457 return converted;
2458}
2459
84afefe6
JH
2460UV
2461Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2462{
7918f24d
NC
2463 PERL_ARGS_ASSERT_TO_UNI_LOWER;
2464
afc16117 2465 if (c < 256) {
eaf412bf 2466 return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
bca00c02
KW
2467 }
2468
afc16117 2469 uvchr_to_utf8(p, c);
b9992569 2470 return CALL_LOWER_CASE(c, p, p, lenp);
a0ed51b3
LW
2471}
2472
84afefe6 2473UV
51910141 2474Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 2475{
51910141 2476 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 2477 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 2478 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
2479 *
2480 * Not to be used for locale folds
51910141 2481 */
f673fad4 2482
a1dde8de
KW
2483 UV converted;
2484
2485 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
81611534 2486 PERL_UNUSED_CONTEXT;
a1dde8de 2487
1ca267a5
KW
2488 assert (! (flags & FOLD_FLAGS_LOCALE));
2489
659a7c2d 2490 if (UNLIKELY(c == MICRO_SIGN)) {
a1dde8de
KW
2491 converted = GREEK_SMALL_LETTER_MU;
2492 }
9b63e895
KW
2493#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
2494 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
2495 || UNICODE_DOT_DOT_VERSION > 0)
659a7c2d
KW
2496 else if ( (flags & FOLD_FLAGS_FULL)
2497 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
2498 {
1ca267a5
KW
2499 /* If can't cross 127/128 boundary, can't return "ss"; instead return
2500 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
2501 * under those circumstances. */
2502 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
2503 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2504 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2505 p, *lenp, U8);
2506 return LATIN_SMALL_LETTER_LONG_S;
2507 }
2508 else {
4f489194
KW
2509 *(p)++ = 's';
2510 *p = 's';
2511 *lenp = 2;
2512 return 's';
1ca267a5 2513 }
a1dde8de 2514 }
9b63e895 2515#endif
a1dde8de
KW
2516 else { /* In this range the fold of all other characters is their lower
2517 case */
2518 converted = toLOWER_LATIN1(c);
2519 }
2520
6f2d5cbc 2521 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
2522 *p = (U8) converted;
2523 *lenp = 1;
2524 }
2525 else {
2526 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2527 *p = UTF8_TWO_BYTE_LO(converted);
2528 *lenp = 2;
2529 }
2530
2531 return converted;
2532}
2533
2534UV
31f05a37 2535Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 2536{
4b593389 2537
a0270393
KW
2538 /* Not currently externally documented, and subject to change
2539 * <flags> bits meanings:
2540 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
2541 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2542 * locale are to be used.
a0270393
KW
2543 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2544 */
4b593389 2545
36bb2ab6 2546 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 2547
780fcc9f
KW
2548 if (flags & FOLD_FLAGS_LOCALE) {
2549 /* Treat a UTF-8 locale as not being in locale at all */
2550 if (IN_UTF8_CTYPE_LOCALE) {
2551 flags &= ~FOLD_FLAGS_LOCALE;
2552 }
2553 else {
2554 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
e7b7ac46 2555 goto needs_full_generality;
780fcc9f 2556 }
31f05a37
KW
2557 }
2558
a1dde8de 2559 if (c < 256) {
e7b7ac46 2560 return _to_fold_latin1((U8) c, p, lenp,
31f05a37 2561 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
2562 }
2563
2f306ab9 2564 /* Here, above 255. If no special needs, just use the macro */
a0270393
KW
2565 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
2566 uvchr_to_utf8(p, c);
b9992569 2567 return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
a0270393 2568 }
567b353c 2569 else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
a0270393
KW
2570 the special flags. */
2571 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
2572
2573 needs_full_generality:
a0270393 2574 uvchr_to_utf8(utf8_c, c);
a1a5ec35 2575 return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c), p, lenp, flags);
a0270393 2576 }
84afefe6
JH
2577}
2578
26483009 2579PERL_STATIC_INLINE bool
5141f98e 2580S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
f25ce844 2581 const char *const swashname, SV* const invlist)
bde6a22d 2582{
ea317ccb
KW
2583 /* returns a boolean giving whether or not the UTF8-encoded character that
2584 * starts at <p> is in the swash indicated by <swashname>. <swash>
2585 * contains a pointer to where the swash indicated by <swashname>
2586 * is to be stored; which this routine will do, so that future calls will
f25ce844
KW
2587 * look at <*swash> and only generate a swash if it is not null. <invlist>
2588 * is NULL or an inversion list that defines the swash. If not null, it
2589 * saves time during initialization of the swash.
ea317ccb
KW
2590 *
2591 * Note that it is assumed that the buffer length of <p> is enough to
2592 * contain all the bytes that comprise the character. Thus, <*p> should
2593 * have been checked before this call for mal-formedness enough to assure
2594 * that. */
2595
7918f24d
NC
2596 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2597
492a624f 2598 /* The API should have included a length for the UTF-8 character in <p>,
28123549 2599 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
2600 * as far as there being enough bytes available in it to accommodate the
2601 * character without reading beyond the end, and pass that number on to the
2602 * validating routine */
6302f837 2603 if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
86ae6e94 2604 _force_out_malformed_utf8_message(p, p + UTF8SKIP(p),
99a765e9 2605 _UTF8_NO_CONFIDENCE_IN_CURLEN,
86ae6e94
KW
2606 1 /* Die */ );
2607 NOT_REACHED; /* NOTREACHED */
28123549 2608 }
86ae6e94 2609
87367d5f
KW
2610 if (!*swash) {
2611 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
f25ce844
KW
2612 *swash = _core_swash_init("utf8",
2613
2614 /* Only use the name if there is no inversion
2615 * list; otherwise will go out to disk */
2616 (invlist) ? "" : swashname,
2617
2618 &PL_sv_undef, 1, 0, invlist, &flags);
87367d5f 2619 }
28123549 2620
bde6a22d
NC
2621 return swash_fetch(*swash, p, TRUE) != 0;
2622}
2623
da8c1a98
KW
2624PERL_STATIC_INLINE bool
2625S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e, SV **swash,
2626 const char *const swashname, SV* const invlist)
2627{
2628 /* returns a boolean giving whether or not the UTF8-encoded character that
2629 * starts at <p>, and extending no further than <e - 1> is in the swash
2630 * indicated by <swashname>. <swash> contains a pointer to where the swash
2631 * indicated by <swashname> is to be stored; which this routine will do, so
2632 * that future calls will look at <*swash> and only generate a swash if it
2633 * is not null. <invlist> is NULL or an inversion list that defines the
2634 * swash. If not null, it saves time during initialization of the swash.
2635 */
2636
2637 PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
2638
2639 if (! isUTF8_CHAR(p, e)) {
2640 _force_out_malformed_utf8_message(p, e, 0, 1);
2641 NOT_REACHED; /* NOTREACHED */
2642 }
2643
2644 if (!*swash) {
2645 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2646 *swash = _core_swash_init("utf8",
2647
2648 /* Only use the name if there is no inversion
2649 * list; otherwise will go out to disk */
2650 (invlist) ? "" : swashname,
2651
2652 &PL_sv_undef, 1, 0, invlist, &flags);
2653 }
2654
2655 return swash_fetch(*swash, p, TRUE) != 0;
2656}
2657
34aeb2e9
KW
2658STATIC void
2659S_warn_on_first_deprecated_use(pTHX_ const char * const name,
2660 const char * const alternative,
2661 const bool use_locale,
2662 const char * const file,
2663 const unsigned line)
2664{
2665 const char * key;
2666
2667 PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
2668
2669 if (ckWARN_d(WARN_DEPRECATED)) {
2670
2671 key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
2672 if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
2673 if (! PL_seen_deprecated_macro) {
2674 PL_seen_deprecated_macro = newHV();
2675 }
2676 if (! hv_store(PL_seen_deprecated_macro, key,
2677 strlen(key), &PL_sv_undef, 0))
2678 {
2679 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
2680 }
2681
c44e9413 2682 if (instr(file, "mathoms.c")) {
607313a1
KW
2683 Perl_warner(aTHX_ WARN_DEPRECATED,
2684 "In %s, line %d, starting in Perl v5.30, %s()"
2685 " will be removed. Avoid this message by"
2686 " converting to use %s().\n",
2687 file, line, name, alternative);
2688 }
2689 else {
34aeb2e9
KW
2690 Perl_warner(aTHX_ WARN_DEPRECATED,
2691 "In %s, line %d, starting in Perl v5.30, %s() will"
2692 " require an additional parameter. Avoid this"
2693 " message by converting to use %s().\n",
2694 file, line, name, alternative);
607313a1 2695 }
34aeb2e9
KW
2696 }
2697 }
2698}
2699
bde6a22d 2700bool
34aeb2e9 2701Perl__is_utf8_FOO(pTHX_ U8 classnum,
be99e2c2 2702 const U8 * const p,
34aeb2e9
KW
2703 const char * const name,
2704 const char * const alternative,
2705 const bool use_utf8,
2706 const bool use_locale,
2707 const char * const file,
2708 const unsigned line)
922e8cb4 2709{
922e8cb4
KW
2710 PERL_ARGS_ASSERT__IS_UTF8_FOO;
2711
34aeb2e9
KW
2712 warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
2713
2714 if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
34aeb2e9
KW
2715
2716 switch (classnum) {
2717 case _CC_WORDCHAR:
2718 case _CC_DIGIT:
2719 case _CC_ALPHA:
2720 case _CC_LOWER:
2721 case _CC_UPPER:
2722 case _CC_PUNCT:
2723 case _CC_PRINT:
2724 case _CC_ALPHANUMERIC:
2725 case _CC_GRAPH:
2726 case _CC_CASED:
2727
2728 return is_utf8_common(p,
2729 &PL_utf8_swash_ptrs[classnum],
2730 swash_property_names[classnum],
2731 PL_XPosix_ptrs[classnum]);
2732
2733 case _CC_SPACE:
2734 return is_XPERLSPACE_high(p);
2735 case _CC_BLANK:
2736 return is_HORIZWS_high(p);
2737 case _CC_XDIGIT:
2738 return is_XDIGIT_high(p);
2739 case _CC_CNTRL:
2740 return 0;
2741 case _CC_ASCII:
2742 return 0;
2743 case _CC_VERTSPACE:
2744 return is_VERTWS_high(p);
2745 case _CC_IDFIRST:
2746 if (! PL_utf8_perl_idstart) {
22f0498f
KW
2747 PL_utf8_perl_idstart
2748 = _new_invlist_C_array(_Perl_IDStart_invlist);
34aeb2e9 2749 }
22f0498f
KW
2750 return is_utf8_common(p, &PL_utf8_perl_idstart,
2751 "_Perl_IDStart", NULL);
34aeb2e9
KW
2752 case _CC_IDCONT:
2753 if (! PL_utf8_perl_idcont) {
22f0498f
KW
2754 PL_utf8_perl_idcont
2755 = _new_invlist_C_array(_Perl_IDCont_invlist);
34aeb2e9 2756 }
22f0498f
KW
2757 return is_utf8_common(p, &PL_utf8_perl_idcont,
2758 "_Perl_IDCont", NULL);
34aeb2e9
KW
2759 }
2760 }
2761
2762 /* idcont is the same as wordchar below 256 */
2763 if (classnum == _CC_IDCONT) {
2764 classnum = _CC_WORDCHAR;
2765 }
2766 else if (classnum == _CC_IDFIRST) {
2767 if (*p == '_') {
2768 return TRUE;
2769 }
2770 classnum = _CC_ALPHA;
2771 }
2772
2773 if (! use_locale) {
2774 if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
2775 return _generic_isCC(*p, classnum);
2776 }
922e8cb4 2777
34aeb2e9
KW
2778 return _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )), classnum);
2779 }
2780 else {
2781 if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
2782 return isFOO_lc(classnum, *p);
2783 }
2784
2785 return isFOO_lc(classnum, EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )));
2786 }
2787
2788 NOT_REACHED; /* NOTREACHED */
922e8cb4
KW
2789}
2790
2791bool
da8c1a98
KW
2792Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
2793 const U8 * const e)
2794{
2795 PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
2796
2797 assert(classnum < _FIRST_NON_SWASH_CC);
2798
2799 return is_utf8_common_with_len(p,
2800 e,
2801 &PL_utf8_swash_ptrs[classnum],
2802 swash_property_names[classnum],
2803 PL_XPosix_ptrs[classnum]);
2804}
2805
2806bool
da8c1a98
KW
2807Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
2808{
2809 SV* invlist = NULL;
2810
2811 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
2812
2813 if (! PL_utf8_perl_idstart) {
2814 invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
2815 }
2816 return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart,
2817 "_Perl_IDStart", invlist);
2818}
2819
2820bool
f2645549 2821Perl__is_utf8_xidstart(pTHX_ const U8 *p)
c11ff943 2822{
f2645549 2823 PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
c11ff943
KW
2824
2825 if (*p == '_')
2826 return TRUE;
f25ce844 2827 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
c11ff943
KW
2828}
2829
2830bool
da8c1a98
KW
2831Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
2832{
2833 SV* invlist = NULL;
2834
2835 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
2836
2837 if (! PL_utf8_perl_idcont) {
2838 invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
2839 }
2840 return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont,
2841 "_Perl_IDCont", invlist);
2842}
2843
2844bool
f2645549 2845Perl__is_utf8_idcont(pTHX_ const U8 *p)
82686b01 2846{
f2645549 2847 PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
7918f24d 2848
f25ce844 2849 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
a0ed51b3
LW
2850}
2851
2852bool
f2645549 2853Perl__is_utf8_xidcont(pTHX_ const U8 *p)
c11ff943 2854{
f2645549 2855 PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
c11ff943 2856
f25ce844 2857 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
c11ff943
KW
2858}
2859
2860bool
7dbf68d2
KW
2861Perl__is_utf8_mark(pTHX_ const U8 *p)
2862{
7dbf68d2
KW
2863 PERL_ARGS_ASSERT__IS_UTF8_MARK;
2864
f25ce844 2865 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
7dbf68d2
KW
2866}
2867
b9992569 2868 /* change namve uv1 to 'from' */
6a4a25f4 2869STATIC UV
b9992569
KW
2870S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
2871 SV **swashp, const char *normal, const char *special)
2872{
0134edef 2873 STRLEN len = 0;
7918f24d 2874
b9992569 2875 PERL_ARGS_ASSERT__TO_UTF8_CASE;
7918f24d 2876
36eaa811
KW
2877 /* For code points that don't change case, we already know that the output
2878 * of this function is the unchanged input, so we can skip doing look-ups
2879 * for them. Unfortunately the case-changing code points are scattered
2880 * around. But there are some long consecutive ranges where there are no
2881 * case changing code points. By adding tests, we can eliminate the lookup
2882 * for all the ones in such ranges. This is currently done here only for
2883 * just a few cases where the scripts are in common use in modern commerce
2884 * (and scripts adjacent to those which can be included without additional
2885 * tests). */
2886
2887 if (uv1 >= 0x0590) {
2888 /* This keeps from needing further processing the code points most
2889 * likely to be used in the following non-cased scripts: Hebrew,
2890 * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
2891 * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
2892 * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
2893 if (uv1 < 0x10A0) {
2894 goto cases_to_self;
2895 }
2896
2897 /* The following largish code point ranges also don't have case
2898 * changes, but khw didn't think they warranted extra tests to speed
2899 * them up (which would slightly slow down everything else above them):
2900 * 1100..139F Hangul Jamo, Ethiopic
2901 * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
2902 * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
2903 * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
2904 * Combining Diacritical Marks Extended, Balinese,
2905 * Sundanese, Batak, Lepcha, Ol Chiki
2906 * 2000..206F General Punctuation
2907 */
2908
2909 if (uv1 >= 0x2D30) {
2910
2911 /* This keeps the from needing further processing the code points
2912 * most likely to be used in the following non-cased major scripts:
2913 * CJK, Katakana, Hiragana, plus some less-likely scripts.
2914 *
2915 * (0x2D30 above might have to be changed to 2F00 in the unlikely
2916 * event that Unicode eventually allocates the unused block as of
2917 * v8.0 2FE0..2FEF to code points that are cased. khw has verified
2918 * that the test suite will start having failures to alert you
2919 * should that happen) */
2920 if (uv1 < 0xA640) {
2921 goto cases_to_self;
2922 }
2923
2924 if (uv1 >= 0xAC00) {
2925 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
5af9bc97
KW
2926 if (ckWARN_d(WARN_SURROGATE)) {
2927 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2928 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
147e3846 2929 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04" UVXf, desc, uv1);
5af9bc97
KW
2930 }
2931 goto cases_to_self;
2932 }
36eaa811
KW
2933
2934 /* AC00..FAFF Catches Hangul syllables and private use, plus
2935 * some others */
2936 if (uv1 < 0xFB00) {
2937 goto cases_to_self;
2938
2939 }
2940
5af9bc97 2941 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
76513bdc
KW
2942 if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
2943 && ckWARN_d(WARN_DEPRECATED))
2944 {
2945 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2946 cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
5af9bc97
KW
2947 }
2948 if (ckWARN_d(WARN_NON_UNICODE)) {
2949 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2950 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
147e3846 2951 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04" UVXf, desc, uv1);
5af9bc97
KW
2952 }
2953 goto cases_to_self;
2954 }
3bfc1e70
KW
2955#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
2956 if (UNLIKELY(uv1
2957 > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
2958 {
2959
2960 /* As of this writing, this means we avoid swash creation
2961 * for anything beyond low Plane 1 */
2962 goto cases_to_self;
2963 }
2964#endif
36eaa811
KW
2965 }
2966 }
9ae3ac1a 2967
36eaa811
KW
2968 /* Note that non-characters are perfectly legal, so no warning should
2969 * be given. There are so few of them, that it isn't worth the extra
2970 * tests to avoid swash creation */
9ae3ac1a
KW
2971 }
2972
0134edef 2973 if (!*swashp) /* load on-demand */
5ab9d2ef 2974 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
0134edef 2975
a6f87d8c 2976 if (special) {
0134edef 2977 /* It might be "special" (sometimes, but not always,
2a37f04d 2978 * a multicharacter mapping) */
4a8240a3 2979 HV *hv = NULL;
b08cf34e
JH
2980 SV **svp;
2981
4a8240a3
KW
2982 /* If passed in the specials name, use that; otherwise use any
2983 * given in the swash */
2984 if (*special != '\0') {
2985 hv = get_hv(special, 0);
2986 }
2987 else {
2988 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
2989 if (svp) {
2990 hv = MUTABLE_HV(SvRV(*svp));
2991 }
2992 }
2993
176fe009 2994 if (hv
5f560d8a 2995 && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
176fe009
KW
2996 && (*svp))
2997 {
cfd0369c 2998 const char *s;
47654450 2999
cfd0369c 3000 s = SvPV_const(*svp, len);
47654450 3001 if (len == 1)
f4cd282c 3002 /* EIGHTBIT */
c80e42f3 3003 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2a37f04d 3004 else {
d2dcd0fb 3005 Copy(s, ustrp, len, U8);
29e98929 3006 }
983ffd37 3007 }
0134edef
JH
3008 }
3009
3010 if (!len && *swashp) {
4a4088c4 3011 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
d4c19fe8 3012
0134edef
JH
3013 if (uv2) {
3014 /* It was "normal" (a single character mapping). */
f4cd282c 3015 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2a37f04d
JH
3016 }
3017 }
1feea2c7 3018
cbe07460
KW
3019 if (len) {
3020 if (lenp) {
3021 *lenp = len;
3022 }
3023 return valid_utf8_to_uvchr(ustrp, 0);
3024 }
3025
3026 /* Here, there was no mapping defined, which means that the code point maps
3027 * to itself. Return the inputs */
e24dfe9c 3028 cases_to_self:
bfdf22ec 3029 len = UTF8SKIP(p);
ca9fab46
KW
3030 if (p != ustrp) { /* Don't copy onto itself */
3031 Copy(p, ustrp, len, U8);
3032 }
0134edef 3033
2a37f04d
JH
3034 if (lenp)
3035 *lenp = len;
3036
f4cd282c 3037 return uv1;
cbe07460 3038
a0ed51b3
LW
3039}
3040
051a06d4 3041STATIC UV
357aadde 3042S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
051a06d4 3043{
4a4088c4 3044 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
3045 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
3046 * result contains a character that crosses the 255/256 boundary, disallow
3047 * the change, and return the original code point. See L<perlfunc/lc> for
3048 * why;
051a06d4 3049 *
a1433954
KW
3050 * p points to the original string whose case was changed; assumed
3051 * by this routine to be well-formed
051a06d4
KW
3052 * result the code point of the first character in the changed-case string
3053 * ustrp points to the changed-case string (<result> represents its first char)
3054 * lenp points to the length of <ustrp> */
3055
3056 UV original; /* To store the first code point of <p> */
3057
3058 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3059
a4f12ed7 3060 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
3061
3062 /* We know immediately if the first character in the string crosses the
3063 * boundary, so can skip */
3064 if (result > 255) {
3065
3066 /* Look at every character in the result; if any cross the
3067 * boundary, the whole thing is disallowed */
3068 U8* s = ustrp + UTF8SKIP(ustrp);
3069 U8* e = ustrp + *lenp;
3070 while (s < e) {
a4f12ed7 3071 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
3072 goto bad_crossing;
3073 }
3074 s += UTF8SKIP(s);
3075 }
3076
613abc6d
KW
3077 /* Here, no characters crossed, result is ok as-is, but we warn. */
3078 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
051a06d4
KW
3079 return result;
3080 }
3081
7b52d656 3082 bad_crossing:
051a06d4
KW
3083
3084 /* Failed, have to return the original */
4b88fb76 3085 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
3086
3087 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3088 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
147e3846
KW
3089 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8 locale; "
3090 "resolved to \"\\x{%" UVXf "}\".",
357aadde 3091 OP_DESC(PL_op),
ab0b796c
KW
3092 original,
3093 original);
051a06d4
KW
3094 Copy(p, ustrp, *lenp, char);
3095 return original;
3096}
3097
607313a1
KW
3098STATIC U32
3099S_check_and_deprecate(pTHX_ const U8 *p,
3100 const U8 **e,
3101 const unsigned int type, /* See below */
3102 const bool use_locale, /* Is this a 'LC_'
3103 macro call? */
3104 const char * const file,
3105 const unsigned line)
3106{
3107 /* This is a temporary function to deprecate the unsafe calls to the case
3108 * changing macros and functions. It keeps all the special stuff in just
3109 * one place.
3110 *
3111 * It updates *e with the pointer to the end of the input string. If using
3112 * the old-style macros, *e is NULL on input, and so this function assumes
3113 * the input string is long enough to hold the entire UTF-8 sequence, and
3114 * sets *e accordingly, but it then returns a flag to pass the
3115 * utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
3116 * using the full length if possible.
3117 *
3118 * It also does the assert that *e > p when *e is not NULL. This should be
3119 * migrated to the callers when this function gets deleted.
3120 *
3121 * The 'type' parameter is used for the caller to specify which case
3122 * changing function this is called from: */
3123
3124# define DEPRECATE_TO_UPPER 0
3125# define DEPRECATE_TO_TITLE 1
3126# define DEPRECATE_TO_LOWER 2
3127# define DEPRECATE_TO_FOLD 3
3128
3129 U32 utf8n_flags = 0;
3130 const char * name;
3131 const char * alternative;
3132
3133 PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
3134
3135 if (*e == NULL) {
3136 utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
3137 *e = p + UTF8SKIP(p);
3138
3139 /* For mathoms.c calls, we use the function name we know is stored
c44e9413 3140 * there. It could be part of a larger path */
607313a1 3141 if (type == DEPRECATE_TO_UPPER) {
c44e9413 3142 name = instr(file, "mathoms.c")
607313a1
KW
3143 ? "to_utf8_upper"
3144 : "toUPPER_utf8";
3145 alternative = "toUPPER_utf8_safe";
3146 }
3147 else if (type == DEPRECATE_TO_TITLE) {
c44e9413 3148 name = instr(file, "mathoms.c")
607313a1
KW
3149 ? "to_utf8_title"
3150 : "toTITLE_utf8";
3151 alternative = "toTITLE_utf8_safe";
3152 }
3153 else if (type == DEPRECATE_TO_LOWER) {
c44e9413 3154 name = instr(file, "mathoms.c")
607313a1
KW
3155 ? "to_utf8_lower"
3156 : "toLOWER_utf8";
3157 alternative = "toLOWER_utf8_safe";
3158 }
3159 else if (type == DEPRECATE_TO_FOLD) {
c44e9413 3160 name = instr(file, "mathoms.c")
607313a1
KW
3161 ? "to_utf8_fold"
3162 : "toFOLD_utf8";
3163 alternative = "toFOLD_utf8_safe";
3164 }
3165 else Perl_croak(aTHX_ "panic: Unexpected case change type");
3166
3167 warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
3168 }
3169 else {
3170 assert (p < *e);
3171 }
3172
3173 return utf8n_flags;
3174}
3175
eaf412bf
KW
3176/* The process for changing the case is essentially the same for the four case
3177 * change types, except there are complications for folding. Otherwise the
3178 * difference is only which case to change to. To make sure that they all do
3179 * the same thing, the bodies of the functions are extracted out into the
3180 * following two macros. The functions are written with the same variable
3181 * names, and these are known and used inside these macros. It would be
3182 * better, of course, to have inline functions to do it, but since different
3183 * macros are called, depending on which case is being changed to, this is not
3184 * feasible in C (to khw's knowledge). Two macros are created so that the fold
3185 * function can start with the common start macro, then finish with its special
3186 * handling; while the other three cases can just use the common end macro.
3187 *
3188 * The algorithm is to use the proper (passed in) macro or function to change
3189 * the case for code points that are below 256. The macro is used if using
3190 * locale rules for the case change; the function if not. If the code point is
3191 * above 255, it is computed from the input UTF-8, and another macro is called
3192 * to do the conversion. If necessary, the output is converted to UTF-8. If
3193 * using a locale, we have to check that the change did not cross the 255/256
3194 * boundary, see check_locale_boundary_crossing() for further details.
3195 *
3196 * The macros are split with the correct case change for the below-256 case
3197 * stored into 'result', and in the middle of an else clause for the above-255
3198 * case. At that point in the 'else', 'result' is not the final result, but is
3199 * the input code point calculated from the UTF-8. The fold code needs to
3200 * realize all this and take it from there.
3201 *
3202 * If you read the two macros as sequential, it's easier to understand what's
3203 * going on. */
3204#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \
3205 L1_func_extra_param) \
a239b1e2 3206 \
eaf412bf
KW
3207 if (flags & (locale_flags)) { \
3208 /* Treat a UTF-8 locale as not being in locale at all */ \
3209 if (IN_UTF8_CTYPE_LOCALE) { \
3210 flags &= ~(locale_flags); \
3211 } \
3212 else { \
3213 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
3214 } \
3215 } \
3216 \
3217 if (UTF8_IS_INVARIANT(*p)) { \
3218 if (flags & (locale_flags)) { \
3219 result = LC_L1_change_macro(*p); \
3220 } \
3221 else { \
3222 return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
3223 } \
3224 } \
a239b1e2 3225 else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \
eaf412bf
KW
3226 if (flags & (locale_flags)) { \
3227 result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \
3228 *(p+1))); \
3229 } \
3230 else { \
3231 return L1_func(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), \
3232 ustrp, lenp, L1_func_extra_param); \
3233 } \
3234 } \
fa8ab374
KW
3235 else { /* malformed UTF-8 or ord above 255 */ \
3236 STRLEN len_result; \
fa8ab374
KW
3237 result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
3238 if (len_result == (STRLEN) -1) { \
607313a1
KW
3239 _force_out_malformed_utf8_message(p, e, utf8n_flags, \
3240 1 /* Die */ ); \
fa8ab374 3241 }
eaf412bf
KW
3242
3243#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
3244 result = change_macro(result, p, ustrp, lenp); \
3245 \
3246 if (flags & (locale_flags)) { \
3247 result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
3248 } \
3249 return result; \
3250 } \
3251 \
3252 /* Here, used locale rules. Convert back to UTF-8 */ \
3253 if (UTF8_IS_INVARIANT(result)) { \
3254 *ustrp = (U8) result; \
3255 *lenp = 1; \
3256 } \
3257 else { \
3258 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); \
3259 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \
3260 *lenp = 2; \
3261 } \
3262 \
3263 return result;
3264
d3e79532 3265/*
87cea99e 3266=for apidoc to_utf8_upper
d3e79532 3267
a239b1e2 3268Instead use L</toUPPER_utf8_safe>.
a1433954 3269
d3e79532
JH
3270=cut */
3271
051a06d4 3272/* Not currently externally documented, and subject to change:
31f05a37
KW
3273 * <flags> is set iff iff the rules from the current underlying locale are to
3274 * be used. */
051a06d4 3275
2104c8d9 3276UV
607313a1
KW
3277Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
3278 const U8 *e,
3279 U8* ustrp,
3280 STRLEN *lenp,
3281 bool flags,
3282 const char * const file,
3283 const int line)
a0ed51b3 3284{
051a06d4 3285 UV result;
607313a1
KW
3286 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
3287 cBOOL(flags), file, line);
051a06d4
KW
3288
3289 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 3290
eaf412bf
KW
3291 /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
3292 /* 2nd char of uc(U+DF) is 'S' */
3293 CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S');
3294 CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
983ffd37 3295}
a0ed51b3 3296
d3e79532 3297/*
87cea99e 3298=for apidoc to_utf8_title
d3e79532 3299
a239b1e2 3300Instead use L</toTITLE_utf8_safe>.
a1433954 3301
d3e79532
JH
3302=cut */
3303
051a06d4 3304/* Not currently externally documented, and subject to change:
31f05a37
KW
3305 * <flags> is set iff the rules from the current underlying locale are to be
3306 * used. Since titlecase is not defined in POSIX, for other than a
3307 * UTF-8 locale, uppercase is used instead for code points < 256.
445bf929 3308 */
051a06d4 3309
983ffd37 3310UV
607313a1
KW
3311Perl__to_utf8_title_flags(pTHX_ const U8 *p,
3312 const U8 *e,
3313 U8* ustrp,
3314 STRLEN *lenp,
3315 bool flags,
3316 const char * const file,
3317 const int line)
983ffd37 3318{
051a06d4 3319 UV result;
607313a1
KW
3320 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
3321 cBOOL(flags), file, line);
051a06d4
KW
3322
3323 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 3324
eaf412bf
KW
3325 /* 2nd char of ucfirst(U+DF) is 's' */
3326 CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's');
3327 CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
a0ed51b3
LW
3328}
3329
d3e79532 3330/*
87cea99e 3331=for apidoc to_utf8_lower
d3e79532 3332
a239b1e2 3333Instead use L</toLOWER_utf8_safe>.
a1433954 3334
d3e79532
JH
3335=cut */
3336
051a06d4 3337/* Not currently externally documented, and subject to change:
31f05a37
KW
3338 * <flags> is set iff iff the rules from the current underlying locale are to
3339 * be used.
3340 */
051a06d4 3341
2104c8d9 3342UV
607313a1
KW
3343Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
3344 const U8 *e,
3345 U8* ustrp,
3346 STRLEN *lenp,
3347 bool flags,
3348 const char * const file,
3349 const int line)
a0ed51b3 3350{
051a06d4 3351 UV result;
607313a1
KW
3352 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
3353 cBOOL(flags), file, line);
051a06d4 3354
051a06d4 3355 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 3356
eaf412bf
KW
3357 CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */)
3358 CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
b4e400f9
JH
3359}
3360
d3e79532 3361/*
87cea99e 3362=for apidoc to_utf8_fold
d3e79532 3363
a239b1e2 3364Instead use L</toFOLD_utf8_safe>.
a1433954 3365
d3e79532
JH
3366=cut */
3367
051a06d4
KW
3368/* Not currently externally documented, and subject to change,
3369 * in <flags>
31f05a37
KW
3370 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3371 * locale are to be used.
051a06d4
KW
3372 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
3373 * otherwise simple folds
a0270393
KW
3374 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
3375 * prohibited
445bf929 3376 */
36bb2ab6 3377
b4e400f9 3378UV
607313a1
KW
3379Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
3380 const U8 *e,
3381 U8* ustrp,
3382 STRLEN *lenp,
3383 U8 flags,
3384 const char * const file,
3385 const int line)
b4e400f9 3386{
051a06d4 3387 UV result;
607313a1
KW
3388 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
3389 cBOOL(flags), file, line);
051a06d4 3390
36bb2ab6 3391 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 3392
a0270393
KW
3393 /* These are mutually exclusive */
3394 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
3395
50ba90ff
KW
3396 assert(p != ustrp); /* Otherwise overwrites */
3397
eaf412bf
KW
3398 CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
3399 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)));
31f05a37 3400
eaf412bf 3401 result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
a1dde8de 3402
1ca267a5
KW
3403 if (flags & FOLD_FLAGS_LOCALE) {
3404
76f2ffcd 3405# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
0766489e
KW
3406 const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
3407
3408# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3409# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
76f2ffcd
KW
3410
3411 const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
76f2ffcd 3412
538e84ed
KW
3413 /* Special case these two characters, as what normally gets
3414 * returned under locale doesn't work */
76f2ffcd
KW
3415 if (UTF8SKIP(p) == cap_sharp_s_len
3416 && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
1ca267a5 3417 {
ab0b796c
KW
3418 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3419 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3420 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3421 "resolved to \"\\x{17F}\\x{17F}\".");
1ca267a5
KW
3422 goto return_long_s;
3423 }
0766489e
KW
3424 else
3425#endif
3426 if (UTF8SKIP(p) == long_s_t_len
76f2ffcd 3427 && memEQ((char *) p, LONG_S_T, long_s_t_len))
9fc2026f 3428 {
ab0b796c
KW
3429 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3430 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3431 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3432 "resolved to \"\\x{FB06}\".");
9fc2026f
KW
3433 goto return_ligature_st;
3434 }
74894415
KW
3435
3436#if UNICODE_MAJOR_VERSION == 3 \
3437 && UNICODE_DOT_VERSION == 0 \
3438 && UNICODE_DOT_DOT_VERSION == 1
3439# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3440
3441 /* And special case this on this Unicode version only, for the same
3442 * reaons the other two are special cased. They would cross the
3443 * 255/256 boundary which is forbidden under /l, and so the code
3444 * wouldn't catch that they are equivalent (which they are only in
3445 * this release) */
3446 else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
3447 && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
3448 {
3449 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3450 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3451 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3452 "resolved to \"\\x{0131}\".");
3453 goto return_dotless_i;
3454 }
3455#endif
3456
357aadde 3457 return check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 3458 }
a0270393
KW
3459 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3460 return result;
3461 }
3462 else {
4a4088c4 3463 /* This is called when changing the case of a UTF-8-encoded
9fc2026f
KW
3464 * character above the ASCII range, and the result should not
3465 * contain an ASCII character. */
a0270393
KW
3466
3467 UV original; /* To store the first code point of <p> */
3468
3469 /* Look at every character in the result; if any cross the
3470 * boundary, the whole thing is disallowed */
3471 U8* s = ustrp;
3472 U8* e = ustrp + *lenp;
3473 while (s < e) {
3474 if (isASCII(*s)) {
3475 /* Crossed, have to return the original */
3476 original = valid_utf8_to_uvchr(p, lenp);
1ca267a5 3477
9fc2026f 3478 /* But in these instances, there is an alternative we can
1ca267a5 3479 * return that is valid */
0766489e
KW
3480 if (original == LATIN_SMALL_LETTER_SHARP_S
3481#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3482 || original == LATIN_CAPITAL_LETTER_SHARP_S
3483#endif
3484 ) {
1ca267a5
KW
3485 goto return_long_s;
3486 }
9fc2026f
KW
3487 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3488 goto return_ligature_st;
3489 }
74894415
KW
3490#if UNICODE_MAJOR_VERSION == 3 \
3491 && UNICODE_DOT_VERSION == 0 \
3492 && UNICODE_DOT_DOT_VERSION == 1
3493
3494 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3495 goto return_dotless_i;
3496 }
3497#endif
a0270393
KW
3498 Copy(p, ustrp, *lenp, char);
3499 return original;
3500 }
3501 s += UTF8SKIP(s);
3502 }
051a06d4 3503
a0270393
KW
3504 /* Here, no characters crossed, result is ok as-is */
3505 return result;
3506 }
051a06d4
KW
3507 }
3508
4a4088c4 3509 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
3510 if (UTF8_IS_INVARIANT(result)) {
3511 *ustrp = (U8) result;
3512 *lenp = 1;
3513 }
3514 else {
62cb07ea
KW
3515 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3516 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
3517 *lenp = 2;
3518 }
3519
051a06d4 3520 return result;
1ca267a5
KW
3521
3522 return_long_s:
3523 /* Certain folds to 'ss' are prohibited by the options, but they do allow
3524 * folds to a string of two of these characters. By returning this
3525 * instead, then, e.g.,
3526 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3527 * works. */
3528
3529 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
3530 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3531 ustrp, *lenp, U8);
3532 return LATIN_SMALL_LETTER_LONG_S;
9fc2026f
KW
3533
3534 return_ligature_st:
3535 /* Two folds to 'st' are prohibited by the options; instead we pick one and
3536 * have the other one fold to it */
3537
3538 *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
3539 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
3540 return LATIN_SMALL_LIGATURE_ST;
74894415
KW
3541
3542#if UNICODE_MAJOR_VERSION == 3 \
3543 && UNICODE_DOT_VERSION == 0 \
3544 && UNICODE_DOT_DOT_VERSION == 1
3545
3546 return_dotless_i:
3547 *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
3548 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
3549 return LATIN_SMALL_LETTER_DOTLESS_I;
3550
3551#endif
3552
a0ed51b3
LW
3553}
3554
711a919c 3555/* Note:
f90a9a02 3556 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
711a919c
ST
3557 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
3558 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
3559 */
c4a5db0c 3560
a0ed51b3 3561SV*
7fc63493 3562Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 3563{
c4a5db0c
KW
3564 PERL_ARGS_ASSERT_SWASH_INIT;
3565
3566 /* Returns a copy of a swash initiated by the called function. This is the
3567 * public interface, and returning a copy prevents others from doing
3568 * mischief on the original */
3569
5d3d13d1 3570 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
c4a5db0c
KW
3571}
3572
3573SV*
5d3d13d1 3574Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
c4a5db0c 3575{
2c1f00b9
YO
3576
3577 /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
3578 * use the following define */
3579
3580#define CORE_SWASH_INIT_RETURN(x) \
3581 PL_curpm= old_PL_curpm; \
3582 return x
3583
c4a5db0c 3584 /* Initialize and return a swash, creating it if necessary. It does this
87367d5f
KW
3585 * by calling utf8_heavy.pl in the general case. The returned value may be
3586 * the swash's inversion list instead if the input parameters allow it.
3587 * Which is returned should be immaterial to callers, as the only
923b6d4e
KW
3588 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
3589 * and swash_to_invlist() handle both these transparently.
c4a5db0c
KW
3590 *
3591 * This interface should only be used by functions that won't destroy or
3592 * adversely change the swash, as doing so affects all other uses of the
3593 * swash in the program; the general public should use 'Perl_swash_init'
3594 * instead.
3595 *
3596 * pkg is the name of the package that <name> should be in.
3597 * name is the name of the swash to find. Typically it is a Unicode
3598 * property name, including user-defined ones
3599 * listsv is a string to initialize the swash with. It must be of the form
3600 * documented as the subroutine return value in
3601 * L<perlunicode/User-Defined Character Properties>
3602 * minbits is the number of bits required to represent each data element.
3603 * It is '1' for binary properties.
3604 * none I (khw) do not understand this one, but it is used only in tr///.
9a53f6cf 3605 * invlist is an inversion list to initialize the swash with (or NULL)
83199d38
KW
3606 * flags_p if non-NULL is the address of various input and output flag bits
3607 * to the routine, as follows: ('I' means is input to the routine;
3608 * 'O' means output from the routine. Only flags marked O are
3609 * meaningful on return.)
3610 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
3611 * came from a user-defined property. (I O)
5d3d13d1
KW
3612 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
3613 * when the swash cannot be located, to simply return NULL. (I)
87367d5f
KW
3614 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
3615 * return of an inversion list instead of a swash hash if this routine
3616 * thinks that would result in faster execution of swash_fetch() later
3617 * on. (I)
9a53f6cf
KW
3618 *
3619 * Thus there are three possible inputs to find the swash: <name>,
3620 * <listsv>, and <invlist>. At least one must be specified. The result
3621 * will be the union of the specified ones, although <listsv>'s various
aabbdbda
KW
3622 * actions can intersect, etc. what <name> gives. To avoid going out to
3623 * disk at all, <invlist> should specify completely what the swash should
3624 * have, and <listsv> should be &PL_sv_undef and <name> should be "".
9a53f6cf
KW
3625 *
3626 * <invlist> is only valid for binary properties */
c4a5db0c 3627
2c1f00b9
YO
3628 PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
3629
c4a5db0c 3630 SV* retval = &PL_sv_undef;
83199d38 3631 HV* swash_hv = NULL;
87367d5f
KW
3632 const int invlist_swash_boundary =
3633 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
3634 ? 512 /* Based on some benchmarking, but not extensive, see commit
3635 message */
3636 : -1; /* Never return just an inversion list */
9a53f6cf
KW
3637
3638 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
3639 assert(! invlist || minbits == 1);
3640
2c1f00b9
YO
3641 PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
3642 that triggered the swash init and the swash init perl logic itself.
3643 See perl #122747 */
3644
9a53f6cf
KW
3645 /* If data was passed in to go out to utf8_heavy to find the swash of, do
3646 * so */
3647 if (listsv != &PL_sv_undef || strNE(name, "")) {
69794297
KW
3648 dSP;
3649 const size_t pkg_len = strlen(pkg);
3650 const size_t name_len = strlen(name);
3651 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
3652 SV* errsv_save;
3653 GV *method;
3654
3655 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
3656
3657 PUSHSTACKi(PERLSI_MAGIC);
ce3b816e 3658 ENTER;
69794297 3659 SAVEHINTS();
2782061f 3660 save_re_context();
650f067c
JL
3661 /* We might get here via a subroutine signature which uses a utf8
3662 * parameter name, at which point PL_subname will have been set
3663 * but not yet used. */
3664 save_item(PL_subname);
69794297
KW
3665 if (PL_parser && PL_parser->error_count)
3666 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
3667 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
4a4088c4 3668 if (!method) { /* demand load UTF-8 */
69794297 3669 ENTER;
db2c6cb3
FC
3670 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3671 GvSV(PL_errgv) = NULL;
1a419e6b 3672#ifndef NO_TAINT_SUPPORT
69794297
KW
3673 /* It is assumed that callers of this routine are not passing in
3674 * any user derived data. */
2782061f
DM
3675 /* Need to do this after save_re_context() as it will set
3676 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
3677 * in Perl_magic_get). Even line to create errsv_save can turn on
3678 * PL_tainted. */
284167a5
SM
3679 SAVEBOOL(TAINT_get);
3680 TAINT_NOT;
3681#endif
69794297
KW
3682 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
3683 NULL);
eed484f9 3684 {
db2c6cb3
FC
3685 /* Not ERRSV, as there is no need to vivify a scalar we are
3686 about to discard. */
3687 SV * const errsv = GvSV(PL_errgv);
3688 if (!SvTRUE(errsv)) {
3689 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3690 SvREFCNT_dec(errsv);
3691 }
eed484f9 3692 }
69794297
KW
3693 LEAVE;
3694 }
3695 SPAGAIN;
3696 PUSHMARK(SP);
3697 EXTEND(SP,5);
3698 mPUSHp(pkg, pkg_len);
3699 mPUSHp(name, name_len);
3700 PUSHs(listsv);
3701 mPUSHi(minbits);
3702 mPUSHi(none);
3703 PUTBACK;
db2c6cb3
FC
3704 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3705 GvSV(PL_errgv) = NULL;
69794297
KW
3706 /* If we already have a pointer to the method, no need to use
3707 * call_method() to repeat the lookup. */
c41800a8
KW
3708 if (method
3709 ? call_sv(MUTABLE_SV(method), G_SCALAR)
69794297
KW
3710 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
3711 {
3712 retval = *PL_stack_sp--;
3713 SvREFCNT_inc(retval);
3714 }
eed484f9 3715 {
db2c6cb3
FC
3716 /* Not ERRSV. See above. */
3717 SV * const errsv = GvSV(PL_errgv);
3718 if (!SvTRUE(errsv)) {
3719 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3720 SvREFCNT_dec(errsv);
3721 }
eed484f9 3722 }
ce3b816e 3723 LEAVE;
69794297
KW
3724 POPSTACK;
3725 if (IN_PERL_COMPILETIME) {
3726 CopHINTS_set(PL_curcop, PL_hints);
3727 }
3728 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
d95e4a00 3729 if (SvPOK(retval)) {
69794297
KW
3730
3731 /* If caller wants to handle missing properties, let them */
5d3d13d1 3732 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
2c1f00b9 3733 CORE_SWASH_INIT_RETURN(NULL);
69794297
KW
3734 }
3735 Perl_croak(aTHX_
147e3846 3736 "Can't find Unicode property definition \"%" SVf "\"",
69794297 3737 SVfARG(retval));
a25b5927 3738 NOT_REACHED; /* NOTREACHED */
d95e4a00 3739 }
69794297 3740 }
9a53f6cf 3741 } /* End of calling the module to find the swash */
36eb48b4 3742
83199d38
KW
3743 /* If this operation fetched a swash, and we will need it later, get it */
3744 if (retval != &PL_sv_undef
3745 && (minbits == 1 || (flags_p
3746 && ! (*flags_p
3747 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
3748 {
3749 swash_hv = MUTABLE_HV(SvRV(retval));
3750
3751 /* If we don't already know that there is a user-defined component to
3752 * this swash, and the user has indicated they wish to know if there is
3753 * one (by passing <flags_p>), find out */
3754 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
3755 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
3756 if (user_defined && SvUV(*user_defined)) {
3757