This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8.c: Don't calc code point from overflowing UTF8
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
a0ed51b3 35
806547a7 36static const char malformed_text[] = "Malformed UTF-8 character";
27da23d5 37static const char unees[] =
806547a7 38 "Malformed UTF-8 character (unexpected end of string)";
760c7c2f 39static const char cp_above_legal_max[] =
76513bdc 40 "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf ". This will be fatal in Perl 5.28";
760c7c2f 41
114d9c4d 42#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
901b21bf 43
48ef279e 44/*
ccfc67b7 45=head1 Unicode Support
7fefc6c1 46These are various utility functions for manipulating UTF8-encoded
72d33970 47strings. For the uninitiated, this is a method of representing arbitrary
61296642 48Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
49characters in the ASCII range are unmodified, and a zero byte never appears
50within non-zero characters.
166f8a29 51
eaf7a4d2
CS
52=cut
53*/
54
9cbfb8ab
KW
55void
56Perl__force_out_malformed_utf8_message(pTHX_
57 const U8 *const p, /* First byte in UTF-8 sequence */
58 const U8 * const e, /* Final byte in sequence (may include
59 multiple chars */
60 const U32 flags, /* Flags to pass to utf8n_to_uvchr(),
61 usually 0, or some DISALLOW flags */
62 const bool die_here) /* If TRUE, this function does not return */
63{
64 /* This core-only function is to be called when a malformed UTF-8 character
65 * is found, in order to output the detailed information about the
66 * malformation before dieing. The reason it exists is for the occasions
67 * when such a malformation is fatal, but warnings might be turned off, so
68 * that normally they would not be actually output. This ensures that they
69 * do get output. Because a sequence may be malformed in more than one
70 * way, multiple messages may be generated, so we can't make them fatal, as
71 * that would cause the first one to die.
72 *
73 * Instead we pretend -W was passed to perl, then die afterwards. The
74 * flexibility is here to return to the caller so they can finish up and
75 * die themselves */
76 U32 errors;
77
78 PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
79
80 ENTER;
c15a80f3 81 SAVEI8(PL_dowarn);
9cbfb8ab
KW
82 SAVESPTR(PL_curcop);
83
84 PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
85 if (PL_curcop) {
86 PL_curcop->cop_warnings = pWARN_ALL;
87 }
88
89 (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
90
91 LEAVE;
92
93 if (! errors) {
94 Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
95 " be called only when there are errors found");
96 }
97
98 if (die_here) {
99 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
100 }
101}
102
eaf7a4d2 103/*
378516de 104=for apidoc uvoffuni_to_utf8_flags
eebe1485 105
a27992cc 106THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af
KW
107Instead, B<Almost all code should use L</uvchr_to_utf8> or
108L</uvchr_to_utf8_flags>>.
a27992cc 109
de69f3af
KW
110This function is like them, but the input is a strict Unicode
111(as opposed to native) code point. Only in very rare circumstances should code
112not be using the native code point.
949cf498 113
efa9cd84 114For details, see the description for L</uvchr_to_utf8_flags>.
949cf498 115
eebe1485
SC
116=cut
117*/
118
8ee1cdcb
KW
119#define HANDLE_UNICODE_SURROGATE(uv, flags) \
120 STMT_START { \
121 if (flags & UNICODE_WARN_SURROGATE) { \
122 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
147e3846 123 "UTF-16 surrogate U+%04" UVXf, uv); \
8ee1cdcb
KW
124 } \
125 if (flags & UNICODE_DISALLOW_SURROGATE) { \
126 return NULL; \
127 } \
128 } STMT_END;
129
130#define HANDLE_UNICODE_NONCHAR(uv, flags) \
131 STMT_START { \
132 if (flags & UNICODE_WARN_NONCHAR) { \
133 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
147e3846 134 "Unicode non-character U+%04" UVXf " is not " \
8ee1cdcb
KW
135 "recommended for open interchange", uv); \
136 } \
137 if (flags & UNICODE_DISALLOW_NONCHAR) { \
138 return NULL; \
139 } \
140 } STMT_END;
141
ba6ed43c
KW
142/* Use shorter names internally in this file */
143#define SHIFT UTF_ACCUMULATION_SHIFT
144#undef MARK
145#define MARK UTF_CONTINUATION_MARK
146#define MASK UTF_CONTINUATION_MASK
147
dfe13c55 148U8 *
4b31b634 149Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
a0ed51b3 150{
378516de 151 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
7918f24d 152
2d1545e5 153 if (OFFUNI_IS_INVARIANT(uv)) {
4c8cd605 154 *d++ = LATIN1_TO_NATIVE(uv);
d9432125
KW
155 return d;
156 }
facc1dc2 157
3ea68d71 158 if (uv <= MAX_UTF8_TWO_BYTE) {
facc1dc2
KW
159 *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
160 *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
3ea68d71
KW
161 return d;
162 }
d9432125 163
ba6ed43c
KW
164 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
165 * below, the 16 is for start bytes E0-EF (which are all the possible ones
166 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
167 * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
168 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
169 * 0x800-0xFFFF on ASCII */
170 if (uv < (16 * (1U << (2 * SHIFT)))) {
171 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
172 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
173 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
174
175#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
176 aren't tested here */
177 /* The most likely code points in this range are below the surrogates.
178 * Do an extra test to quickly exclude those. */
179 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
180 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
181 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
182 {
8ee1cdcb
KW
183 HANDLE_UNICODE_NONCHAR(uv, flags);
184 }
185 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
186 HANDLE_UNICODE_SURROGATE(uv, flags);
760c7c2f 187 }
ba6ed43c
KW
188 }
189#endif
190 return d;
191 }
192
193 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
194 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
195 * happen starting with 4-byte characters on ASCII platforms. We unify the
196 * code for these with EBCDIC, even though some of them require 5-bytes on
197 * those, because khw believes the code saving is worth the very slight
198 * performance hit on these high EBCDIC code points. */
199
200 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
76513bdc
KW
201 if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
202 && ckWARN_d(WARN_DEPRECATED))
203 {
204 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
205 cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
a5bf80e0
KW
206 }
207 if ( (flags & UNICODE_WARN_SUPER)
208 || ( UNICODE_IS_ABOVE_31_BIT(uv)
209 && (flags & UNICODE_WARN_ABOVE_31_BIT)))
210 {
211 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
212
213 /* Choose the more dire applicable warning */
214 (UNICODE_IS_ABOVE_31_BIT(uv))
147e3846
KW
215 ? "Code point 0x%" UVXf " is not Unicode, and not portable"
216 : "Code point 0x%" UVXf " is not Unicode, may not be portable",
a5bf80e0
KW
217 uv);
218 }
219 if (flags & UNICODE_DISALLOW_SUPER
220 || ( UNICODE_IS_ABOVE_31_BIT(uv)
221 && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
222 {
223 return NULL;
224 }
225 }
ba6ed43c
KW
226 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
227 HANDLE_UNICODE_NONCHAR(uv, flags);
507b9800 228 }
d9432125 229
ba6ed43c
KW
230 /* Test for and handle 4-byte result. In the test immediately below, the
231 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
232 * characters). The 3 is for 3 continuation bytes; these each contribute
233 * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
234 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
235 * 0x1_0000-0x1F_FFFF on ASCII */
236 if (uv < (8 * (1U << (3 * SHIFT)))) {
237 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
238 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
239 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
240 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
241
242#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
243 characters. The end-plane non-characters for EBCDIC were
244 handled just above */
245 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
246 HANDLE_UNICODE_NONCHAR(uv, flags);
d528804a 247 }
ba6ed43c
KW
248 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
249 HANDLE_UNICODE_SURROGATE(uv, flags);
250 }
251#endif
252
253 return d;
254 }
255
256 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
257 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
258 * format. The unrolled version above turns out to not save all that much
259 * time, and at these high code points (well above the legal Unicode range
260 * on ASCII platforms, and well above anything in common use in EBCDIC),
261 * khw believes that less code outweighs slight performance gains. */
262
d9432125 263 {
5aaebcb3 264 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
265 U8 *p = d+len-1;
266 while (p > d) {
4c8cd605 267 *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
268 uv >>= UTF_ACCUMULATION_SHIFT;
269 }
4c8cd605 270 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
271 return d+len;
272 }
a0ed51b3 273}
a5bf80e0 274
646ca15d 275/*
07693fe6
KW
276=for apidoc uvchr_to_utf8
277
bcb1a2d4 278Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 279of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
280C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
281the byte after the end of the new character. In other words,
07693fe6
KW
282
283 d = uvchr_to_utf8(d, uv);
284
285is the recommended wide native character-aware way of saying
286
287 *(d++) = uv;
288
760c7c2f
KW
289This function accepts any UV as input, but very high code points (above
290C<IV_MAX> on the platform) will raise a deprecation warning. This is
291typically 0x7FFF_FFFF in a 32-bit word.
292
293It is possible to forbid or warn on non-Unicode code points, or those that may
294be problematic by using L</uvchr_to_utf8_flags>.
de69f3af 295
07693fe6
KW
296=cut
297*/
298
de69f3af
KW
299/* This is also a macro */
300PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
301
07693fe6
KW
302U8 *
303Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
304{
de69f3af 305 return uvchr_to_utf8(d, uv);
07693fe6
KW
306}
307
de69f3af
KW
308/*
309=for apidoc uvchr_to_utf8_flags
310
311Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 312of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
313C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
314the byte after the end of the new character. In other words,
de69f3af
KW
315
316 d = uvchr_to_utf8_flags(d, uv, flags);
317
318or, in most cases,
319
320 d = uvchr_to_utf8_flags(d, uv, 0);
321
322This is the Unicode-aware way of saying
323
324 *(d++) = uv;
325
760c7c2f
KW
326If C<flags> is 0, this function accepts any UV as input, but very high code
327points (above C<IV_MAX> for the platform) will raise a deprecation warning.
328This is typically 0x7FFF_FFFF in a 32-bit word.
329
330Specifying C<flags> can further restrict what is allowed and not warned on, as
331follows:
de69f3af 332
796b6530 333If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
7ee537e6
KW
334the function will raise a warning, provided UTF8 warnings are enabled. If
335instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
336NULL. If both flags are set, the function will both warn and return NULL.
de69f3af 337
760c7c2f
KW
338Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
339affect how the function handles a Unicode non-character.
93e6dbd6 340
760c7c2f
KW
341And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
342affect the handling of code points that are above the Unicode maximum of
3430x10FFFF. Languages other than Perl may not be able to accept files that
344contain these.
93e6dbd6
KW
345
346The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
347the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
ecc1615f
KW
348three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
349allowed inputs to the strict UTF-8 traditionally defined by Unicode.
350Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
351C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
352above-Unicode and surrogate flags, but not the non-character ones, as
353defined in
354L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
355See L<perlunicode/Noncharacter code points>.
93e6dbd6 356
ab8e6d41
KW
357Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
358so using them is more problematic than other above-Unicode code points. Perl
359invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
360likely that non-Perl languages will not be able to read files that contain
361these that written by the perl interpreter; nor would Perl understand files
362written by something that uses a different extension. For these reasons, there
363is a separate set of flags that can warn and/or disallow these extremely high
364code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
365C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags. These
366are entirely independent from the deprecation warning for code points above
367C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
368code point that needs more than 31 bits to represent. When that happens,
369effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
37032-bit machines. (Of course C<UNICODE_DISALLOW_SUPER> will treat all
ab8e6d41
KW
371above-Unicode code points, including these, as malformations; and
372C<UNICODE_WARN_SUPER> warns on these.)
373
374On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
375extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
376than on ASCII. Prior to that, code points 2**31 and higher were simply
377unrepresentable, and a different, incompatible method was used to represent
378code points between 2**30 and 2**31 - 1. The flags C<UNICODE_WARN_ABOVE_31_BIT>
379and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
380platforms, warning and disallowing 2**31 and higher.
de69f3af 381
de69f3af
KW
382=cut
383*/
384
385/* This is also a macro */
386PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
387
07693fe6
KW
388U8 *
389Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
390{
de69f3af 391 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
392}
393
83dc0f42
KW
394PERL_STATIC_INLINE bool
395S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
396{
397 /* Returns TRUE if the first code point represented by the Perl-extended-
398 * UTF-8-encoded string starting at 's', and looking no further than 'e -
399 * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
400 *
401 * The function handles the case where the input bytes do not include all
402 * the ones necessary to represent a full character. That is, they may be
403 * the intial bytes of the representation of a code point, but possibly
404 * the final ones necessary for the complete representation may be beyond
405 * 'e - 1'.
406 *
407 * The function assumes that the sequence is well-formed UTF-8 as far as it
408 * goes, and is for a UTF-8 variant code point. If the sequence is
409 * incomplete, the function returns FALSE if there is any well-formed
410 * UTF-8 byte sequence that can complete it in such a way that a code point
411 * < 2**31 is produced; otherwise it returns TRUE.
412 *
413 * Getting this exactly right is slightly tricky, and has to be done in
414 * several places in this file, so is centralized here. It is based on the
415 * following table:
416 *
417 * U+7FFFFFFF (2 ** 31 - 1)
418 * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
419 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
420 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
421 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
422 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
423 * U+80000000 (2 ** 31):
424 * ASCII: \xFE\x82\x80\x80\x80\x80\x80
425 * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
426 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
427 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
428 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
429 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
430 */
431
432#ifdef EBCDIC
433
37086697
KW
434 /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
435 const U8 prefix[] = "\x41\x41\x41\x41\x41\x41\x42";
83dc0f42
KW
436 const STRLEN prefix_len = sizeof(prefix) - 1;
437 const STRLEN len = e - s;
f880f78a 438 const STRLEN cmp_len = MIN(prefix_len, len - 1);
83dc0f42
KW
439
440#else
441
442 PERL_UNUSED_ARG(e);
443
444#endif
445
446 PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
447
448 assert(! UTF8_IS_INVARIANT(*s));
449
450#ifndef EBCDIC
451
452 /* Technically, a start byte of FE can be for a code point that fits into
453 * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
454 * malformation. */
455 return (*s >= 0xFE);
456
457#else
458
459 /* On the EBCDIC code pages we handle, only 0xFE can mean a 32-bit or
460 * larger code point (0xFF is an invariant). For 0xFE, we need at least 2
461 * bytes, and maybe up through 8 bytes, to be sure if the value is above 31
462 * bits. */
463 if (*s != 0xFE || len == 1) {
464 return FALSE;
465 }
466
467 /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
468 * \x41 and \x42. */
469 return cBOOL(memGT(s + 1, prefix, cmp_len));
470
471#endif
472
473}
474
c179379d
KW
475/* Anything larger than this will overflow the word if it were converted into a UV */
476#if defined(UV_IS_QUAD)
477# ifdef EBCDIC /* Actually is I8 */
478# define HIGHEST_REPRESENTABLE_UTF8 \
479 "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
480# else
481# define HIGHEST_REPRESENTABLE_UTF8 \
482 "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
483# endif
484#else /* 32-bit */
485# ifdef EBCDIC
486# define HIGHEST_REPRESENTABLE_UTF8 \
487 "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
488# else
489# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
490# endif
491#endif
492
12a4bed3
KW
493PERL_STATIC_INLINE bool
494S_does_utf8_overflow(const U8 * const s, const U8 * e)
495{
496 const U8 *x;
497 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
498
c551bb01
KW
499#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
500
501 const STRLEN len = e - s;
502
503#endif
504
12a4bed3
KW
505 /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
506 * platform, that is if it represents a code point larger than the highest
507 * representable code point. (For ASCII platforms, we could use memcmp()
508 * because we don't have to convert each byte to I8, but it's very rare
509 * input indeed that would approach overflow, so the loop below will likely
510 * only get executed once.
511 *
512 * 'e' must not be beyond a full character. If it is less than a full
513 * character, the function returns FALSE if there is any input beyond 'e'
514 * that could result in a non-overflowing code point */
515
516 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
b0b342d4
KW
517 assert(s <= e && s + UTF8SKIP(s) >= e);
518
519#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
520
521 /* On 32 bit ASCII machines, many overlongs that start with FF don't
522 * overflow */
523
c551bb01 524 if (isFF_OVERLONG(s, len)) {
b0b342d4
KW
525 const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84";
526 return memGE(s, max_32_bit_overlong,
c551bb01 527 MIN(len, sizeof(max_32_bit_overlong) - 1));
b0b342d4
KW
528 }
529
530#endif
12a4bed3
KW
531
532 for (x = s; x < e; x++, y++) {
533
534 /* If this byte is larger than the corresponding highest UTF-8 byte, it
535 * overflows */
536 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
537 return TRUE;
538 }
539
540 /* If not the same as this byte, it must be smaller, doesn't overflow */
541 if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
542 return FALSE;
543 }
544 }
545
546 /* Got to the end and all bytes are the same. If the input is a whole
547 * character, it doesn't overflow. And if it is a partial character,
548 * there's not enough information to tell, so assume doesn't overflow */
549 return FALSE;
550}
551
552PERL_STATIC_INLINE bool
553S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
554{
555 /* Overlongs can occur whenever the number of continuation bytes
556 * changes. That means whenever the number of leading 1 bits in a start
557 * byte increases from the next lower start byte. That happens for start
558 * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following
559 * illegal start bytes have already been excluded, so don't need to be
560 * tested here;
561 * ASCII platforms: C0, C1
562 * EBCDIC platforms C0, C1, C2, C3, C4, E0
563 *
564 * At least a second byte is required to determine if other sequences will
565 * be an overlong. */
566
567 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
568 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
569
570 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
571 assert(len > 1 && UTF8_IS_START(*s));
572
573 /* Each platform has overlongs after the start bytes given above (expressed
574 * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
575 * the logic is the same, except the E0 overlong has already been excluded
576 * on EBCDIC platforms. The values below were found by manually
577 * inspecting the UTF-8 patterns. See the tables in utf8.h and
578 * utfebcdic.h. */
579
580# ifdef EBCDIC
581# define F0_ABOVE_OVERLONG 0xB0
582# define F8_ABOVE_OVERLONG 0xA8
583# define FC_ABOVE_OVERLONG 0xA4
584# define FE_ABOVE_OVERLONG 0xA2
585# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
586 /* I8(0xfe) is FF */
587# else
588
589 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
590 return TRUE;
591 }
592
593# define F0_ABOVE_OVERLONG 0x90
594# define F8_ABOVE_OVERLONG 0x88
595# define FC_ABOVE_OVERLONG 0x84
596# define FE_ABOVE_OVERLONG 0x82
597# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
598# endif
599
600
601 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
602 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
603 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
604 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
605 {
606 return TRUE;
607 }
608
b0b342d4
KW
609 /* Check for the FF overlong */
610 return isFF_OVERLONG(s, len);
611}
612
613PERL_STATIC_INLINE bool
614S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
615{
616 PERL_ARGS_ASSERT_ISFF_OVERLONG;
12a4bed3
KW
617
618 /* Check for the FF overlong. This happens only if all these bytes match;
619 * what comes after them doesn't matter. See tables in utf8.h,
b0b342d4 620 * utfebcdic.h. */
12a4bed3 621
b0b342d4
KW
622 return len >= sizeof(FF_OVERLONG_PREFIX) - 1
623 && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
624 sizeof(FF_OVERLONG_PREFIX) - 1));
12a4bed3
KW
625}
626
627#undef F0_ABOVE_OVERLONG
628#undef F8_ABOVE_OVERLONG
629#undef FC_ABOVE_OVERLONG
630#undef FE_ABOVE_OVERLONG
631#undef FF_OVERLONG_PREFIX
632
35f8c9bd 633STRLEN
edc2c47a 634Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
35f8c9bd 635{
2b479609 636 STRLEN len;
12a4bed3 637 const U8 *x;
35f8c9bd 638
2b479609
KW
639 /* A helper function that should not be called directly.
640 *
641 * This function returns non-zero if the string beginning at 's' and
642 * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
643 * code point; otherwise it returns 0. The examination stops after the
644 * first code point in 's' is validated, not looking at the rest of the
645 * input. If 'e' is such that there are not enough bytes to represent a
646 * complete code point, this function will return non-zero anyway, if the
647 * bytes it does have are well-formed UTF-8 as far as they go, and aren't
648 * excluded by 'flags'.
649 *
650 * A non-zero return gives the number of bytes required to represent the
651 * code point. Be aware that if the input is for a partial character, the
652 * return will be larger than 'e - s'.
653 *
654 * This function assumes that the code point represented is UTF-8 variant.
655 * The caller should have excluded this possibility before calling this
656 * function.
657 *
658 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
659 * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
660 * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
661 * disallowed by the flags. If the input is only for a partial character,
662 * the function will return non-zero if there is any sequence of
663 * well-formed UTF-8 that, when appended to the input sequence, could
664 * result in an allowed code point; otherwise it returns 0. Non characters
665 * cannot be determined based on partial character input. But many of the
666 * other excluded types can be determined with just the first one or two
667 * bytes.
668 *
669 */
670
671 PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
672
673 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
674 |UTF8_DISALLOW_ABOVE_31_BIT)));
675 assert(! UTF8_IS_INVARIANT(*s));
35f8c9bd 676
2b479609 677 /* A variant char must begin with a start byte */
35f8c9bd
KW
678 if (UNLIKELY(! UTF8_IS_START(*s))) {
679 return 0;
680 }
681
edc2c47a
KW
682 /* Examine a maximum of a single whole code point */
683 if (e - s > UTF8SKIP(s)) {
684 e = s + UTF8SKIP(s);
685 }
686
2b479609
KW
687 len = e - s;
688
689 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
690 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
35f8c9bd 691
2b479609
KW
692 /* The code below is derived from this table. Keep in mind that legal
693 * continuation bytes range between \x80..\xBF for UTF-8, and
694 * \xA0..\xBF for I8. Anything above those aren't continuation bytes.
695 * Hence, we don't have to test the upper edge because if any of those
696 * are encountered, the sequence is malformed, and will fail elsewhere
697 * in this function.
698 * UTF-8 UTF-EBCDIC I8
699 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
700 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
701 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
702 *
703 */
704
705#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
706# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
19794540 707# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
2b479609 708
19794540
KW
709# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
710 /* B6 and B7 */ \
711 && ((s1) & 0xFE ) == 0xB6)
2b479609
KW
712#else
713# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
19794540
KW
714# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
715# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
2b479609
KW
716#endif
717
718 if ( (flags & UTF8_DISALLOW_SUPER)
ddb65933
KW
719 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
720 {
2b479609
KW
721 return 0; /* Above Unicode */
722 }
723
724 if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT)
725 && UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
726 {
727 return 0; /* Above 31 bits */
728 }
729
730 if (len > 1) {
731 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
732
733 if ( (flags & UTF8_DISALLOW_SUPER)
19794540 734 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
2b479609
KW
735 {
736 return 0; /* Above Unicode */
737 }
738
739 if ( (flags & UTF8_DISALLOW_SURROGATE)
19794540 740 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
2b479609
KW
741 {
742 return 0; /* Surrogate */
743 }
744
745 if ( (flags & UTF8_DISALLOW_NONCHAR)
746 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
747 {
748 return 0; /* Noncharacter code point */
749 }
750 }
751 }
752
753 /* Make sure that all that follows are continuation bytes */
35f8c9bd
KW
754 for (x = s + 1; x < e; x++) {
755 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
756 return 0;
757 }
758 }
759
af13dd8a 760 /* Here is syntactically valid. Next, make sure this isn't the start of an
12a4bed3
KW
761 * overlong. */
762 if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
763 return 0;
af13dd8a
KW
764 }
765
12a4bed3
KW
766 /* And finally, that the code point represented fits in a word on this
767 * platform */
768 if (does_utf8_overflow(s, e)) {
769 return 0;
35f8c9bd
KW
770 }
771
2b479609 772 return UTF8SKIP(s);
35f8c9bd
KW
773}
774
7e2f38b2
KW
775char *
776Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
7cf8d05d
KW
777{
778 /* Returns a mortalized C string that is a displayable copy of the 'len'
7e2f38b2
KW
779 * bytes starting at 's'. 'format' gives how to display each byte.
780 * Currently, there are only two formats, so it is currently a bool:
781 * 0 \xab
782 * 1 ab (that is a space between two hex digit bytes)
783 */
7cf8d05d
KW
784
785 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
786 trailing NUL */
787 const U8 * const e = s + len;
788 char * output;
789 char * d;
790
791 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
792
793 Newx(output, output_len, char);
794 SAVEFREEPV(output);
795
796 d = output;
797 for (; s < e; s++) {
798 const unsigned high_nibble = (*s & 0xF0) >> 4;
799 const unsigned low_nibble = (*s & 0x0F);
800
7e2f38b2
KW
801 if (format) {
802 *d++ = ' ';
803 }
804 else {
805 *d++ = '\\';
806 *d++ = 'x';
807 }
7cf8d05d
KW
808
809 if (high_nibble < 10) {
810 *d++ = high_nibble + '0';
811 }
812 else {
813 *d++ = high_nibble - 10 + 'a';
814 }
815
816 if (low_nibble < 10) {
817 *d++ = low_nibble + '0';
818 }
819 else {
820 *d++ = low_nibble - 10 + 'a';
821 }
822 }
823
824 *d = '\0';
825 return output;
826}
827
806547a7 828PERL_STATIC_INLINE char *
7cf8d05d
KW
829S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
830
831 /* How many bytes to print */
3cc6a05e 832 STRLEN print_len,
7cf8d05d
KW
833
834 /* Which one is the non-continuation */
835 const STRLEN non_cont_byte_pos,
836
837 /* How many bytes should there be? */
838 const STRLEN expect_len)
806547a7
KW
839{
840 /* Return the malformation warning text for an unexpected continuation
841 * byte. */
842
7cf8d05d 843 const char * const where = (non_cont_byte_pos == 1)
806547a7 844 ? "immediately"
7cf8d05d
KW
845 : Perl_form(aTHX_ "%d bytes",
846 (int) non_cont_byte_pos);
806547a7
KW
847
848 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
849
7cf8d05d
KW
850 /* We don't need to pass this parameter, but since it has already been
851 * calculated, it's likely faster to pass it; verify under DEBUGGING */
852 assert(expect_len == UTF8SKIP(s));
853
854 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
855 " %s after start byte 0x%02x; need %d bytes, got %d)",
856 malformed_text,
7e2f38b2 857 _byte_dump_string(s, print_len, 0),
7cf8d05d
KW
858 *(s + non_cont_byte_pos),
859 where,
860 *s,
861 (int) expect_len,
862 (int) non_cont_byte_pos);
806547a7
KW
863}
864
35f8c9bd
KW
865/*
866
de69f3af 867=for apidoc utf8n_to_uvchr
378516de
KW
868
869THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af 870Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
67e989fb 871
9041c2e3 872Bottom level UTF-8 decode routine.
de69f3af 873Returns the native code point value of the first character in the string C<s>,
746afd53
KW
874which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
875C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
876the length, in bytes, of that character.
949cf498
KW
877
878The value of C<flags> determines the behavior when C<s> does not point to a
2b5e7bc2
KW
879well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
880causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
881is the next possible position in C<s> that could begin a non-malformed
882character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
883is raised. Some UTF-8 input sequences may contain multiple malformations.
884This function tries to find every possible one in each call, so multiple
885warnings can be raised for each sequence.
949cf498
KW
886
887Various ALLOW flags can be set in C<flags> to allow (and not warn on)
888individual types of malformations, such as the sequence being overlong (that
889is, when there is a shorter sequence that can express the same code point;
890overlong sequences are expressly forbidden in the UTF-8 standard due to
891potential security issues). Another malformation example is the first byte of
892a character not being a legal first byte. See F<utf8.h> for the list of such
94953955
KW
893flags. Even if allowed, this function generally returns the Unicode
894REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
895F<utf8.h> to override this behavior for the overlong malformations, but don't
896do that except for very specialized purposes.
949cf498 897
796b6530 898The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
899flags) malformation is found. If this flag is set, the routine assumes that
900the caller will raise a warning, and this function will silently just set
d088425d
KW
901C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
902
75200dff 903Note that this API requires disambiguation between successful decoding a C<NUL>
796b6530 904character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
111fa700
KW
905in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
906be set to 1. To disambiguate, upon a zero return, see if the first byte of
907C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
f9380377 908error. Or you can use C<L</utf8n_to_uvchr_error>>.
949cf498
KW
909
910Certain code points are considered problematic. These are Unicode surrogates,
746afd53 911Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 912By default these are considered regular code points, but certain situations
ecc1615f
KW
913warrant special handling for them, which can be specified using the C<flags>
914parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
915three classes are treated as malformations and handled as such. The flags
916C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
917C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
918disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
919restricts the allowed inputs to the strict UTF-8 traditionally defined by
920Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
921definition given by
922L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
923The difference between traditional strictness and C9 strictness is that the
924latter does not forbid non-character code points. (They are still discouraged,
925however.) For more discussion see L<perlunicode/Noncharacter code points>.
926
927The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
928C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
796b6530
KW
929C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
930raised for their respective categories, but otherwise the code points are
931considered valid (not malformations). To get a category to both be treated as
932a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 933(But note that warnings are not raised if lexically disabled nor if
796b6530 934C<UTF8_CHECK_ONLY> is also specified.)
949cf498 935
760c7c2f
KW
936It is now deprecated to have very high code points (above C<IV_MAX> on the
937platforms) and this function will raise a deprecation warning for these (unless
d5944cab 938such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1)
760c7c2f 939in a 32-bit word.
ab8e6d41
KW
940
941Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
942so using them is more problematic than other above-Unicode code points. Perl
943invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
944likely that non-Perl languages will not be able to read files that contain
aff2be59 945these; nor would Perl understand files
ab8e6d41
KW
946written by something that uses a different extension. For these reasons, there
947is a separate set of flags that can warn and/or disallow these extremely high
948code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
949C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags. These
950are entirely independent from the deprecation warning for code points above
951C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
952code point that needs more than 31 bits to represent. When that happens,
953effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
95432-bit machines. (Of course C<UTF8_DISALLOW_SUPER> will treat all
ab8e6d41
KW
955above-Unicode code points, including these, as malformations; and
956C<UTF8_WARN_SUPER> warns on these.)
957
958On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
959extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
960than on ASCII. Prior to that, code points 2**31 and higher were simply
961unrepresentable, and a different, incompatible method was used to represent
962code points between 2**30 and 2**31 - 1. The flags C<UTF8_WARN_ABOVE_31_BIT>
963and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
964platforms, warning and disallowing 2**31 and higher.
949cf498
KW
965
966All other code points corresponding to Unicode characters, including private
967use and those yet to be assigned, are never considered malformed and never
968warn.
67e989fb 969
37607a96 970=cut
f9380377
KW
971
972Also implemented as a macro in utf8.h
973*/
974
975UV
976Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
977 STRLEN curlen,
978 STRLEN *retlen,
979 const U32 flags)
980{
981 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
982
983 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
984}
985
986/*
987
988=for apidoc utf8n_to_uvchr_error
989
990THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
991Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
992
993This function is for code that needs to know what the precise malformation(s)
994are when an error is found.
995
996It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
997all the others, C<errors>. If this parameter is 0, this function behaves
998identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
999to a C<U32> variable, which this function sets to indicate any errors found.
1000Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
1001C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
1002of these bits will be set if a malformation is found, even if the input
7a65503b 1003C<flags> parameter indicates that the given malformation is allowed; those
f9380377
KW
1004exceptions are noted:
1005
1006=over 4
1007
1008=item C<UTF8_GOT_ABOVE_31_BIT>
1009
1010The code point represented by the input UTF-8 sequence occupies more than 31
1011bits.
1012This bit is set only if the input C<flags> parameter contains either the
1013C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
1014
1015=item C<UTF8_GOT_CONTINUATION>
1016
1017The input sequence was malformed in that the first byte was a a UTF-8
1018continuation byte.
1019
1020=item C<UTF8_GOT_EMPTY>
1021
1022The input C<curlen> parameter was 0.
1023
1024=item C<UTF8_GOT_LONG>
1025
1026The input sequence was malformed in that there is some other sequence that
1027evaluates to the same code point, but that sequence is shorter than this one.
1028
fecaf136
KW
1029Until Unicode 3.1, it was legal for programs to accept this malformation, but
1030it was discovered that this created security issues.
1031
f9380377
KW
1032=item C<UTF8_GOT_NONCHAR>
1033
1034The code point represented by the input UTF-8 sequence is for a Unicode
1035non-character code point.
1036This bit is set only if the input C<flags> parameter contains either the
1037C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1038
1039=item C<UTF8_GOT_NON_CONTINUATION>
1040
1041The input sequence was malformed in that a non-continuation type byte was found
1042in a position where only a continuation type one should be.
1043
1044=item C<UTF8_GOT_OVERFLOW>
1045
1046The input sequence was malformed in that it is for a code point that is not
1047representable in the number of bits available in a UV on the current platform.
1048
1049=item C<UTF8_GOT_SHORT>
1050
1051The input sequence was malformed in that C<curlen> is smaller than required for
1052a complete sequence. In other words, the input is for a partial character
1053sequence.
1054
1055=item C<UTF8_GOT_SUPER>
1056
1057The input sequence was malformed in that it is for a non-Unicode code point;
1058that is, one above the legal Unicode maximum.
1059This bit is set only if the input C<flags> parameter contains either the
1060C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1061
1062=item C<UTF8_GOT_SURROGATE>
1063
1064The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
1065code point.
1066This bit is set only if the input C<flags> parameter contains either the
1067C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1068
1069=back
1070
133551d8
KW
1071To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1072flag to suppress any warnings, and then examine the C<*errors> return.
1073
f9380377 1074=cut
37607a96 1075*/
67e989fb 1076
a0ed51b3 1077UV
f9380377
KW
1078Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
1079 STRLEN curlen,
1080 STRLEN *retlen,
1081 const U32 flags,
1082 U32 * errors)
a0ed51b3 1083{
d4c19fe8 1084 const U8 * const s0 = s;
2b5e7bc2
KW
1085 U8 * send = NULL; /* (initialized to silence compilers' wrong
1086 warning) */
1087 U32 possible_problems = 0; /* A bit is set here for each potential problem
1088 found as we go along */
eb83ed87 1089 UV uv = *s;
2b5e7bc2
KW
1090 STRLEN expectlen = 0; /* How long should this sequence be?
1091 (initialized to silence compilers' wrong
1092 warning) */
e308b348 1093 STRLEN avail_len = 0; /* When input is too short, gives what that is */
f9380377
KW
1094 U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
1095 this gets set and discarded */
a0dbb045 1096
2b5e7bc2
KW
1097 /* The below are used only if there is both an overlong malformation and a
1098 * too short one. Otherwise the first two are set to 's0' and 'send', and
1099 * the third not used at all */
1100 U8 * adjusted_s0 = (U8 *) s0;
5ec712b1
KW
1101 U8 * adjusted_send = NULL; /* (Initialized to silence compilers' wrong
1102 warning) */
e9f2c446
KW
1103 U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1104 routine; see [perl #130921] */
2b5e7bc2 1105 UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
7918f24d 1106
f9380377
KW
1107 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1108
1109 if (errors) {
1110 *errors = 0;
1111 }
1112 else {
1113 errors = &discard_errors;
1114 }
a0dbb045 1115
eb83ed87
KW
1116 /* The order of malformation tests here is important. We should consume as
1117 * few bytes as possible in order to not skip any valid character. This is
1118 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1119 * http://unicode.org/reports/tr36 for more discussion as to why. For
1120 * example, once we've done a UTF8SKIP, we can tell the expected number of
1121 * bytes, and could fail right off the bat if the input parameters indicate
1122 * that there are too few available. But it could be that just that first
1123 * byte is garbled, and the intended character occupies fewer bytes. If we
1124 * blindly assumed that the first byte is correct, and skipped based on
1125 * that number, we could skip over a valid input character. So instead, we
1126 * always examine the sequence byte-by-byte.
1127 *
1128 * We also should not consume too few bytes, otherwise someone could inject
1129 * things. For example, an input could be deliberately designed to
1130 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 1131 * returning to the caller C<*retlen> pointing to the very next byte (one
eb83ed87
KW
1132 * which is actually part of of the overflowing sequence), that could look
1133 * legitimate to the caller, which could discard the initial partial
2b5e7bc2
KW
1134 * sequence and process the rest, inappropriately.
1135 *
1136 * Some possible input sequences are malformed in more than one way. This
1137 * function goes to lengths to try to find all of them. This is necessary
1138 * for correctness, as the inputs may allow one malformation but not
1139 * another, and if we abandon searching for others after finding the
1140 * allowed one, we could allow in something that shouldn't have been.
1141 */
eb83ed87 1142
b5b9af04 1143 if (UNLIKELY(curlen == 0)) {
2b5e7bc2
KW
1144 possible_problems |= UTF8_GOT_EMPTY;
1145 curlen = 0;
5a48568d 1146 uv = UNICODE_REPLACEMENT;
2b5e7bc2 1147 goto ready_to_handle_errors;
0c443dc2
JH
1148 }
1149
eb83ed87
KW
1150 expectlen = UTF8SKIP(s);
1151
1152 /* A well-formed UTF-8 character, as the vast majority of calls to this
1153 * function will be for, has this expected length. For efficiency, set
1154 * things up here to return it. It will be overriden only in those rare
1155 * cases where a malformation is found */
1156 if (retlen) {
1157 *retlen = expectlen;
1158 }
1159
1160 /* An invariant is trivially well-formed */
1d72bdf6 1161 if (UTF8_IS_INVARIANT(uv)) {
de69f3af 1162 return uv;
a0ed51b3 1163 }
67e989fb 1164
eb83ed87 1165 /* A continuation character can't start a valid sequence */
b5b9af04 1166 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
2b5e7bc2
KW
1167 possible_problems |= UTF8_GOT_CONTINUATION;
1168 curlen = 1;
1169 uv = UNICODE_REPLACEMENT;
1170 goto ready_to_handle_errors;
ba210ebe 1171 }
9041c2e3 1172
dcd27b3c 1173 /* Here is not a continuation byte, nor an invariant. The only thing left
ddb65933
KW
1174 * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
1175 * because it excludes start bytes like \xC0 that always lead to
1176 * overlongs.) */
dcd27b3c 1177
534752c1
KW
1178 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1179 * that indicate the number of bytes in the character's whole UTF-8
1180 * sequence, leaving just the bits that are part of the value. */
1181 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
ba210ebe 1182
e308b348
KW
1183 /* Setup the loop end point, making sure to not look past the end of the
1184 * input string, and flag it as too short if the size isn't big enough. */
1185 send = (U8*) s0;
1186 if (UNLIKELY(curlen < expectlen)) {
1187 possible_problems |= UTF8_GOT_SHORT;
1188 avail_len = curlen;
1189 send += curlen;
1190 }
1191 else {
1192 send += expectlen;
1193 }
1194 adjusted_send = send;
1195
eb83ed87 1196 /* Now, loop through the remaining bytes in the character's sequence,
e308b348 1197 * accumulating each into the working value as we go. */
eb83ed87 1198 for (s = s0 + 1; s < send; s++) {
b5b9af04 1199 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
8850bf83 1200 uv = UTF8_ACCUMULATE(uv, *s);
2b5e7bc2
KW
1201 continue;
1202 }
1203
1204 /* Here, found a non-continuation before processing all expected bytes.
1205 * This byte indicates the beginning of a new character, so quit, even
1206 * if allowing this malformation. */
2b5e7bc2 1207 possible_problems |= UTF8_GOT_NON_CONTINUATION;
e308b348 1208 break;
eb83ed87
KW
1209 } /* End of loop through the character's bytes */
1210
1211 /* Save how many bytes were actually in the character */
1212 curlen = s - s0;
1213
2b5e7bc2
KW
1214 /* Note that there are two types of too-short malformation. One is when
1215 * there is actual wrong data before the normal termination of the
1216 * sequence. The other is that the sequence wasn't complete before the end
1217 * of the data we are allowed to look at, based on the input 'curlen'.
1218 * This means that we were passed data for a partial character, but it is
1219 * valid as far as we saw. The other is definitely invalid. This
1220 * distinction could be important to a caller, so the two types are kept
15b010f0
KW
1221 * separate.
1222 *
1223 * A convenience macro that matches either of the too-short conditions. */
1224# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1225
1226 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1227 uv_so_far = uv;
1228 uv = UNICODE_REPLACEMENT;
1229 }
2b5e7bc2
KW
1230
1231 /* Check for overflow */
1232 if (UNLIKELY(does_utf8_overflow(s0, send))) {
1233 possible_problems |= UTF8_GOT_OVERFLOW;
1234 uv = UNICODE_REPLACEMENT;
eb83ed87 1235 }
eb83ed87 1236
2b5e7bc2
KW
1237 /* Check for overlong. If no problems so far, 'uv' is the correct code
1238 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1239 * we must look at the UTF-8 byte sequence itself to see if it is for an
1240 * overlong */
1241 if ( ( LIKELY(! possible_problems)
1242 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1243 || ( UNLIKELY( possible_problems)
1244 && ( UNLIKELY(! UTF8_IS_START(*s0))
1245 || ( curlen > 1
1246 && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
1247 send - s0))))))
2f8f112e 1248 {
2b5e7bc2
KW
1249 possible_problems |= UTF8_GOT_LONG;
1250
abc28b54
KW
1251 if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
1252 /* The calculation in the 'true' branch of this 'if'
1253 * below won't work if overflows, and isn't needed
1254 * anyway. Further below we handle all overflow
1255 * cases */
1256 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1257 {
2b5e7bc2
KW
1258 UV min_uv = uv_so_far;
1259 STRLEN i;
1260
1261 /* Here, the input is both overlong and is missing some trailing
1262 * bytes. There is no single code point it could be for, but there
1263 * may be enough information present to determine if what we have
1264 * so far is for an unallowed code point, such as for a surrogate.
1265 * The code below has the intelligence to determine this, but just
1266 * for non-overlong UTF-8 sequences. What we do here is calculate
1267 * the smallest code point the input could represent if there were
1268 * no too short malformation. Then we compute and save the UTF-8
1269 * for that, which is what the code below looks at instead of the
1270 * raw input. It turns out that the smallest such code point is
1271 * all we need. */
1272 for (i = curlen; i < expectlen; i++) {
1273 min_uv = UTF8_ACCUMULATE(min_uv,
1274 I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
1275 }
1276
e9f2c446 1277 adjusted_s0 = temp_char_buf;
2b5e7bc2
KW
1278 adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1279 }
eb83ed87
KW
1280 }
1281
2b5e7bc2
KW
1282 /* Now check that the input isn't for a problematic code point not allowed
1283 * by the input parameters. */
1284 /* isn't problematic if < this */
1285 if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
1286 || ( UNLIKELY(possible_problems)
d60baaa7
KW
1287
1288 /* if overflow, we know without looking further
1289 * precisely which of the problematic types it is,
1290 * and we deal with those in the overflow handling
1291 * code */
1292 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
2b5e7bc2 1293 && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
760c7c2f
KW
1294 && ((flags & ( UTF8_DISALLOW_NONCHAR
1295 |UTF8_DISALLOW_SURROGATE
1296 |UTF8_DISALLOW_SUPER
1297 |UTF8_DISALLOW_ABOVE_31_BIT
1298 |UTF8_WARN_NONCHAR
1299 |UTF8_WARN_SURROGATE
1300 |UTF8_WARN_SUPER
1301 |UTF8_WARN_ABOVE_31_BIT))
2b5e7bc2
KW
1302 /* In case of a malformation, 'uv' is not valid, and has
1303 * been changed to something in the Unicode range.
1304 * Currently we don't output a deprecation message if there
1305 * is already a malformation, so we don't have to special
1306 * case the test immediately below */
760c7c2f
KW
1307 || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1308 && ckWARN_d(WARN_DEPRECATED))))
eb83ed87 1309 {
2b5e7bc2
KW
1310 /* If there were no malformations, or the only malformation is an
1311 * overlong, 'uv' is valid */
1312 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1313 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1314 possible_problems |= UTF8_GOT_SURROGATE;
1315 }
1316 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
1317 possible_problems |= UTF8_GOT_SUPER;
1318 }
1319 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1320 possible_problems |= UTF8_GOT_NONCHAR;
1321 }
1322 }
1323 else { /* Otherwise, need to look at the source UTF-8, possibly
1324 adjusted to be non-overlong */
1325
1326 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1327 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
ea5ced44 1328 {
2b5e7bc2
KW
1329 possible_problems |= UTF8_GOT_SUPER;
1330 }
1331 else if (curlen > 1) {
1332 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
1333 NATIVE_UTF8_TO_I8(*adjusted_s0),
1334 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
ea5ced44 1335 {
2b5e7bc2 1336 possible_problems |= UTF8_GOT_SUPER;
ea5ced44 1337 }
2b5e7bc2
KW
1338 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
1339 NATIVE_UTF8_TO_I8(*adjusted_s0),
1340 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1341 {
1342 possible_problems |= UTF8_GOT_SURROGATE;
ea5ced44
KW
1343 }
1344 }
c0236afe 1345
2b5e7bc2
KW
1346 /* We need a complete well-formed UTF-8 character to discern
1347 * non-characters, so can't look for them here */
1348 }
1349 }
949cf498 1350
2b5e7bc2
KW
1351 ready_to_handle_errors:
1352
1353 /* At this point:
1354 * curlen contains the number of bytes in the sequence that
1355 * this call should advance the input by.
e308b348
KW
1356 * avail_len gives the available number of bytes passed in, but
1357 * only if this is less than the expected number of
1358 * bytes, based on the code point's start byte.
2b5e7bc2
KW
1359 * possible_problems' is 0 if there weren't any problems; otherwise a bit
1360 * is set in it for each potential problem found.
1361 * uv contains the code point the input sequence
1362 * represents; or if there is a problem that prevents
1363 * a well-defined value from being computed, it is
1364 * some subsitute value, typically the REPLACEMENT
1365 * CHARACTER.
1366 * s0 points to the first byte of the character
1367 * send points to just after where that (potentially
1368 * partial) character ends
1369 * adjusted_s0 normally is the same as s0, but in case of an
1370 * overlong for which the UTF-8 matters below, it is
1371 * the first byte of the shortest form representation
1372 * of the input.
1373 * adjusted_send normally is the same as 'send', but if adjusted_s0
1374 * is set to something other than s0, this points one
1375 * beyond its end
1376 */
eb83ed87 1377
2b5e7bc2
KW
1378 if (UNLIKELY(possible_problems)) {
1379 bool disallowed = FALSE;
1380 const U32 orig_problems = possible_problems;
1381
1382 while (possible_problems) { /* Handle each possible problem */
1383 UV pack_warn = 0;
1384 char * message = NULL;
1385
1386 /* Each 'if' clause handles one problem. They are ordered so that
1387 * the first ones' messages will be displayed before the later
1388 * ones; this is kinda in decreasing severity order */
1389 if (possible_problems & UTF8_GOT_OVERFLOW) {
1390
1391 /* Overflow means also got a super and above 31 bits, but we
1392 * handle all three cases here */
1393 possible_problems
1394 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
f9380377
KW
1395 *errors |= UTF8_GOT_OVERFLOW;
1396
1397 /* But the API says we flag all errors found */
1398 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1399 *errors |= UTF8_GOT_SUPER;
1400 }
ddb65933
KW
1401 if (flags
1402 & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT))
1403 {
f9380377
KW
1404 *errors |= UTF8_GOT_ABOVE_31_BIT;
1405 }
2b5e7bc2 1406
d60baaa7
KW
1407 /* Disallow if any of the three categories say to */
1408 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
1409 || (flags & ( UTF8_DISALLOW_SUPER
1410 |UTF8_DISALLOW_ABOVE_31_BIT)))
1411 {
1412 disallowed = TRUE;
1413 }
1414
1415
1416 /* Likewise, warn if any say to, plus if deprecation warnings
1417 * are on, because this code point is above IV_MAX */
1418 if ( ckWARN_d(WARN_DEPRECATED)
1419 || ! (flags & UTF8_ALLOW_OVERFLOW)
1420 || (flags & (UTF8_WARN_SUPER|UTF8_WARN_ABOVE_31_BIT)))
1421 {
2b5e7bc2 1422
ddb65933
KW
1423 /* The warnings code explicitly says it doesn't handle the
1424 * case of packWARN2 and two categories which have
1425 * parent-child relationship. Even if it works now to
1426 * raise the warning if either is enabled, it wouldn't
1427 * necessarily do so in the future. We output (only) the
1428 * most dire warning*/
1429 if (! (flags & UTF8_CHECK_ONLY)) {
1430 if (ckWARN_d(WARN_UTF8)) {
1431 pack_warn = packWARN(WARN_UTF8);
1432 }
1433 else if (ckWARN_d(WARN_NON_UNICODE)) {
1434 pack_warn = packWARN(WARN_NON_UNICODE);
1435 }
1436 if (pack_warn) {
1437 message = Perl_form(aTHX_ "%s: %s (overflows)",
1438 malformed_text,
05b9033b 1439 _byte_dump_string(s0, curlen, 0));
ddb65933 1440 }
2b5e7bc2
KW
1441 }
1442 }
1443 }
1444 else if (possible_problems & UTF8_GOT_EMPTY) {
1445 possible_problems &= ~UTF8_GOT_EMPTY;
f9380377 1446 *errors |= UTF8_GOT_EMPTY;
2b5e7bc2
KW
1447
1448 if (! (flags & UTF8_ALLOW_EMPTY)) {
d1f8d421
KW
1449
1450 /* This so-called malformation is now treated as a bug in
1451 * the caller. If you have nothing to decode, skip calling
1452 * this function */
1453 assert(0);
1454
2b5e7bc2
KW
1455 disallowed = TRUE;
1456 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1457 pack_warn = packWARN(WARN_UTF8);
1458 message = Perl_form(aTHX_ "%s (empty string)",
1459 malformed_text);
1460 }
1461 }
1462 }
1463 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1464 possible_problems &= ~UTF8_GOT_CONTINUATION;
f9380377 1465 *errors |= UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1466
1467 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1468 disallowed = TRUE;
1469 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1470 pack_warn = packWARN(WARN_UTF8);
1471 message = Perl_form(aTHX_
1472 "%s: %s (unexpected continuation byte 0x%02x,"
1473 " with no preceding start byte)",
1474 malformed_text,
7e2f38b2 1475 _byte_dump_string(s0, 1, 0), *s0);
2b5e7bc2
KW
1476 }
1477 }
1478 }
2b5e7bc2
KW
1479 else if (possible_problems & UTF8_GOT_SHORT) {
1480 possible_problems &= ~UTF8_GOT_SHORT;
f9380377 1481 *errors |= UTF8_GOT_SHORT;
2b5e7bc2
KW
1482
1483 if (! (flags & UTF8_ALLOW_SHORT)) {
1484 disallowed = TRUE;
1485 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1486 pack_warn = packWARN(WARN_UTF8);
1487 message = Perl_form(aTHX_
9a6c9c81 1488 "%s: %s (too short; %d byte%s available, need %d)",
2b5e7bc2 1489 malformed_text,
7e2f38b2 1490 _byte_dump_string(s0, send - s0, 0),
e308b348
KW
1491 (int)avail_len,
1492 avail_len == 1 ? "" : "s",
2b5e7bc2
KW
1493 (int)expectlen);
1494 }
1495 }
ba210ebe 1496
2b5e7bc2 1497 }
e308b348
KW
1498 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1499 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1500 *errors |= UTF8_GOT_NON_CONTINUATION;
1501
1502 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1503 disallowed = TRUE;
1504 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
99a765e9
KW
1505
1506 /* If we don't know for sure that the input length is
1507 * valid, avoid as much as possible reading past the
1508 * end of the buffer */
1509 int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
1510 ? s - s0
1511 : send - s0;
e308b348
KW
1512 pack_warn = packWARN(WARN_UTF8);
1513 message = Perl_form(aTHX_ "%s",
1514 unexpected_non_continuation_text(s0,
99a765e9 1515 printlen,
e308b348
KW
1516 s - s0,
1517 (int) expectlen));
1518 }
1519 }
1520 }
2b5e7bc2
KW
1521 else if (possible_problems & UTF8_GOT_LONG) {
1522 possible_problems &= ~UTF8_GOT_LONG;
f9380377 1523 *errors |= UTF8_GOT_LONG;
2b5e7bc2 1524
94953955
KW
1525 if (flags & UTF8_ALLOW_LONG) {
1526
1527 /* We don't allow the actual overlong value, unless the
1528 * special extra bit is also set */
1529 if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
1530 & ~UTF8_ALLOW_LONG)))
1531 {
1532 uv = UNICODE_REPLACEMENT;
1533 }
1534 }
1535 else {
2b5e7bc2
KW
1536 disallowed = TRUE;
1537
1538 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1539 pack_warn = packWARN(WARN_UTF8);
1540
1541 /* These error types cause 'uv' to be something that
1542 * isn't what was intended, so can't use it in the
1543 * message. The other error types either can't
1544 * generate an overlong, or else the 'uv' is valid */
1545 if (orig_problems &
1546 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1547 {
1548 message = Perl_form(aTHX_
1549 "%s: %s (any UTF-8 sequence that starts"
1550 " with \"%s\" is overlong which can and"
1551 " should be represented with a"
1552 " different, shorter sequence)",
1553 malformed_text,
7e2f38b2
KW
1554 _byte_dump_string(s0, send - s0, 0),
1555 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
1556 }
1557 else {
1558 U8 tmpbuf[UTF8_MAXBYTES+1];
1559 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
1560 uv, 0);
1561 message = Perl_form(aTHX_
1562 "%s: %s (overlong; instead use %s to represent"
147e3846 1563 " U+%0*" UVXf ")",
2b5e7bc2 1564 malformed_text,
05b9033b 1565 _byte_dump_string(s0, curlen, 0),
7e2f38b2 1566 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
2b5e7bc2
KW
1567 ((uv < 256) ? 2 : 4), /* Field width of 2 for
1568 small code points */
1569 uv);
1570 }
1571 }
1572 }
1573 }
1574 else if (possible_problems & UTF8_GOT_SURROGATE) {
1575 possible_problems &= ~UTF8_GOT_SURROGATE;
1576
f9380377
KW
1577 if (flags & UTF8_WARN_SURROGATE) {
1578 *errors |= UTF8_GOT_SURROGATE;
1579
1580 if ( ! (flags & UTF8_CHECK_ONLY)
1581 && ckWARN_d(WARN_SURROGATE))
1582 {
2b5e7bc2
KW
1583 pack_warn = packWARN(WARN_SURROGATE);
1584
1585 /* These are the only errors that can occur with a
1586 * surrogate when the 'uv' isn't valid */
1587 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1588 message = Perl_form(aTHX_
1589 "UTF-16 surrogate (any UTF-8 sequence that"
1590 " starts with \"%s\" is for a surrogate)",
7e2f38b2 1591 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
1592 }
1593 else {
1594 message = Perl_form(aTHX_
147e3846 1595 "UTF-16 surrogate U+%04" UVXf, uv);
2b5e7bc2 1596 }
f9380377 1597 }
2b5e7bc2 1598 }
ba210ebe 1599
2b5e7bc2
KW
1600 if (flags & UTF8_DISALLOW_SURROGATE) {
1601 disallowed = TRUE;
f9380377 1602 *errors |= UTF8_GOT_SURROGATE;
2b5e7bc2
KW
1603 }
1604 }
1605 else if (possible_problems & UTF8_GOT_SUPER) {
1606 possible_problems &= ~UTF8_GOT_SUPER;
949cf498 1607
f9380377
KW
1608 if (flags & UTF8_WARN_SUPER) {
1609 *errors |= UTF8_GOT_SUPER;
1610
1611 if ( ! (flags & UTF8_CHECK_ONLY)
1612 && ckWARN_d(WARN_NON_UNICODE))
1613 {
2b5e7bc2
KW
1614 pack_warn = packWARN(WARN_NON_UNICODE);
1615
1616 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1617 message = Perl_form(aTHX_
1618 "Any UTF-8 sequence that starts with"
1619 " \"%s\" is for a non-Unicode code point,"
1620 " may not be portable",
7e2f38b2 1621 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
1622 }
1623 else {
1624 message = Perl_form(aTHX_
147e3846 1625 "Code point 0x%04" UVXf " is not"
2b5e7bc2
KW
1626 " Unicode, may not be portable",
1627 uv);
1628 }
f9380377 1629 }
2b5e7bc2 1630 }
ba210ebe 1631
2b5e7bc2
KW
1632 /* The maximum code point ever specified by a standard was
1633 * 2**31 - 1. Anything larger than that is a Perl extension
1634 * that very well may not be understood by other applications
1635 * (including earlier perl versions on EBCDIC platforms). We
1636 * test for these after the regular SUPER ones, and before
1637 * possibly bailing out, so that the slightly more dire warning
1638 * will override the regular one. */
1639 if ( (flags & (UTF8_WARN_ABOVE_31_BIT
1640 |UTF8_WARN_SUPER
1641 |UTF8_DISALLOW_ABOVE_31_BIT))
1642 && ( ( UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
1643 && UNLIKELY(is_utf8_cp_above_31_bits(
1644 adjusted_s0,
1645 adjusted_send)))
1646 || ( LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
1647 && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv)))))
1648 {
1649 if ( ! (flags & UTF8_CHECK_ONLY)
1650 && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
1651 && ckWARN_d(WARN_UTF8))
1652 {
1653 pack_warn = packWARN(WARN_UTF8);
1654
1655 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1656 message = Perl_form(aTHX_
1657 "Any UTF-8 sequence that starts with"
1658 " \"%s\" is for a non-Unicode code"
1659 " point, and is not portable",
7e2f38b2 1660 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
1661 }
1662 else {
1663 message = Perl_form(aTHX_
147e3846 1664 "Code point 0x%" UVXf " is not Unicode,"
2b5e7bc2
KW
1665 " and not portable",
1666 uv);
1667 }
1668 }
1669
ddb65933
KW
1670 if (flags & ( UTF8_WARN_ABOVE_31_BIT
1671 |UTF8_DISALLOW_ABOVE_31_BIT))
1672 {
f9380377
KW
1673 *errors |= UTF8_GOT_ABOVE_31_BIT;
1674
1675 if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
1676 disallowed = TRUE;
1677 }
2b5e7bc2
KW
1678 }
1679 }
eb83ed87 1680
2b5e7bc2 1681 if (flags & UTF8_DISALLOW_SUPER) {
f9380377 1682 *errors |= UTF8_GOT_SUPER;
2b5e7bc2
KW
1683 disallowed = TRUE;
1684 }
eb83ed87 1685
2b5e7bc2
KW
1686 /* The deprecated warning overrides any non-deprecated one. If
1687 * there are other problems, a deprecation message is not
1688 * really helpful, so don't bother to raise it in that case.
1689 * This also keeps the code from having to handle the case
1690 * where 'uv' is not valid. */
1691 if ( ! (orig_problems
1692 & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
76513bdc
KW
1693 && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1694 && ckWARN_d(WARN_DEPRECATED))
1695 {
1696 message = Perl_form(aTHX_ cp_above_legal_max,
1697 uv, MAX_NON_DEPRECATED_CP);
1698 pack_warn = packWARN(WARN_DEPRECATED);
2b5e7bc2
KW
1699 }
1700 }
1701 else if (possible_problems & UTF8_GOT_NONCHAR) {
1702 possible_problems &= ~UTF8_GOT_NONCHAR;
ba210ebe 1703
f9380377
KW
1704 if (flags & UTF8_WARN_NONCHAR) {
1705 *errors |= UTF8_GOT_NONCHAR;
1706
1707 if ( ! (flags & UTF8_CHECK_ONLY)
1708 && ckWARN_d(WARN_NONCHAR))
1709 {
2b5e7bc2
KW
1710 /* The code above should have guaranteed that we don't
1711 * get here with errors other than overlong */
1712 assert (! (orig_problems
1713 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
1714
1715 pack_warn = packWARN(WARN_NONCHAR);
1716 message = Perl_form(aTHX_ "Unicode non-character"
147e3846 1717 " U+%04" UVXf " is not recommended"
2b5e7bc2 1718 " for open interchange", uv);
f9380377 1719 }
2b5e7bc2 1720 }
5b311467 1721
2b5e7bc2
KW
1722 if (flags & UTF8_DISALLOW_NONCHAR) {
1723 disallowed = TRUE;
f9380377 1724 *errors |= UTF8_GOT_NONCHAR;
2b5e7bc2
KW
1725 }
1726 } /* End of looking through the possible flags */
1727
1728 /* Display the message (if any) for the problem being handled in
1729 * this iteration of the loop */
1730 if (message) {
1731 if (PL_op)
1732 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
1733 OP_DESC(PL_op));
1734 else
1735 Perl_warner(aTHX_ pack_warn, "%s", message);
1736 }
ddb65933 1737 } /* End of 'while (possible_problems)' */
a0dbb045 1738
2b5e7bc2
KW
1739 /* Since there was a possible problem, the returned length may need to
1740 * be changed from the one stored at the beginning of this function.
1741 * Instead of trying to figure out if that's needed, just do it. */
1742 if (retlen) {
1743 *retlen = curlen;
1744 }
a0dbb045 1745
2b5e7bc2
KW
1746 if (disallowed) {
1747 if (flags & UTF8_CHECK_ONLY && retlen) {
1748 *retlen = ((STRLEN) -1);
1749 }
1750 return 0;
1751 }
eb83ed87 1752 }
ba210ebe 1753
2b5e7bc2 1754 return UNI_TO_NATIVE(uv);
a0ed51b3
LW
1755}
1756
8e84507e 1757/*
ec5f19d0
KW
1758=for apidoc utf8_to_uvchr_buf
1759
1760Returns the native code point of the first character in the string C<s> which
1761is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 1762C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 1763
524080c4
KW
1764If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1765enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 1766C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 1767(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 1768C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 1769the next possible position in C<s> that could begin a non-malformed character.
de69f3af 1770See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 1771returned.
ec5f19d0 1772
760c7c2f
KW
1773Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1774unless those are turned off.
1775
ec5f19d0 1776=cut
52be2536
KW
1777
1778Also implemented as a macro in utf8.h
1779
ec5f19d0
KW
1780*/
1781
1782
1783UV
1784Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1785{
7f974d7e
KW
1786 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
1787
ec5f19d0
KW
1788 assert(s < send);
1789
1790 return utf8n_to_uvchr(s, send - s, retlen,
ddb65933 1791 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
ec5f19d0
KW
1792}
1793
52be2536
KW
1794/* This is marked as deprecated
1795 *
ec5f19d0
KW
1796=for apidoc utf8_to_uvuni_buf
1797
de69f3af
KW
1798Only in very rare circumstances should code need to be dealing in Unicode
1799(as opposed to native) code points. In those few cases, use
1800C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
4f83cdcd
KW
1801
1802Returns the Unicode (not-native) code point of the first character in the
1803string C<s> which
ec5f19d0
KW
1804is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1805C<retlen> will be set to the length, in bytes, of that character.
1806
524080c4
KW
1807If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1808enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1809NULL) to -1. If those warnings are off, the computed value if well-defined (or
1810the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1811is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1812next possible position in C<s> that could begin a non-malformed character.
de69f3af 1813See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0 1814
760c7c2f
KW
1815Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1816unless those are turned off.
1817
ec5f19d0
KW
1818=cut
1819*/
1820
1821UV
1822Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1823{
1824 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1825
1826 assert(send > s);
1827
5962d97e
KW
1828 /* Call the low level routine, asking for checks */
1829 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
ec5f19d0
KW
1830}
1831
b76347f2 1832/*
87cea99e 1833=for apidoc utf8_length
b76347f2
JH
1834
1835Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
1836Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
1837up past C<e>, croaks.
b76347f2
JH
1838
1839=cut
1840*/
1841
1842STRLEN
35a4481c 1843Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
1844{
1845 STRLEN len = 0;
1846
7918f24d
NC
1847 PERL_ARGS_ASSERT_UTF8_LENGTH;
1848
8850bf83
JH
1849 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1850 * the bitops (especially ~) can create illegal UTF-8.
1851 * In other words: in Perl UTF-8 is not just for Unicode. */
1852
a3b680e6
AL
1853 if (e < s)
1854 goto warn_and_return;
b76347f2 1855 while (s < e) {
4cbf4130 1856 s += UTF8SKIP(s);
8e91ec7f
AV
1857 len++;
1858 }
1859
1860 if (e != s) {
1861 len--;
1862 warn_and_return:
9b387841
NC
1863 if (PL_op)
1864 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1865 "%s in %s", unees, OP_DESC(PL_op));
1866 else
61a12c31 1867 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
1868 }
1869
1870 return len;
1871}
1872
b06226ff 1873/*
fed3ba5d
NC
1874=for apidoc bytes_cmp_utf8
1875
a1433954 1876Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
1877sequence of characters (stored as UTF-8)
1878in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
1879equal, -1 or -2 if the first string is less than the second string, +1 or +2
1880if the first string is greater than the second string.
1881
1882-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
1883longer string. -2 or +2 is returned if
1884there was a difference between characters
fed3ba5d
NC
1885within the strings.
1886
1887=cut
1888*/
1889
1890int
1891Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1892{
1893 const U8 *const bend = b + blen;
1894 const U8 *const uend = u + ulen;
1895
1896 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
1897
1898 while (b < bend && u < uend) {
1899 U8 c = *u++;
1900 if (!UTF8_IS_INVARIANT(c)) {
1901 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1902 if (u < uend) {
1903 U8 c1 = *u++;
1904 if (UTF8_IS_CONTINUATION(c1)) {
a62b247b 1905 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
fed3ba5d 1906 } else {
2b5e7bc2 1907 /* diag_listed_as: Malformed UTF-8 character%s */
fed3ba5d 1908 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
806547a7 1909 "%s %s%s",
1d5030e1 1910 unexpected_non_continuation_text(u - 2, 2, 1, 2),
806547a7
KW
1911 PL_op ? " in " : "",
1912 PL_op ? OP_DESC(PL_op) : "");
fed3ba5d
NC
1913 return -2;
1914 }
1915 } else {
1916 if (PL_op)
1917 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1918 "%s in %s", unees, OP_DESC(PL_op));
1919 else
61a12c31 1920 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
1921 return -2; /* Really want to return undef :-) */
1922 }
1923 } else {
1924 return -2;
1925 }
1926 }
1927 if (*b != c) {
1928 return *b < c ? -2 : +2;
1929 }
1930 ++b;
1931 }
1932
1933 if (b == bend && u == uend)
1934 return 0;
1935
1936 return b < bend ? +1 : -1;
1937}
1938
1939/*
87cea99e 1940=for apidoc utf8_to_bytes
6940069f 1941
3bc0c78c 1942Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
a1433954 1943Unlike L</bytes_to_utf8>, this over-writes the original string, and
09af0336 1944updates C<*lenp> to contain the new length.
3bc0c78c
KW
1945Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
1946
1947Upon successful return, the number of variants in the string can be computed by
23b37b12
KW
1948having saved the value of C<*lenp> before the call, and subtracting the
1949after-call value of C<*lenp> from it.
6940069f 1950
a1433954 1951If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 1952
6940069f
GS
1953=cut
1954*/
1955
1956U8 *
09af0336 1957Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
6940069f 1958{
9fe0d3c2 1959 U8 * first_variant;
246fae53 1960
7918f24d 1961 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 1962 PERL_UNUSED_CONTEXT;
7918f24d 1963
9fe0d3c2 1964 /* This is a no-op if no variants at all in the input */
09af0336 1965 if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
9fe0d3c2
KW
1966 return s;
1967 }
1968
1969 {
3c5aa262 1970 U8 * const save = s;
09af0336 1971 U8 * const send = s + *lenp;
3c5aa262
KW
1972 U8 * d;
1973
1974 /* Nothing before the first variant needs to be changed, so start the real
1975 * work there */
1976 s = first_variant;
1977 while (s < send) {
1978 if (! UTF8_IS_INVARIANT(*s)) {
1979 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
09af0336 1980 *lenp = ((STRLEN) -1);
3c5aa262
KW
1981 return 0;
1982 }
1983 s++;
d59937ca
KW
1984 }
1985 s++;
dcad2880 1986 }
dcad2880 1987
3c5aa262
KW
1988 /* Is downgradable, so do it */
1989 d = s = first_variant;
1990 while (s < send) {
1991 U8 c = *s++;
1992 if (! UVCHR_IS_INVARIANT(c)) {
1993 /* Then it is two-byte encoded */
1994 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
1995 s++;
1996 }
1997 *d++ = c;
1998 }
1999 *d = '\0';
09af0336 2000 *lenp = d - save;
3c5aa262
KW
2001
2002 return save;
9fe0d3c2 2003 }
6940069f
GS
2004}
2005
2006/*
87cea99e 2007=for apidoc bytes_from_utf8
f9a63242 2008
09af0336 2009Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
41ae6089 2010byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
4f3d592d
KW
2011actually encoded in UTF-8.
2012
2013Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2014the input string.
2015
41ae6089
KW
2016Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2017not expressible in native byte encoding. In these cases, C<*is_utf8p> and
09af0336 2018C<*lenp> are unchanged, and the return value is the original C<s>.
4f3d592d 2019
41ae6089 2020Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
4f3d592d 2021newly created string containing a downgraded copy of C<s>, and whose length is
23b37b12 2022returned in C<*lenp>, updated. The new string is C<NUL>-terminated.
f9a63242 2023
3bc0c78c 2024Upon successful return, the number of variants in the string can be computed by
23b37b12
KW
2025having saved the value of C<*lenp> before the call, and subtracting the
2026after-call value of C<*lenp> from it.
3bc0c78c 2027
37607a96 2028=cut
976c1b08
KW
2029
2030There is a macro that avoids this function call, but this is retained for
2031anyone who calls it with the Perl_ prefix */
f9a63242
JH
2032
2033U8 *
41ae6089 2034Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
f9a63242 2035{
7918f24d 2036 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 2037 PERL_UNUSED_CONTEXT;
f9a63242 2038
976c1b08
KW
2039 return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2040}
2041
2042/*
2043No = here because currently externally undocumented
2044for apidoc bytes_from_utf8_loc
2045
2046Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where
2047to store the location of the first character in C<"s"> that cannot be
2048converted to non-UTF8.
2049
2050If that parameter is C<NULL>, this function behaves identically to
2051C<bytes_from_utf8>.
2052
2053Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2054C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2055
2056Otherwise, the function returns a newly created C<NUL>-terminated string
2057containing the non-UTF8 equivalent of the convertible first portion of
2058C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>.
2059If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2060and C<*first_non_downgradable> is set to C<NULL>.
2061
2062Otherwise, C<*first_non_downgradable> set to point to the first byte of the
2063first character in the original string that wasn't converted. C<*is_utf8p> is
2064unchanged. Note that the new string may have length 0.
2065
2066Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2067C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2068converts as many characters in it as possible stopping at the first one it
385b74be 2069finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is
976c1b08
KW
2070set to point to that. The function returns the portion that could be converted
2071in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2072not including the terminating C<NUL>. If the very first character in the
2073original could not be converted, C<*lenp> will be 0, and the new string will
2074contain just a single C<NUL>. If the entire input string was converted,
2075C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2076
2077Upon successful return, the number of variants in the converted portion of the
2078string can be computed by having saved the value of C<*lenp> before the call,
2079and subtracting the after-call value of C<*lenp> from it.
2080
2081=cut
2082
2083
2084*/
2085
2086U8 *
2087Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2088{
2089 U8 *d;
2090 const U8 *original = s;
2091 U8 *converted_start;
2092 const U8 *send = s + *lenp;
f9a63242 2093
976c1b08 2094 PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
170a1c22 2095
976c1b08
KW
2096 if (! *is_utf8p) {
2097 if (first_unconverted) {
2098 *first_unconverted = NULL;
2099 }
2100
2101 return (U8 *) original;
2102 }
2103
2104 Newx(d, (*lenp) + 1, U8);
2105
2106 converted_start = d;
7299a045
KW
2107 while (s < send) {
2108 U8 c = *s++;
2109 if (! UTF8_IS_INVARIANT(c)) {
976c1b08
KW
2110
2111 /* Then it is multi-byte encoded. If the code point is above 0xFF,
2112 * have to stop now */
2113 if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2114 if (first_unconverted) {
2115 *first_unconverted = s - 1;
2116 goto finish_and_return;
2117 }
2118 else {
2119 Safefree(converted_start);
2120 return (U8 *) original;
2121 }
2122 }
2123
7299a045
KW
2124 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2125 s++;
38af28cf 2126 }
7299a045
KW
2127 *d++ = c;
2128 }
170a1c22 2129
976c1b08
KW
2130 /* Here, converted the whole of the input */
2131 *is_utf8p = FALSE;
2132 if (first_unconverted) {
2133 *first_unconverted = NULL;
170a1c22 2134 }
976c1b08
KW
2135
2136 finish_and_return:
2137 *d = '\0';
2138 *lenp = d - converted_start;
2139
2140 /* Trim unused space */
2141 Renew(converted_start, *lenp + 1, U8);
2142
2143 return converted_start;
f9a63242
JH
2144}
2145
2146/*
87cea99e 2147=for apidoc bytes_to_utf8
6940069f 2148
09af0336 2149Converts a string C<s> of length C<*lenp> bytes from the native encoding into
ff97e5cf 2150UTF-8.
09af0336 2151Returns a pointer to the newly-created string, and sets C<*lenp> to
ff97e5cf 2152reflect the new length in bytes.
6940069f 2153
3bc0c78c 2154Upon successful return, the number of variants in the string can be computed by
23b37b12 2155having saved the value of C<*lenp> before the call, and subtracting it from the
3bc0c78c
KW
2156after-call value of C<*lenp>.
2157
75200dff 2158A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
2159
2160If you want to convert to UTF-8 from encodings other than
2161the native (Latin1 or EBCDIC),
a1433954 2162see L</sv_recode_to_utf8>().
c9ada85f 2163
497711e7 2164=cut
6940069f
GS
2165*/
2166
2167U8*
09af0336 2168Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
6940069f 2169{
09af0336 2170 const U8 * const send = s + (*lenp);
6940069f
GS
2171 U8 *d;
2172 U8 *dst;
7918f24d
NC
2173
2174 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 2175 PERL_UNUSED_CONTEXT;
6940069f 2176
09af0336 2177 Newx(d, (*lenp) * 2 + 1, U8);
6940069f
GS
2178 dst = d;
2179
2180 while (s < send) {
55d09dc8
KW
2181 append_utf8_from_native_byte(*s, &d);
2182 s++;
6940069f
GS
2183 }
2184 *d = '\0';
09af0336 2185 *lenp = d-dst;
6940069f
GS
2186 return dst;
2187}
2188
a0ed51b3 2189/*
dea0fc0b 2190 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
2191 *
2192 * Destination must be pre-extended to 3/2 source. Do not use in-place.
2193 * We optimize for native, for obvious reasons. */
2194
2195U8*
dea0fc0b 2196Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 2197{
dea0fc0b
JH
2198 U8* pend;
2199 U8* dstart = d;
2200
7918f24d
NC
2201 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2202
dea0fc0b 2203 if (bytelen & 1)
147e3846 2204 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf, (UV)bytelen);
dea0fc0b
JH
2205
2206 pend = p + bytelen;
2207
a0ed51b3 2208 while (p < pend) {
dea0fc0b
JH
2209 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
2210 p += 2;
2d1545e5 2211 if (OFFUNI_IS_INVARIANT(uv)) {
56d37426 2212 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
2213 continue;
2214 }
56d37426
KW
2215 if (uv <= MAX_UTF8_TWO_BYTE) {
2216 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
2217 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
2218 continue;
2219 }
46956fad
KW
2220#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2221#define LAST_HIGH_SURROGATE 0xDBFF
2222#define FIRST_LOW_SURROGATE 0xDC00
2223#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
e23c50db
KW
2224
2225 /* This assumes that most uses will be in the first Unicode plane, not
2226 * needing surrogates */
2227 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
2228 && uv <= UNICODE_SURROGATE_LAST))
2229 {
2230 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2231 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2232 }
2233 else {
01ea242b 2234 UV low = (p[0] << 8) + p[1];
e23c50db
KW
2235 if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
2236 || UNLIKELY(low > LAST_LOW_SURROGATE))
2237 {
01ea242b 2238 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db
KW
2239 }
2240 p += 2;
46956fad
KW
2241 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
2242 + (low - FIRST_LOW_SURROGATE) + 0x10000;
01ea242b 2243 }
a0ed51b3 2244 }
56d37426
KW
2245#ifdef EBCDIC
2246 d = uvoffuni_to_utf8_flags(d, uv, 0);
2247#else
a0ed51b3 2248 if (uv < 0x10000) {
eb160463
GS
2249 *d++ = (U8)(( uv >> 12) | 0xe0);
2250 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2251 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2252 continue;
2253 }
2254 else {
eb160463
GS
2255 *d++ = (U8)(( uv >> 18) | 0xf0);
2256 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
2257 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2258 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2259 continue;
2260 }
56d37426 2261#endif
a0ed51b3 2262 }
dea0fc0b 2263 *newlen = d - dstart;
a0ed51b3
LW
2264 return d;
2265}
2266
2267/* Note: this one is slightly destructive of the source. */
2268
2269U8*
dea0fc0b 2270Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
2271{
2272 U8* s = (U8*)p;
d4c19fe8 2273 U8* const send = s + bytelen;
7918f24d
NC
2274
2275 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2276
e0ea5e2d 2277 if (bytelen & 1)
147e3846 2278 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
e0ea5e2d
NC
2279 (UV)bytelen);
2280
a0ed51b3 2281 while (s < send) {
d4c19fe8 2282 const U8 tmp = s[0];
a0ed51b3
LW
2283 s[0] = s[1];
2284 s[1] = tmp;
2285 s += 2;
2286 }
dea0fc0b 2287 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
2288}
2289
922e8cb4
KW
2290bool
2291Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2292{
2293 U8 tmpbuf[UTF8_MAXBYTES+1];
2294 uvchr_to_utf8(tmpbuf, c);
da8c1a98 2295 return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf));
922e8cb4
KW
2296}
2297
f9ae8fb6
JD
2298/* Internal function so we can deprecate the external one, and call
2299 this one from other deprecated functions in this file */
2300
f2645549
KW
2301bool
2302Perl__is_utf8_idstart(pTHX_ const U8 *p)
61b19385 2303{
f2645549 2304 PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
61b19385
KW
2305
2306 if (*p == '_')
2307 return TRUE;
f25ce844 2308 return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
61b19385
KW
2309}
2310
5092f92a 2311bool
eba68aa0
KW
2312Perl__is_uni_perl_idcont(pTHX_ UV c)
2313{
2314 U8 tmpbuf[UTF8_MAXBYTES+1];
2315 uvchr_to_utf8(tmpbuf, c);
da8c1a98 2316 return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
eba68aa0
KW
2317}
2318
2319bool
f91dcd13
KW
2320Perl__is_uni_perl_idstart(pTHX_ UV c)
2321{
2322 U8 tmpbuf[UTF8_MAXBYTES+1];
2323 uvchr_to_utf8(tmpbuf, c);
da8c1a98 2324 return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
f91dcd13
KW
2325}
2326
3a4c58c9
KW
2327UV
2328Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
2329{
2330 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2331 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
2332 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2333 * either "SS" or "Ss". Which one to use is passed into the routine in
2334 * 'S_or_s' to avoid a test */
2335
2336 UV converted = toUPPER_LATIN1_MOD(c);
2337
2338 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2339
2340 assert(S_or_s == 'S' || S_or_s == 's');
2341
6f2d5cbc 2342 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 2343 characters in this range */
3a4c58c9
KW
2344 *p = (U8) converted;
2345 *lenp = 1;
2346 return converted;
2347 }
2348
2349 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2350 * which it maps to one of them, so as to only have to have one check for
2351 * it in the main case */
2352 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2353 switch (c) {
2354 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2355 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2356 break;
2357 case MICRO_SIGN:
2358 converted = GREEK_CAPITAL_LETTER_MU;
2359 break;
79e064b9
KW
2360#if UNICODE_MAJOR_VERSION > 2 \
2361 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2362 && UNICODE_DOT_DOT_VERSION >= 8)
3a4c58c9
KW
2363 case LATIN_SMALL_LETTER_SHARP_S:
2364 *(p)++ = 'S';
2365 *p = S_or_s;
2366 *lenp = 2;
2367 return 'S';
79e064b9 2368#endif
3a4c58c9
KW
2369 default:
2370 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
e5964223 2371 NOT_REACHED; /* NOTREACHED */
3a4c58c9
KW
2372 }
2373 }
2374
2375 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2376 *p = UTF8_TWO_BYTE_LO(converted);
2377 *lenp = 2;
2378
2379 return converted;
2380}
2381
50bda2c3
KW
2382/* Call the function to convert a UTF-8 encoded character to the specified case.
2383 * Note that there may be more than one character in the result.
2384 * INP is a pointer to the first byte of the input character
2385 * OUTP will be set to the first byte of the string of changed characters. It
2386 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2387 * LENP will be set to the length in bytes of the string of changed characters
2388 *
2389 * The functions return the ordinal of the first character in the string of OUTP */
b9992569
KW
2390#define CALL_UPPER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
2391#define CALL_TITLE_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
2392#define CALL_LOWER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
50bda2c3 2393
b9992569
KW
2394/* This additionally has the input parameter 'specials', which if non-zero will
2395 * cause this to use the specials hash for folding (meaning get full case
50bda2c3 2396 * folding); otherwise, when zero, this implies a simple case fold */
b9992569 2397#define CALL_FOLD_CASE(uv, s, d, lenp, specials) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
c3fd2246 2398
84afefe6
JH
2399UV
2400Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2401{
a1433954
KW
2402 /* Convert the Unicode character whose ordinal is <c> to its uppercase
2403 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2404 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
2405 * the changed version may be longer than the original character.
2406 *
2407 * The ordinal of the first character of the changed version is returned
2408 * (but note, as explained above, that there may be more.) */
2409
7918f24d
NC
2410 PERL_ARGS_ASSERT_TO_UNI_UPPER;
2411
3a4c58c9
KW
2412 if (c < 256) {
2413 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2414 }
2415
0ebc6274 2416 uvchr_to_utf8(p, c);
b9992569 2417 return CALL_UPPER_CASE(c, p, p, lenp);
a0ed51b3
LW
2418}
2419
84afefe6
JH
2420UV
2421Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2422{
7918f24d
NC
2423 PERL_ARGS_ASSERT_TO_UNI_TITLE;
2424
3a4c58c9
KW
2425 if (c < 256) {
2426 return _to_upper_title_latin1((U8) c, p, lenp, 's');
2427 }
2428
0ebc6274 2429 uvchr_to_utf8(p, c);
b9992569 2430 return CALL_TITLE_CASE(c, p, p, lenp);
a0ed51b3
LW
2431}
2432
afc16117 2433STATIC U8
eaf412bf 2434S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
afc16117
KW
2435{
2436 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2437 * those, converting the result to UTF-8. Since the result is always just
a1433954 2438 * one character, we allow <p> to be NULL */
afc16117
KW
2439
2440 U8 converted = toLOWER_LATIN1(c);
2441
eaf412bf
KW
2442 PERL_UNUSED_ARG(dummy);
2443
afc16117 2444 if (p != NULL) {
6f2d5cbc 2445 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
2446 *p = converted;
2447 *lenp = 1;
2448 }
2449 else {
430c9760
KW
2450 /* Result is known to always be < 256, so can use the EIGHT_BIT
2451 * macros */
2452 *p = UTF8_EIGHT_BIT_HI(converted);
2453 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
afc16117
KW
2454 *lenp = 2;
2455 }
2456 }
2457 return converted;
2458}
2459
84afefe6
JH
2460UV
2461Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2462{
7918f24d
NC
2463 PERL_ARGS_ASSERT_TO_UNI_LOWER;
2464
afc16117 2465 if (c < 256) {
eaf412bf 2466 return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
bca00c02
KW
2467 }
2468
afc16117 2469 uvchr_to_utf8(p, c);
b9992569 2470 return CALL_LOWER_CASE(c, p, p, lenp);
a0ed51b3
LW
2471}
2472
84afefe6 2473UV
51910141 2474Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 2475{
51910141 2476 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 2477 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 2478 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
2479 *
2480 * Not to be used for locale folds
51910141 2481 */
f673fad4 2482
a1dde8de
KW
2483 UV converted;
2484
2485 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
81611534 2486 PERL_UNUSED_CONTEXT;
a1dde8de 2487
1ca267a5
KW
2488 assert (! (flags & FOLD_FLAGS_LOCALE));
2489
659a7c2d 2490 if (UNLIKELY(c == MICRO_SIGN)) {
a1dde8de
KW
2491 converted = GREEK_SMALL_LETTER_MU;
2492 }
9b63e895
KW
2493#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
2494 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
2495 || UNICODE_DOT_DOT_VERSION > 0)
659a7c2d
KW
2496 else if ( (flags & FOLD_FLAGS_FULL)
2497 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
2498 {
1ca267a5
KW
2499 /* If can't cross 127/128 boundary, can't return "ss"; instead return
2500 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
2501 * under those circumstances. */
2502 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
2503 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2504 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2505 p, *lenp, U8);
2506 return LATIN_SMALL_LETTER_LONG_S;
2507 }
2508 else {
4f489194
KW
2509 *(p)++ = 's';
2510 *p = 's';
2511 *lenp = 2;
2512 return 's';
1ca267a5 2513 }
a1dde8de 2514 }
9b63e895 2515#endif
a1dde8de
KW
2516 else { /* In this range the fold of all other characters is their lower
2517 case */
2518 converted = toLOWER_LATIN1(c);
2519 }
2520
6f2d5cbc 2521 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
2522 *p = (U8) converted;
2523 *lenp = 1;
2524 }
2525 else {
2526 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2527 *p = UTF8_TWO_BYTE_LO(converted);
2528 *lenp = 2;
2529 }
2530
2531 return converted;
2532}
2533
2534UV
31f05a37 2535Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 2536{
4b593389 2537
a0270393
KW
2538 /* Not currently externally documented, and subject to change
2539 * <flags> bits meanings:
2540 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
2541 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2542 * locale are to be used.
a0270393
KW
2543 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2544 */
4b593389 2545
36bb2ab6 2546 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 2547
780fcc9f
KW
2548 if (flags & FOLD_FLAGS_LOCALE) {
2549 /* Treat a UTF-8 locale as not being in locale at all */
2550 if (IN_UTF8_CTYPE_LOCALE) {
2551 flags &= ~FOLD_FLAGS_LOCALE;
2552 }
2553 else {
2554 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
e7b7ac46 2555 goto needs_full_generality;
780fcc9f 2556 }
31f05a37
KW
2557 }
2558
a1dde8de 2559 if (c < 256) {
e7b7ac46 2560 return _to_fold_latin1((U8) c, p, lenp,
31f05a37 2561 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
2562 }
2563
2f306ab9 2564 /* Here, above 255. If no special needs, just use the macro */
a0270393
KW
2565 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
2566 uvchr_to_utf8(p, c);
b9992569 2567 return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
a0270393 2568 }
567b353c 2569 else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
a0270393
KW
2570 the special flags. */
2571 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
2572
2573 needs_full_generality:
a0270393 2574 uvchr_to_utf8(utf8_c, c);
a1a5ec35 2575 return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c), p, lenp, flags);
a0270393 2576 }
84afefe6
JH
2577}
2578
26483009 2579PERL_STATIC_INLINE bool
5141f98e 2580S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
f25ce844 2581 const char *const swashname, SV* const invlist)
bde6a22d 2582{
ea317ccb
KW
2583 /* returns a boolean giving whether or not the UTF8-encoded character that
2584 * starts at <p> is in the swash indicated by <swashname>. <swash>
2585 * contains a pointer to where the swash indicated by <swashname>
2586 * is to be stored; which this routine will do, so that future calls will
f25ce844
KW
2587 * look at <*swash> and only generate a swash if it is not null. <invlist>
2588 * is NULL or an inversion list that defines the swash. If not null, it
2589 * saves time during initialization of the swash.
ea317ccb
KW
2590 *
2591 * Note that it is assumed that the buffer length of <p> is enough to
2592 * contain all the bytes that comprise the character. Thus, <*p> should
2593 * have been checked before this call for mal-formedness enough to assure
2594 * that. */
2595
7918f24d
NC
2596 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2597
492a624f 2598 /* The API should have included a length for the UTF-8 character in <p>,
28123549 2599 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
2600 * as far as there being enough bytes available in it to accommodate the
2601 * character without reading beyond the end, and pass that number on to the
2602 * validating routine */
6302f837 2603 if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
86ae6e94 2604 _force_out_malformed_utf8_message(p, p + UTF8SKIP(p),
99a765e9 2605 _UTF8_NO_CONFIDENCE_IN_CURLEN,
86ae6e94
KW
2606 1 /* Die */ );
2607 NOT_REACHED; /* NOTREACHED */
28123549 2608 }
86ae6e94 2609
87367d5f
KW
2610 if (!*swash) {
2611 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
f25ce844
KW
2612 *swash = _core_swash_init("utf8",
2613
2614 /* Only use the name if there is no inversion
2615 * list; otherwise will go out to disk */
2616 (invlist) ? "" : swashname,
2617
2618 &PL_sv_undef, 1, 0, invlist, &flags);
87367d5f 2619 }
28123549 2620
bde6a22d
NC
2621 return swash_fetch(*swash, p, TRUE) != 0;
2622}
2623
da8c1a98
KW
2624PERL_STATIC_INLINE bool
2625S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e, SV **swash,
2626 const char *const swashname, SV* const invlist)
2627{
2628 /* returns a boolean giving whether or not the UTF8-encoded character that
2629 * starts at <p>, and extending no further than <e - 1> is in the swash
2630 * indicated by <swashname>. <swash> contains a pointer to where the swash
2631 * indicated by <swashname> is to be stored; which this routine will do, so
2632 * that future calls will look at <*swash> and only generate a swash if it
2633 * is not null. <invlist> is NULL or an inversion list that defines the
2634 * swash. If not null, it saves time during initialization of the swash.
2635 */
2636
2637 PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
2638
2639 if (! isUTF8_CHAR(p, e)) {
2640 _force_out_malformed_utf8_message(p, e, 0, 1);
2641 NOT_REACHED; /* NOTREACHED */
2642 }
2643
2644 if (!*swash) {
2645 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2646 *swash = _core_swash_init("utf8",
2647
2648 /* Only use the name if there is no inversion
2649 * list; otherwise will go out to disk */
2650 (invlist) ? "" : swashname,
2651
2652 &PL_sv_undef, 1, 0, invlist, &flags);
2653 }
2654
2655 return swash_fetch(*swash, p, TRUE) != 0;
2656}
2657
34aeb2e9
KW
2658STATIC void
2659S_warn_on_first_deprecated_use(pTHX_ const char * const name,
2660 const char * const alternative,
2661 const bool use_locale,
2662 const char * const file,
2663 const unsigned line)
2664{
2665 const char * key;
2666
2667 PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
2668
2669 if (ckWARN_d(WARN_DEPRECATED)) {
2670
2671 key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
2672 if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
2673 if (! PL_seen_deprecated_macro) {
2674 PL_seen_deprecated_macro = newHV();
2675 }
2676 if (! hv_store(PL_seen_deprecated_macro, key,
2677 strlen(key), &PL_sv_undef, 0))
2678 {
2679 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
2680 }
2681
c44e9413 2682 if (instr(file, "mathoms.c")) {
607313a1
KW
2683 Perl_warner(aTHX_ WARN_DEPRECATED,
2684 "In %s, line %d, starting in Perl v5.30, %s()"
2685 " will be removed. Avoid this message by"
2686 " converting to use %s().\n",
2687 file, line, name, alternative);
2688 }
2689 else {
34aeb2e9
KW
2690 Perl_warner(aTHX_ WARN_DEPRECATED,
2691 "In %s, line %d, starting in Perl v5.30, %s() will"
2692 " require an additional parameter. Avoid this"
2693 " message by converting to use %s().\n",
2694 file, line, name, alternative);
607313a1 2695 }
34aeb2e9
KW
2696 }
2697 }
2698}
2699
bde6a22d 2700bool
34aeb2e9 2701Perl__is_utf8_FOO(pTHX_ U8 classnum,
be99e2c2 2702 const U8 * const p,
34aeb2e9
KW
2703 const char * const name,
2704 const char * const alternative,
2705 const bool use_utf8,
2706 const bool use_locale,
2707 const char * const file,
2708 const unsigned line)
922e8cb4 2709{
922e8cb4
KW
2710 PERL_ARGS_ASSERT__IS_UTF8_FOO;
2711
34aeb2e9
KW
2712 warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
2713
2714 if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
34aeb2e9
KW
2715
2716 switch (classnum) {
2717 case _CC_WORDCHAR:
2718 case _CC_DIGIT:
2719 case _CC_ALPHA:
2720 case _CC_LOWER:
2721 case _CC_UPPER:
2722 case _CC_PUNCT:
2723 case _CC_PRINT:
2724 case _CC_ALPHANUMERIC:
2725 case _CC_GRAPH:
2726 case _CC_CASED:
2727
2728 return is_utf8_common(p,
2729 &PL_utf8_swash_ptrs[classnum],
2730 swash_property_names[classnum],
2731 PL_XPosix_ptrs[classnum]);
2732
2733 case _CC_SPACE:
2734 return is_XPERLSPACE_high(p);
2735 case _CC_BLANK:
2736 return is_HORIZWS_high(p);
2737 case _CC_XDIGIT:
2738 return is_XDIGIT_high(p);
2739 case _CC_CNTRL:
2740 return 0;
2741 case _CC_ASCII:
2742 return 0;
2743 case _CC_VERTSPACE:
2744 return is_VERTWS_high(p);
2745 case _CC_IDFIRST:
2746 if (! PL_utf8_perl_idstart) {
22f0498f
KW
2747 PL_utf8_perl_idstart
2748 = _new_invlist_C_array(_Perl_IDStart_invlist);
34aeb2e9 2749 }
22f0498f
KW
2750 return is_utf8_common(p, &PL_utf8_perl_idstart,
2751 "_Perl_IDStart", NULL);
34aeb2e9
KW
2752 case _CC_IDCONT:
2753 if (! PL_utf8_perl_idcont) {
22f0498f
KW
2754 PL_utf8_perl_idcont
2755 = _new_invlist_C_array(_Perl_IDCont_invlist);
34aeb2e9 2756 }
22f0498f
KW
2757 return is_utf8_common(p, &PL_utf8_perl_idcont,
2758 "_Perl_IDCont", NULL);
34aeb2e9
KW
2759 }
2760 }
2761
2762 /* idcont is the same as wordchar below 256 */
2763 if (classnum == _CC_IDCONT) {
2764 classnum = _CC_WORDCHAR;
2765 }
2766 else if (classnum == _CC_IDFIRST) {
2767 if (*p == '_') {
2768 return TRUE;
2769 }
2770 classnum = _CC_ALPHA;
2771 }
2772
2773 if (! use_locale) {
2774 if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
2775 return _generic_isCC(*p, classnum);
2776 }
922e8cb4 2777
34aeb2e9
KW
2778 return _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )), classnum);
2779 }
2780 else {
2781 if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
2782 return isFOO_lc(classnum, *p);
2783 }
2784
2785 return isFOO_lc(classnum, EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )));
2786 }
2787
2788 NOT_REACHED; /* NOTREACHED */
922e8cb4
KW
2789}
2790
2791bool
da8c1a98
KW
2792Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
2793 const U8 * const e)
2794{
2795 PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
2796
2797 assert(classnum < _FIRST_NON_SWASH_CC);
2798
2799 return is_utf8_common_with_len(p,
2800 e,
2801 &PL_utf8_swash_ptrs[classnum],
2802 swash_property_names[classnum],
2803 PL_XPosix_ptrs[classnum]);
2804}
2805
2806bool
da8c1a98
KW
2807Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
2808{
2809 SV* invlist = NULL;
2810
2811 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
2812
2813 if (! PL_utf8_perl_idstart) {
2814 invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
2815 }
2816 return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart,
2817 "_Perl_IDStart", invlist);
2818}
2819
2820bool
f2645549 2821Perl__is_utf8_xidstart(pTHX_ const U8 *p)
c11ff943 2822{
f2645549 2823 PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
c11ff943
KW
2824
2825 if (*p == '_')
2826 return TRUE;
f25ce844 2827 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
c11ff943
KW
2828}
2829
2830bool
da8c1a98
KW
2831Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
2832{
2833 SV* invlist = NULL;
2834
2835 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
2836
2837 if (! PL_utf8_perl_idcont) {
2838 invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
2839 }
2840 return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont,
2841 "_Perl_IDCont", invlist);
2842}
2843
2844bool
f2645549 2845Perl__is_utf8_idcont(pTHX_ const U8 *p)
82686b01 2846{
f2645549 2847 PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
7918f24d 2848
f25ce844 2849 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
a0ed51b3
LW
2850}
2851
2852bool
f2645549 2853Perl__is_utf8_xidcont(pTHX_ const U8 *p)
c11ff943 2854{
f2645549 2855 PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
c11ff943 2856
f25ce844 2857 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
c11ff943
KW
2858}
2859
2860bool
7dbf68d2
KW
2861Perl__is_utf8_mark(pTHX_ const U8 *p)
2862{
7dbf68d2
KW
2863 PERL_ARGS_ASSERT__IS_UTF8_MARK;
2864
f25ce844 2865 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
7dbf68d2
KW
2866}
2867
b9992569 2868 /* change namve uv1 to 'from' */
6a4a25f4 2869STATIC UV
b9992569
KW
2870S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
2871 SV **swashp, const char *normal, const char *special)
2872{
0134edef 2873 STRLEN len = 0;
7918f24d 2874
b9992569 2875 PERL_ARGS_ASSERT__TO_UTF8_CASE;
7918f24d 2876
36eaa811
KW
2877 /* For code points that don't change case, we already know that the output
2878 * of this function is the unchanged input, so we can skip doing look-ups
2879 * for them. Unfortunately the case-changing code points are scattered
2880 * around. But there are some long consecutive ranges where there are no
2881 * case changing code points. By adding tests, we can eliminate the lookup
2882 * for all the ones in such ranges. This is currently done here only for
2883 * just a few cases where the scripts are in common use in modern commerce
2884 * (and scripts adjacent to those which can be included without additional
2885 * tests). */
2886
2887 if (uv1 >= 0x0590) {
2888 /* This keeps from needing further processing the code points most
2889 * likely to be used in the following non-cased scripts: Hebrew,
2890 * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
2891 * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
2892 * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
2893 if (uv1 < 0x10A0) {
2894 goto cases_to_self;
2895 }
2896
2897 /* The following largish code point ranges also don't have case
2898 * changes, but khw didn't think they warranted extra tests to speed
2899 * them up (which would slightly slow down everything else above them):
2900 * 1100..139F Hangul Jamo, Ethiopic
2901 * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
2902 * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
2903 * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
2904 * Combining Diacritical Marks Extended, Balinese,
2905 * Sundanese, Batak, Lepcha, Ol Chiki
2906 * 2000..206F General Punctuation
2907 */
2908
2909 if (uv1 >= 0x2D30) {
2910
2911 /* This keeps the from needing further processing the code points
2912 * most likely to be used in the following non-cased major scripts:
2913 * CJK, Katakana, Hiragana, plus some less-likely scripts.
2914 *
2915 * (0x2D30 above might have to be changed to 2F00 in the unlikely
2916 * event that Unicode eventually allocates the unused block as of
2917 * v8.0 2FE0..2FEF to code points that are cased. khw has verified
2918 * that the test suite will start having failures to alert you
2919 * should that happen) */
2920 if (uv1 < 0xA640) {
2921 goto cases_to_self;
2922 }
2923
2924 if (uv1 >= 0xAC00) {
2925 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
5af9bc97
KW
2926 if (ckWARN_d(WARN_SURROGATE)) {
2927 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2928 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
147e3846 2929 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04" UVXf, desc, uv1);
5af9bc97
KW
2930 }
2931 goto cases_to_self;
2932 }
36eaa811
KW
2933
2934 /* AC00..FAFF Catches Hangul syllables and private use, plus
2935 * some others */
2936 if (uv1 < 0xFB00) {
2937 goto cases_to_self;
2938
2939 }
2940
5af9bc97 2941 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
76513bdc
KW
2942 if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
2943 && ckWARN_d(WARN_DEPRECATED))
2944 {
2945 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2946 cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
5af9bc97
KW
2947 }
2948 if (ckWARN_d(WARN_NON_UNICODE)) {
2949 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2950 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
147e3846 2951 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04" UVXf, desc, uv1);
5af9bc97
KW
2952 }
2953 goto cases_to_self;
2954 }
3bfc1e70
KW
2955#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
2956 if (UNLIKELY(uv1
2957 > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
2958 {
2959
2960 /* As of this writing, this means we avoid swash creation
2961 * for anything beyond low Plane 1 */
2962 goto cases_to_self;
2963 }
2964#endif
36eaa811
KW
2965 }
2966 }
9ae3ac1a 2967
36eaa811
KW
2968 /* Note that non-characters are perfectly legal, so no warning should
2969 * be given. There are so few of them, that it isn't worth the extra
2970 * tests to avoid swash creation */
9ae3ac1a
KW
2971 }
2972
0134edef 2973 if (!*swashp) /* load on-demand */
5ab9d2ef 2974 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
0134edef 2975
a6f87d8c 2976 if (special) {
0134edef 2977 /* It might be "special" (sometimes, but not always,
2a37f04d 2978 * a multicharacter mapping) */
4a8240a3 2979 HV *hv = NULL;
b08cf34e
JH
2980 SV **svp;
2981
4a8240a3
KW
2982 /* If passed in the specials name, use that; otherwise use any
2983 * given in the swash */
2984 if (*special != '\0') {
2985 hv = get_hv(special, 0);
2986 }
2987 else {
2988 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
2989 if (svp) {
2990 hv = MUTABLE_HV(SvRV(*svp));
2991 }
2992 }
2993
176fe009 2994 if (hv
5f560d8a 2995 && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
176fe009
KW
2996 && (*svp))
2997 {
cfd0369c 2998 const char *s;
47654450 2999
cfd0369c 3000 s = SvPV_const(*svp, len);
47654450 3001 if (len == 1)
f4cd282c 3002 /* EIGHTBIT */
c80e42f3 3003 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2a37f04d 3004 else {
d2dcd0fb 3005 Copy(s, ustrp, len, U8);
29e98929 3006 }
983ffd37 3007 }
0134edef
JH
3008 }
3009
3010 if (!len && *swashp) {
4a4088c4 3011 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
d4c19fe8 3012
0134edef
JH
3013 if (uv2) {
3014 /* It was "normal" (a single character mapping). */
f4cd282c 3015 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2a37f04d
JH
3016 }
3017 }
1feea2c7 3018
cbe07460
KW
3019 if (len) {
3020 if (lenp) {
3021 *lenp = len;
3022 }
3023 return valid_utf8_to_uvchr(ustrp, 0);
3024 }
3025
3026 /* Here, there was no mapping defined, which means that the code point maps
3027 * to itself. Return the inputs */
e24dfe9c 3028 cases_to_self:
bfdf22ec 3029 len = UTF8SKIP(p);
ca9fab46
KW
3030 if (p != ustrp) { /* Don't copy onto itself */
3031 Copy(p, ustrp, len, U8);
3032 }
0134edef 3033
2a37f04d
JH
3034 if (lenp)
3035 *lenp = len;
3036
f4cd282c 3037 return uv1;
cbe07460 3038
a0ed51b3
LW
3039}
3040
051a06d4 3041STATIC UV
357aadde 3042S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
051a06d4 3043{
4a4088c4 3044 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
3045 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
3046 * result contains a character that crosses the 255/256 boundary, disallow
3047 * the change, and return the original code point. See L<perlfunc/lc> for
3048 * why;
051a06d4 3049 *
a1433954
KW
3050 * p points to the original string whose case was changed; assumed
3051 * by this routine to be well-formed
051a06d4
KW
3052 * result the code point of the first character in the changed-case string
3053 * ustrp points to the changed-case string (<result> represents its first char)
3054 * lenp points to the length of <ustrp> */
3055
3056 UV original; /* To store the first code point of <p> */
3057
3058 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3059
a4f12ed7 3060 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
3061
3062 /* We know immediately if the first character in the string crosses the
3063 * boundary, so can skip */
3064 if (result > 255) {
3065
3066 /* Look at every character in the result; if any cross the
3067 * boundary, the whole thing is disallowed */
3068 U8* s = ustrp + UTF8SKIP(ustrp);
3069 U8* e = ustrp + *lenp;
3070 while (s < e) {
a4f12ed7 3071 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
3072 goto bad_crossing;
3073 }
3074 s += UTF8SKIP(s);
3075 }
3076
613abc6d
KW
3077 /* Here, no characters crossed, result is ok as-is, but we warn. */
3078 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
051a06d4
KW
3079 return result;
3080 }
3081
7b52d656 3082 bad_crossing:
051a06d4
KW
3083
3084 /* Failed, have to return the original */
4b88fb76 3085 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
3086
3087 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3088 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
147e3846
KW
3089 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8 locale; "
3090 "resolved to \"\\x{%" UVXf "}\".",
357aadde 3091 OP_DESC(PL_op),
ab0b796c
KW
3092 original,
3093 original);
051a06d4
KW
3094 Copy(p, ustrp, *lenp, char);
3095 return original;
3096}
3097
607313a1
KW
3098STATIC U32
3099S_check_and_deprecate(pTHX_ const U8 *p,
3100 const U8 **e,
3101 const unsigned int type, /* See below */
3102 const bool use_locale, /* Is this a 'LC_'
3103 macro call? */
3104 const char * const file,
3105 const unsigned line)
3106{
3107 /* This is a temporary function to deprecate the unsafe calls to the case
3108 * changing macros and functions. It keeps all the special stuff in just
3109 * one place.
3110 *
3111 * It updates *e with the pointer to the end of the input string. If using
3112 * the old-style macros, *e is NULL on input, and so this function assumes
3113 * the input string is long enough to hold the entire UTF-8 sequence, and
3114 * sets *e accordingly, but it then returns a flag to pass the
3115 * utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
3116 * using the full length if possible.
3117 *
3118 * It also does the assert that *e > p when *e is not NULL. This should be
3119 * migrated to the callers when this function gets deleted.
3120 *
3121 * The 'type' parameter is used for the caller to specify which case
3122 * changing function this is called from: */
3123
3124# define DEPRECATE_TO_UPPER 0
3125# define DEPRECATE_TO_TITLE 1
3126# define DEPRECATE_TO_LOWER 2
3127# define DEPRECATE_TO_FOLD 3
3128
3129 U32 utf8n_flags = 0;
3130 const char * name;
3131 const char * alternative;
3132
3133 PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
3134
3135 if (*e == NULL) {
3136 utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
3137 *e = p + UTF8SKIP(p);
3138
3139 /* For mathoms.c calls, we use the function name we know is stored
c44e9413 3140 * there. It could be part of a larger path */
607313a1 3141 if (type == DEPRECATE_TO_UPPER) {
c44e9413 3142 name = instr(file, "mathoms.c")
607313a1
KW
3143 ? "to_utf8_upper"
3144 : "toUPPER_utf8";
3145 alternative = "toUPPER_utf8_safe";
3146 }
3147 else if (type == DEPRECATE_TO_TITLE) {
c44e9413 3148 name = instr(file, "mathoms.c")
607313a1
KW
3149 ? "to_utf8_title"
3150 : "toTITLE_utf8";
3151 alternative = "toTITLE_utf8_safe";
3152 }
3153 else if (type == DEPRECATE_TO_LOWER) {
c44e9413 3154 name = instr(file, "mathoms.c")
607313a1
KW
3155 ? "to_utf8_lower"
3156 : "toLOWER_utf8";
3157 alternative = "toLOWER_utf8_safe";
3158 }
3159 else if (type == DEPRECATE_TO_FOLD) {
c44e9413 3160 name = instr(file, "mathoms.c")
607313a1
KW
3161 ? "to_utf8_fold"
3162 : "toFOLD_utf8";
3163 alternative = "toFOLD_utf8_safe";
3164 }
3165 else Perl_croak(aTHX_ "panic: Unexpected case change type");
3166
3167 warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
3168 }
3169 else {
3170 assert (p < *e);
3171 }
3172
3173 return utf8n_flags;
3174}
3175
eaf412bf
KW
3176/* The process for changing the case is essentially the same for the four case
3177 * change types, except there are complications for folding. Otherwise the
3178 * difference is only which case to change to. To make sure that they all do
3179 * the same thing, the bodies of the functions are extracted out into the
3180 * following two macros. The functions are written with the same variable
3181 * names, and these are known and used inside these macros. It would be
3182 * better, of course, to have inline functions to do it, but since different
3183 * macros are called, depending on which case is being changed to, this is not
3184 * feasible in C (to khw's knowledge). Two macros are created so that the fold
3185 * function can start with the common start macro, then finish with its special
3186 * handling; while the other three cases can just use the common end macro.
3187 *
3188 * The algorithm is to use the proper (passed in) macro or function to change
3189 * the case for code points that are below 256. The macro is used if using
3190 * locale rules for the case change; the function if not. If the code point is
3191 * above 255, it is computed from the input UTF-8, and another macro is called
3192 * to do the conversion. If necessary, the output is converted to UTF-8. If
3193 * using a locale, we have to check that the change did not cross the 255/256
3194 * boundary, see check_locale_boundary_crossing() for further details.
3195 *
3196 * The macros are split with the correct case change for the below-256 case
3197 * stored into 'result', and in the middle of an else clause for the above-255
3198 * case. At that point in the 'else', 'result' is not the final result, but is
3199 * the input code point calculated from the UTF-8. The fold code needs to
3200 * realize all this and take it from there.
3201 *
3202 * If you read the two macros as sequential, it's easier to understand what's
3203 * going on. */
3204#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \
3205 L1_func_extra_param) \
a239b1e2 3206 \
eaf412bf
KW
3207 if (flags & (locale_flags)) { \
3208 /* Treat a UTF-8 locale as not being in locale at all */ \
3209 if (IN_UTF8_CTYPE_LOCALE) { \
3210 flags &= ~(locale_flags); \
3211 } \
3212 else { \
3213 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
3214 } \
3215 } \
3216 \
3217 if (UTF8_IS_INVARIANT(*p)) { \
3218 if (flags & (locale_flags)) { \
3219 result = LC_L1_change_macro(*p); \
3220 } \
3221 else { \
3222 return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
3223 } \
3224 } \
a239b1e2 3225 else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \
eaf412bf
KW
3226 if (flags & (locale_flags)) { \
3227 result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \
3228 *(p+1))); \
3229 } \
3230 else { \
3231 return L1_func(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), \
3232 ustrp, lenp, L1_func_extra_param); \
3233 } \
3234 } \
fa8ab374
KW
3235 else { /* malformed UTF-8 or ord above 255 */ \
3236 STRLEN len_result; \
fa8ab374
KW
3237 result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
3238 if (len_result == (STRLEN) -1) { \
607313a1
KW
3239 _force_out_malformed_utf8_message(p, e, utf8n_flags, \
3240 1 /* Die */ ); \
fa8ab374 3241 }
eaf412bf
KW
3242
3243#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
3244 result = change_macro(result, p, ustrp, lenp); \
3245 \
3246 if (flags & (locale_flags)) { \
3247 result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
3248 } \
3249 return result; \
3250 } \
3251 \
3252 /* Here, used locale rules. Convert back to UTF-8 */ \
3253 if (UTF8_IS_INVARIANT(result)) { \
3254 *ustrp = (U8) result; \
3255 *lenp = 1; \
3256 } \
3257 else { \
3258 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); \
3259 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \
3260 *lenp = 2; \
3261 } \
3262 \
3263 return result;
3264
d3e79532 3265/*
87cea99e 3266=for apidoc to_utf8_upper
d3e79532 3267
a239b1e2 3268Instead use L</toUPPER_utf8_safe>.
a1433954 3269
d3e79532
JH
3270=cut */
3271
051a06d4 3272/* Not currently externally documented, and subject to change:
31f05a37
KW
3273 * <flags> is set iff iff the rules from the current underlying locale are to
3274 * be used. */
051a06d4 3275
2104c8d9 3276UV
607313a1
KW
3277Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
3278 const U8 *e,
3279 U8* ustrp,
3280 STRLEN *lenp,
3281 bool flags,
3282 const char * const file,
3283 const int line)
a0ed51b3 3284{
051a06d4 3285 UV result;
607313a1
KW
3286 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
3287 cBOOL(flags), file, line);
051a06d4
KW
3288
3289 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 3290
eaf412bf
KW
3291 /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
3292 /* 2nd char of uc(U+DF) is 'S' */
3293 CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S');
3294 CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
983ffd37 3295}
a0ed51b3 3296
d3e79532 3297/*
87cea99e 3298=for apidoc to_utf8_title
d3e79532 3299
a239b1e2 3300Instead use L</toTITLE_utf8_safe>.
a1433954 3301
d3e79532
JH
3302=cut */
3303
051a06d4 3304/* Not currently externally documented, and subject to change:
31f05a37
KW
3305 * <flags> is set iff the rules from the current underlying locale are to be
3306 * used. Since titlecase is not defined in POSIX, for other than a
3307 * UTF-8 locale, uppercase is used instead for code points < 256.
445bf929 3308 */
051a06d4 3309
983ffd37 3310UV
607313a1
KW
3311Perl__to_utf8_title_flags(pTHX_ const U8 *p,
3312 const U8 *e,
3313 U8* ustrp,
3314 STRLEN *lenp,
3315 bool flags,
3316 const char * const file,
3317 const int line)
983ffd37 3318{
051a06d4 3319 UV result;
607313a1
KW
3320 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
3321 cBOOL(flags), file, line);
051a06d4
KW
3322
3323 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 3324
eaf412bf
KW
3325 /* 2nd char of ucfirst(U+DF) is 's' */
3326 CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's');
3327 CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
a0ed51b3
LW
3328}
3329
d3e79532 3330/*
87cea99e 3331=for apidoc to_utf8_lower
d3e79532 3332
a239b1e2 3333Instead use L</toLOWER_utf8_safe>.
a1433954 3334
d3e79532
JH
3335=cut */
3336
051a06d4 3337/* Not currently externally documented, and subject to change:
31f05a37
KW
3338 * <flags> is set iff iff the rules from the current underlying locale are to
3339 * be used.
3340 */
051a06d4 3341
2104c8d9 3342UV
607313a1
KW
3343Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
3344 const U8 *e,
3345 U8* ustrp,
3346 STRLEN *lenp,
3347 bool flags,
3348 const char * const file,
3349 const int line)
a0ed51b3 3350{
051a06d4 3351 UV result;
607313a1
KW
3352 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
3353 cBOOL(flags), file, line);
051a06d4 3354
051a06d4 3355 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 3356
eaf412bf
KW
3357 CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */)
3358 CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
b4e400f9
JH
3359}
3360
d3e79532 3361/*
87cea99e 3362=for apidoc to_utf8_fold
d3e79532 3363
a239b1e2 3364Instead use L</toFOLD_utf8_safe>.
a1433954 3365
d3e79532
JH
3366=cut */
3367
051a06d4
KW
3368/* Not currently externally documented, and subject to change,
3369 * in <flags>
31f05a37
KW
3370 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3371 * locale are to be used.
051a06d4
KW
3372 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
3373 * otherwise simple folds
a0270393
KW
3374 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
3375 * prohibited
445bf929 3376 */
36bb2ab6 3377
b4e400f9 3378UV
607313a1
KW
3379Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
3380 const U8 *e,
3381 U8* ustrp,
3382 STRLEN *lenp,
3383 U8 flags,
3384 const char * const file,
3385 const int line)
b4e400f9 3386{
051a06d4 3387 UV result;
607313a1
KW
3388 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
3389 cBOOL(flags), file, line);
051a06d4 3390
36bb2ab6 3391 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 3392
a0270393
KW
3393 /* These are mutually exclusive */
3394 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
3395
50ba90ff
KW
3396 assert(p != ustrp); /* Otherwise overwrites */
3397
eaf412bf
KW
3398 CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
3399 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)));
31f05a37 3400
eaf412bf 3401 result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
a1dde8de 3402
1ca267a5
KW
3403 if (flags & FOLD_FLAGS_LOCALE) {
3404
76f2ffcd 3405# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
0766489e
KW
3406 const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
3407
3408# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3409# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
76f2ffcd
KW
3410
3411 const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
76f2ffcd 3412
538e84ed
KW
3413 /* Special case these two characters, as what normally gets
3414 * returned under locale doesn't work */
76f2ffcd
KW
3415 if (UTF8SKIP(p) == cap_sharp_s_len
3416 && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
1ca267a5 3417 {
ab0b796c
KW
3418 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3419 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3420 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3421 "resolved to \"\\x{17F}\\x{17F}\".");
1ca267a5
KW
3422 goto return_long_s;
3423 }
0766489e
KW
3424 else
3425#endif
3426 if (UTF8SKIP(p) == long_s_t_len
76f2ffcd 3427 && memEQ((char *) p, LONG_S_T, long_s_t_len))
9fc2026f 3428 {
ab0b796c
KW
3429 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3430 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3431 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3432 "resolved to \"\\x{FB06}\".");
9fc2026f
KW
3433 goto return_ligature_st;
3434 }
74894415
KW
3435
3436#if UNICODE_MAJOR_VERSION == 3 \
3437 && UNICODE_DOT_VERSION == 0 \
3438 && UNICODE_DOT_DOT_VERSION == 1
3439# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3440
3441 /* And special case this on this Unicode version only, for the same
3442 * reaons the other two are special cased. They would cross the
3443 * 255/256 boundary which is forbidden under /l, and so the code
3444 * wouldn't catch that they are equivalent (which they are only in
3445 * this release) */
3446 else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
3447 && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
3448 {
3449 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3450 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3451 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3452 "resolved to \"\\x{0131}\".");
3453 goto return_dotless_i;
3454 }
3455#endif
3456
357aadde 3457 return check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 3458 }
a0270393
KW
3459 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3460 return result;
3461 }
3462 else {
4a4088c4 3463 /* This is called when changing the case of a UTF-8-encoded
9fc2026f
KW
3464 * character above the ASCII range, and the result should not
3465 * contain an ASCII character. */
a0270393
KW
3466
3467 UV original; /* To store the first code point of <p> */
3468
3469 /* Look at every character in the result; if any cross the
3470 * boundary, the whole thing is disallowed */
3471 U8* s = ustrp;
3472 U8* e = ustrp + *lenp;
3473 while (s < e) {
3474 if (isASCII(*s)) {
3475 /* Crossed, have to return the original */
3476 original = valid_utf8_to_uvchr(p, lenp);
1ca267a5 3477
9fc2026f 3478 /* But in these instances, there is an alternative we can
1ca267a5 3479 * return that is valid */
0766489e
KW
3480 if (original == LATIN_SMALL_LETTER_SHARP_S
3481#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3482 || original == LATIN_CAPITAL_LETTER_SHARP_S
3483#endif
3484 ) {
1ca267a5
KW
3485 goto return_long_s;
3486 }
9fc2026f
KW
3487 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3488 goto return_ligature_st;
3489 }
74894415
KW
3490#if UNICODE_MAJOR_VERSION == 3 \
3491 && UNICODE_DOT_VERSION == 0 \
3492 && UNICODE_DOT_DOT_VERSION == 1
3493
3494 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3495 goto return_dotless_i;
3496 }
3497#endif
a0270393
KW
3498 Copy(p, ustrp, *lenp, char);
3499 return original;
3500 }
3501 s += UTF8SKIP(s);
3502 }
051a06d4 3503
a0270393
KW
3504 /* Here, no characters crossed, result is ok as-is */
3505 return result;
3506 }
051a06d4
KW
3507 }
3508
4a4088c4 3509 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
3510 if (UTF8_IS_INVARIANT(result)) {
3511 *ustrp = (U8) result;
3512 *lenp = 1;
3513 }
3514 else {
62cb07ea
KW
3515 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3516 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
3517 *lenp = 2;
3518 }
3519
051a06d4 3520 return result;
1ca267a5
KW
3521
3522 return_long_s:
3523 /* Certain folds to 'ss' are prohibited by the options, but they do allow
3524 * folds to a string of two of these characters. By returning this
3525 * instead, then, e.g.,
3526 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3527 * works. */
3528
3529 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
3530 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3531 ustrp, *lenp, U8);
3532 return LATIN_SMALL_LETTER_LONG_S;
9fc2026f
KW
3533
3534 return_ligature_st:
3535 /* Two folds to 'st' are prohibited by the options; instead we pick one and
3536 * have the other one fold to it */
3537
3538 *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
3539 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
3540 return LATIN_SMALL_LIGATURE_ST;
74894415
KW
3541
3542#if UNICODE_MAJOR_VERSION == 3 \
3543 && UNICODE_DOT_VERSION == 0 \
3544 && UNICODE_DOT_DOT_VERSION == 1
3545
3546 return_dotless_i:
3547 *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
3548 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
3549 return LATIN_SMALL_LETTER_DOTLESS_I;
3550
3551#endif
3552
a0ed51b3
LW
3553}
3554
711a919c 3555/* Note:
f90a9a02 3556 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
711a919c
TS
3557 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
3558 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
3559 */
c4a5db0c 3560
a0ed51b3 3561SV*
7fc63493 3562Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 3563{
c4a5db0c
KW
3564 PERL_ARGS_ASSERT_SWASH_INIT;
3565
3566 /* Returns a copy of a swash initiated by the called function. This is the
3567 * public interface, and returning a copy prevents others from doing
3568 * mischief on the original */
3569
5d3d13d1 3570 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
c4a5db0c
KW
3571}
3572
3573SV*
5d3d13d1 3574Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
c4a5db0c 3575{
2c1f00b9
YO
3576
3577 /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
3578 * use the following define */
3579
3580#define CORE_SWASH_INIT_RETURN(x) \
3581 PL_curpm= old_PL_curpm; \
3582 return x
3583
c4a5db0c 3584 /* Initialize and return a swash, creating it if necessary. It does this
87367d5f
KW
3585 * by calling utf8_heavy.pl in the general case. The returned value may be
3586 * the swash's inversion list instead if the input parameters allow it.
3587 * Which is returned should be immaterial to callers, as the only
923b6d4e
KW
3588 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
3589 * and swash_to_invlist() handle both these transparently.
c4a5db0c
KW
3590 *
3591 * This interface should only be used by functions that won't destroy or
3592 * adversely change the swash, as doing so affects all other uses of the
3593 * swash in the program; the general public should use 'Perl_swash_init'
3594 * instead.
3595 *
3596 * pkg is the name of the package that <name> should be in.
3597 * name is the name of the swash to find. Typically it is a Unicode
3598 * property name, including user-defined ones
3599 * listsv is a string to initialize the swash with. It must be of the form
3600 * documented as the subroutine return value in
3601 * L<perlunicode/User-Defined Character Properties>
3602 * minbits is the number of bits required to represent each data element.
3603 * It is '1' for binary properties.
3604 * none I (khw) do not understand this one, but it is used only in tr///.
9a53f6cf 3605 * invlist is an inversion list to initialize the swash with (or NULL)
83199d38
KW
3606 * flags_p if non-NULL is the address of various input and output flag bits
3607 * to the routine, as follows: ('I' means is input to the routine;
3608 * 'O' means output from the routine. Only flags marked O are
3609 * meaningful on return.)
3610 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
3611 * came from a user-defined property. (I O)
5d3d13d1
KW
3612 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
3613 * when the swash cannot be located, to simply return NULL. (I)
87367d5f
KW
3614 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
3615 * return of an inversion list instead of a swash hash if this routine
3616 * thinks that would result in faster execution of swash_fetch() later
3617 * on. (I)
9a53f6cf
KW
3618 *
3619 * Thus there are three possible inputs to find the swash: <name>,
3620 * <listsv>, and <invlist>. At least one must be specified. The result
3621 * will be the union of the specified ones, although <listsv>'s various
aabbdbda
KW
3622 * actions can intersect, etc. what <name> gives. To avoid going out to
3623 * disk at all, <invlist> should specify completely what the swash should
3624 * have, and <listsv> should be &PL_sv_undef and <name> should be "".
9a53f6cf
KW
3625 *
3626 * <invlist> is only valid for binary properties */
c4a5db0c 3627
2c1f00b9
YO
3628 PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
3629
c4a5db0c 3630 SV* retval = &PL_sv_undef;
83199d38 3631 HV* swash_hv = NULL;
87367d5f
KW
3632 const int invlist_swash_boundary =
3633 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
3634 ? 512 /* Based on some benchmarking, but not extensive, see commit
3635 message */
3636 : -1; /* Never return just an inversion list */
9a53f6cf
KW
3637
3638 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
3639 assert(! invlist || minbits == 1);
3640
2c1f00b9
YO
3641 PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
3642 that triggered the swash init and the swash init perl logic itself.
3643 See perl #122747 */
3644
9a53f6cf
KW
3645 /* If data was passed in to go out to utf8_heavy to find the swash of, do
3646 * so */
3647 if (listsv != &PL_sv_undef || strNE(name, "")) {
69794297
KW
3648 dSP;
3649 const size_t pkg_len = strlen(pkg);
3650 const size_t name_len = strlen(name);
3651 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
3652 SV* errsv_save;
3653 GV *method;
3654
3655 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
3656
3657 PUSHSTACKi(PERLSI_MAGIC);
ce3b816e 3658 ENTER;
69794297 3659 SAVEHINTS();
2782061f 3660 save_re_context();
650f067c
JL
3661 /* We might get here via a subroutine signature which uses a utf8
3662 * parameter name, at which point PL_subname will have been set
3663 * but not yet used. */
3664 save_item(PL_subname);
69794297
KW
3665 if (PL_parser && PL_parser->error_count)
3666 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
3667 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
4a4088c4 3668 if (!method) { /* demand load UTF-8 */
69794297 3669 ENTER;
db2c6cb3
FC
3670 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3671 GvSV(PL_errgv) = NULL;
1a419e6b 3672#ifndef NO_TAINT_SUPPORT
69794297
KW
3673 /* It is assumed that callers of this routine are not passing in
3674 * any user derived data. */
2782061f
DM
3675 /* Need to do this after save_re_context() as it will set
3676 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
3677 * in Perl_magic_get). Even line to create errsv_save can turn on
3678 * PL_tainted. */
284167a5
S
3679 SAVEBOOL(TAINT_get);
3680 TAINT_NOT;
3681#endif
69794297
KW
3682 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
3683 NULL);
eed484f9 3684 {
db2c6cb3
FC
3685 /* Not ERRSV, as there is no need to vivify a scalar we are
3686 about to discard. */
3687 SV * const errsv = GvSV(PL_errgv);
3688 if (!SvTRUE(errsv)) {
3689 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3690 SvREFCNT_dec(errsv);
3691 }
eed484f9 3692 }
69794297
KW
3693 LEAVE;
3694 }
3695 SPAGAIN;
3696 PUSHMARK(SP);
3697 EXTEND(SP,5);
3698 mPUSHp(pkg, pkg_len);
3699 mPUSHp(name, name_len);
3700 PUSHs(listsv);
3701 mPUSHi(minbits);
3702 mPUSHi(none);
3703 PUTBACK;
db2c6cb3
FC
3704 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3705 GvSV(PL_errgv) = NULL;
69794297
KW
3706 /* If we already have a pointer to the method, no need to use
3707 * call_method() to repeat the lookup. */
c41800a8
KW
3708 if (method
3709 ? call_sv(MUTABLE_SV(method), G_SCALAR)
69794297
KW
3710 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
3711 {
3712 retval = *PL_stack_sp--;
3713 SvREFCNT_inc(retval);
3714 }
eed484f9 3715 {
db2c6cb3
FC
3716 /* Not ERRSV. See above. */
3717 SV * const errsv = GvSV(PL_errgv);
3718 if (!SvTRUE(errsv)) {
3719 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3720 SvREFCNT_dec(errsv);
3721 }
eed484f9 3722 }
ce3b816e 3723 LEAVE;
69794297
KW
3724 POPSTACK;
3725 if (IN_PERL_COMPILETIME) {
3726 CopHINTS_set(PL_curcop, PL_hints);
3727 }
3728 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
d95e4a00 3729 if (SvPOK(retval)) {
69794297
KW
3730
3731 /* If caller wants to handle missing properties, let them */
5d3d13d1 3732 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
2c1f00b9 3733 CORE_SWASH_INIT_RETURN(NULL);
69794297
KW
3734 }
3735 Perl_croak(aTHX_
147e3846 3736 "Can't find Unicode property definition \"%" SVf "\"",
69794297 3737 SVfARG(retval));
a25b5927 3738 NOT_REACHED; /* NOTREACHED */
d95e4a00 3739 }
69794297 3740 }
9a53f6cf 3741 } /* End of calling the module to find the swash */
36eb48b4 3742
83199d38
KW
3743 /* If this operation fetched a swash, and we will need it later, get it */
3744 if (retval != &PL_sv_undef
3745 && (minbits == 1 || (flags_p
3746 && ! (*flags_p
3747 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
3748 {
3749 swash_hv = MUTABLE_HV(SvRV(retval));
3750
3751 /* If we don't already know that there is a user-defined component to
3752 * this swash, and the user has indicated they wish to know if there is
3753 * one (by passing <flags_p>), find out */
3754 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
3755 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);