This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8.c: Change 2 static fcns to handle overlongs
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
a0ed51b3 35
806547a7 36static const char malformed_text[] = "Malformed UTF-8 character";
27da23d5 37static const char unees[] =
806547a7 38 "Malformed UTF-8 character (unexpected end of string)";
760c7c2f 39static const char cp_above_legal_max[] =
76513bdc 40 "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf ". This will be fatal in Perl 5.28";
760c7c2f 41
114d9c4d 42#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
901b21bf 43
48ef279e 44/*
ccfc67b7 45=head1 Unicode Support
7fefc6c1 46These are various utility functions for manipulating UTF8-encoded
72d33970 47strings. For the uninitiated, this is a method of representing arbitrary
61296642 48Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
49characters in the ASCII range are unmodified, and a zero byte never appears
50within non-zero characters.
166f8a29 51
eaf7a4d2
CS
52=cut
53*/
54
9cbfb8ab
KW
55void
56Perl__force_out_malformed_utf8_message(pTHX_
57 const U8 *const p, /* First byte in UTF-8 sequence */
58 const U8 * const e, /* Final byte in sequence (may include
59 multiple chars */
60 const U32 flags, /* Flags to pass to utf8n_to_uvchr(),
61 usually 0, or some DISALLOW flags */
62 const bool die_here) /* If TRUE, this function does not return */
63{
64 /* This core-only function is to be called when a malformed UTF-8 character
65 * is found, in order to output the detailed information about the
66 * malformation before dieing. The reason it exists is for the occasions
67 * when such a malformation is fatal, but warnings might be turned off, so
68 * that normally they would not be actually output. This ensures that they
69 * do get output. Because a sequence may be malformed in more than one
70 * way, multiple messages may be generated, so we can't make them fatal, as
71 * that would cause the first one to die.
72 *
73 * Instead we pretend -W was passed to perl, then die afterwards. The
74 * flexibility is here to return to the caller so they can finish up and
75 * die themselves */
76 U32 errors;
77
78 PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
79
80 ENTER;
c15a80f3 81 SAVEI8(PL_dowarn);
9cbfb8ab
KW
82 SAVESPTR(PL_curcop);
83
84 PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
85 if (PL_curcop) {
86 PL_curcop->cop_warnings = pWARN_ALL;
87 }
88
89 (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
90
91 LEAVE;
92
93 if (! errors) {
94 Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
95 " be called only when there are errors found");
96 }
97
98 if (die_here) {
99 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
100 }
101}
102
eaf7a4d2 103/*
378516de 104=for apidoc uvoffuni_to_utf8_flags
eebe1485 105
a27992cc 106THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af
KW
107Instead, B<Almost all code should use L</uvchr_to_utf8> or
108L</uvchr_to_utf8_flags>>.
a27992cc 109
de69f3af
KW
110This function is like them, but the input is a strict Unicode
111(as opposed to native) code point. Only in very rare circumstances should code
112not be using the native code point.
949cf498 113
efa9cd84 114For details, see the description for L</uvchr_to_utf8_flags>.
949cf498 115
eebe1485
SC
116=cut
117*/
118
c94c2f39
KW
119/* All these formats take a single UV code point argument */
120const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
121const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
122 " is not recommended for open interchange";
123const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
124 " may not be portable";
57ff5f59
KW
125const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \
126 " Unicode, requires a Perl extension," \
127 " and so is not portable";
c94c2f39 128
8ee1cdcb
KW
129#define HANDLE_UNICODE_SURROGATE(uv, flags) \
130 STMT_START { \
131 if (flags & UNICODE_WARN_SURROGATE) { \
132 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
c94c2f39 133 surrogate_cp_format, uv); \
8ee1cdcb
KW
134 } \
135 if (flags & UNICODE_DISALLOW_SURROGATE) { \
136 return NULL; \
137 } \
138 } STMT_END;
139
140#define HANDLE_UNICODE_NONCHAR(uv, flags) \
141 STMT_START { \
142 if (flags & UNICODE_WARN_NONCHAR) { \
143 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
c94c2f39 144 nonchar_cp_format, uv); \
8ee1cdcb
KW
145 } \
146 if (flags & UNICODE_DISALLOW_NONCHAR) { \
147 return NULL; \
148 } \
149 } STMT_END;
150
ba6ed43c
KW
151/* Use shorter names internally in this file */
152#define SHIFT UTF_ACCUMULATION_SHIFT
153#undef MARK
154#define MARK UTF_CONTINUATION_MARK
155#define MASK UTF_CONTINUATION_MASK
156
dfe13c55 157U8 *
4b31b634 158Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
a0ed51b3 159{
378516de 160 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
7918f24d 161
2d1545e5 162 if (OFFUNI_IS_INVARIANT(uv)) {
4c8cd605 163 *d++ = LATIN1_TO_NATIVE(uv);
d9432125
KW
164 return d;
165 }
facc1dc2 166
3ea68d71 167 if (uv <= MAX_UTF8_TWO_BYTE) {
facc1dc2
KW
168 *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
169 *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
3ea68d71
KW
170 return d;
171 }
d9432125 172
ba6ed43c
KW
173 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
174 * below, the 16 is for start bytes E0-EF (which are all the possible ones
175 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
176 * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
177 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
178 * 0x800-0xFFFF on ASCII */
179 if (uv < (16 * (1U << (2 * SHIFT)))) {
180 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
181 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
182 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
183
184#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
185 aren't tested here */
186 /* The most likely code points in this range are below the surrogates.
187 * Do an extra test to quickly exclude those. */
188 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
189 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
190 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
191 {
8ee1cdcb
KW
192 HANDLE_UNICODE_NONCHAR(uv, flags);
193 }
194 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
195 HANDLE_UNICODE_SURROGATE(uv, flags);
760c7c2f 196 }
ba6ed43c
KW
197 }
198#endif
199 return d;
200 }
201
202 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
203 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
204 * happen starting with 4-byte characters on ASCII platforms. We unify the
205 * code for these with EBCDIC, even though some of them require 5-bytes on
206 * those, because khw believes the code saving is worth the very slight
207 * performance hit on these high EBCDIC code points. */
208
209 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
76513bdc
KW
210 if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
211 && ckWARN_d(WARN_DEPRECATED))
212 {
213 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
214 cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
a5bf80e0 215 }
0a8a1a5b
KW
216 if ( (flags & UNICODE_WARN_SUPER)
217 || ( (flags & UNICODE_WARN_PERL_EXTENDED)
218 && UNICODE_IS_PERL_EXTENDED(uv)))
a5bf80e0
KW
219 {
220 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
221
222 /* Choose the more dire applicable warning */
d044b7a7 223 (UNICODE_IS_PERL_EXTENDED(uv))
57ff5f59 224 ? perl_extended_cp_format
c94c2f39 225 : super_cp_format,
a5bf80e0
KW
226 uv);
227 }
56576a04 228 if ( (flags & UNICODE_DISALLOW_SUPER)
0a8a1a5b
KW
229 || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
230 && UNICODE_IS_PERL_EXTENDED(uv)))
a5bf80e0
KW
231 {
232 return NULL;
233 }
234 }
ba6ed43c
KW
235 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
236 HANDLE_UNICODE_NONCHAR(uv, flags);
507b9800 237 }
d9432125 238
ba6ed43c
KW
239 /* Test for and handle 4-byte result. In the test immediately below, the
240 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
241 * characters). The 3 is for 3 continuation bytes; these each contribute
242 * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
243 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
244 * 0x1_0000-0x1F_FFFF on ASCII */
245 if (uv < (8 * (1U << (3 * SHIFT)))) {
246 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
247 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
248 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
249 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
250
251#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
252 characters. The end-plane non-characters for EBCDIC were
253 handled just above */
254 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
255 HANDLE_UNICODE_NONCHAR(uv, flags);
d528804a 256 }
ba6ed43c
KW
257 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
258 HANDLE_UNICODE_SURROGATE(uv, flags);
259 }
260#endif
261
262 return d;
263 }
264
265 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
266 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
267 * format. The unrolled version above turns out to not save all that much
268 * time, and at these high code points (well above the legal Unicode range
269 * on ASCII platforms, and well above anything in common use in EBCDIC),
270 * khw believes that less code outweighs slight performance gains. */
271
d9432125 272 {
5aaebcb3 273 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
274 U8 *p = d+len-1;
275 while (p > d) {
4c8cd605 276 *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
277 uv >>= UTF_ACCUMULATION_SHIFT;
278 }
4c8cd605 279 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
280 return d+len;
281 }
a0ed51b3 282}
a5bf80e0 283
646ca15d 284/*
07693fe6
KW
285=for apidoc uvchr_to_utf8
286
bcb1a2d4 287Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 288of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
289C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
290the byte after the end of the new character. In other words,
07693fe6
KW
291
292 d = uvchr_to_utf8(d, uv);
293
294is the recommended wide native character-aware way of saying
295
296 *(d++) = uv;
297
760c7c2f
KW
298This function accepts any UV as input, but very high code points (above
299C<IV_MAX> on the platform) will raise a deprecation warning. This is
300typically 0x7FFF_FFFF in a 32-bit word.
301
302It is possible to forbid or warn on non-Unicode code points, or those that may
303be problematic by using L</uvchr_to_utf8_flags>.
de69f3af 304
07693fe6
KW
305=cut
306*/
307
de69f3af
KW
308/* This is also a macro */
309PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
310
07693fe6
KW
311U8 *
312Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
313{
de69f3af 314 return uvchr_to_utf8(d, uv);
07693fe6
KW
315}
316
de69f3af
KW
317/*
318=for apidoc uvchr_to_utf8_flags
319
320Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 321of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
322C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
323the byte after the end of the new character. In other words,
de69f3af
KW
324
325 d = uvchr_to_utf8_flags(d, uv, flags);
326
327or, in most cases,
328
329 d = uvchr_to_utf8_flags(d, uv, 0);
330
331This is the Unicode-aware way of saying
332
333 *(d++) = uv;
334
760c7c2f
KW
335If C<flags> is 0, this function accepts any UV as input, but very high code
336points (above C<IV_MAX> for the platform) will raise a deprecation warning.
337This is typically 0x7FFF_FFFF in a 32-bit word.
338
339Specifying C<flags> can further restrict what is allowed and not warned on, as
340follows:
de69f3af 341
796b6530 342If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
7ee537e6
KW
343the function will raise a warning, provided UTF8 warnings are enabled. If
344instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
345NULL. If both flags are set, the function will both warn and return NULL.
de69f3af 346
760c7c2f
KW
347Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
348affect how the function handles a Unicode non-character.
93e6dbd6 349
760c7c2f
KW
350And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
351affect the handling of code points that are above the Unicode maximum of
3520x10FFFF. Languages other than Perl may not be able to accept files that
353contain these.
93e6dbd6
KW
354
355The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
356the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
ecc1615f
KW
357three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
358allowed inputs to the strict UTF-8 traditionally defined by Unicode.
359Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
360C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
361above-Unicode and surrogate flags, but not the non-character ones, as
362defined in
363L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
364See L<perlunicode/Noncharacter code points>.
93e6dbd6 365
57ff5f59
KW
366Extremely high code points were never specified in any standard, and require an
367extension to UTF-8 to express, which Perl does. It is likely that programs
368written in something other than Perl would not be able to read files that
369contain these; nor would Perl understand files written by something that uses a
370different extension. For these reasons, there is a separate set of flags that
371can warn and/or disallow these extremely high code points, even if other
372above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
373and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
374L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
375treat all above-Unicode code points, including these, as malformations. (Note
376that the Unicode standard considers anything above 0x10FFFF to be illegal, but
377there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
378
379A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
380retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly,
381C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
382C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because these
383flags can apply to code points that actually do fit in 31 bits. This happens
384on EBCDIC platforms, and sometimes when the L<overlong
385malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
386describe the situation in all cases.
de69f3af 387
de69f3af
KW
388=cut
389*/
390
391/* This is also a macro */
392PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
393
07693fe6
KW
394U8 *
395Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
396{
de69f3af 397 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
398}
399
57ff5f59
KW
400#ifndef UV_IS_QUAD
401
e050c007
KW
402STATIC int
403S_is_utf8_cp_above_31_bits(const U8 * const s,
404 const U8 * const e,
405 const bool consider_overlongs)
83dc0f42
KW
406{
407 /* Returns TRUE if the first code point represented by the Perl-extended-
408 * UTF-8-encoded string starting at 's', and looking no further than 'e -
409 * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
410 *
411 * The function handles the case where the input bytes do not include all
412 * the ones necessary to represent a full character. That is, they may be
413 * the intial bytes of the representation of a code point, but possibly
414 * the final ones necessary for the complete representation may be beyond
415 * 'e - 1'.
416 *
e050c007
KW
417 * The function also can handle the case where the input is an overlong
418 * sequence. If 'consider_overlongs' is 0, the function assumes the
419 * input is not overlong, without checking, and will return based on that
420 * assumption. If this parameter is 1, the function will go to the trouble
421 * of figuring out if it actually evaluates to above or below 31 bits.
83dc0f42 422 *
e050c007 423 * The sequence is otherwise assumed to be well-formed, without checking.
83dc0f42
KW
424 */
425
e050c007
KW
426 const STRLEN len = e - s;
427 int is_overlong;
428
429 PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
430
431 assert(! UTF8_IS_INVARIANT(*s) && e > s);
432
83dc0f42
KW
433#ifdef EBCDIC
434
e050c007 435 PERL_UNUSED_ARG(consider_overlongs);
83dc0f42 436
e050c007
KW
437 /* On the EBCDIC code pages we handle, only the native start byte 0xFE can
438 * mean a 32-bit or larger code point (0xFF is an invariant). 0xFE can
439 * also be the start byte for a 31-bit code point; we need at least 2
440 * bytes, and maybe up through 8 bytes, to determine that. (It can also be
441 * the start byte for an overlong sequence, but for 30-bit or smaller code
442 * points, so we don't have to worry about overlongs on EBCDIC.) */
443 if (*s != 0xFE) {
444 return 0;
445 }
83dc0f42 446
e050c007
KW
447 if (len == 1) {
448 return -1;
449 }
83dc0f42 450
e050c007 451#else
83dc0f42 452
e050c007
KW
453 /* On ASCII, FE and FF are the only start bytes that can evaluate to
454 * needing more than 31 bits. */
455 if (LIKELY(*s < 0xFE)) {
456 return 0;
457 }
83dc0f42 458
e050c007
KW
459 /* What we have left are FE and FF. Both of these require more than 31
460 * bits unless they are for overlongs. */
461 if (! consider_overlongs) {
462 return 1;
463 }
83dc0f42 464
e050c007
KW
465 /* Here, we have FE or FF. If the input isn't overlong, it evaluates to
466 * above 31 bits. But we need more than one byte to discern this, so if
467 * passed just the start byte, it could be an overlong evaluating to
468 * smaller */
469 if (len == 1) {
470 return -1;
471 }
83dc0f42 472
e050c007
KW
473 /* Having excluded len==1, and knowing that FE and FF are both valid start
474 * bytes, we can call the function below to see if the sequence is
475 * overlong. (We don't need the full generality of the called function,
476 * but for these huge code points, speed shouldn't be a consideration, and
477 * the compiler does have enough information, since it's static to this
478 * file, to optimize to just the needed parts.) */
479 is_overlong = is_utf8_overlong_given_start_byte_ok(s, len);
83dc0f42 480
e050c007
KW
481 /* If it isn't overlong, more than 31 bits are required. */
482 if (is_overlong == 0) {
483 return 1;
484 }
83dc0f42 485
e050c007
KW
486 /* If it is indeterminate if it is overlong, return that */
487 if (is_overlong < 0) {
488 return -1;
489 }
490
491 /* Here is overlong. Such a sequence starting with FE is below 31 bits, as
492 * the max it can be is 2**31 - 1 */
493 if (*s == 0xFE) {
494 return 0;
83dc0f42
KW
495 }
496
e050c007
KW
497#endif
498
499 /* Here, ASCII and EBCDIC rejoin:
500 * On ASCII: We have an overlong sequence starting with FF
501 * On EBCDIC: We have a sequence starting with FE. */
502
503 { /* For C89, use a block so the declaration can be close to its use */
504
505#ifdef EBCDIC
506
5f995336
KW
507 /* U+7FFFFFFF (2 ** 31 - 1)
508 * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
509 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
510 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
511 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
512 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
513 * U+80000000 (2 ** 31):
514 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
515 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
516 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
517 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
e050c007
KW
518 *
519 * and since we know that *s = \xfe, any continuation sequcence
520 * following it that is gt the below is above 31 bits
521 [0] [1] [2] [3] [4] [5] [6] */
522 const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
523
524#else
525
526 /* FF overlong for U+7FFFFFFF (2 ** 31 - 1)
527 * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF
528 * FF overlong for U+80000000 (2 ** 31):
529 * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80
530 * and since we know that *s = \xff, any continuation sequcence
531 * following it that is gt the below is above 30 bits
532 [0] [1] [2] [3] [4] [5] [6] */
533 const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
5f995336 534
83dc0f42
KW
535
536#endif
e050c007
KW
537 const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
538 const STRLEN cmp_len = MIN(conts_len, len - 1);
539
540 /* Now compare the continuation bytes in s with the ones we have
541 * compiled in that are for the largest 30 bit code point. If we have
542 * enough bytes available to determine the answer, or the bytes we do
543 * have differ from them, we can compare the two to get a definitive
544 * answer (Note that in UTF-EBCDIC, the two lowest possible
545 * continuation bytes are \x41 and \x42.) */
546 if (cmp_len >= conts_len || memNE(s + 1,
547 conts_for_highest_30_bit,
548 cmp_len))
549 {
550 return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len));
551 }
83dc0f42 552
e050c007
KW
553 /* Here, all the bytes we have are the same as the highest 30-bit code
554 * point, but we are missing so many bytes that we can't make the
555 * determination */
556 return -1;
557 }
83dc0f42
KW
558}
559
57ff5f59
KW
560#endif
561
d6be65ae 562PERL_STATIC_INLINE int
12a4bed3
KW
563S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
564{
d6be65ae
KW
565 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
566 * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
567 * it isn't, and -1 if there isn't enough information to tell. This last
568 * return value can happen if the sequence is incomplete, missing some
569 * trailing bytes that would form a complete character. If there are
570 * enough bytes to make a definitive decision, this function does so.
571 * Usually 2 bytes sufficient.
572 *
573 * Overlongs can occur whenever the number of continuation bytes changes.
574 * That means whenever the number of leading 1 bits in a start byte
575 * increases from the next lower start byte. That happens for start bytes
576 * C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following illegal
577 * start bytes have already been excluded, so don't need to be tested here;
12a4bed3
KW
578 * ASCII platforms: C0, C1
579 * EBCDIC platforms C0, C1, C2, C3, C4, E0
d6be65ae 580 */
12a4bed3
KW
581
582 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
583 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
584
585 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
586 assert(len > 1 && UTF8_IS_START(*s));
587
588 /* Each platform has overlongs after the start bytes given above (expressed
589 * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
590 * the logic is the same, except the E0 overlong has already been excluded
591 * on EBCDIC platforms. The values below were found by manually
592 * inspecting the UTF-8 patterns. See the tables in utf8.h and
593 * utfebcdic.h. */
594
595# ifdef EBCDIC
596# define F0_ABOVE_OVERLONG 0xB0
597# define F8_ABOVE_OVERLONG 0xA8
598# define FC_ABOVE_OVERLONG 0xA4
599# define FE_ABOVE_OVERLONG 0xA2
600# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
601 /* I8(0xfe) is FF */
602# else
603
604 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
d6be65ae 605 return 1;
12a4bed3
KW
606 }
607
608# define F0_ABOVE_OVERLONG 0x90
609# define F8_ABOVE_OVERLONG 0x88
610# define FC_ABOVE_OVERLONG 0x84
611# define FE_ABOVE_OVERLONG 0x82
612# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
613# endif
614
615
616 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
617 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
618 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
619 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
620 {
d6be65ae 621 return 1;
12a4bed3
KW
622 }
623
b0b342d4 624 /* Check for the FF overlong */
d6be65ae 625 return isFF_OVERLONG(s, len);
b0b342d4
KW
626}
627
8d6204cc 628PERL_STATIC_INLINE int
b0b342d4
KW
629S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
630{
8d6204cc
KW
631 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
632 * 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if
633 * it isn't, and -1 if there isn't enough information to tell. This last
634 * return value can happen if the sequence is incomplete, missing some
635 * trailing bytes that would form a complete character. If there are
636 * enough bytes to make a definitive decision, this function does so. */
637
b0b342d4 638 PERL_ARGS_ASSERT_ISFF_OVERLONG;
12a4bed3 639
8d6204cc
KW
640 /* To be an FF overlong, all the available bytes must match */
641 if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
642 MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
643 {
644 return 0;
645 }
646
647 /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
648 * be there; what comes after them doesn't matter. See tables in utf8.h,
b0b342d4 649 * utfebcdic.h. */
8d6204cc
KW
650 if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
651 return 1;
652 }
12a4bed3 653
8d6204cc
KW
654 /* The missing bytes could cause the result to go one way or the other, so
655 * the result is indeterminate */
656 return -1;
12a4bed3
KW
657}
658
a77c906e
KW
659/* Anything larger than this will overflow the word if it were converted into a UV */
660#if defined(UV_IS_QUAD)
661# ifdef EBCDIC /* Actually is I8 */
662# define HIGHEST_REPRESENTABLE_UTF8 \
663 "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
664# else
665# define HIGHEST_REPRESENTABLE_UTF8 \
666 "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
667# endif
668#else /* 32-bit */
669# ifdef EBCDIC
670# define HIGHEST_REPRESENTABLE_UTF8 \
671 "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
672# else
673# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
674# endif
675#endif
676
c285bbc4 677PERL_STATIC_INLINE int
e050c007
KW
678S_does_utf8_overflow(const U8 * const s,
679 const U8 * e,
680 const bool consider_overlongs)
a77c906e 681{
c285bbc4
KW
682 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
683 * 'e' - 1 would overflow a UV on this platform; that is if it represents a
684 * code point larger than the highest representable code point. It returns
685 * 1 if it does overflow; 0 if it doesn't, and -1 if there isn't enough
686 * information to tell. This last return value can happen if the sequence
687 * is incomplete, missing some trailing bytes that would form a complete
688 * character. If there are enough bytes to make a definitive decision,
689 * this function does so.
690 *
e050c007
KW
691 * If 'consider_overlongs' is TRUE, the function checks for the possibility
692 * that the sequence is an overlong that doesn't overflow. Otherwise, it
693 * assumes the sequence is not an overlong. This can give different
694 * results only on ASCII 32-bit platforms.
695 *
c285bbc4
KW
696 * (For ASCII platforms, we could use memcmp() because we don't have to
697 * convert each byte to I8, but it's very rare input indeed that would
698 * approach overflow, so the loop below will likely only get executed once.)
699 *
700 * 'e' - 1 must not be beyond a full character. */
a77c906e
KW
701
702 const STRLEN len = e - s;
c285bbc4
KW
703 const U8 *x;
704 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
a77c906e
KW
705
706 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
707 assert(s <= e && s + UTF8SKIP(s) >= e);
708
709#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
710
711 /* On 32 bit ASCII machines, many overlongs that start with FF don't
712 * overflow */
e050c007 713 if (consider_overlongs && isFF_OVERLONG(s, len) > 0) {
c285bbc4
KW
714
715 /* To be such an overlong, the first bytes of 's' must match
716 * FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80". If we
717 * don't have any additional bytes available, the sequence, when
718 * completed might or might not fit in 32 bits. But if we have that
719 * next byte, we can tell for sure. If it is <= 0x83, then it does
720 * fit. */
721 if (len <= sizeof(FF_OVERLONG_PREFIX) - 1) {
722 return -1;
723 }
724
725 return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83;
a77c906e
KW
726 }
727
e050c007
KW
728#else
729
730 PERL_UNUSED_ARG(consider_overlongs);
731
a77c906e
KW
732#endif
733
734 for (x = s; x < e; x++, y++) {
735
736 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
737 continue;
738 }
739
740 /* If this byte is larger than the corresponding highest UTF-8 byte,
741 * the sequence overflow; otherwise the byte is less than, and so the
742 * sequence doesn't overflow */
743 return NATIVE_UTF8_TO_I8(*x) > *y;
744
745 }
746
747 /* Got to the end and all bytes are the same. If the input is a whole
748 * character, it doesn't overflow. And if it is a partial character,
c285bbc4
KW
749 * there's not enough information to tell */
750 if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
751 return -1;
752 }
753
754 return 0;
a77c906e
KW
755}
756
12a4bed3
KW
757#undef F0_ABOVE_OVERLONG
758#undef F8_ABOVE_OVERLONG
759#undef FC_ABOVE_OVERLONG
760#undef FE_ABOVE_OVERLONG
761#undef FF_OVERLONG_PREFIX
762
35f8c9bd 763STRLEN
edc2c47a 764Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
35f8c9bd 765{
2b479609 766 STRLEN len;
12a4bed3 767 const U8 *x;
35f8c9bd 768
2b479609
KW
769 /* A helper function that should not be called directly.
770 *
771 * This function returns non-zero if the string beginning at 's' and
772 * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
773 * code point; otherwise it returns 0. The examination stops after the
774 * first code point in 's' is validated, not looking at the rest of the
775 * input. If 'e' is such that there are not enough bytes to represent a
776 * complete code point, this function will return non-zero anyway, if the
777 * bytes it does have are well-formed UTF-8 as far as they go, and aren't
778 * excluded by 'flags'.
779 *
780 * A non-zero return gives the number of bytes required to represent the
781 * code point. Be aware that if the input is for a partial character, the
782 * return will be larger than 'e - s'.
783 *
784 * This function assumes that the code point represented is UTF-8 variant.
56576a04
KW
785 * The caller should have excluded the possibility of it being invariant
786 * before calling this function.
2b479609
KW
787 *
788 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
789 * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
790 * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
791 * disallowed by the flags. If the input is only for a partial character,
792 * the function will return non-zero if there is any sequence of
793 * well-formed UTF-8 that, when appended to the input sequence, could
794 * result in an allowed code point; otherwise it returns 0. Non characters
795 * cannot be determined based on partial character input. But many of the
796 * other excluded types can be determined with just the first one or two
797 * bytes.
798 *
799 */
800
801 PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
802
803 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 804 |UTF8_DISALLOW_PERL_EXTENDED)));
2b479609 805 assert(! UTF8_IS_INVARIANT(*s));
35f8c9bd 806
2b479609 807 /* A variant char must begin with a start byte */
35f8c9bd
KW
808 if (UNLIKELY(! UTF8_IS_START(*s))) {
809 return 0;
810 }
811
edc2c47a
KW
812 /* Examine a maximum of a single whole code point */
813 if (e - s > UTF8SKIP(s)) {
814 e = s + UTF8SKIP(s);
815 }
816
2b479609
KW
817 len = e - s;
818
819 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
820 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
35f8c9bd 821
56576a04
KW
822 /* Here, we are disallowing some set of largish code points, and the
823 * first byte indicates the sequence is for a code point that could be
824 * in the excluded set. We generally don't have to look beyond this or
825 * the second byte to see if the sequence is actually for one of the
826 * excluded classes. The code below is derived from this table:
827 *
2b479609
KW
828 * UTF-8 UTF-EBCDIC I8
829 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
830 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
831 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
832 *
56576a04
KW
833 * Keep in mind that legal continuation bytes range between \x80..\xBF
834 * for UTF-8, and \xA0..\xBF for I8. Anything above those aren't
835 * continuation bytes. Hence, we don't have to test the upper edge
836 * because if any of those is encountered, the sequence is malformed,
837 * and would fail elsewhere in this function.
838 *
839 * The code here likewise assumes that there aren't other
840 * malformations; again the function should fail elsewhere because of
841 * these. For example, an overlong beginning with FC doesn't actually
842 * have to be a super; it could actually represent a small code point,
843 * even U+0000. But, since overlongs (and other malformations) are
844 * illegal, the function should return FALSE in either case.
2b479609
KW
845 */
846
847#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
848# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
19794540 849# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
2b479609 850
19794540
KW
851# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
852 /* B6 and B7 */ \
853 && ((s1) & 0xFE ) == 0xB6)
57ff5f59 854# define isUTF8_PERL_EXTENDED(s) (*s == I8_TO_NATIVE_UTF8(0xFF))
2b479609
KW
855#else
856# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
19794540
KW
857# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
858# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
57ff5f59 859# define isUTF8_PERL_EXTENDED(s) (*s >= 0xFE)
2b479609
KW
860#endif
861
862 if ( (flags & UTF8_DISALLOW_SUPER)
ddb65933
KW
863 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
864 {
2b479609
KW
865 return 0; /* Above Unicode */
866 }
867
d044b7a7 868 if ( (flags & UTF8_DISALLOW_PERL_EXTENDED)
57ff5f59 869 && UNLIKELY(isUTF8_PERL_EXTENDED(s)))
2b479609 870 {
57ff5f59 871 return 0;
2b479609
KW
872 }
873
874 if (len > 1) {
875 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
876
877 if ( (flags & UTF8_DISALLOW_SUPER)
19794540 878 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
2b479609
KW
879 {
880 return 0; /* Above Unicode */
881 }
882
883 if ( (flags & UTF8_DISALLOW_SURROGATE)
19794540 884 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
2b479609
KW
885 {
886 return 0; /* Surrogate */
887 }
888
889 if ( (flags & UTF8_DISALLOW_NONCHAR)
890 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
891 {
892 return 0; /* Noncharacter code point */
893 }
894 }
895 }
896
897 /* Make sure that all that follows are continuation bytes */
35f8c9bd
KW
898 for (x = s + 1; x < e; x++) {
899 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
900 return 0;
901 }
902 }
903
af13dd8a 904 /* Here is syntactically valid. Next, make sure this isn't the start of an
12a4bed3 905 * overlong. */
d6be65ae 906 if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
12a4bed3 907 return 0;
af13dd8a
KW
908 }
909
12a4bed3
KW
910 /* And finally, that the code point represented fits in a word on this
911 * platform */
e050c007
KW
912 if (0 < does_utf8_overflow(s, e,
913 0 /* Don't consider overlongs */
914 ))
915 {
12a4bed3 916 return 0;
35f8c9bd
KW
917 }
918
2b479609 919 return UTF8SKIP(s);
35f8c9bd
KW
920}
921
7e2f38b2
KW
922char *
923Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
7cf8d05d
KW
924{
925 /* Returns a mortalized C string that is a displayable copy of the 'len'
7e2f38b2
KW
926 * bytes starting at 's'. 'format' gives how to display each byte.
927 * Currently, there are only two formats, so it is currently a bool:
928 * 0 \xab
929 * 1 ab (that is a space between two hex digit bytes)
930 */
7cf8d05d
KW
931
932 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
933 trailing NUL */
934 const U8 * const e = s + len;
935 char * output;
936 char * d;
937
938 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
939
940 Newx(output, output_len, char);
941 SAVEFREEPV(output);
942
943 d = output;
944 for (; s < e; s++) {
945 const unsigned high_nibble = (*s & 0xF0) >> 4;
946 const unsigned low_nibble = (*s & 0x0F);
947
7e2f38b2
KW
948 if (format) {
949 *d++ = ' ';
950 }
951 else {
952 *d++ = '\\';
953 *d++ = 'x';
954 }
7cf8d05d
KW
955
956 if (high_nibble < 10) {
957 *d++ = high_nibble + '0';
958 }
959 else {
960 *d++ = high_nibble - 10 + 'a';
961 }
962
963 if (low_nibble < 10) {
964 *d++ = low_nibble + '0';
965 }
966 else {
967 *d++ = low_nibble - 10 + 'a';
968 }
969 }
970
971 *d = '\0';
972 return output;
973}
974
806547a7 975PERL_STATIC_INLINE char *
7cf8d05d
KW
976S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
977
978 /* How many bytes to print */
3cc6a05e 979 STRLEN print_len,
7cf8d05d
KW
980
981 /* Which one is the non-continuation */
982 const STRLEN non_cont_byte_pos,
983
984 /* How many bytes should there be? */
985 const STRLEN expect_len)
806547a7
KW
986{
987 /* Return the malformation warning text for an unexpected continuation
988 * byte. */
989
7cf8d05d 990 const char * const where = (non_cont_byte_pos == 1)
806547a7 991 ? "immediately"
7cf8d05d
KW
992 : Perl_form(aTHX_ "%d bytes",
993 (int) non_cont_byte_pos);
806547a7
KW
994
995 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
996
7cf8d05d
KW
997 /* We don't need to pass this parameter, but since it has already been
998 * calculated, it's likely faster to pass it; verify under DEBUGGING */
999 assert(expect_len == UTF8SKIP(s));
1000
1001 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
1002 " %s after start byte 0x%02x; need %d bytes, got %d)",
1003 malformed_text,
7e2f38b2 1004 _byte_dump_string(s, print_len, 0),
7cf8d05d
KW
1005 *(s + non_cont_byte_pos),
1006 where,
1007 *s,
1008 (int) expect_len,
1009 (int) non_cont_byte_pos);
806547a7
KW
1010}
1011
35f8c9bd
KW
1012/*
1013
de69f3af 1014=for apidoc utf8n_to_uvchr
378516de
KW
1015
1016THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af 1017Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
67e989fb 1018
9041c2e3 1019Bottom level UTF-8 decode routine.
de69f3af 1020Returns the native code point value of the first character in the string C<s>,
746afd53
KW
1021which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
1022C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
1023the length, in bytes, of that character.
949cf498
KW
1024
1025The value of C<flags> determines the behavior when C<s> does not point to a
2b5e7bc2
KW
1026well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
1027causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
1028is the next possible position in C<s> that could begin a non-malformed
1029character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
1030is raised. Some UTF-8 input sequences may contain multiple malformations.
1031This function tries to find every possible one in each call, so multiple
56576a04 1032warnings can be raised for the same sequence.
949cf498
KW
1033
1034Various ALLOW flags can be set in C<flags> to allow (and not warn on)
1035individual types of malformations, such as the sequence being overlong (that
1036is, when there is a shorter sequence that can express the same code point;
1037overlong sequences are expressly forbidden in the UTF-8 standard due to
1038potential security issues). Another malformation example is the first byte of
1039a character not being a legal first byte. See F<utf8.h> for the list of such
94953955
KW
1040flags. Even if allowed, this function generally returns the Unicode
1041REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
1042F<utf8.h> to override this behavior for the overlong malformations, but don't
1043do that except for very specialized purposes.
949cf498 1044
796b6530 1045The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
1046flags) malformation is found. If this flag is set, the routine assumes that
1047the caller will raise a warning, and this function will silently just set
d088425d
KW
1048C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
1049
75200dff 1050Note that this API requires disambiguation between successful decoding a C<NUL>
796b6530 1051character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
111fa700
KW
1052in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
1053be set to 1. To disambiguate, upon a zero return, see if the first byte of
1054C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
f9380377 1055error. Or you can use C<L</utf8n_to_uvchr_error>>.
949cf498
KW
1056
1057Certain code points are considered problematic. These are Unicode surrogates,
746afd53 1058Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 1059By default these are considered regular code points, but certain situations
ecc1615f
KW
1060warrant special handling for them, which can be specified using the C<flags>
1061parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
1062three classes are treated as malformations and handled as such. The flags
1063C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
1064C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
1065disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
1066restricts the allowed inputs to the strict UTF-8 traditionally defined by
1067Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
1068definition given by
1069L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1070The difference between traditional strictness and C9 strictness is that the
1071latter does not forbid non-character code points. (They are still discouraged,
1072however.) For more discussion see L<perlunicode/Noncharacter code points>.
1073
1074The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
1075C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
796b6530
KW
1076C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
1077raised for their respective categories, but otherwise the code points are
1078considered valid (not malformations). To get a category to both be treated as
1079a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 1080(But note that warnings are not raised if lexically disabled nor if
796b6530 1081C<UTF8_CHECK_ONLY> is also specified.)
949cf498 1082
57ff5f59
KW
1083Extremely high code points were never specified in any standard, and require an
1084extension to UTF-8 to express, which Perl does. It is likely that programs
1085written in something other than Perl would not be able to read files that
1086contain these; nor would Perl understand files written by something that uses a
1087different extension. For these reasons, there is a separate set of flags that
1088can warn and/or disallow these extremely high code points, even if other
1089above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
1090C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
1091L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
1092above-Unicode code points, including these, as malformations.
1093(Note that the Unicode standard considers anything above 0x10FFFF to be
1094illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
1095(2**31 -1))
1096
1097A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
1098retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly,
1099C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
1100C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags
1101can apply to code points that actually do fit in 31 bits. This happens on
1102EBCDIC platforms, and sometimes when the L<overlong
1103malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
1104describe the situation in all cases.
1105
760c7c2f
KW
1106It is now deprecated to have very high code points (above C<IV_MAX> on the
1107platforms) and this function will raise a deprecation warning for these (unless
d5944cab 1108such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1)
760c7c2f 1109in a 32-bit word.
ab8e6d41 1110
949cf498
KW
1111All other code points corresponding to Unicode characters, including private
1112use and those yet to be assigned, are never considered malformed and never
1113warn.
67e989fb 1114
37607a96 1115=cut
f9380377
KW
1116
1117Also implemented as a macro in utf8.h
1118*/
1119
1120UV
1121Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
1122 STRLEN curlen,
1123 STRLEN *retlen,
1124 const U32 flags)
1125{
1126 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
1127
1128 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
1129}
1130
1131/*
1132
1133=for apidoc utf8n_to_uvchr_error
1134
1135THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1136Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
1137
1138This function is for code that needs to know what the precise malformation(s)
1139are when an error is found.
1140
1141It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1142all the others, C<errors>. If this parameter is 0, this function behaves
1143identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
1144to a C<U32> variable, which this function sets to indicate any errors found.
1145Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
1146C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
1147of these bits will be set if a malformation is found, even if the input
7a65503b 1148C<flags> parameter indicates that the given malformation is allowed; those
f9380377
KW
1149exceptions are noted:
1150
1151=over 4
1152
57ff5f59 1153=item C<UTF8_GOT_PERL_EXTENDED>
f9380377 1154
57ff5f59
KW
1155The input sequence is not standard UTF-8, but a Perl extension. This bit is
1156set only if the input C<flags> parameter contains either the
1157C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
1158
1159Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
1160and so some extension must be used to express them. Perl uses a natural
1161extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
1162extension to represent even higher ones, so that any code point that fits in a
116364-bit word can be represented. Text using these extensions is not likely to
1164be portable to non-Perl code. We lump both of these extensions together and
1165refer to them as Perl extended UTF-8. There exist other extensions that people
1166have invented, incompatible with Perl's.
1167
1168On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
1169extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
1170than on ASCII. Prior to that, code points 2**31 and higher were simply
1171unrepresentable, and a different, incompatible method was used to represent
1172code points between 2**30 and 2**31 - 1.
1173
1174On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
1175Perl extended UTF-8 is used.
1176
1177In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
1178may use for backward compatibility. That name is misleading, as this flag may
1179be set when the code point actually does fit in 31 bits. This happens on
1180EBCDIC platforms, and sometimes when the L<overlong
1181malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately
1182describes the situation in all cases.
f9380377
KW
1183
1184=item C<UTF8_GOT_CONTINUATION>
1185
1186The input sequence was malformed in that the first byte was a a UTF-8
1187continuation byte.
1188
1189=item C<UTF8_GOT_EMPTY>
1190
1191The input C<curlen> parameter was 0.
1192
1193=item C<UTF8_GOT_LONG>
1194
1195The input sequence was malformed in that there is some other sequence that
1196evaluates to the same code point, but that sequence is shorter than this one.
1197
fecaf136
KW
1198Until Unicode 3.1, it was legal for programs to accept this malformation, but
1199it was discovered that this created security issues.
1200
f9380377
KW
1201=item C<UTF8_GOT_NONCHAR>
1202
1203The code point represented by the input UTF-8 sequence is for a Unicode
1204non-character code point.
1205This bit is set only if the input C<flags> parameter contains either the
1206C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1207
1208=item C<UTF8_GOT_NON_CONTINUATION>
1209
1210The input sequence was malformed in that a non-continuation type byte was found
1211in a position where only a continuation type one should be.
1212
1213=item C<UTF8_GOT_OVERFLOW>
1214
1215The input sequence was malformed in that it is for a code point that is not
1216representable in the number of bits available in a UV on the current platform.
1217
1218=item C<UTF8_GOT_SHORT>
1219
1220The input sequence was malformed in that C<curlen> is smaller than required for
1221a complete sequence. In other words, the input is for a partial character
1222sequence.
1223
1224=item C<UTF8_GOT_SUPER>
1225
1226The input sequence was malformed in that it is for a non-Unicode code point;
1227that is, one above the legal Unicode maximum.
1228This bit is set only if the input C<flags> parameter contains either the
1229C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1230
1231=item C<UTF8_GOT_SURROGATE>
1232
1233The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
1234code point.
1235This bit is set only if the input C<flags> parameter contains either the
1236C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1237
1238=back
1239
133551d8
KW
1240To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1241flag to suppress any warnings, and then examine the C<*errors> return.
1242
f9380377 1243=cut
37607a96 1244*/
67e989fb 1245
a0ed51b3 1246UV
f9380377
KW
1247Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
1248 STRLEN curlen,
1249 STRLEN *retlen,
1250 const U32 flags,
1251 U32 * errors)
a0ed51b3 1252{
d4c19fe8 1253 const U8 * const s0 = s;
2b5e7bc2
KW
1254 U8 * send = NULL; /* (initialized to silence compilers' wrong
1255 warning) */
1256 U32 possible_problems = 0; /* A bit is set here for each potential problem
1257 found as we go along */
eb83ed87 1258 UV uv = *s;
2b5e7bc2
KW
1259 STRLEN expectlen = 0; /* How long should this sequence be?
1260 (initialized to silence compilers' wrong
1261 warning) */
e308b348 1262 STRLEN avail_len = 0; /* When input is too short, gives what that is */
f9380377
KW
1263 U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
1264 this gets set and discarded */
a0dbb045 1265
2b5e7bc2
KW
1266 /* The below are used only if there is both an overlong malformation and a
1267 * too short one. Otherwise the first two are set to 's0' and 'send', and
1268 * the third not used at all */
1269 U8 * adjusted_s0 = (U8 *) s0;
e9f2c446
KW
1270 U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1271 routine; see [perl #130921] */
2b5e7bc2 1272 UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
7918f24d 1273
f9380377
KW
1274 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1275
1276 if (errors) {
1277 *errors = 0;
1278 }
1279 else {
1280 errors = &discard_errors;
1281 }
a0dbb045 1282
eb83ed87
KW
1283 /* The order of malformation tests here is important. We should consume as
1284 * few bytes as possible in order to not skip any valid character. This is
1285 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1286 * http://unicode.org/reports/tr36 for more discussion as to why. For
1287 * example, once we've done a UTF8SKIP, we can tell the expected number of
1288 * bytes, and could fail right off the bat if the input parameters indicate
1289 * that there are too few available. But it could be that just that first
1290 * byte is garbled, and the intended character occupies fewer bytes. If we
1291 * blindly assumed that the first byte is correct, and skipped based on
1292 * that number, we could skip over a valid input character. So instead, we
1293 * always examine the sequence byte-by-byte.
1294 *
1295 * We also should not consume too few bytes, otherwise someone could inject
1296 * things. For example, an input could be deliberately designed to
1297 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 1298 * returning to the caller C<*retlen> pointing to the very next byte (one
eb83ed87
KW
1299 * which is actually part of of the overflowing sequence), that could look
1300 * legitimate to the caller, which could discard the initial partial
2b5e7bc2
KW
1301 * sequence and process the rest, inappropriately.
1302 *
1303 * Some possible input sequences are malformed in more than one way. This
1304 * function goes to lengths to try to find all of them. This is necessary
1305 * for correctness, as the inputs may allow one malformation but not
1306 * another, and if we abandon searching for others after finding the
1307 * allowed one, we could allow in something that shouldn't have been.
1308 */
eb83ed87 1309
b5b9af04 1310 if (UNLIKELY(curlen == 0)) {
2b5e7bc2
KW
1311 possible_problems |= UTF8_GOT_EMPTY;
1312 curlen = 0;
5a48568d 1313 uv = UNICODE_REPLACEMENT;
2b5e7bc2 1314 goto ready_to_handle_errors;
0c443dc2
JH
1315 }
1316
eb83ed87
KW
1317 expectlen = UTF8SKIP(s);
1318
1319 /* A well-formed UTF-8 character, as the vast majority of calls to this
1320 * function will be for, has this expected length. For efficiency, set
1321 * things up here to return it. It will be overriden only in those rare
1322 * cases where a malformation is found */
1323 if (retlen) {
1324 *retlen = expectlen;
1325 }
1326
1327 /* An invariant is trivially well-formed */
1d72bdf6 1328 if (UTF8_IS_INVARIANT(uv)) {
de69f3af 1329 return uv;
a0ed51b3 1330 }
67e989fb 1331
eb83ed87 1332 /* A continuation character can't start a valid sequence */
b5b9af04 1333 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
2b5e7bc2
KW
1334 possible_problems |= UTF8_GOT_CONTINUATION;
1335 curlen = 1;
1336 uv = UNICODE_REPLACEMENT;
1337 goto ready_to_handle_errors;
ba210ebe 1338 }
9041c2e3 1339
dcd27b3c 1340 /* Here is not a continuation byte, nor an invariant. The only thing left
ddb65933
KW
1341 * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
1342 * because it excludes start bytes like \xC0 that always lead to
1343 * overlongs.) */
dcd27b3c 1344
534752c1
KW
1345 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1346 * that indicate the number of bytes in the character's whole UTF-8
1347 * sequence, leaving just the bits that are part of the value. */
1348 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
ba210ebe 1349
e308b348
KW
1350 /* Setup the loop end point, making sure to not look past the end of the
1351 * input string, and flag it as too short if the size isn't big enough. */
1352 send = (U8*) s0;
1353 if (UNLIKELY(curlen < expectlen)) {
1354 possible_problems |= UTF8_GOT_SHORT;
1355 avail_len = curlen;
1356 send += curlen;
1357 }
1358 else {
1359 send += expectlen;
1360 }
e308b348 1361
eb83ed87 1362 /* Now, loop through the remaining bytes in the character's sequence,
e308b348 1363 * accumulating each into the working value as we go. */
eb83ed87 1364 for (s = s0 + 1; s < send; s++) {
b5b9af04 1365 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
8850bf83 1366 uv = UTF8_ACCUMULATE(uv, *s);
2b5e7bc2
KW
1367 continue;
1368 }
1369
1370 /* Here, found a non-continuation before processing all expected bytes.
1371 * This byte indicates the beginning of a new character, so quit, even
1372 * if allowing this malformation. */
2b5e7bc2 1373 possible_problems |= UTF8_GOT_NON_CONTINUATION;
e308b348 1374 break;
eb83ed87
KW
1375 } /* End of loop through the character's bytes */
1376
1377 /* Save how many bytes were actually in the character */
1378 curlen = s - s0;
1379
2b5e7bc2
KW
1380 /* Note that there are two types of too-short malformation. One is when
1381 * there is actual wrong data before the normal termination of the
1382 * sequence. The other is that the sequence wasn't complete before the end
1383 * of the data we are allowed to look at, based on the input 'curlen'.
1384 * This means that we were passed data for a partial character, but it is
1385 * valid as far as we saw. The other is definitely invalid. This
1386 * distinction could be important to a caller, so the two types are kept
15b010f0
KW
1387 * separate.
1388 *
1389 * A convenience macro that matches either of the too-short conditions. */
1390# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1391
1392 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1393 uv_so_far = uv;
1394 uv = UNICODE_REPLACEMENT;
1395 }
2b5e7bc2 1396
08e73697
KW
1397 /* Check for overflow. The algorithm requires us to not look past the end
1398 * of the current character, even if partial, so the upper limit is 's' */
e050c007
KW
1399 if (UNLIKELY(0 < does_utf8_overflow(s0, s,
1400 1 /* Do consider overlongs */
1401 )))
1402 {
2b5e7bc2
KW
1403 possible_problems |= UTF8_GOT_OVERFLOW;
1404 uv = UNICODE_REPLACEMENT;
eb83ed87 1405 }
eb83ed87 1406
2b5e7bc2
KW
1407 /* Check for overlong. If no problems so far, 'uv' is the correct code
1408 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1409 * we must look at the UTF-8 byte sequence itself to see if it is for an
1410 * overlong */
1411 if ( ( LIKELY(! possible_problems)
1412 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
56576a04 1413 || ( UNLIKELY(possible_problems)
2b5e7bc2
KW
1414 && ( UNLIKELY(! UTF8_IS_START(*s0))
1415 || ( curlen > 1
d6be65ae 1416 && UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0,
08e73697 1417 s - s0))))))
2f8f112e 1418 {
2b5e7bc2
KW
1419 possible_problems |= UTF8_GOT_LONG;
1420
abc28b54 1421 if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
56576a04 1422
abc28b54
KW
1423 /* The calculation in the 'true' branch of this 'if'
1424 * below won't work if overflows, and isn't needed
1425 * anyway. Further below we handle all overflow
1426 * cases */
1427 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1428 {
2b5e7bc2
KW
1429 UV min_uv = uv_so_far;
1430 STRLEN i;
1431
1432 /* Here, the input is both overlong and is missing some trailing
1433 * bytes. There is no single code point it could be for, but there
1434 * may be enough information present to determine if what we have
1435 * so far is for an unallowed code point, such as for a surrogate.
56576a04
KW
1436 * The code further below has the intelligence to determine this,
1437 * but just for non-overlong UTF-8 sequences. What we do here is
1438 * calculate the smallest code point the input could represent if
1439 * there were no too short malformation. Then we compute and save
1440 * the UTF-8 for that, which is what the code below looks at
1441 * instead of the raw input. It turns out that the smallest such
1442 * code point is all we need. */
2b5e7bc2
KW
1443 for (i = curlen; i < expectlen; i++) {
1444 min_uv = UTF8_ACCUMULATE(min_uv,
1445 I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
1446 }
1447
e9f2c446 1448 adjusted_s0 = temp_char_buf;
57ff5f59 1449 (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
2b5e7bc2 1450 }
eb83ed87
KW
1451 }
1452
56576a04
KW
1453 /* Here, we have found all the possible problems, except for when the input
1454 * is for a problematic code point not allowed by the input parameters. */
1455
06188866
KW
1456 /* uv is valid for overlongs */
1457 if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
1458
1459 /* isn't problematic if < this */
1460 && uv >= UNICODE_SURROGATE_FIRST)
2b5e7bc2 1461 || ( UNLIKELY(possible_problems)
d60baaa7
KW
1462
1463 /* if overflow, we know without looking further
1464 * precisely which of the problematic types it is,
1465 * and we deal with those in the overflow handling
1466 * code */
1467 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
57ff5f59
KW
1468 && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
1469 || UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
760c7c2f
KW
1470 && ((flags & ( UTF8_DISALLOW_NONCHAR
1471 |UTF8_DISALLOW_SURROGATE
1472 |UTF8_DISALLOW_SUPER
d044b7a7 1473 |UTF8_DISALLOW_PERL_EXTENDED
760c7c2f
KW
1474 |UTF8_WARN_NONCHAR
1475 |UTF8_WARN_SURROGATE
1476 |UTF8_WARN_SUPER
d044b7a7 1477 |UTF8_WARN_PERL_EXTENDED))
2b5e7bc2
KW
1478 /* In case of a malformation, 'uv' is not valid, and has
1479 * been changed to something in the Unicode range.
1480 * Currently we don't output a deprecation message if there
1481 * is already a malformation, so we don't have to special
1482 * case the test immediately below */
760c7c2f
KW
1483 || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1484 && ckWARN_d(WARN_DEPRECATED))))
eb83ed87 1485 {
2b5e7bc2
KW
1486 /* If there were no malformations, or the only malformation is an
1487 * overlong, 'uv' is valid */
1488 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1489 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1490 possible_problems |= UTF8_GOT_SURROGATE;
1491 }
1492 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
1493 possible_problems |= UTF8_GOT_SUPER;
1494 }
1495 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1496 possible_problems |= UTF8_GOT_NONCHAR;
1497 }
1498 }
1499 else { /* Otherwise, need to look at the source UTF-8, possibly
1500 adjusted to be non-overlong */
1501
1502 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1503 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
ea5ced44 1504 {
2b5e7bc2
KW
1505 possible_problems |= UTF8_GOT_SUPER;
1506 }
1507 else if (curlen > 1) {
1508 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
1509 NATIVE_UTF8_TO_I8(*adjusted_s0),
1510 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
ea5ced44 1511 {
2b5e7bc2 1512 possible_problems |= UTF8_GOT_SUPER;
ea5ced44 1513 }
2b5e7bc2
KW
1514 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
1515 NATIVE_UTF8_TO_I8(*adjusted_s0),
1516 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1517 {
1518 possible_problems |= UTF8_GOT_SURROGATE;
ea5ced44
KW
1519 }
1520 }
c0236afe 1521
2b5e7bc2
KW
1522 /* We need a complete well-formed UTF-8 character to discern
1523 * non-characters, so can't look for them here */
1524 }
1525 }
949cf498 1526
2b5e7bc2
KW
1527 ready_to_handle_errors:
1528
1529 /* At this point:
1530 * curlen contains the number of bytes in the sequence that
1531 * this call should advance the input by.
e308b348
KW
1532 * avail_len gives the available number of bytes passed in, but
1533 * only if this is less than the expected number of
1534 * bytes, based on the code point's start byte.
2b5e7bc2
KW
1535 * possible_problems' is 0 if there weren't any problems; otherwise a bit
1536 * is set in it for each potential problem found.
1537 * uv contains the code point the input sequence
1538 * represents; or if there is a problem that prevents
1539 * a well-defined value from being computed, it is
1540 * some subsitute value, typically the REPLACEMENT
1541 * CHARACTER.
1542 * s0 points to the first byte of the character
56576a04
KW
1543 * s points to just after were we left off processing
1544 * the character
1545 * send points to just after where that character should
1546 * end, based on how many bytes the start byte tells
1547 * us should be in it, but no further than s0 +
1548 * avail_len
2b5e7bc2 1549 */
eb83ed87 1550
2b5e7bc2
KW
1551 if (UNLIKELY(possible_problems)) {
1552 bool disallowed = FALSE;
1553 const U32 orig_problems = possible_problems;
1554
1555 while (possible_problems) { /* Handle each possible problem */
1556 UV pack_warn = 0;
1557 char * message = NULL;
1558
1559 /* Each 'if' clause handles one problem. They are ordered so that
1560 * the first ones' messages will be displayed before the later
6c64cd9d
KW
1561 * ones; this is kinda in decreasing severity order. But the
1562 * overlong must come last, as it changes 'uv' looked at by the
1563 * others */
2b5e7bc2
KW
1564 if (possible_problems & UTF8_GOT_OVERFLOW) {
1565
56576a04
KW
1566 /* Overflow means also got a super and are using Perl's
1567 * extended UTF-8, but we handle all three cases here */
2b5e7bc2 1568 possible_problems
d044b7a7 1569 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
f9380377
KW
1570 *errors |= UTF8_GOT_OVERFLOW;
1571
1572 /* But the API says we flag all errors found */
1573 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1574 *errors |= UTF8_GOT_SUPER;
1575 }
ddb65933 1576 if (flags
d044b7a7 1577 & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
ddb65933 1578 {
d044b7a7 1579 *errors |= UTF8_GOT_PERL_EXTENDED;
f9380377 1580 }
2b5e7bc2 1581
d60baaa7 1582 /* Disallow if any of the three categories say to */
56576a04 1583 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
d60baaa7 1584 || (flags & ( UTF8_DISALLOW_SUPER
d044b7a7 1585 |UTF8_DISALLOW_PERL_EXTENDED)))
d60baaa7
KW
1586 {
1587 disallowed = TRUE;
1588 }
1589
d60baaa7
KW
1590 /* Likewise, warn if any say to, plus if deprecation warnings
1591 * are on, because this code point is above IV_MAX */
56576a04 1592 if ( ckWARN_d(WARN_DEPRECATED)
d60baaa7 1593 || ! (flags & UTF8_ALLOW_OVERFLOW)
d044b7a7 1594 || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
d60baaa7 1595 {
2b5e7bc2 1596
ddb65933
KW
1597 /* The warnings code explicitly says it doesn't handle the
1598 * case of packWARN2 and two categories which have
1599 * parent-child relationship. Even if it works now to
1600 * raise the warning if either is enabled, it wouldn't
1601 * necessarily do so in the future. We output (only) the
56576a04 1602 * most dire warning */
ddb65933
KW
1603 if (! (flags & UTF8_CHECK_ONLY)) {
1604 if (ckWARN_d(WARN_UTF8)) {
1605 pack_warn = packWARN(WARN_UTF8);
1606 }
1607 else if (ckWARN_d(WARN_NON_UNICODE)) {
1608 pack_warn = packWARN(WARN_NON_UNICODE);
1609 }
1610 if (pack_warn) {
1611 message = Perl_form(aTHX_ "%s: %s (overflows)",
1612 malformed_text,
05b9033b 1613 _byte_dump_string(s0, curlen, 0));
ddb65933 1614 }
2b5e7bc2
KW
1615 }
1616 }
1617 }
1618 else if (possible_problems & UTF8_GOT_EMPTY) {
1619 possible_problems &= ~UTF8_GOT_EMPTY;
f9380377 1620 *errors |= UTF8_GOT_EMPTY;
2b5e7bc2
KW
1621
1622 if (! (flags & UTF8_ALLOW_EMPTY)) {
d1f8d421
KW
1623
1624 /* This so-called malformation is now treated as a bug in
1625 * the caller. If you have nothing to decode, skip calling
1626 * this function */
1627 assert(0);
1628
2b5e7bc2
KW
1629 disallowed = TRUE;
1630 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1631 pack_warn = packWARN(WARN_UTF8);
1632 message = Perl_form(aTHX_ "%s (empty string)",
1633 malformed_text);
1634 }
1635 }
1636 }
1637 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1638 possible_problems &= ~UTF8_GOT_CONTINUATION;
f9380377 1639 *errors |= UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1640
1641 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1642 disallowed = TRUE;
1643 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1644 pack_warn = packWARN(WARN_UTF8);
1645 message = Perl_form(aTHX_
1646 "%s: %s (unexpected continuation byte 0x%02x,"
1647 " with no preceding start byte)",
1648 malformed_text,
7e2f38b2 1649 _byte_dump_string(s0, 1, 0), *s0);
2b5e7bc2
KW
1650 }
1651 }
1652 }
2b5e7bc2
KW
1653 else if (possible_problems & UTF8_GOT_SHORT) {
1654 possible_problems &= ~UTF8_GOT_SHORT;
f9380377 1655 *errors |= UTF8_GOT_SHORT;
2b5e7bc2
KW
1656
1657 if (! (flags & UTF8_ALLOW_SHORT)) {
1658 disallowed = TRUE;
1659 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1660 pack_warn = packWARN(WARN_UTF8);
1661 message = Perl_form(aTHX_
56576a04
KW
1662 "%s: %s (too short; %d byte%s available, need %d)",
1663 malformed_text,
1664 _byte_dump_string(s0, send - s0, 0),
1665 (int)avail_len,
1666 avail_len == 1 ? "" : "s",
1667 (int)expectlen);
2b5e7bc2
KW
1668 }
1669 }
ba210ebe 1670
2b5e7bc2 1671 }
e308b348
KW
1672 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1673 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1674 *errors |= UTF8_GOT_NON_CONTINUATION;
1675
1676 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1677 disallowed = TRUE;
1678 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
99a765e9
KW
1679
1680 /* If we don't know for sure that the input length is
1681 * valid, avoid as much as possible reading past the
1682 * end of the buffer */
1683 int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
1684 ? s - s0
1685 : send - s0;
e308b348
KW
1686 pack_warn = packWARN(WARN_UTF8);
1687 message = Perl_form(aTHX_ "%s",
1688 unexpected_non_continuation_text(s0,
99a765e9 1689 printlen,
e308b348
KW
1690 s - s0,
1691 (int) expectlen));
1692 }
1693 }
1694 }
2b5e7bc2
KW
1695 else if (possible_problems & UTF8_GOT_SURROGATE) {
1696 possible_problems &= ~UTF8_GOT_SURROGATE;
1697
f9380377
KW
1698 if (flags & UTF8_WARN_SURROGATE) {
1699 *errors |= UTF8_GOT_SURROGATE;
1700
1701 if ( ! (flags & UTF8_CHECK_ONLY)
1702 && ckWARN_d(WARN_SURROGATE))
1703 {
2b5e7bc2
KW
1704 pack_warn = packWARN(WARN_SURROGATE);
1705
1706 /* These are the only errors that can occur with a
1707 * surrogate when the 'uv' isn't valid */
1708 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1709 message = Perl_form(aTHX_
1710 "UTF-16 surrogate (any UTF-8 sequence that"
1711 " starts with \"%s\" is for a surrogate)",
7e2f38b2 1712 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
1713 }
1714 else {
c94c2f39 1715 message = Perl_form(aTHX_ surrogate_cp_format, uv);
2b5e7bc2 1716 }
f9380377 1717 }
2b5e7bc2 1718 }
ba210ebe 1719
2b5e7bc2
KW
1720 if (flags & UTF8_DISALLOW_SURROGATE) {
1721 disallowed = TRUE;
f9380377 1722 *errors |= UTF8_GOT_SURROGATE;
2b5e7bc2
KW
1723 }
1724 }
1725 else if (possible_problems & UTF8_GOT_SUPER) {
1726 possible_problems &= ~UTF8_GOT_SUPER;
949cf498 1727
f9380377
KW
1728 if (flags & UTF8_WARN_SUPER) {
1729 *errors |= UTF8_GOT_SUPER;
1730
1731 if ( ! (flags & UTF8_CHECK_ONLY)
1732 && ckWARN_d(WARN_NON_UNICODE))
1733 {
2b5e7bc2
KW
1734 pack_warn = packWARN(WARN_NON_UNICODE);
1735
1736 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1737 message = Perl_form(aTHX_
1738 "Any UTF-8 sequence that starts with"
1739 " \"%s\" is for a non-Unicode code point,"
1740 " may not be portable",
7e2f38b2 1741 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
1742 }
1743 else {
c94c2f39 1744 message = Perl_form(aTHX_ super_cp_format, uv);
2b5e7bc2 1745 }
f9380377 1746 }
2b5e7bc2 1747 }
ba210ebe 1748
57ff5f59
KW
1749 /* Test for Perl's extended UTF-8 after the regular SUPER ones,
1750 * and before possibly bailing out, so that the more dire
1751 * warning will override the regular one. */
1752 if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
2b5e7bc2 1753 if ( ! (flags & UTF8_CHECK_ONLY)
d044b7a7 1754 && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
db0f09e6 1755 && ckWARN_d(WARN_NON_UNICODE))
2b5e7bc2 1756 {
db0f09e6 1757 pack_warn = packWARN(WARN_NON_UNICODE);
2b5e7bc2 1758
57ff5f59
KW
1759 /* If it is an overlong that evaluates to a code point
1760 * that doesn't have to use the Perl extended UTF-8, it
1761 * still used it, and so we output a message that
1762 * doesn't refer to the code point. The same is true
1763 * if there was a SHORT malformation where the code
1764 * point is not valid. In that case, 'uv' will have
1765 * been set to the REPLACEMENT CHAR, and the message
1766 * below without the code point in it will be selected
1767 * */
1768 if (UNICODE_IS_PERL_EXTENDED(uv)) {
2b5e7bc2 1769 message = Perl_form(aTHX_
57ff5f59 1770 perl_extended_cp_format, uv);
2b5e7bc2
KW
1771 }
1772 else {
1773 message = Perl_form(aTHX_
57ff5f59
KW
1774 "Any UTF-8 sequence that starts with"
1775 " \"%s\" is a Perl extension, and"
1776 " so is not portable",
1777 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
1778 }
1779 }
1780
d044b7a7
KW
1781 if (flags & ( UTF8_WARN_PERL_EXTENDED
1782 |UTF8_DISALLOW_PERL_EXTENDED))
ddb65933 1783 {
d044b7a7 1784 *errors |= UTF8_GOT_PERL_EXTENDED;
f9380377 1785
d044b7a7 1786 if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
f9380377
KW
1787 disallowed = TRUE;
1788 }
2b5e7bc2
KW
1789 }
1790 }
eb83ed87 1791
2b5e7bc2 1792 if (flags & UTF8_DISALLOW_SUPER) {
f9380377 1793 *errors |= UTF8_GOT_SUPER;
2b5e7bc2
KW
1794 disallowed = TRUE;
1795 }
eb83ed87 1796
2b5e7bc2
KW
1797 /* The deprecated warning overrides any non-deprecated one. If
1798 * there are other problems, a deprecation message is not
1799 * really helpful, so don't bother to raise it in that case.
1800 * This also keeps the code from having to handle the case
1801 * where 'uv' is not valid. */
1802 if ( ! (orig_problems
1803 & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
76513bdc
KW
1804 && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1805 && ckWARN_d(WARN_DEPRECATED))
1806 {
1807 message = Perl_form(aTHX_ cp_above_legal_max,
1808 uv, MAX_NON_DEPRECATED_CP);
1809 pack_warn = packWARN(WARN_DEPRECATED);
2b5e7bc2
KW
1810 }
1811 }
1812 else if (possible_problems & UTF8_GOT_NONCHAR) {
1813 possible_problems &= ~UTF8_GOT_NONCHAR;
ba210ebe 1814
f9380377
KW
1815 if (flags & UTF8_WARN_NONCHAR) {
1816 *errors |= UTF8_GOT_NONCHAR;
1817
1818 if ( ! (flags & UTF8_CHECK_ONLY)
1819 && ckWARN_d(WARN_NONCHAR))
1820 {
2b5e7bc2
KW
1821 /* The code above should have guaranteed that we don't
1822 * get here with errors other than overlong */
1823 assert (! (orig_problems
1824 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
1825
1826 pack_warn = packWARN(WARN_NONCHAR);
c94c2f39 1827 message = Perl_form(aTHX_ nonchar_cp_format, uv);
f9380377 1828 }
2b5e7bc2 1829 }
5b311467 1830
2b5e7bc2
KW
1831 if (flags & UTF8_DISALLOW_NONCHAR) {
1832 disallowed = TRUE;
f9380377 1833 *errors |= UTF8_GOT_NONCHAR;
2b5e7bc2 1834 }
6c64cd9d
KW
1835 }
1836 else if (possible_problems & UTF8_GOT_LONG) {
1837 possible_problems &= ~UTF8_GOT_LONG;
1838 *errors |= UTF8_GOT_LONG;
1839
1840 if (flags & UTF8_ALLOW_LONG) {
1841
1842 /* We don't allow the actual overlong value, unless the
1843 * special extra bit is also set */
1844 if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
1845 & ~UTF8_ALLOW_LONG)))
1846 {
1847 uv = UNICODE_REPLACEMENT;
1848 }
1849 }
1850 else {
1851 disallowed = TRUE;
1852
1853 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1854 pack_warn = packWARN(WARN_UTF8);
1855
1856 /* These error types cause 'uv' to be something that
1857 * isn't what was intended, so can't use it in the
1858 * message. The other error types either can't
1859 * generate an overlong, or else the 'uv' is valid */
1860 if (orig_problems &
1861 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1862 {
1863 message = Perl_form(aTHX_
1864 "%s: %s (any UTF-8 sequence that starts"
1865 " with \"%s\" is overlong which can and"
1866 " should be represented with a"
1867 " different, shorter sequence)",
1868 malformed_text,
1869 _byte_dump_string(s0, send - s0, 0),
1870 _byte_dump_string(s0, curlen, 0));
1871 }
1872 else {
1873 U8 tmpbuf[UTF8_MAXBYTES+1];
1874 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
1875 uv, 0);
1876 const char * preface = (uv <= PERL_UNICODE_MAX)
1877 ? "U+"
1878 : "0x";
1879 message = Perl_form(aTHX_
1880 "%s: %s (overlong; instead use %s to represent"
1881 " %s%0*" UVXf ")",
1882 malformed_text,
1883 _byte_dump_string(s0, send - s0, 0),
1884 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
1885 preface,
1886 ((uv < 256) ? 2 : 4), /* Field width of 2 for
1887 small code points */
1888 uv);
1889 }
1890 }
1891 }
2b5e7bc2
KW
1892 } /* End of looking through the possible flags */
1893
1894 /* Display the message (if any) for the problem being handled in
1895 * this iteration of the loop */
1896 if (message) {
1897 if (PL_op)
1898 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
1899 OP_DESC(PL_op));
1900 else
1901 Perl_warner(aTHX_ pack_warn, "%s", message);
1902 }
ddb65933 1903 } /* End of 'while (possible_problems)' */
a0dbb045 1904
2b5e7bc2
KW
1905 /* Since there was a possible problem, the returned length may need to
1906 * be changed from the one stored at the beginning of this function.
1907 * Instead of trying to figure out if that's needed, just do it. */
1908 if (retlen) {
1909 *retlen = curlen;
1910 }
a0dbb045 1911
2b5e7bc2
KW
1912 if (disallowed) {
1913 if (flags & UTF8_CHECK_ONLY && retlen) {
1914 *retlen = ((STRLEN) -1);
1915 }
1916 return 0;
1917 }
eb83ed87 1918 }
ba210ebe 1919
2b5e7bc2 1920 return UNI_TO_NATIVE(uv);
a0ed51b3
LW
1921}
1922
8e84507e 1923/*
ec5f19d0
KW
1924=for apidoc utf8_to_uvchr_buf
1925
1926Returns the native code point of the first character in the string C<s> which
1927is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 1928C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 1929
524080c4
KW
1930If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1931enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 1932C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 1933(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 1934C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 1935the next possible position in C<s> that could begin a non-malformed character.
de69f3af 1936See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 1937returned.
ec5f19d0 1938
760c7c2f
KW
1939Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1940unless those are turned off.
1941
ec5f19d0 1942=cut
52be2536
KW
1943
1944Also implemented as a macro in utf8.h
1945
ec5f19d0
KW
1946*/
1947
1948
1949UV
1950Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1951{
7f974d7e
KW
1952 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
1953
ec5f19d0
KW
1954 assert(s < send);
1955
1956 return utf8n_to_uvchr(s, send - s, retlen,
ddb65933 1957 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
ec5f19d0
KW
1958}
1959
52be2536
KW
1960/* This is marked as deprecated
1961 *
ec5f19d0
KW
1962=for apidoc utf8_to_uvuni_buf
1963
de69f3af
KW
1964Only in very rare circumstances should code need to be dealing in Unicode
1965(as opposed to native) code points. In those few cases, use
1966C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
4f83cdcd
KW
1967
1968Returns the Unicode (not-native) code point of the first character in the
1969string C<s> which
ec5f19d0
KW
1970is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1971C<retlen> will be set to the length, in bytes, of that character.
1972
524080c4
KW
1973If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1974enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1975NULL) to -1. If those warnings are off, the computed value if well-defined (or
1976the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1977is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1978next possible position in C<s> that could begin a non-malformed character.
de69f3af 1979See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0 1980
760c7c2f
KW
1981Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1982unless those are turned off.
1983
ec5f19d0
KW
1984=cut
1985*/
1986
1987UV
1988Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1989{
1990 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1991
1992 assert(send > s);
1993
5962d97e
KW
1994 /* Call the low level routine, asking for checks */
1995 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
ec5f19d0
KW
1996}
1997
b76347f2 1998/*
87cea99e 1999=for apidoc utf8_length
b76347f2
JH
2000
2001Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
2002Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
2003up past C<e>, croaks.
b76347f2
JH
2004
2005=cut
2006*/
2007
2008STRLEN
35a4481c 2009Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
2010{
2011 STRLEN len = 0;
2012
7918f24d
NC
2013 PERL_ARGS_ASSERT_UTF8_LENGTH;
2014
8850bf83
JH
2015 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
2016 * the bitops (especially ~) can create illegal UTF-8.
2017 * In other words: in Perl UTF-8 is not just for Unicode. */
2018
a3b680e6
AL
2019 if (e < s)
2020 goto warn_and_return;
b76347f2 2021 while (s < e) {
4cbf4130 2022 s += UTF8SKIP(s);
8e91ec7f
AV
2023 len++;
2024 }
2025
2026 if (e != s) {
2027 len--;
2028 warn_and_return:
9b387841
NC
2029 if (PL_op)
2030 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2031 "%s in %s", unees, OP_DESC(PL_op));
2032 else
61a12c31 2033 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
2034 }
2035
2036 return len;
2037}
2038
b06226ff 2039/*
fed3ba5d
NC
2040=for apidoc bytes_cmp_utf8
2041
a1433954 2042Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
2043sequence of characters (stored as UTF-8)
2044in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
2045equal, -1 or -2 if the first string is less than the second string, +1 or +2
2046if the first string is greater than the second string.
2047
2048-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
2049longer string. -2 or +2 is returned if
2050there was a difference between characters
fed3ba5d
NC
2051within the strings.
2052
2053=cut
2054*/
2055
2056int
2057Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
2058{
2059 const U8 *const bend = b + blen;
2060 const U8 *const uend = u + ulen;
2061
2062 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
2063
2064 while (b < bend && u < uend) {
2065 U8 c = *u++;
2066 if (!UTF8_IS_INVARIANT(c)) {
2067 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2068 if (u < uend) {
2069 U8 c1 = *u++;
2070 if (UTF8_IS_CONTINUATION(c1)) {
a62b247b 2071 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
fed3ba5d 2072 } else {
2b5e7bc2 2073 /* diag_listed_as: Malformed UTF-8 character%s */
fed3ba5d 2074 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
56576a04
KW
2075 "%s %s%s",
2076 unexpected_non_continuation_text(u - 2, 2, 1, 2),
2077 PL_op ? " in " : "",
2078 PL_op ? OP_DESC(PL_op) : "");
fed3ba5d
NC
2079 return -2;
2080 }
2081 } else {
2082 if (PL_op)
2083 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2084 "%s in %s", unees, OP_DESC(PL_op));
2085 else
61a12c31 2086 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
2087 return -2; /* Really want to return undef :-) */
2088 }
2089 } else {
2090 return -2;
2091 }
2092 }
2093 if (*b != c) {
2094 return *b < c ? -2 : +2;
2095 }
2096 ++b;
2097 }
2098
2099 if (b == bend && u == uend)
2100 return 0;
2101
2102 return b < bend ? +1 : -1;
2103}
2104
2105/*
87cea99e 2106=for apidoc utf8_to_bytes
6940069f 2107
3bc0c78c 2108Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
a1433954 2109Unlike L</bytes_to_utf8>, this over-writes the original string, and
09af0336 2110updates C<*lenp> to contain the new length.
3bc0c78c
KW
2111Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
2112
2113Upon successful return, the number of variants in the string can be computed by
23b37b12
KW
2114having saved the value of C<*lenp> before the call, and subtracting the
2115after-call value of C<*lenp> from it.
6940069f 2116
a1433954 2117If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 2118
6940069f
GS
2119=cut
2120*/
2121
2122U8 *
09af0336 2123Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
6940069f 2124{
9fe0d3c2 2125 U8 * first_variant;
246fae53 2126
7918f24d 2127 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 2128 PERL_UNUSED_CONTEXT;
7918f24d 2129
9fe0d3c2 2130 /* This is a no-op if no variants at all in the input */
09af0336 2131 if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
9fe0d3c2
KW
2132 return s;
2133 }
2134
2135 {
3c5aa262 2136 U8 * const save = s;
09af0336 2137 U8 * const send = s + *lenp;
3c5aa262
KW
2138 U8 * d;
2139
2140 /* Nothing before the first variant needs to be changed, so start the real
2141 * work there */
2142 s = first_variant;
2143 while (s < send) {
2144 if (! UTF8_IS_INVARIANT(*s)) {
2145 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
09af0336 2146 *lenp = ((STRLEN) -1);
3c5aa262
KW
2147 return 0;
2148 }
2149 s++;
d59937ca
KW
2150 }
2151 s++;
dcad2880 2152 }
dcad2880 2153
3c5aa262
KW
2154 /* Is downgradable, so do it */
2155 d = s = first_variant;
2156 while (s < send) {
2157 U8 c = *s++;
2158 if (! UVCHR_IS_INVARIANT(c)) {
2159 /* Then it is two-byte encoded */
2160 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2161 s++;
2162 }
2163 *d++ = c;
2164 }
2165 *d = '\0';
09af0336 2166 *lenp = d - save;
3c5aa262
KW
2167
2168 return save;
9fe0d3c2 2169 }
6940069f
GS
2170}
2171
2172/*
87cea99e 2173=for apidoc bytes_from_utf8
f9a63242 2174
09af0336 2175Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
41ae6089 2176byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
4f3d592d
KW
2177actually encoded in UTF-8.
2178
2179Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2180the input string.
2181
41ae6089
KW
2182Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2183not expressible in native byte encoding. In these cases, C<*is_utf8p> and
09af0336 2184C<*lenp> are unchanged, and the return value is the original C<s>.
4f3d592d 2185
41ae6089 2186Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
4f3d592d 2187newly created string containing a downgraded copy of C<s>, and whose length is
23b37b12 2188returned in C<*lenp>, updated. The new string is C<NUL>-terminated.
f9a63242 2189
3bc0c78c 2190Upon successful return, the number of variants in the string can be computed by
23b37b12
KW
2191having saved the value of C<*lenp> before the call, and subtracting the
2192after-call value of C<*lenp> from it.
3bc0c78c 2193
37607a96 2194=cut
976c1b08
KW
2195
2196There is a macro that avoids this function call, but this is retained for
2197anyone who calls it with the Perl_ prefix */
f9a63242
JH
2198
2199U8 *
41ae6089 2200Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
f9a63242 2201{
7918f24d 2202 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 2203 PERL_UNUSED_CONTEXT;
f9a63242 2204
976c1b08
KW
2205 return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2206}
2207
2208/*
2209No = here because currently externally undocumented
2210for apidoc bytes_from_utf8_loc
2211
2212Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where
2213to store the location of the first character in C<"s"> that cannot be
2214converted to non-UTF8.
2215
2216If that parameter is C<NULL>, this function behaves identically to
2217C<bytes_from_utf8>.
2218
2219Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2220C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2221
2222Otherwise, the function returns a newly created C<NUL>-terminated string
2223containing the non-UTF8 equivalent of the convertible first portion of
2224C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>.
2225If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2226and C<*first_non_downgradable> is set to C<NULL>.
2227
2228Otherwise, C<*first_non_downgradable> set to point to the first byte of the
2229first character in the original string that wasn't converted. C<*is_utf8p> is
2230unchanged. Note that the new string may have length 0.
2231
2232Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2233C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2234converts as many characters in it as possible stopping at the first one it
385b74be 2235finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is
976c1b08
KW
2236set to point to that. The function returns the portion that could be converted
2237in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2238not including the terminating C<NUL>. If the very first character in the
2239original could not be converted, C<*lenp> will be 0, and the new string will
2240contain just a single C<NUL>. If the entire input string was converted,
2241C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2242
2243Upon successful return, the number of variants in the converted portion of the
2244string can be computed by having saved the value of C<*lenp> before the call,
2245and subtracting the after-call value of C<*lenp> from it.
2246
2247=cut
2248
2249
2250*/
2251
2252U8 *
2253Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2254{
2255 U8 *d;
2256 const U8 *original = s;
2257 U8 *converted_start;
2258 const U8 *send = s + *lenp;
f9a63242 2259
976c1b08 2260 PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
170a1c22 2261
976c1b08
KW
2262 if (! *is_utf8p) {
2263 if (first_unconverted) {
2264 *first_unconverted = NULL;
2265 }
2266
2267 return (U8 *) original;
2268 }
2269
2270 Newx(d, (*lenp) + 1, U8);
2271
2272 converted_start = d;
7299a045
KW
2273 while (s < send) {
2274 U8 c = *s++;
2275 if (! UTF8_IS_INVARIANT(c)) {
976c1b08
KW
2276
2277 /* Then it is multi-byte encoded. If the code point is above 0xFF,
2278 * have to stop now */
2279 if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2280 if (first_unconverted) {
2281 *first_unconverted = s - 1;
2282 goto finish_and_return;
2283 }
2284 else {
2285 Safefree(converted_start);
2286 return (U8 *) original;
2287 }
2288 }
2289
7299a045
KW
2290 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2291 s++;
38af28cf 2292 }
7299a045
KW
2293 *d++ = c;
2294 }
170a1c22 2295
976c1b08
KW
2296 /* Here, converted the whole of the input */
2297 *is_utf8p = FALSE;
2298 if (first_unconverted) {
2299 *first_unconverted = NULL;
170a1c22 2300 }
976c1b08
KW
2301
2302 finish_and_return:
2303 *d = '\0';
2304 *lenp = d - converted_start;
2305
2306 /* Trim unused space */
2307 Renew(converted_start, *lenp + 1, U8);
2308
2309 return converted_start;
f9a63242
JH
2310}
2311
2312/*
87cea99e 2313=for apidoc bytes_to_utf8
6940069f 2314
09af0336 2315Converts a string C<s> of length C<*lenp> bytes from the native encoding into
ff97e5cf 2316UTF-8.
09af0336 2317Returns a pointer to the newly-created string, and sets C<*lenp> to
ff97e5cf 2318reflect the new length in bytes.
6940069f 2319
3bc0c78c 2320Upon successful return, the number of variants in the string can be computed by
23b37b12 2321having saved the value of C<*lenp> before the call, and subtracting it from the
3bc0c78c
KW
2322after-call value of C<*lenp>.
2323
75200dff 2324A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
2325
2326If you want to convert to UTF-8 from encodings other than
2327the native (Latin1 or EBCDIC),
a1433954 2328see L</sv_recode_to_utf8>().
c9ada85f 2329
497711e7 2330=cut
6940069f
GS
2331*/
2332
2333U8*
09af0336 2334Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
6940069f 2335{
09af0336 2336 const U8 * const send = s + (*lenp);
6940069f
GS
2337 U8 *d;
2338 U8 *dst;
7918f24d
NC
2339
2340 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 2341 PERL_UNUSED_CONTEXT;
6940069f 2342
09af0336 2343 Newx(d, (*lenp) * 2 + 1, U8);
6940069f
GS
2344 dst = d;
2345
2346 while (s < send) {
55d09dc8
KW
2347 append_utf8_from_native_byte(*s, &d);
2348 s++;
6940069f
GS
2349 }
2350 *d = '\0';
09af0336 2351 *lenp = d-dst;
6940069f
GS
2352 return dst;
2353}
2354
a0ed51b3 2355/*
dea0fc0b 2356 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
2357 *
2358 * Destination must be pre-extended to 3/2 source. Do not use in-place.
2359 * We optimize for native, for obvious reasons. */
2360
2361U8*
dea0fc0b 2362Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 2363{
dea0fc0b
JH
2364 U8* pend;
2365 U8* dstart = d;
2366
7918f24d
NC
2367 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2368
dea0fc0b 2369 if (bytelen & 1)
56576a04
KW
2370 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf,
2371 (UV)bytelen);
dea0fc0b
JH
2372
2373 pend = p + bytelen;
2374
a0ed51b3 2375 while (p < pend) {
dea0fc0b
JH
2376 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
2377 p += 2;
2d1545e5 2378 if (OFFUNI_IS_INVARIANT(uv)) {
56d37426 2379 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
2380 continue;
2381 }
56d37426
KW
2382 if (uv <= MAX_UTF8_TWO_BYTE) {
2383 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
2384 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
2385 continue;
2386 }
46956fad
KW
2387#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2388#define LAST_HIGH_SURROGATE 0xDBFF
2389#define FIRST_LOW_SURROGATE 0xDC00
2390#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
e23c50db
KW
2391
2392 /* This assumes that most uses will be in the first Unicode plane, not
2393 * needing surrogates */
2394 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
2395 && uv <= UNICODE_SURROGATE_LAST))
2396 {
2397 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2398 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2399 }
2400 else {
01ea242b 2401 UV low = (p[0] << 8) + p[1];
e23c50db
KW
2402 if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
2403 || UNLIKELY(low > LAST_LOW_SURROGATE))
2404 {
01ea242b 2405 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db
KW
2406 }
2407 p += 2;
46956fad
KW
2408 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
2409 + (low - FIRST_LOW_SURROGATE) + 0x10000;
01ea242b 2410 }
a0ed51b3 2411 }
56d37426
KW
2412#ifdef EBCDIC
2413 d = uvoffuni_to_utf8_flags(d, uv, 0);
2414#else
a0ed51b3 2415 if (uv < 0x10000) {
eb160463
GS
2416 *d++ = (U8)(( uv >> 12) | 0xe0);
2417 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2418 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2419 continue;
2420 }
2421 else {
eb160463
GS
2422 *d++ = (U8)(( uv >> 18) | 0xf0);
2423 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
2424 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2425 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2426 continue;
2427 }
56d37426 2428#endif
a0ed51b3 2429 }
dea0fc0b 2430 *newlen = d - dstart;
a0ed51b3
LW
2431 return d;
2432}
2433
2434/* Note: this one is slightly destructive of the source. */
2435
2436U8*
dea0fc0b 2437Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
2438{
2439 U8* s = (U8*)p;
d4c19fe8 2440 U8* const send = s + bytelen;
7918f24d
NC
2441
2442 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2443
e0ea5e2d 2444 if (bytelen & 1)
147e3846 2445 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
e0ea5e2d
NC
2446 (UV)bytelen);
2447
a0ed51b3 2448 while (s < send) {
d4c19fe8 2449 const U8 tmp = s[0];
a0ed51b3
LW
2450 s[0] = s[1];
2451 s[1] = tmp;
2452 s += 2;
2453 }
dea0fc0b 2454 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
2455}
2456
922e8cb4
KW
2457bool
2458Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2459{
2460 U8 tmpbuf[UTF8_MAXBYTES+1];
2461 uvchr_to_utf8(tmpbuf, c);
da8c1a98 2462 return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf));
922e8cb4
KW
2463}
2464
f9ae8fb6
JD
2465/* Internal function so we can deprecate the external one, and call
2466 this one from other deprecated functions in this file */
2467
f2645549
KW
2468bool
2469Perl__is_utf8_idstart(pTHX_ const U8 *p)
61b19385 2470{
f2645549 2471 PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
61b19385
KW
2472
2473 if (*p == '_')
2474 return TRUE;
f25ce844 2475 return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
61b19385
KW
2476}
2477
5092f92a 2478bool
eba68aa0
KW
2479Perl__is_uni_perl_idcont(pTHX_ UV c)
2480{
2481 U8 tmpbuf[UTF8_MAXBYTES+1];
2482 uvchr_to_utf8(tmpbuf, c);
da8c1a98 2483 return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
eba68aa0
KW
2484}
2485
2486bool
f91dcd13
KW
2487Perl__is_uni_perl_idstart(pTHX_ UV c)
2488{
2489 U8 tmpbuf[UTF8_MAXBYTES+1];
2490 uvchr_to_utf8(tmpbuf, c);
da8c1a98 2491 return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
f91dcd13
KW
2492}
2493
3a4c58c9 2494UV
56576a04
KW
2495Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2496 const char S_or_s)
3a4c58c9
KW
2497{
2498 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2499 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
2500 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2501 * either "SS" or "Ss". Which one to use is passed into the routine in
2502 * 'S_or_s' to avoid a test */
2503
2504 UV converted = toUPPER_LATIN1_MOD(c);
2505
2506 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2507
2508 assert(S_or_s == 'S' || S_or_s == 's');
2509
6f2d5cbc 2510 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 2511 characters in this range */
3a4c58c9
KW
2512 *p = (U8) converted;
2513 *lenp = 1;
2514 return converted;
2515 }
2516
2517 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2518 * which it maps to one of them, so as to only have to have one check for
2519 * it in the main case */
2520 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2521 switch (c) {
2522 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2523 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2524 break;
2525 case MICRO_SIGN:
2526 converted = GREEK_CAPITAL_LETTER_MU;
2527 break;
79e064b9
KW
2528#if UNICODE_MAJOR_VERSION > 2 \
2529 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2530 && UNICODE_DOT_DOT_VERSION >= 8)
3a4c58c9
KW
2531 case LATIN_SMALL_LETTER_SHARP_S:
2532 *(p)++ = 'S';
2533 *p = S_or_s;
2534 *lenp = 2;
2535 return 'S';
79e064b9 2536#endif
3a4c58c9 2537 default:
56576a04
KW
2538 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
2539 " '%c' to map to '%c'",
2540 c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
e5964223 2541 NOT_REACHED; /* NOTREACHED */
3a4c58c9
KW
2542 }
2543 }
2544
2545 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2546 *p = UTF8_TWO_BYTE_LO(converted);
2547 *lenp = 2;
2548
2549 return converted;
2550}
2551
50bda2c3
KW
2552/* Call the function to convert a UTF-8 encoded character to the specified case.
2553 * Note that there may be more than one character in the result.
2554 * INP is a pointer to the first byte of the input character
2555 * OUTP will be set to the first byte of the string of changed characters. It
2556 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2557 * LENP will be set to the length in bytes of the string of changed characters
2558 *
56576a04
KW
2559 * The functions return the ordinal of the first character in the string of
2560 * OUTP */
2561#define CALL_UPPER_CASE(uv, s, d, lenp) \
2562 _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
2563#define CALL_TITLE_CASE(uv, s, d, lenp) \
2564 _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
2565#define CALL_LOWER_CASE(uv, s, d, lenp) \
2566 _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
50bda2c3 2567
b9992569
KW
2568/* This additionally has the input parameter 'specials', which if non-zero will
2569 * cause this to use the specials hash for folding (meaning get full case
50bda2c3 2570 * folding); otherwise, when zero, this implies a simple case fold */
56576a04
KW
2571#define CALL_FOLD_CASE(uv, s, d, lenp, specials) \
2572_to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
c3fd2246 2573
84afefe6
JH
2574UV
2575Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2576{
a1433954
KW
2577 /* Convert the Unicode character whose ordinal is <c> to its uppercase
2578 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2579 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
2580 * the changed version may be longer than the original character.
2581 *
2582 * The ordinal of the first character of the changed version is returned
2583 * (but note, as explained above, that there may be more.) */
2584
7918f24d
NC
2585 PERL_ARGS_ASSERT_TO_UNI_UPPER;
2586
3a4c58c9
KW
2587 if (c < 256) {
2588 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2589 }
2590
0ebc6274 2591 uvchr_to_utf8(p, c);
b9992569 2592 return CALL_UPPER_CASE(c, p, p, lenp);
a0ed51b3
LW
2593}
2594
84afefe6
JH
2595UV
2596Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2597{
7918f24d
NC
2598 PERL_ARGS_ASSERT_TO_UNI_TITLE;
2599
3a4c58c9
KW
2600 if (c < 256) {
2601 return _to_upper_title_latin1((U8) c, p, lenp, 's');
2602 }
2603
0ebc6274 2604 uvchr_to_utf8(p, c);
b9992569 2605 return CALL_TITLE_CASE(c, p, p, lenp);
a0ed51b3
LW
2606}
2607
afc16117 2608STATIC U8
eaf412bf 2609S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
afc16117
KW
2610{
2611 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2612 * those, converting the result to UTF-8. Since the result is always just
a1433954 2613 * one character, we allow <p> to be NULL */
afc16117
KW
2614
2615 U8 converted = toLOWER_LATIN1(c);
2616
eaf412bf
KW
2617 PERL_UNUSED_ARG(dummy);
2618
afc16117 2619 if (p != NULL) {
6f2d5cbc 2620 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
2621 *p = converted;
2622 *lenp = 1;
2623 }
2624 else {
430c9760
KW
2625 /* Result is known to always be < 256, so can use the EIGHT_BIT
2626 * macros */
2627 *p = UTF8_EIGHT_BIT_HI(converted);
2628 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
afc16117
KW
2629 *lenp = 2;
2630 }
2631 }
2632 return converted;
2633}
2634
84afefe6
JH
2635UV
2636Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2637{
7918f24d
NC
2638 PERL_ARGS_ASSERT_TO_UNI_LOWER;
2639
afc16117 2640 if (c < 256) {
eaf412bf 2641 return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
bca00c02
KW
2642 }
2643
afc16117 2644 uvchr_to_utf8(p, c);
b9992569 2645 return CALL_LOWER_CASE(c, p, p, lenp);
a0ed51b3
LW
2646}
2647
84afefe6 2648UV
56576a04
KW
2649Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2650 const unsigned int flags)
a1dde8de 2651{
51910141 2652 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 2653 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 2654 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
2655 *
2656 * Not to be used for locale folds
51910141 2657 */
f673fad4 2658
a1dde8de
KW
2659 UV converted;
2660
2661 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
81611534 2662 PERL_UNUSED_CONTEXT;
a1dde8de 2663
1ca267a5
KW
2664 assert (! (flags & FOLD_FLAGS_LOCALE));
2665
659a7c2d 2666 if (UNLIKELY(c == MICRO_SIGN)) {
a1dde8de
KW
2667 converted = GREEK_SMALL_LETTER_MU;
2668 }
9b63e895
KW
2669#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
2670 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
2671 || UNICODE_DOT_DOT_VERSION > 0)
659a7c2d
KW
2672 else if ( (flags & FOLD_FLAGS_FULL)
2673 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
2674 {
1ca267a5
KW
2675 /* If can't cross 127/128 boundary, can't return "ss"; instead return
2676 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
2677 * under those circumstances. */
2678 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
2679 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2680 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2681 p, *lenp, U8);
2682 return LATIN_SMALL_LETTER_LONG_S;
2683 }
2684 else {
4f489194
KW
2685 *(p)++ = 's';
2686 *p = 's';
2687 *lenp = 2;
2688 return 's';
1ca267a5 2689 }
a1dde8de 2690 }
9b63e895 2691#endif
a1dde8de
KW
2692 else { /* In this range the fold of all other characters is their lower
2693 case */
2694 converted = toLOWER_LATIN1(c);
2695 }
2696
6f2d5cbc 2697 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
2698 *p = (U8) converted;
2699 *lenp = 1;
2700 }
2701 else {
2702 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2703 *p = UTF8_TWO_BYTE_LO(converted);
2704 *lenp = 2;
2705 }
2706
2707 return converted;
2708}
2709
2710UV
31f05a37 2711Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 2712{
4b593389 2713
a0270393
KW
2714 /* Not currently externally documented, and subject to change
2715 * <flags> bits meanings:
2716 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
2717 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2718 * locale are to be used.
a0270393
KW
2719 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2720 */
4b593389 2721
36bb2ab6 2722 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 2723
780fcc9f
KW
2724 if (flags & FOLD_FLAGS_LOCALE) {
2725 /* Treat a UTF-8 locale as not being in locale at all */
2726 if (IN_UTF8_CTYPE_LOCALE) {
2727 flags &= ~FOLD_FLAGS_LOCALE;
2728 }
2729 else {
2730 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
e7b7ac46 2731 goto needs_full_generality;
780fcc9f 2732 }
31f05a37
KW
2733 }
2734
a1dde8de 2735 if (c < 256) {
e7b7ac46 2736 return _to_fold_latin1((U8) c, p, lenp,
31f05a37 2737 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
2738 }
2739
2f306ab9 2740 /* Here, above 255. If no special needs, just use the macro */
a0270393
KW
2741 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
2742 uvchr_to_utf8(p, c);
b9992569 2743 return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
a0270393 2744 }
567b353c 2745 else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
a0270393
KW
2746 the special flags. */
2747 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
2748
2749 needs_full_generality:
a0270393 2750 uvchr_to_utf8(utf8_c, c);
56576a04
KW
2751 return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c),
2752 p, lenp, flags);
a0270393 2753 }
84afefe6
JH
2754}
2755
26483009 2756PERL_STATIC_INLINE bool
5141f98e 2757S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
f25ce844 2758 const char *const swashname, SV* const invlist)
bde6a22d 2759{
ea317ccb
KW
2760 /* returns a boolean giving whether or not the UTF8-encoded character that
2761 * starts at <p> is in the swash indicated by <swashname>. <swash>
2762 * contains a pointer to where the swash indicated by <swashname>
2763 * is to be stored; which this routine will do, so that future calls will
f25ce844
KW
2764 * look at <*swash> and only generate a swash if it is not null. <invlist>
2765 * is NULL or an inversion list that defines the swash. If not null, it
2766 * saves time during initialization of the swash.
ea317ccb
KW
2767 *
2768 * Note that it is assumed that the buffer length of <p> is enough to
2769 * contain all the bytes that comprise the character. Thus, <*p> should
2770 * have been checked before this call for mal-formedness enough to assure
2771 * that. */
2772
7918f24d
NC
2773 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2774
492a624f 2775 /* The API should have included a length for the UTF-8 character in <p>,
28123549 2776 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
2777 * as far as there being enough bytes available in it to accommodate the
2778 * character without reading beyond the end, and pass that number on to the
2779 * validating routine */
6302f837 2780 if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
86ae6e94 2781 _force_out_malformed_utf8_message(p, p + UTF8SKIP(p),
99a765e9 2782 _UTF8_NO_CONFIDENCE_IN_CURLEN,
86ae6e94
KW
2783 1 /* Die */ );
2784 NOT_REACHED; /* NOTREACHED */
28123549 2785 }
86ae6e94 2786
87367d5f
KW
2787 if (!*swash) {
2788 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
f25ce844
KW
2789 *swash = _core_swash_init("utf8",
2790
2791 /* Only use the name if there is no inversion
2792 * list; otherwise will go out to disk */
2793 (invlist) ? "" : swashname,
2794
2795 &PL_sv_undef, 1, 0, invlist, &flags);
87367d5f 2796 }
28123549 2797
bde6a22d
NC
2798 return swash_fetch(*swash, p, TRUE) != 0;
2799}
2800
da8c1a98 2801PERL_STATIC_INLINE bool
56576a04
KW
2802S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e,
2803 SV **swash, const char *const swashname,
2804 SV* const invlist)
da8c1a98
KW
2805{
2806 /* returns a boolean giving whether or not the UTF8-encoded character that
2807 * starts at <p>, and extending no further than <e - 1> is in the swash
2808 * indicated by <swashname>. <swash> contains a pointer to where the swash
2809 * indicated by <swashname> is to be stored; which this routine will do, so
2810 * that future calls will look at <*swash> and only generate a swash if it
2811 * is not null. <invlist> is NULL or an inversion list that defines the
2812 * swash. If not null, it saves time during initialization of the swash.
2813 */
2814
2815 PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
2816
2817 if (! isUTF8_CHAR(p, e)) {
2818 _force_out_malformed_utf8_message(p, e, 0, 1);
2819 NOT_REACHED; /* NOTREACHED */
2820 }
2821
2822 if (!*swash) {
2823 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2824 *swash = _core_swash_init("utf8",
2825
2826 /* Only use the name if there is no inversion
2827 * list; otherwise will go out to disk */
2828 (invlist) ? "" : swashname,
2829
2830 &PL_sv_undef, 1, 0, invlist, &flags);
2831 }
2832
2833 return swash_fetch(*swash, p, TRUE) != 0;
2834}
2835
34aeb2e9
KW
2836STATIC void
2837S_warn_on_first_deprecated_use(pTHX_ const char * const name,
2838 const char * const alternative,
2839 const bool use_locale,
2840 const char * const file,
2841 const unsigned line)
2842{
2843 const char * key;
2844
2845 PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
2846
2847 if (ckWARN_d(WARN_DEPRECATED)) {
2848
2849 key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
2850 if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
2851 if (! PL_seen_deprecated_macro) {
2852 PL_seen_deprecated_macro = newHV();
2853 }
2854 if (! hv_store(PL_seen_deprecated_macro, key,
2855 strlen(key), &PL_sv_undef, 0))
2856 {
2857 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
2858 }
2859
c44e9413 2860 if (instr(file, "mathoms.c")) {
607313a1
KW
2861 Perl_warner(aTHX_ WARN_DEPRECATED,
2862 "In %s, line %d, starting in Perl v5.30, %s()"
2863 " will be removed. Avoid this message by"
2864 " converting to use %s().\n",
2865 file, line, name, alternative);
2866 }
2867 else {
34aeb2e9
KW
2868 Perl_warner(aTHX_ WARN_DEPRECATED,
2869 "In %s, line %d, starting in Perl v5.30, %s() will"
2870 " require an additional parameter. Avoid this"
2871 " message by converting to use %s().\n",
2872 file, line, name, alternative);
607313a1 2873 }
34aeb2e9
KW
2874 }
2875 }
2876}
2877
bde6a22d 2878bool
34aeb2e9 2879Perl__is_utf8_FOO(pTHX_ U8 classnum,
be99e2c2 2880 const U8 * const p,
34aeb2e9
KW
2881 const char * const name,
2882 const char * const alternative,
2883 const bool use_utf8,
2884 const bool use_locale,
2885 const char * const file,
2886 const unsigned line)
922e8cb4 2887{
922e8cb4
KW
2888 PERL_ARGS_ASSERT__IS_UTF8_FOO;
2889
34aeb2e9
KW
2890 warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
2891
2892 if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
34aeb2e9
KW
2893
2894 switch (classnum) {
2895 case _CC_WORDCHAR:
2896 case _CC_DIGIT:
2897 case _CC_ALPHA:
2898 case _CC_LOWER:
2899 case _CC_UPPER:
2900 case _CC_PUNCT:
2901 case _CC_PRINT:
2902 case _CC_ALPHANUMERIC:
2903 case _CC_GRAPH:
2904 case _CC_CASED:
2905
2906 return is_utf8_common(p,
2907 &PL_utf8_swash_ptrs[classnum],
2908 swash_property_names[classnum],
2909 PL_XPosix_ptrs[classnum]);
2910
2911 case _CC_SPACE:
2912 return is_XPERLSPACE_high(p);
2913 case _CC_BLANK:
2914 return is_HORIZWS_high(p);
2915 case _CC_XDIGIT:
2916 return is_XDIGIT_high(p);
2917 case _CC_CNTRL:
2918 return 0;
2919 case _CC_ASCII:
2920 return 0;
2921 case _CC_VERTSPACE:
2922 return is_VERTWS_high(p);
2923 case _CC_IDFIRST:
2924 if (! PL_utf8_perl_idstart) {
22f0498f
KW
2925 PL_utf8_perl_idstart
2926 = _new_invlist_C_array(_Perl_IDStart_invlist);
34aeb2e9 2927 }
22f0498f
KW
2928 return is_utf8_common(p, &PL_utf8_perl_idstart,
2929 "_Perl_IDStart", NULL);
34aeb2e9
KW
2930 case _CC_IDCONT:
2931 if (! PL_utf8_perl_idcont) {
22f0498f
KW
2932 PL_utf8_perl_idcont
2933 = _new_invlist_C_array(_Perl_IDCont_invlist);
34aeb2e9 2934 }
22f0498f
KW
2935 return is_utf8_common(p, &PL_utf8_perl_idcont,
2936 "_Perl_IDCont", NULL);
34aeb2e9
KW
2937 }
2938 }
2939
2940 /* idcont is the same as wordchar below 256 */
2941 if (classnum == _CC_IDCONT) {
2942 classnum = _CC_WORDCHAR;
2943 }
2944 else if (classnum == _CC_IDFIRST) {
2945 if (*p == '_') {
2946 return TRUE;
2947 }
2948 classnum = _CC_ALPHA;
2949 }
2950
2951 if (! use_locale) {
2952 if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
2953 return _generic_isCC(*p, classnum);
2954 }
922e8cb4 2955
34aeb2e9
KW
2956 return _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )), classnum);
2957 }
2958 else {
2959 if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
2960 return isFOO_lc(classnum, *p);
2961 }
2962
2963 return isFOO_lc(classnum, EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )));
2964 }
2965
2966 NOT_REACHED; /* NOTREACHED */
922e8cb4
KW
2967}
2968
2969bool
da8c1a98
KW
2970Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
2971 const U8 * const e)
2972{
2973 PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
2974
2975 assert(classnum < _FIRST_NON_SWASH_CC);
2976
2977 return is_utf8_common_with_len(p,
2978 e,
2979 &PL_utf8_swash_ptrs[classnum],
2980 swash_property_names[classnum],
2981 PL_XPosix_ptrs[classnum]);
2982}
2983
2984bool
da8c1a98
KW
2985Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
2986{
2987 SV* invlist = NULL;
2988
2989 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
2990
2991 if (! PL_utf8_perl_idstart) {
2992 invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
2993 }
2994 return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart,
2995 "_Perl_IDStart", invlist);
2996}
2997
2998bool
f2645549 2999Perl__is_utf8_xidstart(pTHX_ const U8 *p)
c11ff943 3000{
f2645549 3001 PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
c11ff943
KW
3002
3003 if (*p == '_')
3004 return TRUE;
f25ce844 3005 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
c11ff943
KW
3006}
3007
3008bool
da8c1a98
KW
3009Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
3010{
3011 SV* invlist = NULL;
3012
3013 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
3014
3015 if (! PL_utf8_perl_idcont) {
3016 invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
3017 }
3018 return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont,
3019 "_Perl_IDCont", invlist);
3020}
3021
3022bool
f2645549 3023Perl__is_utf8_idcont(pTHX_ const U8 *p)
82686b01 3024{
f2645549 3025 PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
7918f24d 3026
f25ce844 3027 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
a0ed51b3
LW
3028}
3029
3030bool
f2645549 3031Perl__is_utf8_xidcont(pTHX_ const U8 *p)
c11ff943 3032{
f2645549 3033 PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
c11ff943 3034
f25ce844 3035 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
c11ff943
KW
3036}
3037
3038bool
7dbf68d2
KW
3039Perl__is_utf8_mark(pTHX_ const U8 *p)
3040{
7dbf68d2
KW
3041 PERL_ARGS_ASSERT__IS_UTF8_MARK;
3042
f25ce844 3043 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
7dbf68d2
KW
3044}
3045
b9992569 3046 /* change namve uv1 to 'from' */
6a4a25f4 3047STATIC UV
b9992569
KW
3048S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
3049 SV **swashp, const char *normal, const char *special)
3050{
0134edef 3051 STRLEN len = 0;
7918f24d 3052
b9992569 3053 PERL_ARGS_ASSERT__TO_UTF8_CASE;
7918f24d 3054
36eaa811
KW
3055 /* For code points that don't change case, we already know that the output
3056 * of this function is the unchanged input, so we can skip doing look-ups
3057 * for them. Unfortunately the case-changing code points are scattered
3058 * around. But there are some long consecutive ranges where there are no
3059 * case changing code points. By adding tests, we can eliminate the lookup
3060 * for all the ones in such ranges. This is currently done here only for
3061 * just a few cases where the scripts are in common use in modern commerce
3062 * (and scripts adjacent to those which can be included without additional
3063 * tests). */
3064
3065 if (uv1 >= 0x0590) {
3066 /* This keeps from needing further processing the code points most
3067 * likely to be used in the following non-cased scripts: Hebrew,
3068 * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
3069 * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
3070 * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
3071 if (uv1 < 0x10A0) {
3072 goto cases_to_self;
3073 }
3074
3075 /* The following largish code point ranges also don't have case
3076 * changes, but khw didn't think they warranted extra tests to speed
3077 * them up (which would slightly slow down everything else above them):
3078 * 1100..139F Hangul Jamo, Ethiopic
3079 * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
3080 * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
3081 * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
3082 * Combining Diacritical Marks Extended, Balinese,
3083 * Sundanese, Batak, Lepcha, Ol Chiki
3084 * 2000..206F General Punctuation
3085 */
3086
3087 if (uv1 >= 0x2D30) {
3088
3089 /* This keeps the from needing further processing the code points
3090 * most likely to be used in the following non-cased major scripts:
3091 * CJK, Katakana, Hiragana, plus some less-likely scripts.
3092 *
3093 * (0x2D30 above might have to be changed to 2F00 in the unlikely
3094 * event that Unicode eventually allocates the unused block as of
3095 * v8.0 2FE0..2FEF to code points that are cased. khw has verified
3096 * that the test suite will start having failures to alert you
3097 * should that happen) */
3098 if (uv1 < 0xA640) {
3099 goto cases_to_self;
3100 }
3101
3102 if (uv1 >= 0xAC00) {
3103 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
5af9bc97
KW
3104 if (ckWARN_d(WARN_SURROGATE)) {
3105 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3106 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
56576a04
KW
3107 "Operation \"%s\" returns its argument for"
3108 " UTF-16 surrogate U+%04" UVXf, desc, uv1);
5af9bc97
KW
3109 }
3110 goto cases_to_self;
3111 }
36eaa811
KW
3112
3113 /* AC00..FAFF Catches Hangul syllables and private use, plus
3114 * some others */
3115 if (uv1 < 0xFB00) {
3116 goto cases_to_self;
3117
3118 }
3119
5af9bc97 3120 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
76513bdc
KW
3121 if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
3122 && ckWARN_d(WARN_DEPRECATED))
3123 {
3124 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
3125 cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
5af9bc97
KW
3126 }
3127 if (ckWARN_d(WARN_NON_UNICODE)) {
3128 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3129 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
56576a04
KW
3130 "Operation \"%s\" returns its argument for"
3131 " non-Unicode code point 0x%04" UVXf, desc, uv1);
5af9bc97
KW
3132 }
3133 goto cases_to_self;
3134 }
3bfc1e70
KW
3135#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
3136 if (UNLIKELY(uv1
3137 > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
3138 {
3139
56576a04
KW
3140 /* As of Unicode 10.0, this means we avoid swash creation
3141 * for anything beyond high Plane 1 (below emojis) */
3bfc1e70
KW
3142 goto cases_to_self;
3143 }
3144#endif
36eaa811
KW
3145 }
3146 }
9ae3ac1a 3147
36eaa811
KW
3148 /* Note that non-characters are perfectly legal, so no warning should
3149 * be given. There are so few of them, that it isn't worth the extra
3150 * tests to avoid swash creation */
9ae3ac1a
KW
3151 }
3152
0134edef 3153 if (!*swashp) /* load on-demand */
56576a04
KW
3154 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef,
3155 4, 0, NULL, NULL);
0134edef 3156
a6f87d8c 3157 if (special) {
0134edef 3158 /* It might be "special" (sometimes, but not always,
2a37f04d 3159 * a multicharacter mapping) */
4a8240a3 3160 HV *hv = NULL;
b08cf34e
JH
3161 SV **svp;
3162
4a8240a3
KW
3163 /* If passed in the specials name, use that; otherwise use any
3164 * given in the swash */
3165 if (*special != '\0') {
3166 hv = get_hv(special, 0);
3167 }
3168 else {
3169 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
3170 if (svp) {
3171 hv = MUTABLE_HV(SvRV(*svp));
3172 }
3173 }
3174
176fe009 3175 if (hv
5f560d8a 3176 && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
176fe009
KW
3177 && (*svp))
3178 {
cfd0369c 3179 const char *s;
47654450 3180
cfd0369c 3181 s = SvPV_const(*svp, len);
47654450 3182 if (len == 1)
f4cd282c 3183 /* EIGHTBIT */
c80e42f3 3184 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2a37f04d 3185 else {
d2dcd0fb 3186 Copy(s, ustrp, len, U8);
29e98929 3187 }
983ffd37 3188 }
0134edef
JH
3189 }
3190
3191 if (!len && *swashp) {
4a4088c4 3192 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
d4c19fe8 3193
0134edef
JH
3194 if (uv2) {
3195 /* It was "normal" (a single character mapping). */
f4cd282c 3196 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2a37f04d
JH
3197 }
3198 }
1feea2c7 3199
cbe07460
KW
3200 if (len) {
3201 if (lenp) {
3202 *lenp = len;
3203 }
3204 return valid_utf8_to_uvchr(ustrp, 0);
3205 }
3206
3207 /* Here, there was no mapping defined, which means that the code point maps
3208 * to itself. Return the inputs */
e24dfe9c 3209 cases_to_self:
bfdf22ec 3210 len = UTF8SKIP(p);
ca9fab46
KW
3211 if (p != ustrp) { /* Don't copy onto itself */
3212 Copy(p, ustrp, len, U8);
3213 }
0134edef 3214
2a37f04d
JH
3215 if (lenp)
3216 *lenp = len;
3217
f4cd282c 3218 return uv1;
cbe07460 3219
a0ed51b3
LW
3220}
3221
051a06d4 3222STATIC UV
56576a04
KW
3223S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
3224 U8* const ustrp, STRLEN *lenp)
051a06d4 3225{
4a4088c4 3226 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
3227 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
3228 * result contains a character that crosses the 255/256 boundary, disallow
3229 * the change, and return the original code point. See L<perlfunc/lc> for
3230 * why;
051a06d4 3231 *
a1433954
KW
3232 * p points to the original string whose case was changed; assumed
3233 * by this routine to be well-formed
051a06d4 3234 * result the code point of the first character in the changed-case string
56576a04
KW
3235 * ustrp points to the changed-case string (<result> represents its
3236 * first char)
051a06d4
KW
3237 * lenp points to the length of <ustrp> */
3238
3239 UV original; /* To store the first code point of <p> */
3240
3241 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3242
a4f12ed7 3243 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
3244
3245 /* We know immediately if the first character in the string crosses the
3246 * boundary, so can skip */
3247 if (result > 255) {
3248
3249 /* Look at every character in the result; if any cross the
3250 * boundary, the whole thing is disallowed */
3251 U8* s = ustrp + UTF8SKIP(ustrp);
3252 U8* e = ustrp + *lenp;
3253 while (s < e) {
a4f12ed7 3254 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
3255 goto bad_crossing;
3256 }
3257 s += UTF8SKIP(s);
3258 }
3259
613abc6d
KW
3260 /* Here, no characters crossed, result is ok as-is, but we warn. */
3261 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
051a06d4
KW
3262 return result;
3263 }
3264
7b52d656 3265 bad_crossing:
051a06d4
KW
3266
3267 /* Failed, have to return the original */
4b88fb76 3268 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
3269
3270 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3271 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
56576a04
KW
3272 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
3273 " locale; resolved to \"\\x{%" UVXf "}\".",
357aadde 3274 OP_DESC(PL_op),
ab0b796c
KW
3275 original,
3276 original);
051a06d4
KW
3277 Copy(p, ustrp, *lenp, char);
3278 return original;
3279}
3280
607313a1
KW
3281STATIC U32
3282S_check_and_deprecate(pTHX_ const U8 *p,
3283 const U8 **e,
3284 const unsigned int type, /* See below */
3285 const bool use_locale, /* Is this a 'LC_'
3286 macro call? */
3287 const char * const file,
3288 const unsigned line)
3289{
3290 /* This is a temporary function to deprecate the unsafe calls to the case
3291 * changing macros and functions. It keeps all the special stuff in just
3292 * one place.
3293 *
3294 * It updates *e with the pointer to the end of the input string. If using
3295 * the old-style macros, *e is NULL on input, and so this function assumes
3296 * the input string is long enough to hold the entire UTF-8 sequence, and
3297 * sets *e accordingly, but it then returns a flag to pass the
3298 * utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
3299 * using the full length if possible.
3300 *
3301 * It also does the assert that *e > p when *e is not NULL. This should be
3302 * migrated to the callers when this function gets deleted.
3303 *
3304 * The 'type' parameter is used for the caller to specify which case
3305 * changing function this is called from: */
3306
3307# define DEPRECATE_TO_UPPER 0
3308# define DEPRECATE_TO_TITLE 1
3309# define DEPRECATE_TO_LOWER 2
3310# define DEPRECATE_TO_FOLD 3
3311
3312 U32 utf8n_flags = 0;
3313 const char * name;
3314 const char * alternative;
3315
3316 PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
3317
3318 if (*e == NULL) {
3319 utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
3320 *e = p + UTF8SKIP(p);
3321
3322 /* For mathoms.c calls, we use the function name we know is stored
c44e9413 3323 * there. It could be part of a larger path */
607313a1 3324 if (type == DEPRECATE_TO_UPPER) {
c44e9413 3325 name = instr(file, "mathoms.c")
607313a1
KW
3326 ? "to_utf8_upper"
3327 : "toUPPER_utf8";
3328 alternative = "toUPPER_utf8_safe";
3329 }
3330 else if (type == DEPRECATE_TO_TITLE) {
c44e9413 3331 name = instr(file, "mathoms.c")
607313a1
KW
3332 ? "to_utf8_title"
3333 : "toTITLE_utf8";
3334 alternative = "toTITLE_utf8_safe";
3335 }
3336 else if (type == DEPRECATE_TO_LOWER) {
c44e9413 3337 name = instr(file, "mathoms.c")
607313a1
KW
3338 ? "to_utf8_lower"
3339 : "toLOWER_utf8";
3340 alternative = "toLOWER_utf8_safe";
3341 }
3342 else if (type == DEPRECATE_TO_FOLD) {
c44e9413 3343 name = instr(file, "mathoms.c")
607313a1
KW
3344 ? "to_utf8_fold"
3345 : "toFOLD_utf8";
3346 alternative = "toFOLD_utf8_safe";
3347 }
3348 else Perl_croak(aTHX_ "panic: Unexpected case change type");
3349
3350 warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
3351 }
3352 else {
3353 assert (p < *e);
3354 }
3355
3356 return utf8n_flags;
3357}
3358
eaf412bf
KW
3359/* The process for changing the case is essentially the same for the four case
3360 * change types, except there are complications for folding. Otherwise the
3361 * difference is only which case to change to. To make sure that they all do
3362 * the same thing, the bodies of the functions are extracted out into the
3363 * following two macros. The functions are written with the same variable
3364 * names, and these are known and used inside these macros. It would be
3365 * better, of course, to have inline functions to do it, but since different
3366 * macros are called, depending on which case is being changed to, this is not
3367 * feasible in C (to khw's knowledge). Two macros are created so that the fold
3368 * function can start with the common start macro, then finish with its special
3369 * handling; while the other three cases can just use the common end macro.
3370 *
3371 * The algorithm is to use the proper (passed in) macro or function to change
3372 * the case for code points that are below 256. The macro is used if using
3373 * locale rules for the case change; the function if not. If the code point is
3374 * above 255, it is computed from the input UTF-8, and another macro is called
3375 * to do the conversion. If necessary, the output is converted to UTF-8. If
3376 * using a locale, we have to check that the change did not cross the 255/256
3377 * boundary, see check_locale_boundary_crossing() for further details.
3378 *
3379 * The macros are split with the correct case change for the below-256 case
3380 * stored into 'result', and in the middle of an else clause for the above-255
3381 * case. At that point in the 'else', 'result' is not the final result, but is
3382 * the input code point calculated from the UTF-8. The fold code needs to
3383 * realize all this and take it from there.
3384 *
3385 * If you read the two macros as sequential, it's easier to understand what's
3386 * going on. */
3387#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \
3388 L1_func_extra_param) \
a239b1e2 3389 \
eaf412bf
KW
3390 if (flags & (locale_flags)) { \
3391 /* Treat a UTF-8 locale as not being in locale at all */ \
3392 if (IN_UTF8_CTYPE_LOCALE) { \
3393 flags &= ~(locale_flags); \
3394 } \
3395 else { \
3396 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
3397 } \
3398 } \
3399 \
3400 if (UTF8_IS_INVARIANT(*p)) { \
3401 if (flags & (locale_flags)) { \
3402 result = LC_L1_change_macro(*p); \
3403 } \
3404 else { \
3405 return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
3406 } \
3407 } \
a239b1e2 3408 else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \
eaf412bf
KW
3409 if (flags & (locale_flags)) { \
3410 result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \
3411 *(p+1))); \
3412 } \
3413 else { \
3414 return L1_func(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), \
3415 ustrp, lenp, L1_func_extra_param); \
3416 } \
3417 } \
fa8ab374
KW
3418 else { /* malformed UTF-8 or ord above 255 */ \
3419 STRLEN len_result; \
fa8ab374
KW
3420 result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
3421 if (len_result == (STRLEN) -1) { \
607313a1
KW
3422 _force_out_malformed_utf8_message(p, e, utf8n_flags, \
3423 1 /* Die */ ); \
fa8ab374 3424 }
eaf412bf
KW
3425
3426#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
3427 result = change_macro(result, p, ustrp, lenp); \
3428 \
3429 if (flags & (locale_flags)) { \
3430 result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
3431 } \
3432 return result; \
3433 } \
3434 \
3435 /* Here, used locale rules. Convert back to UTF-8 */ \
3436 if (UTF8_IS_INVARIANT(result)) { \
3437 *ustrp = (U8) result; \
3438 *lenp = 1; \
3439 } \
3440 else { \
3441 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); \
3442 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \
3443 *lenp = 2; \
3444 } \
3445 \
3446 return result;
3447
d3e79532 3448/*
87cea99e 3449=for apidoc to_utf8_upper
d3e79532 3450
a239b1e2 3451Instead use L</toUPPER_utf8_safe>.
a1433954 3452
d3e79532
JH
3453=cut */
3454
051a06d4 3455/* Not currently externally documented, and subject to change:
31f05a37
KW
3456 * <flags> is set iff iff the rules from the current underlying locale are to
3457 * be used. */
051a06d4 3458
2104c8d9 3459UV
607313a1
KW
3460Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
3461 const U8 *e,
3462 U8* ustrp,
3463 STRLEN *lenp,
3464 bool flags,
3465 const char * const file,
3466 const int line)
a0ed51b3 3467{
051a06d4 3468 UV result;
607313a1
KW
3469 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
3470 cBOOL(flags), file, line);
051a06d4
KW
3471
3472 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 3473
eaf412bf
KW
3474 /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
3475 /* 2nd char of uc(U+DF) is 'S' */
3476 CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S');
3477 CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
983ffd37 3478}
a0ed51b3 3479
d3e79532 3480/*
87cea99e 3481=for apidoc to_utf8_title
d3e79532 3482
a239b1e2 3483Instead use L</toTITLE_utf8_safe>.
a1433954 3484
d3e79532
JH
3485=cut */
3486
051a06d4 3487/* Not currently externally documented, and subject to change:
31f05a37
KW
3488 * <flags> is set iff the rules from the current underlying locale are to be
3489 * used. Since titlecase is not defined in POSIX, for other than a
3490 * UTF-8 locale, uppercase is used instead for code points < 256.
445bf929 3491 */
051a06d4 3492
983ffd37 3493UV
607313a1
KW
3494Perl__to_utf8_title_flags(pTHX_ const U8 *p,
3495 const U8 *e,
3496 U8* ustrp,
3497 STRLEN *lenp,
3498 bool flags,
3499 const char * const file,
3500 const int line)
983ffd37 3501{
051a06d4 3502 UV result;
607313a1
KW
3503 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
3504 cBOOL(flags), file, line);
051a06d4
KW
3505
3506 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 3507
eaf412bf
KW
3508 /* 2nd char of ucfirst(U+DF) is 's' */
3509 CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's');
3510 CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
a0ed51b3
LW
3511}
3512
d3e79532 3513/*
87cea99e 3514=for apidoc to_utf8_lower
d3e79532 3515
a239b1e2 3516Instead use L</toLOWER_utf8_safe>.
a1433954 3517
d3e79532
JH
3518=cut */
3519
051a06d4 3520/* Not currently externally documented, and subject to change:
31f05a37
KW
3521 * <flags> is set iff iff the rules from the current underlying locale are to
3522 * be used.
3523 */
051a06d4 3524
2104c8d9 3525UV
607313a1
KW
3526Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
3527 const U8 *e,
3528 U8* ustrp,
3529 STRLEN *lenp,
3530 bool flags,
3531 const char * const file,
3532 const int line)
a0ed51b3 3533{
051a06d4 3534 UV result;
607313a1
KW
3535 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
3536 cBOOL(flags), file, line);
051a06d4 3537
051a06d4 3538 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 3539
eaf412bf
KW
3540 CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */)
3541 CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
b4e400f9
JH
3542}
3543
d3e79532 3544/*
87cea99e 3545=for apidoc to_utf8_fold
d3e79532 3546
a239b1e2 3547Instead use L</toFOLD_utf8_safe>.
a1433954 3548
d3e79532
JH
3549=cut */
3550
051a06d4
KW
3551/* Not currently externally documented, and subject to change,
3552 * in <flags>
31f05a37
KW
3553 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3554 * locale are to be used.
051a06d4
KW
3555 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
3556 * otherwise simple folds
a0270393
KW
3557 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
3558 * prohibited
445bf929 3559 */
36bb2ab6 3560
b4e400f9 3561UV
607313a1
KW
3562Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
3563 const U8 *e,
3564 U8* ustrp,
3565 STRLEN *lenp,
3566 U8 flags,
3567 const char * const file,
3568 const int line)
b4e400f9 3569{
051a06d4 3570 UV result;
607313a1
KW
3571 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
3572 cBOOL(flags), file, line);
051a06d4 3573
36bb2ab6 3574 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 3575
a0270393
KW
3576 /* These are mutually exclusive */
3577 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
3578
50ba90ff
KW
3579 assert(p != ustrp); /* Otherwise overwrites */
3580
eaf412bf
KW
3581 CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
3582 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)));
31f05a37 3583
eaf412bf 3584 result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
a1dde8de 3585
1ca267a5
KW
3586 if (flags & FOLD_FLAGS_LOCALE) {
3587
76f2ffcd 3588# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
0766489e
KW
3589 const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
3590
3591# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3592# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
76f2ffcd
KW
3593
3594 const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
76f2ffcd 3595
538e84ed
KW
3596 /* Special case these two characters, as what normally gets
3597 * returned under locale doesn't work */
76f2ffcd
KW
3598 if (UTF8SKIP(p) == cap_sharp_s_len
3599 && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
1ca267a5 3600 {
ab0b796c
KW
3601 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3602 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3603 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3604 "resolved to \"\\x{17F}\\x{17F}\".");
1ca267a5
KW
3605 goto return_long_s;
3606 }
0766489e
KW
3607 else
3608#endif
3609 if (UTF8SKIP(p) == long_s_t_len
76f2ffcd 3610 && memEQ((char *) p, LONG_S_T, long_s_t_len))
9fc2026f 3611 {
ab0b796c
KW
3612 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3613 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3614 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3615 "resolved to \"\\x{FB06}\".");
9fc2026f
KW
3616 goto return_ligature_st;
3617 }
74894415
KW
3618
3619#if UNICODE_MAJOR_VERSION == 3 \
3620 && UNICODE_DOT_VERSION == 0 \
3621 && UNICODE_DOT_DOT_VERSION == 1
3622# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3623
3624 /* And special case this on this Unicode version only, for the same
3625 * reaons the other two are special cased. They would cross the
3626 * 255/256 boundary which is forbidden under /l, and so the code
3627 * wouldn't catch that they are equivalent (which they are only in
3628 * this release) */
3629 else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
3630 && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
3631 {
3632 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3633 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3634 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3635 "resolved to \"\\x{0131}\".");
3636 goto return_dotless_i;
3637 }
3638#endif
3639
357aadde 3640 return check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 3641 }
a0270393
KW
3642 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3643 return result;
3644 }
3645 else {
4a4088c4 3646 /* This is called when changing the case of a UTF-8-encoded
9fc2026f
KW
3647 * character above the ASCII range, and the result should not
3648 * contain an ASCII character. */
a0270393
KW
3649
3650 UV original; /* To store the first code point of <p> */
3651
3652 /* Look at every character in the result; if any cross the
3653 * boundary, the whole thing is disallowed */
3654 U8* s = ustrp;
3655 U8* e = ustrp + *lenp;
3656 while (s < e) {
3657 if (isASCII(*s)) {
3658 /* Crossed, have to return the original */
3659 original = valid_utf8_to_uvchr(p, lenp);
1ca267a5 3660
9fc2026f 3661 /* But in these instances, there is an alternative we can
1ca267a5 3662 * return that is valid */
0766489e
KW
3663 if (original == LATIN_SMALL_LETTER_SHARP_S
3664#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3665 || original == LATIN_CAPITAL_LETTER_SHARP_S
3666#endif
3667 ) {
1ca267a5
KW
3668 goto return_long_s;
3669 }
9fc2026f
KW
3670 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3671 goto return_ligature_st;
3672 }
74894415
KW
3673#if UNICODE_MAJOR_VERSION == 3 \
3674 && UNICODE_DOT_VERSION == 0 \
3675 && UNICODE_DOT_DOT_VERSION == 1
3676
3677 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3678 goto return_dotless_i;
3679 }
3680#endif
a0270393
KW
3681 Copy(p, ustrp, *lenp, char);
3682 return original;
3683 }
3684 s += UTF8SKIP(s);
3685 }
051a06d4 3686
a0270393
KW
3687 /* Here, no characters crossed, result is ok as-is */
3688 return result;
3689 }
051a06d4
KW
3690 }
3691
4a4088c4 3692 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
3693 if (UTF8_IS_INVARIANT(result)) {
3694 *ustrp = (U8) result;
3695 *lenp = 1;
3696 }
3697 else {
62cb07ea
KW
3698 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3699 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
3700 *lenp = 2;
3701 }
3702
051a06d4 3703 return result;
1ca267a5
KW
3704
3705 return_long_s:
3706 /* Certain folds to 'ss' are prohibited by the options, but they do allow
3707 * folds to a string of two of these characters. By returning this
3708 * instead, then, e.g.,
3709 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")