This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest:t/utf8_warn_base.pl: Clarify some comments
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
a0ed51b3 35
806547a7 36static const char malformed_text[] = "Malformed UTF-8 character";
27da23d5 37static const char unees[] =
806547a7 38 "Malformed UTF-8 character (unexpected end of string)";
760c7c2f 39static const char cp_above_legal_max[] =
d22ec717
KW
40 "Use of code point 0x%" UVXf " is not allowed; the"
41 " permissible max is 0x%" UVXf;
760c7c2f 42
d22ec717 43#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
901b21bf 44
48ef279e 45/*
ccfc67b7 46=head1 Unicode Support
7fefc6c1 47These are various utility functions for manipulating UTF8-encoded
72d33970 48strings. For the uninitiated, this is a method of representing arbitrary
61296642 49Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
50characters in the ASCII range are unmodified, and a zero byte never appears
51within non-zero characters.
166f8a29 52
eaf7a4d2
CS
53=cut
54*/
55
9cbfb8ab
KW
56void
57Perl__force_out_malformed_utf8_message(pTHX_
58 const U8 *const p, /* First byte in UTF-8 sequence */
59 const U8 * const e, /* Final byte in sequence (may include
60 multiple chars */
61 const U32 flags, /* Flags to pass to utf8n_to_uvchr(),
62 usually 0, or some DISALLOW flags */
63 const bool die_here) /* If TRUE, this function does not return */
64{
65 /* This core-only function is to be called when a malformed UTF-8 character
66 * is found, in order to output the detailed information about the
67 * malformation before dieing. The reason it exists is for the occasions
68 * when such a malformation is fatal, but warnings might be turned off, so
69 * that normally they would not be actually output. This ensures that they
70 * do get output. Because a sequence may be malformed in more than one
71 * way, multiple messages may be generated, so we can't make them fatal, as
72 * that would cause the first one to die.
73 *
74 * Instead we pretend -W was passed to perl, then die afterwards. The
75 * flexibility is here to return to the caller so they can finish up and
76 * die themselves */
77 U32 errors;
78
79 PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
80
81 ENTER;
c15a80f3 82 SAVEI8(PL_dowarn);
9cbfb8ab
KW
83 SAVESPTR(PL_curcop);
84
85 PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
86 if (PL_curcop) {
87 PL_curcop->cop_warnings = pWARN_ALL;
88 }
89
90 (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
91
92 LEAVE;
93
94 if (! errors) {
95 Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
96 " be called only when there are errors found");
97 }
98
99 if (die_here) {
100 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
101 }
102}
103
bb07812e
KW
104STATIC HV *
105S_new_msg_hv(pTHX_ const char * const message, /* The message text */
106 U32 categories, /* Packed warning categories */
107 U32 flag) /* Flag associated with this message */
108{
109 /* Creates, populates, and returns an HV* that describes an error message
110 * for the translators between UTF8 and code point */
111
112 SV* msg_sv = newSVpv(message, 0);
113 SV* category_sv = newSVuv(categories);
114 SV* flag_bit_sv = newSVuv(flag);
115
116 HV* msg_hv = newHV();
117
118 PERL_ARGS_ASSERT_NEW_MSG_HV;
119
120 hv_stores(msg_hv, "text", msg_sv);
121 hv_stores(msg_hv, "warn_categories", category_sv);
122 hv_stores(msg_hv, "flag_bit", flag_bit_sv);
123
124 return msg_hv;
125}
126
eaf7a4d2 127/*
378516de 128=for apidoc uvoffuni_to_utf8_flags
eebe1485 129
a27992cc 130THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af
KW
131Instead, B<Almost all code should use L</uvchr_to_utf8> or
132L</uvchr_to_utf8_flags>>.
a27992cc 133
de69f3af
KW
134This function is like them, but the input is a strict Unicode
135(as opposed to native) code point. Only in very rare circumstances should code
136not be using the native code point.
949cf498 137
efa9cd84 138For details, see the description for L</uvchr_to_utf8_flags>.
949cf498 139
eebe1485
SC
140=cut
141*/
142
c94c2f39
KW
143/* All these formats take a single UV code point argument */
144const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
145const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
146 " is not recommended for open interchange";
147const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
148 " may not be portable";
57ff5f59
KW
149const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \
150 " Unicode, requires a Perl extension," \
151 " and so is not portable";
c94c2f39 152
8ee1cdcb
KW
153#define HANDLE_UNICODE_SURROGATE(uv, flags) \
154 STMT_START { \
155 if (flags & UNICODE_WARN_SURROGATE) { \
156 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
c94c2f39 157 surrogate_cp_format, uv); \
8ee1cdcb
KW
158 } \
159 if (flags & UNICODE_DISALLOW_SURROGATE) { \
160 return NULL; \
161 } \
162 } STMT_END;
163
164#define HANDLE_UNICODE_NONCHAR(uv, flags) \
165 STMT_START { \
166 if (flags & UNICODE_WARN_NONCHAR) { \
167 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
c94c2f39 168 nonchar_cp_format, uv); \
8ee1cdcb
KW
169 } \
170 if (flags & UNICODE_DISALLOW_NONCHAR) { \
171 return NULL; \
172 } \
173 } STMT_END;
174
ba6ed43c
KW
175/* Use shorter names internally in this file */
176#define SHIFT UTF_ACCUMULATION_SHIFT
177#undef MARK
178#define MARK UTF_CONTINUATION_MARK
179#define MASK UTF_CONTINUATION_MASK
180
dfe13c55 181U8 *
4b31b634 182Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
a0ed51b3 183{
378516de 184 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
7918f24d 185
2d1545e5 186 if (OFFUNI_IS_INVARIANT(uv)) {
4c8cd605 187 *d++ = LATIN1_TO_NATIVE(uv);
d9432125
KW
188 return d;
189 }
facc1dc2 190
3ea68d71 191 if (uv <= MAX_UTF8_TWO_BYTE) {
facc1dc2
KW
192 *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
193 *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
3ea68d71
KW
194 return d;
195 }
d9432125 196
ba6ed43c
KW
197 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
198 * below, the 16 is for start bytes E0-EF (which are all the possible ones
199 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
200 * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
201 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
202 * 0x800-0xFFFF on ASCII */
203 if (uv < (16 * (1U << (2 * SHIFT)))) {
204 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
205 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
206 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
207
208#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
209 aren't tested here */
210 /* The most likely code points in this range are below the surrogates.
211 * Do an extra test to quickly exclude those. */
212 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
213 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
214 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
215 {
8ee1cdcb
KW
216 HANDLE_UNICODE_NONCHAR(uv, flags);
217 }
218 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
219 HANDLE_UNICODE_SURROGATE(uv, flags);
760c7c2f 220 }
ba6ed43c
KW
221 }
222#endif
223 return d;
224 }
225
226 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
227 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
228 * happen starting with 4-byte characters on ASCII platforms. We unify the
229 * code for these with EBCDIC, even though some of them require 5-bytes on
230 * those, because khw believes the code saving is worth the very slight
231 * performance hit on these high EBCDIC code points. */
232
233 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
d22ec717
KW
234 if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
235 Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
a5bf80e0 236 }
0a8a1a5b
KW
237 if ( (flags & UNICODE_WARN_SUPER)
238 || ( (flags & UNICODE_WARN_PERL_EXTENDED)
239 && UNICODE_IS_PERL_EXTENDED(uv)))
a5bf80e0
KW
240 {
241 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
242
243 /* Choose the more dire applicable warning */
d044b7a7 244 (UNICODE_IS_PERL_EXTENDED(uv))
57ff5f59 245 ? perl_extended_cp_format
c94c2f39 246 : super_cp_format,
a5bf80e0
KW
247 uv);
248 }
56576a04 249 if ( (flags & UNICODE_DISALLOW_SUPER)
0a8a1a5b
KW
250 || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
251 && UNICODE_IS_PERL_EXTENDED(uv)))
a5bf80e0
KW
252 {
253 return NULL;
254 }
255 }
ba6ed43c
KW
256 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
257 HANDLE_UNICODE_NONCHAR(uv, flags);
507b9800 258 }
d9432125 259
ba6ed43c
KW
260 /* Test for and handle 4-byte result. In the test immediately below, the
261 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
262 * characters). The 3 is for 3 continuation bytes; these each contribute
263 * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
264 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
265 * 0x1_0000-0x1F_FFFF on ASCII */
266 if (uv < (8 * (1U << (3 * SHIFT)))) {
267 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
268 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
269 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
270 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
271
272#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
273 characters. The end-plane non-characters for EBCDIC were
274 handled just above */
275 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
276 HANDLE_UNICODE_NONCHAR(uv, flags);
d528804a 277 }
ba6ed43c
KW
278 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
279 HANDLE_UNICODE_SURROGATE(uv, flags);
280 }
281#endif
282
283 return d;
284 }
285
286 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
287 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
288 * format. The unrolled version above turns out to not save all that much
289 * time, and at these high code points (well above the legal Unicode range
290 * on ASCII platforms, and well above anything in common use in EBCDIC),
291 * khw believes that less code outweighs slight performance gains. */
292
d9432125 293 {
5aaebcb3 294 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
295 U8 *p = d+len-1;
296 while (p > d) {
4c8cd605 297 *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
298 uv >>= UTF_ACCUMULATION_SHIFT;
299 }
4c8cd605 300 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
301 return d+len;
302 }
a0ed51b3 303}
a5bf80e0 304
646ca15d 305/*
07693fe6
KW
306=for apidoc uvchr_to_utf8
307
bcb1a2d4 308Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 309of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
310C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
311the byte after the end of the new character. In other words,
07693fe6
KW
312
313 d = uvchr_to_utf8(d, uv);
314
315is the recommended wide native character-aware way of saying
316
317 *(d++) = uv;
318
d22ec717
KW
319This function accepts any code point from 0..C<IV_MAX> as input.
320C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
760c7c2f
KW
321
322It is possible to forbid or warn on non-Unicode code points, or those that may
323be problematic by using L</uvchr_to_utf8_flags>.
de69f3af 324
07693fe6
KW
325=cut
326*/
327
de69f3af
KW
328/* This is also a macro */
329PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
330
07693fe6
KW
331U8 *
332Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
333{
de69f3af 334 return uvchr_to_utf8(d, uv);
07693fe6
KW
335}
336
de69f3af
KW
337/*
338=for apidoc uvchr_to_utf8_flags
339
340Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 341of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
342C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
343the byte after the end of the new character. In other words,
de69f3af
KW
344
345 d = uvchr_to_utf8_flags(d, uv, flags);
346
347or, in most cases,
348
349 d = uvchr_to_utf8_flags(d, uv, 0);
350
351This is the Unicode-aware way of saying
352
353 *(d++) = uv;
354
d22ec717
KW
355If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
356input. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
760c7c2f
KW
357
358Specifying C<flags> can further restrict what is allowed and not warned on, as
359follows:
de69f3af 360
796b6530 361If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
7ee537e6
KW
362the function will raise a warning, provided UTF8 warnings are enabled. If
363instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
364NULL. If both flags are set, the function will both warn and return NULL.
de69f3af 365
760c7c2f
KW
366Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
367affect how the function handles a Unicode non-character.
93e6dbd6 368
760c7c2f
KW
369And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
370affect the handling of code points that are above the Unicode maximum of
3710x10FFFF. Languages other than Perl may not be able to accept files that
372contain these.
93e6dbd6
KW
373
374The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
375the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
ecc1615f
KW
376three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
377allowed inputs to the strict UTF-8 traditionally defined by Unicode.
378Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
379C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
380above-Unicode and surrogate flags, but not the non-character ones, as
381defined in
382L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
383See L<perlunicode/Noncharacter code points>.
93e6dbd6 384
57ff5f59
KW
385Extremely high code points were never specified in any standard, and require an
386extension to UTF-8 to express, which Perl does. It is likely that programs
387written in something other than Perl would not be able to read files that
388contain these; nor would Perl understand files written by something that uses a
389different extension. For these reasons, there is a separate set of flags that
390can warn and/or disallow these extremely high code points, even if other
391above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
392and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
393L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
394treat all above-Unicode code points, including these, as malformations. (Note
395that the Unicode standard considers anything above 0x10FFFF to be illegal, but
396there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
397
398A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
399retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly,
400C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
401C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because these
402flags can apply to code points that actually do fit in 31 bits. This happens
403on EBCDIC platforms, and sometimes when the L<overlong
404malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
405describe the situation in all cases.
de69f3af 406
de69f3af
KW
407=cut
408*/
409
410/* This is also a macro */
411PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
412
07693fe6
KW
413U8 *
414Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
415{
de69f3af 416 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
417}
418
57ff5f59
KW
419#ifndef UV_IS_QUAD
420
e050c007
KW
421STATIC int
422S_is_utf8_cp_above_31_bits(const U8 * const s,
423 const U8 * const e,
424 const bool consider_overlongs)
83dc0f42
KW
425{
426 /* Returns TRUE if the first code point represented by the Perl-extended-
427 * UTF-8-encoded string starting at 's', and looking no further than 'e -
428 * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
429 *
430 * The function handles the case where the input bytes do not include all
431 * the ones necessary to represent a full character. That is, they may be
432 * the intial bytes of the representation of a code point, but possibly
433 * the final ones necessary for the complete representation may be beyond
434 * 'e - 1'.
435 *
e050c007
KW
436 * The function also can handle the case where the input is an overlong
437 * sequence. If 'consider_overlongs' is 0, the function assumes the
438 * input is not overlong, without checking, and will return based on that
439 * assumption. If this parameter is 1, the function will go to the trouble
440 * of figuring out if it actually evaluates to above or below 31 bits.
83dc0f42 441 *
e050c007 442 * The sequence is otherwise assumed to be well-formed, without checking.
83dc0f42
KW
443 */
444
e050c007
KW
445 const STRLEN len = e - s;
446 int is_overlong;
447
448 PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
449
450 assert(! UTF8_IS_INVARIANT(*s) && e > s);
451
83dc0f42
KW
452#ifdef EBCDIC
453
e050c007 454 PERL_UNUSED_ARG(consider_overlongs);
83dc0f42 455
e050c007
KW
456 /* On the EBCDIC code pages we handle, only the native start byte 0xFE can
457 * mean a 32-bit or larger code point (0xFF is an invariant). 0xFE can
458 * also be the start byte for a 31-bit code point; we need at least 2
459 * bytes, and maybe up through 8 bytes, to determine that. (It can also be
460 * the start byte for an overlong sequence, but for 30-bit or smaller code
461 * points, so we don't have to worry about overlongs on EBCDIC.) */
462 if (*s != 0xFE) {
463 return 0;
464 }
83dc0f42 465
e050c007
KW
466 if (len == 1) {
467 return -1;
468 }
83dc0f42 469
e050c007 470#else
83dc0f42 471
e050c007
KW
472 /* On ASCII, FE and FF are the only start bytes that can evaluate to
473 * needing more than 31 bits. */
474 if (LIKELY(*s < 0xFE)) {
475 return 0;
476 }
83dc0f42 477
e050c007
KW
478 /* What we have left are FE and FF. Both of these require more than 31
479 * bits unless they are for overlongs. */
480 if (! consider_overlongs) {
481 return 1;
482 }
83dc0f42 483
e050c007
KW
484 /* Here, we have FE or FF. If the input isn't overlong, it evaluates to
485 * above 31 bits. But we need more than one byte to discern this, so if
486 * passed just the start byte, it could be an overlong evaluating to
487 * smaller */
488 if (len == 1) {
489 return -1;
490 }
83dc0f42 491
e050c007
KW
492 /* Having excluded len==1, and knowing that FE and FF are both valid start
493 * bytes, we can call the function below to see if the sequence is
494 * overlong. (We don't need the full generality of the called function,
495 * but for these huge code points, speed shouldn't be a consideration, and
496 * the compiler does have enough information, since it's static to this
497 * file, to optimize to just the needed parts.) */
498 is_overlong = is_utf8_overlong_given_start_byte_ok(s, len);
83dc0f42 499
e050c007
KW
500 /* If it isn't overlong, more than 31 bits are required. */
501 if (is_overlong == 0) {
502 return 1;
503 }
83dc0f42 504
e050c007
KW
505 /* If it is indeterminate if it is overlong, return that */
506 if (is_overlong < 0) {
507 return -1;
508 }
509
510 /* Here is overlong. Such a sequence starting with FE is below 31 bits, as
511 * the max it can be is 2**31 - 1 */
512 if (*s == 0xFE) {
513 return 0;
83dc0f42
KW
514 }
515
e050c007
KW
516#endif
517
518 /* Here, ASCII and EBCDIC rejoin:
519 * On ASCII: We have an overlong sequence starting with FF
520 * On EBCDIC: We have a sequence starting with FE. */
521
522 { /* For C89, use a block so the declaration can be close to its use */
523
524#ifdef EBCDIC
525
5f995336
KW
526 /* U+7FFFFFFF (2 ** 31 - 1)
527 * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
528 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
529 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
530 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
531 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
532 * U+80000000 (2 ** 31):
533 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
534 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
535 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
536 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
e050c007
KW
537 *
538 * and since we know that *s = \xfe, any continuation sequcence
539 * following it that is gt the below is above 31 bits
540 [0] [1] [2] [3] [4] [5] [6] */
541 const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
542
543#else
544
545 /* FF overlong for U+7FFFFFFF (2 ** 31 - 1)
546 * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF
547 * FF overlong for U+80000000 (2 ** 31):
548 * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80
549 * and since we know that *s = \xff, any continuation sequcence
550 * following it that is gt the below is above 30 bits
551 [0] [1] [2] [3] [4] [5] [6] */
552 const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
5f995336 553
83dc0f42
KW
554
555#endif
e050c007
KW
556 const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
557 const STRLEN cmp_len = MIN(conts_len, len - 1);
558
559 /* Now compare the continuation bytes in s with the ones we have
560 * compiled in that are for the largest 30 bit code point. If we have
561 * enough bytes available to determine the answer, or the bytes we do
562 * have differ from them, we can compare the two to get a definitive
563 * answer (Note that in UTF-EBCDIC, the two lowest possible
564 * continuation bytes are \x41 and \x42.) */
565 if (cmp_len >= conts_len || memNE(s + 1,
566 conts_for_highest_30_bit,
567 cmp_len))
568 {
569 return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len));
570 }
83dc0f42 571
e050c007
KW
572 /* Here, all the bytes we have are the same as the highest 30-bit code
573 * point, but we are missing so many bytes that we can't make the
574 * determination */
575 return -1;
576 }
83dc0f42
KW
577}
578
57ff5f59
KW
579#endif
580
d6be65ae 581PERL_STATIC_INLINE int
12a4bed3
KW
582S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
583{
d6be65ae
KW
584 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
585 * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
586 * it isn't, and -1 if there isn't enough information to tell. This last
587 * return value can happen if the sequence is incomplete, missing some
588 * trailing bytes that would form a complete character. If there are
589 * enough bytes to make a definitive decision, this function does so.
590 * Usually 2 bytes sufficient.
591 *
592 * Overlongs can occur whenever the number of continuation bytes changes.
593 * That means whenever the number of leading 1 bits in a start byte
594 * increases from the next lower start byte. That happens for start bytes
595 * C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following illegal
596 * start bytes have already been excluded, so don't need to be tested here;
12a4bed3
KW
597 * ASCII platforms: C0, C1
598 * EBCDIC platforms C0, C1, C2, C3, C4, E0
d6be65ae 599 */
12a4bed3
KW
600
601 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
602 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
603
604 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
605 assert(len > 1 && UTF8_IS_START(*s));
606
607 /* Each platform has overlongs after the start bytes given above (expressed
608 * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
609 * the logic is the same, except the E0 overlong has already been excluded
610 * on EBCDIC platforms. The values below were found by manually
611 * inspecting the UTF-8 patterns. See the tables in utf8.h and
612 * utfebcdic.h. */
613
614# ifdef EBCDIC
615# define F0_ABOVE_OVERLONG 0xB0
616# define F8_ABOVE_OVERLONG 0xA8
617# define FC_ABOVE_OVERLONG 0xA4
618# define FE_ABOVE_OVERLONG 0xA2
619# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
620 /* I8(0xfe) is FF */
621# else
622
623 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
d6be65ae 624 return 1;
12a4bed3
KW
625 }
626
627# define F0_ABOVE_OVERLONG 0x90
628# define F8_ABOVE_OVERLONG 0x88
629# define FC_ABOVE_OVERLONG 0x84
630# define FE_ABOVE_OVERLONG 0x82
631# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
632# endif
633
634
635 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
636 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
637 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
638 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
639 {
d6be65ae 640 return 1;
12a4bed3
KW
641 }
642
b0b342d4 643 /* Check for the FF overlong */
d6be65ae 644 return isFF_OVERLONG(s, len);
b0b342d4
KW
645}
646
8d6204cc 647PERL_STATIC_INLINE int
b0b342d4
KW
648S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
649{
8d6204cc
KW
650 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
651 * 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if
652 * it isn't, and -1 if there isn't enough information to tell. This last
653 * return value can happen if the sequence is incomplete, missing some
654 * trailing bytes that would form a complete character. If there are
655 * enough bytes to make a definitive decision, this function does so. */
656
b0b342d4 657 PERL_ARGS_ASSERT_ISFF_OVERLONG;
12a4bed3 658
8d6204cc
KW
659 /* To be an FF overlong, all the available bytes must match */
660 if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
661 MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
662 {
663 return 0;
664 }
665
666 /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
667 * be there; what comes after them doesn't matter. See tables in utf8.h,
b0b342d4 668 * utfebcdic.h. */
8d6204cc
KW
669 if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
670 return 1;
671 }
12a4bed3 672
8d6204cc
KW
673 /* The missing bytes could cause the result to go one way or the other, so
674 * the result is indeterminate */
675 return -1;
12a4bed3
KW
676}
677
d22ec717 678#if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */
a77c906e
KW
679# ifdef EBCDIC /* Actually is I8 */
680# define HIGHEST_REPRESENTABLE_UTF8 \
d22ec717 681 "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
a77c906e
KW
682# else
683# define HIGHEST_REPRESENTABLE_UTF8 \
d22ec717 684 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
a77c906e
KW
685# endif
686#endif
687
c285bbc4 688PERL_STATIC_INLINE int
e050c007
KW
689S_does_utf8_overflow(const U8 * const s,
690 const U8 * e,
691 const bool consider_overlongs)
a77c906e 692{
c285bbc4 693 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
d22ec717
KW
694 * 'e' - 1 would overflow an IV on this platform; that is if it represents
695 * a code point larger than the highest representable code point. It
696 * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
697 * enough information to tell. This last return value can happen if the
698 * sequence is incomplete, missing some trailing bytes that would form a
699 * complete character. If there are enough bytes to make a definitive
700 * decision, this function does so.
c285bbc4 701 *
e050c007
KW
702 * If 'consider_overlongs' is TRUE, the function checks for the possibility
703 * that the sequence is an overlong that doesn't overflow. Otherwise, it
704 * assumes the sequence is not an overlong. This can give different
705 * results only on ASCII 32-bit platforms.
706 *
c285bbc4
KW
707 * (For ASCII platforms, we could use memcmp() because we don't have to
708 * convert each byte to I8, but it's very rare input indeed that would
709 * approach overflow, so the loop below will likely only get executed once.)
710 *
711 * 'e' - 1 must not be beyond a full character. */
a77c906e 712
a77c906e
KW
713
714 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
715 assert(s <= e && s + UTF8SKIP(s) >= e);
716
d22ec717
KW
717#if ! defined(UV_IS_QUAD)
718
719 return is_utf8_cp_above_31_bits(s, e, consider_overlongs);
720
721#else
722
723 PERL_UNUSED_ARG(consider_overlongs);
724
725 {
726 const STRLEN len = e - s;
727 const U8 *x;
728 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
729
730 for (x = s; x < e; x++, y++) {
731
732 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
733 continue;
734 }
735
736 /* If this byte is larger than the corresponding highest UTF-8
737 * byte, the sequence overflow; otherwise the byte is less than,
738 * and so the sequence doesn't overflow */
739 return NATIVE_UTF8_TO_I8(*x) > *y;
740
741 }
742
743 /* Got to the end and all bytes are the same. If the input is a whole
744 * character, it doesn't overflow. And if it is a partial character,
745 * there's not enough information to tell */
746 if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
747 return -1;
748 }
749
750 return 0;
751 }
752
753#endif
754
755}
756
757#if 0
758
759/* This is the portions of the above function that deal with UV_MAX instead of
760 * IV_MAX. They are left here in case we want to combine them so that internal
761 * uses can have larger code points. The only logic difference is that the
762 * 32-bit EBCDIC platform is treate like the 64-bit, and the 32-bit ASCII has
763 * different logic.
764 */
765
766/* Anything larger than this will overflow the word if it were converted into a UV */
767#if defined(UV_IS_QUAD)
768# ifdef EBCDIC /* Actually is I8 */
769# define HIGHEST_REPRESENTABLE_UTF8 \
770 "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
771# else
772# define HIGHEST_REPRESENTABLE_UTF8 \
773 "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
774# endif
775#else /* 32-bit */
776# ifdef EBCDIC
777# define HIGHEST_REPRESENTABLE_UTF8 \
778 "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
779# else
780# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
781# endif
782#endif
783
a77c906e
KW
784#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
785
786 /* On 32 bit ASCII machines, many overlongs that start with FF don't
787 * overflow */
e050c007 788 if (consider_overlongs && isFF_OVERLONG(s, len) > 0) {
c285bbc4
KW
789
790 /* To be such an overlong, the first bytes of 's' must match
791 * FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80". If we
792 * don't have any additional bytes available, the sequence, when
793 * completed might or might not fit in 32 bits. But if we have that
794 * next byte, we can tell for sure. If it is <= 0x83, then it does
795 * fit. */
796 if (len <= sizeof(FF_OVERLONG_PREFIX) - 1) {
797 return -1;
798 }
799
800 return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83;
a77c906e
KW
801 }
802
d22ec717
KW
803/* Starting with the #else, the rest of the function is identical except
804 * 1. we need to move the 'len' declaration to be global to the function
805 * 2. the endif move to just after the UNUSED_ARG.
806 * An empty endif is given just below to satisfy the preprocessor
807 */
a77c906e
KW
808#endif
809
d22ec717 810#endif
a77c906e 811
12a4bed3
KW
812#undef F0_ABOVE_OVERLONG
813#undef F8_ABOVE_OVERLONG
814#undef FC_ABOVE_OVERLONG
815#undef FE_ABOVE_OVERLONG
816#undef FF_OVERLONG_PREFIX
817
35f8c9bd 818STRLEN
edc2c47a 819Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
35f8c9bd 820{
2b479609 821 STRLEN len;
12a4bed3 822 const U8 *x;
35f8c9bd 823
2b479609
KW
824 /* A helper function that should not be called directly.
825 *
826 * This function returns non-zero if the string beginning at 's' and
827 * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
828 * code point; otherwise it returns 0. The examination stops after the
829 * first code point in 's' is validated, not looking at the rest of the
830 * input. If 'e' is such that there are not enough bytes to represent a
831 * complete code point, this function will return non-zero anyway, if the
832 * bytes it does have are well-formed UTF-8 as far as they go, and aren't
833 * excluded by 'flags'.
834 *
835 * A non-zero return gives the number of bytes required to represent the
836 * code point. Be aware that if the input is for a partial character, the
837 * return will be larger than 'e - s'.
838 *
839 * This function assumes that the code point represented is UTF-8 variant.
56576a04
KW
840 * The caller should have excluded the possibility of it being invariant
841 * before calling this function.
2b479609
KW
842 *
843 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
844 * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
845 * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
846 * disallowed by the flags. If the input is only for a partial character,
847 * the function will return non-zero if there is any sequence of
848 * well-formed UTF-8 that, when appended to the input sequence, could
849 * result in an allowed code point; otherwise it returns 0. Non characters
850 * cannot be determined based on partial character input. But many of the
851 * other excluded types can be determined with just the first one or two
852 * bytes.
853 *
854 */
855
856 PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
857
858 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 859 |UTF8_DISALLOW_PERL_EXTENDED)));
2b479609 860 assert(! UTF8_IS_INVARIANT(*s));
35f8c9bd 861
2b479609 862 /* A variant char must begin with a start byte */
35f8c9bd
KW
863 if (UNLIKELY(! UTF8_IS_START(*s))) {
864 return 0;
865 }
866
edc2c47a
KW
867 /* Examine a maximum of a single whole code point */
868 if (e - s > UTF8SKIP(s)) {
869 e = s + UTF8SKIP(s);
870 }
871
2b479609
KW
872 len = e - s;
873
874 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
875 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
35f8c9bd 876
56576a04
KW
877 /* Here, we are disallowing some set of largish code points, and the
878 * first byte indicates the sequence is for a code point that could be
879 * in the excluded set. We generally don't have to look beyond this or
880 * the second byte to see if the sequence is actually for one of the
881 * excluded classes. The code below is derived from this table:
882 *
2b479609
KW
883 * UTF-8 UTF-EBCDIC I8
884 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
885 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
886 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
887 *
56576a04
KW
888 * Keep in mind that legal continuation bytes range between \x80..\xBF
889 * for UTF-8, and \xA0..\xBF for I8. Anything above those aren't
890 * continuation bytes. Hence, we don't have to test the upper edge
891 * because if any of those is encountered, the sequence is malformed,
892 * and would fail elsewhere in this function.
893 *
894 * The code here likewise assumes that there aren't other
895 * malformations; again the function should fail elsewhere because of
896 * these. For example, an overlong beginning with FC doesn't actually
897 * have to be a super; it could actually represent a small code point,
898 * even U+0000. But, since overlongs (and other malformations) are
899 * illegal, the function should return FALSE in either case.
2b479609
KW
900 */
901
902#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
903# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
19794540 904# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
2b479609 905
19794540
KW
906# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
907 /* B6 and B7 */ \
908 && ((s1) & 0xFE ) == 0xB6)
57ff5f59 909# define isUTF8_PERL_EXTENDED(s) (*s == I8_TO_NATIVE_UTF8(0xFF))
2b479609
KW
910#else
911# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
19794540
KW
912# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
913# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
57ff5f59 914# define isUTF8_PERL_EXTENDED(s) (*s >= 0xFE)
2b479609
KW
915#endif
916
917 if ( (flags & UTF8_DISALLOW_SUPER)
ddb65933
KW
918 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
919 {
2b479609
KW
920 return 0; /* Above Unicode */
921 }
922
d044b7a7 923 if ( (flags & UTF8_DISALLOW_PERL_EXTENDED)
57ff5f59 924 && UNLIKELY(isUTF8_PERL_EXTENDED(s)))
2b479609 925 {
57ff5f59 926 return 0;
2b479609
KW
927 }
928
929 if (len > 1) {
930 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
931
932 if ( (flags & UTF8_DISALLOW_SUPER)
19794540 933 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
2b479609
KW
934 {
935 return 0; /* Above Unicode */
936 }
937
938 if ( (flags & UTF8_DISALLOW_SURROGATE)
19794540 939 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
2b479609
KW
940 {
941 return 0; /* Surrogate */
942 }
943
944 if ( (flags & UTF8_DISALLOW_NONCHAR)
945 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
946 {
947 return 0; /* Noncharacter code point */
948 }
949 }
950 }
951
952 /* Make sure that all that follows are continuation bytes */
35f8c9bd
KW
953 for (x = s + 1; x < e; x++) {
954 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
955 return 0;
956 }
957 }
958
af13dd8a 959 /* Here is syntactically valid. Next, make sure this isn't the start of an
12a4bed3 960 * overlong. */
d6be65ae 961 if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
12a4bed3 962 return 0;
af13dd8a
KW
963 }
964
12a4bed3
KW
965 /* And finally, that the code point represented fits in a word on this
966 * platform */
e050c007
KW
967 if (0 < does_utf8_overflow(s, e,
968 0 /* Don't consider overlongs */
969 ))
970 {
12a4bed3 971 return 0;
35f8c9bd
KW
972 }
973
2b479609 974 return UTF8SKIP(s);
35f8c9bd
KW
975}
976
7e2f38b2 977char *
63ab03b3 978Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
7cf8d05d
KW
979{
980 /* Returns a mortalized C string that is a displayable copy of the 'len'
63ab03b3 981 * bytes starting at 'start'. 'format' gives how to display each byte.
7e2f38b2
KW
982 * Currently, there are only two formats, so it is currently a bool:
983 * 0 \xab
984 * 1 ab (that is a space between two hex digit bytes)
985 */
7cf8d05d
KW
986
987 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
988 trailing NUL */
63ab03b3
KW
989 const U8 * s = start;
990 const U8 * const e = start + len;
7cf8d05d
KW
991 char * output;
992 char * d;
993
994 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
995
996 Newx(output, output_len, char);
997 SAVEFREEPV(output);
998
999 d = output;
63ab03b3 1000 for (s = start; s < e; s++) {
7cf8d05d
KW
1001 const unsigned high_nibble = (*s & 0xF0) >> 4;
1002 const unsigned low_nibble = (*s & 0x0F);
1003
7e2f38b2 1004 if (format) {
63ab03b3
KW
1005 if (s > start) {
1006 *d++ = ' ';
1007 }
7e2f38b2
KW
1008 }
1009 else {
1010 *d++ = '\\';
1011 *d++ = 'x';
1012 }
7cf8d05d
KW
1013
1014 if (high_nibble < 10) {
1015 *d++ = high_nibble + '0';
1016 }
1017 else {
1018 *d++ = high_nibble - 10 + 'a';
1019 }
1020
1021 if (low_nibble < 10) {
1022 *d++ = low_nibble + '0';
1023 }
1024 else {
1025 *d++ = low_nibble - 10 + 'a';
1026 }
1027 }
1028
1029 *d = '\0';
1030 return output;
1031}
1032
806547a7 1033PERL_STATIC_INLINE char *
7cf8d05d
KW
1034S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
1035
1036 /* How many bytes to print */
3cc6a05e 1037 STRLEN print_len,
7cf8d05d
KW
1038
1039 /* Which one is the non-continuation */
1040 const STRLEN non_cont_byte_pos,
1041
1042 /* How many bytes should there be? */
1043 const STRLEN expect_len)
806547a7
KW
1044{
1045 /* Return the malformation warning text for an unexpected continuation
1046 * byte. */
1047
7cf8d05d 1048 const char * const where = (non_cont_byte_pos == 1)
806547a7 1049 ? "immediately"
7cf8d05d
KW
1050 : Perl_form(aTHX_ "%d bytes",
1051 (int) non_cont_byte_pos);
806547a7
KW
1052
1053 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
1054
7cf8d05d
KW
1055 /* We don't need to pass this parameter, but since it has already been
1056 * calculated, it's likely faster to pass it; verify under DEBUGGING */
1057 assert(expect_len == UTF8SKIP(s));
1058
1059 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
1060 " %s after start byte 0x%02x; need %d bytes, got %d)",
1061 malformed_text,
7e2f38b2 1062 _byte_dump_string(s, print_len, 0),
7cf8d05d
KW
1063 *(s + non_cont_byte_pos),
1064 where,
1065 *s,
1066 (int) expect_len,
1067 (int) non_cont_byte_pos);
806547a7
KW
1068}
1069
35f8c9bd
KW
1070/*
1071
de69f3af 1072=for apidoc utf8n_to_uvchr
378516de
KW
1073
1074THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af 1075Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
67e989fb 1076
9041c2e3 1077Bottom level UTF-8 decode routine.
de69f3af 1078Returns the native code point value of the first character in the string C<s>,
746afd53
KW
1079which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
1080C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
1081the length, in bytes, of that character.
949cf498
KW
1082
1083The value of C<flags> determines the behavior when C<s> does not point to a
2b5e7bc2
KW
1084well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
1085causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
1086is the next possible position in C<s> that could begin a non-malformed
1087character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
1088is raised. Some UTF-8 input sequences may contain multiple malformations.
1089This function tries to find every possible one in each call, so multiple
56576a04 1090warnings can be raised for the same sequence.
949cf498
KW
1091
1092Various ALLOW flags can be set in C<flags> to allow (and not warn on)
1093individual types of malformations, such as the sequence being overlong (that
1094is, when there is a shorter sequence that can express the same code point;
1095overlong sequences are expressly forbidden in the UTF-8 standard due to
1096potential security issues). Another malformation example is the first byte of
1097a character not being a legal first byte. See F<utf8.h> for the list of such
94953955
KW
1098flags. Even if allowed, this function generally returns the Unicode
1099REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
1100F<utf8.h> to override this behavior for the overlong malformations, but don't
1101do that except for very specialized purposes.
949cf498 1102
796b6530 1103The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
1104flags) malformation is found. If this flag is set, the routine assumes that
1105the caller will raise a warning, and this function will silently just set
d088425d
KW
1106C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
1107
75200dff 1108Note that this API requires disambiguation between successful decoding a C<NUL>
796b6530 1109character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
111fa700
KW
1110in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
1111be set to 1. To disambiguate, upon a zero return, see if the first byte of
1112C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
f9380377 1113error. Or you can use C<L</utf8n_to_uvchr_error>>.
949cf498
KW
1114
1115Certain code points are considered problematic. These are Unicode surrogates,
746afd53 1116Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 1117By default these are considered regular code points, but certain situations
ecc1615f
KW
1118warrant special handling for them, which can be specified using the C<flags>
1119parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
1120three classes are treated as malformations and handled as such. The flags
1121C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
1122C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
1123disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
1124restricts the allowed inputs to the strict UTF-8 traditionally defined by
1125Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
1126definition given by
1127L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1128The difference between traditional strictness and C9 strictness is that the
1129latter does not forbid non-character code points. (They are still discouraged,
1130however.) For more discussion see L<perlunicode/Noncharacter code points>.
1131
1132The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
1133C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
796b6530
KW
1134C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
1135raised for their respective categories, but otherwise the code points are
1136considered valid (not malformations). To get a category to both be treated as
1137a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 1138(But note that warnings are not raised if lexically disabled nor if
796b6530 1139C<UTF8_CHECK_ONLY> is also specified.)
949cf498 1140
57ff5f59
KW
1141Extremely high code points were never specified in any standard, and require an
1142extension to UTF-8 to express, which Perl does. It is likely that programs
1143written in something other than Perl would not be able to read files that
1144contain these; nor would Perl understand files written by something that uses a
1145different extension. For these reasons, there is a separate set of flags that
1146can warn and/or disallow these extremely high code points, even if other
1147above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
1148C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
1149L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
1150above-Unicode code points, including these, as malformations.
1151(Note that the Unicode standard considers anything above 0x10FFFF to be
1152illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
1153(2**31 -1))
1154
1155A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
1156retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly,
1157C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
1158C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags
1159can apply to code points that actually do fit in 31 bits. This happens on
1160EBCDIC platforms, and sometimes when the L<overlong
1161malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
1162describe the situation in all cases.
1163
ab8e6d41 1164
949cf498
KW
1165All other code points corresponding to Unicode characters, including private
1166use and those yet to be assigned, are never considered malformed and never
1167warn.
67e989fb 1168
37607a96 1169=cut
f9380377
KW
1170
1171Also implemented as a macro in utf8.h
1172*/
1173
1174UV
1175Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
1176 STRLEN curlen,
1177 STRLEN *retlen,
1178 const U32 flags)
1179{
1180 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
1181
1182 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
1183}
1184
2b9519f0
KW
1185/* The tables below come from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/,
1186 * which requires this copyright notice */
1187
1188/* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
1189
1190Permission is hereby granted, free of charge, to any person obtaining a copy of
1191this software and associated documentation files (the "Software"), to deal in
1192the Software without restriction, including without limitation the rights to
1193use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
1194of the Software, and to permit persons to whom the Software is furnished to do
1195so, subject to the following conditions:
1196
1197The above copyright notice and this permission notice shall be included in all
1198copies or substantial portions of the Software.
1199
1200THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1201IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1202FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1203AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1204LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1205OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1206SOFTWARE.
1207
1208*/
1209
1210#if 0
1211static U8 utf8d_C9[] = {
1212 /* The first part of the table maps bytes to character classes that
1213 * to reduce the size of the transition table and create bitmasks. */
1214 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/
1215 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/
1216 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/
1217 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/
1218 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/
1219 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/
1220 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/
1221 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /*-FF*/
1222
1223 /* The second part is a transition table that maps a combination
1224 * of a state of the automaton and a character class to a state. */
1225 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
1226 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
1227 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
1228 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
1229 12,36,12,12,12,12,12,12,12,12,12,12
1230};
1231
1232#endif
1233
1234#ifndef EBCDIC
1235
1236/* This is a version of the above table customized for Perl that doesn't
1237 * exclude surrogates and accepts start bytes up through F7 (representing
1238 * 2**21 - 1). */
1239static U8 dfa_tab_for_perl[] = {
1240 /* The first part of the table maps bytes to character classes to reduce
1241 * the size of the transition table and create bitmasks. */
1242 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/
1243 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/
1244 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/
1245 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/
1246 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/
1247 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/
1248 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/
1249 10,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 11,4,4,4,4,4,4,4,8,8,8,8,8,8,8,8, /*-FF*/
1250
1251 /* The second part is a transition table that maps a combination
1252 * of a state of the automaton and a character class to a state. */
1253 0,12,24,36,96,12,12,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,/*23*/
1254 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,/*47*/
1255 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,/*71*/
1256 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,/*95*/
1257 12,36,12,12,12,12,12,36,12,36,12,12 /* 96- 107 */
1258
1259 /* The customization was to repurpose the surrogates type '4' to instead be
1260 * for start bytes F1-F7. Types 5 and 6 are now unused, and their entries in
1261 * the transition part of the table are set to 12, so are illegal.
1262 *
1263 * To do higher code points would require expansion and some rearrangement of
1264 * the table. The type '1' entries for continuation bytes 80-8f would have to
1265 * be split into several types, because they aren't treated uniformly for
1266 * higher start bytes, since overlongs for F8 are 80-87; FC: 80-83; and FE:
1267 * 80-81. We start needing to worry about overflow if FE is included.
1268 * Ignoring, FE and FF, we could use type 5 for F9-FB, and 6 for FD (remember
1269 * from the web site that these are used to right shift). FE would
1270 * necessarily be type 7; and FF, type 8. And new states would have to be
1271 * created for F8 and FC (and FE and FF if used), so quite a bit of work would
1272 * be involved.
1273 *
1274 * XXX Better would be to customize the table so that the noncharacters are
1275 * excluded. This again is non trivial, but doing so would simplify the code
1276 * that uses this, and might make it small enough to make it inlinable */
1277};
1278
1279#endif
1280
f9380377
KW
1281/*
1282
1283=for apidoc utf8n_to_uvchr_error
1284
1285THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1286Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
1287
1288This function is for code that needs to know what the precise malformation(s)
37657a5b
KW
1289are when an error is found. If you also need to know the generated warning
1290messages, use L</utf8n_to_uvchr_msgs>() instead.
f9380377
KW
1291
1292It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1293all the others, C<errors>. If this parameter is 0, this function behaves
1294identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
1295to a C<U32> variable, which this function sets to indicate any errors found.
1296Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
1297C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
1298of these bits will be set if a malformation is found, even if the input
7a65503b 1299C<flags> parameter indicates that the given malformation is allowed; those
f9380377
KW
1300exceptions are noted:
1301
1302=over 4
1303
57ff5f59 1304=item C<UTF8_GOT_PERL_EXTENDED>
f9380377 1305
57ff5f59
KW
1306The input sequence is not standard UTF-8, but a Perl extension. This bit is
1307set only if the input C<flags> parameter contains either the
1308C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
1309
1310Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
1311and so some extension must be used to express them. Perl uses a natural
1312extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
1313extension to represent even higher ones, so that any code point that fits in a
131464-bit word can be represented. Text using these extensions is not likely to
1315be portable to non-Perl code. We lump both of these extensions together and
1316refer to them as Perl extended UTF-8. There exist other extensions that people
1317have invented, incompatible with Perl's.
1318
1319On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
1320extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
1321than on ASCII. Prior to that, code points 2**31 and higher were simply
1322unrepresentable, and a different, incompatible method was used to represent
1323code points between 2**30 and 2**31 - 1.
1324
1325On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
1326Perl extended UTF-8 is used.
1327
1328In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
1329may use for backward compatibility. That name is misleading, as this flag may
1330be set when the code point actually does fit in 31 bits. This happens on
1331EBCDIC platforms, and sometimes when the L<overlong
1332malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately
1333describes the situation in all cases.
f9380377
KW
1334
1335=item C<UTF8_GOT_CONTINUATION>
1336
1337The input sequence was malformed in that the first byte was a a UTF-8
1338continuation byte.
1339
1340=item C<UTF8_GOT_EMPTY>
1341
1342The input C<curlen> parameter was 0.
1343
1344=item C<UTF8_GOT_LONG>
1345
1346The input sequence was malformed in that there is some other sequence that
1347evaluates to the same code point, but that sequence is shorter than this one.
1348
fecaf136
KW
1349Until Unicode 3.1, it was legal for programs to accept this malformation, but
1350it was discovered that this created security issues.
1351
f9380377
KW
1352=item C<UTF8_GOT_NONCHAR>
1353
1354The code point represented by the input UTF-8 sequence is for a Unicode
1355non-character code point.
1356This bit is set only if the input C<flags> parameter contains either the
1357C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1358
1359=item C<UTF8_GOT_NON_CONTINUATION>
1360
1361The input sequence was malformed in that a non-continuation type byte was found
1362in a position where only a continuation type one should be.
1363
1364=item C<UTF8_GOT_OVERFLOW>
1365
1366The input sequence was malformed in that it is for a code point that is not
d22ec717 1367representable in the number of bits available in an IV on the current platform.
f9380377
KW
1368
1369=item C<UTF8_GOT_SHORT>
1370
1371The input sequence was malformed in that C<curlen> is smaller than required for
1372a complete sequence. In other words, the input is for a partial character
1373sequence.
1374
1375=item C<UTF8_GOT_SUPER>
1376
1377The input sequence was malformed in that it is for a non-Unicode code point;
1378that is, one above the legal Unicode maximum.
1379This bit is set only if the input C<flags> parameter contains either the
1380C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1381
1382=item C<UTF8_GOT_SURROGATE>
1383
1384The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
1385code point.
1386This bit is set only if the input C<flags> parameter contains either the
1387C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1388
1389=back
1390
133551d8
KW
1391To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1392flag to suppress any warnings, and then examine the C<*errors> return.
1393
f9380377 1394=cut
37657a5b
KW
1395
1396Also implemented as a macro in utf8.h
37607a96 1397*/
67e989fb 1398
a0ed51b3 1399UV
f9380377 1400Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
37657a5b
KW
1401 STRLEN curlen,
1402 STRLEN *retlen,
1403 const U32 flags,
1404 U32 * errors)
1405{
1406 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1407
1408 return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
1409}
1410
1411/*
1412
1413=for apidoc utf8n_to_uvchr_msgs
1414
1415THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1416Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
1417
1418This function is for code that needs to know what the precise malformation(s)
1419are when an error is found, and wants the corresponding warning and/or error
1420messages to be returned to the caller rather than be displayed. All messages
1421that would have been displayed if all lexcial warnings are enabled will be
1422returned.
1423
1424It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
1425placed after all the others, C<msgs>. If this parameter is 0, this function
1426behaves identically to C<L</utf8n_to_uvchr_error>>. Otherwise, C<msgs> should
1427be a pointer to an C<AV *> variable, in which this function creates a new AV to
1428contain any appropriate messages. The elements of the array are ordered so
1429that the first message that would have been displayed is in the 0th element,
1430and so on. Each element is a hash with three key-value pairs, as follows:
1431
1432=over 4
1433
1434=item C<text>
1435
1436The text of the message as a C<SVpv>.
1437
1438=item C<warn_categories>
1439
1440The warning category (or categories) packed into a C<SVuv>.
1441
1442=item C<flag>
1443
1444A single flag bit associated with this message, in a C<SVuv>.
1445The bit corresponds to some bit in the C<*errors> return value,
1446such as C<UTF8_GOT_LONG>.
1447
1448=back
1449
1450It's important to note that specifying this parameter as non-null will cause
1451any warnings this function would otherwise generate to be suppressed, and
1452instead be placed in C<*msgs>. The caller can check the lexical warnings state
1453(or not) when choosing what to do with the returned messages.
1454
1455If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
1456no AV is created.
1457
1458The caller, of course, is responsible for freeing any returned AV.
1459
1460=cut
1461*/
1462
1463UV
1464Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
1465 STRLEN curlen,
1466 STRLEN *retlen,
1467 const U32 flags,
1468 U32 * errors,
1469 AV ** msgs)
a0ed51b3 1470{
d4c19fe8 1471 const U8 * const s0 = s;
2b9519f0 1472 const U8 * send = s0 + curlen;
2b5e7bc2
KW
1473 U32 possible_problems = 0; /* A bit is set here for each potential problem
1474 found as we go along */
2b9519f0 1475 UV uv;
2b5e7bc2
KW
1476 STRLEN expectlen = 0; /* How long should this sequence be?
1477 (initialized to silence compilers' wrong
1478 warning) */
e308b348 1479 STRLEN avail_len = 0; /* When input is too short, gives what that is */
f9380377
KW
1480 U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
1481 this gets set and discarded */
a0dbb045 1482
2b5e7bc2
KW
1483 /* The below are used only if there is both an overlong malformation and a
1484 * too short one. Otherwise the first two are set to 's0' and 'send', and
1485 * the third not used at all */
1486 U8 * adjusted_s0 = (U8 *) s0;
e9f2c446
KW
1487 U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1488 routine; see [perl #130921] */
2b5e7bc2 1489 UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
7918f24d 1490
2b9519f0
KW
1491 UV state = 0;
1492
37657a5b 1493 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
f9380377
KW
1494
1495 if (errors) {
1496 *errors = 0;
1497 }
1498 else {
1499 errors = &discard_errors;
1500 }
a0dbb045 1501
eb83ed87
KW
1502 /* The order of malformation tests here is important. We should consume as
1503 * few bytes as possible in order to not skip any valid character. This is
1504 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1505 * http://unicode.org/reports/tr36 for more discussion as to why. For
1506 * example, once we've done a UTF8SKIP, we can tell the expected number of
1507 * bytes, and could fail right off the bat if the input parameters indicate
1508 * that there are too few available. But it could be that just that first
1509 * byte is garbled, and the intended character occupies fewer bytes. If we
1510 * blindly assumed that the first byte is correct, and skipped based on
1511 * that number, we could skip over a valid input character. So instead, we
1512 * always examine the sequence byte-by-byte.
1513 *
1514 * We also should not consume too few bytes, otherwise someone could inject
1515 * things. For example, an input could be deliberately designed to
1516 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 1517 * returning to the caller C<*retlen> pointing to the very next byte (one
eb83ed87
KW
1518 * which is actually part of of the overflowing sequence), that could look
1519 * legitimate to the caller, which could discard the initial partial
2b5e7bc2
KW
1520 * sequence and process the rest, inappropriately.
1521 *
1522 * Some possible input sequences are malformed in more than one way. This
1523 * function goes to lengths to try to find all of them. This is necessary
1524 * for correctness, as the inputs may allow one malformation but not
1525 * another, and if we abandon searching for others after finding the
1526 * allowed one, we could allow in something that shouldn't have been.
1527 */
eb83ed87 1528
b5b9af04 1529 if (UNLIKELY(curlen == 0)) {
2b5e7bc2
KW
1530 possible_problems |= UTF8_GOT_EMPTY;
1531 curlen = 0;
5a48568d 1532 uv = UNICODE_REPLACEMENT;
2b5e7bc2 1533 goto ready_to_handle_errors;
0c443dc2
JH
1534 }
1535
eb83ed87
KW
1536 expectlen = UTF8SKIP(s);
1537
1538 /* A well-formed UTF-8 character, as the vast majority of calls to this
1539 * function will be for, has this expected length. For efficiency, set
1540 * things up here to return it. It will be overriden only in those rare
1541 * cases where a malformation is found */
1542 if (retlen) {
1543 *retlen = expectlen;
1544 }
1545
1546 /* An invariant is trivially well-formed */
2b9519f0
KW
1547 if (UTF8_IS_INVARIANT(*s0)) {
1548 return *s0;
a0ed51b3 1549 }
67e989fb 1550
2b9519f0
KW
1551#ifndef EBCDIC
1552
1553 /* Measurements show that this dfa is somewhat faster than the regular code
1554 * below, so use it first, dropping down for the non-normal cases. */
1555
1556# define PERL_UTF8_DECODE_REJECT 12
1557
1558 while (s < send && LIKELY(state != PERL_UTF8_DECODE_REJECT)) {
1559 UV type = dfa_tab_for_perl[*s];
1560
1561 if (state != 0) {
1562 uv = (*s & 0x3fu) | (uv << UTF_ACCUMULATION_SHIFT);
1563 state = dfa_tab_for_perl[256 + state + type];
1564 }
1565 else {
1566 uv = (0xff >> type) & (*s);
1567 state = dfa_tab_for_perl[256 + type];
1568 }
1569
1570 if (state == 0) {
1571
1572 /* If this could be a code point that the flags don't allow (the first
1573 * surrogate is the first such possible one), delve further, but we already
1574 * have calculated 'uv' */
1575 if ( (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1576 |UTF8_WARN_ILLEGAL_INTERCHANGE))
1577 && uv >= UNICODE_SURROGATE_FIRST)
1578 {
1579 curlen = s + 1 - s0;
1580 goto got_uv;
1581 }
1582
1583 return uv;
1584 }
1585
1586 s++;
1587 }
1588
1589 /* Here, is some sort of failure. Use the full mechanism */
1590
1591 uv = *s0;
1592
1593#endif
1594
eb83ed87 1595 /* A continuation character can't start a valid sequence */
b5b9af04 1596 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
2b5e7bc2
KW
1597 possible_problems |= UTF8_GOT_CONTINUATION;
1598 curlen = 1;
1599 uv = UNICODE_REPLACEMENT;
1600 goto ready_to_handle_errors;
ba210ebe 1601 }
9041c2e3 1602
dcd27b3c 1603 /* Here is not a continuation byte, nor an invariant. The only thing left
ddb65933
KW
1604 * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
1605 * because it excludes start bytes like \xC0 that always lead to
1606 * overlongs.) */
dcd27b3c 1607
534752c1
KW
1608 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1609 * that indicate the number of bytes in the character's whole UTF-8
1610 * sequence, leaving just the bits that are part of the value. */
1611 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
ba210ebe 1612
e308b348
KW
1613 /* Setup the loop end point, making sure to not look past the end of the
1614 * input string, and flag it as too short if the size isn't big enough. */
e308b348
KW
1615 if (UNLIKELY(curlen < expectlen)) {
1616 possible_problems |= UTF8_GOT_SHORT;
1617 avail_len = curlen;
e308b348
KW
1618 }
1619 else {
2b9519f0 1620 send = (U8*) s0 + expectlen;
e308b348 1621 }
e308b348 1622
eb83ed87 1623 /* Now, loop through the remaining bytes in the character's sequence,
e308b348 1624 * accumulating each into the working value as we go. */
eb83ed87 1625 for (s = s0 + 1; s < send; s++) {
b5b9af04 1626 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
8850bf83 1627 uv = UTF8_ACCUMULATE(uv, *s);
2b5e7bc2
KW
1628 continue;
1629 }
1630
1631 /* Here, found a non-continuation before processing all expected bytes.
1632 * This byte indicates the beginning of a new character, so quit, even
1633 * if allowing this malformation. */
2b5e7bc2 1634 possible_problems |= UTF8_GOT_NON_CONTINUATION;
e308b348 1635 break;
eb83ed87
KW
1636 } /* End of loop through the character's bytes */
1637
1638 /* Save how many bytes were actually in the character */
1639 curlen = s - s0;
1640
2b5e7bc2
KW
1641 /* Note that there are two types of too-short malformation. One is when
1642 * there is actual wrong data before the normal termination of the
1643 * sequence. The other is that the sequence wasn't complete before the end
1644 * of the data we are allowed to look at, based on the input 'curlen'.
1645 * This means that we were passed data for a partial character, but it is
1646 * valid as far as we saw. The other is definitely invalid. This
1647 * distinction could be important to a caller, so the two types are kept
15b010f0
KW
1648 * separate.
1649 *
1650 * A convenience macro that matches either of the too-short conditions. */
1651# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1652
1653 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1654 uv_so_far = uv;
1655 uv = UNICODE_REPLACEMENT;
1656 }
2b5e7bc2 1657
08e73697
KW
1658 /* Check for overflow. The algorithm requires us to not look past the end
1659 * of the current character, even if partial, so the upper limit is 's' */
e050c007
KW
1660 if (UNLIKELY(0 < does_utf8_overflow(s0, s,
1661 1 /* Do consider overlongs */
1662 )))
1663 {
2b5e7bc2
KW
1664 possible_problems |= UTF8_GOT_OVERFLOW;
1665 uv = UNICODE_REPLACEMENT;
eb83ed87 1666 }
eb83ed87 1667
2b5e7bc2
KW
1668 /* Check for overlong. If no problems so far, 'uv' is the correct code
1669 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1670 * we must look at the UTF-8 byte sequence itself to see if it is for an
1671 * overlong */
1672 if ( ( LIKELY(! possible_problems)
1673 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
56576a04 1674 || ( UNLIKELY(possible_problems)
2b5e7bc2
KW
1675 && ( UNLIKELY(! UTF8_IS_START(*s0))
1676 || ( curlen > 1
d6be65ae 1677 && UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0,
08e73697 1678 s - s0))))))
2f8f112e 1679 {
2b5e7bc2
KW
1680 possible_problems |= UTF8_GOT_LONG;
1681
abc28b54 1682 if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
56576a04 1683
abc28b54
KW
1684 /* The calculation in the 'true' branch of this 'if'
1685 * below won't work if overflows, and isn't needed
1686 * anyway. Further below we handle all overflow
1687 * cases */
1688 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1689 {
2b5e7bc2
KW
1690 UV min_uv = uv_so_far;
1691 STRLEN i;
1692
1693 /* Here, the input is both overlong and is missing some trailing
1694 * bytes. There is no single code point it could be for, but there
1695 * may be enough information present to determine if what we have
1696 * so far is for an unallowed code point, such as for a surrogate.
56576a04
KW
1697 * The code further below has the intelligence to determine this,
1698 * but just for non-overlong UTF-8 sequences. What we do here is
1699 * calculate the smallest code point the input could represent if
1700 * there were no too short malformation. Then we compute and save
1701 * the UTF-8 for that, which is what the code below looks at
1702 * instead of the raw input. It turns out that the smallest such
1703 * code point is all we need. */
2b5e7bc2
KW
1704 for (i = curlen; i < expectlen; i++) {
1705 min_uv = UTF8_ACCUMULATE(min_uv,
1706 I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
1707 }
1708
e9f2c446 1709 adjusted_s0 = temp_char_buf;
57ff5f59 1710 (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
2b5e7bc2 1711 }
eb83ed87
KW
1712 }
1713
2b9519f0
KW
1714 got_uv:
1715
56576a04
KW
1716 /* Here, we have found all the possible problems, except for when the input
1717 * is for a problematic code point not allowed by the input parameters. */
1718
06188866
KW
1719 /* uv is valid for overlongs */
1720 if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
1721
1722 /* isn't problematic if < this */
1723 && uv >= UNICODE_SURROGATE_FIRST)
2b5e7bc2 1724 || ( UNLIKELY(possible_problems)
d60baaa7
KW
1725
1726 /* if overflow, we know without looking further
1727 * precisely which of the problematic types it is,
1728 * and we deal with those in the overflow handling
1729 * code */
1730 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
57ff5f59
KW
1731 && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
1732 || UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
760c7c2f
KW
1733 && ((flags & ( UTF8_DISALLOW_NONCHAR
1734 |UTF8_DISALLOW_SURROGATE
1735 |UTF8_DISALLOW_SUPER
d044b7a7 1736 |UTF8_DISALLOW_PERL_EXTENDED
760c7c2f
KW
1737 |UTF8_WARN_NONCHAR
1738 |UTF8_WARN_SURROGATE
1739 |UTF8_WARN_SUPER
d22ec717 1740 |UTF8_WARN_PERL_EXTENDED))))
eb83ed87 1741 {
2b5e7bc2
KW
1742 /* If there were no malformations, or the only malformation is an
1743 * overlong, 'uv' is valid */
1744 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1745 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1746 possible_problems |= UTF8_GOT_SURROGATE;
1747 }
1748 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
1749 possible_problems |= UTF8_GOT_SUPER;
1750 }
1751 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1752 possible_problems |= UTF8_GOT_NONCHAR;
1753 }
1754 }
1755 else { /* Otherwise, need to look at the source UTF-8, possibly
1756 adjusted to be non-overlong */
1757
1758 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1759 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
ea5ced44 1760 {
2b5e7bc2
KW
1761 possible_problems |= UTF8_GOT_SUPER;
1762 }
1763 else if (curlen > 1) {
1764 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
1765 NATIVE_UTF8_TO_I8(*adjusted_s0),
1766 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
ea5ced44 1767 {
2b5e7bc2 1768 possible_problems |= UTF8_GOT_SUPER;
ea5ced44 1769 }
2b5e7bc2
KW
1770 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
1771 NATIVE_UTF8_TO_I8(*adjusted_s0),
1772 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1773 {
1774 possible_problems |= UTF8_GOT_SURROGATE;
ea5ced44
KW
1775 }
1776 }
c0236afe 1777
2b5e7bc2
KW
1778 /* We need a complete well-formed UTF-8 character to discern
1779 * non-characters, so can't look for them here */
1780 }
1781 }
949cf498 1782
2b5e7bc2
KW
1783 ready_to_handle_errors:
1784
1785 /* At this point:
1786 * curlen contains the number of bytes in the sequence that
1787 * this call should advance the input by.
e308b348
KW
1788 * avail_len gives the available number of bytes passed in, but
1789 * only if this is less than the expected number of
1790 * bytes, based on the code point's start byte.
2b5e7bc2
KW
1791 * possible_problems' is 0 if there weren't any problems; otherwise a bit
1792 * is set in it for each potential problem found.
1793 * uv contains the code point the input sequence
1794 * represents; or if there is a problem that prevents
1795 * a well-defined value from being computed, it is
1796 * some subsitute value, typically the REPLACEMENT
1797 * CHARACTER.
1798 * s0 points to the first byte of the character
56576a04
KW
1799 * s points to just after were we left off processing
1800 * the character
1801 * send points to just after where that character should
1802 * end, based on how many bytes the start byte tells
1803 * us should be in it, but no further than s0 +
1804 * avail_len
2b5e7bc2 1805 */
eb83ed87 1806
2b5e7bc2
KW
1807 if (UNLIKELY(possible_problems)) {
1808 bool disallowed = FALSE;
1809 const U32 orig_problems = possible_problems;
1810
37657a5b
KW
1811 if (msgs) {
1812 *msgs = NULL;
1813 }
1814
2b5e7bc2
KW
1815 while (possible_problems) { /* Handle each possible problem */
1816 UV pack_warn = 0;
1817 char * message = NULL;
37657a5b 1818 U32 this_flag_bit = 0;
2b5e7bc2
KW
1819
1820 /* Each 'if' clause handles one problem. They are ordered so that
1821 * the first ones' messages will be displayed before the later
6c64cd9d
KW
1822 * ones; this is kinda in decreasing severity order. But the
1823 * overlong must come last, as it changes 'uv' looked at by the
1824 * others */
2b5e7bc2
KW
1825 if (possible_problems & UTF8_GOT_OVERFLOW) {
1826
56576a04
KW
1827 /* Overflow means also got a super and are using Perl's
1828 * extended UTF-8, but we handle all three cases here */
2b5e7bc2 1829 possible_problems
d044b7a7 1830 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
f9380377
KW
1831 *errors |= UTF8_GOT_OVERFLOW;
1832
1833 /* But the API says we flag all errors found */
1834 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1835 *errors |= UTF8_GOT_SUPER;
1836 }
ddb65933 1837 if (flags
d044b7a7 1838 & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
ddb65933 1839 {
d044b7a7 1840 *errors |= UTF8_GOT_PERL_EXTENDED;
f9380377 1841 }
2b5e7bc2 1842
d60baaa7 1843 /* Disallow if any of the three categories say to */
56576a04 1844 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
d60baaa7 1845 || (flags & ( UTF8_DISALLOW_SUPER
d044b7a7 1846 |UTF8_DISALLOW_PERL_EXTENDED)))
d60baaa7
KW
1847 {
1848 disallowed = TRUE;
1849 }
1850
d22ec717
KW
1851 /* Likewise, warn if any say to */
1852 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
1853 || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
d60baaa7 1854 {
2b5e7bc2 1855
ddb65933
KW
1856 /* The warnings code explicitly says it doesn't handle the
1857 * case of packWARN2 and two categories which have
1858 * parent-child relationship. Even if it works now to
1859 * raise the warning if either is enabled, it wouldn't
1860 * necessarily do so in the future. We output (only) the
56576a04 1861 * most dire warning */
ddb65933 1862 if (! (flags & UTF8_CHECK_ONLY)) {
37657a5b 1863 if (msgs || ckWARN_d(WARN_UTF8)) {
ddb65933
KW
1864 pack_warn = packWARN(WARN_UTF8);
1865 }
37657a5b 1866 else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
ddb65933
KW
1867 pack_warn = packWARN(WARN_NON_UNICODE);
1868 }
1869 if (pack_warn) {
1870 message = Perl_form(aTHX_ "%s: %s (overflows)",
1871 malformed_text,
05b9033b 1872 _byte_dump_string(s0, curlen, 0));
37657a5b 1873 this_flag_bit = UTF8_GOT_OVERFLOW;
ddb65933 1874 }
2b5e7bc2
KW
1875 }
1876 }
1877 }
1878 else if (possible_problems & UTF8_GOT_EMPTY) {
1879 possible_problems &= ~UTF8_GOT_EMPTY;
f9380377 1880 *errors |= UTF8_GOT_EMPTY;
2b5e7bc2
KW
1881
1882 if (! (flags & UTF8_ALLOW_EMPTY)) {
d1f8d421
KW
1883
1884 /* This so-called malformation is now treated as a bug in
1885 * the caller. If you have nothing to decode, skip calling
1886 * this function */
1887 assert(0);
1888
2b5e7bc2 1889 disallowed = TRUE;
37657a5b
KW
1890 if ( (msgs
1891 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1892 {
2b5e7bc2
KW
1893 pack_warn = packWARN(WARN_UTF8);
1894 message = Perl_form(aTHX_ "%s (empty string)",
1895 malformed_text);
37657a5b 1896 this_flag_bit = UTF8_GOT_EMPTY;
2b5e7bc2
KW
1897 }
1898 }
1899 }
1900 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1901 possible_problems &= ~UTF8_GOT_CONTINUATION;
f9380377 1902 *errors |= UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1903
1904 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1905 disallowed = TRUE;
37657a5b
KW
1906 if (( msgs
1907 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1908 {
2b5e7bc2
KW
1909 pack_warn = packWARN(WARN_UTF8);
1910 message = Perl_form(aTHX_
1911 "%s: %s (unexpected continuation byte 0x%02x,"
1912 " with no preceding start byte)",
1913 malformed_text,
7e2f38b2 1914 _byte_dump_string(s0, 1, 0), *s0);
37657a5b 1915 this_flag_bit = UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1916 }
1917 }
1918 }
2b5e7bc2
KW
1919 else if (possible_problems & UTF8_GOT_SHORT) {
1920 possible_problems &= ~UTF8_GOT_SHORT;
f9380377 1921 *errors |= UTF8_GOT_SHORT;
2b5e7bc2
KW
1922
1923 if (! (flags & UTF8_ALLOW_SHORT)) {
1924 disallowed = TRUE;
37657a5b
KW
1925 if (( msgs
1926 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1927 {
2b5e7bc2
KW
1928 pack_warn = packWARN(WARN_UTF8);
1929 message = Perl_form(aTHX_
56576a04
KW
1930 "%s: %s (too short; %d byte%s available, need %d)",
1931 malformed_text,
1932 _byte_dump_string(s0, send - s0, 0),
1933 (int)avail_len,
1934 avail_len == 1 ? "" : "s",
1935 (int)expectlen);
37657a5b 1936 this_flag_bit = UTF8_GOT_SHORT;
2b5e7bc2
KW
1937 }
1938 }
ba210ebe 1939
2b5e7bc2 1940 }
e308b348
KW
1941 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1942 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1943 *errors |= UTF8_GOT_NON_CONTINUATION;
1944
1945 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1946 disallowed = TRUE;
37657a5b
KW
1947 if (( msgs
1948 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1949 {
99a765e9
KW
1950
1951 /* If we don't know for sure that the input length is
1952 * valid, avoid as much as possible reading past the
1953 * end of the buffer */
1954 int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
1955 ? s - s0
1956 : send - s0;
e308b348
KW
1957 pack_warn = packWARN(WARN_UTF8);
1958 message = Perl_form(aTHX_ "%s",
1959 unexpected_non_continuation_text(s0,
99a765e9 1960 printlen,
e308b348
KW
1961 s - s0,
1962 (int) expectlen));
37657a5b 1963 this_flag_bit = UTF8_GOT_NON_CONTINUATION;
e308b348
KW
1964 }
1965 }
1966 }
2b5e7bc2
KW
1967 else if (possible_problems & UTF8_GOT_SURROGATE) {
1968 possible_problems &= ~UTF8_GOT_SURROGATE;
1969
f9380377
KW
1970 if (flags & UTF8_WARN_SURROGATE) {
1971 *errors |= UTF8_GOT_SURROGATE;
1972
1973 if ( ! (flags & UTF8_CHECK_ONLY)
37657a5b 1974 && (msgs || ckWARN_d(WARN_SURROGATE)))
f9380377 1975 {
2b5e7bc2
KW
1976 pack_warn = packWARN(WARN_SURROGATE);
1977
1978 /* These are the only errors that can occur with a
1979 * surrogate when the 'uv' isn't valid */
1980 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1981 message = Perl_form(aTHX_
1982 "UTF-16 surrogate (any UTF-8 sequence that"
1983 " starts with \"%s\" is for a surrogate)",
7e2f38b2 1984 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
1985 }
1986 else {
c94c2f39 1987 message = Perl_form(aTHX_ surrogate_cp_format, uv);
2b5e7bc2 1988 }
37657a5b 1989 this_flag_bit = UTF8_GOT_SURROGATE;
f9380377 1990 }
2b5e7bc2 1991 }
ba210ebe 1992
2b5e7bc2
KW
1993 if (flags & UTF8_DISALLOW_SURROGATE) {
1994 disallowed = TRUE;
f9380377 1995 *errors |= UTF8_GOT_SURROGATE;
2b5e7bc2
KW
1996 }
1997 }
1998 else if (possible_problems & UTF8_GOT_SUPER) {
1999 possible_problems &= ~UTF8_GOT_SUPER;
949cf498 2000
f9380377
KW
2001 if (flags & UTF8_WARN_SUPER) {
2002 *errors |= UTF8_GOT_SUPER;
2003
2004 if ( ! (flags & UTF8_CHECK_ONLY)
37657a5b 2005 && (msgs || ckWARN_d(WARN_NON_UNICODE)))
f9380377 2006 {
2b5e7bc2
KW
2007 pack_warn = packWARN(WARN_NON_UNICODE);
2008
2009 if (orig_problems & UTF8_GOT_TOO_SHORT) {
2010 message = Perl_form(aTHX_
2011 "Any UTF-8 sequence that starts with"
2012 " \"%s\" is for a non-Unicode code point,"
2013 " may not be portable",
7e2f38b2 2014 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
2015 }
2016 else {
c94c2f39 2017 message = Perl_form(aTHX_ super_cp_format, uv);
2b5e7bc2 2018 }
37657a5b 2019 this_flag_bit = UTF8_GOT_SUPER;
f9380377 2020 }
2b5e7bc2 2021 }
ba210ebe 2022
57ff5f59
KW
2023 /* Test for Perl's extended UTF-8 after the regular SUPER ones,
2024 * and before possibly bailing out, so that the more dire
2025 * warning will override the regular one. */
2026 if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
2b5e7bc2 2027 if ( ! (flags & UTF8_CHECK_ONLY)
d044b7a7 2028 && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
37657a5b 2029 && (msgs || ckWARN_d(WARN_NON_UNICODE)))
2b5e7bc2 2030 {
db0f09e6 2031 pack_warn = packWARN(WARN_NON_UNICODE);
2b5e7bc2 2032
57ff5f59
KW
2033 /* If it is an overlong that evaluates to a code point
2034 * that doesn't have to use the Perl extended UTF-8, it
2035 * still used it, and so we output a message that
2036 * doesn't refer to the code point. The same is true
2037 * if there was a SHORT malformation where the code
2038 * point is not valid. In that case, 'uv' will have
2039 * been set to the REPLACEMENT CHAR, and the message
2040 * below without the code point in it will be selected
2041 * */
2042 if (UNICODE_IS_PERL_EXTENDED(uv)) {
2b5e7bc2 2043 message = Perl_form(aTHX_
57ff5f59 2044 perl_extended_cp_format, uv);
2b5e7bc2
KW
2045 }
2046 else {
2047 message = Perl_form(aTHX_
57ff5f59
KW
2048 "Any UTF-8 sequence that starts with"
2049 " \"%s\" is a Perl extension, and"
2050 " so is not portable",
2051 _byte_dump_string(s0, curlen, 0));
2b5e7bc2 2052 }
37657a5b 2053 this_flag_bit = UTF8_GOT_PERL_EXTENDED;
2b5e7bc2
KW
2054 }
2055
d044b7a7
KW
2056 if (flags & ( UTF8_WARN_PERL_EXTENDED
2057 |UTF8_DISALLOW_PERL_EXTENDED))
ddb65933 2058 {
d044b7a7 2059 *errors |= UTF8_GOT_PERL_EXTENDED;
f9380377 2060
d044b7a7 2061 if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
f9380377
KW
2062 disallowed = TRUE;
2063 }
2b5e7bc2
KW
2064 }
2065 }
eb83ed87 2066
2b5e7bc2 2067 if (flags & UTF8_DISALLOW_SUPER) {
f9380377 2068 *errors |= UTF8_GOT_SUPER;
2b5e7bc2
KW
2069 disallowed = TRUE;
2070 }
2b5e7bc2
KW
2071 }
2072 else if (possible_problems & UTF8_GOT_NONCHAR) {
2073 possible_problems &= ~UTF8_GOT_NONCHAR;
ba210ebe 2074
f9380377
KW
2075 if (flags & UTF8_WARN_NONCHAR) {
2076 *errors |= UTF8_GOT_NONCHAR;
2077
2078 if ( ! (flags & UTF8_CHECK_ONLY)
37657a5b 2079 && (msgs || ckWARN_d(WARN_NONCHAR)))
f9380377 2080 {
2b5e7bc2
KW
2081 /* The code above should have guaranteed that we don't
2082 * get here with errors other than overlong */
2083 assert (! (orig_problems
2084 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
2085
2086 pack_warn = packWARN(WARN_NONCHAR);
c94c2f39 2087 message = Perl_form(aTHX_ nonchar_cp_format, uv);
37657a5b 2088 this_flag_bit = UTF8_GOT_NONCHAR;
f9380377 2089 }
2b5e7bc2 2090 }
5b311467 2091
2b5e7bc2
KW
2092 if (flags & UTF8_DISALLOW_NONCHAR) {
2093 disallowed = TRUE;
f9380377 2094 *errors |= UTF8_GOT_NONCHAR;
2b5e7bc2 2095 }
6c64cd9d
KW
2096 }
2097 else if (possible_problems & UTF8_GOT_LONG) {
2098 possible_problems &= ~UTF8_GOT_LONG;
2099 *errors |= UTF8_GOT_LONG;
2100
2101 if (flags & UTF8_ALLOW_LONG) {
2102
2103 /* We don't allow the actual overlong value, unless the
2104 * special extra bit is also set */
2105 if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
2106 & ~UTF8_ALLOW_LONG)))
2107 {
2108 uv = UNICODE_REPLACEMENT;
2109 }
2110 }
2111 else {
2112 disallowed = TRUE;
2113
37657a5b
KW
2114 if (( msgs
2115 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
2116 {
6c64cd9d
KW
2117 pack_warn = packWARN(WARN_UTF8);
2118
2119 /* These error types cause 'uv' to be something that
2120 * isn't what was intended, so can't use it in the
2121 * message. The other error types either can't
2122 * generate an overlong, or else the 'uv' is valid */
2123 if (orig_problems &
2124 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
2125 {
2126 message = Perl_form(aTHX_
2127 "%s: %s (any UTF-8 sequence that starts"
2128 " with \"%s\" is overlong which can and"
2129 " should be represented with a"
2130 " different, shorter sequence)",
2131 malformed_text,
2132 _byte_dump_string(s0, send - s0, 0),
2133 _byte_dump_string(s0, curlen, 0));
2134 }
2135 else {
2136 U8 tmpbuf[UTF8_MAXBYTES+1];
1be62ab9
KW
2137 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
2138 uv, 0);
d819dc50
KW
2139 /* Don't use U+ for non-Unicode code points, which
2140 * includes those in the Latin1 range */
2141 const char * preface = ( uv > PERL_UNICODE_MAX
2142#ifdef EBCDIC
2143 || uv <= 0xFF
2144#endif
2145 )
2146 ? "0x"
2147 : "U+";
6c64cd9d
KW
2148 message = Perl_form(aTHX_
2149 "%s: %s (overlong; instead use %s to represent"
2150 " %s%0*" UVXf ")",
2151 malformed_text,
2152 _byte_dump_string(s0, send - s0, 0),
2153 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
2154 preface,
2155 ((uv < 256) ? 2 : 4), /* Field width of 2 for
2156 small code points */
1be62ab9 2157 UNI_TO_NATIVE(uv));
6c64cd9d 2158 }
37657a5b 2159 this_flag_bit = UTF8_GOT_LONG;
6c64cd9d
KW
2160 }
2161 }
2b5e7bc2
KW
2162 } /* End of looking through the possible flags */
2163
2164 /* Display the message (if any) for the problem being handled in
2165 * this iteration of the loop */
2166 if (message) {
37657a5b 2167 if (msgs) {
37657a5b
KW
2168 assert(this_flag_bit);
2169
2170 if (*msgs == NULL) {
2171 *msgs = newAV();
2172 }
2173
bb07812e
KW
2174 av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
2175 pack_warn,
2176 this_flag_bit)));
37657a5b
KW
2177 }
2178 else if (PL_op)
2b5e7bc2
KW
2179 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
2180 OP_DESC(PL_op));
2181 else
2182 Perl_warner(aTHX_ pack_warn, "%s", message);
2183 }
ddb65933 2184 } /* End of 'while (possible_problems)' */
a0dbb045 2185
2b5e7bc2
KW
2186 /* Since there was a possible problem, the returned length may need to
2187 * be changed from the one stored at the beginning of this function.
2188 * Instead of trying to figure out if that's needed, just do it. */
2189 if (retlen) {
2190 *retlen = curlen;
2191 }
a0dbb045 2192
2b5e7bc2
KW
2193 if (disallowed) {
2194 if (flags & UTF8_CHECK_ONLY && retlen) {
2195 *retlen = ((STRLEN) -1);
2196 }
2197 return 0;
2198 }
eb83ed87 2199 }
ba210ebe 2200
2b5e7bc2 2201 return UNI_TO_NATIVE(uv);
a0ed51b3
LW
2202}
2203
8e84507e 2204/*
ec5f19d0
KW
2205=for apidoc utf8_to_uvchr_buf
2206
2207Returns the native code point of the first character in the string C<s> which
2208is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 2209C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 2210
524080c4
KW
2211If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2212enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 2213C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 2214(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 2215C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 2216the next possible position in C<s> that could begin a non-malformed character.
de69f3af 2217See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 2218returned.
ec5f19d0
KW
2219
2220=cut
52be2536
KW
2221
2222Also implemented as a macro in utf8.h
2223
ec5f19d0
KW
2224*/
2225
2226
2227UV
2228Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2229{
7f974d7e
KW
2230 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
2231
ec5f19d0
KW
2232 assert(s < send);
2233
2234 return utf8n_to_uvchr(s, send - s, retlen,
ddb65933 2235 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
ec5f19d0
KW
2236}
2237
52be2536
KW
2238/* This is marked as deprecated
2239 *
ec5f19d0
KW
2240=for apidoc utf8_to_uvuni_buf
2241
de69f3af
KW
2242Only in very rare circumstances should code need to be dealing in Unicode
2243(as opposed to native) code points. In those few cases, use
2244C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
4f83cdcd
KW
2245
2246Returns the Unicode (not-native) code point of the first character in the
2247string C<s> which
ec5f19d0
KW
2248is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
2249C<retlen> will be set to the length, in bytes, of that character.
2250
524080c4
KW
2251If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2252enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
2253NULL) to -1. If those warnings are off, the computed value if well-defined (or
2254the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
2255is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
2256next possible position in C<s> that could begin a non-malformed character.
de69f3af 2257See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0
KW
2258
2259=cut
2260*/
2261
2262UV
2263Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2264{
2265 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
2266
2267 assert(send > s);
2268
5962d97e 2269 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
ec5f19d0
KW
2270}
2271
b76347f2 2272/*
87cea99e 2273=for apidoc utf8_length
b76347f2 2274
b2e7ed74
KW
2275Returns the number of characters in the sequence of UTF-8-encoded bytes starting
2276at C<s> and ending at the byte just before C<e>. If <s> and <e> point to the
2277same place, it returns 0 with no warning raised.
2278
2279If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
2280and returns the number of valid characters.
b76347f2
JH
2281
2282=cut
2283*/
2284
2285STRLEN
35a4481c 2286Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
2287{
2288 STRLEN len = 0;
2289
7918f24d
NC
2290 PERL_ARGS_ASSERT_UTF8_LENGTH;
2291
8850bf83
JH
2292 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
2293 * the bitops (especially ~) can create illegal UTF-8.
2294 * In other words: in Perl UTF-8 is not just for Unicode. */
2295
a3b680e6
AL
2296 if (e < s)
2297 goto warn_and_return;
b76347f2 2298 while (s < e) {
4cbf4130 2299 s += UTF8SKIP(s);
8e91ec7f
AV
2300 len++;
2301 }
2302
2303 if (e != s) {
2304 len--;
2305 warn_and_return:
9b387841
NC
2306 if (PL_op)
2307 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2308 "%s in %s", unees, OP_DESC(PL_op));
2309 else
61a12c31 2310 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
2311 }
2312
2313 return len;
2314}
2315
b06226ff 2316/*
fed3ba5d
NC
2317=for apidoc bytes_cmp_utf8
2318
a1433954 2319Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
2320sequence of characters (stored as UTF-8)
2321in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
2322equal, -1 or -2 if the first string is less than the second string, +1 or +2
2323if the first string is greater than the second string.
2324
2325-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
2326longer string. -2 or +2 is returned if
2327there was a difference between characters
fed3ba5d
NC
2328within the strings.
2329
2330=cut
2331*/
2332
2333int
2334Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
2335{
2336 const U8 *const bend = b + blen;
2337 const U8 *const uend = u + ulen;
2338
2339 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
2340
2341 while (b < bend && u < uend) {
2342 U8 c = *u++;
2343 if (!UTF8_IS_INVARIANT(c)) {
2344 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2345 if (u < uend) {
2346 U8 c1 = *u++;
2347 if (UTF8_IS_CONTINUATION(c1)) {
a62b247b 2348 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
fed3ba5d 2349 } else {
2b5e7bc2 2350 /* diag_listed_as: Malformed UTF-8 character%s */
fed3ba5d 2351 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
56576a04
KW
2352 "%s %s%s",
2353 unexpected_non_continuation_text(u - 2, 2, 1, 2),
2354 PL_op ? " in " : "",
2355 PL_op ? OP_DESC(PL_op) : "");
fed3ba5d
NC
2356 return -2;
2357 }
2358 } else {
2359 if (PL_op)
2360 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2361 "%s in %s", unees, OP_DESC(PL_op));
2362 else
61a12c31 2363 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
2364 return -2; /* Really want to return undef :-) */
2365 }
2366 } else {
2367 return -2;
2368 }
2369 }
2370 if (*b != c) {
2371 return *b < c ? -2 : +2;
2372 }
2373 ++b;
2374 }
2375
2376 if (b == bend && u == uend)
2377 return 0;
2378
2379 return b < bend ? +1 : -1;
2380}
2381
2382/*
87cea99e 2383=for apidoc utf8_to_bytes
6940069f 2384
3bc0c78c 2385Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
a1433954 2386Unlike L</bytes_to_utf8>, this over-writes the original string, and
09af0336 2387updates C<*lenp> to contain the new length.
3bc0c78c
KW
2388Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
2389
2390Upon successful return, the number of variants in the string can be computed by
23b37b12
KW
2391having saved the value of C<*lenp> before the call, and subtracting the
2392after-call value of C<*lenp> from it.
6940069f 2393
a1433954 2394If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 2395
6940069f
GS
2396=cut
2397*/
2398
2399U8 *
09af0336 2400Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
6940069f 2401{
9fe0d3c2 2402 U8 * first_variant;
246fae53 2403
7918f24d 2404 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 2405 PERL_UNUSED_CONTEXT;
7918f24d 2406
9fe0d3c2 2407 /* This is a no-op if no variants at all in the input */
09af0336 2408 if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
9fe0d3c2
KW
2409 return s;
2410 }
2411
2412 {
3c5aa262 2413 U8 * const save = s;
09af0336 2414 U8 * const send = s + *lenp;
3c5aa262
KW
2415 U8 * d;
2416
2417 /* Nothing before the first variant needs to be changed, so start the real
2418 * work there */
2419 s = first_variant;
2420 while (s < send) {
2421 if (! UTF8_IS_INVARIANT(*s)) {
2422 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
09af0336 2423 *lenp = ((STRLEN) -1);
3c5aa262
KW
2424 return 0;
2425 }
2426 s++;
d59937ca
KW
2427 }
2428 s++;
dcad2880 2429 }
dcad2880 2430
3c5aa262
KW
2431 /* Is downgradable, so do it */
2432 d = s = first_variant;
2433 while (s < send) {
2434 U8 c = *s++;
2435 if (! UVCHR_IS_INVARIANT(c)) {
2436 /* Then it is two-byte encoded */
2437 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2438 s++;
2439 }
2440 *d++ = c;
2441 }
2442 *d = '\0';
09af0336 2443 *lenp = d - save;
3c5aa262
KW
2444
2445 return save;
9fe0d3c2 2446 }
6940069f
GS
2447}
2448
2449/*
87cea99e 2450=for apidoc bytes_from_utf8
f9a63242 2451
09af0336 2452Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
41ae6089 2453byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
4f3d592d
KW
2454actually encoded in UTF-8.
2455
2456Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2457the input string.
2458
41ae6089
KW
2459Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2460not expressible in native byte encoding. In these cases, C<*is_utf8p> and
09af0336 2461C<*lenp> are unchanged, and the return value is the original C<s>.
4f3d592d 2462
41ae6089 2463Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
4f3d592d 2464newly created string containing a downgraded copy of C<s>, and whose length is
23b37b12 2465returned in C<*lenp>, updated. The new string is C<NUL>-terminated.
f9a63242 2466
3bc0c78c 2467Upon successful return, the number of variants in the string can be computed by
23b37b12
KW
2468having saved the value of C<*lenp> before the call, and subtracting the
2469after-call value of C<*lenp> from it.
3bc0c78c 2470
37607a96 2471=cut
976c1b08
KW
2472
2473There is a macro that avoids this function call, but this is retained for
2474anyone who calls it with the Perl_ prefix */
f9a63242
JH
2475
2476U8 *
41ae6089 2477Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
f9a63242 2478{
7918f24d 2479 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 2480 PERL_UNUSED_CONTEXT;
f9a63242 2481
976c1b08
KW
2482 return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2483}
2484
2485/*
2486No = here because currently externally undocumented
2487for apidoc bytes_from_utf8_loc
2488
2489Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where
2490to store the location of the first character in C<"s"> that cannot be
2491converted to non-UTF8.
2492
2493If that parameter is C<NULL>, this function behaves identically to
2494C<bytes_from_utf8>.
2495
2496Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2497C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2498
2499Otherwise, the function returns a newly created C<NUL>-terminated string
2500containing the non-UTF8 equivalent of the convertible first portion of
2501C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>.
2502If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2503and C<*first_non_downgradable> is set to C<NULL>.
2504
2505Otherwise, C<*first_non_downgradable> set to point to the first byte of the
2506first character in the original string that wasn't converted. C<*is_utf8p> is
2507unchanged. Note that the new string may have length 0.
2508
2509Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2510C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2511converts as many characters in it as possible stopping at the first one it
385b74be 2512finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is
976c1b08
KW
2513set to point to that. The function returns the portion that could be converted
2514in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2515not including the terminating C<NUL>. If the very first character in the
2516original could not be converted, C<*lenp> will be 0, and the new string will
2517contain just a single C<NUL>. If the entire input string was converted,
2518C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2519
2520Upon successful return, the number of variants in the converted portion of the
2521string can be computed by having saved the value of C<*lenp> before the call,
2522and subtracting the after-call value of C<*lenp> from it.
2523
2524=cut
2525
2526
2527*/
2528
2529U8 *
2530Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2531{
2532 U8 *d;
2533 const U8 *original = s;
2534 U8 *converted_start;
2535 const U8 *send = s + *lenp;
f9a63242 2536
976c1b08 2537 PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
170a1c22 2538
976c1b08
KW
2539 if (! *is_utf8p) {
2540 if (first_unconverted) {
2541 *first_unconverted = NULL;
2542 }
2543
2544 return (U8 *) original;
2545 }
2546
2547 Newx(d, (*lenp) + 1, U8);
2548
2549 converted_start = d;
7299a045
KW
2550 while (s < send) {
2551 U8 c = *s++;
2552 if (! UTF8_IS_INVARIANT(c)) {
976c1b08
KW
2553
2554 /* Then it is multi-byte encoded. If the code point is above 0xFF,
2555 * have to stop now */
2556 if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2557 if (first_unconverted) {
2558 *first_unconverted = s - 1;
2559 goto finish_and_return;
2560 }
2561 else {
2562 Safefree(converted_start);
2563 return (U8 *) original;
2564 }
2565 }
2566
7299a045
KW
2567 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2568 s++;
38af28cf 2569 }
7299a045
KW
2570 *d++ = c;
2571 }
170a1c22 2572
976c1b08
KW
2573 /* Here, converted the whole of the input */
2574 *is_utf8p = FALSE;
2575 if (first_unconverted) {
2576 *first_unconverted = NULL;
170a1c22 2577 }
976c1b08
KW
2578
2579 finish_and_return:
46a08a6f
KW
2580 *d = '\0';
2581 *lenp = d - converted_start;
976c1b08
KW
2582
2583 /* Trim unused space */
2584 Renew(converted_start, *lenp + 1, U8);
2585
2586 return converted_start;
f9a63242
JH
2587}
2588
2589/*
87cea99e 2590=for apidoc bytes_to_utf8
6940069f 2591
09af0336 2592Converts a string C<s> of length C<*lenp> bytes from the native encoding into
ff97e5cf 2593UTF-8.
09af0336 2594Returns a pointer to the newly-created string, and sets C<*lenp> to
ff97e5cf 2595reflect the new length in bytes.
6940069f 2596
3bc0c78c 2597Upon successful return, the number of variants in the string can be computed by
23b37b12 2598having saved the value of C<*lenp> before the call, and subtracting it from the
3bc0c78c
KW
2599after-call value of C<*lenp>.
2600
75200dff 2601A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
2602
2603If you want to convert to UTF-8 from encodings other than
2604the native (Latin1 or EBCDIC),
a1433954 2605see L</sv_recode_to_utf8>().
c9ada85f 2606
497711e7 2607=cut
6940069f
GS
2608*/
2609
2610U8*
09af0336 2611Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
6940069f 2612{
09af0336 2613 const U8 * const send = s + (*lenp);
6940069f
GS
2614 U8 *d;
2615 U8 *dst;
7918f24d
NC
2616
2617 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 2618 PERL_UNUSED_CONTEXT;
6940069f 2619
09af0336 2620 Newx(d, (*lenp) * 2 + 1, U8);
6940069f
GS
2621 dst = d;
2622
2623 while (s < send) {
55d09dc8
KW
2624 append_utf8_from_native_byte(*s, &d);
2625 s++;
6940069f 2626 }
2e11cf67 2627
6940069f 2628 *d = '\0';
09af0336 2629 *lenp = d-dst;
2e11cf67
KW
2630
2631 /* Trim unused space */
2632 Renew(dst, *lenp + 1, U8);
2633
6940069f
GS
2634 return dst;
2635}
2636
a0ed51b3 2637/*
624504c5
KW
2638 * Convert native (big-endian) UTF-16 to UTF-8. For reversed (little-endian),
2639 * use utf16_to_utf8_reversed().
a0ed51b3 2640 *
624504c5
KW
2641 * UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes.
2642 * UTF-8 requires 1-3 bytes for every code point below 0x1000; otherwise 4 bytes.
2643 * UTF-EBCDIC requires 1-4 bytes for every code point below 0x1000; otherwise 4-5 bytes.
2644 *
2645 * These functions don't check for overflow. The worst case is every code
2646 * point in the input is 2 bytes, and requires 4 bytes on output. (If the code
2647 * is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.) Therefore the
2648 * destination must be pre-extended to 2 times the source length.
2649 *
2650 * Do not use in-place. We optimize for native, for obvious reasons. */
a0ed51b3
LW
2651
2652U8*
dea0fc0b 2653Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 2654{
dea0fc0b
JH
2655 U8* pend;
2656 U8* dstart = d;
2657
7918f24d
NC
2658 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2659
dea0fc0b 2660 if (bytelen & 1)
56576a04
KW
2661 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf,
2662 (UV)bytelen);
dea0fc0b
JH
2663
2664 pend = p + bytelen;
2665
a0ed51b3 2666 while (p < pend) {
dea0fc0b
JH
2667 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
2668 p += 2;
2d1545e5 2669 if (OFFUNI_IS_INVARIANT(uv)) {
56d37426 2670 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
2671 continue;
2672 }
56d37426
KW
2673 if (uv <= MAX_UTF8_TWO_BYTE) {
2674 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
2675 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
2676 continue;
2677 }
ffd0a9d3 2678
46956fad
KW
2679#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2680#define LAST_HIGH_SURROGATE 0xDBFF
2681#define FIRST_LOW_SURROGATE 0xDC00
2682#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
ffd0a9d3 2683#define FIRST_IN_PLANE1 0x10000
e23c50db
KW
2684
2685 /* This assumes that most uses will be in the first Unicode plane, not
2686 * needing surrogates */
2687 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
2688 && uv <= UNICODE_SURROGATE_LAST))
2689 {
2690 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2691 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2692 }
2693 else {
01ea242b 2694 UV low = (p[0] << 8) + p[1];
e23c50db
KW
2695 if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
2696 || UNLIKELY(low > LAST_LOW_SURROGATE))
2697 {
01ea242b 2698 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db
KW
2699 }
2700 p += 2;
46956fad 2701 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
ffd0a9d3 2702 + (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1;
01ea242b 2703 }
a0ed51b3 2704 }
56d37426
KW
2705#ifdef EBCDIC
2706 d = uvoffuni_to_utf8_flags(d, uv, 0);
2707#else
ffd0a9d3 2708 if (uv < FIRST_IN_PLANE1) {
eb160463
GS
2709 *d++ = (U8)(( uv >> 12) | 0xe0);
2710 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2711 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2712 continue;
2713 }
2714 else {
eb160463
GS
2715 *d++ = (U8)(( uv >> 18) | 0xf0);
2716 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
2717 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2718 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2719 continue;
2720 }
56d37426 2721#endif
a0ed51b3 2722 }
dea0fc0b 2723 *newlen = d - dstart;
a0ed51b3
LW
2724 return d;
2725}
2726
2727/* Note: this one is slightly destructive of the source. */
2728
2729U8*
dea0fc0b 2730Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
2731{
2732 U8* s = (U8*)p;
d4c19fe8 2733 U8* const send = s + bytelen;
7918f24d
NC
2734
2735 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2736
e0ea5e2d 2737 if (bytelen & 1)
147e3846 2738 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
e0ea5e2d
NC
2739 (UV)bytelen);
2740
a0ed51b3 2741 while (s < send) {
d4c19fe8 2742 const U8 tmp = s[0];
a0ed51b3
LW
2743 s[0] = s[1];
2744 s[1] = tmp;
2745 s += 2;
2746 }
dea0fc0b 2747 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
2748}
2749
922e8cb4
KW
2750bool
2751Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2752{
2753 U8 tmpbuf[UTF8_MAXBYTES+1];
2754 uvchr_to_utf8(tmpbuf, c);
da8c1a98 2755 return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf));
922e8cb4
KW
2756}
2757
f9ae8fb6
JD
2758/* Internal function so we can deprecate the external one, and call
2759 this one from other deprecated functions in this file */
2760
f2645549
KW
2761bool
2762Perl__is_utf8_idstart(pTHX_ const U8 *p)
61b19385 2763{
f2645549 2764 PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
61b19385
KW
2765
2766 if (*p == '_')
2767 return TRUE;
f25ce844 2768 return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
61b19385
KW
2769}
2770
5092f92a 2771bool
eba68aa0
KW
2772Perl__is_uni_perl_idcont(pTHX_ UV c)
2773{
2774 U8 tmpbuf[UTF8_MAXBYTES+1];
2775 uvchr_to_utf8(tmpbuf, c);
da8c1a98 2776 return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
eba68aa0
KW
2777}
2778
2779bool
f91dcd13
KW
2780Perl__is_uni_perl_idstart(pTHX_ UV c)
2781{
2782 U8 tmpbuf[UTF8_MAXBYTES+1];
2783 uvchr_to_utf8(tmpbuf, c);
da8c1a98 2784 return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
f91dcd13
KW
2785}
2786
3a4c58c9 2787UV
56576a04
KW
2788Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2789 const char S_or_s)
3a4c58c9
KW
2790{
2791 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2792 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
2793 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2794 * either "SS" or "Ss". Which one to use is passed into the routine in
2795 * 'S_or_s' to avoid a test */
2796
2797 UV converted = toUPPER_LATIN1_MOD(c);
2798
2799 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2800
2801 assert(S_or_s == 'S' || S_or_s == 's');
2802
6f2d5cbc 2803 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 2804 characters in this range */
3a4c58c9
KW
2805 *p = (U8) converted;
2806 *lenp = 1;
2807 return converted;
2808 }
2809
2810 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2811 * which it maps to one of them, so as to only have to have one check for
2812 * it in the main case */
2813 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2814 switch (c) {
2815 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2816 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2817 break;
2818 case MICRO_SIGN:
2819 converted = GREEK_CAPITAL_LETTER_MU;
2820 break;
79e064b9
KW
2821#if UNICODE_MAJOR_VERSION > 2 \
2822 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2823 && UNICODE_DOT_DOT_VERSION >= 8)
3a4c58c9
KW
2824 case LATIN_SMALL_LETTER_SHARP_S:
2825 *(p)++ = 'S';
2826 *p = S_or_s;
2827 *lenp = 2;
2828 return 'S';
79e064b9 2829#endif
3a4c58c9 2830 default:
56576a04
KW
2831 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
2832 " '%c' to map to '%c'",
2833 c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
e5964223 2834 NOT_REACHED; /* NOTREACHED */
3a4c58c9
KW
2835 }
2836 }
2837
2838 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2839 *p = UTF8_TWO_BYTE_LO(converted);
2840 *lenp = 2;
2841
2842 return converted;
2843}
2844
50bda2c3
KW
2845/* Call the function to convert a UTF-8 encoded character to the specified case.
2846 * Note that there may be more than one character in the result.
2847 * INP is a pointer to the first byte of the input character
2848 * OUTP will be set to the first byte of the string of changed characters. It
2849 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2850 * LENP will be set to the length in bytes of the string of changed characters
2851 *
56576a04
KW
2852 * The functions return the ordinal of the first character in the string of
2853 * OUTP */
2854#define CALL_UPPER_CASE(uv, s, d, lenp) \
2855 _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
2856#define CALL_TITLE_CASE(uv, s, d, lenp) \
2857 _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
2858#define CALL_LOWER_CASE(uv, s, d, lenp) \
2859 _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
50bda2c3 2860
b9992569
KW
2861/* This additionally has the input parameter 'specials', which if non-zero will
2862 * cause this to use the specials hash for folding (meaning get full case
50bda2c3 2863 * folding); otherwise, when zero, this implies a simple case fold */
56576a04
KW
2864#define CALL_FOLD_CASE(uv, s, d, lenp, specials) \
2865_to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
c3fd2246 2866
84afefe6
JH
2867UV
2868Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2869{
a1433954
KW
2870 /* Convert the Unicode character whose ordinal is <c> to its uppercase
2871 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2872 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
2873 * the changed version may be longer than the original character.
2874 *
2875 * The ordinal of the first character of the changed version is returned
2876 * (but note, as explained above, that there may be more.) */
2877
7918f24d
NC
2878 PERL_ARGS_ASSERT_TO_UNI_UPPER;
2879
3a4c58c9
KW
2880 if (c < 256) {
2881 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2882 }
2883
0ebc6274 2884 uvchr_to_utf8(p, c);
b9992569 2885 return CALL_UPPER_CASE(c, p, p, lenp);
a0ed51b3
LW
2886}
2887
84afefe6
JH
2888UV
2889Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2890{
7918f24d
NC
2891 PERL_ARGS_ASSERT_TO_UNI_TITLE;
2892
3a4c58c9
KW
2893 if (c < 256) {
2894 return _to_upper_title_latin1((U8) c, p, lenp, 's');
2895 }
2896
0ebc6274 2897 uvchr_to_utf8(p, c);
b9992569 2898 return CALL_TITLE_CASE(c, p, p, lenp);
a0ed51b3
LW
2899}
2900
afc16117 2901STATIC U8
eaf412bf 2902S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
afc16117
KW
2903{
2904 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2905 * those, converting the result to UTF-8. Since the result is always just
a1433954 2906 * one character, we allow <p> to be NULL */
afc16117
KW
2907
2908 U8 converted = toLOWER_LATIN1(c);
2909
eaf412bf
KW
2910 PERL_UNUSED_ARG(dummy);
2911
afc16117 2912 if (p != NULL) {
6f2d5cbc 2913 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
2914 *p = converted;
2915 *lenp = 1;
2916 }
2917 else {
430c9760
KW
2918 /* Result is known to always be < 256, so can use the EIGHT_BIT
2919 * macros */
2920 *p = UTF8_EIGHT_BIT_HI(converted);
2921 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
afc16117
KW
2922 *lenp = 2;
2923 }
2924 }
2925 return converted;
2926}
2927
84afefe6
JH
2928UV
2929Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2930{
7918f24d
NC
2931 PERL_ARGS_ASSERT_TO_UNI_LOWER;
2932
afc16117 2933 if (c < 256) {
eaf412bf 2934 return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
bca00c02
KW
2935 }
2936
afc16117 2937 uvchr_to_utf8(p, c);
b9992569 2938 return CALL_LOWER_CASE(c, p, p, lenp);
a0ed51b3
LW
2939}
2940
84afefe6 2941UV
56576a04
KW
2942Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2943 const unsigned int flags)
a1dde8de 2944{
51910141 2945 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 2946 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 2947 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
2948 *
2949 * Not to be used for locale folds
51910141 2950 */
f673fad4 2951
a1dde8de
KW
2952 UV converted;
2953
2954 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
81611534 2955 PERL_UNUSED_CONTEXT;
a1dde8de 2956
1ca267a5
KW
2957 assert (! (flags & FOLD_FLAGS_LOCALE));
2958
659a7c2d 2959 if (UNLIKELY(c == MICRO_SIGN)) {
a1dde8de
KW
2960 converted = GREEK_SMALL_LETTER_MU;
2961 }
9b63e895
KW
2962#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
2963 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
2964 || UNICODE_DOT_DOT_VERSION > 0)
659a7c2d
KW
2965 else if ( (flags & FOLD_FLAGS_FULL)
2966 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
2967 {
1ca267a5
KW
2968 /* If can't cross 127/128 boundary, can't return "ss"; instead return
2969 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
2970 * under those circumstances. */
2971 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
2972 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2973 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2974 p, *lenp, U8);
2975 return LATIN_SMALL_LETTER_LONG_S;
2976 }
2977 else {
4f489194
KW
2978 *(p)++ = 's';
2979 *p = 's';
2980 *lenp = 2;
2981 return 's';
1ca267a5 2982 }
a1dde8de 2983 }
9b63e895 2984#endif
a1dde8de
KW
2985 else { /* In this range the fold of all other characters is their lower
2986 case */
2987 converted = toLOWER_LATIN1(c);
2988 }
2989
6f2d5cbc 2990 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
2991 *p = (U8) converted;
2992 *lenp = 1;
2993 }
2994 else {
2995 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2996 *p = UTF8_TWO_BYTE_LO(converted);
2997 *lenp = 2;
2998 }
2999
3000 return converted;
3001}
3002
3003UV
31f05a37 3004Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 3005{
4b593389 3006
a0270393
KW
3007 /* Not currently externally documented, and subject to change
3008 * <flags> bits meanings:
3009 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
3010 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3011 * locale are to be used.
a0270393
KW
3012 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3013 */
4b593389 3014
36bb2ab6 3015 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 3016
780fcc9f
KW
3017 if (flags & FOLD_FLAGS_LOCALE) {
3018 /* Treat a UTF-8 locale as not being in locale at all */
3019 if (IN_UTF8_CTYPE_LOCALE) {
3020 flags &= ~FOLD_FLAGS_LOCALE;
3021 }
3022 else {
3023 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
e7b7ac46 3024 goto needs_full_generality;
780fcc9f 3025 }
31f05a37
KW
3026 }
3027
a1dde8de 3028 if (c < 256) {
e7b7ac46 3029 return _to_fold_latin1((U8) c, p, lenp,
31f05a37 3030 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
3031 }
3032
2f306ab9 3033 /* Here, above 255. If no special needs, just use the macro */
a0270393
KW
3034 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
3035 uvchr_to_utf8(p, c);
b9992569 3036 return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
a0270393 3037 }
567b353c 3038 else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
a0270393
KW
3039 the special flags. */
3040 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
3041
3042 needs_full_generality:
a0270393 3043 uvchr_to_utf8(utf8_c, c);
56576a04
KW
3044 return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c),
3045 p, lenp, flags);
a0270393 3046 }
84afefe6
JH
3047}
3048
26483009 3049PERL_STATIC_INLINE bool
5141f98e 3050S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
f25ce844 3051 const char *const swashname, SV* const invlist)
bde6a22d 3052{
ea317ccb
KW
3053 /* returns a boolean giving whether or not the UTF8-encoded character that
3054 * starts at <p> is in the swash indicated by <swashname>. <swash>
3055 * contains a pointer to where the swash indicated by <swashname>
3056 * is to be stored; which this routine will do, so that future calls will
f25ce844
KW
3057 * look at <*swash> and only generate a swash if it is not null. <invlist>
3058 * is NULL or an inversion list that defines the swash. If not null, it
3059 * saves time during initialization of the swash.
ea317ccb
KW
3060 *
3061 * Note that it is assumed that the buffer length of <p> is enough to
3062 * contain all the bytes that comprise the character. Thus, <*p> should
3063 * have been checked before this call for mal-formedness enough to assure
3064 * that. */
3065
7918f24d
NC
3066 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
3067
492a624f 3068 /* The API should have included a length for the UTF-8 character in <p>,
28123549 3069 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
3070 * as far as there being enough bytes available in it to accommodate the
3071 * character without reading beyond the end, and pass that number on to the
3072 * validating routine */
6302f837 3073 if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
86ae6e94 3074 _force_out_malformed_utf8_message(p, p + UTF8SKIP(p),
99a765e9 3075 _UTF8_NO_CONFIDENCE_IN_CURLEN,
86ae6e94
KW
3076 1 /* Die */ );
3077 NOT_REACHED; /* NOTREACHED */
28123549 3078 }
86ae6e94 3079
87367d5f
KW
3080 if (!*swash) {
3081 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
f25ce844
KW
3082 *swash = _core_swash_init("utf8",
3083
3084 /* Only use the name if there is no inversion
3085 * list; otherwise will go out to disk */
3086 (invlist) ? "" : swashname,
3087
3088 &PL_sv_undef, 1, 0, invlist, &flags);
87367d5f 3089 }
28123549 3090
bde6a22d
NC
3091 return swash_fetch(*swash, p, TRUE) != 0;
3092}
3093
da8c1a98 3094PERL_STATIC_INLINE bool
56576a04
KW
3095S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e,
3096 SV **swash, const char *const swashname,
3097 SV* const invlist)
da8c1a98
KW
3098{
3099 /* returns a boolean giving whether or not the UTF8-encoded character that
3100 * starts at <p>, and extending no further than <e - 1> is in the swash
3101 * indicated by <swashname>. <swash> contains a pointer to where the swash
3102 * indicated by <swashname> is to be stored; which this routine will do, so
3103 * that future calls will look at <*swash> and only generate a swash if it
3104 * is not null. <invlist> is NULL or an inversion list that defines the
3105 * swash. If not null, it saves time during initialization of the swash.
3106 */
3107
3108 PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
3109
3110 if (! isUTF8_CHAR(p, e)) {
3111 _force_out_malformed_utf8_message(p, e, 0, 1);
3112 NOT_REACHED; /* NOTREACHED */
3113 }
3114
3115 if (!*swash) {
3116 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
3117 *swash = _core_swash_init("utf8",
3118
3119 /* Only use the name if there is no inversion
3120 * list; otherwise will go out to disk */
3121 (invlist) ? "" : swashname,
3122
3123 &PL_sv_undef, 1, 0, invlist, &flags);
3124 }
3125
3126 return swash_fetch(*swash, p, TRUE) != 0;
3127}
3128
34aeb2e9
KW
3129STATIC void
3130S_warn_on_first_deprecated_use(pTHX_ const char * const name,
3131 const char * const alternative,
3132 const bool use_locale,
3133 const char * const file,
3134 const unsigned line)
3135{
3136 const char * key;
3137
3138 PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
3139
3140 if (ckWARN_d(WARN_DEPRECATED)) {
3141
3142 key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
3143 if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
3144 if (! PL_seen_deprecated_macro) {
3145 PL_seen_deprecated_macro = newHV();
3146 }
3147 if (! hv_store(PL_seen_deprecated_macro, key,
3148 strlen(key), &PL_sv_undef, 0))
3149 {
3150 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3151 }
3152
c44e9413 3153 if (instr(file, "mathoms.c")) {
607313a1
KW
3154 Perl_warner(aTHX_ WARN_DEPRECATED,
3155 "In %s, line %d, starting in Perl v5.30, %s()"
3156 " will be removed. Avoid this message by"
3157 " converting to use %s().\n",
3158 file, line, name, alternative);
3159 }
3160 else {
34aeb2e9
KW
3161 Perl_warner(aTHX_ WARN_DEPRECATED,
3162 "In %s, line %d, starting in Perl v5.30, %s() will"
3163 " require an additional parameter. Avoid this"
3164 " message by converting to use %s().\n",
3165 file, line, name, alternative);
607313a1 3166 }
34aeb2e9
KW
3167 }
3168 }
3169}
3170
bde6a22d 3171bool
34aeb2e9 3172Perl__is_utf8_FOO(pTHX_ U8 classnum,
be99e2c2 3173 const U8 * const p,
34aeb2e9
KW
3174 const char * const name,
3175 const char * const alternative,
3176 const bool use_utf8,
3177 const bool use_locale,
3178 const char * const file,
3179 const unsigned line)
922e8cb4 3180{
922e8cb4
KW
3181 PERL_ARGS_ASSERT__IS_UTF8_FOO;
3182
34aeb2e9
KW
3183 warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
3184
3185 if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
34aeb2e9
KW
3186
3187 switch (classnum) {
3188 case _CC_WORDCHAR:
3189 case _CC_DIGIT:
3190 case _CC_ALPHA:
3191 case _CC_LOWER:
3192 case _CC_UPPER:
3193 case _CC_PUNCT:
3194 case _CC_PRINT:
3195 case _CC_ALPHANUMERIC:
3196 case _CC_GRAPH:
3197 case _CC_CASED:
3198
3199 return is_utf8_common(p,
3200 &PL_utf8_swash_ptrs[classnum],
3201 swash_property_names[classnum],
3202 PL_XPosix_ptrs[classnum]);
3203
3204 case _CC_SPACE:
3205 return is_XPERLSPACE_high(p);
3206 case _CC_BLANK:
3207 return is_HORIZWS_high(p);
3208 case _CC_XDIGIT:
3209 return is_XDIGIT_high(p);
3210 case _CC_CNTRL:
3211 return 0;
3212 case _CC_ASCII:
3213 return 0;
3214 case _CC_VERTSPACE:
3215 return is_VERTWS_high(p);
3216 case _CC_IDFIRST:
3217 if (! PL_utf8_perl_idstart) {
22f0498f
KW
3218 PL_utf8_perl_idstart
3219 = _new_invlist_C_array(_Perl_IDStart_invlist);
34aeb2e9 3220 }
22f0498f
KW
3221 return is_utf8_common(p, &PL_utf8_perl_idstart,
3222 "_Perl_IDStart", NULL);
34aeb2e9
KW
3223 case _CC_IDCONT:
3224 if (! PL_utf8_perl_idcont) {
22f0498f
KW
3225 PL_utf8_perl_idcont
3226 = _new_invlist_C_array(_Perl_IDCont_invlist);
34aeb2e9 3227 }
22f0498f
KW
3228 return is_utf8_common(p, &PL_utf8_perl_idcont,
3229 "_Perl_IDCont", NULL);
34aeb2e9
KW
3230 }
3231 }
3232
3233 /* idcont is the same as wordchar below 256 */
3234 if (classnum == _CC_IDCONT) {
3235 classnum = _CC_WORDCHAR;
3236 }
3237 else if (classnum == _CC_IDFIRST) {
3238 if (*p == '_') {
3239 return TRUE;
3240 }
3241 classnum = _CC_ALPHA;
3242 }
3243
3244 if (! use_locale) {
3245 if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
3246 return _generic_isCC(*p, classnum);
3247 }
922e8cb4 3248
34aeb2e9
KW
3249 return _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )), classnum);
3250 }
3251 else {
3252 if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
3253 return isFOO_lc(classnum, *p);
3254 }
3255
3256 return isFOO_lc(classnum, EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )));
3257 }
3258
3259 NOT_REACHED; /* NOTREACHED */
922e8cb4
KW
3260}
3261
3262bool
da8c1a98
KW
3263Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
3264 const U8 * const e)
3265{
3266 PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
3267
3268 assert(classnum < _FIRST_NON_SWASH_CC);
3269
3270 return is_utf8_common_with_len(p,
3271 e,
3272 &PL_utf8_swash_ptrs[classnum],
3273 swash_property_names[classnum],
3274 PL_XPosix_ptrs[classnum]);
3275}
3276
3277bool
da8c1a98
KW
3278Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
3279{
3280 SV* invlist = NULL;
3281
3282 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
3283
3284 if (! PL_utf8_perl_idstart) {
3285 invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
3286 }
3287 return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart,
3288 "_Perl_IDStart", invlist);
3289}
3290
3291bool
f2645549 3292Perl__is_utf8_xidstart(pTHX_ const U8 *p)
c11ff943 3293{
f2645549 3294 PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
c11ff943
KW
3295
3296 if (*p == '_')
3297 return TRUE;
f25ce844 3298 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
c11ff943
KW
3299}
3300
3301bool
da8c1a98
KW
3302Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
3303{
3304 SV* invlist = NULL;
3305
3306 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
3307
3308 if (! PL_utf8_perl_idcont) {
3309 invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
3310 }
3311 return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont,
3312 "_Perl_IDCont", invlist);
3313}
3314
3315bool
f2645549 3316Perl__is_utf8_idcont(pTHX_ const U8 *p)
82686b01 3317{
f2645549 3318 PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
7918f24d 3319
f25ce844 3320 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
a0ed51b3
LW
3321}
3322
3323bool
f2645549 3324Perl__is_utf8_xidcont(pTHX_ const U8 *p)
c11ff943 3325{
f2645549 3326 PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
c11ff943 3327
f25ce844 3328 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
c11ff943
KW
3329}
3330
3331bool
7dbf68d2
KW
3332Perl__is_utf8_mark(pTHX_ const U8 *p)
3333{
7dbf68d2
KW
3334 PERL_ARGS_ASSERT__IS_UTF8_MARK;
3335
f25ce844 3336 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
7dbf68d2
KW
3337}
3338
b9992569 3339 /* change namve uv1 to 'from' */
6a4a25f4 3340STATIC UV
b9992569
KW
3341S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
3342 SV **swashp, const char *normal, const char *special)
3343{
0134edef 3344 STRLEN len = 0;
7918f24d 3345
b9992569 3346 PERL_ARGS_ASSERT__TO_UTF8_CASE;
7918f24d 3347
36eaa811
KW
3348 /* For code points that don't change case, we already know that the output
3349 * of this function is the unchanged input, so we can skip doing look-ups
3350 * for them. Unfortunately the case-changing code points are scattered
3351 * around. But there are some long consecutive ranges where there are no
3352 * case changing code points. By adding tests, we can eliminate the lookup
3353 * for all the ones in such ranges. This is currently done here only for
3354 * just a few cases where the scripts are in common use in modern commerce
3355 * (and scripts adjacent to those which can be included without additional
3356 * tests). */
3357
3358 if (uv1 >= 0x0590) {
3359 /* This keeps from needing further processing the code points most
3360 * likely to be used in the following non-cased scripts: Hebrew,
3361 * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
3362 * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
3363 * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
3364 if (uv1 < 0x10A0) {
3365 goto cases_to_self;
3366 }
3367
3368 /* The following largish code point ranges also don't have case
3369 * changes, but khw didn't think they warranted extra tests to speed
3370 * them up (which would slightly slow down everything else above them):
3371 * 1100..139F Hangul Jamo, Ethiopic
3372 * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
3373 * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
3374 * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
3375 * Combining Diacritical Marks Extended, Balinese,
3376 * Sundanese, Batak, Lepcha, Ol Chiki
3377 * 2000..206F General Punctuation
3378 */
3379
3380 if (uv1 >= 0x2D30) {
3381
3382 /* This keeps the from needing further processing the code points
3383 * most likely to be used in the following non-cased major scripts:
3384 * CJK, Katakana, Hiragana, plus some less-likely scripts.
3385 *
3386 * (0x2D30 above might have to be changed to 2F00 in the unlikely
3387 * event that Unicode eventually allocates the unused block as of
3388 * v8.0 2FE0..2FEF to code points that are cased. khw has verified
3389 * that the test suite will start having failures to alert you
3390 * should that happen) */
3391 if (uv1 < 0xA640) {
3392 goto cases_to_self;
3393 }
3394
3395 if (uv1 >= 0xAC00) {
3396 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
5af9bc97
KW
3397 if (ckWARN_d(WARN_SURROGATE)) {
3398 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3399 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
56576a04
KW
3400 "Operation \"%s\" returns its argument for"
3401 " UTF-16 surrogate U+%04" UVXf, desc, uv1);
5af9bc97
KW
3402 }
3403 goto cases_to_self;
3404 }
36eaa811
KW
3405
3406 /* AC00..FAFF Catches Hangul syllables and private use, plus
3407 * some others */
3408 if (uv1 < 0xFB00) {
3409 goto cases_to_self;
3410
3411 }
3412
5af9bc97 3413 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
d22ec717
KW
3414 if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) {
3415 Perl_croak(aTHX_ cp_above_legal_max, uv1,
3416 MAX_EXTERNALLY_LEGAL_CP);
5af9bc97
KW
3417 }
3418 if (ckWARN_d(WARN_NON_UNICODE)) {
3419 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3420 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
56576a04
KW
3421 "Operation \"%s\" returns its argument for"
3422 " non-Unicode code point 0x%04" UVXf, desc, uv1);
5af9bc97
KW
3423 }
3424 goto cases_to_self;
3425 }
3bfc1e70
KW
3426#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
3427 if (UNLIKELY(uv1
3428 > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
3429 {
3430
56576a04
KW
3431 /* As of Unicode 10.0, this means we avoid swash creation
3432 * for anything beyond high Plane 1 (below emojis) */
3bfc1e70
KW
3433 goto cases_to_self;
3434 }
3435#endif
36eaa811
KW
3436 }
3437 }
9ae3ac1a 3438
36eaa811
KW
3439 /* Note that non-characters are perfectly legal, so no warning should
3440 * be given. There are so few of them, that it isn't worth the extra
3441 * tests to avoid swash creation */
9ae3ac1a
KW
3442 }
3443
0134edef 3444 if (!*swashp) /* load on-demand */
56576a04
KW
3445 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef,
3446 4, 0, NULL, NULL);
0134edef 3447
a6f87d8c 3448 if (special) {
0134edef 3449 /* It might be "special" (sometimes, but not always,
2a37f04d 3450 * a multicharacter mapping) */
4a8240a3 3451 HV *hv = NULL;
b08cf34e
JH
3452 SV **svp;
3453
4a8240a3
KW
3454 /* If passed in the specials name, use that; otherwise use any
3455 * given in the swash */
3456 if (*special != '\0') {
3457 hv = get_hv(special, 0);
3458 }
3459 else {
3460 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
3461 if (svp) {
3462 hv = MUTABLE_HV(SvRV(*svp));
3463 }
3464 }
3465
176fe009 3466 if (hv
5f560d8a 3467 && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
176fe009
KW
3468 && (*svp))
3469 {
cfd0369c 3470 const char *s;
47654450 3471
cfd0369c 3472 s = SvPV_const(*svp, len);
47654450 3473 if (len == 1)
f4cd282c 3474 /* EIGHTBIT */
c80e42f3 3475 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2a37f04d 3476 else {
d2dcd0fb 3477 Copy(s, ustrp, len, U8);
29e98929 3478 }
983ffd37 3479 }
0134edef
JH
3480 }
3481
3482 if (!len && *swashp) {
4a4088c4 3483 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
d4c19fe8 3484
0134edef
JH
3485 if (uv2) {
3486 /* It was "normal" (a single character mapping). */
f4cd282c 3487 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2a37f04d
JH
3488 }
3489 }
1feea2c7 3490
cbe07460
KW
3491 if (len) {
3492 if (lenp) {
3493 *lenp = len;
3494 }
3495 return valid_utf8_to_uvchr(ustrp, 0);
3496 }
3497
3498 /* Here, there was no mapping defined, which means that the code point maps
3499 * to itself. Return the inputs */
e24dfe9c 3500 cases_to_self:
bfdf22ec 3501 len = UTF8SKIP(p);
ca9fab46
KW
3502 if (p != ustrp) { /* Don't copy onto itself */
3503 Copy(p, ustrp, len, U8);
3504 }
0134edef 3505
2a37f04d
JH
3506 if (lenp)
3507 *lenp = len;
3508
f4cd282c 3509 return uv1;
cbe07460 3510
a0ed51b3
LW
3511}
3512
051a06d4 3513STATIC UV
56576a04
KW
3514S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
3515 U8* const ustrp, STRLEN *lenp)
051a06d4 3516{
4a4088c4 3517 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
3518 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
3519 * result contains a character that crosses the 255/256 boundary, disallow
3520 * the change, and return the original code point. See L<perlfunc/lc> for
3521 * why;
051a06d4 3522 *
a1433954
KW
3523 * p points to the original string whose case was changed; assumed
3524 * by this routine to be well-formed
051a06d4 3525 * result the code point of the first character in the changed-case string
56576a04
KW
3526 * ustrp points to the changed-case string (<result> represents its
3527 * first char)
051a06d4
KW
3528 * lenp points to the length of <ustrp> */
3529
3530 UV original; /* To store the first code point of <p> */
3531
3532 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3533
a4f12ed7 3534 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
3535
3536 /* We know immediately if the first character in the string crosses the
3537 * boundary, so can skip */
3538 if (result > 255) {
3539
3540 /* Look at every character in the result; if any cross the
3541 * boundary, the whole thing is disallowed */
3542 U8* s = ustrp + UTF8SKIP(ustrp);
3543 U8* e = ustrp + *lenp;
3544 while (s < e) {
a4f12ed7 3545 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
3546 goto bad_crossing;
3547 }
3548 s += UTF8SKIP(s);
3549 }
3550
613abc6d
KW
3551 /* Here, no characters crossed, result is ok as-is, but we warn. */
3552 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
051a06d4
KW
3553 return result;
3554 }
3555
7b52d656 3556 bad_crossing:
051a06d4
KW
3557
3558 /* Failed, have to return the original */
4b88fb76 3559 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
3560
3561 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3562 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
56576a04
KW
3563 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
3564 " locale; resolved to \"\\x{%" UVXf "}\".",
357aadde 3565 OP_DESC(PL_op),
ab0b796c
KW
3566 original,
3567 original);
051a06d4
KW
3568 Copy(p, ustrp, *lenp, char);
3569 return original;
3570}
3571
607313a1
KW
3572STATIC U32
3573S_check_and_deprecate(pTHX_ const U8 *p,
3574 const U8 **e,
3575 const unsigned int type, /* See below */
3576 const bool use_locale, /* Is this a 'LC_'
3577 macro call? */
3578 const char * const file,
3579 const unsigned line)
3580{
3581 /* This is a temporary function to deprecate the unsafe calls to the case
3582 * changing macros and functions. It keeps all the special stuff in just
3583 * one place.
3584 *
3585 * It updates *e with the pointer to the end of the input string. If using
3586 * the old-style macros, *e is NULL on input, and so this function assumes
3587 * the input string is long enough to hold the entire UTF-8 sequence, and
3588 * sets *e accordingly, but it then returns a flag to pass the
3589 * utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
3590 * using the full length if possible.
3591 *
3592 * It also does the assert that *e > p when *e is not NULL. This should be
3593 * migrated to the callers when this function gets deleted.
3594 *
3595 * The 'type' parameter is used for the caller to specify which case
3596 * changing function this is called from: */
3597
3598# define DEPRECATE_TO_UPPER 0
3599# define DEPRECATE_TO_TITLE 1
3600# define DEPRECATE_TO_LOWER 2
3601# define DEPRECATE_TO_FOLD 3
3602
3603 U32 utf8n_flags = 0;
3604 const char * name;
3605 const char * alternative;
3606
3607 PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
3608
3609 if (*e == NULL) {
3610 utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
3611 *e = p + UTF8SKIP(p);
3612
3613 /* For mathoms.c calls, we use the function name we know is stored
c44e9413 3614 * there. It could be part of a larger path */
607313a1 3615 if (type == DEPRECATE_TO_UPPER) {
c44e9413 3616 name = instr(file, "mathoms.c")
607313a1
KW
3617 ? "to_utf8_upper"
3618 : "toUPPER_utf8";
3619 alternative = "toUPPER_utf8_safe";
3620 }
3621 else if (type == DEPRECATE_TO_TITLE) {
c44e9413 3622 name = instr(file, "mathoms.c")
607313a1
KW
3623 ? "to_utf8_title"
3624 : "toTITLE_utf8";
3625 alternative = "toTITLE_utf8_safe";
3626 }
3627 else if (type == DEPRECATE_TO_LOWER) {
c44e9413 3628 name = instr(file, "mathoms.c")
607313a1
KW
3629 ? "to_utf8_lower"
3630 : "toLOWER_utf8";
3631 alternative = "toLOWER_utf8_safe";
3632 }
3633 else if (type == DEPRECATE_TO_FOLD) {
c44e9413 3634 name = instr(file, "mathoms.c")
607313a1
KW
3635 ? "to_utf8_fold"
3636 : "toFOLD_utf8";
3637 alternative = "toFOLD_utf8_safe";
3638 }
3639 else Perl_croak(aTHX_ "panic: Unexpected case change type");
3640
3641 warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
3642 }
3643 else {
3644 assert (p < *e);
3645 }
3646
3647 return utf8n_flags;
3648}
3649
eaf412bf
KW
3650/* The process for changing the case is essentially the same for the four case
3651 * change types, except there are complications for folding. Otherwise the
3652 * difference is only which case to change to. To make sure that they all do
3653 * the same thing, the bodies of the functions are extracted out into the
3654 * following two macros. The functions are written with the same variable
3655 * names, and these are known and used inside these macros. It would be
3656 * better, of course, to have inline functions to do it, but since different
3657 * macros are called, depending on which case is being changed to, this is not
3658 * feasible in C (to khw's knowledge). Two macros are created so that the fold
3659 * function can start with the common start macro, then finish with its special
3660 * handling; while the other three cases can just use the common end macro.
3661 *
3662 * The algorithm is to use the proper (passed in) macro or function to change
3663 * the case for code points that are below 256. The macro is used if using
3664 * locale rules for the case change; the function if not. If the code point is
3665 * above 255, it is computed from the input UTF-8, and another macro is called
3666 * to do the conversion. If necessary, the output is converted to UTF-8. If
3667 * using a locale, we have to check that the change did not cross the 255/256
3668 * boundary, see check_locale_boundary_crossing() for further details.
3669 *
3670 * The macros are split with the correct case change for the below-256 case
3671 * stored into 'result', and in the middle of an else clause for the above-255
3672 * case. At that point in the 'else', 'result' is not the final result, but is
3673 * the input code point calculated from the UTF-8. The fold code needs to
3674 * realize all this and take it from there.
3675 *
3676 * If you read the two macros as sequential, it's easier to understand what's
3677 * going on. */
3678#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \
3679 L1_func_extra_param) \
a239b1e2 3680 \
eaf412bf
KW
3681 if (flags & (locale_flags)) { \
3682 /* Treat a UTF-8 locale as not being in locale at all */ \
3683 if (IN_UTF8_CTYPE_LOCALE) { \
3684 flags &= ~(locale_flags); \
3685 } \
3686 else { \
3687 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
3688 } \
3689 } \
3690 \
3691 if (UTF8_IS_INVARIANT(*p)) { \
3692 if (flags & (locale_flags)) { \
3693 result = LC_L1_change_macro(*p); \
3694 } \
3695 else { \
3696 return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
3697 } \
3698 } \
a239b1e2 3699 else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \
eaf412bf
KW
3700 if (flags & (locale_flags)) { \
3701 result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \
3702 *(p+1))); \
3703 } \
3704 else { \
3705 return L1_func(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), \
3706 ustrp, lenp, L1_func_extra_param); \
3707 } \
3708 } \
fa8ab374
KW
3709 else { /* malformed UTF-8 or ord above 255 */ \
3710 STRLEN len_result; \
fa8ab374
KW
3711 result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
3712 if (len_result == (STRLEN) -1) { \
607313a1
KW
3713 _force_out_malformed_utf8_message(p, e, utf8n_flags, \
3714 1 /* Die */ ); \
fa8ab374 3715 }
eaf412bf
KW
3716
3717#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
3718 result = change_macro(result, p, ustrp, lenp); \
3719 \
3720 if (flags & (locale_flags)) { \
3721 result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
3722 } \
3723 return result; \
3724 } \
3725 \
3726 /* Here, used locale rules. Convert back to UTF-8 */ \
3727 if (UTF8_IS_INVARIANT(result)) { \
3728 *ustrp = (U8) result; \
3729 *lenp = 1; \
3730 } \
3731 else { \
3732 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); \
3733 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \
3734 *lenp = 2; \
3735 } \
3736 \
3737 return result;
3738
d3e79532 3739/*
87cea99e 3740=for apidoc to_utf8_upper
d3e79532