This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update perlfaq to CPAN version 5.20200523
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
a0ed51b3 35
806547a7 36static const char malformed_text[] = "Malformed UTF-8 character";
27da23d5 37static const char unees[] =
806547a7 38 "Malformed UTF-8 character (unexpected end of string)";
fb7e7255 39
48ef279e 40/*
7fefc6c1 41These are various utility functions for manipulating UTF8-encoded
72d33970 42strings. For the uninitiated, this is a method of representing arbitrary
61296642 43Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
44characters in the ASCII range are unmodified, and a zero byte never appears
45within non-zero characters.
eaf7a4d2
CS
46*/
47
dd051059
DM
48/* helper for Perl__force_out_malformed_utf8_message(). Like
49 * SAVECOMPILEWARNINGS(), but works with PL_curcop rather than
50 * PL_compiling */
51
52static void
53S_restore_cop_warnings(pTHX_ void *p)
54{
1943af61 55 free_and_set_cop_warnings(PL_curcop, (STRLEN*) p);
dd051059
DM
56}
57
58
9cbfb8ab
KW
59void
60Perl__force_out_malformed_utf8_message(pTHX_
61 const U8 *const p, /* First byte in UTF-8 sequence */
62 const U8 * const e, /* Final byte in sequence (may include
63 multiple chars */
64 const U32 flags, /* Flags to pass to utf8n_to_uvchr(),
65 usually 0, or some DISALLOW flags */
66 const bool die_here) /* If TRUE, this function does not return */
67{
68 /* This core-only function is to be called when a malformed UTF-8 character
69 * is found, in order to output the detailed information about the
70 * malformation before dieing. The reason it exists is for the occasions
71 * when such a malformation is fatal, but warnings might be turned off, so
72 * that normally they would not be actually output. This ensures that they
73 * do get output. Because a sequence may be malformed in more than one
74 * way, multiple messages may be generated, so we can't make them fatal, as
75 * that would cause the first one to die.
76 *
77 * Instead we pretend -W was passed to perl, then die afterwards. The
78 * flexibility is here to return to the caller so they can finish up and
79 * die themselves */
80 U32 errors;
81
82 PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
83
84 ENTER;
c15a80f3 85 SAVEI8(PL_dowarn);
9cbfb8ab
KW
86 SAVESPTR(PL_curcop);
87
88 PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
89 if (PL_curcop) {
dd051059
DM
90 /* this is like SAVECOMPILEWARNINGS() except with PL_curcop rather
91 * than PL_compiling */
92 SAVEDESTRUCTOR_X(S_restore_cop_warnings,
93 (void*)PL_curcop->cop_warnings);
9cbfb8ab
KW
94 PL_curcop->cop_warnings = pWARN_ALL;
95 }
96
97 (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
98
99 LEAVE;
100
101 if (! errors) {
102 Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
103 " be called only when there are errors found");
104 }
105
106 if (die_here) {
107 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
108 }
109}
110
bb07812e
KW
111STATIC HV *
112S_new_msg_hv(pTHX_ const char * const message, /* The message text */
113 U32 categories, /* Packed warning categories */
114 U32 flag) /* Flag associated with this message */
115{
116 /* Creates, populates, and returns an HV* that describes an error message
117 * for the translators between UTF8 and code point */
118
119 SV* msg_sv = newSVpv(message, 0);
120 SV* category_sv = newSVuv(categories);
121 SV* flag_bit_sv = newSVuv(flag);
122
123 HV* msg_hv = newHV();
124
125 PERL_ARGS_ASSERT_NEW_MSG_HV;
126
2b672cf5
KW
127 (void) hv_stores(msg_hv, "text", msg_sv);
128 (void) hv_stores(msg_hv, "warn_categories", category_sv);
129 (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv);
bb07812e
KW
130
131 return msg_hv;
132}
133
eaf7a4d2 134/*
378516de 135=for apidoc uvoffuni_to_utf8_flags
eebe1485 136
a27992cc 137THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
09232555
KW
138Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or
139L<perlapi/uvchr_to_utf8_flags>>.
a27992cc 140
de69f3af
KW
141This function is like them, but the input is a strict Unicode
142(as opposed to native) code point. Only in very rare circumstances should code
143not be using the native code point.
949cf498 144
09232555 145For details, see the description for L<perlapi/uvchr_to_utf8_flags>.
949cf498 146
eebe1485
SC
147=cut
148*/
149
33f38593
KW
150U8 *
151Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
152{
153 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
154
155 return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
156}
157
c94c2f39
KW
158/* All these formats take a single UV code point argument */
159const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
160const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
161 " is not recommended for open interchange";
162const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
163 " may not be portable";
c94c2f39 164
/* Handles a surrogate code point 'uv' according to 'flags':
 *  - if UNICODE_WARN_SURROGATE is set, either stores a new message HV in
 *    '*msgs' (when 'msgs' is non-NULL) or raises the warning directly;
 *  - if UNICODE_DISALLOW_SURROGATE is set, makes the *enclosing function*
 *    return NULL.
 * The arguments are parenthesized where they are used in expressions, and
 * there is intentionally no ';' after STMT_END: the invocation supplies it,
 * so the macro expands to exactly one statement. */
#define HANDLE_UNICODE_SURROGATE(uv, flags, msgs)                           \
    STMT_START {                                                            \
        if ((flags) & UNICODE_WARN_SURROGATE) {                             \
            U32 category = packWARN(WARN_SURROGATE);                        \
            const char * format = surrogate_cp_format;                      \
            if (msgs) {                                                     \
                *(msgs) = new_msg_hv(Perl_form(aTHX_ format, (uv)),         \
                                     category,                              \
                                     UNICODE_GOT_SURROGATE);                \
            }                                                               \
            else {                                                          \
                Perl_ck_warner_d(aTHX_ category, format, (uv));             \
            }                                                               \
        }                                                                   \
        if ((flags) & UNICODE_DISALLOW_SURROGATE) {                         \
            return NULL;                                                    \
        }                                                                   \
    } STMT_END
183
/* Handles a non-character code point 'uv' according to 'flags':
 *  - if UNICODE_WARN_NONCHAR is set, either stores a new message HV in
 *    '*msgs' (when 'msgs' is non-NULL) or raises the warning directly;
 *  - if UNICODE_DISALLOW_NONCHAR is set, makes the *enclosing function*
 *    return NULL.
 * The arguments are parenthesized where they are used in expressions, and
 * there is intentionally no ';' after STMT_END: the invocation supplies it,
 * so the macro expands to exactly one statement. */
#define HANDLE_UNICODE_NONCHAR(uv, flags, msgs)                             \
    STMT_START {                                                            \
        if ((flags) & UNICODE_WARN_NONCHAR) {                               \
            U32 category = packWARN(WARN_NONCHAR);                          \
            const char * format = nonchar_cp_format;                        \
            if (msgs) {                                                     \
                *(msgs) = new_msg_hv(Perl_form(aTHX_ format, (uv)),         \
                                     category,                              \
                                     UNICODE_GOT_NONCHAR);                  \
            }                                                               \
            else {                                                          \
                Perl_ck_warner_d(aTHX_ category, format, (uv));             \
            }                                                               \
        }                                                                   \
        if ((flags) & UNICODE_DISALLOW_NONCHAR) {                           \
            return NULL;                                                    \
        }                                                                   \
    } STMT_END
202
ba6ed43c
KW
203/* Use shorter names internally in this file */
204#define SHIFT UTF_ACCUMULATION_SHIFT
205#undef MARK
206#define MARK UTF_CONTINUATION_MARK
207#define MASK UTF_CONTINUATION_MASK
208
33f38593
KW
209/*
210=for apidoc uvchr_to_utf8_flags_msgs
211
212THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
213
214Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
215
216This function is for code that wants any warning and/or error messages to be
217returned to the caller rather than be displayed. All messages that would have
884a31ee 218been displayed if all lexical warnings are enabled will be returned.
33f38593
KW
219
220It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
221placed after all the others, C<msgs>. If this parameter is 0, this function
222behaves identically to C<L</uvchr_to_utf8_flags>>. Otherwise, C<msgs> should
223be a pointer to an C<HV *> variable, in which this function creates a new HV to
224contain any appropriate messages. The hash has three key-value pairs, as
225follows:
226
227=over 4
228
229=item C<text>
230
231The text of the message as a C<SVpv>.
232
233=item C<warn_categories>
234
235The warning category (or categories) packed into a C<SVuv>.
236
237=item C<flag_bit>
238
239A single flag bit associated with this message, in a C<SVuv>.
240The bit corresponds to some bit in the C<*errors> return value,
241such as C<UNICODE_GOT_SURROGATE>.
242
243=back
244
245It's important to note that specifying this parameter as non-null will cause
246any warnings this function would otherwise generate to be suppressed, and
247instead be placed in C<*msgs>. The caller can check the lexical warnings state
248(or not) when choosing what to do with the returned messages.
249
250The caller, of course, is responsible for freeing any returned HV.
251
252=cut
253*/
254
255/* Undocumented; we don't want people using this. Instead they should use
256 * uvchr_to_utf8_flags_msgs() */
dfe13c55 257U8 *
33f38593 258Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs)
a0ed51b3 259{
33f38593
KW
260 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
261
262 if (msgs) {
263 *msgs = NULL;
264 }
7918f24d 265
2d1545e5 266 if (OFFUNI_IS_INVARIANT(uv)) {
4c8cd605 267 *d++ = LATIN1_TO_NATIVE(uv);
d9432125
KW
268 return d;
269 }
facc1dc2 270
3ea68d71 271 if (uv <= MAX_UTF8_TWO_BYTE) {
facc1dc2
KW
272 *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
273 *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
3ea68d71
KW
274 return d;
275 }
d9432125 276
ba6ed43c
KW
277 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
278 * below, the 16 is for start bytes E0-EF (which are all the possible ones
279 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
280 * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
281 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
282 * 0x800-0xFFFF on ASCII */
283 if (uv < (16 * (1U << (2 * SHIFT)))) {
284 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
285 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
286 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
287
288#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
289 aren't tested here */
290 /* The most likely code points in this range are below the surrogates.
291 * Do an extra test to quickly exclude those. */
292 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
293 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
294 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
295 {
33f38593 296 HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
8ee1cdcb
KW
297 }
298 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
33f38593 299 HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
760c7c2f 300 }
ba6ed43c
KW
301 }
302#endif
303 return d;
304 }
305
306 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
307 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
308 * happen starting with 4-byte characters on ASCII platforms. We unify the
309 * code for these with EBCDIC, even though some of them require 5-bytes on
310 * those, because khw believes the code saving is worth the very slight
311 * performance hit on these high EBCDIC code points. */
312
313 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
24b4c303
KW
314 if (UNLIKELY( uv > MAX_LEGAL_CP
315 && ! (flags & UNICODE_ALLOW_ABOVE_IV_MAX)))
316 {
fb2f0a6a 317 Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv));
a5bf80e0 318 }
33f38593
KW
319 if ( (flags & UNICODE_WARN_SUPER)
320 || ( (flags & UNICODE_WARN_PERL_EXTENDED)
0a8a1a5b 321 && UNICODE_IS_PERL_EXTENDED(uv)))
a5bf80e0 322 {
33f38593
KW
323 const char * format = super_cp_format;
324 U32 category = packWARN(WARN_NON_UNICODE);
325 U32 flag = UNICODE_GOT_SUPER;
326
327 /* Choose the more dire applicable warning */
328 if (UNICODE_IS_PERL_EXTENDED(uv)) {
8911f9b0 329 format = PL_extended_cp_format;
dc4a6683 330 category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
33f38593
KW
331 if (flags & (UNICODE_WARN_PERL_EXTENDED
332 |UNICODE_DISALLOW_PERL_EXTENDED))
333 {
334 flag = UNICODE_GOT_PERL_EXTENDED;
335 }
336 }
a5bf80e0 337
33f38593
KW
338 if (msgs) {
339 *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),
340 category, flag);
341 }
dc4a6683
KW
342 else if ( ckWARN_d(WARN_NON_UNICODE)
343 || ( (flag & UNICODE_GOT_PERL_EXTENDED)
344 && ckWARN(WARN_PORTABLE)))
345 {
346 Perl_warner(aTHX_ category, format, uv);
33f38593 347 }
a5bf80e0 348 }
56576a04 349 if ( (flags & UNICODE_DISALLOW_SUPER)
0a8a1a5b
KW
350 || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
351 && UNICODE_IS_PERL_EXTENDED(uv)))
a5bf80e0
KW
352 {
353 return NULL;
354 }
355 }
ba6ed43c 356 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
33f38593 357 HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
507b9800 358 }
d9432125 359
ba6ed43c
KW
360 /* Test for and handle 4-byte result. In the test immediately below, the
361 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
362 * characters). The 3 is for 3 continuation bytes; these each contribute
363 * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
364 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
365 * 0x1_0000-0x1F_FFFF on ASCII */
366 if (uv < (8 * (1U << (3 * SHIFT)))) {
367 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
368 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
369 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
370 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
371
372#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
373 characters. The end-plane non-characters for EBCDIC were
374 handled just above */
375 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
33f38593 376 HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
d528804a 377 }
ba6ed43c 378 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
33f38593 379 HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
ba6ed43c
KW
380 }
381#endif
382
383 return d;
384 }
385
386 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
387 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
388 * format. The unrolled version above turns out to not save all that much
389 * time, and at these high code points (well above the legal Unicode range
390 * on ASCII platforms, and well above anything in common use in EBCDIC),
391 * khw believes that less code outweighs slight performance gains. */
392
d9432125 393 {
5aaebcb3 394 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
395 U8 *p = d+len-1;
396 while (p > d) {
957a9e81
KW
397 *p-- = I8_TO_NATIVE_UTF8((uv & MASK) | MARK);
398 uv >>= SHIFT;
1d72bdf6 399 }
4c8cd605 400 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
401 return d+len;
402 }
a0ed51b3 403}
a5bf80e0 404
646ca15d 405/*
07693fe6
KW
406=for apidoc uvchr_to_utf8
407
bcb1a2d4 408Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 409of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
410C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
411the byte after the end of the new character. In other words,
07693fe6
KW
412
413 d = uvchr_to_utf8(d, uv);
414
415is the recommended wide native character-aware way of saying
416
417 *(d++) = uv;
418
d22ec717
KW
419This function accepts any code point from 0..C<IV_MAX> as input.
420C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
760c7c2f
KW
421
422It is possible to forbid or warn on non-Unicode code points, or those that may
423be problematic by using L</uvchr_to_utf8_flags>.
de69f3af 424
07693fe6
KW
425=cut
426*/
427
de69f3af
KW
428/* This is also a macro */
429PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
430
07693fe6
KW
431U8 *
432Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
433{
de69f3af 434 return uvchr_to_utf8(d, uv);
07693fe6
KW
435}
436
de69f3af
KW
437/*
438=for apidoc uvchr_to_utf8_flags
439
440Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 441of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
442C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
443the byte after the end of the new character. In other words,
de69f3af
KW
444
445 d = uvchr_to_utf8_flags(d, uv, flags);
446
447or, in most cases,
448
449 d = uvchr_to_utf8_flags(d, uv, 0);
450
451This is the Unicode-aware way of saying
452
453 *(d++) = uv;
454
d22ec717
KW
455If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
456input. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
760c7c2f
KW
457
458Specifying C<flags> can further restrict what is allowed and not warned on, as
459follows:
de69f3af 460
796b6530 461If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
7ee537e6
KW
462the function will raise a warning, provided UTF8 warnings are enabled. If
463instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
464NULL. If both flags are set, the function will both warn and return NULL.
de69f3af 465
760c7c2f
KW
466Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
467affect how the function handles a Unicode non-character.
93e6dbd6 468
760c7c2f
KW
469And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
470affect the handling of code points that are above the Unicode maximum of
4710x10FFFF. Languages other than Perl may not be able to accept files that
472contain these.
93e6dbd6
KW
473
474The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
475the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
ecc1615f
KW
476three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
477allowed inputs to the strict UTF-8 traditionally defined by Unicode.
478Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
479C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
480above-Unicode and surrogate flags, but not the non-character ones, as
481defined in
e2176993 482L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
ecc1615f 483See L<perlunicode/Noncharacter code points>.
93e6dbd6 484
57ff5f59
KW
485Extremely high code points were never specified in any standard, and require an
486extension to UTF-8 to express, which Perl does. It is likely that programs
487written in something other than Perl would not be able to read files that
488contain these; nor would Perl understand files written by something that uses a
489different extension. For these reasons, there is a separate set of flags that
490can warn and/or disallow these extremely high code points, even if other
491above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
492and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
eb992c6f 493C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
57ff5f59
KW
494treat all above-Unicode code points, including these, as malformations. (Note
495that the Unicode standard considers anything above 0x10FFFF to be illegal, but
496there are standards predating it that allow up to 0x7FFF_FFFF (2**31 - 1).)
497
498A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
499retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly,
500C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
7c4a22ed
KW
501C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because on EBCDIC
502platforms, these flags can apply to code points that actually do fit in 31 bits.
503The new names accurately describe the situation in all cases.
de69f3af 504
de69f3af
KW
505=cut
506*/
507
508/* This is also a macro */
509PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
510
07693fe6
KW
511U8 *
512Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
513{
de69f3af 514 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
515}
516
57ff5f59
KW
517#ifndef UV_IS_QUAD
518
e050c007
KW
519STATIC int
520S_is_utf8_cp_above_31_bits(const U8 * const s,
521 const U8 * const e,
522 const bool consider_overlongs)
83dc0f42
KW
523{
524 /* Returns TRUE if the first code point represented by the Perl-extended-
525 * UTF-8-encoded string starting at 's', and looking no further than 'e -
526 * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
527 *
528 * The function handles the case where the input bytes do not include all
529 * the ones necessary to represent a full character. That is, they may be
530 * the intial bytes of the representation of a code point, but possibly
531 * the final ones necessary for the complete representation may be beyond
532 * 'e - 1'.
533 *
e050c007
KW
534 * The function also can handle the case where the input is an overlong
535 * sequence. If 'consider_overlongs' is 0, the function assumes the
536 * input is not overlong, without checking, and will return based on that
537 * assumption. If this parameter is 1, the function will go to the trouble
538 * of figuring out if it actually evaluates to above or below 31 bits.
83dc0f42 539 *
e050c007 540 * The sequence is otherwise assumed to be well-formed, without checking.
83dc0f42
KW
541 */
542
e050c007
KW
543 const STRLEN len = e - s;
544 int is_overlong;
545
546 PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
547
548 assert(! UTF8_IS_INVARIANT(*s) && e > s);
549
83dc0f42
KW
550#ifdef EBCDIC
551
e050c007 552 PERL_UNUSED_ARG(consider_overlongs);
83dc0f42 553
e050c007
KW
554 /* On the EBCDIC code pages we handle, only the native start byte 0xFE can
555 * mean a 32-bit or larger code point (0xFF is an invariant). 0xFE can
556 * also be the start byte for a 31-bit code point; we need at least 2
557 * bytes, and maybe up through 8 bytes, to determine that. (It can also be
558 * the start byte for an overlong sequence, but for 30-bit or smaller code
559 * points, so we don't have to worry about overlongs on EBCDIC.) */
560 if (*s != 0xFE) {
561 return 0;
562 }
83dc0f42 563
e050c007
KW
564 if (len == 1) {
565 return -1;
566 }
83dc0f42 567
e050c007 568#else
83dc0f42 569
e050c007
KW
570 /* On ASCII, FE and FF are the only start bytes that can evaluate to
571 * needing more than 31 bits. */
572 if (LIKELY(*s < 0xFE)) {
573 return 0;
574 }
83dc0f42 575
e050c007
KW
576 /* What we have left are FE and FF. Both of these require more than 31
577 * bits unless they are for overlongs. */
578 if (! consider_overlongs) {
579 return 1;
580 }
83dc0f42 581
e050c007
KW
582 /* Here, we have FE or FF. If the input isn't overlong, it evaluates to
583 * above 31 bits. But we need more than one byte to discern this, so if
584 * passed just the start byte, it could be an overlong evaluating to
585 * smaller */
586 if (len == 1) {
587 return -1;
588 }
83dc0f42 589
e050c007
KW
590 /* Having excluded len==1, and knowing that FE and FF are both valid start
591 * bytes, we can call the function below to see if the sequence is
592 * overlong. (We don't need the full generality of the called function,
593 * but for these huge code points, speed shouldn't be a consideration, and
594 * the compiler does have enough information, since it's static to this
595 * file, to optimize to just the needed parts.) */
596 is_overlong = is_utf8_overlong_given_start_byte_ok(s, len);
83dc0f42 597
e050c007
KW
598 /* If it isn't overlong, more than 31 bits are required. */
599 if (is_overlong == 0) {
600 return 1;
601 }
83dc0f42 602
e050c007
KW
603 /* If it is indeterminate if it is overlong, return that */
604 if (is_overlong < 0) {
605 return -1;
606 }
607
608 /* Here is overlong. Such a sequence starting with FE is below 31 bits, as
609 * the max it can be is 2**31 - 1 */
610 if (*s == 0xFE) {
611 return 0;
83dc0f42
KW
612 }
613
e050c007
KW
614#endif
615
616 /* Here, ASCII and EBCDIC rejoin:
617 * On ASCII: We have an overlong sequence starting with FF
618 * On EBCDIC: We have a sequence starting with FE. */
619
620 { /* For C89, use a block so the declaration can be close to its use */
621
622#ifdef EBCDIC
623
5f995336
KW
624 /* U+7FFFFFFF (2 ** 31 - 1)
625 * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
626 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
627 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
628 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
629 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
630 * U+80000000 (2 ** 31):
631 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
632 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
633 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
634 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
e050c007
KW
635 *
636 * and since we know that *s = \xfe, any continuation sequcence
637 * following it that is gt the below is above 31 bits
638 [0] [1] [2] [3] [4] [5] [6] */
639 const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
640
641#else
642
643 /* FF overlong for U+7FFFFFFF (2 ** 31 - 1)
644 * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF
645 * FF overlong for U+80000000 (2 ** 31):
646 * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80
647 * and since we know that *s = \xff, any continuation sequcence
648 * following it that is gt the below is above 30 bits
649 [0] [1] [2] [3] [4] [5] [6] */
650 const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
5f995336 651
83dc0f42
KW
652
653#endif
e050c007
KW
654 const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
655 const STRLEN cmp_len = MIN(conts_len, len - 1);
656
657 /* Now compare the continuation bytes in s with the ones we have
658 * compiled in that are for the largest 30 bit code point. If we have
659 * enough bytes available to determine the answer, or the bytes we do
660 * have differ from them, we can compare the two to get a definitive
661 * answer (Note that in UTF-EBCDIC, the two lowest possible
662 * continuation bytes are \x41 and \x42.) */
663 if (cmp_len >= conts_len || memNE(s + 1,
664 conts_for_highest_30_bit,
665 cmp_len))
666 {
667 return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len));
668 }
83dc0f42 669
e050c007
KW
670 /* Here, all the bytes we have are the same as the highest 30-bit code
671 * point, but we are missing so many bytes that we can't make the
672 * determination */
673 return -1;
674 }
83dc0f42
KW
675}
676
57ff5f59
KW
677#endif
678
d6be65ae 679PERL_STATIC_INLINE int
12a4bed3
KW
680S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
681{
d6be65ae
KW
682 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
683 * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
684 * it isn't, and -1 if there isn't enough information to tell. This last
685 * return value can happen if the sequence is incomplete, missing some
686 * trailing bytes that would form a complete character. If there are
687 * enough bytes to make a definitive decision, this function does so.
688 * Usually 2 bytes sufficient.
689 *
690 * Overlongs can occur whenever the number of continuation bytes changes.
691 * That means whenever the number of leading 1 bits in a start byte
692 * increases from the next lower start byte. That happens for start bytes
693 * C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following illegal
694 * start bytes have already been excluded, so don't need to be tested here;
12a4bed3
KW
695 * ASCII platforms: C0, C1
696 * EBCDIC platforms C0, C1, C2, C3, C4, E0
d6be65ae 697 */
12a4bed3
KW
698
699 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
700 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
701
702 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
703 assert(len > 1 && UTF8_IS_START(*s));
704
705 /* Each platform has overlongs after the start bytes given above (expressed
706 * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
707 * the logic is the same, except the E0 overlong has already been excluded
708 * on EBCDIC platforms. The values below were found by manually
709 * inspecting the UTF-8 patterns. See the tables in utf8.h and
710 * utfebcdic.h. */
711
712# ifdef EBCDIC
713# define F0_ABOVE_OVERLONG 0xB0
714# define F8_ABOVE_OVERLONG 0xA8
715# define FC_ABOVE_OVERLONG 0xA4
716# define FE_ABOVE_OVERLONG 0xA2
717# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
718 /* I8(0xfe) is FF */
719# else
720
721 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
d6be65ae 722 return 1;
12a4bed3
KW
723 }
724
725# define F0_ABOVE_OVERLONG 0x90
726# define F8_ABOVE_OVERLONG 0x88
727# define FC_ABOVE_OVERLONG 0x84
728# define FE_ABOVE_OVERLONG 0x82
729# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
730# endif
731
732
733 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
734 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
735 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
736 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
737 {
d6be65ae 738 return 1;
12a4bed3
KW
739 }
740
b0b342d4 741 /* Check for the FF overlong */
d6be65ae 742 return isFF_OVERLONG(s, len);
b0b342d4
KW
743}
744
8d6204cc 745PERL_STATIC_INLINE int
b0b342d4
KW
746S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
747{
8d6204cc
KW
748 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
749 * 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if
750 * it isn't, and -1 if there isn't enough information to tell. This last
751 * return value can happen if the sequence is incomplete, missing some
752 * trailing bytes that would form a complete character. If there are
753 * enough bytes to make a definitive decision, this function does so. */
754
b0b342d4 755 PERL_ARGS_ASSERT_ISFF_OVERLONG;
12a4bed3 756
8d6204cc
KW
757 /* To be an FF overlong, all the available bytes must match */
758 if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
759 MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
760 {
761 return 0;
762 }
763
764 /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
765 * be there; what comes after them doesn't matter. See tables in utf8.h,
b0b342d4 766 * utfebcdic.h. */
8d6204cc
KW
767 if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
768 return 1;
769 }
12a4bed3 770
8d6204cc
KW
771 /* The missing bytes could cause the result to go one way or the other, so
772 * the result is indeterminate */
773 return -1;
12a4bed3
KW
774}
775
d22ec717 776#if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */
a77c906e
KW
777# ifdef EBCDIC /* Actually is I8 */
778# define HIGHEST_REPRESENTABLE_UTF8 \
d22ec717 779 "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
a77c906e
KW
780# else
781# define HIGHEST_REPRESENTABLE_UTF8 \
d22ec717 782 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
a77c906e
KW
783# endif
784#endif
785
c285bbc4 786PERL_STATIC_INLINE int
e050c007
KW
787S_does_utf8_overflow(const U8 * const s,
788 const U8 * e,
789 const bool consider_overlongs)
a77c906e 790{
c285bbc4 791 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
d22ec717
KW
792 * 'e' - 1 would overflow an IV on this platform; that is if it represents
793 * a code point larger than the highest representable code point. It
794 * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
795 * enough information to tell. This last return value can happen if the
796 * sequence is incomplete, missing some trailing bytes that would form a
797 * complete character. If there are enough bytes to make a definitive
798 * decision, this function does so.
c285bbc4 799 *
e050c007
KW
800 * If 'consider_overlongs' is TRUE, the function checks for the possibility
801 * that the sequence is an overlong that doesn't overflow. Otherwise, it
802 * assumes the sequence is not an overlong. This can give different
803 * results only on ASCII 32-bit platforms.
804 *
c285bbc4
KW
805 * (For ASCII platforms, we could use memcmp() because we don't have to
806 * convert each byte to I8, but it's very rare input indeed that would
807 * approach overflow, so the loop below will likely only get executed once.)
808 *
809 * 'e' - 1 must not be beyond a full character. */
a77c906e 810
a77c906e
KW
811
812 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
813 assert(s <= e && s + UTF8SKIP(s) >= e);
814
d22ec717
KW
815#if ! defined(UV_IS_QUAD)
816
817 return is_utf8_cp_above_31_bits(s, e, consider_overlongs);
818
819#else
820
821 PERL_UNUSED_ARG(consider_overlongs);
822
823 {
824 const STRLEN len = e - s;
825 const U8 *x;
826 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
827
828 for (x = s; x < e; x++, y++) {
829
830 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
831 continue;
832 }
833
834 /* If this byte is larger than the corresponding highest UTF-8
835 * byte, the sequence overflow; otherwise the byte is less than,
836 * and so the sequence doesn't overflow */
837 return NATIVE_UTF8_TO_I8(*x) > *y;
838
839 }
840
841 /* Got to the end and all bytes are the same. If the input is a whole
842 * character, it doesn't overflow. And if it is a partial character,
843 * there's not enough information to tell */
844 if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
845 return -1;
846 }
847
848 return 0;
849 }
850
851#endif
852
853}
854
855#if 0
856
857/* This is the portions of the above function that deal with UV_MAX instead of
858 * IV_MAX. They are left here in case we want to combine them so that internal
859 * uses can have larger code points. The only logic difference is that the
860 * 32-bit EBCDIC platform is treated like the 64-bit, and the 32-bit ASCII has
861 * different logic.
862 */
863
864/* Anything larger than this will overflow the word if it were converted into a UV */
865#if defined(UV_IS_QUAD)
866# ifdef EBCDIC /* Actually is I8 */
867# define HIGHEST_REPRESENTABLE_UTF8 \
868 "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
869# else
870# define HIGHEST_REPRESENTABLE_UTF8 \
871 "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
872# endif
873#else /* 32-bit */
874# ifdef EBCDIC
875# define HIGHEST_REPRESENTABLE_UTF8 \
876 "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
877# else
878# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
879# endif
880#endif
881
a77c906e
KW
882#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
883
884 /* On 32 bit ASCII machines, many overlongs that start with FF don't
885 * overflow */
e050c007 886 if (consider_overlongs && isFF_OVERLONG(s, len) > 0) {
c285bbc4
KW
887
888 /* To be such an overlong, the first bytes of 's' must match
889 * FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80". If we
890 * don't have any additional bytes available, the sequence, when
891 * completed might or might not fit in 32 bits. But if we have that
892 * next byte, we can tell for sure. If it is <= 0x83, then it does
893 * fit. */
894 if (len <= sizeof(FF_OVERLONG_PREFIX) - 1) {
895 return -1;
896 }
897
898 return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83;
a77c906e
KW
899 }
900
d22ec717
KW
901/* Starting with the #else, the rest of the function is identical except
902 * 1. we need to move the 'len' declaration to be global to the function
903 * 2. the endif move to just after the UNUSED_ARG.
904 * An empty endif is given just below to satisfy the preprocessor
905 */
a77c906e
KW
906#endif
907
d22ec717 908#endif
a77c906e 909
12a4bed3
KW
910#undef F0_ABOVE_OVERLONG
911#undef F8_ABOVE_OVERLONG
912#undef FC_ABOVE_OVERLONG
913#undef FE_ABOVE_OVERLONG
914#undef FF_OVERLONG_PREFIX
915
35f8c9bd 916STRLEN
1376b35c 917Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
35f8c9bd 918{
2b479609 919 STRLEN len;
12a4bed3 920 const U8 *x;
35f8c9bd 921
2b479609
KW
922 /* A helper function that should not be called directly.
923 *
924 * This function returns non-zero if the string beginning at 's' and
925 * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
926 * code point; otherwise it returns 0. The examination stops after the
927 * first code point in 's' is validated, not looking at the rest of the
928 * input. If 'e' is such that there are not enough bytes to represent a
929 * complete code point, this function will return non-zero anyway, if the
930 * bytes it does have are well-formed UTF-8 as far as they go, and aren't
931 * excluded by 'flags'.
932 *
933 * A non-zero return gives the number of bytes required to represent the
934 * code point. Be aware that if the input is for a partial character, the
935 * return will be larger than 'e - s'.
936 *
937 * This function assumes that the code point represented is UTF-8 variant.
56576a04
KW
938 * The caller should have excluded the possibility of it being invariant
939 * before calling this function.
2b479609
KW
940 *
941 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
942 * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
943 * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
944 * disallowed by the flags. If the input is only for a partial character,
945 * the function will return non-zero if there is any sequence of
946 * well-formed UTF-8 that, when appended to the input sequence, could
947 * result in an allowed code point; otherwise it returns 0. Non characters
948 * cannot be determined based on partial character input. But many of the
949 * other excluded types can be determined with just the first one or two
950 * bytes.
951 *
952 */
953
1376b35c 954 PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER;
2b479609
KW
955
956 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 957 |UTF8_DISALLOW_PERL_EXTENDED)));
2b479609 958 assert(! UTF8_IS_INVARIANT(*s));
35f8c9bd 959
2b479609 960 /* A variant char must begin with a start byte */
35f8c9bd
KW
961 if (UNLIKELY(! UTF8_IS_START(*s))) {
962 return 0;
963 }
964
edc2c47a
KW
965 /* Examine a maximum of a single whole code point */
966 if (e - s > UTF8SKIP(s)) {
967 e = s + UTF8SKIP(s);
968 }
969
2b479609
KW
970 len = e - s;
971
972 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
973 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
35f8c9bd 974
56576a04
KW
975 /* Here, we are disallowing some set of largish code points, and the
976 * first byte indicates the sequence is for a code point that could be
977 * in the excluded set. We generally don't have to look beyond this or
978 * the second byte to see if the sequence is actually for one of the
979 * excluded classes. The code below is derived from this table:
980 *
2b479609
KW
981 * UTF-8 UTF-EBCDIC I8
982 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
983 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
984 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
985 *
56576a04
KW
986 * Keep in mind that legal continuation bytes range between \x80..\xBF
987 * for UTF-8, and \xA0..\xBF for I8. Anything above those aren't
988 * continuation bytes. Hence, we don't have to test the upper edge
989 * because if any of those is encountered, the sequence is malformed,
990 * and would fail elsewhere in this function.
991 *
992 * The code here likewise assumes that there aren't other
993 * malformations; again the function should fail elsewhere because of
994 * these. For example, an overlong beginning with FC doesn't actually
995 * have to be a super; it could actually represent a small code point,
996 * even U+0000. But, since overlongs (and other malformations) are
997 * illegal, the function should return FALSE in either case.
2b479609
KW
998 */
999
1000#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
1001# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
19794540 1002# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
2b479609 1003
19794540
KW
1004# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
1005 /* B6 and B7 */ \
1006 && ((s1) & 0xFE ) == 0xB6)
57ff5f59 1007# define isUTF8_PERL_EXTENDED(s) (*s == I8_TO_NATIVE_UTF8(0xFF))
2b479609
KW
1008#else
1009# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
19794540
KW
1010# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
1011# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
57ff5f59 1012# define isUTF8_PERL_EXTENDED(s) (*s >= 0xFE)
2b479609
KW
1013#endif
1014
1015 if ( (flags & UTF8_DISALLOW_SUPER)
ddb65933
KW
1016 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
1017 {
2b479609
KW
1018 return 0; /* Above Unicode */
1019 }
1020
d044b7a7 1021 if ( (flags & UTF8_DISALLOW_PERL_EXTENDED)
57ff5f59 1022 && UNLIKELY(isUTF8_PERL_EXTENDED(s)))
2b479609 1023 {
57ff5f59 1024 return 0;
2b479609
KW
1025 }
1026
1027 if (len > 1) {
1028 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
1029
1030 if ( (flags & UTF8_DISALLOW_SUPER)
19794540 1031 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
2b479609
KW
1032 {
1033 return 0; /* Above Unicode */
1034 }
1035
1036 if ( (flags & UTF8_DISALLOW_SURROGATE)
19794540 1037 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
2b479609
KW
1038 {
1039 return 0; /* Surrogate */
1040 }
1041
1042 if ( (flags & UTF8_DISALLOW_NONCHAR)
1043 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
1044 {
1045 return 0; /* Noncharacter code point */
1046 }
1047 }
1048 }
1049
1050 /* Make sure that all that follows are continuation bytes */
35f8c9bd
KW
1051 for (x = s + 1; x < e; x++) {
1052 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
1053 return 0;
1054 }
1055 }
1056
af13dd8a 1057 /* Here is syntactically valid. Next, make sure this isn't the start of an
12a4bed3 1058 * overlong. */
d6be65ae 1059 if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
12a4bed3 1060 return 0;
af13dd8a
KW
1061 }
1062
12a4bed3
KW
1063 /* And finally, that the code point represented fits in a word on this
1064 * platform */
e050c007
KW
1065 if (0 < does_utf8_overflow(s, e,
1066 0 /* Don't consider overlongs */
1067 ))
1068 {
12a4bed3 1069 return 0;
35f8c9bd
KW
1070 }
1071
2b479609 1072 return UTF8SKIP(s);
35f8c9bd
KW
1073}
1074
7e2f38b2 1075char *
63ab03b3 1076Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
7cf8d05d
KW
1077{
1078 /* Returns a mortalized C string that is a displayable copy of the 'len'
63ab03b3 1079 * bytes starting at 'start'. 'format' gives how to display each byte.
7e2f38b2
KW
1080 * Currently, there are only two formats, so it is currently a bool:
1081 * 0 \xab
1082 * 1 ab (that is a space between two hex digit bytes)
1083 */
7cf8d05d
KW
1084
1085 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
1086 trailing NUL */
63ab03b3
KW
1087 const U8 * s = start;
1088 const U8 * const e = start + len;
7cf8d05d
KW
1089 char * output;
1090 char * d;
1091
1092 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
1093
1094 Newx(output, output_len, char);
1095 SAVEFREEPV(output);
1096
1097 d = output;
63ab03b3 1098 for (s = start; s < e; s++) {
7cf8d05d
KW
1099 const unsigned high_nibble = (*s & 0xF0) >> 4;
1100 const unsigned low_nibble = (*s & 0x0F);
1101
7e2f38b2 1102 if (format) {
63ab03b3
KW
1103 if (s > start) {
1104 *d++ = ' ';
1105 }
7e2f38b2
KW
1106 }
1107 else {
1108 *d++ = '\\';
1109 *d++ = 'x';
1110 }
7cf8d05d
KW
1111
1112 if (high_nibble < 10) {
1113 *d++ = high_nibble + '0';
1114 }
1115 else {
1116 *d++ = high_nibble - 10 + 'a';
1117 }
1118
1119 if (low_nibble < 10) {
1120 *d++ = low_nibble + '0';
1121 }
1122 else {
1123 *d++ = low_nibble - 10 + 'a';
1124 }
1125 }
1126
1127 *d = '\0';
1128 return output;
1129}
1130
806547a7 1131PERL_STATIC_INLINE char *
7cf8d05d
KW
1132S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
1133
421da25c 1134 /* Max number of bytes to print */
3cc6a05e 1135 STRLEN print_len,
7cf8d05d
KW
1136
1137 /* Which one is the non-continuation */
1138 const STRLEN non_cont_byte_pos,
1139
1140 /* How many bytes should there be? */
1141 const STRLEN expect_len)
806547a7
KW
1142{
1143 /* Return the malformation warning text for an unexpected continuation
1144 * byte. */
1145
7cf8d05d 1146 const char * const where = (non_cont_byte_pos == 1)
806547a7 1147 ? "immediately"
7cf8d05d
KW
1148 : Perl_form(aTHX_ "%d bytes",
1149 (int) non_cont_byte_pos);
421da25c
KW
1150 const U8 * x = s + non_cont_byte_pos;
1151 const U8 * e = s + print_len;
806547a7
KW
1152
1153 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
1154
7cf8d05d
KW
1155 /* We don't need to pass this parameter, but since it has already been
1156 * calculated, it's likely faster to pass it; verify under DEBUGGING */
1157 assert(expect_len == UTF8SKIP(s));
1158
421da25c
KW
1159 /* As a defensive coding measure, don't output anything past a NUL. Such
1160 * bytes shouldn't be in the middle of a malformation, and could mark the
1161 * end of the allocated string, and what comes after is undefined */
1162 for (; x < e; x++) {
1163 if (*x == '\0') {
1164 x++; /* Output this particular NUL */
1165 break;
1166 }
1167 }
1168
7cf8d05d
KW
1169 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
1170 " %s after start byte 0x%02x; need %d bytes, got %d)",
1171 malformed_text,
421da25c 1172 _byte_dump_string(s, x - s, 0),
7cf8d05d
KW
1173 *(s + non_cont_byte_pos),
1174 where,
1175 *s,
1176 (int) expect_len,
1177 (int) non_cont_byte_pos);
806547a7
KW
1178}
1179
35f8c9bd
KW
1180/*
1181
de69f3af 1182=for apidoc utf8n_to_uvchr
378516de
KW
1183
1184THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
09232555
KW
1185Most code should use L</utf8_to_uvchr_buf>() rather than call this
1186directly.
67e989fb 1187
9041c2e3 1188Bottom level UTF-8 decode routine.
de69f3af 1189Returns the native code point value of the first character in the string C<s>,
746afd53
KW
1190which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
1191C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
1192the length, in bytes, of that character.
949cf498
KW
1193
1194The value of C<flags> determines the behavior when C<s> does not point to a
2b5e7bc2
KW
1195well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
1196causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
1197is the next possible position in C<s> that could begin a non-malformed
1198character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
1199is raised. Some UTF-8 input sequences may contain multiple malformations.
1200This function tries to find every possible one in each call, so multiple
56576a04 1201warnings can be raised for the same sequence.
949cf498
KW
1202
1203Various ALLOW flags can be set in C<flags> to allow (and not warn on)
1204individual types of malformations, such as the sequence being overlong (that
1205is, when there is a shorter sequence that can express the same code point;
1206overlong sequences are expressly forbidden in the UTF-8 standard due to
1207potential security issues). Another malformation example is the first byte of
1208a character not being a legal first byte. See F<utf8.h> for the list of such
94953955
KW
1209flags. Even if allowed, this function generally returns the Unicode
1210REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
1211F<utf8.h> to override this behavior for the overlong malformations, but don't
1212do that except for very specialized purposes.
949cf498 1213
796b6530 1214The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
1215flags) malformation is found. If this flag is set, the routine assumes that
1216the caller will raise a warning, and this function will silently just set
d088425d
KW
1217C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
1218
75200dff 1219Note that this API requires disambiguation between successfully decoding a C<NUL>
796b6530 1220character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
111fa700
KW
1221in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
1222be set to 1. To disambiguate, upon a zero return, see if the first byte of
1223C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
f9380377 1224error. Or you can use C<L</utf8n_to_uvchr_error>>.
949cf498
KW
1225
1226Certain code points are considered problematic. These are Unicode surrogates,
746afd53 1227Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 1228By default these are considered regular code points, but certain situations
ecc1615f
KW
1229warrant special handling for them, which can be specified using the C<flags>
1230parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
1231three classes are treated as malformations and handled as such. The flags
1232C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
1233C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
1234disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
1235restricts the allowed inputs to the strict UTF-8 traditionally defined by
1236Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
1237definition given by
e2176993 1238L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
ecc1615f
KW
1239The difference between traditional strictness and C9 strictness is that the
1240latter does not forbid non-character code points. (They are still discouraged,
1241however.) For more discussion see L<perlunicode/Noncharacter code points>.
1242
1243The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
1244C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
796b6530
KW
1245C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
1246raised for their respective categories, but otherwise the code points are
1247considered valid (not malformations). To get a category to both be treated as
1248a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 1249(But note that warnings are not raised if lexically disabled nor if
796b6530 1250C<UTF8_CHECK_ONLY> is also specified.)
949cf498 1251
57ff5f59
KW
1252Extremely high code points were never specified in any standard, and require an
1253extension to UTF-8 to express, which Perl does. It is likely that programs
1254written in something other than Perl would not be able to read files that
1255contain these; nor would Perl understand files written by something that uses a
1256different extension. For these reasons, there is a separate set of flags that
1257can warn and/or disallow these extremely high code points, even if other
1258above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
1259C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
eb992c6f 1260C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
57ff5f59
KW
1261above-Unicode code points, including these, as malformations.
1262(Note that the Unicode standard considers anything above 0x10FFFF to be
1263illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
1264(2**31 - 1).)
1265
1266A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
1267retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly,
1268C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
1269C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags
1270can apply to code points that actually do fit in 31 bits. This happens on
1271EBCDIC platforms, and sometimes when the L<overlong
1272malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
1273describe the situation in all cases.
1274
ab8e6d41 1275
949cf498
KW
1276All other code points corresponding to Unicode characters, including private
1277use and those yet to be assigned, are never considered malformed and never
1278warn.
67e989fb 1279
5af38e47
KW
1280=for apidoc Amnh||UTF8_CHECK_ONLY
1281=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1282=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
1283=for apidoc Amnh||UTF8_DISALLOW_SURROGATE
1284=for apidoc Amnh||UTF8_DISALLOW_NONCHAR
1285=for apidoc Amnh||UTF8_DISALLOW_SUPER
1286=for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE
1287=for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE
1288=for apidoc Amnh||UTF8_WARN_SURROGATE
1289=for apidoc Amnh||UTF8_WARN_NONCHAR
1290=for apidoc Amnh||UTF8_WARN_SUPER
1291=for apidoc Amnh||UTF8_WARN_PERL_EXTENDED
1292=for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED
1293
37607a96 1294=cut
f9380377
KW
1295
1296Also implemented as a macro in utf8.h
1297*/
1298
1299UV
e6a4ffc3
KW
1300Perl_utf8n_to_uvchr(const U8 *s,
1301 STRLEN curlen,
1302 STRLEN *retlen,
1303 const U32 flags)
f9380377
KW
1304{
1305 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
1306
1307 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
1308}
1309
1310/*
1311
1312=for apidoc utf8n_to_uvchr_error
1313
1314THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
09232555
KW
1315Most code should use L</utf8_to_uvchr_buf>() rather than call this
1316directly.
f9380377
KW
1317
1318This function is for code that needs to know what the precise malformation(s)
37657a5b
KW
1319are when an error is found. If you also need to know the generated warning
1320messages, use L</utf8n_to_uvchr_msgs>() instead.
f9380377
KW
1321
1322It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1323all the others, C<errors>. If this parameter is 0, this function behaves
1324identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
1325to a C<U32> variable, which this function sets to indicate any errors found.
1326Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
1327C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
1328of these bits will be set if a malformation is found, even if the input
7a65503b 1329C<flags> parameter indicates that the given malformation is allowed; those
f9380377
KW
1330exceptions are noted:
1331
1332=over 4
1333
57ff5f59 1334=item C<UTF8_GOT_PERL_EXTENDED>
f9380377 1335
57ff5f59
KW
1336The input sequence is not standard UTF-8, but a Perl extension. This bit is
1337set only if the input C<flags> parameter contains either the
1338C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
1339
1340Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
1341and so some extension must be used to express them. Perl uses a natural
1342extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
1343extension to represent even higher ones, so that any code point that fits in a
134464-bit word can be represented. Text using these extensions is not likely to
1345be portable to non-Perl code. We lump both of these extensions together and
1346refer to them as Perl extended UTF-8. There exist other extensions that people
1347have invented, incompatible with Perl's.
1348
1349On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
1350extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
1351than on ASCII. Prior to that, code points 2**31 and higher were simply
1352unrepresentable, and a different, incompatible method was used to represent
1353code points between 2**30 and 2**31 - 1.
1354
1355On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
1356Perl extended UTF-8 is used.
1357
1358In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
1359may use for backward compatibility. That name is misleading, as this flag may
1360be set when the code point actually does fit in 31 bits. This happens on
1361EBCDIC platforms, and sometimes when the L<overlong
1362malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately
1363describes the situation in all cases.
f9380377
KW
1364
1365=item C<UTF8_GOT_CONTINUATION>
1366
a3815e44 1367The input sequence was malformed in that the first byte was a UTF-8
f9380377
KW
1368continuation byte.
1369
1370=item C<UTF8_GOT_EMPTY>
1371
1372The input C<curlen> parameter was 0.
1373
1374=item C<UTF8_GOT_LONG>
1375
1376The input sequence was malformed in that there is some other sequence that
1377evaluates to the same code point, but that sequence is shorter than this one.
1378
fecaf136
KW
1379Until Unicode 3.1, it was legal for programs to accept this malformation, but
1380it was discovered that this created security issues.
1381
f9380377
KW
1382=item C<UTF8_GOT_NONCHAR>
1383
1384The code point represented by the input UTF-8 sequence is for a Unicode
1385non-character code point.
1386This bit is set only if the input C<flags> parameter contains either the
1387C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1388
1389=item C<UTF8_GOT_NON_CONTINUATION>
1390
1391The input sequence was malformed in that a non-continuation type byte was found
00d976bb 1392in a position where only a continuation type one should be. See also
eb992c6f 1393C<L</UTF8_GOT_SHORT>>.
f9380377
KW
1394
1395=item C<UTF8_GOT_OVERFLOW>
1396
1397The input sequence was malformed in that it is for a code point that is not
d22ec717 1398representable in the number of bits available in an IV on the current platform.
f9380377
KW
1399
1400=item C<UTF8_GOT_SHORT>
1401
1402The input sequence was malformed in that C<curlen> is smaller than required for
1403a complete sequence. In other words, the input is for a partial character
1404sequence.
1405
00d976bb
KW
1406
1407C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
1408sequence. The difference is that C<UTF8_GOT_NON_CONTINUATION> always indicates
1409that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
1410sequence was looked at. If no other flags are present, it means that the
1411sequence was valid as far as it went. Depending on the application, this could
1412mean one of three things:
1413
1414=over
1415
1416=item *
1417
1418The C<curlen> length parameter passed in was too small, and the function was
1419prevented from examining all the necessary bytes.
1420
1421=item *
1422
1423The buffer being looked at is based on reading data, and the data received so
1424far stopped in the middle of a character, so that the next read will
1425read the remainder of this character. (It is up to the caller to deal with the
1426split bytes somehow.)
1427
1428=item *
1429
1430This is a real error, and the partial sequence is all we're going to get.
1431
1432=back
1433
f9380377
KW
1434=item C<UTF8_GOT_SUPER>
1435
1436The input sequence was malformed in that it is for a non-Unicode code point;
1437that is, one above the legal Unicode maximum.
1438This bit is set only if the input C<flags> parameter contains either the
1439C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1440
1441=item C<UTF8_GOT_SURROGATE>
1442
1443The input sequence was malformed in that it is for a Unicode UTF-16 surrogate
1444code point.
1445This bit is set only if the input C<flags> parameter contains either the
1446C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1447
1448=back
1449
133551d8
KW
1450To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1451flag to suppress any warnings, and then examine the C<*errors> return.
1452
f9380377 1453=cut
37657a5b
KW
1454
1455Also implemented as a macro in utf8.h
37607a96 1456*/
67e989fb 1457
a0ed51b3 1458UV
e6a4ffc3 1459Perl_utf8n_to_uvchr_error(const U8 *s,
37657a5b
KW
1460 STRLEN curlen,
1461 STRLEN *retlen,
1462 const U32 flags,
1463 U32 * errors)
1464{
1465 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1466
1467 return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
1468}
1469
1470/*
1471
1472=for apidoc utf8n_to_uvchr_msgs
1473
1474THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
09232555
KW
1475Most code should use L</utf8_to_uvchr_buf>() rather than call this
1476directly.
37657a5b
KW
1477
1478This function is for code that needs to know what the precise malformation(s)
1479are when an error is found, and wants the corresponding warning and/or error
1480messages to be returned to the caller rather than be displayed. All messages
f1460a66 1481that would have been displayed if all lexical warnings are enabled will be
37657a5b
KW
1482returned.
1483
1484It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
1485placed after all the others, C<msgs>. If this parameter is 0, this function
1486behaves identically to C<L</utf8n_to_uvchr_error>>. Otherwise, C<msgs> should
1487be a pointer to an C<AV *> variable, in which this function creates a new AV to
1488contain any appropriate messages. The elements of the array are ordered so
1489that the first message that would have been displayed is in the 0th element,
1490and so on. Each element is a hash with three key-value pairs, as follows:
1491
1492=over 4
1493
1494=item C<text>
1495
1496The text of the message as a C<SVpv>.
1497
1498=item C<warn_categories>
1499
1500The warning category (or categories) packed into a C<SVuv>.
1501
1502=item C<flag>
1503
1504A single flag bit associated with this message, in a C<SVuv>.
1505The bit corresponds to some bit in the C<*errors> return value,
1506such as C<UTF8_GOT_LONG>.
1507
1508=back
1509
1510It's important to note that specifying this parameter as non-null will cause
1511any warnings this function would otherwise generate to be suppressed, and
1512instead be placed in C<*msgs>. The caller can check the lexical warnings state
1513(or not) when choosing what to do with the returned messages.
1514
1515If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
1516no AV is created.
1517
1518The caller, of course, is responsible for freeing any returned AV.
1519
1520=cut
1521*/
1522
1523UV
e6a4ffc3 1524Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
37657a5b
KW
1525 STRLEN curlen,
1526 STRLEN *retlen,
1527 const U32 flags,
1528 U32 * errors,
1529 AV ** msgs)
a0ed51b3 1530{
d4c19fe8 1531 const U8 * const s0 = s;
2b9519f0 1532 const U8 * send = s0 + curlen;
5af9f822
KW
1533 U32 possible_problems; /* A bit is set here for each potential problem
1534 found as we go along */
1535 UV uv;
1536 STRLEN expectlen; /* How long should this sequence be? */
1537 STRLEN avail_len; /* When input is too short, gives what that is */
1538 U32 discard_errors; /* Used to save branches when 'errors' is NULL; this
1539 gets set and discarded */
a0dbb045 1540
2b5e7bc2
KW
1541 /* The below are used only if there is both an overlong malformation and a
1542 * too short one. Otherwise the first two are set to 's0' and 'send', and
1543 * the third not used at all */
5af9f822 1544 U8 * adjusted_s0;
e9f2c446
KW
1545 U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1546 routine; see [perl #130921] */
5af9f822 1547 UV uv_so_far;
e6a4ffc3 1548 dTHX;
5af9f822 1549
e6a4ffc3 1550 PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
5af9f822
KW
1551
1552 /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
1553 * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
1554 * syllables that the dfa doesn't properly handle. Quickly dispose of the
1555 * final case. */
1556
1557#ifndef EBCDIC
1558
1559 /* Each of the affected Hanguls starts with \xED */
1560
1561 if (is_HANGUL_ED_utf8_safe(s0, send)) {
1562 if (retlen) {
1563 *retlen = 3;
1564 }
1565 if (errors) {
1566 *errors = 0;
1567 }
1568 if (msgs) {
1569 *msgs = NULL;
1570 }
1571
1572 return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
1573 | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
1574 | (s0[2] & UTF_CONTINUATION_MASK);
1575 }
1576
1577#endif
1578
1579 /* In conjunction with the exhaustive tests that can be enabled in
1580 * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
1581 * what it is intended to do, and that no flaws in it are masked by
1582 * dropping down and executing the code below
1583 assert(! isUTF8_CHAR(s0, send)
1584 || UTF8_IS_SURROGATE(s0, send)
1585 || UTF8_IS_SUPER(s0, send)
1586 || UTF8_IS_NONCHAR(s0,send));
1587 */
1588
1589 s = s0;
1590 uv = *s0;
1591 possible_problems = 0;
1592 expectlen = 0;
1593 avail_len = 0;
1594 discard_errors = 0;
1595 adjusted_s0 = (U8 *) s0;
1596 uv_so_far = 0;
1597
f9380377
KW
1598 if (errors) {
1599 *errors = 0;
1600 }
1601 else {
1602 errors = &discard_errors;
1603 }
a0dbb045 1604
eb83ed87
KW
1605 /* The order of malformation tests here is important. We should consume as
1606 * few bytes as possible in order to not skip any valid character. This is
1607 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
e2176993 1608 * https://unicode.org/reports/tr36 for more discussion as to why. For
eb83ed87
KW
1609 * example, once we've done a UTF8SKIP, we can tell the expected number of
1610 * bytes, and could fail right off the bat if the input parameters indicate
1611 * that there are too few available. But it could be that just that first
1612 * byte is garbled, and the intended character occupies fewer bytes. If we
1613 * blindly assumed that the first byte is correct, and skipped based on
1614 * that number, we could skip over a valid input character. So instead, we
1615 * always examine the sequence byte-by-byte.
1616 *
1617 * We also should not consume too few bytes, otherwise someone could inject
1618 * things. For example, an input could be deliberately designed to
1619 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 1620 * returning to the caller C<*retlen> pointing to the very next byte (one
a3815e44 1621 * which is actually part of the overflowing sequence), that could look
eb83ed87 1622 * legitimate to the caller, which could discard the initial partial
2b5e7bc2
KW
1623 * sequence and process the rest, inappropriately.
1624 *
1625 * Some possible input sequences are malformed in more than one way. This
1626 * function goes to lengths to try to find all of them. This is necessary
1627 * for correctness, as the inputs may allow one malformation but not
1628 * another, and if we abandon searching for others after finding the
1629 * allowed one, we could allow in something that shouldn't have been.
1630 */
eb83ed87 1631
b5b9af04 1632 if (UNLIKELY(curlen == 0)) {
2b5e7bc2
KW
1633 possible_problems |= UTF8_GOT_EMPTY;
1634 curlen = 0;
5a48568d 1635 uv = UNICODE_REPLACEMENT;
2b5e7bc2 1636 goto ready_to_handle_errors;
0c443dc2
JH
1637 }
1638
eb83ed87
KW
1639 expectlen = UTF8SKIP(s);
1640
1641 /* A well-formed UTF-8 character, as the vast majority of calls to this
1642 * function will be for, has this expected length. For efficiency, set
1643 * things up here to return it. It will be overriden only in those rare
1644 * cases where a malformation is found */
1645 if (retlen) {
1646 *retlen = expectlen;
1647 }
1648
eb83ed87 1649 /* A continuation character can't start a valid sequence */
b5b9af04 1650 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
2b5e7bc2
KW
1651 possible_problems |= UTF8_GOT_CONTINUATION;
1652 curlen = 1;
1653 uv = UNICODE_REPLACEMENT;
1654 goto ready_to_handle_errors;
ba210ebe 1655 }
9041c2e3 1656
dcd27b3c 1657 /* Here is not a continuation byte, nor an invariant. The only thing left
ddb65933
KW
1658 * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
1659 * because it excludes start bytes like \xC0 that always lead to
1660 * overlongs.) */
dcd27b3c 1661
534752c1
KW
1662 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1663 * that indicate the number of bytes in the character's whole UTF-8
1664 * sequence, leaving just the bits that are part of the value. */
1665 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
ba210ebe 1666
e308b348
KW
1667 /* Setup the loop end point, making sure to not look past the end of the
1668 * input string, and flag it as too short if the size isn't big enough. */
e308b348
KW
1669 if (UNLIKELY(curlen < expectlen)) {
1670 possible_problems |= UTF8_GOT_SHORT;
1671 avail_len = curlen;
e308b348
KW
1672 }
1673 else {
2b9519f0 1674 send = (U8*) s0 + expectlen;
e308b348 1675 }
e308b348 1676
eb83ed87 1677 /* Now, loop through the remaining bytes in the character's sequence,
e308b348 1678 * accumulating each into the working value as we go. */
eb83ed87 1679 for (s = s0 + 1; s < send; s++) {
b5b9af04 1680 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
8850bf83 1681 uv = UTF8_ACCUMULATE(uv, *s);
2b5e7bc2
KW
1682 continue;
1683 }
1684
1685 /* Here, found a non-continuation before processing all expected bytes.
1686 * This byte indicates the beginning of a new character, so quit, even
1687 * if allowing this malformation. */
2b5e7bc2 1688 possible_problems |= UTF8_GOT_NON_CONTINUATION;
e308b348 1689 break;
eb83ed87
KW
1690 } /* End of loop through the character's bytes */
1691
1692 /* Save how many bytes were actually in the character */
1693 curlen = s - s0;
1694
2b5e7bc2
KW
1695 /* Note that there are two types of too-short malformation. One is when
1696 * there is actual wrong data before the normal termination of the
1697 * sequence. The other is that the sequence wasn't complete before the end
1698 * of the data we are allowed to look at, based on the input 'curlen'.
1699 * This means that we were passed data for a partial character, but it is
1700 * valid as far as we saw. The other is definitely invalid. This
1701 * distinction could be important to a caller, so the two types are kept
15b010f0
KW
1702 * separate.
1703 *
1704 * A convenience macro that matches either of the too-short conditions. */
1705# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1706
1707 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1708 uv_so_far = uv;
1709 uv = UNICODE_REPLACEMENT;
1710 }
2b5e7bc2 1711
08e73697
KW
1712 /* Check for overflow. The algorithm requires us to not look past the end
1713 * of the current character, even if partial, so the upper limit is 's' */
e050c007
KW
1714 if (UNLIKELY(0 < does_utf8_overflow(s0, s,
1715 1 /* Do consider overlongs */
1716 )))
1717 {
2b5e7bc2
KW
1718 possible_problems |= UTF8_GOT_OVERFLOW;
1719 uv = UNICODE_REPLACEMENT;
eb83ed87 1720 }
eb83ed87 1721
2b5e7bc2
KW
1722 /* Check for overlong. If no problems so far, 'uv' is the correct code
1723 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1724 * we must look at the UTF-8 byte sequence itself to see if it is for an
1725 * overlong */
1726 if ( ( LIKELY(! possible_problems)
1727 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
56576a04 1728 || ( UNLIKELY(possible_problems)
2b5e7bc2
KW
1729 && ( UNLIKELY(! UTF8_IS_START(*s0))
1730 || ( curlen > 1
d6be65ae 1731 && UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0,
08e73697 1732 s - s0))))))
2f8f112e 1733 {
2b5e7bc2
KW
1734 possible_problems |= UTF8_GOT_LONG;
1735
abc28b54 1736 if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
56576a04 1737
abc28b54
KW
1738 /* The calculation in the 'true' branch of this 'if'
1739 * below won't work if overflows, and isn't needed
1740 * anyway. Further below we handle all overflow
1741 * cases */
1742 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1743 {
2b5e7bc2
KW
1744 UV min_uv = uv_so_far;
1745 STRLEN i;
1746
1747 /* Here, the input is both overlong and is missing some trailing
1748 * bytes. There is no single code point it could be for, but there
1749 * may be enough information present to determine if what we have
1750 * so far is for an unallowed code point, such as for a surrogate.
56576a04
KW
1751 * The code further below has the intelligence to determine this,
1752 * but just for non-overlong UTF-8 sequences. What we do here is
1753 * calculate the smallest code point the input could represent if
1754 * there were no too short malformation. Then we compute and save
1755 * the UTF-8 for that, which is what the code below looks at
1756 * instead of the raw input. It turns out that the smallest such
1757 * code point is all we need. */
2b5e7bc2
KW
1758 for (i = curlen; i < expectlen; i++) {
1759 min_uv = UTF8_ACCUMULATE(min_uv,
1760 I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
1761 }
1762
e9f2c446 1763 adjusted_s0 = temp_char_buf;
57ff5f59 1764 (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
2b5e7bc2 1765 }
eb83ed87
KW
1766 }
1767
56576a04
KW
1768 /* Here, we have found all the possible problems, except for when the input
1769 * is for a problematic code point not allowed by the input parameters. */
1770
06188866
KW
1771 /* uv is valid for overlongs */
1772 if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
1773
1774 /* isn't problematic if < this */
1775 && uv >= UNICODE_SURROGATE_FIRST)
2b5e7bc2 1776 || ( UNLIKELY(possible_problems)
d60baaa7
KW
1777
1778 /* if overflow, we know without looking further
1779 * precisely which of the problematic types it is,
1780 * and we deal with those in the overflow handling
1781 * code */
1782 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
57ff5f59
KW
1783 && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
1784 || UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
760c7c2f
KW
1785 && ((flags & ( UTF8_DISALLOW_NONCHAR
1786 |UTF8_DISALLOW_SURROGATE
1787 |UTF8_DISALLOW_SUPER
d044b7a7 1788 |UTF8_DISALLOW_PERL_EXTENDED
760c7c2f
KW
1789 |UTF8_WARN_NONCHAR
1790 |UTF8_WARN_SURROGATE
1791 |UTF8_WARN_SUPER
d22ec717 1792 |UTF8_WARN_PERL_EXTENDED))))
eb83ed87 1793 {
2b5e7bc2
KW
1794 /* If there were no malformations, or the only malformation is an
1795 * overlong, 'uv' is valid */
1796 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1797 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1798 possible_problems |= UTF8_GOT_SURROGATE;
1799 }
1800 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
1801 possible_problems |= UTF8_GOT_SUPER;
1802 }
1803 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1804 possible_problems |= UTF8_GOT_NONCHAR;
1805 }
1806 }
1807 else { /* Otherwise, need to look at the source UTF-8, possibly
1808 adjusted to be non-overlong */
1809
1810 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1811 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
ea5ced44 1812 {
2b5e7bc2
KW
1813 possible_problems |= UTF8_GOT_SUPER;
1814 }
1815 else if (curlen > 1) {
1816 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
1817 NATIVE_UTF8_TO_I8(*adjusted_s0),
1818 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
ea5ced44 1819 {
2b5e7bc2 1820 possible_problems |= UTF8_GOT_SUPER;
ea5ced44 1821 }
2b5e7bc2
KW
1822 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
1823 NATIVE_UTF8_TO_I8(*adjusted_s0),
1824 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1825 {
1826 possible_problems |= UTF8_GOT_SURROGATE;
ea5ced44
KW
1827 }
1828 }
c0236afe 1829
2b5e7bc2
KW
1830 /* We need a complete well-formed UTF-8 character to discern
1831 * non-characters, so can't look for them here */
1832 }
1833 }
949cf498 1834
2b5e7bc2
KW
1835 ready_to_handle_errors:
1836
1837 /* At this point:
1838 * curlen contains the number of bytes in the sequence that
1839 * this call should advance the input by.
e308b348
KW
1840 * avail_len gives the available number of bytes passed in, but
1841 * only if this is less than the expected number of
1842 * bytes, based on the code point's start byte.
2b5e7bc2
KW
1843 * possible_problems' is 0 if there weren't any problems; otherwise a bit
1844 * is set in it for each potential problem found.
1845 * uv contains the code point the input sequence
1846 * represents; or if there is a problem that prevents
1847 * a well-defined value from being computed, it is
1848 * some subsitute value, typically the REPLACEMENT
1849 * CHARACTER.
1850 * s0 points to the first byte of the character
56576a04
KW
1851 * s points to just after were we left off processing
1852 * the character
1853 * send points to just after where that character should
1854 * end, based on how many bytes the start byte tells
1855 * us should be in it, but no further than s0 +
1856 * avail_len
2b5e7bc2 1857 */
eb83ed87 1858
2b5e7bc2
KW
1859 if (UNLIKELY(possible_problems)) {
1860 bool disallowed = FALSE;
1861 const U32 orig_problems = possible_problems;
1862
37657a5b
KW
1863 if (msgs) {
1864 *msgs = NULL;
1865 }
1866
2b5e7bc2 1867 while (possible_problems) { /* Handle each possible problem */
9fde5914 1868 U32 pack_warn = 0;
2b5e7bc2 1869 char * message = NULL;
37657a5b 1870 U32 this_flag_bit = 0;
2b5e7bc2
KW
1871
1872 /* Each 'if' clause handles one problem. They are ordered so that
1873 * the first ones' messages will be displayed before the later
6c64cd9d
KW
1874 * ones; this is kinda in decreasing severity order. But the
1875 * overlong must come last, as it changes 'uv' looked at by the
1876 * others */
2b5e7bc2
KW
1877 if (possible_problems & UTF8_GOT_OVERFLOW) {
1878
56576a04
KW
1879 /* Overflow means also got a super and are using Perl's
1880 * extended UTF-8, but we handle all three cases here */
2b5e7bc2 1881 possible_problems
d044b7a7 1882 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
f9380377
KW
1883 *errors |= UTF8_GOT_OVERFLOW;
1884
1885 /* But the API says we flag all errors found */
1886 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1887 *errors |= UTF8_GOT_SUPER;
1888 }
ddb65933 1889 if (flags
d044b7a7 1890 & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
ddb65933 1891 {
d044b7a7 1892 *errors |= UTF8_GOT_PERL_EXTENDED;
f9380377 1893 }
2b5e7bc2 1894
d60baaa7 1895 /* Disallow if any of the three categories say to */
56576a04 1896 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
d60baaa7 1897 || (flags & ( UTF8_DISALLOW_SUPER
d044b7a7 1898 |UTF8_DISALLOW_PERL_EXTENDED)))
d60baaa7
KW
1899 {
1900 disallowed = TRUE;
1901 }
1902
d22ec717
KW
1903 /* Likewise, warn if any say to */
1904 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
1905 || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
d60baaa7 1906 {
2b5e7bc2 1907
ddb65933
KW
1908 /* The warnings code explicitly says it doesn't handle the
1909 * case of packWARN2 and two categories which have
1910 * parent-child relationship. Even if it works now to
1911 * raise the warning if either is enabled, it wouldn't
1912 * necessarily do so in the future. We output (only) the
56576a04 1913 * most dire warning */
ddb65933 1914 if (! (flags & UTF8_CHECK_ONLY)) {
37657a5b 1915 if (msgs || ckWARN_d(WARN_UTF8)) {
ddb65933
KW
1916 pack_warn = packWARN(WARN_UTF8);
1917 }
37657a5b 1918 else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
ddb65933
KW
1919 pack_warn = packWARN(WARN_NON_UNICODE);
1920 }
1921 if (pack_warn) {
1922 message = Perl_form(aTHX_ "%s: %s (overflows)",
1923 malformed_text,
05b9033b 1924 _byte_dump_string(s0, curlen, 0));
37657a5b 1925 this_flag_bit = UTF8_GOT_OVERFLOW;
ddb65933 1926 }
2b5e7bc2
KW
1927 }
1928 }
1929 }
1930 else if (possible_problems & UTF8_GOT_EMPTY) {
1931 possible_problems &= ~UTF8_GOT_EMPTY;
f9380377 1932 *errors |= UTF8_GOT_EMPTY;
2b5e7bc2
KW
1933
1934 if (! (flags & UTF8_ALLOW_EMPTY)) {
d1f8d421
KW
1935
1936 /* This so-called malformation is now treated as a bug in
1937 * the caller. If you have nothing to decode, skip calling
1938 * this function */
1939 assert(0);
1940
2b5e7bc2 1941 disallowed = TRUE;
37657a5b
KW
1942 if ( (msgs
1943 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1944 {
2b5e7bc2
KW
1945 pack_warn = packWARN(WARN_UTF8);
1946 message = Perl_form(aTHX_ "%s (empty string)",
1947 malformed_text);
37657a5b 1948 this_flag_bit = UTF8_GOT_EMPTY;
2b5e7bc2
KW
1949 }
1950 }
1951 }
1952 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1953 possible_problems &= ~UTF8_GOT_CONTINUATION;
f9380377 1954 *errors |= UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1955
1956 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1957 disallowed = TRUE;
37657a5b
KW
1958 if (( msgs
1959 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1960 {
2b5e7bc2
KW
1961 pack_warn = packWARN(WARN_UTF8);
1962 message = Perl_form(aTHX_
1963 "%s: %s (unexpected continuation byte 0x%02x,"
1964 " with no preceding start byte)",
1965 malformed_text,
7e2f38b2 1966 _byte_dump_string(s0, 1, 0), *s0);
37657a5b 1967 this_flag_bit = UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1968 }
1969 }
1970 }
2b5e7bc2
KW
1971 else if (possible_problems & UTF8_GOT_SHORT) {
1972 possible_problems &= ~UTF8_GOT_SHORT;
f9380377 1973 *errors |= UTF8_GOT_SHORT;
2b5e7bc2
KW
1974
1975 if (! (flags & UTF8_ALLOW_SHORT)) {
1976 disallowed = TRUE;
37657a5b
KW
1977 if (( msgs
1978 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1979 {
2b5e7bc2
KW
1980 pack_warn = packWARN(WARN_UTF8);
1981 message = Perl_form(aTHX_
56576a04
KW
1982 "%s: %s (too short; %d byte%s available, need %d)",
1983 malformed_text,
1984 _byte_dump_string(s0, send - s0, 0),
1985 (int)avail_len,
1986 avail_len == 1 ? "" : "s",
1987 (int)expectlen);
37657a5b 1988 this_flag_bit = UTF8_GOT_SHORT;
2b5e7bc2
KW
1989 }
1990 }
ba210ebe 1991
2b5e7bc2 1992 }
e308b348
KW
1993 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1994 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1995 *errors |= UTF8_GOT_NON_CONTINUATION;
1996
1997 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1998 disallowed = TRUE;
37657a5b
KW
1999 if (( msgs
2000 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
2001 {
99a765e9
KW
2002
2003 /* If we don't know for sure that the input length is
2004 * valid, avoid as much as possible reading past the
2005 * end of the buffer */
2006 int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
100de20c
KW
2007 ? (int) (s - s0)
2008 : (int) (send - s0);
e308b348
KW
2009 pack_warn = packWARN(WARN_UTF8);
2010 message = Perl_form(aTHX_ "%s",
2011 unexpected_non_continuation_text(s0,
99a765e9 2012 printlen,
e308b348
KW
2013 s - s0,
2014 (int) expectlen));
37657a5b 2015 this_flag_bit = UTF8_GOT_NON_CONTINUATION;
e308b348
KW
2016 }
2017 }
2018 }
2b5e7bc2
KW
2019 else if (possible_problems & UTF8_GOT_SURROGATE) {
2020 possible_problems &= ~UTF8_GOT_SURROGATE;
2021
f9380377
KW
2022 if (flags & UTF8_WARN_SURROGATE) {
2023 *errors |= UTF8_GOT_SURROGATE;
2024
2025 if ( ! (flags & UTF8_CHECK_ONLY)
37657a5b 2026 && (msgs || ckWARN_d(WARN_SURROGATE)))
f9380377 2027 {
2b5e7bc2
KW
2028 pack_warn = packWARN(WARN_SURROGATE);
2029
2030 /* These are the only errors that can occur with a
2031 * surrogate when the 'uv' isn't valid */
2032 if (orig_problems & UTF8_GOT_TOO_SHORT) {
2033 message = Perl_form(aTHX_
2034 "UTF-16 surrogate (any UTF-8 sequence that"
2035 " starts with \"%s\" is for a surrogate)",
7e2f38b2 2036 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
2037 }
2038 else {
c94c2f39 2039 message = Perl_form(aTHX_ surrogate_cp_format, uv);
2b5e7bc2 2040 }
37657a5b 2041 this_flag_bit = UTF8_GOT_SURROGATE;
f9380377 2042 }
2b5e7bc2 2043 }
ba210ebe 2044
2b5e7bc2
KW
2045 if (flags & UTF8_DISALLOW_SURROGATE) {
2046 disallowed = TRUE;
f9380377 2047 *errors |= UTF8_GOT_SURROGATE;
2b5e7bc2
KW
2048 }
2049 }
2050 else if (possible_problems & UTF8_GOT_SUPER) {
2051 possible_problems &= ~UTF8_GOT_SUPER;
949cf498 2052
f9380377
KW
2053 if (flags & UTF8_WARN_SUPER) {
2054 *errors |= UTF8_GOT_SUPER;
2055
2056 if ( ! (flags & UTF8_CHECK_ONLY)
37657a5b 2057 && (msgs || ckWARN_d(WARN_NON_UNICODE)))
f9380377 2058 {
2b5e7bc2
KW
2059 pack_warn = packWARN(WARN_NON_UNICODE);
2060
2061 if (orig_problems & UTF8_GOT_TOO_SHORT) {
2062 message = Perl_form(aTHX_
2063 "Any UTF-8 sequence that starts with"
2064 " \"%s\" is for a non-Unicode code point,"
2065 " may not be portable",
7e2f38b2 2066 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
2067 }
2068 else {
c94c2f39 2069 message = Perl_form(aTHX_ super_cp_format, uv);
2b5e7bc2 2070 }
37657a5b 2071 this_flag_bit = UTF8_GOT_SUPER;
f9380377 2072 }
2b5e7bc2 2073 }
ba210ebe 2074
57ff5f59
KW
2075 /* Test for Perl's extended UTF-8 after the regular SUPER ones,
2076 * and before possibly bailing out, so that the more dire
2077 * warning will override the regular one. */
2078 if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
2b5e7bc2 2079 if ( ! (flags & UTF8_CHECK_ONLY)
d044b7a7 2080 && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
dc4a6683
KW
2081 && (msgs || ( ckWARN_d(WARN_NON_UNICODE)
2082 || ckWARN(WARN_PORTABLE))))
2b5e7bc2 2083 {
dc4a6683 2084 pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
2b5e7bc2 2085
57ff5f59
KW
2086 /* If it is an overlong that evaluates to a code point
2087 * that doesn't have to use the Perl extended UTF-8, it
2088 * still used it, and so we output a message that
2089 * doesn't refer to the code point. The same is true
2090 * if there was a SHORT malformation where the code
2091 * point is not valid. In that case, 'uv' will have
2092 * been set to the REPLACEMENT CHAR, and the message
2093 * below without the code point in it will be selected
2094 * */
2095 if (UNICODE_IS_PERL_EXTENDED(uv)) {
2b5e7bc2 2096 message = Perl_form(aTHX_
8911f9b0 2097 PL_extended_cp_format, uv);
2b5e7bc2
KW
2098 }
2099 else {
2100 message = Perl_form(aTHX_
57ff5f59
KW
2101 "Any UTF-8 sequence that starts with"
2102 " \"%s\" is a Perl extension, and"
2103 " so is not portable",
2104 _byte_dump_string(s0, curlen, 0));
2b5e7bc2 2105 }
37657a5b 2106 this_flag_bit = UTF8_GOT_PERL_EXTENDED;
2b5e7bc2
KW
2107 }
2108
d044b7a7
KW
2109 if (flags & ( UTF8_WARN_PERL_EXTENDED
2110 |UTF8_DISALLOW_PERL_EXTENDED))
ddb65933 2111 {
d044b7a7 2112 *errors |= UTF8_GOT_PERL_EXTENDED;
f9380377 2113
d044b7a7 2114 if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
f9380377
KW
2115 disallowed = TRUE;
2116 }
2b5e7bc2
KW
2117 }
2118 }
eb83ed87 2119
2b5e7bc2 2120 if (flags & UTF8_DISALLOW_SUPER) {
f9380377 2121 *errors |= UTF8_GOT_SUPER;
2b5e7bc2
KW
2122 disallowed = TRUE;
2123 }
2b5e7bc2
KW
2124 }
2125 else if (possible_problems & UTF8_GOT_NONCHAR) {
2126 possible_problems &= ~UTF8_GOT_NONCHAR;
ba210ebe 2127
f9380377
KW
2128 if (flags & UTF8_WARN_NONCHAR) {
2129 *errors |= UTF8_GOT_NONCHAR;
2130
2131 if ( ! (flags & UTF8_CHECK_ONLY)
37657a5b 2132 && (msgs || ckWARN_d(WARN_NONCHAR)))
f9380377 2133 {
2b5e7bc2
KW
2134 /* The code above should have guaranteed that we don't
2135 * get here with errors other than overlong */
2136 assert (! (orig_problems
2137 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
2138
2139 pack_warn = packWARN(WARN_NONCHAR);
c94c2f39 2140 message = Perl_form(aTHX_ nonchar_cp_format, uv);
37657a5b 2141 this_flag_bit = UTF8_GOT_NONCHAR;
f9380377 2142 }
2b5e7bc2 2143 }
5b311467 2144
2b5e7bc2
KW
2145 if (flags & UTF8_DISALLOW_NONCHAR) {
2146 disallowed = TRUE;
f9380377 2147 *errors |= UTF8_GOT_NONCHAR;
2b5e7bc2 2148 }
6c64cd9d
KW
2149 }
2150 else if (possible_problems & UTF8_GOT_LONG) {
2151 possible_problems &= ~UTF8_GOT_LONG;
2152 *errors |= UTF8_GOT_LONG;
2153
2154 if (flags & UTF8_ALLOW_LONG) {
2155
2156 /* We don't allow the actual overlong value, unless the
2157 * special extra bit is also set */
2158 if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
2159 & ~UTF8_ALLOW_LONG)))
2160 {
2161 uv = UNICODE_REPLACEMENT;
2162 }
2163 }
2164 else {
2165 disallowed = TRUE;
2166
37657a5b
KW
2167 if (( msgs
2168 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
2169 {
6c64cd9d
KW
2170 pack_warn = packWARN(WARN_UTF8);
2171
2172 /* These error types cause 'uv' to be something that
2173 * isn't what was intended, so can't use it in the
2174 * message. The other error types either can't
2175 * generate an overlong, or else the 'uv' is valid */
2176 if (orig_problems &
2177 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
2178 {
2179 message = Perl_form(aTHX_
2180 "%s: %s (any UTF-8 sequence that starts"
2181 " with \"%s\" is overlong which can and"
2182 " should be represented with a"
2183 " different, shorter sequence)",
2184 malformed_text,
2185 _byte_dump_string(s0, send - s0, 0),
2186 _byte_dump_string(s0, curlen, 0));
2187 }
2188 else {
2189 U8 tmpbuf[UTF8_MAXBYTES+1];
1be62ab9
KW
2190 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
2191 uv, 0);
d819dc50
KW
2192 /* Don't use U+ for non-Unicode code points, which
2193 * includes those in the Latin1 range */
2194 const char * preface = ( uv > PERL_UNICODE_MAX
2195#ifdef EBCDIC
2196 || uv <= 0xFF
2197#endif
2198 )
2199 ? "0x"
2200 : "U+";
6c64cd9d
KW
2201 message = Perl_form(aTHX_
2202 "%s: %s (overlong; instead use %s to represent"
2203 " %s%0*" UVXf ")",
2204 malformed_text,
2205 _byte_dump_string(s0, send - s0, 0),
2206 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
2207 preface,
2208 ((uv < 256) ? 2 : 4), /* Field width of 2 for
2209 small code points */
1be62ab9 2210 UNI_TO_NATIVE(uv));
6c64cd9d 2211 }
37657a5b 2212 this_flag_bit = UTF8_GOT_LONG;
6c64cd9d
KW
2213 }
2214 }
2b5e7bc2
KW
2215 } /* End of looking through the possible flags */
2216
2217 /* Display the message (if any) for the problem being handled in
2218 * this iteration of the loop */
2219 if (message) {
37657a5b 2220 if (msgs) {
37657a5b
KW
2221 assert(this_flag_bit);
2222
2223 if (*msgs == NULL) {
2224 *msgs = newAV();
2225 }
2226
bb07812e
KW
2227 av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
2228 pack_warn,
2229 this_flag_bit)));
37657a5b
KW
2230 }
2231 else if (PL_op)
2b5e7bc2
KW
2232 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
2233 OP_DESC(PL_op));
2234 else
2235 Perl_warner(aTHX_ pack_warn, "%s", message);
2236 }
ddb65933 2237 } /* End of 'while (possible_problems)' */
a0dbb045 2238
2b5e7bc2
KW
2239 /* Since there was a possible problem, the returned length may need to
2240 * be changed from the one stored at the beginning of this function.
2241 * Instead of trying to figure out if that's needed, just do it. */
2242 if (retlen) {
2243 *retlen = curlen;
2244 }
a0dbb045 2245
2b5e7bc2
KW
2246 if (disallowed) {
2247 if (flags & UTF8_CHECK_ONLY && retlen) {
2248 *retlen = ((STRLEN) -1);
2249 }
2250 return 0;
2251 }
eb83ed87 2252 }
ba210ebe 2253
2b5e7bc2 2254 return UNI_TO_NATIVE(uv);
a0ed51b3
LW
2255}
2256
8e84507e 2257/*
ec5f19d0
KW
2258=for apidoc utf8_to_uvchr_buf
2259
2260Returns the native code point of the first character in the string C<s> which
2261is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 2262C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 2263
524080c4
KW
2264If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2265enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 2266C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 2267(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 2268C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 2269the next possible position in C<s> that could begin a non-malformed character.
de69f3af 2270See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 2271returned.
ec5f19d0
KW
2272
2273=cut
52be2536
KW
2274
2275Also implemented as a macro in utf8.h
2276
ec5f19d0
KW
2277*/
2278
2279
2280UV
2281Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2282{
7f974d7e
KW
2283 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
2284
9a9a6c98 2285 return utf8_to_uvchr_buf_helper(s, send, retlen);
ec5f19d0
KW
2286}
2287
52be2536
KW
2288/* This is marked as deprecated
2289 *
ec5f19d0
KW
2290=for apidoc utf8_to_uvuni_buf
2291
de69f3af
KW
2292Only in very rare circumstances should code need to be dealing in Unicode
2293(as opposed to native) code points. In those few cases, use
09232555
KW
2294C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|perlapi/utf8_to_uvchr_buf>> instead.
2295If you are not absolutely sure this is one of those cases, then assume it isn't
2296and use plain C<utf8_to_uvchr_buf> instead.
4f83cdcd
KW
2297
2298Returns the Unicode (not-native) code point of the first character in the
2299string C<s> which
ec5f19d0
KW
2300is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
2301C<retlen> will be set to the length, in bytes, of that character.
2302
524080c4
KW
2303If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2304enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
2305NULL) to -1. If those warnings are off, the computed value if well-defined (or
2306the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
2307is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
2308next possible position in C<s> that could begin a non-malformed character.
09232555
KW
2309See L<perlapi/utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
2310returned.
ec5f19d0
KW
2311
2312=cut
2313*/
2314
2315UV
2316Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2317{
2318 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
2319
2320 assert(send > s);
2321
5962d97e 2322 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
ec5f19d0
KW
2323}
2324
b76347f2 2325/*
87cea99e 2326=for apidoc utf8_length
b76347f2 2327
b2e7ed74
KW
2328Returns the number of characters in the sequence of UTF-8-encoded bytes starting
2329at C<s> and ending at the byte just before C<e>. If C<s> and C<e> point to the
2330same place, it returns 0 with no warning raised.
2331
2332If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
2333and returns the number of valid characters.
b76347f2
JH
2334
2335=cut
2336*/
2337
2338STRLEN
35a4481c 2339Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
2340{
2341 STRLEN len = 0;
2342
7918f24d
NC
2343 PERL_ARGS_ASSERT_UTF8_LENGTH;
2344
8850bf83
JH
2345 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
2346 * the bitops (especially ~) can create illegal UTF-8.
2347 * In other words: in Perl UTF-8 is not just for Unicode. */
2348
12c43b0a 2349 if (UNLIKELY(e < s))
a3b680e6 2350 goto warn_and_return;
b76347f2 2351 while (s < e) {
4cbf4130 2352 s += UTF8SKIP(s);
8e91ec7f
AV
2353 len++;
2354 }
2355
12c43b0a 2356 if (UNLIKELY(e != s)) {
8e91ec7f
AV
2357 len--;
2358 warn_and_return:
9b387841
NC
2359 if (PL_op)
2360 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2361 "%s in %s", unees, OP_DESC(PL_op));
2362 else
61a12c31 2363 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
2364 }
2365
2366 return len;
2367}
2368
b06226ff 2369/*
fed3ba5d
NC
2370=for apidoc bytes_cmp_utf8
2371
a1433954 2372Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
2373sequence of characters (stored as UTF-8)
2374in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
2375equal, -1 or -2 if the first string is less than the second string, +1 or +2
2376if the first string is greater than the second string.
2377
2378-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
2379longer string. -2 or +2 is returned if
2380there was a difference between characters
fed3ba5d
NC
2381within the strings.
2382
2383=cut
2384*/
2385
2386int
2387Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
2388{
2389 const U8 *const bend = b + blen;
2390 const U8 *const uend = u + ulen;
2391
2392 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
2393
2394 while (b < bend && u < uend) {
2395 U8 c = *u++;
2396 if (!UTF8_IS_INVARIANT(c)) {
2397 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2398 if (u < uend) {
2399 U8 c1 = *u++;
2400 if (UTF8_IS_CONTINUATION(c1)) {
a62b247b 2401 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
fed3ba5d 2402 } else {
2b5e7bc2 2403 /* diag_listed_as: Malformed UTF-8 character%s */
fed3ba5d 2404 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
56576a04
KW
2405 "%s %s%s",
2406 unexpected_non_continuation_text(u - 2, 2, 1, 2),
2407 PL_op ? " in " : "",
2408 PL_op ? OP_DESC(PL_op) : "");
fed3ba5d
NC
2409 return -2;
2410 }
2411 } else {
2412 if (PL_op)
2413 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2414 "%s in %s", unees, OP_DESC(PL_op));
2415 else
61a12c31 2416 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
2417 return -2; /* Really want to return undef :-) */
2418 }
2419 } else {
2420 return -2;
2421 }
2422 }
2423 if (*b != c) {
2424 return *b < c ? -2 : +2;
2425 }
2426 ++b;
2427 }
2428
2429 if (b == bend && u == uend)
2430 return 0;
2431
2432 return b < bend ? +1 : -1;
2433}
2434
2435/*
87cea99e 2436=for apidoc utf8_to_bytes
6940069f 2437
3bc0c78c 2438Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
a1433954 2439Unlike L</bytes_to_utf8>, this over-writes the original string, and
09af0336 2440updates C<*lenp> to contain the new length.
3bc0c78c
KW
2441Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
2442
2443Upon successful return, the number of variants in the string can be computed by
23b37b12
KW
2444having saved the value of C<*lenp> before the call, and subtracting the
2445after-call value of C<*lenp> from it.
6940069f 2446
a1433954 2447If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 2448
6940069f
GS
2449=cut
2450*/
2451
2452U8 *
09af0336 2453Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
6940069f 2454{
9fe0d3c2 2455 U8 * first_variant;
246fae53 2456
7918f24d 2457 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 2458 PERL_UNUSED_CONTEXT;
7918f24d 2459
9fe0d3c2 2460 /* This is a no-op if no variants at all in the input */
09af0336 2461 if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
9fe0d3c2
KW
2462 return s;
2463 }
2464
2465 {
3c5aa262 2466 U8 * const save = s;
09af0336 2467 U8 * const send = s + *lenp;
3c5aa262
KW
2468 U8 * d;
2469
2470 /* Nothing before the first variant needs to be changed, so start the real
2471 * work there */
2472 s = first_variant;
2473 while (s < send) {
2474 if (! UTF8_IS_INVARIANT(*s)) {
2475 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
09af0336 2476 *lenp = ((STRLEN) -1);
3c5aa262
KW
2477 return 0;
2478 }
2479 s++;
d59937ca
KW
2480 }
2481 s++;
dcad2880 2482 }
dcad2880 2483
3c5aa262
KW
2484 /* Is downgradable, so do it */
2485 d = s = first_variant;
2486 while (s < send) {
2487 U8 c = *s++;
2488 if (! UVCHR_IS_INVARIANT(c)) {
2489 /* Then it is two-byte encoded */
2490 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2491 s++;
2492 }
2493 *d++ = c;
2494 }
2495 *d = '\0';
09af0336 2496 *lenp = d - save;
3c5aa262
KW
2497
2498 return save;
9fe0d3c2 2499 }
6940069f
GS
2500}
2501
2502/*
87cea99e 2503=for apidoc bytes_from_utf8
f9a63242 2504
09af0336 2505Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
41ae6089 2506byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
4f3d592d
KW
2507actually encoded in UTF-8.
2508
2509Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2510the input string.
2511
41ae6089
KW
2512Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2513not expressible in native byte encoding. In these cases, C<*is_utf8p> and
09af0336 2514C<*lenp> are unchanged, and the return value is the original C<s>.
4f3d592d 2515
41ae6089 2516Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
4f3d592d 2517newly created string containing a downgraded copy of C<s>, and whose length is
9ff99fb3
KW
2518returned in C<*lenp>, updated. The new string is C<NUL>-terminated. The
2519caller is responsible for arranging for the memory used by this string to get
2520freed.
f9a63242 2521
3bc0c78c 2522Upon successful return, the number of variants in the string can be computed by
23b37b12
KW
2523having saved the value of C<*lenp> before the call, and subtracting the
2524after-call value of C<*lenp> from it.
3bc0c78c 2525
37607a96 2526=cut
976c1b08
KW
2527
2528There is a macro that avoids this function call, but this is retained for
2529anyone who calls it with the Perl_ prefix */
f9a63242
JH
2530
2531U8 *
41ae6089 2532Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
f9a63242 2533{
7918f24d 2534 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 2535 PERL_UNUSED_CONTEXT;
f9a63242 2536
976c1b08
KW
2537 return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2538}
2539
2540/*
df6bd76f 2541=for apidoc bytes_from_utf8_loc
976c1b08 2542
eda578be
KW
2543Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer
2544to where to store the location of the first character in C<"s"> that cannot be
976c1b08
KW
2545converted to non-UTF8.
2546
2547If that parameter is C<NULL>, this function behaves identically to
2548C<bytes_from_utf8>.
2549
2550Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2551C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2552
2553Otherwise, the function returns a newly created C<NUL>-terminated string
2554containing the non-UTF8 equivalent of the convertible first portion of
2555C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>.
2556If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2557and C<*first_non_downgradable> is set to C<NULL>.
2558
8505db87 2559Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
976c1b08
KW
2560first character in the original string that wasn't converted. C<*is_utf8p> is
2561unchanged. Note that the new string may have length 0.
2562
2563Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2564C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2565converts as many characters in it as possible stopping at the first one it
385b74be 2566finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is
976c1b08
KW
2567set to point to that. The function returns the portion that could be converted
2568in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2569not including the terminating C<NUL>. If the very first character in the
2570original could not be converted, C<*lenp> will be 0, and the new string will
2571contain just a single C<NUL>. If the entire input string was converted,
2572C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2573
2574Upon successful return, the number of variants in the converted portion of the
2575string can be computed by having saved the value of C<*lenp> before the call,
2576and subtracting the after-call value of C<*lenp> from it.
2577
2578=cut
2579
2580
2581*/
2582
2583U8 *
2584Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2585{
2586 U8 *d;
2587 const U8 *original = s;
2588 U8 *converted_start;
2589 const U8 *send = s + *lenp;
f9a63242 2590
976c1b08 2591 PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
170a1c22 2592
976c1b08
KW
2593 if (! *is_utf8p) {
2594 if (first_unconverted) {
2595 *first_unconverted = NULL;
2596 }
2597
2598 return (U8 *) original;
2599 }
2600
2601 Newx(d, (*lenp) + 1, U8);
2602
2603 converted_start = d;
7299a045
KW
2604 while (s < send) {
2605 U8 c = *s++;
2606 if (! UTF8_IS_INVARIANT(c)) {
976c1b08
KW
2607
2608 /* Then it is multi-byte encoded. If the code point is above 0xFF,
2609 * have to stop now */
2610 if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2611 if (first_unconverted) {
2612 *first_unconverted = s - 1;
2613 goto finish_and_return;
2614 }
2615 else {
2616 Safefree(converted_start);
2617 return (U8 *) original;
2618 }
2619 }
2620
7299a045
KW
2621 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2622 s++;
38af28cf 2623 }
7299a045
KW
2624 *d++ = c;
2625 }
170a1c22 2626
976c1b08
KW
2627 /* Here, converted the whole of the input */
2628 *is_utf8p = FALSE;
2629 if (first_unconverted) {
2630 *first_unconverted = NULL;
170a1c22 2631 }
976c1b08
KW
2632
2633 finish_and_return:
46a08a6f
KW
2634 *d = '\0';
2635 *lenp = d - converted_start;
976c1b08
KW
2636
2637 /* Trim unused space */
2638 Renew(converted_start, *lenp + 1, U8);
2639
2640 return converted_start;
f9a63242
JH
2641}
2642
2643/*
87cea99e 2644=for apidoc bytes_to_utf8
6940069f 2645
09af0336 2646Converts a string C<s> of length C<*lenp> bytes from the native encoding into
ff97e5cf 2647UTF-8.
09af0336 2648Returns a pointer to the newly-created string, and sets C<*lenp> to
9ff99fb3
KW
2649reflect the new length in bytes. The caller is responsible for arranging for
2650the memory used by this string to get freed.
6940069f 2651
3bc0c78c 2652Upon successful return, the number of variants in the string can be computed by
23b37b12 2653having saved the value of C<*lenp> before the call, and subtracting it from the
3bc0c78c
KW
2654after-call value of C<*lenp>.
2655
75200dff 2656A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
2657
2658If you want to convert to UTF-8 from encodings other than
2659the native (Latin1 or EBCDIC),
a1433954 2660see L</sv_recode_to_utf8>().
c9ada85f 2661
497711e7 2662=cut
6940069f
GS
2663*/
2664
2665U8*
09af0336 2666Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
6940069f 2667{
09af0336 2668 const U8 * const send = s + (*lenp);
6940069f
GS
2669 U8 *d;
2670 U8 *dst;
7918f24d
NC
2671
2672 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 2673 PERL_UNUSED_CONTEXT;
6940069f 2674
d4662719
KW
2675 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
2676 Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
6940069f
GS
2677 dst = d;
2678
2679 while (s < send) {
55d09dc8
KW
2680 append_utf8_from_native_byte(*s, &d);
2681 s++;
6940069f 2682 }
2e11cf67 2683
6940069f 2684 *d = '\0';
09af0336 2685 *lenp = d-dst;
2e11cf67 2686
6940069f
GS
2687 return dst;
2688}
2689
a0ed51b3 2690/*
624504c5
KW
2691 * Convert native (big-endian) UTF-16 to UTF-8. For reversed (little-endian),
2692 * use utf16_to_utf8_reversed().
a0ed51b3 2693 *
624504c5
KW
2694 * UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes.
2695 * UTF-8 requires 1-3 bytes for every code point below 0x10000; otherwise 4 bytes.
2696 * UTF-EBCDIC requires 1-4 bytes for every code point below 0x10000; otherwise 4-5 bytes.
2697 *
2698 * These functions don't check for overflow. The worst case is every code
2699 * point in the input is 2 bytes, and requires 4 bytes on output. (If the code
2700 * is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.) Therefore the
2701 * destination must be pre-extended to 2 times the source length.
2702 *
2703 * Do not use in-place. We optimize for native, for obvious reasons. */
a0ed51b3
LW
2704
2705U8*
f46dcac2 2706Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
a0ed51b3 2707{
dea0fc0b
JH
2708 U8* pend;
2709 U8* dstart = d;
2710
7918f24d
NC
2711 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2712
dea0fc0b 2713 if (bytelen & 1)
56576a04
KW
2714 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf,
2715 (UV)bytelen);
dea0fc0b
JH
2716
2717 pend = p + bytelen;
2718
a0ed51b3 2719 while (p < pend) {
dea0fc0b
JH
2720 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
2721 p += 2;
2d1545e5 2722 if (OFFUNI_IS_INVARIANT(uv)) {
56d37426 2723 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
2724 continue;
2725 }
56d37426
KW
2726 if (uv <= MAX_UTF8_TWO_BYTE) {
2727 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
2728 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
2729 continue;
2730 }
ffd0a9d3 2731
46956fad
KW
2732#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2733#define LAST_HIGH_SURROGATE 0xDBFF
2734#define FIRST_LOW_SURROGATE 0xDC00
2735#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
ffd0a9d3 2736#define FIRST_IN_PLANE1 0x10000
e23c50db
KW
2737
2738 /* This assumes that most uses will be in the first Unicode plane, not
2739 * needing surrogates */
b497502c
KW
2740 if (UNLIKELY(inRANGE(uv, UNICODE_SURROGATE_FIRST,
2741 UNICODE_SURROGATE_LAST)))
e23c50db
KW
2742 {
2743 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2744 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2745 }
2746 else {
01ea242b 2747 UV low = (p[0] << 8) + p[1];
b497502c
KW
2748 if (UNLIKELY(! inRANGE(low, FIRST_LOW_SURROGATE,
2749 LAST_LOW_SURROGATE)))
e23c50db 2750 {
01ea242b 2751 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db
KW
2752 }
2753 p += 2;
46956fad 2754 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
ffd0a9d3 2755 + (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1;
01ea242b 2756 }
a0ed51b3 2757 }
56d37426
KW
2758#ifdef EBCDIC
2759 d = uvoffuni_to_utf8_flags(d, uv, 0);
2760#else
ffd0a9d3 2761 if (uv < FIRST_IN_PLANE1) {
eb160463
GS
2762 *d++ = (U8)(( uv >> 12) | 0xe0);
2763 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2764 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2765 continue;
2766 }
2767 else {
eb160463
GS
2768 *d++ = (U8)(( uv >> 18) | 0xf0);
2769 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
2770 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2771 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2772 continue;
2773 }
56d37426 2774#endif
a0ed51b3 2775 }
dea0fc0b 2776 *newlen = d - dstart;
a0ed51b3
LW
2777 return d;
2778}
2779
2780/* Note: this one is slightly destructive of the source. */
2781
2782U8*
f46dcac2 2783Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
a0ed51b3
LW
2784{
2785 U8* s = (U8*)p;
d4c19fe8 2786 U8* const send = s + bytelen;
7918f24d
NC
2787
2788 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2789
e0ea5e2d 2790 if (bytelen & 1)
147e3846 2791 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
e0ea5e2d
NC
2792 (UV)bytelen);
2793
a0ed51b3 2794 while (s < send) {
d4c19fe8 2795 const U8 tmp = s[0];
a0ed51b3
LW
2796 s[0] = s[1];
2797 s[1] = tmp;
2798 s += 2;
2799 }
dea0fc0b 2800 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
2801}
2802
922e8cb4
KW
2803bool
2804Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2805{
dc31b55c 2806 return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
922e8cb4
KW
2807}
2808
5092f92a 2809bool
eba68aa0
KW
2810Perl__is_uni_perl_idcont(pTHX_ UV c)
2811{
c12658c9 2812 return _invlist_contains_cp(PL_utf8_perl_idcont, c);
eba68aa0
KW
2813}
2814
2815bool
f91dcd13
KW
2816Perl__is_uni_perl_idstart(pTHX_ UV c)
2817{
c12658c9 2818 return _invlist_contains_cp(PL_utf8_perl_idstart, c);
f91dcd13
KW
2819}
2820
3a4c58c9 2821UV
56576a04
KW
2822Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2823 const char S_or_s)
3a4c58c9
KW
2824{
2825 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2826 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
2827 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2828 * either "SS" or "Ss". Which one to use is passed into the routine in
2829 * 'S_or_s' to avoid a test */
2830
2831 UV converted = toUPPER_LATIN1_MOD(c);
2832
2833 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2834
2835 assert(S_or_s == 'S' || S_or_s == 's');
2836
6f2d5cbc 2837 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 2838 characters in this range */
3a4c58c9
KW
2839 *p = (U8) converted;
2840 *lenp = 1;
2841 return converted;
2842 }
2843
2844 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2845 * which it maps to one of them, so as to only have to have one check for
2846 * it in the main case */
2847 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2848 switch (c) {
2849 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2850 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2851 break;
2852 case MICRO_SIGN:
2853 converted = GREEK_CAPITAL_LETTER_MU;
2854 break;
79e064b9
KW
2855#if UNICODE_MAJOR_VERSION > 2 \
2856 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2857 && UNICODE_DOT_DOT_VERSION >= 8)
3a4c58c9
KW
2858 case LATIN_SMALL_LETTER_SHARP_S:
2859 *(p)++ = 'S';
2860 *p = S_or_s;
2861 *lenp = 2;
2862 return 'S';
79e064b9 2863#endif
3a4c58c9 2864 default:
56576a04
KW
2865 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
2866 " '%c' to map to '%c'",
2867 c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
e5964223 2868 NOT_REACHED; /* NOTREACHED */
3a4c58c9
KW
2869 }
2870 }
2871
2872 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2873 *p = UTF8_TWO_BYTE_LO(converted);
2874 *lenp = 2;
2875
2876 return converted;
2877}
2878
fe63c520
KW
/* If compiled on an early Unicode version, there may not be auxiliary tables.
 * For each case-change kind (UC, TC, LC, CF) whose HAS_xx_AUX_TABLES symbol is
 * not defined, make the names of its table pointer and table length arrays
 * expand to NULL so that code referring to them still compiles. */
#ifndef HAS_UC_AUX_TABLES
#  define UC_AUX_TABLE_ptrs     NULL
#  define UC_AUX_TABLE_lengths  NULL
#endif
#ifndef HAS_TC_AUX_TABLES
#  define TC_AUX_TABLE_ptrs     NULL
#  define TC_AUX_TABLE_lengths  NULL
#endif
#ifndef HAS_LC_AUX_TABLES
#  define LC_AUX_TABLE_ptrs     NULL
#  define LC_AUX_TABLE_lengths  NULL
#endif
#ifndef HAS_CF_AUX_TABLES
#  define CF_AUX_TABLE_ptrs     NULL
#  define CF_AUX_TABLE_lengths  NULL
#endif
2901
50bda2c3
KW
/* Call the function to convert a UTF-8 encoded character to the specified case.
 * Note that there may be more than one character in the result.
 *  'uv' is the code point being converted
 *  's' is a pointer to the first byte of the input character
 *  'd' will be set to the first byte of the string of changed characters.  It
 *      needs to have space for UTF8_MAXBYTES_CASE+1 bytes
 *  'lenp' will be set to the length in bytes of the string of changed characters
 *
 * The functions return the ordinal of the first character in the string of
 * 'd' */
#define CALL_UPPER_CASE(uv, s, d, lenp)                                     \
                _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper,              \
                                              Uppercase_Mapping_invmap,     \
                                              UC_AUX_TABLE_ptrs,            \
                                              UC_AUX_TABLE_lengths,         \
                                              "uppercase")
#define CALL_TITLE_CASE(uv, s, d, lenp)                                     \
                _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle,              \
                                              Titlecase_Mapping_invmap,     \
                                              TC_AUX_TABLE_ptrs,            \
                                              TC_AUX_TABLE_lengths,         \
                                              "titlecase")
#define CALL_LOWER_CASE(uv, s, d, lenp)                                     \
                _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower,              \
                                              Lowercase_Mapping_invmap,     \
                                              LC_AUX_TABLE_ptrs,            \
                                              LC_AUX_TABLE_lengths,         \
                                              "lowercase")


/* This additionally has the input parameter 'specials', which if non-zero will
 * cause this to use the specials hash for folding (meaning get full case
 * folding); otherwise, when zero, this implies a simple case fold.
 *
 * The whole conditional expression is wrapped in parentheses so that the
 * macro expands to a single operand no matter what expression it is used
 * in. */
#define CALL_FOLD_CASE(uv, s, d, lenp, specials)                            \
        ((specials)                                                         \
        ?       _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold,               \
                                              Case_Folding_invmap,          \
                                              CF_AUX_TABLE_ptrs,            \
                                              CF_AUX_TABLE_lengths,         \
                                              "foldcase")                   \
        :       _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold,         \
                                              Simple_Case_Folding_invmap,   \
                                              NULL, NULL,                   \
                                              "foldcase"))
c3fd2246 2945
84afefe6
JH
2946UV
2947Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2948{
a1433954
KW
2949 /* Convert the Unicode character whose ordinal is <c> to its uppercase
2950 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2951 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
2952 * the changed version may be longer than the original character.
2953 *
2954 * The ordinal of the first character of the changed version is returned
2955 * (but note, as explained above, that there may be more.) */
2956
7918f24d
NC
2957 PERL_ARGS_ASSERT_TO_UNI_UPPER;
2958
3a4c58c9
KW
2959 if (c < 256) {
2960 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2961 }
2962
a13f1de4 2963 return CALL_UPPER_CASE(c, NULL, p, lenp);
a0ed51b3
LW
2964}
2965
84afefe6
JH
2966UV
2967Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2968{
7918f24d
NC
2969 PERL_ARGS_ASSERT_TO_UNI_TITLE;
2970
3a4c58c9
KW
2971 if (c < 256) {
2972 return _to_upper_title_latin1((U8) c, p, lenp, 's');
2973 }
2974
a13f1de4 2975 return CALL_TITLE_CASE(c, NULL, p, lenp);
a0ed51b3
LW
2976}
2977
afc16117 2978STATIC U8
eaf412bf 2979S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
afc16117
KW
2980{
2981 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2982 * those, converting the result to UTF-8. Since the result is always just
a1433954 2983 * one character, we allow <p> to be NULL */
afc16117
KW
2984
2985 U8 converted = toLOWER_LATIN1(c);
2986
eaf412bf
KW
2987 PERL_UNUSED_ARG(dummy);
2988
afc16117 2989 if (p != NULL) {
6f2d5cbc 2990 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
2991 *p = converted;
2992 *lenp = 1;
2993 }
2994 else {
430c9760
KW
2995 /* Result is known to always be < 256, so can use the EIGHT_BIT
2996 * macros */
2997 *p = UTF8_EIGHT_BIT_HI(converted);
2998 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
afc16117
KW
2999 *lenp = 2;
3000 }
3001 }
3002 return converted;
3003}
3004
84afefe6
JH
3005UV
3006Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 3007{
7918f24d
NC
3008 PERL_ARGS_ASSERT_TO_UNI_LOWER;
3009
afc16117 3010 if (c < 256) {
eaf412bf 3011 return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
bca00c02
KW
3012 }
3013
a13f1de4 3014 return CALL_LOWER_CASE(c, NULL, p, lenp);
a0ed51b3
LW
3015}
3016
84afefe6 3017UV
7c0ab950 3018Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 3019{
51910141 3020 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 3021 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 3022 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
3023 *
3024 * Not to be used for locale folds
51910141 3025 */
f673fad4 3026
a1dde8de
KW
3027 UV converted;
3028
3029 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
3030
1ca267a5
KW
3031 assert (! (flags & FOLD_FLAGS_LOCALE));
3032
659a7c2d 3033 if (UNLIKELY(c == MICRO_SIGN)) {
a1dde8de
KW
3034 converted = GREEK_SMALL_LETTER_MU;
3035 }
9b63e895
KW
3036#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
3037 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
3038 || UNICODE_DOT_DOT_VERSION > 0)
659a7c2d
KW
3039 else if ( (flags & FOLD_FLAGS_FULL)
3040 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
3041 {
1ca267a5
KW
3042 /* If can't cross 127/128 boundary, can't return "ss"; instead return
3043 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
3044 * under those circumstances. */
3045 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
3046 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
3047 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3048 p, *lenp, U8);
3049 return LATIN_SMALL_LETTER_LONG_S;
3050 }
3051 else {
4f489194
KW
3052 *(p)++ = 's';
3053 *p = 's';
3054 *lenp = 2;
3055 return 's';
1ca267a5 3056 }
a1dde8de 3057 }
9b63e895 3058#endif
a1dde8de
KW
3059 else { /* In this range the fold of all other characters is their lower
3060 case */
3061 converted = toLOWER_LATIN1(c);
3062 }
3063
6f2d5cbc 3064 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
3065 *p = (U8) converted;
3066 *lenp = 1;
3067 }
3068 else {
3069 *(p)++ = UTF8_TWO_BYTE_HI(converted);
3070 *p = UTF8_TWO_BYTE_LO(converted);
3071 *lenp = 2;
3072 }
3073
3074 return converted;
3075}
3076
3077UV
31f05a37 3078Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 3079{
4b593389 3080
a0270393
KW
3081 /* Not currently externally documented, and subject to change
3082 * <flags> bits meanings:
3083 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
3084 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3085 * locale are to be used.
a0270393
KW
3086 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3087 */
4b593389 3088
36bb2ab6 3089 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 3090
780fcc9f 3091 if (flags & FOLD_FLAGS_LOCALE) {
b257a28c
KW
3092 /* Treat a non-Turkic UTF-8 locale as not being in locale at all,
3093 * except for potentially warning */
8b7358b9 3094 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
b257a28c 3095 if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) {
780fcc9f
KW
3096 flags &= ~FOLD_FLAGS_LOCALE;
3097 }
3098 else {
e7b7ac46 3099 goto needs_full_generality;
780fcc9f 3100 }
31f05a37
KW
3101 }
3102
a1dde8de 3103 if (c < 256) {
e7b7ac46 3104 return _to_fold_latin1((U8) c, p, lenp,
31f05a37 3105 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
3106 }
3107
2f306ab9 3108 /* Here, above 255. If no special needs, just use the macro */
a0270393 3109 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
a13f1de4 3110 return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
a0270393 3111 }
567b353c 3112 else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
a0270393
KW
3113 the special flags. */
3114 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
3115
3116 needs_full_generality:
a0270393 3117 uvchr_to_utf8(utf8_c, c);
56576a04
KW
3118 return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c),
3119 p, lenp, flags);
a0270393 3120 }
84afefe6
JH
3121}
3122
26483009 3123PERL_STATIC_INLINE bool
dd1a3ba7
KW
3124S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e,
3125 SV* const invlist)
da8c1a98
KW
3126{
3127 /* returns a boolean giving whether or not the UTF8-encoded character that
eb1f4bb4
KW
3128 * starts at <p>, and extending no further than <e - 1> is in the inversion
3129 * list <invlist>. */
da8c1a98 3130
b68ffe0c
KW
3131 UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
3132
dd1a3ba7 3133 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
da8c1a98 3134
b68ffe0c 3135 if (cp == 0 && (p >= e || *p != '\0')) {
da8c1a98
KW
3136 _force_out_malformed_utf8_message(p, e, 0, 1);
3137 NOT_REACHED; /* NOTREACHED */
3138 }
3139
eb1f4bb4 3140 assert(invlist);
b68ffe0c 3141 return _invlist_contains_cp(invlist, cp);
da8c1a98
KW
3142}
3143
#if 0 /* Not currently used, but may be needed in the future */
PERLVAR(I, seen_deprecated_macro, HV *)

STATIC void
S_warn_on_first_deprecated_use(pTHX_ const char * const name,
                                     const char * const alternative,
                                     const bool use_locale,
                                     const char * const file,
                                     const unsigned line)
{
    /* Raises a deprecation warning about 'name', suggesting 'alternative', the
     * first time it is called for a given combination of 'name', 'use_locale',
     * 'file', and 'line'.  The combinations already seen are remembered as
     * keys in the hash PL_seen_deprecated_macro. */

    const char * key;

    PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;

    if (! ckWARN_d(WARN_DEPRECATED)) {
        return;
    }

    key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);

    /* Already warned about this one */
    if (hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
        return;
    }

    /* Record it, creating the hash on first use */
    if (! PL_seen_deprecated_macro) {
        PL_seen_deprecated_macro = newHV();
    }
    if (! hv_store(PL_seen_deprecated_macro, key,
                   strlen(key), &PL_sv_undef, 0))
    {
        Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
    }

    if (instr(file, "mathoms.c")) {
        Perl_warner(aTHX_ WARN_DEPRECATED,
                    "In %s, line %d, starting in Perl v5.32, %s()"
                    " will be removed. Avoid this message by"
                    " converting to use %s().\n",
                    file, line, name, alternative);
    }
    else {
        Perl_warner(aTHX_ WARN_DEPRECATED,
                    "In %s, line %d, starting in Perl v5.32, %s() will"
                    " require an additional parameter. Avoid this"
                    " message by converting to use %s().\n",
                    file, line, name, alternative);
    }
}
#endif
922e8cb4
KW
3189
3190bool
dd1a3ba7 3191Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
da8c1a98 3192{
dd1a3ba7 3193 PERL_ARGS_ASSERT__IS_UTF8_FOO;
da8c1a98 3194
dd1a3ba7 3195 return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]);
da8c1a98
KW
3196}
3197
3198bool
dd1a3ba7 3199Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e)
da8c1a98 3200{
dd1a3ba7 3201 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
da8c1a98 3202
dd1a3ba7 3203 return is_utf8_common(p, e, PL_utf8_perl_idstart);
da8c1a98
KW
3204}
3205
3206bool
dd1a3ba7 3207Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
c11ff943 3208{
dd1a3ba7 3209 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
7dbf68d2 3210
dd1a3ba7 3211 return is_utf8_common(p, e, PL_utf8_perl_idcont);
7dbf68d2
KW
3212}
3213
6a4a25f4 3214STATIC UV
30613bdc
KW
3215S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
3216 U8* ustrp, STRLEN *lenp,
40d2776f
KW
3217 SV *invlist, const I32 * const invmap,
3218 const U32 * const * const aux_tables,
30613bdc
KW
3219 const U8 * const aux_table_lengths,
3220 const char * const normal)
b9992569 3221{
0134edef 3222 STRLEN len = 0;
7918f24d 3223
30613bdc
KW
3224 /* Change the case of code point 'uv1' whose UTF-8 representation (assumed
3225 * by this routine to be valid) begins at 'p'. 'normal' is a string to use
3226 * to name the new case in any generated messages, as a fallback if the
3227 * operation being used is not available. The new case is given by the
3228 * data structures in the remaining arguments.
3229 *
3230 * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
3231 * entire changed case string, and the return value is the first code point
3232 * in that string */
3233
b9992569 3234 PERL_ARGS_ASSERT__TO_UTF8_CASE;
7918f24d 3235
36eaa811
KW
3236 /* For code points that don't change case, we already know that the output
3237 * of this function is the unchanged input, so we can skip doing look-ups
3238 * for them. Unfortunately the case-changing code points are scattered
3239 * around. But there are some long consecutive ranges where there are no
3240 * case changing code points. By adding tests, we can eliminate the lookup
3241 * for all the ones in such ranges. This is currently done here only for
3242 * just a few cases where the scripts are in common use in modern commerce
3243 * (and scripts adjacent to those which can be included without additional
3244 * tests). */
3245
3246 if (uv1 >= 0x0590) {
3247 /* This keeps from needing further processing the code points most
3248 * likely to be used in the following non-cased scripts: Hebrew,
3249 * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
3250 * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
3251 * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
3252 if (uv1 < 0x10A0) {
3253 goto cases_to_self;
3254 }
3255
3256 /* The following largish code point ranges also don't have case
3257 * changes, but khw didn't think they warranted extra tests to speed
3258 * them up (which would slightly slow down everything else above them):
3259 * 1100..139F Hangul Jamo, Ethiopic
3260 * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
3261 * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
3262 * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
3263 * Combining Diacritical Marks Extended, Balinese,
3264 * Sundanese, Batak, Lepcha, Ol Chiki
3265 * 2000..206F General Punctuation
3266 */
3267
3268 if (uv1 >= 0x2D30) {
3269
3270 /* This keeps the from needing further processing the code points
3271 * most likely to be used in the following non-cased major scripts:
3272 * CJK, Katakana, Hiragana, plus some less-likely scripts.
3273 *
3274 * (0x2D30 above might have to be changed to 2F00 in the unlikely
3275 * event that Unicode eventually allocates the unused block as of
3276 * v8.0 2FE0..2FEF to code points that are cased. khw has verified
3277 * that the test suite will start having failures to alert you
3278 * should that happen) */
3279 if (uv1 < 0xA640) {
3280 goto cases_to_self;
3281 }
3282
3283 if (uv1 >= 0xAC00) {
3284 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
5af9bc97
KW
3285 if (ckWARN_d(WARN_SURROGATE)) {
3286 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3287 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
56576a04
KW
3288 "Operation \"%s\" returns its argument for"
3289 " UTF-16 surrogate U+%04" UVXf, desc, uv1);
5af9bc97
KW
3290 }
3291 goto cases_to_self;
3292 }
36eaa811
KW
3293
3294 /* AC00..FAFF Catches Hangul syllables and private use, plus
3295 * some others */
3296 if (uv1 < 0xFB00) {
3297 goto cases_to_self;
36eaa811
KW
3298 }
3299
5af9bc97 3300 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
40606899 3301 if (UNLIKELY(uv1 > MAX_LEGAL_CP)) {
fb2f0a6a 3302 Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv1));
5af9bc97
KW
3303 }
3304 if (ckWARN_d(WARN_NON_UNICODE)) {
3305 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3306 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
56576a04
KW
3307 "Operation \"%s\" returns its argument for"
3308 " non-Unicode code point 0x%04" UVXf, desc, uv1);
5af9bc97
KW
3309 }
3310 goto cases_to_self;
3311 }
3bfc1e70
KW
3312#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
3313 if (UNLIKELY(uv1
3314 > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
3315 {
3316
3bfc1e70
KW
3317 goto cases_to_self;
3318 }
3319#endif
36eaa811
KW
3320 }
3321 }
9ae3ac1a 3322
36eaa811 3323 /* Note that non-characters are perfectly legal, so no warning should
8946fcd9 3324 * be given. */
9ae3ac1a
KW
3325 }
3326
8946fcd9
KW
3327 {
3328 unsigned int i;
22197324 3329 const U32 * cp_list;
8946fcd9 3330 U8 * d;
69352d88
KW
3331
3332 /* 'index' is guaranteed to be non-negative, as this is an inversion
3333 * map that covers all possible inputs. See [perl #133365] */
8946fcd9 3334 SSize_t index = _invlist_search(invlist, uv1);
22197324 3335 I32 base = invmap[index];
0134edef 3336
30613bdc
KW
3337 /* The data structures are set up so that if 'base' is non-negative,
3338 * the case change is 1-to-1; and if 0, the change is to itself */
8946fcd9
KW
3339 if (base >= 0) {
3340 IV lc;
b08cf34e 3341
8946fcd9
KW
3342 if (base == 0) {
3343 goto cases_to_self;
4a8240a3 3344 }
4a8240a3 3345
30613bdc 3346 /* This computes, e.g. lc(H) as 'H - A + a', using the lc table */
8946fcd9
KW
3347 lc = base + uv1 - invlist_array(invlist)[index];
3348 *lenp = uvchr_to_utf8(ustrp, lc) - ustrp;
3349 return lc;
3350 }
1feea2c7 3351
30613bdc
KW
3352 /* Here 'base' is negative. That means the mapping is 1-to-many, and
3353 * requires an auxiliary table look up. abs(base) gives the index into
3354 * a list of such tables which points to the proper aux table. And a
3355 * parallel list gives the length of each corresponding aux table. */
8946fcd9 3356 cp_list = aux_tables[-base];
30613bdc
KW
3357
3358 /* Create the string of UTF-8 from the mapped-to code points */
8946fcd9
KW
3359 d = ustrp;
3360 for (i = 0; i < aux_table_lengths[-base]; i++) {
3361 d = uvchr_to_utf8(d, cp_list[i]);
cbe07460 3362 }
8946fcd9
KW
3363 *d = '\0';
3364 *lenp = d - ustrp;
3365
3366 return cp_list[0];
cbe07460
KW
3367 }
3368
3369 /* Here, there was no mapping defined, which means that the code point maps
3370 * to itself. Return the inputs */
e24dfe9c 3371 cases_to_self:
a13f1de4
KW
3372 if (p) {
3373 len = UTF8SKIP(p);
3374 if (p != ustrp) { /* Don't copy onto itself */
3375 Copy(p, ustrp, len, U8);
3376 }
3377 *lenp = len;
3378 }
3379 else {
3380 *lenp = uvchr_to_utf8(ustrp, uv1) - ustrp;
ca9fab46 3381 }
2a37f04d 3382
f4cd282c 3383 return uv1;
cbe07460 3384
a0ed51b3
LW
3385}
3386
b74fe592 3387Size_t
1b292063 3388Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to,
40d2776f 3389 const U32 ** remaining_folds_to)
b74fe592
KW
3390{
3391 /* Returns the count of the number of code points that fold to the input
3392 * 'cp' (besides itself).
3393 *
3394 * If the return is 0, there is nothing else that folds to it, and
3395 * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
3396 *
3397 * If the return is 1, '*first_folds_to' is set to the single code point,
3398 * and '*remaining_folds_to' is set to NULL.
3399 *
3400 * Otherwise, '*first_folds_to' is set to a code point, and
3401 * '*remaining_fold_to' is set to an array that contains the others. The
3402 * length of this array is the returned count minus 1.
3403 *
3404 * The reason for this convolution is to avoid having to deal with
3405 * allocating and freeing memory. The lists are already constructed, so
3406 * the return can point to them, but single code points aren't, so would
1b292063
KW
3407 * need to be constructed if we didn't employ something like this API
3408 *
3409 * The code points returned by this function are all legal Unicode, which
3410 * occupy at most 21 bits, and so a U32 is sufficient, and the lists are
3411 * constructed with this size (to save space and memory), and we return
3412 * pointers, so they must be this size */
b74fe592 3413
69352d88
KW
3414 /* 'index' is guaranteed to be non-negative, as this is an inversion map
3415 * that covers all possible inputs. See [perl #133365] */
b74fe592 3416 SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
40d2776f 3417 I32 base = _Perl_IVCF_invmap[index];
b74fe592
KW
3418
3419 PERL_ARGS_ASSERT__INVERSE_FOLDS;
3420
3421 if (base == 0) { /* No fold */
3422 *first_folds_to = 0;
3423 *remaining_folds_to = NULL;
3424 return 0;
3425 }
3426
3427#ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */
3428
3429 assert(base > 0);
3430
3431#else
3432
3433 if (UNLIKELY(base < 0)) { /* Folds to more than one character */
3434
3435 /* The data structure is set up so that the absolute value of 'base' is
3436 * an index into a table of pointers to arrays, with the array
3437 * corresponding to the index being the list of code points that fold
3438 * to 'cp', and the parallel array containing the length of the list
3439 * array */
3440 *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
99f30495
KW
3441 *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1;
3442 /* +1 excludes first_folds_to */
b74fe592
KW
3443 return IVCF_AUX_TABLE_lengths[-base];
3444 }
3445
3446#endif
3447
3448 /* Only the single code point. This works like 'fc(G) = G - A + a' */
40d2776f
KW
3449 *first_folds_to = (U32) (base + cp
3450 - invlist_array(PL_utf8_foldclosures)[index]);
b74fe592
KW
3451 *remaining_folds_to = NULL;
3452 return 1;
3453}
3454
051a06d4 3455STATIC UV
56576a04
KW
3456S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
3457 U8* const ustrp, STRLEN *lenp)
051a06d4 3458{
4a4088c4 3459 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
3460 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
3461 * result contains a character that crosses the 255/256 boundary, disallow
3462 * the change, and return the original code point. See L<perlfunc/lc> for
3463 * why;
051a06d4 3464 *
a1433954
KW
3465 * p points to the original string whose case was changed; assumed
3466 * by this routine to be well-formed
051a06d4 3467 * result the code point of the first character in the changed-case string
56576a04
KW
3468 * ustrp points to the changed-case string (<result> represents its
3469 * first char)
051a06d4
KW
3470 * lenp points to the length of <ustrp> */
3471
3472 UV original; /* To store the first code point of <p> */
3473
3474 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3475
a4f12ed7 3476 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
3477
3478 /* We know immediately if the first character in the string crosses the
5e45c680 3479 * boundary, so can skip testing */
051a06d4
KW
3480 if (result > 255) {
3481
3482 /* Look at every character in the result; if any cross the
3483 * boundary, the whole thing is disallowed */
3484 U8* s = ustrp + UTF8SKIP(ustrp);
3485 U8* e = ustrp + *lenp;
3486 while (s < e) {
a4f12ed7 3487 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
3488 goto bad_crossing;
3489 }
3490 s += UTF8SKIP(s);
3491 }
3492
613abc6d
KW
3493 /* Here, no characters crossed, result is ok as-is, but we warn. */
3494 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
051a06d4
KW
3495 return result;
3496 }
3497
7b52d656 3498 bad_crossing:
051a06d4
KW
3499
3500 /* Failed, have to return the original */
4b88fb76 3501 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
3502
3503 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3504 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
56576a04
KW
3505 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
3506 " locale; resolved to \"\\x{%" UVXf "}\".",
357aadde 3507 OP_DESC(PL_op),
ab0b796c
KW
3508 original,
3509 original);
051a06d4
KW
3510 Copy(p, ustrp, *lenp, char);
3511 return original;
3512}
3513
b257a28c
KW
3514STATIC UV
3515S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
3516 U8 * ustrp, STRLEN *lenp)
3517{
3518 /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
3519 * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3520 * Otherwise, it returns the first code point of the Turkic foldcased
3521 * sequence, and the entire sequence will be stored in *ustrp. ustrp will
3522 * contain *lenp bytes
3523 *
3524 * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3525 * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3526 * DOTLESS I */
3527
3528 PERL_ARGS_ASSERT_TURKIC_FC;
3529 assert(e > p);
3530
3531 if (UNLIKELY(*p == 'I')) {
3532 *lenp = 2;
3533 ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
3534 ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
3535 return LATIN_SMALL_LETTER_DOTLESS_I;
3536 }
3537
3538 if (UNLIKELY(memBEGINs(p, e - p,
3539 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
3540 {
3541 *lenp = 1;
3542 *ustrp = 'i';
3543 return 'i';
3544 }
3545
3546 return 0;
3547}
3548
3549STATIC UV
3550S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
3551 U8 * ustrp, STRLEN *lenp)
3552{
3553 /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
3554 * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3555 * Otherwise, it returns the first code point of the Turkic lowercased
3556 * sequence, and the entire sequence will be stored in *ustrp. ustrp will
3557 * contain *lenp bytes */
3558
3559 PERL_ARGS_ASSERT_TURKIC_LC;
3560 assert(e > p0);
3561
3562 /* A 'I' requires context as to what to do */
3563 if (UNLIKELY(*p0 == 'I')) {
3564 const U8 * p = p0 + 1;
3565
3566 /* According to the Unicode SpecialCasing.txt file, a capital 'I'
3567 * modified by a dot above lowercases to 'i' even in turkic locales. */
3568 while (p < e) {
3569 UV cp;
3570
3571 if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
3572 ustrp[0] = 'i';
3573 *lenp = 1;
3574 return 'i';
3575 }
3576
3577 /* For the dot above to modify the 'I', it must be part of a
3578 * combining sequence immediately following the 'I', and no other
3579 * modifier with a ccc of 230 may intervene */
3580 cp = utf8_to_uvchr_buf(p, e, NULL);
3581 if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
3582 break;
3583 }
3584
3585 /* Here the combining sequence continues */
3586 p += UTF8SKIP(p);
3587 }
3588 }
3589
3590 /* In all other cases the lc is the same as the fold */
3591 return turkic_fc(p0, e, ustrp, lenp);
3592}
3593
3594STATIC UV
3595S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
3596 U8 * ustrp, STRLEN *lenp)
3597{
3598 /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
3599 * from p0..e-1 according to Turkic rules is the same as for non-Turkic.
3600 * Otherwise, it returns the first code point of the Turkic upper or
3601 * title-cased sequence, and the entire sequence will be stored in *ustrp.
3602 * ustrp will contain *lenp bytes
3603 *
3604 * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
a3815e44 3605 * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
b257a28c
KW
3606 * DOTLESS I */
3607
3608 PERL_ARGS_ASSERT_TURKIC_UC;
3609 assert(e > p);
3610
3611 if (*p == 'i') {
3612 *lenp = 2;
3613 ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3614 ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3615 return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
3616 }
3617
3618 if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
3619 *lenp = 1;
3620 *ustrp = 'I';
3621 return 'I';
3622 }
3623
3624 return 0;
3625}
3626
eaf412bf
KW
3627/* The process for changing the case is essentially the same for the four case
3628 * change types, except there are complications for folding. Otherwise the
3629 * difference is only which case to change to. To make sure that they all do
3630 * the same thing, the bodies of the functions are extracted out into the
3631 * following two macros. The functions are written with the same variable
3632 * names, and these are known and used inside these macros. It would be
3633 * better, of course, to have inline functions to do it, but since different
3634 * macros are called, depending on which case is being changed to, this is not
3635 * feasible in C (to khw's knowledge). Two macros are created so that the fold
3636 * function can start with the common start macro, then finish with its special
3637 * handling; while the other three cases can just use the common end macro.
3638 *
3639 * The algorithm is to use the proper (passed in) macro or function to change
3640 * the case for code points that are below 256. The macro is used if using
3641 * locale rules for the case change; the function if not. If the code point is
3642 * above 255, it is computed from the input UTF-8, and another macro is called
3643 * to do the conversion. If necessary, the output is converted to UTF-8. If
3644 * using a locale, we have to check that the change did not cross the 255/256
3645 * boundary, see check_locale_boundary_crossing() for further details.
3646 *
3647 * The macros are split with the correct case change for the below-256 case
3648 * stored into 'result', and in the middle of an else clause for the above-255
3649 * case. At that point in the 'else', 'result' is not the final result, but is
3650 * the input code point calculated from the UTF-8. The fold code needs to
3651 * realize all this and take it from there.
3652 *
b257a28c
KW
3653 * To deal with Turkic locales, the function specified by the parameter
3654 * 'turkic' is called when appropriate.
3655 *
eaf412bf
KW
3656 * If you read the two macros as sequential, it's easier to understand what's
3657 * going on. */
/* First half of the shared case-changing body.  It expects the expanding
 * function to have variables named 'flags', 'p', 'e', 'ustrp', 'lenp' and
 * 'result' in scope.  Code points below 256 are fully dealt with here, either
 * by returning from the function or by setting 'result'.  For anything else
 * the expansion stops INSIDE an unclosed 'else' block, with 'result' holding
 * the input code point; whatever follows the macro has to finish that block.
 *
 *  locale_flags         the bit(s) in 'flags' that request locale rules
 *  LC_L1_change_macro   changes the case of a below-256 code point using
 *                       locale rules
 *  L1_func              changes the case of a below-256 code point without
 *                       regard to locale
 *  L1_func_extra_param  passed through as the final argument to L1_func
 *  turkic               function tried first when in a Turkic UTF-8 locale
 */
#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func,    \
                               L1_func_extra_param, turkic)                  \
                                                                             \
    if (flags & (locale_flags)) {                                            \
        _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                  \
        if (IN_UTF8_CTYPE_LOCALE) {                                          \
            if (UNLIKELY(PL_in_utf8_turkic_locale)) {                        \
                const UV turkic_result = turkic(p, e, ustrp, lenp);          \
                if (turkic_result) return turkic_result;                     \
            }                                                                \
                                                                             \
            /* Apart from the Turkic special cases, being in a UTF-8         \
             * locale is handled as if not in a locale at all */             \
            flags &= ~(locale_flags);                                        \
        }                                                                    \
    }                                                                        \
                                                                             \
    if (UTF8_IS_INVARIANT(*p)) {                                             \
        if (! (flags & (locale_flags))) {                                    \
            return L1_func(*p, ustrp, lenp, L1_func_extra_param);            \
        }                                                                    \
        result = LC_L1_change_macro(*p);                                     \
    }                                                                        \
    else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {                        \
        const U8 latin1_char = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));         \
        if (! (flags & (locale_flags))) {                                    \
            return L1_func(latin1_char, ustrp, lenp, L1_func_extra_param);   \
        }                                                                    \
        result = LC_L1_change_macro(latin1_char);                            \
    }                                                                        \
    else {  /* malformed UTF-8 or ord above 255 */                           \
        STRLEN len_result;                                                   \
        result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY);     \
        if (len_result == (STRLEN) -1) {                                     \
            _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ );        \
        }
eaf412bf
KW
3698
3699#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
3700 result = change_macro(result, p, ustrp, lenp); \
3701 \
3702 if (flags & (locale_flags)) { \
3703 result = check_locale_boundary_crossing(p, result, ustrp, lenp); \