This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
locale.c: Add fallbacks if no mbtowc()
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
a0ed51b3 35
806547a7 36static const char malformed_text[] = "Malformed UTF-8 character";
27da23d5 37static const char unees[] =
806547a7 38 "Malformed UTF-8 character (unexpected end of string)";
fb7e7255 39
48ef279e 40/*
7fefc6c1 41These are various utility functions for manipulating UTF8-encoded
72d33970 42strings. For the uninitiated, this is a method of representing arbitrary
61296642 43Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
44characters in the ASCII range are unmodified, and a zero byte never appears
45within non-zero characters.
eaf7a4d2
CS
46*/
47
dd051059
DM
48/* helper for Perl__force_out_malformed_utf8_message(). Like
49 * SAVECOMPILEWARNINGS(), but works with PL_curcop rather than
50 * PL_compiling */
51
52static void
53S_restore_cop_warnings(pTHX_ void *p)
54{
1943af61 55 free_and_set_cop_warnings(PL_curcop, (STRLEN*) p);
dd051059
DM
56}
57
58
9cbfb8ab
KW
59void
60Perl__force_out_malformed_utf8_message(pTHX_
61 const U8 *const p, /* First byte in UTF-8 sequence */
62 const U8 * const e, /* Final byte in sequence (may include
63 multiple chars */
64 const U32 flags, /* Flags to pass to utf8n_to_uvchr(),
65 usually 0, or some DISALLOW flags */
66 const bool die_here) /* If TRUE, this function does not return */
67{
68 /* This core-only function is to be called when a malformed UTF-8 character
69 * is found, in order to output the detailed information about the
70 * malformation before dieing. The reason it exists is for the occasions
71 * when such a malformation is fatal, but warnings might be turned off, so
72 * that normally they would not be actually output. This ensures that they
73 * do get output. Because a sequence may be malformed in more than one
74 * way, multiple messages may be generated, so we can't make them fatal, as
75 * that would cause the first one to die.
76 *
77 * Instead we pretend -W was passed to perl, then die afterwards. The
78 * flexibility is here to return to the caller so they can finish up and
79 * die themselves */
80 U32 errors;
81
82 PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
83
84 ENTER;
c15a80f3 85 SAVEI8(PL_dowarn);
9cbfb8ab
KW
86 SAVESPTR(PL_curcop);
87
88 PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
89 if (PL_curcop) {
dd051059
DM
90 /* this is like SAVECOMPILEWARNINGS() except with PL_curcop rather
91 * than PL_compiling */
92 SAVEDESTRUCTOR_X(S_restore_cop_warnings,
93 (void*)PL_curcop->cop_warnings);
9cbfb8ab
KW
94 PL_curcop->cop_warnings = pWARN_ALL;
95 }
96
97 (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
98
99 LEAVE;
100
101 if (! errors) {
1604cfb0 102 Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
9cbfb8ab
KW
103 " be called only when there are errors found");
104 }
105
106 if (die_here) {
107 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
108 }
109}
110
bb07812e
KW
111STATIC HV *
112S_new_msg_hv(pTHX_ const char * const message, /* The message text */
113 U32 categories, /* Packed warning categories */
114 U32 flag) /* Flag associated with this message */
115{
116 /* Creates, populates, and returns an HV* that describes an error message
117 * for the translators between UTF8 and code point */
118
119 SV* msg_sv = newSVpv(message, 0);
120 SV* category_sv = newSVuv(categories);
121 SV* flag_bit_sv = newSVuv(flag);
122
123 HV* msg_hv = newHV();
124
125 PERL_ARGS_ASSERT_NEW_MSG_HV;
126
2b672cf5
KW
127 (void) hv_stores(msg_hv, "text", msg_sv);
128 (void) hv_stores(msg_hv, "warn_categories", category_sv);
129 (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv);
bb07812e
KW
130
131 return msg_hv;
132}
133
eaf7a4d2 134/*
378516de 135=for apidoc uvoffuni_to_utf8_flags
eebe1485 136
a27992cc 137THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
09232555
KW
138Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or
139L<perlapi/uvchr_to_utf8_flags>>.
a27992cc 140
de69f3af
KW
141This function is like them, but the input is a strict Unicode
142(as opposed to native) code point. Only in very rare circumstances should code
143not be using the native code point.
949cf498 144
09232555 145For details, see the description for L<perlapi/uvchr_to_utf8_flags>.
949cf498 146
eebe1485
SC
147=cut
148*/
149
33f38593
KW
150U8 *
151Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
152{
153 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
154
155 return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
156}
157
c94c2f39
KW
158/* All these formats take a single UV code point argument */
159const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
160const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
161 " is not recommended for open interchange";
162const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
163 " may not be portable";
c94c2f39 164
ba6ed43c
KW
165/* Use shorter names internally in this file */
166#define SHIFT UTF_ACCUMULATION_SHIFT
167#undef MARK
168#define MARK UTF_CONTINUATION_MARK
169#define MASK UTF_CONTINUATION_MASK
170
33f38593
KW
171/*
172=for apidoc uvchr_to_utf8_flags_msgs
173
174THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
175
176Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
177
178This function is for code that wants any warning and/or error messages to be
179returned to the caller rather than be displayed. All messages that would have
884a31ee 180been displayed if all lexical warnings are enabled will be returned.
33f38593
KW
181
182It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
183placed after all the others, C<msgs>. If this parameter is 0, this function
184behaves identically to C<L</uvchr_to_utf8_flags>>. Otherwise, C<msgs> should
185be a pointer to an C<HV *> variable, in which this function creates a new HV to
186contain any appropriate messages. The hash has three key-value pairs, as
187follows:
188
189=over 4
190
191=item C<text>
192
193The text of the message as a C<SVpv>.
194
195=item C<warn_categories>
196
197The warning category (or categories) packed into a C<SVuv>.
198
199=item C<flag>
200
201A single flag bit associated with this message, in a C<SVuv>.
202The bit corresponds to some bit in the C<*errors> return value,
203such as C<UNICODE_GOT_SURROGATE>.
204
205=back
206
207It's important to note that specifying this parameter as non-null will cause
208any warnings this function would otherwise generate to be suppressed, and
209instead be placed in C<*msgs>. The caller can check the lexical warnings state
210(or not) when choosing what to do with the returned messages.
211
212The caller, of course, is responsible for freeing any returned HV.
213
214=cut
215*/
216
217/* Undocumented; we don't want people using this. Instead they should use
218 * uvchr_to_utf8_flags_msgs() */
dfe13c55 219U8 *
b3501144 220Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, UV flags, HV** msgs)
a0ed51b3 221{
e932a5ed
KW
222 U8 *p;
223 UV shifted_uv = input_uv;
224 STRLEN utf8_skip = OFFUNISKIP(input_uv);
225
33f38593
KW
226 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
227
228 if (msgs) {
229 *msgs = NULL;
230 }
7918f24d 231
e932a5ed
KW
232 switch (utf8_skip) {
233 case 1:
3e3eb1f6 234 *d++ = LATIN1_TO_NATIVE(input_uv);
1604cfb0 235 return d;
facc1dc2 236
e932a5ed
KW
237 default:
238 if ( UNLIKELY(input_uv > MAX_LEGAL_CP
239 && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX))))
240 {
dc1baa35
KW
241 Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, /* Hex output */
242 NULL, 0, input_uv));
e932a5ed 243 }
d9432125 244
b3501144
KW
245 if ((flags & (UNICODE_WARN_PERL_EXTENDED|UNICODE_WARN_SUPER))) {
246 U32 category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
247 const char * format = PL_extended_cp_format;
248 if (msgs) {
249 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
250 category,
251 UNICODE_GOT_PERL_EXTENDED);
252 }
253 else {
254 Perl_ck_warner_d(aTHX_ category, format, input_uv);
255 }
256
257 /* Don't output a 2nd msg */
258 flags &= ~UNICODE_WARN_SUPER;
259 }
260
261 if (flags & UNICODE_DISALLOW_PERL_EXTENDED) {
262 return NULL;
263 }
264
e932a5ed 265 p = d + utf8_skip - 1;
b3501144 266 while (p >= d + 6 + ONE_IF_EBCDIC_ZERO_IF_NOT) {
e932a5ed
KW
267 *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
268 shifted_uv >>= SHIFT;
ba6ed43c 269 }
ba6ed43c 270
e932a5ed 271 /* FALLTHROUGH */
ba6ed43c 272
b3501144
KW
273 case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:
274 d[5 + ONE_IF_EBCDIC_ZERO_IF_NOT]
275 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
276 shifted_uv >>= SHIFT;
277 /* FALLTHROUGH */
278
279 case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:
280 d[4 + ONE_IF_EBCDIC_ZERO_IF_NOT]
281 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
282 shifted_uv >>= SHIFT;
283 /* FALLTHROUGH */
284
bc658500 285 case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
e932a5ed 286 if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) {
b3501144 287 if (flags & UNICODE_WARN_SUPER) {
e932a5ed 288 U32 category = packWARN(WARN_NON_UNICODE);
b3501144 289 const char * format = super_cp_format;
a5bf80e0 290
e932a5ed
KW
291 if (msgs) {
292 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
b3501144
KW
293 category,
294 UNICODE_GOT_SUPER);
e932a5ed 295 }
b3501144
KW
296 else {
297 Perl_ck_warner_d(aTHX_ category, format, input_uv);
298 }
299
300 if (flags & UNICODE_DISALLOW_SUPER) {
301 return NULL;
e932a5ed 302 }
33f38593 303 }
e932a5ed
KW
304 if ( (flags & UNICODE_DISALLOW_SUPER)
305 || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
306 && UNICODE_IS_PERL_EXTENDED(input_uv)))
dc4a6683 307 {
e932a5ed 308 return NULL;
33f38593 309 }
a5bf80e0 310 }
d9432125 311
bc658500
KW
312 d[3 + ONE_IF_EBCDIC_ZERO_IF_NOT]
313 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
e932a5ed
KW
314 shifted_uv >>= SHIFT;
315 /* FALLTHROUGH */
316
bc658500 317 case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
8010ec0d 318 if (isUNICODE_POSSIBLY_PROBLEMATIC(input_uv)) {
e932a5ed 319 if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) {
473bd793
KW
320 if (flags & UNICODE_WARN_NONCHAR) {
321 U32 category = packWARN(WARN_NONCHAR);
322 const char * format = nonchar_cp_format;
323 if (msgs) {
324 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
325 category,
326 UNICODE_GOT_NONCHAR);
327 }
328 else {
329 Perl_ck_warner_d(aTHX_ category, format, input_uv);
330 }
331 }
332 if (flags & UNICODE_DISALLOW_NONCHAR) {
333 return NULL;
334 }
e932a5ed
KW
335 }
336 else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
473bd793
KW
337 if (flags & UNICODE_WARN_SURROGATE) {
338 U32 category = packWARN(WARN_SURROGATE);
339 const char * format = surrogate_cp_format;
340 if (msgs) {
341 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
342 category,
343 UNICODE_GOT_SURROGATE);
344 }
345 else {
346 Perl_ck_warner_d(aTHX_ category, format, input_uv);
347 }
348 }
349 if (flags & UNICODE_DISALLOW_SURROGATE) {
350 return NULL;
351 }
e932a5ed 352 }
ba6ed43c 353 }
ba6ed43c 354
bc658500
KW
355 d[2 + ONE_IF_EBCDIC_ZERO_IF_NOT]
356 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
357 shifted_uv >>= SHIFT;
358 /* FALLTHROUGH */
359
360#ifdef EBCDIC
361
362 case 3:
e932a5ed
KW
363 d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
364 shifted_uv >>= SHIFT;
365 /* FALLTHROUGH */
ba6ed43c 366
bc658500
KW
367#endif
368
369 /* FALLTHROUGH */
e932a5ed
KW
370 case 2:
371 d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
372 shifted_uv >>= SHIFT;
373 d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip))
374 | UTF_START_MARK(utf8_skip));
375 break;
1d72bdf6 376 }
e932a5ed
KW
377
378 return d + utf8_skip;
a0ed51b3 379}
a5bf80e0 380
646ca15d 381/*
07693fe6
KW
382=for apidoc uvchr_to_utf8
383
bcb1a2d4 384Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 385of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
386C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
387the byte after the end of the new character. In other words,
07693fe6
KW
388
389 d = uvchr_to_utf8(d, uv);
390
391is the recommended wide native character-aware way of saying
392
393 *(d++) = uv;
394
d22ec717
KW
395This function accepts any code point from 0..C<IV_MAX> as input.
396C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
760c7c2f
KW
397
398It is possible to forbid or warn on non-Unicode code points, or those that may
399be problematic by using L</uvchr_to_utf8_flags>.
de69f3af 400
07693fe6
KW
401=cut
402*/
403
de69f3af
KW
404/* This is also a macro */
405PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
406
07693fe6
KW
407U8 *
408Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
409{
de69f3af 410 return uvchr_to_utf8(d, uv);
07693fe6
KW
411}
412
de69f3af
KW
413/*
414=for apidoc uvchr_to_utf8_flags
415
416Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 417of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
418C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
419the byte after the end of the new character. In other words,
de69f3af
KW
420
421 d = uvchr_to_utf8_flags(d, uv, flags);
422
423or, in most cases,
424
425 d = uvchr_to_utf8_flags(d, uv, 0);
426
427This is the Unicode-aware way of saying
428
429 *(d++) = uv;
430
d22ec717
KW
431If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
432input. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
760c7c2f
KW
433
434Specifying C<flags> can further restrict what is allowed and not warned on, as
435follows:
de69f3af 436
796b6530 437If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
7ee537e6
KW
438the function will raise a warning, provided UTF8 warnings are enabled. If
439instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
440NULL. If both flags are set, the function will both warn and return NULL.
de69f3af 441
760c7c2f
KW
442Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
443affect how the function handles a Unicode non-character.
93e6dbd6 444
760c7c2f
KW
445And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
446affect the handling of code points that are above the Unicode maximum of
4470x10FFFF. Languages other than Perl may not be able to accept files that
448contain these.
93e6dbd6
KW
449
450The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
451the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
ecc1615f
KW
452three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
453allowed inputs to the strict UTF-8 traditionally defined by Unicode.
454Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
455C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
456above-Unicode and surrogate flags, but not the non-character ones, as
457defined in
e2176993 458L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
ecc1615f 459See L<perlunicode/Noncharacter code points>.
93e6dbd6 460
57ff5f59
KW
461Extremely high code points were never specified in any standard, and require an
462extension to UTF-8 to express, which Perl does. It is likely that programs
463written in something other than Perl would not be able to read files that
464contain these; nor would Perl understand files written by something that uses a
465different extension. For these reasons, there is a separate set of flags that
466can warn and/or disallow these extremely high code points, even if other
467above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
468and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
eb992c6f 469C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
57ff5f59
KW
470treat all above-Unicode code points, including these, as malformations. (Note
471that the Unicode standard considers anything above 0x10FFFF to be illegal, but
472there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
473
474A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
475retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly,
476C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
7c4a22ed
KW
477C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because on EBCDIC
478platforms,these flags can apply to code points that actually do fit in 31 bits.
479The new names accurately describe the situation in all cases.
de69f3af 480
d145625f
KW
481=for apidoc Amnh||UNICODE_DISALLOW_ABOVE_31_BIT
482=for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE
483=for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_INTERCHANGE
484=for apidoc Amnh||UNICODE_DISALLOW_NONCHAR
485=for apidoc Amnh||UNICODE_DISALLOW_PERL_EXTENDED
486=for apidoc Amnh||UNICODE_DISALLOW_SUPER
487=for apidoc Amnh||UNICODE_DISALLOW_SURROGATE
488=for apidoc Amnh||UNICODE_WARN_ABOVE_31_BIT
489=for apidoc Amnh||UNICODE_WARN_ILLEGAL_C9_INTERCHANGE
490=for apidoc Amnh||UNICODE_WARN_ILLEGAL_INTERCHANGE
491=for apidoc Amnh||UNICODE_WARN_NONCHAR
492=for apidoc Amnh||UNICODE_WARN_PERL_EXTENDED
493=for apidoc Amnh||UNICODE_WARN_SUPER
494=for apidoc Amnh||UNICODE_WARN_SURROGATE
495
de69f3af
KW
496=cut
497*/
498
499/* This is also a macro */
500PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
501
07693fe6
KW
502U8 *
503Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
504{
de69f3af 505 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
506}
507
d6be65ae 508PERL_STATIC_INLINE int
8b5f2733 509S_is_utf8_overlong(const U8 * const s, const STRLEN len)
12a4bed3 510{
d6be65ae
KW
511 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
512 * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
513 * it isn't, and -1 if there isn't enough information to tell. This last
514 * return value can happen if the sequence is incomplete, missing some
515 * trailing bytes that would form a complete character. If there are
516 * enough bytes to make a definitive decision, this function does so.
8b5f2733 517 * Usually 2 bytes are sufficient.
d6be65ae
KW
518 *
519 * Overlongs can occur whenever the number of continuation bytes changes.
520 * That means whenever the number of leading 1 bits in a start byte
521 * increases from the next lower start byte. That happens for start bytes
8b5f2733 522 * C0, E0, F0, F8, FC, FE, and FF.
d6be65ae 523 */
12a4bed3 524
8b5f2733 525 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG;
12a4bed3 526
d49e4ce1
KW
527 /* Each platform has overlongs after the start bytes given above (expressed
528 * in I8 for EBCDIC). The values below were found by manually inspecting
529 * the UTF-8 patterns. See the tables in utf8.h and utfebcdic.h. */
12a4bed3 530
d49e4ce1
KW
531 switch (NATIVE_UTF8_TO_I8(s[0])) {
532 default:
8b5f2733 533 assert(UTF8_IS_START(s[0]));
d49e4ce1 534 return 0;
12a4bed3 535
8b5f2733
KW
536 case 0xC0:
537 case 0xC1:
538 return 1;
539
540#ifdef EBCDIC
541
542 case 0xC2:
543 case 0xC3:
544 case 0xC4:
545 case 0xE0:
546 return 1;
547#else
548 case 0xE0:
549 return (len < 2) ? -1 : s[1] < 0xA0;
d49e4ce1 550#endif
12a4bed3 551
8b5f2733
KW
552 case 0xF0:
553 return (len < 2)
554 ? -1
555 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x10;
556 case 0xF8:
557 return (len < 2)
558 ? -1
559 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x08;
560 case 0xFC:
561 return (len < 2)
562 ? -1
563 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x04;
564 case 0xFE:
565 return (len < 2)
566 ? -1
567 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x02;
568 case 0xFF:
569 return isFF_overlong(s, len);
12a4bed3 570 }
b0b342d4
KW
571}
572
8d6204cc 573PERL_STATIC_INLINE int
ba8e516e 574S_isFF_overlong(const U8 * const s, const STRLEN len)
b0b342d4 575{
8d6204cc
KW
576 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
577 * 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if
578 * it isn't, and -1 if there isn't enough information to tell. This last
579 * return value can happen if the sequence is incomplete, missing some
580 * trailing bytes that would form a complete character. If there are
581 * enough bytes to make a definitive decision, this function does so. */
582
b0b342d4 583 PERL_ARGS_ASSERT_ISFF_OVERLONG;
12a4bed3 584
d49e4ce1
KW
585#ifdef EBCDIC
586 /* This works on all three EBCDIC code pages traditionally supported by
587 * perl */
588# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
589#else
590# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
591#endif
592
8d6204cc
KW
593 /* To be an FF overlong, all the available bytes must match */
594 if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
c5b28134 595 MIN(len, STRLENs(FF_OVERLONG_PREFIX)))))
8d6204cc
KW
596 {
597 return 0;
598 }
599
600 /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
601 * be there; what comes after them doesn't matter. See tables in utf8.h,
b0b342d4 602 * utfebcdic.h. */
c5b28134 603 if (len >= STRLENs(FF_OVERLONG_PREFIX)) {
8d6204cc
KW
604 return 1;
605 }
12a4bed3 606
8d6204cc
KW
607 /* The missing bytes could cause the result to go one way or the other, so
608 * the result is indeterminate */
609 return -1;
12a4bed3
KW
610}
611
86fb75ad
KW
612/* At some point we may want to allow core to use up to UV_MAX */
613
614#ifdef EBCDIC /* Actually is I8 */
615# if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1, UV_MAX 2**64-1 */
7746124c 616# define HIGHEST_REPRESENTABLE_UTF "\xFF\xA7"
86fb75ad
KW
617 /* UV_MAX "\xFF\xAF" */
618# else /* These assume IV_MAX is 2**31-1, UV_MAX 2**32-1 */
7746124c 619# define HIGHEST_REPRESENTABLE_UTF "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1"
86fb75ad
KW
620 /* UV_MAX "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3" */
621# endif
622#else
623# if defined(UV_IS_QUAD)
7746124c 624# define HIGHEST_REPRESENTABLE_UTF "\xFF\x80\x87"
86fb75ad 625 /* UV_MAX "\xFF\x80" */
a77c906e 626# else
7746124c 627# define HIGHEST_REPRESENTABLE_UTF "\xFD"
86fb75ad 628 /* UV_MAX "\xFE\x83" */
a77c906e
KW
629# endif
630#endif
631
c285bbc4 632PERL_STATIC_INLINE int
e050c007
KW
633S_does_utf8_overflow(const U8 * const s,
634 const U8 * e,
635 const bool consider_overlongs)
a77c906e 636{
c285bbc4 637 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
d22ec717
KW
638 * 'e' - 1 would overflow an IV on this platform; that is if it represents
639 * a code point larger than the highest representable code point. It
640 * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
641 * enough information to tell. This last return value can happen if the
642 * sequence is incomplete, missing some trailing bytes that would form a
643 * complete character. If there are enough bytes to make a definitive
644 * decision, this function does so.
c285bbc4 645 *
e050c007
KW
646 * If 'consider_overlongs' is TRUE, the function checks for the possibility
647 * that the sequence is an overlong that doesn't overflow. Otherwise, it
648 * assumes the sequence is not an overlong. This can give different
649 * results only on ASCII 32-bit platforms.
650 *
c285bbc4
KW
651 * (For ASCII platforms, we could use memcmp() because we don't have to
652 * convert each byte to I8, but it's very rare input indeed that would
653 * approach overflow, so the loop below will likely only get executed once.)
654 *
527347e0
KW
655 */
656 const STRLEN len = e - s;
657 const U8 *x;
658 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF;
659 int is_overlong = 0;
a77c906e
KW
660
661 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
a77c906e 662
527347e0 663 for (x = s; x < e; x++, y++) {
d22ec717 664
527347e0
KW
665 /* 'y' is set up to not include the trailing bytes that are all the
666 * maximum possible continuation byte. So when we reach the end of 'y'
667 * (known to be NUL terminated), it is impossible for 'x' to contain
668 * bytes larger than those omitted bytes, and therefore 'x' can't
669 * overflow */
670 if (*y == '\0') {
671 return 0;
672 }
d22ec717 673
527347e0
KW
674 /* If this byte is less than the corresponding highest non-overflowing
675 * UTF-8, the sequence doesn't overflow */
676 if (NATIVE_UTF8_TO_I8(*x) < *y) {
677 return 0;
678 }
d22ec717 679
527347e0
KW
680 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
681 goto overflows_if_not_overlong;
682 }
683 }
d22ec717 684
527347e0
KW
685 /* Got to the end, and all bytes are the same. If the input is a whole
686 * character, it doesn't overflow. And if it is a partial character,
687 * there's not enough information to tell */
688 return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1;
86fb75ad 689
527347e0 690 overflows_if_not_overlong:
d22ec717 691
527347e0
KW
692 /* Here, a well-formed sequence overflows. If we are assuming
693 * well-formedness, return that it overflows. */
694 if (! consider_overlongs) {
695 return 1;
696 }
d22ec717 697
527347e0
KW
698 /* Here, it could be the overlong malformation, and might not actuallly
699 * overflow if you were to calculate it out.
700 *
701 * See if it actually is overlong */
702 is_overlong = is_utf8_overlong(s, len);
d22ec717 703
527347e0
KW
704 /* If it isn't overlong, is well-formed, so overflows */
705 if (is_overlong == 0) {
706 return 1;
707 }
708
709 /* Not long enough to determine */
710 if (is_overlong < 0) {
711 return -1;
712 }
713
714 /* Here, it appears to overflow, but it is also overlong */
715
716#if 6 * UTF_CONTINUATION_BYTE_INFO_BITS <= IVSIZE * CHARBITS
d22ec717 717
527347e0
KW
718 /* On many platforms, it is impossible for an overlong to overflow. For
719 * these, no further work is necessary: we can return immediately that this
720 * overlong that is an apparent overflow actually isn't
721 *
722 * To see why, note that a length_N sequence can represent as overlongs all
723 * the code points representable by shorter length sequences, but no
724 * higher. If it could represent a higher code point without being an
725 * overlong, we wouldn't have had to increase the sequence length!
726 *
727 * The highest possible start byte is FF; the next highest is FE. The
728 * highest code point representable as an overlong on the platform is thus
729 * the highest code point representable by a non-overlong sequence whose
730 * start byte is FE. If that value doesn't overflow the platform's word
731 * size, overlongs can't overflow.
732 *
733 * FE consists of 7 bytes total; the FE start byte contributes 0 bits of
734 * information (the high 7 bits, all ones, say that the sequence is 7 bytes
735 * long, and the bottom, zero, bit is s placeholder. That leaves the 6
736 * continuation bytes to contribute UTF_CONTINUATION_BYTE_INFO_BITS each.
737 If that number of bits doesn't exceed the word size, it can't overflow. */
738
739 return 0;
740
741#else
742
743 /* In practice, only a 32-bit ASCII box gets here. The FE start byte can
744 * represent, as an overlong, the highest code point representable by an FD
745 * start byte, which is 5*6 continuation bytes of info plus one bit from
746 * the start byte, or 31 bits. That doesn't overflow. More explicitly:
747 * \xFD\xBF\xBF\xBF\xBF\xBF evaluates to 0x7FFFFFFF = 2*31 - 1.
748 *
749 * That means only the FF start byte can have an overflowing overlong. */
750 if (*s < 0xFF) {
d22ec717
KW
751 return 0;
752 }
753
527347e0
KW
754 /* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that
755 * evaluates to 2**31, so overflows an IV. For a UV it's
756 * \xff\x80\x80\x80\x80\x80\x80\x83 = 2**32 */
757# define OVERFLOWS "\xff\x80\x80\x80\x80\x80\x80\x82"
758
759 if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS)) { /* Not enough info */
760 return -1;
761 }
762
763# define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0)
764
765 return strnGE((const char *) s, OVERFLOWS, STRLENs(OVERFLOWS));
766
d22ec717
KW
767#endif
768
769}
770
35f8c9bd 771STRLEN
1aa501c2 772Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
35f8c9bd 773{
1aa501c2 774 SSize_t len, full_len;
35f8c9bd 775
1aa501c2 776 /* An internal helper function.
2b479609 777 *
1aa501c2
KW
778 * On input:
779 * 's' is a string, which is known to be syntactically valid UTF-8 as far
780 * as (e - 1); e > s must hold.
781 * 'e' This function is allowed to look at any byte from 's'...'e-1', but
782 * nowhere else. The function has to cope as best it can if that
783 * sequence does not form a full character.
2b479609 784 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
1aa501c2
KW
785 * accepted by L</utf8n_to_uvchr>. If non-zero, this function returns
786 * 0 if it determines the input will match something disallowed.
787 * On output:
788 * The return is the number of bytes required to represent the code point
789 * if it isn't disallowed by 'flags'; 0 otherwise. Be aware that if the
790 * input is for a partial character, a successful return will be larger
791 * than 'e - s'.
792 *
793 * If *s..*(e-1) is only for a partial character, the function will return
794 * non-zero if there is any sequence of well-formed UTF-8 that, when
795 * appended to the input sequence, could result in an allowed code point;
796 * otherwise it returns 0. Non characters cannot be determined based on
797 * partial character input. But many of the other excluded types can be
798 * determined with just the first one or two bytes.
2b479609
KW
799 *
800 */
801
1aa501c2 802 PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_;
2b479609 803
1aa501c2 804 assert(e > s);
2b479609 805 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 806 |UTF8_DISALLOW_PERL_EXTENDED)));
22f363ff 807
1aa501c2 808 full_len = UTF8SKIP(s);
35f8c9bd 809
1aa501c2
KW
810 len = e - s;
811 if (len > full_len) {
812 e = s + full_len;
813 len = full_len;
35f8c9bd
KW
814 }
815
1aa501c2
KW
816 switch (full_len) {
817 bool is_super;
edc2c47a 818
1aa501c2
KW
819 default: /* Extended */
820 if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
821 return 0;
822 }
2b479609 823
1aa501c2
KW
824 /* FALLTHROUGH */
825
826 case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */
827 case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */
828
829 if (flags & UTF8_DISALLOW_SUPER) {
830 return 0; /* Above Unicode */
2b479609
KW
831 }
832
1aa501c2
KW
833 return full_len;
834
835 case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
836 is_super = ( UNLIKELY(NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_)
837 || ( len > 1
838 && NATIVE_UTF8_TO_I8(s[0]) == UTF_START_BYTE_110000_
839 && NATIVE_UTF8_TO_I8(s[1])
840 >= UTF_FIRST_CONT_BYTE_110000_));
841 if (is_super) {
842 if (flags & UTF8_DISALLOW_SUPER) {
843 return 0;
844 }
845 }
846 else if ( (flags & UTF8_DISALLOW_NONCHAR)
847 && len == full_len
848 && UNLIKELY(is_LARGER_NON_CHARS_utf8(s)))
2b479609 849 {
57ff5f59 850 return 0;
2b479609
KW
851 }
852
1aa501c2 853 return full_len;
2b479609 854
1aa501c2 855 case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
2b479609 856
1aa501c2
KW
857 if (! isUTF8_POSSIBLY_PROBLEMATIC(s[0]) || len < 2) {
858 return full_len;
2b479609 859 }
2b479609 860
1aa501c2
KW
861 if ( (flags & UTF8_DISALLOW_SURROGATE)
862 && UNLIKELY(is_SURROGATE_utf8(s)))
863 {
864 return 0; /* Surrogate */
865 }
866
867 if ( (flags & UTF8_DISALLOW_NONCHAR)
868 && len == full_len
869 && UNLIKELY(is_SHORTER_NON_CHARS_utf8(s)))
870 {
35f8c9bd
KW
871 return 0;
872 }
35f8c9bd 873
1aa501c2 874 return full_len;
af13dd8a 875
1aa501c2
KW
876 /* The lower code points don't have any disallowable characters */
877#ifdef EBCDIC
878 case 3:
879 return full_len;
880#endif
35f8c9bd 881
1aa501c2
KW
882 case 2:
883 case 1:
884 return full_len;
885 }
35f8c9bd
KW
886}
887
58b66e89
KW
888Size_t
889Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
890 const bool require_partial)
891{
892 /* This is called to determine if the UTF-8 sequence starting at s0 and
893 * continuing for up to one full character of bytes, but looking no further
894 * than 'e - 1', is legal. *s0 must be 0xFF (or whatever the native
895 * equivalent of FF in I8 on EBCDIC platforms is). This marks it as being
896 * for the largest code points recognized by Perl, the ones that require
897 * the most UTF-8 bytes per character to represent (somewhat less than
898 * twice the size of the next longest kind). This sequence will only ever
899 * be Perl extended UTF-8.
900 *
901 * The routine returns 0 if the sequence is not fully valid, syntactically
902 * or semantically. That means it checks that everything following the
903 * start byte is a continuation byte, and that it doesn't overflow, nor is
904 * an overlong representation.
905 *
906 * If 'require_partial' is FALSE, the routine returns non-zero only if the
907 * input (as far as 'e-1') is a full character. The return is the count of
908 * the bytes in the character.
909 *
910 * If 'require_partial' is TRUE, the routine returns non-zero only if the
911 * input as far as 'e-1' is a partial, not full character, with no
912 * malformations found before position 'e'. The return is either just
913 * FALSE, or TRUE. */
914
915 const U8 *s = s0 + 1;
916 const U8 *send = e;
917
918 PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_;
919
920 assert(s0 < e);
921 assert(*s0 == I8_TO_NATIVE_UTF8(0xFF));
922
923 send = s + MIN(UTF8_MAXBYTES - 1, e - s);
924 while (s < send) {
925 if (! UTF8_IS_CONTINUATION(*s)) {
926 return 0;
927 }
928
929 s++;
930 }
931
932 if (0 < does_utf8_overflow(s0, e,
933 FALSE /* Don't consider_overlongs */
934 )) {
935 return 0;
936 }
937
938 if (0 < isFF_overlong(s0, e - s0)) {
939 return 0;
940 }
941
942 /* Here, the character is valid as far as it got. Check if got a partial
943 * character */
944 if (s - s0 < UTF8_MAXBYTES) {
945 return (require_partial) ? 1 : 0;
946 }
947
948 /* Here, got a full character */
949 return (require_partial) ? 0 : UTF8_MAXBYTES;
950}
951
7e2f38b2 952char *
63ab03b3 953Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
7cf8d05d
KW
954{
955 /* Returns a mortalized C string that is a displayable copy of the 'len'
63ab03b3 956 * bytes starting at 'start'. 'format' gives how to display each byte.
7e2f38b2
KW
957 * Currently, there are only two formats, so it is currently a bool:
958 * 0 \xab
959 * 1 ab (that is a space between two hex digit bytes)
960 */
7cf8d05d
KW
961
962 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
963 trailing NUL */
63ab03b3
KW
964 const U8 * s = start;
965 const U8 * const e = start + len;
7cf8d05d
KW
966 char * output;
967 char * d;
968
969 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
970
971 Newx(output, output_len, char);
972 SAVEFREEPV(output);
973
974 d = output;
63ab03b3 975 for (s = start; s < e; s++) {
7cf8d05d
KW
976 const unsigned high_nibble = (*s & 0xF0) >> 4;
977 const unsigned low_nibble = (*s & 0x0F);
978
7e2f38b2 979 if (format) {
63ab03b3
KW
980 if (s > start) {
981 *d++ = ' ';
982 }
7e2f38b2
KW
983 }
984 else {
985 *d++ = '\\';
986 *d++ = 'x';
987 }
7cf8d05d
KW
988
989 if (high_nibble < 10) {
990 *d++ = high_nibble + '0';
991 }
992 else {
993 *d++ = high_nibble - 10 + 'a';
994 }
995
996 if (low_nibble < 10) {
997 *d++ = low_nibble + '0';
998 }
999 else {
1000 *d++ = low_nibble - 10 + 'a';
1001 }
1002 }
1003
1004 *d = '\0';
1005 return output;
1006}
1007
806547a7 1008PERL_STATIC_INLINE char *
7cf8d05d
KW
1009S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
1010
421da25c 1011 /* Max number of bytes to print */
3cc6a05e 1012 STRLEN print_len,
7cf8d05d
KW
1013
1014 /* Which one is the non-continuation */
1015 const STRLEN non_cont_byte_pos,
1016
1017 /* How many bytes should there be? */
1018 const STRLEN expect_len)
806547a7
KW
1019{
1020 /* Return the malformation warning text for an unexpected continuation
1021 * byte. */
1022
7cf8d05d 1023 const char * const where = (non_cont_byte_pos == 1)
806547a7 1024 ? "immediately"
7cf8d05d
KW
1025 : Perl_form(aTHX_ "%d bytes",
1026 (int) non_cont_byte_pos);
421da25c
KW
1027 const U8 * x = s + non_cont_byte_pos;
1028 const U8 * e = s + print_len;
806547a7
KW
1029
1030 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
1031
7cf8d05d
KW
1032 /* We don't need to pass this parameter, but since it has already been
1033 * calculated, it's likely faster to pass it; verify under DEBUGGING */
1034 assert(expect_len == UTF8SKIP(s));
1035
421da25c
KW
1036 /* As a defensive coding measure, don't output anything past a NUL. Such
1037 * bytes shouldn't be in the middle of a malformation, and could mark the
1038 * end of the allocated string, and what comes after is undefined */
1039 for (; x < e; x++) {
1040 if (*x == '\0') {
1041 x++; /* Output this particular NUL */
1042 break;
1043 }
1044 }
1045
7cf8d05d
KW
1046 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
1047 " %s after start byte 0x%02x; need %d bytes, got %d)",
1048 malformed_text,
421da25c 1049 _byte_dump_string(s, x - s, 0),
7cf8d05d
KW
1050 *(s + non_cont_byte_pos),
1051 where,
1052 *s,
1053 (int) expect_len,
1054 (int) non_cont_byte_pos);
806547a7
KW
1055}
1056
35f8c9bd
KW
1057/*
1058
de69f3af 1059=for apidoc utf8n_to_uvchr
378516de
KW
1060
1061THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
09232555
KW
1062Most code should use L</utf8_to_uvchr_buf>() rather than call this
1063directly.
67e989fb 1064
9041c2e3 1065Bottom level UTF-8 decode routine.
de69f3af 1066Returns the native code point value of the first character in the string C<s>,
746afd53
KW
1067which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
1068C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
1069the length, in bytes, of that character.
949cf498
KW
1070
1071The value of C<flags> determines the behavior when C<s> does not point to a
2b5e7bc2
KW
1072well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
1073causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
1074is the next possible position in C<s> that could begin a non-malformed
1075character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
1076is raised. Some UTF-8 input sequences may contain multiple malformations.
1077This function tries to find every possible one in each call, so multiple
56576a04 1078warnings can be raised for the same sequence.
949cf498
KW
1079
1080Various ALLOW flags can be set in C<flags> to allow (and not warn on)
1081individual types of malformations, such as the sequence being overlong (that
1082is, when there is a shorter sequence that can express the same code point;
1083overlong sequences are expressly forbidden in the UTF-8 standard due to
1084potential security issues). Another malformation example is the first byte of
1085a character not being a legal first byte. See F<utf8.h> for the list of such
94953955
KW
1086flags. Even if allowed, this function generally returns the Unicode
1087REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
1088F<utf8.h> to override this behavior for the overlong malformations, but don't
1089do that except for very specialized purposes.
949cf498 1090
796b6530 1091The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
1092flags) malformation is found. If this flag is set, the routine assumes that
1093the caller will raise a warning, and this function will silently just set
d088425d
KW
1094C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
1095
75200dff 1096Note that this API requires disambiguation between successful decoding a C<NUL>
796b6530 1097character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
111fa700
KW
1098in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
1099be set to 1. To disambiguate, upon a zero return, see if the first byte of
1100C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
f9380377 1101error. Or you can use C<L</utf8n_to_uvchr_error>>.
949cf498
KW
1102
1103Certain code points are considered problematic. These are Unicode surrogates,
746afd53 1104Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 1105By default these are considered regular code points, but certain situations
ecc1615f
KW
1106warrant special handling for them, which can be specified using the C<flags>
1107parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
1108three classes are treated as malformations and handled as such. The flags
1109C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
1110C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
1111disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
1112restricts the allowed inputs to the strict UTF-8 traditionally defined by
1113Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
1114definition given by
e2176993 1115L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
ecc1615f
KW
1116The difference between traditional strictness and C9 strictness is that the
1117latter does not forbid non-character code points. (They are still discouraged,
1118however.) For more discussion see L<perlunicode/Noncharacter code points>.
1119
1120The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
1121C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
796b6530
KW
1122C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
1123raised for their respective categories, but otherwise the code points are
1124considered valid (not malformations). To get a category to both be treated as
1125a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 1126(But note that warnings are not raised if lexically disabled nor if
796b6530 1127C<UTF8_CHECK_ONLY> is also specified.)
949cf498 1128
57ff5f59
KW
1129Extremely high code points were never specified in any standard, and require an
1130extension to UTF-8 to express, which Perl does. It is likely that programs
1131written in something other than Perl would not be able to read files that
1132contain these; nor would Perl understand files written by something that uses a
1133different extension. For these reasons, there is a separate set of flags that
1134can warn and/or disallow these extremely high code points, even if other
1135above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
1136C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
eb992c6f 1137C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
57ff5f59
KW
1138above-Unicode code points, including these, as malformations.
1139(Note that the Unicode standard considers anything above 0x10FFFF to be
1140illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
1141(2**31 -1))
1142
1143A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
1144retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly,
1145C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
1146C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags
1147can apply to code points that actually do fit in 31 bits. This happens on
1148EBCDIC platforms, and sometimes when the L<overlong
1149malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
1150describe the situation in all cases.
1151
ab8e6d41 1152
949cf498
KW
1153All other code points corresponding to Unicode characters, including private
1154use and those yet to be assigned, are never considered malformed and never
1155warn.
67e989fb 1156
5af38e47
KW
1157=for apidoc Amnh||UTF8_CHECK_ONLY
1158=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1159=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
1160=for apidoc Amnh||UTF8_DISALLOW_SURROGATE
1161=for apidoc Amnh||UTF8_DISALLOW_NONCHAR
1162=for apidoc Amnh||UTF8_DISALLOW_SUPER
1163=for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE
1164=for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE
1165=for apidoc Amnh||UTF8_WARN_SURROGATE
1166=for apidoc Amnh||UTF8_WARN_NONCHAR
1167=for apidoc Amnh||UTF8_WARN_SUPER
1168=for apidoc Amnh||UTF8_WARN_PERL_EXTENDED
1169=for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED
1170
37607a96 1171=cut
f9380377
KW
1172
1173Also implemented as a macro in utf8.h
1174*/
1175
1176UV
e6a4ffc3
KW
1177Perl_utf8n_to_uvchr(const U8 *s,
1178 STRLEN curlen,
1179 STRLEN *retlen,
1180 const U32 flags)
f9380377
KW
1181{
1182 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
1183
1184 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
1185}
1186
1187/*
1188
1189=for apidoc utf8n_to_uvchr_error
1190
1191THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
09232555
KW
1192Most code should use L</utf8_to_uvchr_buf>() rather than call this
1193directly.
f9380377
KW
1194
1195This function is for code that needs to know what the precise malformation(s)
37657a5b
KW
1196are when an error is found. If you also need to know the generated warning
1197messages, use L</utf8n_to_uvchr_msgs>() instead.
f9380377
KW
1198
1199It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1200all the others, C<errors>. If this parameter is 0, this function behaves
1201identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
1202to a C<U32> variable, which this function sets to indicate any errors found.
1203Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
1204C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
1205of these bits will be set if a malformation is found, even if the input
7a65503b 1206C<flags> parameter indicates that the given malformation is allowed; those
f9380377
KW
1207exceptions are noted:
1208
1209=over 4
1210
57ff5f59 1211=item C<UTF8_GOT_PERL_EXTENDED>
f9380377 1212
57ff5f59
KW
1213The input sequence is not standard UTF-8, but a Perl extension. This bit is
1214set only if the input C<flags> parameter contains either the
1215C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
1216
1217Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
1218and so some extension must be used to express them. Perl uses a natural
1219extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
1220extension to represent even higher ones, so that any code point that fits in a
122164-bit word can be represented. Text using these extensions is not likely to
1222be portable to non-Perl code. We lump both of these extensions together and
1223refer to them as Perl extended UTF-8. There exist other extensions that people
1224have invented, incompatible with Perl's.
1225
1226On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
1227extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
1228than on ASCII. Prior to that, code points 2**31 and higher were simply
1229unrepresentable, and a different, incompatible method was used to represent
1230code points between 2**30 and 2**31 - 1.
1231
1232On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
1233Perl extended UTF-8 is used.
1234
1235In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
1236may use for backward compatibility. That name is misleading, as this flag may
1237be set when the code point actually does fit in 31 bits. This happens on
1238EBCDIC platforms, and sometimes when the L<overlong
1239malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately
1240describes the situation in all cases.
f9380377
KW
1241
1242=item C<UTF8_GOT_CONTINUATION>
1243
a3815e44 1244The input sequence was malformed in that the first byte was a UTF-8
f9380377
KW
1245continuation byte.
1246
1247=item C<UTF8_GOT_EMPTY>
1248
1249The input C<curlen> parameter was 0.
1250
1251=item C<UTF8_GOT_LONG>
1252
1253The input sequence was malformed in that there is some other sequence that
1254evaluates to the same code point, but that sequence is shorter than this one.
1255
fecaf136
KW
1256Until Unicode 3.1, it was legal for programs to accept this malformation, but
1257it was discovered that this created security issues.
1258
f9380377
KW
1259=item C<UTF8_GOT_NONCHAR>
1260
1261The code point represented by the input UTF-8 sequence is for a Unicode
1262non-character code point.
1263This bit is set only if the input C<flags> parameter contains either the
1264C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1265
1266=item C<UTF8_GOT_NON_CONTINUATION>
1267
1268The input sequence was malformed in that a non-continuation type byte was found
00d976bb 1269in a position where only a continuation type one should be. See also
eb992c6f 1270C<L</UTF8_GOT_SHORT>>.
f9380377
KW
1271
1272=item C<UTF8_GOT_OVERFLOW>
1273
1274The input sequence was malformed in that it is for a code point that is not
d22ec717 1275representable in the number of bits available in an IV on the current platform.
f9380377
KW
1276
1277=item C<UTF8_GOT_SHORT>
1278
1279The input sequence was malformed in that C<curlen> is smaller than required for
1280a complete sequence. In other words, the input is for a partial character
1281sequence.
1282
00d976bb
KW
1283
1284C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
1285sequence. The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always
1286that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
1287sequence was looked at. If no other flags are present, it means that the
1288sequence was valid as far as it went. Depending on the application, this could
1289mean one of three things:
1290
1291=over
1292
1293=item *
1294
1295The C<curlen> length parameter passed in was too small, and the function was
1296prevented from examining all the necessary bytes.
1297
1298=item *
1299
1300The buffer being looked at is based on reading data, and the data received so
1301far stopped in the middle of a character, so that the next read will
1302read the remainder of this character. (It is up to the caller to deal with the
1303split bytes somehow.)
1304
1305=item *
1306
1307This is a real error, and the partial sequence is all we're going to get.
1308
1309=back
1310
f9380377
KW
1311=item C<UTF8_GOT_SUPER>
1312
1313The input sequence was malformed in that it is for a non-Unicode code point;
1314that is, one above the legal Unicode maximum.
1315This bit is set only if the input C<flags> parameter contains either the
1316C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1317
1318=item C<UTF8_GOT_SURROGATE>
1319
1320The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
1321code point.
1322This bit is set only if the input C<flags> parameter contains either the
1323C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1324
1325=back
1326
133551d8
KW
1327To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1328flag to suppress any warnings, and then examine the C<*errors> return.
1329
d145625f
KW
1330=for apidoc Amnh||UTF8_GOT_PERL_EXTENDED
1331=for apidoc Amnh||UTF8_GOT_CONTINUATION
1332=for apidoc Amnh||UTF8_GOT_EMPTY
1333=for apidoc Amnh||UTF8_GOT_LONG
1334=for apidoc Amnh||UTF8_GOT_NONCHAR
1335=for apidoc Amnh||UTF8_GOT_NON_CONTINUATION
1336=for apidoc Amnh||UTF8_GOT_OVERFLOW
1337=for apidoc Amnh||UTF8_GOT_SHORT
1338=for apidoc Amnh||UTF8_GOT_SUPER
1339=for apidoc Amnh||UTF8_GOT_SURROGATE
1340
f9380377 1341=cut
37657a5b
KW
1342
1343Also implemented as a macro in utf8.h
37607a96 1344*/
67e989fb 1345
a0ed51b3 1346UV
e6a4ffc3 1347Perl_utf8n_to_uvchr_error(const U8 *s,
37657a5b
KW
1348 STRLEN curlen,
1349 STRLEN *retlen,
1350 const U32 flags,
1351 U32 * errors)
1352{
1353 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1354
1355 return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
1356}
1357
1358/*
1359
1360=for apidoc utf8n_to_uvchr_msgs
1361
1362THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
09232555
KW
1363Most code should use L</utf8_to_uvchr_buf>() rather than call this
1364directly.
37657a5b
KW
1365
1366This function is for code that needs to know what the precise malformation(s)
1367are when an error is found, and wants the corresponding warning and/or error
1368messages to be returned to the caller rather than be displayed. All messages
f1460a66 1369that would have been displayed if all lexical warnings are enabled will be
37657a5b
KW
1370returned.
1371
1372It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
1373placed after all the others, C<msgs>. If this parameter is 0, this function
1374behaves identically to C<L</utf8n_to_uvchr_error>>. Otherwise, C<msgs> should
1375be a pointer to an C<AV *> variable, in which this function creates a new AV to
1376contain any appropriate messages. The elements of the array are ordered so
1377that the first message that would have been displayed is in the 0th element,
1378and so on. Each element is a hash with three key-value pairs, as follows:
1379
1380=over 4
1381
1382=item C<text>
1383
1384The text of the message as a C<SVpv>.
1385
1386=item C<warn_categories>
1387
1388The warning category (or categories) packed into a C<SVuv>.
1389
1390=item C<flag>
1391
1392A single flag bit associated with this message, in a C<SVuv>.
1393The bit corresponds to some bit in the C<*errors> return value,
1394such as C<UTF8_GOT_LONG>.
1395
1396=back
1397
1398It's important to note that specifying this parameter as non-null will cause
1399any warnings this function would otherwise generate to be suppressed, and
1400instead be placed in C<*msgs>. The caller can check the lexical warnings state
1401(or not) when choosing what to do with the returned messages.
1402
1403If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
1404no AV is created.
1405
1406The caller, of course, is responsible for freeing any returned AV.
1407
1408=cut
1409*/
1410
1411UV
e6a4ffc3 1412Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
37657a5b
KW
1413 STRLEN curlen,
1414 STRLEN *retlen,
1415 const U32 flags,
1416 U32 * errors,
1417 AV ** msgs)
a0ed51b3 1418{
d4c19fe8 1419 const U8 * const s0 = s;
2b9519f0 1420 const U8 * send = s0 + curlen;
5af9f822
KW
1421 U32 possible_problems; /* A bit is set here for each potential problem
1422 found as we go along */
1423 UV uv;
1424 STRLEN expectlen; /* How long should this sequence be? */
1425 STRLEN avail_len; /* When input is too short, gives what that is */
1426 U32 discard_errors; /* Used to save branches when 'errors' is NULL; this
1427 gets set and discarded */
a0dbb045 1428
2b5e7bc2
KW
1429 /* The below are used only if there is both an overlong malformation and a
1430 * too short one. Otherwise the first two are set to 's0' and 'send', and
1431 * the third not used at all */
5af9f822 1432 U8 * adjusted_s0;
e9f2c446
KW
1433 U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1434 routine; see [perl #130921] */
5af9f822 1435 UV uv_so_far;
e6a4ffc3 1436 dTHX;
5af9f822 1437
e6a4ffc3 1438 PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
5af9f822
KW
1439
1440 /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
1441 * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
1442 * syllables that the dfa doesn't properly handle. Quickly dispose of the
1443 * final case. */
1444
5af9f822
KW
1445 /* Each of the affected Hanguls starts with \xED */
1446
67260a96 1447 if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */
5af9f822
KW
1448 if (retlen) {
1449 *retlen = 3;
1450 }
1451 if (errors) {
1452 *errors = 0;
1453 }
1454 if (msgs) {
1455 *msgs = NULL;
1456 }
1457
1458 return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
1459 | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
1460 | (s0[2] & UTF_CONTINUATION_MASK);
1461 }
1462
5af9f822
KW
1463 /* In conjunction with the exhaustive tests that can be enabled in
1464 * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
1465 * what it is intended to do, and that no flaws in it are masked by
1466 * dropping down and executing the code below
1467 assert(! isUTF8_CHAR(s0, send)
1468 || UTF8_IS_SURROGATE(s0, send)
1469 || UTF8_IS_SUPER(s0, send)
1470 || UTF8_IS_NONCHAR(s0,send));
1471 */
1472
1473 s = s0;
5af9f822
KW
1474 possible_problems = 0;
1475 expectlen = 0;
1476 avail_len = 0;
1477 discard_errors = 0;
1478 adjusted_s0 = (U8 *) s0;
1479 uv_so_far = 0;
1480
f9380377
KW
1481 if (errors) {
1482 *errors = 0;
1483 }
1484 else {
1485 errors = &discard_errors;
1486 }
a0dbb045 1487
eb83ed87
KW
1488 /* The order of malformation tests here is important. We should consume as
1489 * few bytes as possible in order to not skip any valid character. This is
1490 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
e2176993 1491 * https://unicode.org/reports/tr36 for more discussion as to why. For
eb83ed87
KW
1492 * example, once we've done a UTF8SKIP, we can tell the expected number of
1493 * bytes, and could fail right off the bat if the input parameters indicate
1494 * that there are too few available. But it could be that just that first
1495 * byte is garbled, and the intended character occupies fewer bytes. If we
1496 * blindly assumed that the first byte is correct, and skipped based on
1497 * that number, we could skip over a valid input character. So instead, we
1498 * always examine the sequence byte-by-byte.
1499 *
1500 * We also should not consume too few bytes, otherwise someone could inject
1501 * things. For example, an input could be deliberately designed to
1502 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 1503 * returning to the caller C<*retlen> pointing to the very next byte (one
a3815e44 1504 * which is actually part of the overflowing sequence), that could look
eb83ed87 1505 * legitimate to the caller, which could discard the initial partial
2b5e7bc2
KW
1506 * sequence and process the rest, inappropriately.
1507 *
1508 * Some possible input sequences are malformed in more than one way. This
1509 * function goes to lengths to try to find all of them. This is necessary
1510 * for correctness, as the inputs may allow one malformation but not
1511 * another, and if we abandon searching for others after finding the
1512 * allowed one, we could allow in something that shouldn't have been.
1513 */
eb83ed87 1514
b5b9af04 1515 if (UNLIKELY(curlen == 0)) {
2b5e7bc2
KW
1516 possible_problems |= UTF8_GOT_EMPTY;
1517 curlen = 0;
5a48568d 1518 uv = UNICODE_REPLACEMENT;
1604cfb0 1519 goto ready_to_handle_errors;
0c443dc2
JH
1520 }
1521
f85e7958 1522 /* We now know we can examine the first byte of the input */
eb83ed87 1523 expectlen = UTF8SKIP(s);
f85e7958 1524 uv = *s;
eb83ed87
KW
1525
1526 /* A well-formed UTF-8 character, as the vast majority of calls to this
1527 * function will be for, has this expected length. For efficiency, set
1528 * things up here to return it. It will be overriden only in those rare
1529 * cases where a malformation is found */
1530 if (retlen) {
1604cfb0 1531 *retlen = expectlen;
eb83ed87
KW
1532 }
1533
eb83ed87 1534 /* A continuation character can't start a valid sequence */
b5b9af04 1535 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
1604cfb0 1536 possible_problems |= UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1537 curlen = 1;
1538 uv = UNICODE_REPLACEMENT;
1604cfb0 1539 goto ready_to_handle_errors;
ba210ebe 1540 }
9041c2e3 1541
dcd27b3c 1542 /* Here is not a continuation byte, nor an invariant. The only thing left
ddb65933
KW
1543 * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
1544 * because it excludes start bytes like \xC0 that always lead to
1545 * overlongs.) */
dcd27b3c 1546
534752c1
KW
1547 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1548 * that indicate the number of bytes in the character's whole UTF-8
1549 * sequence, leaving just the bits that are part of the value. */
1550 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
ba210ebe 1551
e308b348
KW
1552 /* Setup the loop end point, making sure to not look past the end of the
1553 * input string, and flag it as too short if the size isn't big enough. */
e308b348
KW
1554 if (UNLIKELY(curlen < expectlen)) {
1555 possible_problems |= UTF8_GOT_SHORT;
1556 avail_len = curlen;
e308b348
KW
1557 }
1558 else {
2b9519f0 1559 send = (U8*) s0 + expectlen;
e308b348 1560 }
e308b348 1561
eb83ed87 1562 /* Now, loop through the remaining bytes in the character's sequence,
e308b348 1563 * accumulating each into the working value as we go. */
eb83ed87 1564 for (s = s0 + 1; s < send; s++) {
1604cfb0
MS
1565 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
1566 uv = UTF8_ACCUMULATE(uv, *s);
2b5e7bc2
KW
1567 continue;
1568 }
1569
1570 /* Here, found a non-continuation before processing all expected bytes.
1571 * This byte indicates the beginning of a new character, so quit, even
1572 * if allowing this malformation. */
2b5e7bc2 1573 possible_problems |= UTF8_GOT_NON_CONTINUATION;
e308b348 1574 break;
eb83ed87
KW
1575 } /* End of loop through the character's bytes */
1576
1577 /* Save how many bytes were actually in the character */
1578 curlen = s - s0;
1579
2b5e7bc2
KW
1580 /* Note that there are two types of too-short malformation. One is when
1581 * there is actual wrong data before the normal termination of the
1582 * sequence. The other is that the sequence wasn't complete before the end
1583 * of the data we are allowed to look at, based on the input 'curlen'.
1584 * This means that we were passed data for a partial character, but it is
1585 * valid as far as we saw. The other is definitely invalid. This
1586 * distinction could be important to a caller, so the two types are kept
15b010f0
KW
1587 * separate.
1588 *
1589 * A convenience macro that matches either of the too-short conditions. */
1590# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1591
1592 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1593 uv_so_far = uv;
1594 uv = UNICODE_REPLACEMENT;
1595 }
2b5e7bc2 1596
08e73697
KW
1597 /* Check for overflow. The algorithm requires us to not look past the end
1598 * of the current character, even if partial, so the upper limit is 's' */
e050c007
KW
1599 if (UNLIKELY(0 < does_utf8_overflow(s0, s,
1600 1 /* Do consider overlongs */
1601 )))
1602 {
2b5e7bc2
KW
1603 possible_problems |= UTF8_GOT_OVERFLOW;
1604 uv = UNICODE_REPLACEMENT;
eb83ed87 1605 }
eb83ed87 1606
2b5e7bc2
KW
1607 /* Check for overlong. If no problems so far, 'uv' is the correct code
1608 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1609 * we must look at the UTF-8 byte sequence itself to see if it is for an
1610 * overlong */
1611 if ( ( LIKELY(! possible_problems)
1612 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
56576a04 1613 || ( UNLIKELY(possible_problems)
2b5e7bc2 1614 && ( UNLIKELY(! UTF8_IS_START(*s0))
8b5f2733 1615 || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))))
2f8f112e 1616 {
2b5e7bc2
KW
1617 possible_problems |= UTF8_GOT_LONG;
1618
abc28b54 1619 if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
56576a04 1620
abc28b54
KW
1621 /* The calculation in the 'true' branch of this 'if'
1622 * below won't work if overflows, and isn't needed
1623 * anyway. Further below we handle all overflow
1624 * cases */
1625 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1626 {
2b5e7bc2
KW
1627 UV min_uv = uv_so_far;
1628 STRLEN i;
1629
1630 /* Here, the input is both overlong and is missing some trailing
1631 * bytes. There is no single code point it could be for, but there
1632 * may be enough information present to determine if what we have
1633 * so far is for an unallowed code point, such as for a surrogate.
56576a04
KW
1634 * The code further below has the intelligence to determine this,
1635 * but just for non-overlong UTF-8 sequences. What we do here is
1636 * calculate the smallest code point the input could represent if
1637 * there were no too short malformation. Then we compute and save
1638 * the UTF-8 for that, which is what the code below looks at
1639 * instead of the raw input. It turns out that the smallest such
1640 * code point is all we need. */
2b5e7bc2
KW
1641 for (i = curlen; i < expectlen; i++) {
1642 min_uv = UTF8_ACCUMULATE(min_uv,
4a2c769e 1643 I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE));
2b5e7bc2
KW
1644 }
1645
e9f2c446 1646 adjusted_s0 = temp_char_buf;
57ff5f59 1647 (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
2b5e7bc2 1648 }
eb83ed87
KW
1649 }
1650
56576a04
KW
1651 /* Here, we have found all the possible problems, except for when the input
1652 * is for a problematic code point not allowed by the input parameters. */
1653
06188866
KW
1654 /* uv is valid for overlongs */
1655 if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
8010ec0d 1656 && isUNICODE_POSSIBLY_PROBLEMATIC(uv))
2b5e7bc2 1657 || ( UNLIKELY(possible_problems)
d60baaa7
KW
1658
1659 /* if overflow, we know without looking further
1660 * precisely which of the problematic types it is,
1661 * and we deal with those in the overflow handling
1662 * code */
1663 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
57ff5f59 1664 && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
43732c4f 1665 || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))))
1604cfb0 1666 && ((flags & ( UTF8_DISALLOW_NONCHAR
760c7c2f
KW
1667 |UTF8_DISALLOW_SURROGATE
1668 |UTF8_DISALLOW_SUPER
d044b7a7 1669 |UTF8_DISALLOW_PERL_EXTENDED
1604cfb0 1670 |UTF8_WARN_NONCHAR
760c7c2f
KW
1671 |UTF8_WARN_SURROGATE
1672 |UTF8_WARN_SUPER
d22ec717 1673 |UTF8_WARN_PERL_EXTENDED))))
eb83ed87 1674 {
2b5e7bc2
KW
1675 /* If there were no malformations, or the only malformation is an
1676 * overlong, 'uv' is valid */
1677 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1678 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1679 possible_problems |= UTF8_GOT_SURROGATE;
1680 }
8010ec0d 1681 else if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
2b5e7bc2
KW
1682 possible_problems |= UTF8_GOT_SUPER;
1683 }
1684 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1685 possible_problems |= UTF8_GOT_NONCHAR;
1686 }
1687 }
1688 else { /* Otherwise, need to look at the source UTF-8, possibly
1689 adjusted to be non-overlong */
1690
1691 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
2e5a4e5a 1692 > UTF_START_BYTE_110000_))
ea5ced44 1693 {
2b5e7bc2
KW
1694 possible_problems |= UTF8_GOT_SUPER;
1695 }
1696 else if (curlen > 1) {
2e5a4e5a
KW
1697 if (UNLIKELY( NATIVE_UTF8_TO_I8(*adjusted_s0)
1698 == UTF_START_BYTE_110000_
1699 && NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1))
1700 >= UTF_FIRST_CONT_BYTE_110000_))
ea5ced44 1701 {
2b5e7bc2 1702 possible_problems |= UTF8_GOT_SUPER;
ea5ced44 1703 }
d4bf6b07 1704 else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) {
2b5e7bc2 1705 possible_problems |= UTF8_GOT_SURROGATE;
ea5ced44
KW
1706 }
1707 }
c0236afe 1708
2b5e7bc2
KW
1709 /* We need a complete well-formed UTF-8 character to discern
1710 * non-characters, so can't look for them here */
1711 }
1712 }
949cf498 1713
2b5e7bc2
KW
1714 ready_to_handle_errors:
1715
1716 /* At this point:
1717 * curlen contains the number of bytes in the sequence that
1718 * this call should advance the input by.
e308b348
KW
1719 * avail_len gives the available number of bytes passed in, but
1720 * only if this is less than the expected number of
1721 * bytes, based on the code point's start byte.
4c46f86a 1722 * possible_problems is 0 if there weren't any problems; otherwise a bit
2b5e7bc2
KW
1723 * is set in it for each potential problem found.
1724 * uv contains the code point the input sequence
1725 * represents; or if there is a problem that prevents
1726 * a well-defined value from being computed, it is
1727 * some subsitute value, typically the REPLACEMENT
1728 * CHARACTER.
1729 * s0 points to the first byte of the character
c33e0000 1730 * s points to just after where we left off processing
56576a04
KW
1731 * the character
1732 * send points to just after where that character should
1733 * end, based on how many bytes the start byte tells
1734 * us should be in it, but no further than s0 +
1735 * avail_len
2b5e7bc2 1736 */
eb83ed87 1737
2b5e7bc2
KW
1738 if (UNLIKELY(possible_problems)) {
1739 bool disallowed = FALSE;
1740 const U32 orig_problems = possible_problems;
1741
37657a5b
KW
1742 if (msgs) {
1743 *msgs = NULL;
1744 }
1745
2b5e7bc2 1746 while (possible_problems) { /* Handle each possible problem */
9fde5914 1747 U32 pack_warn = 0;
2b5e7bc2 1748 char * message = NULL;
37657a5b 1749 U32 this_flag_bit = 0;
2b5e7bc2
KW
1750
1751 /* Each 'if' clause handles one problem. They are ordered so that
1752 * the first ones' messages will be displayed before the later
6c64cd9d
KW
1753 * ones; this is kinda in decreasing severity order. But the
1754 * overlong must come last, as it changes 'uv' looked at by the
1755 * others */
2b5e7bc2
KW
1756 if (possible_problems & UTF8_GOT_OVERFLOW) {
1757
56576a04
KW
1758 /* Overflow means also got a super and are using Perl's
1759 * extended UTF-8, but we handle all three cases here */
2b5e7bc2 1760 possible_problems
d044b7a7 1761 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
f9380377
KW
1762 *errors |= UTF8_GOT_OVERFLOW;
1763
1764 /* But the API says we flag all errors found */
1765 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1766 *errors |= UTF8_GOT_SUPER;
1767 }
ddb65933 1768 if (flags
d044b7a7 1769 & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
ddb65933 1770 {
d044b7a7 1771 *errors |= UTF8_GOT_PERL_EXTENDED;
f9380377 1772 }
2b5e7bc2 1773
d60baaa7 1774 /* Disallow if any of the three categories say to */
56576a04 1775 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
d60baaa7 1776 || (flags & ( UTF8_DISALLOW_SUPER
d044b7a7 1777 |UTF8_DISALLOW_PERL_EXTENDED)))
d60baaa7
KW
1778 {
1779 disallowed = TRUE;
1780 }
1781
d22ec717
KW
1782 /* Likewise, warn if any say to */
1783 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
1784 || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
d60baaa7 1785 {
2b5e7bc2 1786
ddb65933
KW
1787 /* The warnings code explicitly says it doesn't handle the
1788 * case of packWARN2 and two categories which have
1789 * parent-child relationship. Even if it works now to
1790 * raise the warning if either is enabled, it wouldn't
1791 * necessarily do so in the future. We output (only) the
56576a04 1792 * most dire warning */
ddb65933 1793 if (! (flags & UTF8_CHECK_ONLY)) {
37657a5b 1794 if (msgs || ckWARN_d(WARN_UTF8)) {
ddb65933
KW
1795 pack_warn = packWARN(WARN_UTF8);
1796 }
37657a5b 1797 else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
ddb65933
KW
1798 pack_warn = packWARN(WARN_NON_UNICODE);
1799 }
1800 if (pack_warn) {
1801 message = Perl_form(aTHX_ "%s: %s (overflows)",
1802 malformed_text,
05b9033b 1803 _byte_dump_string(s0, curlen, 0));
37657a5b 1804 this_flag_bit = UTF8_GOT_OVERFLOW;
ddb65933 1805 }
2b5e7bc2
KW
1806 }
1807 }
1808 }
1809 else if (possible_problems & UTF8_GOT_EMPTY) {
1810 possible_problems &= ~UTF8_GOT_EMPTY;
f9380377 1811 *errors |= UTF8_GOT_EMPTY;
2b5e7bc2
KW
1812
1813 if (! (flags & UTF8_ALLOW_EMPTY)) {
d1f8d421
KW
1814
1815 /* This so-called malformation is now treated as a bug in
1816 * the caller. If you have nothing to decode, skip calling
1817 * this function */
1818 assert(0);
1819
2b5e7bc2 1820 disallowed = TRUE;
37657a5b
KW
1821 if ( (msgs
1822 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1823 {
2b5e7bc2
KW
1824 pack_warn = packWARN(WARN_UTF8);
1825 message = Perl_form(aTHX_ "%s (empty string)",
1826 malformed_text);
37657a5b 1827 this_flag_bit = UTF8_GOT_EMPTY;
2b5e7bc2
KW
1828 }
1829 }
1830 }
1831 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1832 possible_problems &= ~UTF8_GOT_CONTINUATION;
f9380377 1833 *errors |= UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1834
1835 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1836 disallowed = TRUE;
37657a5b
KW
1837 if (( msgs
1838 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1839 {
2b5e7bc2
KW
1840 pack_warn = packWARN(WARN_UTF8);
1841 message = Perl_form(aTHX_
1842 "%s: %s (unexpected continuation byte 0x%02x,"
1843 " with no preceding start byte)",
1844 malformed_text,
7e2f38b2 1845 _byte_dump_string(s0, 1, 0), *s0);
37657a5b 1846 this_flag_bit = UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1847 }
1848 }
1849 }
2b5e7bc2
KW
1850 else if (possible_problems & UTF8_GOT_SHORT) {
1851 possible_problems &= ~UTF8_GOT_SHORT;
f9380377 1852 *errors |= UTF8_GOT_SHORT;
2b5e7bc2
KW
1853
1854 if (! (flags & UTF8_ALLOW_SHORT)) {
1855 disallowed = TRUE;
37657a5b
KW
1856 if (( msgs
1857 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1858 {
2b5e7bc2
KW
1859 pack_warn = packWARN(WARN_UTF8);
1860 message = Perl_form(aTHX_
56576a04
KW
1861 "%s: %s (too short; %d byte%s available, need %d)",
1862 malformed_text,
1863 _byte_dump_string(s0, send - s0, 0),
1864 (int)avail_len,
1865 avail_len == 1 ? "" : "s",
1866 (int)expectlen);
37657a5b 1867 this_flag_bit = UTF8_GOT_SHORT;
2b5e7bc2
KW
1868 }
1869 }
ba210ebe 1870
2b5e7bc2 1871 }
e308b348
KW
1872 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1873 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1874 *errors |= UTF8_GOT_NON_CONTINUATION;
1875
1876 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1877 disallowed = TRUE;
37657a5b
KW
1878 if (( msgs
1879 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1880 {
99a765e9
KW
1881
1882 /* If we don't know for sure that the input length is
1883 * valid, avoid as much as possible reading past the
1884 * end of the buffer */
1885 int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
100de20c
KW
1886 ? (int) (s - s0)
1887 : (int) (send - s0);
e308b348
KW
1888 pack_warn = packWARN(WARN_UTF8);
1889 message = Perl_form(aTHX_ "%s",
1890 unexpected_non_continuation_text(s0,
99a765e9 1891 printlen,
e308b348
KW
1892 s - s0,
1893 (int) expectlen));
37657a5b 1894 this_flag_bit = UTF8_GOT_NON_CONTINUATION;
e308b348
KW
1895 }
1896 }
1897 }
2b5e7bc2
KW
1898 else if (possible_problems & UTF8_GOT_SURROGATE) {
1899 possible_problems &= ~UTF8_GOT_SURROGATE;
1900
f9380377
KW
1901 if (flags & UTF8_WARN_SURROGATE) {
1902 *errors |= UTF8_GOT_SURROGATE;
1903
1904 if ( ! (flags & UTF8_CHECK_ONLY)
37657a5b 1905 && (msgs || ckWARN_d(WARN_SURROGATE)))
f9380377 1906 {
2b5e7bc2
KW
1907 pack_warn = packWARN(WARN_SURROGATE);
1908
1909 /* These are the only errors that can occur with a
1910 * surrogate when the 'uv' isn't valid */
1911 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1912 message = Perl_form(aTHX_
1913 "UTF-16 surrogate (any UTF-8 sequence that"
1914 " starts with \"%s\" is for a surrogate)",
7e2f38b2 1915 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
1916 }
1917 else {
c94c2f39 1918 message = Perl_form(aTHX_ surrogate_cp_format, uv);
2b5e7bc2 1919 }
37657a5b 1920 this_flag_bit = UTF8_GOT_SURROGATE;
f9380377 1921 }
2b5e7bc2 1922 }
ba210ebe 1923
2b5e7bc2
KW
1924 if (flags & UTF8_DISALLOW_SURROGATE) {
1925 disallowed = TRUE;
f9380377 1926 *errors |= UTF8_GOT_SURROGATE;
2b5e7bc2
KW
1927 }
1928 }
1929 else if (possible_problems & UTF8_GOT_SUPER) {
1930 possible_problems &= ~UTF8_GOT_SUPER;
949cf498 1931
f9380377
KW
1932 if (flags & UTF8_WARN_SUPER) {
1933 *errors |= UTF8_GOT_SUPER;
1934
1935 if ( ! (flags & UTF8_CHECK_ONLY)
37657a5b 1936 && (msgs || ckWARN_d(WARN_NON_UNICODE)))
f9380377 1937 {
2b5e7bc2
KW
1938 pack_warn = packWARN(WARN_NON_UNICODE);
1939
1940 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1941 message = Perl_form(aTHX_
1942 "Any UTF-8 sequence that starts with"
1943 " \"%s\" is for a non-Unicode code point,"
1944 " may not be portable",
7e2f38b2 1945 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
1946 }
1947 else {
c94c2f39 1948 message = Perl_form(aTHX_ super_cp_format, uv);
2b5e7bc2 1949 }
37657a5b 1950 this_flag_bit = UTF8_GOT_SUPER;
f9380377 1951 }
2b5e7bc2 1952 }
ba210ebe 1953
57ff5f59
KW
1954 /* Test for Perl's extended UTF-8 after the regular SUPER ones,
1955 * and before possibly bailing out, so that the more dire
1956 * warning will override the regular one. */
43732c4f 1957 if (UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) {
2b5e7bc2 1958 if ( ! (flags & UTF8_CHECK_ONLY)
d044b7a7 1959 && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
dc4a6683
KW
1960 && (msgs || ( ckWARN_d(WARN_NON_UNICODE)
1961 || ckWARN(WARN_PORTABLE))))
2b5e7bc2 1962 {
dc4a6683 1963 pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
2b5e7bc2 1964
57ff5f59
KW
1965 /* If it is an overlong that evaluates to a code point
1966 * that doesn't have to use the Perl extended UTF-8, it
1967 * still used it, and so we output a message that
1968 * doesn't refer to the code point. The same is true
1969 * if there was a SHORT malformation where the code
1970 * point is not valid. In that case, 'uv' will have
1971 * been set to the REPLACEMENT CHAR, and the message
1972 * below without the code point in it will be selected
1973 * */
1974 if (UNICODE_IS_PERL_EXTENDED(uv)) {
2b5e7bc2 1975 message = Perl_form(aTHX_
8911f9b0 1976 PL_extended_cp_format, uv);
2b5e7bc2
KW
1977 }
1978 else {
1979 message = Perl_form(aTHX_
57ff5f59
KW
1980 "Any UTF-8 sequence that starts with"
1981 " \"%s\" is a Perl extension, and"
1982 " so is not portable",
1983 _byte_dump_string(s0, curlen, 0));
2b5e7bc2 1984 }
37657a5b 1985 this_flag_bit = UTF8_GOT_PERL_EXTENDED;
2b5e7bc2
KW
1986 }
1987
d044b7a7
KW
1988 if (flags & ( UTF8_WARN_PERL_EXTENDED
1989 |UTF8_DISALLOW_PERL_EXTENDED))
ddb65933 1990 {
d044b7a7 1991 *errors |= UTF8_GOT_PERL_EXTENDED;
f9380377 1992
d044b7a7 1993 if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
f9380377
KW
1994 disallowed = TRUE;
1995 }
2b5e7bc2
KW
1996 }
1997 }
eb83ed87 1998
2b5e7bc2 1999 if (flags & UTF8_DISALLOW_SUPER) {
f9380377 2000 *errors |= UTF8_GOT_SUPER;
2b5e7bc2
KW
2001 disallowed = TRUE;
2002 }
2b5e7bc2
KW
2003 }
2004 else if (possible_problems & UTF8_GOT_NONCHAR) {
2005 possible_problems &= ~UTF8_GOT_NONCHAR;
ba210ebe 2006
f9380377
KW
2007 if (flags & UTF8_WARN_NONCHAR) {
2008 *errors |= UTF8_GOT_NONCHAR;
2009
2010 if ( ! (flags & UTF8_CHECK_ONLY)
37657a5b 2011 && (msgs || ckWARN_d(WARN_NONCHAR)))
f9380377 2012 {
2b5e7bc2
KW
2013 /* The code above should have guaranteed that we don't
2014 * get here with errors other than overlong */
2015 assert (! (orig_problems
2016 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
2017
2018 pack_warn = packWARN(WARN_NONCHAR);
c94c2f39 2019 message = Perl_form(aTHX_ nonchar_cp_format, uv);
37657a5b 2020 this_flag_bit = UTF8_GOT_NONCHAR;
f9380377 2021 }
2b5e7bc2 2022 }
5b311467 2023
2b5e7bc2
KW
2024 if (flags & UTF8_DISALLOW_NONCHAR) {
2025 disallowed = TRUE;
f9380377 2026 *errors |= UTF8_GOT_NONCHAR;
2b5e7bc2 2027 }
6c64cd9d
KW
2028 }
2029 else if (possible_problems & UTF8_GOT_LONG) {
2030 possible_problems &= ~UTF8_GOT_LONG;
2031 *errors |= UTF8_GOT_LONG;
2032
2033 if (flags & UTF8_ALLOW_LONG) {
2034
2035 /* We don't allow the actual overlong value, unless the
2036 * special extra bit is also set */
2037 if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
2038 & ~UTF8_ALLOW_LONG)))
2039 {
2040 uv = UNICODE_REPLACEMENT;
2041 }
2042 }
2043 else {
2044 disallowed = TRUE;
2045
37657a5b
KW
2046 if (( msgs
2047 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
2048 {
6c64cd9d
KW
2049 pack_warn = packWARN(WARN_UTF8);
2050
2051 /* These error types cause 'uv' to be something that
2052 * isn't what was intended, so can't use it in the
2053 * message. The other error types either can't
2054 * generate an overlong, or else the 'uv' is valid */
2055 if (orig_problems &
2056 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
2057 {
2058 message = Perl_form(aTHX_
2059 "%s: %s (any UTF-8 sequence that starts"
2060 " with \"%s\" is overlong which can and"
2061 " should be represented with a"
2062 " different, shorter sequence)",
2063 malformed_text,
2064 _byte_dump_string(s0, send - s0, 0),
2065 _byte_dump_string(s0, curlen, 0));
2066 }
2067 else {
2068 U8 tmpbuf[UTF8_MAXBYTES+1];
1be62ab9
KW
2069 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
2070 uv, 0);
d819dc50
KW
2071 /* Don't use U+ for non-Unicode code points, which
2072 * includes those in the Latin1 range */
8010ec0d 2073 const char * preface = ( UNICODE_IS_SUPER(uv)
d819dc50 2074#ifdef EBCDIC
8010ec0d 2075 || uv <= 0xFF
d819dc50 2076#endif
8010ec0d 2077 )
d819dc50
KW
2078 ? "0x"
2079 : "U+";
6c64cd9d
KW
2080 message = Perl_form(aTHX_
2081 "%s: %s (overlong; instead use %s to represent"
2082 " %s%0*" UVXf ")",
2083 malformed_text,
2084 _byte_dump_string(s0, send - s0, 0),
2085 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
2086 preface,
2087 ((uv < 256) ? 2 : 4), /* Field width of 2 for
2088 small code points */
1be62ab9 2089 UNI_TO_NATIVE(uv));
6c64cd9d 2090 }
37657a5b 2091 this_flag_bit = UTF8_GOT_LONG;
6c64cd9d
KW
2092 }
2093 }
2b5e7bc2
KW
2094 } /* End of looking through the possible flags */
2095
2096 /* Display the message (if any) for the problem being handled in
2097 * this iteration of the loop */
2098 if (message) {
37657a5b 2099 if (msgs) {
37657a5b
KW
2100 assert(this_flag_bit);
2101
2102 if (*msgs == NULL) {
2103 *msgs = newAV();
2104 }
2105
bb07812e
KW
2106 av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
2107 pack_warn,
2108 this_flag_bit)));
37657a5b
KW
2109 }
2110 else if (PL_op)
2b5e7bc2
KW
2111 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
2112 OP_DESC(PL_op));
2113 else
2114 Perl_warner(aTHX_ pack_warn, "%s", message);
2115 }
ddb65933 2116 } /* End of 'while (possible_problems)' */
a0dbb045 2117
2b5e7bc2
KW
2118 /* Since there was a possible problem, the returned length may need to
2119 * be changed from the one stored at the beginning of this function.
c33e0000 2120 * Instead of trying to figure out if it has changed, just do it. */
2b5e7bc2
KW
2121 if (retlen) {
2122 *retlen = curlen;
2123 }
a0dbb045 2124
2b5e7bc2
KW
2125 if (disallowed) {
2126 if (flags & UTF8_CHECK_ONLY && retlen) {
2127 *retlen = ((STRLEN) -1);
2128 }
2129 return 0;
2130 }
eb83ed87 2131 }
ba210ebe 2132
2b5e7bc2 2133 return UNI_TO_NATIVE(uv);
a0ed51b3
LW
2134}
2135
8e84507e 2136/*
ec5f19d0
KW
2137=for apidoc utf8_to_uvchr_buf
2138
2139Returns the native code point of the first character in the string C<s> which
2140is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 2141C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 2142
524080c4
KW
2143If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2144enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 2145C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 2146(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 2147C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 2148the next possible position in C<s> that could begin a non-malformed character.
de69f3af 2149See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 2150returned.
ec5f19d0
KW
2151
2152=cut
52be2536
KW
2153
2154Also implemented as a macro in utf8.h
2155
ec5f19d0
KW
2156*/
2157
2158
2159UV
2160Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2161{
7f974d7e
KW
2162 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
2163
9a9a6c98 2164 return utf8_to_uvchr_buf_helper(s, send, retlen);
ec5f19d0
KW
2165}
2166
b76347f2 2167/*
87cea99e 2168=for apidoc utf8_length
b76347f2 2169
b2e7ed74
KW
2170Returns the number of characters in the sequence of UTF-8-encoded bytes starting
2171at C<s> and ending at the byte just before C<e>. If <s> and <e> point to the
2172same place, it returns 0 with no warning raised.
2173
2174If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
2175and returns the number of valid characters.
b76347f2
JH
2176
2177=cut
2178*/
2179
2180STRLEN
35a4481c 2181Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
2182{
2183 STRLEN len = 0;
2184
7918f24d
NC
2185 PERL_ARGS_ASSERT_UTF8_LENGTH;
2186
8850bf83
JH
2187 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
2188 * the bitops (especially ~) can create illegal UTF-8.
2189 * In other words: in Perl UTF-8 is not just for Unicode. */
2190
b76347f2 2191 while (s < e) {
6a2e93d9
KW
2192 Ptrdiff_t expected_byte_count = UTF8SKIP(s);
2193
2194 if (UNLIKELY(e - s < expected_byte_count)) {
2195 goto warn_and_return;
2196 }
2197
1604cfb0 2198 len++;
6a2e93d9 2199 s += expected_byte_count;
8e91ec7f
AV
2200 }
2201
6a2e93d9
KW
2202 if (LIKELY(e == s)) {
2203 return len;
b76347f2
JH
2204 }
2205
6a2e93d9
KW
2206 /* Here, s > e on entry */
2207
2208 warn_and_return:
2209 if (PL_op)
2210 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2211 "%s in %s", unees, OP_DESC(PL_op));
2212 else
2213 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2214
b76347f2
JH
2215 return len;
2216}
2217
b06226ff 2218/*
fed3ba5d
NC
2219=for apidoc bytes_cmp_utf8
2220
a1433954 2221Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
2222sequence of characters (stored as UTF-8)
2223in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
2224equal, -1 or -2 if the first string is less than the second string, +1 or +2
2225if the first string is greater than the second string.
2226
2227-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
2228longer string. -2 or +2 is returned if
2229there was a difference between characters
fed3ba5d
NC
2230within the strings.
2231
2232=cut
2233*/
2234
2235int
2236Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
2237{
2238 const U8 *const bend = b + blen;
2239 const U8 *const uend = u + ulen;
2240
2241 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
2242
2243 while (b < bend && u < uend) {
2244 U8 c = *u++;
1604cfb0
MS
2245 if (!UTF8_IS_INVARIANT(c)) {
2246 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2247 if (u < uend) {
2248 U8 c1 = *u++;
2249 if (UTF8_IS_CONTINUATION(c1)) {
2250 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
2251 } else {
2b5e7bc2 2252 /* diag_listed_as: Malformed UTF-8 character%s */
1604cfb0 2253 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
56576a04
KW
2254 "%s %s%s",
2255 unexpected_non_continuation_text(u - 2, 2, 1, 2),
2256 PL_op ? " in " : "",
2257 PL_op ? OP_DESC(PL_op) : "");
1604cfb0
MS
2258 return -2;
2259 }
2260 } else {
2261 if (PL_op)
2262 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2263 "%s in %s", unees, OP_DESC(PL_op));
2264 else
2265 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2266 return -2; /* Really want to return undef :-) */
2267 }
2268 } else {
2269 return -2;
2270 }
2271 }
2272 if (*b != c) {
2273 return *b < c ? -2 : +2;
2274 }
2275 ++b;
fed3ba5d
NC
2276 }
2277
2278 if (b == bend && u == uend)
1604cfb0 2279 return 0;
fed3ba5d
NC
2280
2281 return b < bend ? +1 : -1;
2282}
2283
2284/*
87cea99e 2285=for apidoc utf8_to_bytes
6940069f 2286
3bc0c78c 2287Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
a1433954 2288Unlike L</bytes_to_utf8>, this over-writes the original string, and
09af0336 2289updates C<*lenp> to contain the new length.
3bc0c78c
KW
2290Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
2291
2292Upon successful return, the number of variants in the string can be computed by
23b37b12
KW
2293having saved the value of C<*lenp> before the call, and subtracting the
2294after-call value of C<*lenp> from it.
6940069f 2295
a1433954 2296If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 2297
6940069f
GS
2298=cut
2299*/
2300
2301U8 *
09af0336 2302Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
6940069f 2303{
9fe0d3c2 2304 U8 * first_variant;
246fae53 2305
7918f24d 2306 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 2307 PERL_UNUSED_CONTEXT;
7918f24d 2308
9fe0d3c2 2309 /* This is a no-op if no variants at all in the input */
09af0336 2310 if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
9fe0d3c2
KW
2311 return s;
2312 }
2313
9b6b0f24
KW
2314 /* Nothing before 'first_variant' needs to be changed, so start the real
2315 * work there */
2316
2317 U8 * const save = s;
2318 U8 * const send = s + *lenp;
2319 U8 * d;
2320
2321#ifndef EBCDIC /* The below relies on the bit patterns of UTF-8 */
2322
2323 /* There is some start-up/tear-down overhead with this, so no real gain
2324 * unless the string is long enough. The current value is just a
2325 * guess. */
2326 if (*lenp > 5 * PERL_WORDSIZE) {
2327
2328 /* First, go through the string a word at-a-time to verify that it is
2329 * downgradable. If it contains any start byte besides C2 and C3, then
2330 * it isn't. */
2331
2332 const PERL_UINTMAX_T C0_mask = PERL_COUNT_MULTIPLIER * 0xC0;
2333 const PERL_UINTMAX_T C2_mask = PERL_COUNT_MULTIPLIER * 0xC2;
2334 const PERL_UINTMAX_T FE_mask = PERL_COUNT_MULTIPLIER * 0xFE;
2335
2336 /* Points to the first byte >=s which is positioned at a word boundary.
2337 * If s is on a word boundary, it is s, otherwise it is the first byte
2338 * of the next word. */
2339 U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
2340 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK);
2341
2342 /* Here there is at least a full word beyond the first word boundary.
2343 * Process up to that boundary. */
2344 while (s < partial_word_end) {
3c5aa262
KW
2345 if (! UTF8_IS_INVARIANT(*s)) {
2346 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
09af0336 2347 *lenp = ((STRLEN) -1);
9b6b0f24 2348 return NULL;
3c5aa262
KW
2349 }
2350 s++;
d59937ca
KW
2351 }
2352 s++;
dcad2880 2353 }
dcad2880 2354
9b6b0f24
KW
2355 /* Adjust back down any overshoot */
2356 s = partial_word_end;
2357
2358 /* Process per-word */
2359 do {
2360
2361 PERL_UINTMAX_T C2_C3_start_bytes;
2362
2363 /* First find the bytes that are start bytes. ANDing with
2364 * C0C0...C0 causes any start byte to become C0; any other byte
2365 * becomes something else. Then XORing with C0 causes any start
2366 * byte to become 0; all other bytes non-zero. */
2367 PERL_UINTMAX_T start_bytes
2368 = ((* (PERL_UINTMAX_T *) s) & C0_mask) ^ C0_mask;
2369
2370 /* These shifts causes the most significant bit to be set to 1 for
2371 * any bytes in the word that aren't completely 0. Hence after
2372 * these, only the start bytes have 0 in their msb */
2373 start_bytes |= start_bytes << 1;
2374 start_bytes |= start_bytes << 2;
2375 start_bytes |= start_bytes << 4;
2376
2377 /* When we complement, then AND with 8080...80, the start bytes
2378 * will have 1 in their msb, and all other bits are 0 */
2379 start_bytes = ~ start_bytes & PERL_VARIANTS_WORD_MASK;
2380
2381 /* Now repeat the procedure, but look for bytes that match only
2382 * C2-C3. */
2383 C2_C3_start_bytes = ((* (PERL_UINTMAX_T *) s) & FE_mask)
2384 ^ C2_mask;
2385 C2_C3_start_bytes |= C2_C3_start_bytes << 1;
2386 C2_C3_start_bytes |= C2_C3_start_bytes << 2;
2387 C2_C3_start_bytes |= C2_C3_start_bytes << 4;
2388 C2_C3_start_bytes = ~ C2_C3_start_bytes
2389 & PERL_VARIANTS_WORD_MASK;
2390
2391 /* Here, start_bytes has a 1 in the msb of each byte that has a
2392 * start_byte; And
2393 * C2_C3_start_bytes has a 1 in the msb of each byte that has a
2394 * start_byte of C2 or C3
2395 * If they're not equal, there are start bytes that aren't C2
2396 * nor C3, hence this is not downgradable */
2397 if (start_bytes != C2_C3_start_bytes) {
2398 *lenp = ((STRLEN) -1);
2399 return NULL;
3c5aa262 2400 }
9b6b0f24
KW
2401
2402 s += PERL_WORDSIZE;
2403 } while (s + PERL_WORDSIZE <= send);
2404
2405 /* If the final byte was a start byte, it means that the character
2406 * straddles two words, so back off one to start looking below at the
2407 * first byte of the character */
2408 if (s > first_variant && UTF8_IS_START(*(s-1))) {
2409 s--;
3c5aa262 2410 }
9b6b0f24
KW
2411 }
2412
2413#endif
3c5aa262 2414
9b6b0f24
KW
2415 /* Do the straggler bytes beyond the final word boundary (or all bytes
2416 * in the case of EBCDIC) */
2417 while (s < send) {
2418 if (! UTF8_IS_INVARIANT(*s)) {
2419 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2420 *lenp = ((STRLEN) -1);
2421 return NULL;
2422 }
2423 s++;
2424 }
2425 s++;
9fe0d3c2 2426 }
9b6b0f24
KW
2427
2428 /* Here, we passed the tests above. For the EBCDIC case, everything
2429 * was well-formed and can be downgraded to non-UTF8. For non-EBCDIC,
2430 * it means only that all start bytes were C2 or C3, hence any
2431 * well-formed sequences are downgradable. But we didn't test, for
2432 * example, that there weren't two C2's in a row. That means that in
2433 * the loop below, we have to be sure things are well-formed. Because
2434 * this is very very likely, and we don't care about having speedy
2435 * handling of malformed input, the loop proceeds as if well formed,
2436 * and should a malformed one come along, it undoes what it already has
2437 * done */
2438
2439 d = s = first_variant;
2440
2441 while (s < send) {
2442 U8 * s1;
2443
2444 if (UVCHR_IS_INVARIANT(*s)) {
2445 *d++ = *s++;
2446 continue;
2447 }
2448
2449 /* Here it is two-byte encoded. */
2450 if ( LIKELY(UTF8_IS_DOWNGRADEABLE_START(*s))
2451 && LIKELY(UTF8_IS_CONTINUATION((s[1]))))
2452 {
2453 U8 first_byte = *s++;
2454 *d++ = EIGHT_BIT_UTF8_TO_NATIVE(first_byte, *s);
2455 s++;
2456 continue;
2457 }
2458
2459 /* Here, it is malformed. This shouldn't happen on EBCDIC, and on
2460 * ASCII platforms, we know that the only start bytes in the text
2461 * are C2 and C3, and the code above has made sure that it doesn't
2462 * end with a start byte. That means the only malformations that
2463 * are possible are a start byte without a continuation (either
2464 * followed by another start byte or an invariant) or an unexpected
2465 * continuation.
2466 *
2467 * We have to undo all we've done before, back down to the first
2468 * UTF-8 variant. Note that each 2-byte variant we've done so far
2469 * (converted to single byte) slides things to the left one byte,
2470 * and so we have bytes that haven't been written over.
2471 *
2472 * Here, 'd' points to the next position to overwrite, and 's'
2473 * points to the first invalid byte. That means 'd's contents
2474 * haven't been changed yet, nor has anything else beyond it in the
2475 * string. In restoring to the original contents, we don't need to
2476 * do anything past (d-1).
2477 *
2478 * In particular, the bytes from 'd' to 's' have not been changed.
2479 * This loop uses a new variable 's1' (to avoid confusing 'source'
2480 * and 'destination') set to 'd', and moves 's' and 's1' in lock
2481 * step back so that afterwards, 's1' points to the first changed
2482 * byte that will be the source for the first byte (or bytes) at
2483 * 's' that need to be changed back. Note that s1 can expand to
2484 * two bytes */
2485 s1 = d;
2486 while (s >= d) {
2487 s--;
2488 if (! UVCHR_IS_INVARIANT(*s1)) {
2489 s--;
2490 }
2491 s1--;
2492 }
2493
2494 /* Do the changing back */
2495 while (s1 >= first_variant) {
2496 if (UVCHR_IS_INVARIANT(*s1)) {
2497 *s-- = *s1--;
2498 }
2499 else {
2500 *s-- = UTF8_EIGHT_BIT_LO(*s1);
2501 *s-- = UTF8_EIGHT_BIT_HI(*s1);
2502 s1--;
2503 }
2504 }
2505
2506 *lenp = ((STRLEN) -1);
2507 return NULL;
2508 }
2509
2510 /* Success! */
2511 *d = '\0';
2512 *lenp = d - save;
2513
2514 return save;
6940069f
GS
2515}
2516
2517/*
87cea99e 2518=for apidoc bytes_from_utf8
f9a63242 2519
09af0336 2520Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
41ae6089 2521byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
4f3d592d
KW
2522actually encoded in UTF-8.
2523
2524Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2525the input string.
2526
41ae6089
KW
2527Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2528not expressible in native byte encoding. In these cases, C<*is_utf8p> and
09af0336 2529C<*lenp> are unchanged, and the return value is the original C<s>.
4f3d592d 2530
41ae6089 2531Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
4f3d592d 2532newly created string containing a downgraded copy of C<s>, and whose length is
9ff99fb3
KW
2533returned in C<*lenp>, updated. The new string is C<NUL>-terminated. The
2534caller is responsible for arranging for the memory used by this string to get
2535freed.
f9a63242 2536
3bc0c78c 2537Upon successful return, the number of variants in the string can be computed by
23b37b12
KW
2538having saved the value of C<*lenp> before the call, and subtracting the
2539after-call value of C<*lenp> from it.
3bc0c78c 2540
37607a96 2541=cut
976c1b08
KW
2542
2543There is a macro that avoids this function call, but this is retained for
2544anyone who calls it with the Perl_ prefix */
f9a63242
JH
2545
2546U8 *
41ae6089 2547Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
f9a63242 2548{
7918f24d 2549 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 2550 PERL_UNUSED_CONTEXT;
f9a63242 2551
976c1b08
KW
2552 return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2553}
2554
2555/*
df6bd76f 2556=for apidoc bytes_from_utf8_loc
976c1b08 2557
eda578be
KW
2558Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer
2559to where to store the location of the first character in C<"s"> that cannot be
976c1b08
KW
2560converted to non-UTF8.
2561
2562If that parameter is C<NULL>, this function behaves identically to
2563C<bytes_from_utf8>.
2564
2565Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2566C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2567
2568Otherwise, the function returns a newly created C<NUL>-terminated string
2569containing the non-UTF8 equivalent of the convertible first portion of
2570C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>.
2571If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2572and C<*first_non_downgradable> is set to C<NULL>.
2573
8505db87 2574Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
976c1b08
KW
2575first character in the original string that wasn't converted. C<*is_utf8p> is
2576unchanged. Note that the new string may have length 0.
2577
2578Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2579C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2580converts as many characters in it as possible stopping at the first one it
385b74be 2581finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is
976c1b08
KW
2582set to point to that. The function returns the portion that could be converted
2583in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2584not including the terminating C<NUL>. If the very first character in the
2585original could not be converted, C<*lenp> will be 0, and the new string will
2586contain just a single C<NUL>. If the entire input string was converted,
2587C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2588
2589Upon successful return, the number of variants in the converted portion of the
2590string can be computed by having saved the value of C<*lenp> before the call,
2591and subtracting the after-call value of C<*lenp> from it.
2592
2593=cut
2594
2595
2596*/
2597
2598U8 *
2599Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2600{
2601 U8 *d;
2602 const U8 *original = s;
2603 U8 *converted_start;
2604 const U8 *send = s + *lenp;
f9a63242 2605
976c1b08 2606 PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
170a1c22 2607
976c1b08
KW
2608 if (! *is_utf8p) {
2609 if (first_unconverted) {
2610 *first_unconverted = NULL;
2611 }
2612
2613 return (U8 *) original;
2614 }
2615
2616 Newx(d, (*lenp) + 1, U8);
2617
2618 converted_start = d;
7299a045
KW
2619 while (s < send) {
2620 U8 c = *s++;
2621 if (! UTF8_IS_INVARIANT(c)) {
976c1b08
KW
2622
2623 /* Then it is multi-byte encoded. If the code point is above 0xFF,
2624 * have to stop now */
2625 if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2626 if (first_unconverted) {
2627 *first_unconverted = s - 1;
2628 goto finish_and_return;
2629 }
2630 else {
2631 Safefree(converted_start);
2632 return (U8 *) original;
2633 }
2634 }
2635
7299a045
KW
2636 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2637 s++;
38af28cf 2638 }
7299a045
KW
2639 *d++ = c;
2640 }
170a1c22 2641
976c1b08
KW
2642 /* Here, converted the whole of the input */
2643 *is_utf8p = FALSE;
2644 if (first_unconverted) {
2645 *first_unconverted = NULL;
170a1c22 2646 }
976c1b08
KW
2647
2648 finish_and_return:
46a08a6f
KW
2649 *d = '\0';
2650 *lenp = d - converted_start;
976c1b08
KW
2651
2652 /* Trim unused space */
2653 Renew(converted_start, *lenp + 1, U8);
2654
2655 return converted_start;
f9a63242
JH
2656}
2657
2658/*
87cea99e 2659=for apidoc bytes_to_utf8
6940069f 2660
09af0336 2661Converts a string C<s> of length C<*lenp> bytes from the native encoding into
ff97e5cf 2662UTF-8.
09af0336 2663Returns a pointer to the newly-created string, and sets C<*lenp> to
9ff99fb3
KW
2664reflect the new length in bytes. The caller is responsible for arranging for
2665the memory used by this string to get freed.
6940069f 2666
3bc0c78c 2667Upon successful return, the number of variants in the string can be computed by
23b37b12 2668having saved the value of C<*lenp> before the call, and subtracting it from the
3bc0c78c
KW
2669after-call value of C<*lenp>.
2670
75200dff 2671A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
2672
2673If you want to convert to UTF-8 from encodings other than
2674the native (Latin1 or EBCDIC),
a1433954 2675see L</sv_recode_to_utf8>().
c9ada85f 2676
497711e7 2677=cut
6940069f
GS
2678*/
2679
2680U8*
09af0336 2681Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
6940069f 2682{
09af0336 2683 const U8 * const send = s + (*lenp);
6940069f
GS
2684 U8 *d;
2685 U8 *dst;
7918f24d
NC
2686
2687 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 2688 PERL_UNUSED_CONTEXT;
6940069f 2689
d4662719
KW
2690 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
2691 Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
6940069f
GS
2692 dst = d;
2693
2694 while (s < send) {
55d09dc8
KW
2695 append_utf8_from_native_byte(*s, &d);
2696 s++;
6940069f 2697 }
2e11cf67 2698
6940069f 2699 *d = '\0';
09af0336 2700 *lenp = d-dst;
2e11cf67 2701
6940069f
GS
2702 return dst;
2703}
2704
a0ed51b3 2705/*
5fd26678
KW
2706 * Convert native UTF-16 to UTF-8. Called via the more public functions
2707 * utf16_to_utf8() for big-endian and utf16_to_utf8_reversed() for
2708 * little-endian,
a0ed51b3 2709 *
5fd26678
KW
2710 * 'p' is the UTF-16 input string, passed as a pointer to U8.
2711 * 'bytelen' is its length (must be even)
2712 * 'd' is the pointer to the destination buffer. The caller must ensure that
2713 * the space is large enough. The maximum expansion factor is 2 times
2714 * 'bytelen'. 1.5 if never going to run on an EBCDIC box.
2715 * '*newlen' will contain the number of bytes this function filled of 'd'.
2716 * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2717 * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE
624504c5 2718 *
5fd26678
KW
2719 * The expansion factor is because UTF-16 requires 2 bytes for every code point
2720 * below 0x10000; otherwise 4 bytes. UTF-8 requires 1-3 bytes for every code
2721 * point below 0x1000; otherwise 4 bytes. UTF-EBCDIC requires 1-4 bytes for
2722 * every code point below 0x1000; otherwise 4-5 bytes.
624504c5 2723 *
5fd26678
KW
2724 * The worst case is where every code point is below U+10000, hence requiring 2
2725 * UTF-16 bytes, but is U+0800 or higher on ASCII platforms, requiring 3 UTF-8
2726 * bytes; or >= U+4000 on EBCDIC requiring 4 UTF-8 bytes.
2727 *
2728 * Do not use in-place. */
a0ed51b3
LW
2729
2730U8*
5fd26678
KW
2731Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen,
2732 const bool high_byte, /* Which of next two bytes is
2733 high order */
2734 const bool low_byte)
a0ed51b3 2735{
dea0fc0b
JH
2736 U8* pend;
2737 U8* dstart = d;
2738
5fd26678 2739 PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE;
7918f24d 2740
dea0fc0b 2741 if (bytelen & 1)
5fd26678
KW
2742 Perl_croak(aTHX_ "panic: utf16_to_utf8%s: odd bytelen %" UVuf,
2743 ((high_byte == 0) ? "" : "_reversed"), (UV)bytelen);
dea0fc0b
JH
2744 pend = p + bytelen;
2745
a0ed51b3 2746 while (p < pend) {
2fbb06c1 2747
5fd26678
KW
2748 /* Next 16 bits is what we want. (The bool is cast to U8 because on
2749 * platforms where a bool is implemented as a signed char, a compiler
2750 * warning may be generated) */
2751 U32 uv = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
1604cfb0 2752 p += 2;
2fbb06c1
KW
2753
2754 /* If it's a surrogate, we find the uv that the surrogate pair encodes.
2755 * */
2756 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
ffd0a9d3 2757
46956fad
KW
2758#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2759#define LAST_HIGH_SURROGATE 0xDBFF
2760#define FIRST_LOW_SURROGATE 0xDC00
2761#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
ffd0a9d3 2762#define FIRST_IN_PLANE1 0x10000
e23c50db 2763
e23c50db
KW
2764 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2765 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2766 }
1604cfb0 2767 else {
5fd26678
KW
2768 U32 low_surrogate = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2769 if (UNLIKELY(! inRANGE(low_surrogate, FIRST_LOW_SURROGATE,
2770 LAST_LOW_SURROGATE)))
e23c50db 2771 {
1604cfb0 2772 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db 2773 }
5fd26678 2774
1604cfb0 2775 p += 2;
5fd26678
KW
2776
2777 /* Here uv is the high surrogate. Combine with low surrogate
2778 * just computed to form the actual U32 code point.
2779 *
2780 * From https://unicode.org/faq/utf_bom.html#utf16-4 */
2781 uv = FIRST_IN_PLANE1 + (uv << 10) - (FIRST_HIGH_SURROGATE << 10)
2782 + low_surrogate - FIRST_LOW_SURROGATE;
1604cfb0
MS
2783 }
2784 }
2fbb06c1 2785
5fd26678 2786 /* Here, 'uv' is the real U32 we want to find the UTF-8 of */
3c2b2fcb 2787 d = uvchr_to_utf8(d, uv);
a0ed51b3 2788 }
2fbb06c1 2789
dea0fc0b 2790 *newlen = d - dstart;
a0ed51b3
LW
2791 return d;
2792}
2793
5fd26678
KW
2794U8*
2795Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2796{
2797 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2798
2799 return utf16_to_utf8(p, d, bytelen, newlen);
2800}
a0ed51b3
LW
2801
2802U8*
f46dcac2 2803Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
a0ed51b3 2804{
7918f24d
NC
2805 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2806
5fd26678 2807 return utf16_to_utf8_reversed(p, d, bytelen, newlen);
a0ed51b3
LW
2808}
2809
6af810c4
KW
2810/*
2811 * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for
2812 * big-endian and utf8_to_utf16_reversed() for little-endian,
2813 *
2814 * 's' is the UTF-8 input string, passed as a pointer to U8.
2815 * 'bytelen' is its length
2816 * 'd' is the pointer to the destination buffer, currently passed as U8 *. The
2817 * caller must ensure that the space is large enough. The maximum
2818 * expansion factor is 2 times 'bytelen'. This happens when the input is
2819 * entirely single-byte ASCII, expanding to two-byte UTF-16.
2820 * '*newlen' will contain the number of bytes this function filled of 'd'.
2821 * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2822 * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE
2823 *
2824 * Do not use in-place. */
2825U8*
2826Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen,
2827 const bool high_byte, /* Which of next two bytes
2828 is high order */
2829 const bool low_byte)
2830{
2831 U8* send;
2832 U8* dstart = d;
2833
2834 PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE;
2835
2836 send = s + bytelen;
2837
2838 while (s < send) {
2839 STRLEN retlen;
3c2b2fcb 2840 UV uv = utf8n_to_uvchr(s, send - s, &retlen,
6af810c4 2841 /* No surrogates nor above-Unicode */
3c2b2fcb 2842 UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
6af810c4
KW
2843
2844 /* The modern method is to keep going with malformed input,
2845 * substituting the REPLACEMENT CHARACTER */
2846 if (UNLIKELY(uv == 0 && *s != '\0')) {
2847 uv = UNICODE_REPLACEMENT;
2848 }
2849
2850 if (uv >= FIRST_IN_PLANE1) { /* Requires a surrogate pair */
2851
2852 /* From https://unicode.org/faq/utf_bom.html#utf16-4 */
2853 U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10)
2854 + FIRST_HIGH_SURROGATE;
2855
2856 /* (The bool is cast to U8 because on platforms where a bool is
2857 * implemented as a signed char, a compiler warning may be
2858 * generated) */
2859 d[(U8) high_byte] = high_surrogate >> 8;
2860 d[(U8) low_byte] = high_surrogate & nBIT_MASK(8);
2861 d += 2;
2862
2863 /* The low surrogate is the lower 10 bits plus the offset */
2864 uv &= nBIT_MASK(10);
2865 uv += FIRST_LOW_SURROGATE;
2866
2867 /* Drop down to output the low surrogate like it were a
2868 * non-surrogate */
2869 }
2870
2871 d[(U8) high_byte] = uv >> 8;
2872 d[(U8) low_byte] = uv & nBIT_MASK(8);
2873 d += 2;
2874
2875 s += retlen;
2876 }
2877
2878 *newlen = d - dstart;
2879 return d;
2880}
2881
922e8cb4
KW
2882bool
2883Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2884{
dc31b55c 2885 return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
922e8cb4
KW
2886}
2887
5092f92a 2888bool
eba68aa0
KW
2889Perl__is_uni_perl_idcont(pTHX_ UV c)
2890{
c12658c9 2891 return _invlist_contains_cp(PL_utf8_perl_idcont, c);
eba68aa0
KW
2892}
2893
2894bool
f91dcd13
KW
2895Perl__is_uni_perl_idstart(pTHX_ UV c)
2896{
c12658c9 2897 return _invlist_contains_cp(PL_utf8_perl_idstart, c);
f91dcd13
KW
2898}
2899
3a4c58c9 2900UV
56576a04
KW
2901Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2902 const char S_or_s)
3a4c58c9
KW
2903{
2904 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2905 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
2906 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2907 * either "SS" or "Ss". Which one to use is passed into the routine in
2908 * 'S_or_s' to avoid a test */
2909
2910 UV converted = toUPPER_LATIN1_MOD(c);
2911
2912 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2913
2914 assert(S_or_s == 'S' || S_or_s == 's');
2915
6f2d5cbc 2916 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
1604cfb0
MS
2917 characters in this range */
2918 *p = (U8) converted;
2919 *lenp = 1;
2920 return converted;
3a4c58c9
KW
2921 }
2922
2923 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2924 * which it maps to one of them, so as to only have to have one check for
2925 * it in the main case */
2926 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1604cfb0
MS
2927 switch (c) {
2928 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2929 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2930 break;
2931 case MICRO_SIGN:
2932 converted = GREEK_CAPITAL_LETTER_MU;
2933 break;
79e064b9
KW
2934#if UNICODE_MAJOR_VERSION > 2 \
2935 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2936 && UNICODE_DOT_DOT_VERSION >= 8)
1604cfb0
MS
2937 case LATIN_SMALL_LETTER_SHARP_S:
2938 *(p)++ = 'S';
2939 *p = S_or_s;
2940 *lenp = 2;
2941 return 'S';
79e064b9 2942#endif
1604cfb0
MS
2943 default:
2944 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
56576a04
KW
2945 " '%c' to map to '%c'",
2946 c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
1604cfb0
MS
2947 NOT_REACHED; /* NOTREACHED */
2948 }
3a4c58c9
KW
2949 }
2950
2951 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2952 *p = UTF8_TWO_BYTE_LO(converted);
2953 *lenp = 2;
2954
2955 return converted;
2956}
2957
fe63c520
KW
2958/* If compiled on an early Unicode version, there may not be auxiliary tables
2959 * */
2960#ifndef HAS_UC_AUX_TABLES
2961# define UC_AUX_TABLE_ptrs NULL
2962# define UC_AUX_TABLE_lengths NULL
2963#endif
2964#ifndef HAS_TC_AUX_TABLES
2965# define TC_AUX_TABLE_ptrs NULL
2966# define TC_AUX_TABLE_lengths NULL
2967#endif
2968#ifndef HAS_LC_AUX_TABLES
2969# define LC_AUX_TABLE_ptrs NULL
2970# define LC_AUX_TABLE_lengths NULL
2971#endif
2972#ifndef HAS_CF_AUX_TABLES
2973# define CF_AUX_TABLE_ptrs NULL
2974# define CF_AUX_TABLE_lengths NULL
2975#endif
fe63c520 2976
50bda2c3
KW
2977/* Call the function to convert a UTF-8 encoded character to the specified case.
2978 * Note that there may be more than one character in the result.
6fa2f9bc
KW
2979 * 's' is a pointer to the first byte of the input character
2980 * 'd' will be set to the first byte of the string of changed characters. It
50bda2c3 2981 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
6fa2f9bc 2982 * 'lenp' will be set to the length in bytes of the string of changed characters
50bda2c3 2983 *
56576a04 2984 * The functions return the ordinal of the first character in the string of
6fa2f9bc 2985 * 'd' */
56576a04 2986#define CALL_UPPER_CASE(uv, s, d, lenp) \
8946fcd9
KW
2987 _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper, \
2988 Uppercase_Mapping_invmap, \
2989 UC_AUX_TABLE_ptrs, \
2990 UC_AUX_TABLE_lengths, \
2991 "uppercase")
56576a04 2992#define CALL_TITLE_CASE(uv, s, d, lenp) \
8946fcd9
KW
2993 _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle, \
2994 Titlecase_Mapping_invmap, \
2995 TC_AUX_TABLE_ptrs, \
2996 TC_AUX_TABLE_lengths, \
2997 "titlecase")
56576a04 2998#define CALL_LOWER_CASE(uv, s, d, lenp) \
8946fcd9
KW
2999 _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower, \
3000 Lowercase_Mapping_invmap, \
3001 LC_AUX_TABLE_ptrs, \
3002 LC_AUX_TABLE_lengths, \
3003 "lowercase")
3004
50bda2c3 3005
b9992569
KW
3006/* This additionally has the input parameter 'specials', which if non-zero will
3007 * cause this to use the specials hash for folding (meaning get full case
50bda2c3 3008 * folding); otherwise, when zero, this implies a simple case fold */
56576a04 3009#define CALL_FOLD_CASE(uv, s, d, lenp, specials) \
8946fcd9
KW
3010 (specials) \
3011 ? _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold, \
3012 Case_Folding_invmap, \
3013 CF_AUX_TABLE_ptrs, \
3014 CF_AUX_TABLE_lengths, \
3015 "foldcase") \
3016 : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold, \
3017 Simple_Case_Folding_invmap, \
3018 NULL, NULL, \
3019 "foldcase")
c3fd2246 3020
84afefe6
JH
3021UV
3022Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 3023{
a1433954
KW
3024 /* Convert the Unicode character whose ordinal is <c> to its uppercase
3025 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
3026 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
3027 * the changed version may be longer than the original character.
3028 *
3029 * The ordinal of the first character of the changed version is returned
3030 * (but note, as explained above, that there may be more.) */
3031
7918f24d
NC
3032 PERL_ARGS_ASSERT_TO_UNI_UPPER;
3033
3a4c58c9 3034 if (c < 256) {
1604cfb0 3035 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
3a4c58c9
KW
3036 }
3037
a13f1de4 3038 return CALL_UPPER_CASE(c, NULL, p, lenp);
a0ed51b3
LW
3039}
3040
84afefe6
JH
3041UV
3042Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 3043{
7918f24d
NC
3044 PERL_ARGS_ASSERT_TO_UNI_TITLE;
3045
3a4c58c9 3046 if (c < 256) {
1604cfb0 3047 return _to_upper_title_latin1((U8) c, p, lenp, 's');
3a4c58c9
KW
3048 }
3049
a13f1de4 3050 return CALL_TITLE_CASE(c, NULL, p, lenp);
a0ed51b3
LW
3051}
3052
afc16117 3053STATIC U8
eaf412bf 3054S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
afc16117
KW
3055{
3056 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 3057 * those, converting the result to UTF-8. Since the result is always just
a1433954 3058 * one character, we allow <p> to be NULL */
afc16117
KW
3059
3060 U8 converted = toLOWER_LATIN1(c);
3061
eaf412bf
KW
3062 PERL_UNUSED_ARG(dummy);
3063
afc16117 3064 if (p != NULL) {
1604cfb0
MS
3065 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
3066 *p = converted;
3067 *lenp = 1;
3068 }
3069 else {
430c9760
KW
3070 /* Result is known to always be < 256, so can use the EIGHT_BIT
3071 * macros */
1604cfb0
MS
3072 *p = UTF8_EIGHT_BIT_HI(converted);
3073 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
3074 *lenp = 2;
3075 }
afc16117
KW
3076 }
3077 return converted;
3078}
3079
84afefe6
JH
3080UV
3081Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 3082{
7918f24d
NC
3083 PERL_ARGS_ASSERT_TO_UNI_LOWER;
3084
afc16117 3085 if (c < 256) {
1604cfb0 3086 return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
bca00c02
KW
3087 }
3088
a13f1de4 3089 return CALL_LOWER_CASE(c, NULL, p, lenp);
a0ed51b3
LW
3090}
3091
84afefe6 3092UV
7c0ab950 3093Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 3094{
51910141 3095 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 3096 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 3097 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
3098 *
3099 * Not to be used for locale folds
51910141 3100 */
f673fad4 3101
a1dde8de
KW
3102 UV converted;
3103
3104 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
3105
1ca267a5
KW
3106 assert (! (flags & FOLD_FLAGS_LOCALE));
3107
659a7c2d 3108 if (UNLIKELY(c == MICRO_SIGN)) {
1604cfb0 3109 converted = GREEK_SMALL_LETTER_MU;
a1dde8de 3110 }
9b63e895
KW
3111#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
3112 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
3113 || UNICODE_DOT_DOT_VERSION > 0)
659a7c2d
KW
3114 else if ( (flags & FOLD_FLAGS_FULL)
3115 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
3116 {
1ca267a5
KW
3117 /* If can't cross 127/128 boundary, can't return "ss"; instead return
3118 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
3119 * under those circumstances. */
3120 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
c5b28134 3121 *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
1ca267a5
KW
3122 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3123 p, *lenp, U8);
3124 return LATIN_SMALL_LETTER_LONG_S;
3125 }
3126 else {
4f489194
KW
3127 *(p)++ = 's';
3128 *p = 's';
3129 *lenp = 2;
3130 return 's';
1ca267a5 3131 }
a1dde8de 3132 }
9b63e895 3133#endif
a1dde8de
KW
3134 else { /* In this range the fold of all other characters is their lower
3135 case */
1604cfb0 3136 converted = toLOWER_LATIN1(c);
a1dde8de
KW
3137 }
3138
6f2d5cbc 3139 if (UVCHR_IS_INVARIANT(converted)) {
1604cfb0
MS
3140 *p = (U8) converted;
3141 *lenp = 1;
a1dde8de
KW
3142 }
3143 else {
1604cfb0
MS
3144 *(p)++ = UTF8_TWO_BYTE_HI(converted);
3145 *p = UTF8_TWO_BYTE_LO(converted);
3146 *lenp = 2;
a1dde8de
KW
3147 }
3148
3149 return converted;
3150}
3151
3152UV
31f05a37 3153Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 3154{
4b593389 3155
a0270393
KW
3156 /* Not currently externally documented, and subject to change
3157 * <flags> bits meanings:
3158 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
3159 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3160 * locale are to be used.
a0270393
KW
3161 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3162 */
4b593389 3163
36bb2ab6 3164 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 3165
780fcc9f 3166 if (flags & FOLD_FLAGS_LOCALE) {
b257a28c
KW
3167 /* Treat a non-Turkic UTF-8 locale as not being in locale at all,
3168 * except for potentially warning */
1629a27e 3169 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
b257a28c 3170 if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) {
780fcc9f
KW
3171 flags &= ~FOLD_FLAGS_LOCALE;
3172 }
3173 else {
e7b7ac46 3174 goto needs_full_generality;
780fcc9f 3175 }
31f05a37
KW
3176 }
3177
a1dde8de 3178 if (c < 256) {
e7b7ac46 3179 return _to_fold_latin1((U8) c, p, lenp,
1604cfb0 3180 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
3181 }
3182
2f306ab9 3183 /* Here, above 255. If no special needs, just use the macro */
a0270393 3184 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
1604cfb0 3185 return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
a0270393 3186 }
567b353c 3187 else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
1604cfb0
MS
3188 the special flags. */
3189 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
3190
3191 needs_full_generality:
1604cfb0 3192 uvchr_to_utf8(utf8_c, c);
298e8f0e 3193 return _toFOLD_utf8_flags(utf8_c, utf8_c + C_ARRAY_LENGTH(utf8_c),
56576a04 3194 p, lenp, flags);
a0270393 3195 }
84afefe6
JH
3196}
3197
26483009 3198PERL_STATIC_INLINE bool
dd1a3ba7
KW
3199S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e,
3200 SV* const invlist)
da8c1a98
KW
3201{
3202 /* returns a boolean giving whether or not the UTF8-encoded character that
eb1f4bb4
KW
3203 * starts at <p>, and extending no further than <e - 1> is in the inversion
3204 * list <invlist>. */
da8c1a98 3205
b68ffe0c
KW
3206 UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
3207
dd1a3ba7 3208 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
da8c1a98 3209
b68ffe0c 3210 if (cp == 0 && (p >= e || *p != '\0')) {
da8c1a98
KW
3211 _force_out_malformed_utf8_message(p, e, 0, 1);
3212 NOT_REACHED; /* NOTREACHED */
3213 }
3214
eb1f4bb4 3215 assert(invlist);
b68ffe0c 3216 return _invlist_contains_cp(invlist, cp);
da8c1a98
KW
3217}
3218
059703b0 3219#if 0 /* Not currently used, but may be needed in the future */
dd1a3ba7
KW
3220PERLVAR(I, seen_deprecated_macro, HV *)
3221
34aeb2e9
KW
3222STATIC void
3223S_warn_on_first_deprecated_use(pTHX_ const char * const name,
3224 const char * const alternative,
3225 const bool use_locale,
3226 const char * const file,
3227 const unsigned line)
3228{
3229 const char * key;
3230
3231 PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
3232
3233 if (ckWARN_d(WARN_DEPRECATED)) {
3234
3235 key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
1604cfb0 3236 if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
34aeb2e9
KW
3237 if (! PL_seen_deprecated_macro) {
3238 PL_seen_deprecated_macro = newHV();
3239 }
3240 if (! hv_store(PL_seen_deprecated_macro, key,
3241 strlen(key), &PL_sv_undef, 0))
3242 {
1604cfb0 3243 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
34aeb2e9
KW
3244 }
3245
c44e9413 3246 if (instr(file, "mathoms.c")) {
607313a1 3247 Perl_warner(aTHX_ WARN_DEPRECATED,
5203d63d 3248 "In %s, line %d, starting in Perl v5.32, %s()"
607313a1
KW
3249 " will be removed. Avoid this message by"
3250 " converting to use %s().\n",
3251 file, line, name, alternative);
3252 }
3253 else {
34aeb2e9 3254 Perl_warner(aTHX_ WARN_DEPRECATED,
5203d63d 3255 "In %s, line %d, starting in Perl v5.32, %s() will"
34aeb2e9
KW
3256 " require an additional parameter. Avoid this"
3257 " message by converting to use %s().\n",
3258 file, line, name, alternative);
607313a1 3259 }
34aeb2e9
KW
3260 }
3261 }
3262}
059703b0 3263#endif
922e8cb4
KW
3264
3265bool
dd1a3ba7 3266Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
da8c1a98 3267{
dd1a3ba7 3268 PERL_ARGS_ASSERT__IS_UTF8_FOO;
da8c1a98 3269
dd1a3ba7 3270 return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]);
da8c1a98
KW
3271}
3272
3273bool
dd1a3ba7 3274Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e)
da8c1a98 3275{
dd1a3ba7 3276 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
da8c1a98 3277
dd1a3ba7 3278 return is_utf8_common(p, e, PL_utf8_perl_idstart);
da8c1a98
KW
3279}
3280
3281bool
dd1a3ba7 3282Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
c11ff943 3283{
dd1a3ba7 3284 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
7dbf68d2 3285
dd1a3ba7 3286 return is_utf8_common(p, e, PL_utf8_perl_idcont);
7dbf68d2
KW
3287}
3288
6a4a25f4 3289STATIC UV
8447f104
KW
3290S_to_case_cp_list(pTHX_
3291 const UV original,
3292 const U32 ** const remaining_list,
3293 Size_t * remaining_count,
3294 SV *invlist, const I32 * const invmap,
3295 const U32 * const * const aux_tables,
3296 const U8 * const aux_table_lengths,
3297 const char * const normal)
b9992569 3298{
7b47c615
KW
3299 SSize_t index;
3300 I32 base;
3301
8447f104
KW
3302 /* Calculate the changed case of code point 'original'. The first code
3303 * point of the changed case is returned.
57aed126
KW
3304 *
3305 * If 'remaining_count' is not NULL, *remaining_count will be set to how
8447f104 3306 * many *other* code points are in the changed case. If non-zero and
57aed126
KW
3307 * 'remaining_list' is also not NULL, *remaining_list will be set to point
3308 * to a non-modifiable array containing the second and potentially third
3309 * code points in the changed case. (Unicode guarantees a maximum of 3.)
3310 * Note that this means that *remaining_list is undefined unless there are
3311 * multiple code points, and the caller has chosen to find out how many by
3312 * making 'remaining_count' not NULL.
30613bdc 3313 *
7b47c615
KW
3314 * 'normal' is a string to use to name the new case in any generated
3315 * messages, as a fallback if the operation being used is not available.
3316 *
3317 * The casing to use is given by the data structures in the remaining
3318 * arguments.
3319 */
30613bdc 3320
7b47c615
KW
3321 PERL_ARGS_ASSERT_TO_CASE_CP_LIST;
3322
265ec345
KW
3323 /* 'index' is guaranteed to be non-negative, as this is an inversion map
3324 * that covers all possible inputs. See [perl #133365] */
3325 index = _invlist_search(invlist, original);
3326 base = invmap[index];
36eaa811 3327
57aed126
KW
3328 /* Most likely, the case change will contain just a single code point */
3329 if (remaining_count) {
3330 *remaining_count = 0;
3331 }
3332
a2475cdc 3333 if (LIKELY(base == 0)) { /* 0 => original was unchanged by casing */
36eaa811 3334
a2475cdc
KW
3335 /* At this bottom level routine is where we warn about illegal code
3336 * points */
3337 if (isUNICODE_POSSIBLY_PROBLEMATIC(original)) {
265ec345
KW
3338 if (UNLIKELY(UNICODE_IS_SURROGATE(original))) {
3339 if (ckWARN_d(WARN_SURROGATE)) {
3340 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3341 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3342 "Operation \"%s\" returns its argument for"
3343 " UTF-16 surrogate U+%04" UVXf, desc, original);
3bfc1e70 3344 }
a2475cdc 3345 }
265ec345
KW
3346 else if (UNLIKELY(UNICODE_IS_SUPER(original))) {
3347 if (UNLIKELY(original > MAX_LEGAL_CP)) {
3348 Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, original));
3349 }
3350 if (ckWARN_d(WARN_NON_UNICODE)) {
3351 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3352 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
3353 "Operation \"%s\" returns its argument for"
3354 " non-Unicode code point 0x%04" UVXf, desc, original);
3355 }
4a8240a3 3356 }
4a8240a3 3357
265ec345
KW
3358 /* Note that non-characters are perfectly legal, so no warning
3359 * should be given. */
8946fcd9 3360 }
1feea2c7 3361
265ec345
KW
3362 return original;
3363 }
3364
3365 if (LIKELY(base > 0)) { /* means original mapped to a single code point,
3366 different from itself */
265ec345
KW
3367 return base + original - invlist_array(invlist)[index];
3368 }
7b47c615 3369
265ec345
KW
3370 /* Here 'base' is negative. That means the mapping is 1-to-many, and
3371 * requires an auxiliary table look up. abs(base) gives the index into a
3372 * list of such tables which points to the proper aux table. And a
3373 * parallel list gives the length of each corresponding aux table. Skip
3374 * the first entry in the *remaining returns, as it is returned by the
3375 * function. */
3376 base = -base;
57aed126
KW
3377 if (remaining_count) {
3378 *remaining_count = (Size_t) (aux_table_lengths[base] - 1);
3379
3380 if (remaining_list) {
3381 *remaining_list = aux_tables[base] + 1;
3382 }
3383 }
30613bdc 3384
265ec345 3385 return (UV) aux_tables[base][0];
7b47c615 3386}
8946fcd9 3387
7b47c615
KW
3388STATIC UV
3389S__to_utf8_case(pTHX_ const UV original, const U8 *p,
3390 U8* ustrp, STRLEN *lenp,
3391 SV *invlist, const I32 * const invmap,
3392 const U32 * const * const aux_tables,
3393 const U8 * const aux_table_lengths,
3394 const char * const normal)
3395{
3396 /* Change the case of code point 'original'. If 'p' is non-NULL, it points to
3397 * the beginning of the (assumed to be valid) UTF-8 representation of
3398 * 'original'. 'normal' is a string to use to name the new case in any
3399 * generated messages, as a fallback if the operation being used is not
3400 * available. The new case is given by the data structures in the
3401 * remaining arguments.
3402 *
3403 * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
3404 * entire changed case string, and the return value is the first code point
3405 * in that string
3406 *
3407 * Note that the <ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes
3408 * since the changed version may be longer than the original character. */
3409
3410 const U32 * remaining_list;
3411 Size_t remaining_count;
3412 UV first = to_case_cp_list(original,
3413 &remaining_list, &remaining_count,
3414 invlist, invmap,
3415 aux_tables, aux_table_lengths,
3416 normal);
3417
3418 PERL_ARGS_ASSERT__TO_UTF8_CASE;
3419
3420 /* If the code point maps to itself and we already have its representation,
3421 * copy it instead of recalculating */
3422 if (original == first && p) {
3423 *lenp = UTF8SKIP(p);
cbe07460 3424
a13f1de4 3425 if (p != ustrp) { /* Don't copy onto itself */
7b47c615 3426 Copy(p, ustrp, *lenp, U8);
a13f1de4 3427 }
a13f1de4
KW
3428 }
3429 else {
7b47c615
KW
3430 U8 * d = ustrp;
3431 Size_t i;
3432
3433 d = uvchr_to_utf8(d, first);
3434
3435 for (i = 0; i < remaining_count; i++) {
3436 d = uvchr_to_utf8(d, remaining_list[i]);
265ec345 3437 }
2a37f04d 3438
7b47c615
KW
3439 *d = '\0';
3440 *lenp = d - ustrp;
3441 }
cbe07460 3442
7b47c615 3443 return first;
a0ed51b3
LW
3444}
3445
b74fe592 3446Size_t
1b292063 3447Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to,
40d2776f 3448 const U32 ** remaining_folds_to)
b74fe592
KW
3449{
3450 /* Returns the count of the number of code points that fold to the input
3451 * 'cp' (besides itself).
3452 *
3453 * If the return is 0, there is nothing else that folds to it, and
3454 * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
3455 *
3456 * If the return is 1, '*first_folds_to' is set to the single code point,
3457 * and '*remaining_folds_to' is set to NULL.
3458 *
3459 * Otherwise, '*first_folds_to' is set to a code point, and
3460 * '*remaining_fold_to' is set to an array that contains the others. The
3461 * length of this array is the returned count minus 1.
3462 *
3463 * The reason for this convolution is to avoid having to deal with
3464 * allocating and freeing memory. The lists are already constructed, so
3465 * the return can point to them, but single code points aren't, so would
1b292063
KW
3466 * need to be constructed if we didn't employ something like this API
3467 *
3468 * The code points returned by this function are all legal Unicode, which
3469 * occupy at most 21 bits, and so a U32 is sufficient, and the lists are
3470 * constructed with this size (to save space and memory), and we return
3471 * pointers, so they must be this size */
b74fe592 3472
69352d88
KW
3473 /* 'index' is guaranteed to be non-negative, as this is an inversion map
3474 * that covers all possible inputs. See [perl #133365] */
b74fe592 3475 SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
40d2776f 3476 I32 base = _Perl_IVCF_invmap[index];
b74fe592
KW
3477
3478 PERL_ARGS_ASSERT__INVERSE_FOLDS;
3479
3480 if (base == 0) { /* No fold */
3481 *first_folds_to = 0;
3482 *remaining_folds_to = NULL;
3483 return 0;
3484 }
3485
3486#ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */
3487
3488 assert(base > 0);
3489
3490#else
3491
3492 if (UNLIKELY(base < 0)) { /* Folds to more than one character */
3493
3494 /* The data structure is set up so that the absolute value of 'base' is
3495 * an index into a table of pointers to arrays, with the array
3496 * corresponding to the index being the list of code points that fold
3497 * to 'cp', and the parallel array containing the length of the list
3498 * array */
3499 *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
99f30495
KW
3500 *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1;
3501 /* +1 excludes first_folds_to */
b74fe592
KW
3502 return IVCF_AUX_TABLE_lengths[-base];
3503 }
3504
3505#endif
3506
3507 /* Only the single code point. This works like 'fc(G) = G - A + a' */
40d2776f
KW
3508 *first_folds_to = (U32) (base + cp
3509 - invlist_array(PL_utf8_foldclosures)[index]);
b74fe592
KW
3510 *remaining_folds_to = NULL;
3511 return 1;
3512}
3513
051a06d4 3514STATIC UV
56576a04
KW
3515S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
3516 U8* const ustrp, STRLEN *lenp)
051a06d4 3517{
4a4088c4 3518 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
3519 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
3520 * result contains a character that crosses the 255/256 boundary, disallow
3521 * the change, and return the original code point. See L<perlfunc/lc> for
3522 * why;
051a06d4 3523 *
a1433954
KW
3524 * p points to the original string whose case was changed; assumed
3525 * by this routine to be well-formed
051a06d4 3526 * result the code point of the first character in the changed-case string
56576a04
KW
3527 * ustrp points to the changed-case string (<result> represents its
3528 * first char)
051a06d4
KW
3529 * lenp points to the length of <ustrp> */
3530
3531 UV original; /* To store the first code point of <p> */
3532
3533 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3534
a4f12ed7 3535 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
3536
3537 /* We know immediately if the first character in the string crosses the
5e45c680 3538 * boundary, so can skip testing */
051a06d4
KW
3539 if (result > 255) {
3540
1604cfb0
MS
3541 /* Look at every character in the result; if any cross the
3542 * boundary, the whole thing is disallowed */
3543 U8* s = ustrp + UTF8SKIP(ustrp);
3544 U8* e = ustrp + *lenp;
3545 while (s < e) {
3546 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
3547 goto bad_crossing;
3548 }
3549 s += UTF8SKIP(s);
3550 }
051a06d4 3551
613abc6d
KW
3552 /* Here, no characters crossed, result is ok as-is, but we warn. */
3553 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
1604cfb0 3554 return result;
051a06d4
KW
3555 }
3556
7b52d656 3557 bad_crossing:
051a06d4
KW
3558
3559 /* Failed, have to return the original */
4b88fb76 3560 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
3561
3562 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3563 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
56576a04
KW
3564 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
3565 " locale; resolved to \"\\x{%" UVXf "}\".",
357aadde 3566 OP_DESC(PL_op),
ab0b796c
KW
3567 original,
3568 original);
051a06d4
KW
3569 Copy(p, ustrp, *lenp, char);
3570 return original;
3571}
3572
b257a28c
KW
3573STATIC UV
3574S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
3575 U8 * ustrp, STRLEN *lenp)
3576{
3577 /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
3578 * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3579 * Otherwise, it returns the first code point of the Turkic foldcased
3580 * sequence, and the entire sequence will be stored in *ustrp. ustrp will
3581 * contain *lenp bytes
3582 *
3583 * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3584 * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3585 * DOTLESS I */
3586
3587 PERL_ARGS_ASSERT_TURKIC_FC;
3588 assert(e > p);
3589
3590 if (UNLIKELY(*p == 'I')) {
3591 *lenp = 2;
3592 ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
3593 ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
3594 return LATIN_SMALL_LETTER_DOTLESS_I;
3595 }
3596
3597 if (UNLIKELY(memBEGINs(p, e - p,
3598 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
3599 {
3600 *lenp = 1;
3601 *ustrp = 'i';
3602 return 'i';
3603 }
3604
3605 return 0;
3606}
3607
3608STATIC UV
3609S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
3610 U8 * ustrp, STRLEN *lenp)
3611{
3612 /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
3613 * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3614 * Otherwise, it returns the first code point of the Turkic lowercased
3615 * sequence, and the entire sequence will be stored in *ustrp. ustrp will
3616 * contain *lenp bytes */
3617
3618 PERL_ARGS_ASSERT_TURKIC_LC;
3619 assert(e > p0);
3620
3621 /* A 'I' requires context as to what to do */
3622 if (UNLIKELY(*p0 == 'I')) {
3623 const U8 * p = p0 + 1;
3624
3625 /* According to the Unicode SpecialCasing.txt file, a capital 'I'
3626 * modified by a dot above lowercases to 'i' even in turkic locales. */
3627 while (p < e) {
3628 UV cp;
3629
3630 if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
3631 ustrp[0] = 'i';
3632 *lenp = 1;
3633 return 'i';
3634 }
3635
3636 /* For the dot above to modify the 'I', it must be part of a
3637 * combining sequence immediately following the 'I', and no other
3638 * modifier with a ccc of 230 may intervene */
3639 cp = utf8_to_uvchr_buf(p, e, NULL);
3640 if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
3641 break;
3642 }
3643
3644 /* Here the combining sequence continues */
3645 p += UTF8SKIP(p);
3646 }
3647 }
3648
3649 /* In all other cases the lc is the same as the fold */
3650 return turkic_fc(p0, e, ustrp, lenp);
3651}
3652
3653STATIC UV
3654S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
3655 U8 * ustrp, STRLEN *lenp)
3656{
3657 /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
3658 * from p0..e-1 according to Turkic rules is the same as for non-Turkic.
3659 * Otherwise, it returns the first code point of the Turkic upper or
3660 * title-cased sequence, and the entire sequence will be stored in *ustrp.
3661 * ustrp will contain *lenp bytes
3662 *
3663 * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
a3815e44 3664 * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
b257a28c
KW
3665 * DOTLESS I */
3666
3667 PERL_ARGS_ASSERT_TURKIC_UC;
3668 assert(e > p);
3669
3670 if (*p == 'i') {
3671 *lenp = 2;
3672 ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3673 ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3674 return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
3675 }
3676
3677 if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
3678 *lenp = 1;
3679 *ustrp = 'I';
3680 return 'I';
3681 }
3682
3683 return 0;
3684}
3685
eaf412bf
KW
3686/* The process for changing the case is essentially the same for the four case
3687 * change types, except there are complications for folding. Otherwise the
3688 * difference is only which case to change to. To make sure that they all do
3689 * the same thing, the bodies of the functions are extracted out into the
3690 * following two macros. The functions are written with the same variable
3691 * names, and these are known and used inside these macros. It would be
3692 * better, of course, to have inline functions to do it, but since different
3693 * macros are called, depending on which case is being changed to, this is not
3694 * feasible in C (to khw's knowledge). Two macros are created so that the fold
3695 * function can start with the common start macro, then finish with its special
3696 * handling; while the other three cases can just use the common end macro.
3697 *
3698 * The algorithm is to use the proper (passed in) macro or function to change
3699 * the case for code points that are below 256. The macro is used if using
3700 * locale rules for the case change; the function if not. If the code point is
3701 * above 255, it is computed from the input UTF-8, and another macro is called
3702 * to do the conversion. If necessary, the output is converted to UTF-8. If
3703 * using a locale, we have to check that the change did not cross the 255/256
3704 * boundary, see check_locale_boundary_crossing() for further details.
3705 *
3706 * The macros are split with the correct case change for the below-256 case
3707 * stored into 'result', and in the middle of an else clause for the above-255
3708 * case. At that point in the 'else', 'result' is not the final result, but is
3709 * the input code point calculated from the UTF-8. The fold code needs to
3710 * realize all this and take it from there.
3711 *
b257a28c
KW
3712 * To deal with Turkic locales, the function specified by the parameter
3713 * 'turkic' is called when appropriate.
3714 *
eaf412bf
KW
3715 * If you read the two macros as sequential, it's easier to understand what's
3716 * going on. */
1eafd03a 3717#define CASE_CHANGE_BODY_START(locale_flags, libc_change_function, L1_func, \
b257a28c 3718 L1_func_extra_param, turkic) \
a239b1e2 3719 \
eaf412bf 3720 if (flags & (locale_flags)) { \
1629a27e 3721 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \
eaf412bf 3722 if (IN_UTF8_CTYPE_LOCALE) { \
b257a28c
KW
3723 if (UNLIKELY(PL_in_utf8_turkic_locale)) { \
3724 UV ret = turkic(p, e, ustrp, lenp); \
3725 if (ret) return ret; \
3726 } \
3727 \
3728 /* Otherwise, treat a UTF-8 locale as not being in locale at \
3729 * all */ \
eaf412bf
KW
3730 flags &= ~(locale_flags); \
3731 } \
eaf412bf
KW
3732 } \
3733 \
3734 if (UTF8_IS_INVARIANT(*p)) { \
3735 if (flags & (locale_flags)) { \
1eafd03a 3736 result = libc_change_function(*p); \
eaf412bf
KW
3737 } \
3738 else { \
3739 return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
3740 } \
3741 } \
a239b1e2 3742 else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \
1a751160 3743 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); \
eaf412bf 3744 if (flags & (locale_flags)) { \
1eafd03a 3745 result = libc_change_function(c); \
eaf412bf
KW
3746 } \
3747 else { \
1a751160 3748 return L1_func(c, ustrp, lenp, L1_func_extra_param); \
eaf412bf
KW
3749 } \
3750 } \
fa8ab374
KW
3751 else { /* malformed UTF-8 or ord above 255 */ \
3752 STRLEN len_result; \
fa8ab374
KW
3753 result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
3754 if (len_result == (STRLEN) -1) { \
059703b0 3755 _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ ); \
fa8ab374 3756 }
eaf412bf
KW
3757
3758#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
3759 result = change_macro(result, p, ustrp, lenp); \
3760 \
3761 if (flags & (locale_flags)) { \
3762 result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
3763 } \
3764 return result; \
3765 } \
3766 \
3767 /* Here, used locale rules. Convert back to UTF-8 */ \
3768 if (UTF8_IS_INVARIANT(result)) { \
3769 *ustrp = (U8) result; \
3770 *lenp = 1; \
3771 } \
3772 else { \
3773 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); \
3774 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \
3775 *lenp = 2; \
3776 } \
3777 \
3778 return result;
3779
051a06d4 3780/* Not currently externally documented, and subject to change:
a3815e44 3781 * <flags> is set iff the rules from the current underlying locale are to
31f05a37 3782 * be used. */
051a06d4 3783
2104c8d9 3784UV
607313a1
KW
3785Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
3786 const U8 *e,
3787 U8* ustrp,
3788 STRLEN *lenp,
059703b0 3789 bool flags)
a0ed51b3 3790{
051a06d4
KW
3791 UV result;
3792
3793 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 3794
eaf412bf
KW
3795 /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
3796 /* 2nd char of uc(U+DF) is 'S' */
1eafd03a 3797 CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 'S',
b257a28c 3798 turkic_uc);
eaf412bf 3799 CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
983ffd37 3800}
a0ed51b3 3801
051a06d4 3802/* Not currently externally documented, and subject to change:
31f05a37
KW
3803 * <flags> is set iff the rules from the current underlying locale are to be
3804 * used. Since titlecase is not defined in POSIX, for other than a
3805 * UTF-8 locale, uppercase is used instead for code points < 256.
445bf929 3806 */
051a06d4 3807
983ffd37 3808UV
607313a1
KW
3809Perl__to_utf8_title_flags(pTHX_ const U8 *p,
3810 const U8 *e,
3811 U8* ustrp,
3812 STRLEN *lenp,
059703b0 3813 bool flags)
983ffd37 3814{
051a06d4
KW
3815 UV result;
3816
3817 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 3818
eaf412bf 3819 /* 2nd char of ucfirst(U+DF) is 's' */
1eafd03a 3820 CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 's',
b257a28c 3821 turkic_uc);
eaf412bf 3822 CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
a0ed51b3
LW
3823}
3824
051a06d4 3825/* Not currently externally documented, and subject to change:
a3815e44 3826 * <flags> is set iff the rules from the current underlying locale are to
31f05a37
KW
3827 * be used.
3828 */
051a06d4 3829
2104c8d9 3830UV
607313a1
KW
3831Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
3832 const U8 *e,
3833 U8* ustrp,
3834 STRLEN *lenp,
059703b0 3835 bool flags)
a0ed51b3 3836{
051a06d4
KW
3837 UV result;
3838
051a06d4 3839 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 3840
1eafd03a 3841 CASE_CHANGE_BODY_START(~0, tolower, to_lower_latin1, 0 /* 0 is dummy */,
b257a28c 3842 turkic_lc);
eaf412bf 3843 CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
b4e400f9
JH
3844}
3845
051a06d4
KW
3846/* Not currently externally documented, and subject to change,
3847 * in <flags>
31f05a37
KW
3848 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3849 * locale are to be used.
051a06d4
KW
3850 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
3851 * otherwise simple folds
a0270393
KW
3852 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
3853 * prohibited
445bf929 3854 */
36bb2ab6 3855
b4e400f9 3856UV
607313a1
KW
3857Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
3858 const U8 *e,
3859 U8* ustrp,
3860 STRLEN *lenp,
059703b0 3861 U8 flags)
b4e400f9 3862{
051a06d4
KW
3863 UV result;
3864
36bb2ab6 3865 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 3866
a0270393
KW
3867 /* These are mutually exclusive */
3868 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
3869
50ba90ff
KW
3870 assert(p != ustrp); /* Otherwise overwrites */
3871
1eafd03a 3872 CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, tolower, _to_fold_latin1,
b257a28c
KW
3873 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)),
3874 turkic_fc);
31f05a37 3875
1604cfb0 3876 result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
a1dde8de 3877
1604cfb0 3878 if (flags & FOLD_FLAGS_LOCALE) {
1ca267a5 3879
76f2ffcd 3880# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
0766489e
KW
3881# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3882# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
76f2ffcd 3883
538e84ed
KW
3884 /* Special case these two characters, as what normally gets
3885 * returned under locale doesn't work */
db540106 3886 if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
1ca267a5 3887 {
ab0b796c
KW
3888 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3889 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3890 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3891 "resolved to \"\\x{17F}\\x{17F}\".");
1ca267a5
KW
3892 goto return_long_s;
3893 }
0766489e
KW
3894 else
3895#endif
db540106 3896 if (memBEGINs((char *) p, e - p, LONG_S_T))
9fc2026f 3897 {
ab0b796c
KW
3898 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3899 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3900 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3901 "resolved to \"\\x{FB06}\".");
9fc2026f
KW
3902 goto return_ligature_st;
3903 }
74894415
KW
3904
3905#if UNICODE_MAJOR_VERSION == 3 \
3906 && UNICODE_DOT_VERSION == 0 \
3907 && UNICODE_DOT_DOT_VERSION == 1
3908# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3909
3910 /* And special case this on this Unicode version only, for the same
3911 * reaons the other two are special cased. They would cross the
3912 * 255/256 boundary which is forbidden under /l, and so the code
3913 * wouldn't catch that they are equivalent (which they are only in
3914 * this release) */
db540106 3915 else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
74894415
KW
3916 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3917 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3918 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3919 "resolved to \"\\x{0131}\".");
3920 goto return_dotless_i;
3921 }
3922#endif
3923
1604cfb0
MS
3924 return check_locale_boundary_crossing(p, result, ustrp, lenp);
3925 }
3926 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3927 return result;
3928 }
3929 else {
3930 /* This is called when changing the case of a UTF-8-encoded
9fc2026f
KW
3931 * character above the ASCII range, and the result should not
3932 * contain an ASCII character. */
a0270393 3933
1604cfb0 3934 UV original; /* To store the first code point of <p> */
a0270393 3935
1604cfb0
MS
3936 /* Look at every character in the result; if any cross the
3937 * boundary, the whole thing is disallowed */
3938 U8* s = ustrp;
3939 U8* send = ustrp + *lenp;
3940 while (s < send) {
3941 if (isASCII(*s)) {
3942 /* Crossed, have to return the original */
3943 original = valid_utf8_to_uvchr(p, lenp);
1ca267a5 3944
9fc2026f 3945 /* But in these instances, there is an alternative we can
1ca267a5 3946 * return that is valid */
0766489e
KW
3947 if (original == LATIN_SMALL_LETTER_SHARP_S
3948#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3949 || original == LATIN_CAPITAL_LETTER_SHARP_S
3950#endif
3951 ) {
1ca267a5
KW
3952 goto return_long_s;
3953 }
9fc2026f
KW
3954 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3955 goto return_ligature_st;
3956 }
74894415
KW
3957#if UNICODE_MAJOR_VERSION == 3 \
3958 && UNICODE_DOT_VERSION == 0 \
3959 && UNICODE_DOT_DOT_VERSION == 1
3960
3961 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3962 goto return_dotless_i;
3963 }
3964#endif
1604cfb0
MS
3965 Copy(p, ustrp, *lenp, char);
3966 return original;
3967 }
3968 s += UTF8SKIP(s);
3969 }
3970
3971 /* Here, no characters crossed, result is ok as-is */
3972 return result;
3973 }
051a06d4
KW
3974 }
3975
4a4088c4 3976 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4 3977 if (UTF8_IS_INVARIANT(result)) {
1604cfb0
MS
3978 *ustrp = (U8) result;
3979 *lenp = 1;
051a06d4
KW
3980 }
3981 else {
1604cfb0
MS
3982 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3983 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
3984 *lenp = 2;
051a06d4
KW
3985 }
3986
051a06d4 3987 return result;
1ca267a5
KW
3988
3989 return_long_s:
3990 /* Certain folds to 'ss' are prohibited by the options, but they do allow
3991 * folds to a string of two of these characters. By returning this
3992 * instead, then, e.g.,
3993 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3994 * works. */
3995
c5b28134 3996 *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
68a23e40 3997 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
1ca267a5
KW
3998 ustrp, *lenp, U8);
3999 return LATIN_SMALL_LETTER_LONG_S;
9fc2026f
KW
4000
4001 return_ligature_st:
4002 /* Two folds to 'st' are prohibited by the options; instead we pick one and
4003 * have the other one fold to it */
4004
c5b28134 4005 *lenp = STRLENs(LATIN_SMALL_LIGATURE_ST_UTF8);
9fc2026f
KW
4006 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
4007 return LATIN_SMALL_LIGATURE_ST;
74894415
KW
4008
4009#if UNICODE_MAJOR_VERSION == 3 \
4010 && UNICODE_DOT_VERSION == 0 \
4011 && UNICODE_DOT_DOT_VERSION == 1
4012
4013 return_dotless_i:
c5b28134 4014 *lenp = STRLENs(LATIN_SMALL_LETTER_DOTLESS_I_UTF8);
74894415
KW
4015 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
4016 return LATIN_SMALL_LETTER_DOTLESS_I;
4017
4018#endif
4019
a0ed51b3
LW
4020}
4021
0876b9a0 4022bool
5aaab254 4023Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
0876b9a0
KW
4024{
4025 /* May change: warns if surrogates, non-character code points, or
56576a04
KW
4026 * non-Unicode code points are in 's' which has length 'len' bytes.
4027 * Returns TRUE if none found; FALSE otherwise. The only other validity
d22ec717
KW
4028 * check is to make sure that this won't exceed the string's length nor
4029 * overflow */
0876b9a0
KW
4030
4031 const U8* const e = s + len;
4032 bool ok = TRUE;
4033
4034 PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
4035
4036 while (s < e) {
1604cfb0
MS
4037 if (UTF8SKIP(s) > len) {
4038 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
4039 "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
4040 return FALSE;
4041 }
4042 if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
4043 if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
760c7c2f 4044 if ( ckWARN_d(WARN_NON_UNICODE)
e050c007
KW
4045 || UNLIKELY(0 < does_utf8_overflow(s, s + len,
4046 0 /* Don't consider overlongs */
4047 )))
4048 {
15ca5930 4049 /* A side effect of this function will be to warn */
2db24202 4050 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
7ee537e6
KW
4051 ok = FALSE;
4052 }
1604cfb0
MS
4053 }
4054 else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
4055 if (ckWARN_d(WARN_SURROGATE)) {
15ca5930
KW
4056 /* This has a different warning than the one the called
4057 * function would output, so can't just call it, unlike we
4058 * do for the non-chars and above-unicodes */
1604cfb0
MS
4059 UV uv = utf8_to_uvchr_buf(s, e, NULL);
4060 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
4061 "Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
56576a04 4062 uv);
1604cfb0
MS
4063 ok = FALSE;
4064 }
4065 }
4066 else if ( UNLIKELY(UTF8_IS_NONCHAR(s, e))
56576a04
KW
4067 && (ckWARN_d(WARN_NONCHAR)))
4068 {
15ca5930 4069 /* A side effect of this function will be to warn */
2db24202 4070 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
1604cfb0
MS
4071 ok = FALSE;
4072 }
4073 }
4074 s += UTF8SKIP(s);
0876b9a0
KW
4075 }
4076
4077 return ok;
4078}
4079
0f830e0b 4080/*
87cea99e 4081=for apidoc pv_uni_display
d2cc3551 4082
daf6caf1
KW
4083Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string
4084C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes
4085long (if longer, the rest is truncated and C<"..."> will be appended).
0a2ef054 4086
796b6530
KW
4087The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
4088C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
4089to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
4090(C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
4091C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
4092C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
0a2ef054 4093
daf6caf1
KW
4094Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a
4095backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set.
4096
a1433954 4097The pointer to the PV of the C<dsv> is returned.
d2cc3551 4098
119bc988
KW
4099See also L</sv_uni_display>.
4100
d145625f
KW
4101=for apidoc Amnh||UNI_DISPLAY_BACKSLASH
4102=for apidoc Amnh||UNI_DISPLAY_BACKSPACE
4103=for apidoc Amnh||UNI_DISPLAY_ISPRINT
4104=for apidoc Amnh||UNI_DISPLAY_QQ
4105=for apidoc Amnh||UNI_DISPLAY_REGEX
4106=cut
4107*/
e6b2e755 4108char *
56576a04
KW
4109Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim,
4110 UV flags)
e6b2e755
JH
4111{
4112 int truncated = 0;
e1ec3a88 4113 const char *s, *e;
e6b2e755 4114
7918f24d
NC
4115 PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
4116
9e2aa2e7 4117 SvPVCLEAR(dsv);
7fddd944 4118 SvUTF8_off(dsv);
e1ec3a88 4119 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1604cfb0
MS
4120 UV u;
4121 bool ok = 0;
4122
4123 if (pvlim && SvCUR(dsv) >= pvlim) {
4124 truncated++;
4125 break;
4126 }
4127 u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
4128 if (u < 256) {
4c08ae9e 4129 const U8 c = (U8) u;
1604cfb0 4130 if (flags & UNI_DISPLAY_BACKSLASH) {
daf6caf1
KW
4131 if ( isMNEMONIC_CNTRL(c)
4132 && ( c != '\b'
4133 || (flags & UNI_DISPLAY_BACKSPACE)))
4134 {
4135 const char * mnemonic = cntrl_to_mnemonic(c);
4136 sv_catpvn(dsv, mnemonic, strlen(mnemonic));
4137 ok = 1;
4138 }
4139 else if (c == '\\') {
4140 sv_catpvs(dsv, "\\\\");
4141 ok = 1;
4142 }
4143 }
1604cfb0
MS
4144 /* isPRINT() is the locale-blind version. */
4145 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
4146 const char string = c;
4147 sv_catpvn(dsv, &string, 1);
4148 ok = 1;
4149 }
4150 }
4151 if (!ok)
4152 Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u);
e6b2e755
JH
4153 }
4154 if (truncated)
1604cfb0 4155 sv_catpvs(dsv, "...");
48ef279e 4156
e6b2e755
JH
4157 return SvPVX(dsv);
4158}
2b9d42f0 4159
d2cc3551 4160/*
87cea99e 4161=for apidoc sv_uni_display
d2cc3551 4162
a1433954
KW
4163Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
4164the displayable version being at most C<pvlim> bytes long
d2cc3551 4165(if longer, the rest is truncated and "..." will be appended).
0a2ef054 4166
a1433954 4167The C<flags> argument is as in L</pv_uni_display>().
0a2ef054 4168
a1433954 4169The pointer to the PV of the C<dsv> is returned.
d2cc3551 4170
d4c19fe8
AL
4171=cut
4172*/
e6b2e755
JH
4173char *
4174Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4175{
8cdde9f8
NC
4176 const char * const ptr =
4177 isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
4178
7918f24d
NC
4179 PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4180
8cdde9f8 4181 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
1604cfb0 4182 SvCUR(ssv), pvlim, flags);
701a277b
JH
4183}
4184
d2cc3551 4185/*
e6226b18 4186=for apidoc foldEQ_utf8
d2cc3551 4187
56576a04
KW
4188Returns true if the leading portions of the strings C<s1> and C<s2> (either or
4189both of which may be in UTF-8) are the same case-insensitively; false
4190otherwise. How far into the strings to compare is determined by other input
4191parameters.
8b35872c 4192
a1433954 4193If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
56576a04
KW
4194otherwise it is assumed to be in native 8-bit encoding. Correspondingly for
4195C<u2> with respect to C<s2>.
4196
4197If the byte length C<l1> is non-zero, it says how far into C<s1> to check for
4198fold equality. In other words, C<s1>+C<l1> will be used as a goal to reach.
4199The scan will not be considered to be a match unless the goal is reached, and
4200scanning won't continue past that goal. Correspondingly for C<l2> with respect
4201to C<s2>.
4202
4203If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that
4204pointer is considered an end pointer to the position 1 byte past the maximum
4205point in C<s1> beyond which scanning will not continue under any circumstances.
03bb5c85 4206(This routine assumes that UTF-8 encoded input strings are not malformed;
56576a04
KW
4207malformed input can cause it to read past C<pe1>). This means that if both
4208C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match
4209will never be successful because it can never
d51c1b21 4210get as far as its goal (and in fact is asserted against). Correspondingly for
a1433954 4211C<pe2> with respect to C<s2>.
8b35872c 4212
a1433954
KW
4213At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4214C<l2> must be non-zero), and if both do, both have to be
8b35872c
KW
4215reached for a successful match. Also, if the fold of a character is multiple
4216characters, all of them must be matched (see tr21 reference below for
4217'folding').
4218
796b6530 4219Upon a successful match, if C<pe1> is non-C<NULL>,
a1433954
KW
4220it will be set to point to the beginning of the I<next> character of C<s1>
4221beyond what was matched. Correspondingly for C<pe2> and C<s2>.
d2cc3551
JH
4222
4223For case-insensitiveness, the "casefolding" of Unicode is used
4224instead of upper/lowercasing both the characters, see
ad37daf5 4225L<https://www.unicode.org/reports/tr21/> (Case Mappings).
d2cc3551 4226
d145625f
KW
4227=for apidoc Cmnh||FOLDEQ_UTF8_NOMIX_ASCII
4228=for apidoc Cmnh||FOLDEQ_LOCALE
4229=for apidoc Cmnh||FOLDEQ_S1_ALREADY_FOLDED
4230=for apidoc Cmnh||FOLDEQ_S1_FOLDS_SANE
4231=for apidoc Cmnh||FOLDEQ_S2_ALREADY_FOLDED
4232=for apidoc Cmnh||FOLDEQ_S2_FOLDS_SANE
4233
d2cc3551 4234=cut */
a33c29bc
KW
4235
4236/* A flags parameter has been added which may change, and hence isn't
4237 * externally documented. Currently it is:
4238 * 0 for as-documented above
4239 * FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
1604cfb0 4240 ASCII one, to not match
31f05a37
KW
4241 * FOLDEQ_LOCALE is set iff the rules from the current underlying
4242 * locale are to be used.
4243 * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
aa8ebe62
KW
4244 * routine. This allows that step to be skipped.
4245 * Currently, this requires s1 to be encoded as UTF-8
4246 * (u1 must be true), which is asserted for.
d635b710
KW
4247 * FOLDEQ_S1_FOLDS_SANE With either NOMIX_ASCII or LOCALE, no folds may
4248 * cross certain boundaries. Hence, the caller should
4249 * let this function do the folding instead of
4250 * pre-folding. This code contains an assertion to
4251 * that effect. However, if the caller knows what
4252 * it's doing, it can pass this flag to indicate that,
4253 * and the assertion is skipped.
b4408913
KW
4254 * FOLDEQ_S2_ALREADY_FOLDED Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
4255 * to s2, and s2 doesn't have to be UTF-8 encoded.
4256 * This introduces an asymmetry to save a few branches
4257 * in a loop. Currently, this is not a problem, as
4258 * never are both inputs pre-folded. Simply call this
4259 * function with the pre-folded one as the second
4260 * string.
d635b710 4261 * FOLDEQ_S2_FOLDS_SANE
a33c29bc 4262 */
f0919eff 4263
701a277b 4264I32
56576a04
KW
4265Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
4266 const char *s2, char **pe2, UV l2, bool u2,
4267 U32 flags)
332ddc25 4268{
eb578fdb
KW
4269 const U8 *p1 = (const U8*)s1; /* Point to current char */
4270 const U8 *p2 = (const U8*)s2;
4271 const U8 *g1 = NULL; /* goal for s1 */
4272 const U8 *g2 = NULL;
4273 const U8 *e1 = NULL; /* Don't scan s1 past this */
4274 U8 *f1 = NULL; /* Point to current folded */
4275 const U8 *e2 = NULL;
4276 U8 *f2 = NULL;
48ef279e 4277 STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
8b35872c
KW
4278 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4279 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
1d39b2cd 4280 U8 flags_for_folder = FOLD_FLAGS_FULL;
8b35872c 4281
eda9cac1 4282 PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
8b35872c 4283
68a23e40
KW
4284 assert( ! ( (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
4285 && (( (flags & FOLDEQ_S1_ALREADY_FOLDED)
4286 && !(flags & FOLDEQ_S1_FOLDS_SANE))
4287 || ( (flags & FOLDEQ_S2_ALREADY_FOLDED)
4288 && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
b08f1bd5
KW
4289 /* The algorithm is to trial the folds without regard to the flags on
4290 * the first line of the above assert(), and then see if the result
4291 * violates them. This means that the inputs can't be pre-folded to a
4292 * violating result, hence the assert. This could be changed, with the
4293 * addition of extra tests here for the already-folded case, which would
4294 * slow it down. That cost is more than any possible gain for when these
4295 * flags are specified, as the flags indicate /il or /iaa matching which
4296 * is less common than /iu, and I (khw) also believe that real-world /il
4297 * and /iaa matches are most likely to involve code points 0-255, and this
4298 * function only under rare conditions gets called for 0-255. */
18f762c3 4299
1d39b2cd
KW
4300 if (flags & FOLDEQ_LOCALE) {
4301 if (IN_UTF8_CTYPE_LOCALE) {
35b8412f
KW
4302 if (UNLIKELY(PL_in_utf8_turkic_locale)) {
4303 flags_for_folder |= FOLD_FLAGS_LOCALE;
4304 }
4305 else {
4306 flags &= ~FOLDEQ_LOCALE;
4307 }
1d39b2cd
KW
4308 }
4309 else {
4310 flags_for_folder |= FOLD_FLAGS_LOCALE;
4311 }
31f05a37 4312 }
cfd23983
KW
4313 if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
4314 flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
4315 }
31f05a37 4316
8b35872c 4317 if (pe1) {
48ef279e 4318 e1 = *(U8**)pe1;
8b35872c
KW
4319 }
4320
4321 if (l1) {
48ef279e 4322 g1 = (const U8*)s1 + l1;
8b35872c
KW
4323 }
4324
4325 if (pe2) {
48ef279e 4326 e2 = *(U8**)pe2;
8b35872c
KW
4327 }
4328
4329 if (l2) {
48ef279e 4330 g2 = (const U8*)s2 + l2;
8b35872c
KW
4331 }
4332
4333 /* Must have at least one goal */
4334 assert(g1 || g2);
4335
4336 if (g1) {
4337
48ef279e
KW
4338 /* Will never match if goal is out-of-bounds */
4339 assert(! e1 || e1 >= g1);
8b35872c 4340
48ef279e
KW
4341 /* Here, there isn't an end pointer, or it is beyond the goal. We
4342 * only go as far as the goal */
4343 e1 = g1;
8b35872c 4344 }
313b38e5 4345 else {
1604cfb0 4346 assert(e1); /* Must have an end for looking at s1 */
313b38e5 4347 }
8b35872c
KW
4348
4349 /* Same for goal for s2 */
4350 if (g2) {
48ef279e
KW
4351 assert(! e2 || e2 >= g2);
4352 e2 = g2;
8b35872c 4353 }
313b38e5 4354 else {
1604cfb0 4355 assert(e2);
313b38e5 4356 }
8b35872c 4357
18f762c3
KW
4358 /* If both operands are already folded, we could just do a memEQ on the
4359 * whole strings at once, but it would be better if the caller realized
4360 * this and didn't even call us */
4361
8b35872c
KW
4362 /* Look through both strings, a character at a time */
4363 while (p1 < e1 && p2 < e2) {
4364
d51c1b21 4365 /* If at the beginning of a new character in s1, get its fold to use
1604cfb0 4366 * and the length of the fold. */
48ef279e 4367 if (n1 == 0) {
1604cfb0
MS
4368 if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4369 f1 = (U8 *) p1;
aa8ebe62 4370 assert(u1);
1604cfb0
MS
4371 n1 = UTF8SKIP(f1);
4372 }
4373 else {
1d39b2cd
KW
4374 if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
4375
4376 /* We have to forbid mixing ASCII with non-ASCII if the
4377 * flags so indicate. And, we can short circuit having to
4378 * call the general functions for this common ASCII case,
4379 * all of whose non-locale folds are also ASCII, and hence
4380 * UTF-8 invariants, so the UTF8ness of the strings is not
4381 * relevant. */
4382 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4383 return 0;
4384 }
4385 n1 = 1;
4386 *foldbuf1 = toFOLD(*p1);
4387 }
4388 else if (u1) {
a1a5ec35 4389 _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
1d39b2cd 4390 }
4a4088c4 4391 else { /* Not UTF-8, get UTF-8 fold */
1d39b2cd
KW
4392 _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
4393 }
4394 f1 = foldbuf1;
4395 }
48ef279e 4396 }
8b35872c 4397
48ef279e 4398 if (n2 == 0) { /* Same for s2 */
1604cfb0 4399 if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
b4408913
KW
4400
4401 /* Point to the already-folded character. But for non-UTF-8
4402 * variants, convert to UTF-8 for the algorithm below */
1604cfb0 4403 if (UTF8_IS_INVARIANT(*p2)) {
b4408913
KW
4404 f2 = (U8 *) p2;
4405 n2 = 1;
4406 }
4407 else if (u2) {
4408 f2 = (U8 *) p2;
4409 n2 = UTF8SKIP(f2);
4410 }
4411 else {
4412 foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
4413 foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
4414 f2 = foldbuf2;
4415 n2 = 2;
4416 }
1604cfb0
MS
4417 }
4418 else {
1d39b2cd
KW
4419 if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
4420 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4421 return 0;
4422 }
4423 n2 = 1;
4424 *foldbuf2 = toFOLD(*p2);
4425 }
4426 else if (u2) {
a1a5ec35 4427 _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
1d39b2cd
KW
4428 }
4429 else {
4430 _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
4431 }
4432 f2 = foldbuf2;
1604cfb0 4433 }
48ef279e 4434 }
8b35872c 4435
1604cfb0
MS
4436 /* Here f1 and f2 point to the beginning of the strings to compare.
4437 * These strings are the folds of the next character from each input
4438 * string, stored in UTF-8. */
5e64d0fa 4439
48ef279e
KW
4440 /* While there is more to look for in both folds, see if they
4441 * continue to match */
4442 while (n1 && n2) {
4443 U8 fold_length = UTF8SKIP(f1);
4444 if (fold_length != UTF8SKIP(f2)
4445 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4446 function call for single
a6d5f321 4447 byte */
48ef279e
KW
4448 || memNE((char*)f1, (char*)f2, fold_length))
4449 {
e6226b18 4450 return 0; /* mismatch */
48ef279e
KW
4451 }
4452
4453 /* Here, they matched, advance past them */
4454 n1 -= fold_length;
4455 f1 += fold_length;
4456 n2 -= fold_length;
4457 f2 += fold_length;
4458 }
8b35872c 4459
48ef279e
KW
4460 /* When reach the end of any fold, advance the input past it */
4461 if (n1 == 0) {
4462 p1 += u1 ? UTF8SKIP(p1) : 1;
4463 }
4464 if (n2 == 0) {
4465 p2 += u2 ? UTF8SKIP(p2) : 1;
4466 }
8b35872c
KW
4467 } /* End of loop through both strings */
4468
4469 /* A match is defined by each scan that specified an explicit length
4470 * reaching its final goal, and the other not having matched a partial
4471 * character (which can happen when the fold of a character is more than one
4472 * character). */
4473 if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
e6226b18 4474 return 0;
8b35872c
KW
4475 }
4476
4477 /* Successful match. Set output pointers */
4478 if (pe1) {
48ef279e 4479 *pe1 = (char*)p1;
8b35872c
KW
4480 }
4481 if (pe2) {
48ef279e 4482 *pe2 = (char*)p2;
8b35872c 4483 }
e6226b18 4484 return 1;
e6b2e755 4485}
701a277b 4486
7723e007 4487/*
14d04a33 4488 * ex: set ts=8 sts=4 sw=4 et:
37442d52 4489 */