This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8.c: Avoid unnecessary work xlating utf8 to uv
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
afde5508 35#include "uni_keywords.h"
a0ed51b3 36
806547a7 37static const char malformed_text[] = "Malformed UTF-8 character";
27da23d5 38static const char unees[] =
806547a7 39 "Malformed UTF-8 character (unexpected end of string)";
fb7e7255
KW
40
41/* Be sure to synchronize this message with the similar one in regcomp.c */
760c7c2f 42static const char cp_above_legal_max[] =
d22ec717
KW
43 "Use of code point 0x%" UVXf " is not allowed; the"
44 " permissible max is 0x%" UVXf;
760c7c2f 45
d22ec717 46#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
901b21bf 47
48ef279e 48/*
ccfc67b7 49=head1 Unicode Support
7fefc6c1 50These are various utility functions for manipulating UTF8-encoded
72d33970 51strings. For the uninitiated, this is a method of representing arbitrary
61296642 52Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
53characters in the ASCII range are unmodified, and a zero byte never appears
54within non-zero characters.
166f8a29 55
eaf7a4d2
CS
56=cut
57*/
58
9cbfb8ab
KW
59void
60Perl__force_out_malformed_utf8_message(pTHX_
61 const U8 *const p, /* First byte in UTF-8 sequence */
62 const U8 * const e, /* Final byte in sequence (may include
63 multiple chars */
64 const U32 flags, /* Flags to pass to utf8n_to_uvchr(),
65 usually 0, or some DISALLOW flags */
66 const bool die_here) /* If TRUE, this function does not return */
67{
68 /* This core-only function is to be called when a malformed UTF-8 character
69 * is found, in order to output the detailed information about the
70 * malformation before dieing. The reason it exists is for the occasions
71 * when such a malformation is fatal, but warnings might be turned off, so
72 * that normally they would not be actually output. This ensures that they
73 * do get output. Because a sequence may be malformed in more than one
74 * way, multiple messages may be generated, so we can't make them fatal, as
75 * that would cause the first one to die.
76 *
77 * Instead we pretend -W was passed to perl, then die afterwards. The
78 * flexibility is here to return to the caller so they can finish up and
79 * die themselves */
80 U32 errors;
81
82 PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
83
84 ENTER;
c15a80f3 85 SAVEI8(PL_dowarn);
9cbfb8ab
KW
86 SAVESPTR(PL_curcop);
87
88 PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
89 if (PL_curcop) {
90 PL_curcop->cop_warnings = pWARN_ALL;
91 }
92
93 (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
94
95 LEAVE;
96
97 if (! errors) {
98 Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
99 " be called only when there are errors found");
100 }
101
102 if (die_here) {
103 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
104 }
105}
106
bb07812e
KW
107STATIC HV *
108S_new_msg_hv(pTHX_ const char * const message, /* The message text */
109 U32 categories, /* Packed warning categories */
110 U32 flag) /* Flag associated with this message */
111{
112 /* Creates, populates, and returns an HV* that describes an error message
113 * for the translators between UTF8 and code point */
114
115 SV* msg_sv = newSVpv(message, 0);
116 SV* category_sv = newSVuv(categories);
117 SV* flag_bit_sv = newSVuv(flag);
118
119 HV* msg_hv = newHV();
120
121 PERL_ARGS_ASSERT_NEW_MSG_HV;
122
2b672cf5
KW
123 (void) hv_stores(msg_hv, "text", msg_sv);
124 (void) hv_stores(msg_hv, "warn_categories", category_sv);
125 (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv);
bb07812e
KW
126
127 return msg_hv;
128}
129
eaf7a4d2 130/*
378516de 131=for apidoc uvoffuni_to_utf8_flags
eebe1485 132
a27992cc 133THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af
KW
134Instead, B<Almost all code should use L</uvchr_to_utf8> or
135L</uvchr_to_utf8_flags>>.
a27992cc 136
de69f3af
KW
137This function is like them, but the input is a strict Unicode
138(as opposed to native) code point. Only in very rare circumstances should code
139not be using the native code point.
949cf498 140
efa9cd84 141For details, see the description for L</uvchr_to_utf8_flags>.
949cf498 142
eebe1485
SC
143=cut
144*/
145
33f38593
KW
146U8 *
147Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
148{
149 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
150
151 return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
152}
153
c94c2f39
KW
154/* All these formats take a single UV code point argument */
155const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
156const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
157 " is not recommended for open interchange";
158const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
159 " may not be portable";
57ff5f59
KW
160const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \
161 " Unicode, requires a Perl extension," \
162 " and so is not portable";
c94c2f39 163
33f38593 164#define HANDLE_UNICODE_SURROGATE(uv, flags, msgs) \
8ee1cdcb
KW
165 STMT_START { \
166 if (flags & UNICODE_WARN_SURROGATE) { \
33f38593
KW
167 U32 category = packWARN(WARN_SURROGATE); \
168 const char * format = surrogate_cp_format; \
169 if (msgs) { \
170 *msgs = new_msg_hv(Perl_form(aTHX_ format, uv), \
171 category, \
172 UNICODE_GOT_SURROGATE); \
173 } \
174 else { \
175 Perl_ck_warner_d(aTHX_ category, format, uv); \
176 } \
8ee1cdcb
KW
177 } \
178 if (flags & UNICODE_DISALLOW_SURROGATE) { \
179 return NULL; \
180 } \
181 } STMT_END;
182
33f38593 183#define HANDLE_UNICODE_NONCHAR(uv, flags, msgs) \
8ee1cdcb
KW
184 STMT_START { \
185 if (flags & UNICODE_WARN_NONCHAR) { \
33f38593
KW
186 U32 category = packWARN(WARN_NONCHAR); \
187 const char * format = nonchar_cp_format; \
188 if (msgs) { \
189 *msgs = new_msg_hv(Perl_form(aTHX_ format, uv), \
190 category, \
191 UNICODE_GOT_NONCHAR); \
192 } \
193 else { \
194 Perl_ck_warner_d(aTHX_ category, format, uv); \
195 } \
8ee1cdcb
KW
196 } \
197 if (flags & UNICODE_DISALLOW_NONCHAR) { \
198 return NULL; \
199 } \
200 } STMT_END;
201
ba6ed43c
KW
202/* Use shorter names internally in this file */
203#define SHIFT UTF_ACCUMULATION_SHIFT
204#undef MARK
205#define MARK UTF_CONTINUATION_MARK
206#define MASK UTF_CONTINUATION_MASK
207
33f38593
KW
208/*
209=for apidoc uvchr_to_utf8_flags_msgs
210
211THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
212
213Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
214
215This function is for code that wants any warning and/or error messages to be
216returned to the caller rather than be displayed. All messages that would have
884a31ee 217been displayed if all lexical warnings are enabled will be returned.
33f38593
KW
218
219It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
220placed after all the others, C<msgs>. If this parameter is 0, this function
221behaves identically to C<L</uvchr_to_utf8_flags>>. Otherwise, C<msgs> should
222be a pointer to an C<HV *> variable, in which this function creates a new HV to
223contain any appropriate messages. The hash has three key-value pairs, as
224follows:
225
226=over 4
227
228=item C<text>
229
230The text of the message as a C<SVpv>.
231
232=item C<warn_categories>
233
234The warning category (or categories) packed into a C<SVuv>.
235
236=item C<flag>
237
238A single flag bit associated with this message, in a C<SVuv>.
239The bit corresponds to some bit in the C<*errors> return value,
240such as C<UNICODE_GOT_SURROGATE>.
241
242=back
243
244It's important to note that specifying this parameter as non-null will cause
245any warnings this function would otherwise generate to be suppressed, and
246instead be placed in C<*msgs>. The caller can check the lexical warnings state
247(or not) when choosing what to do with the returned messages.
248
249The caller, of course, is responsible for freeing any returned HV.
250
251=cut
252*/
253
254/* Undocumented; we don't want people using this. Instead they should use
255 * uvchr_to_utf8_flags_msgs() */
dfe13c55 256U8 *
33f38593 257Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs)
a0ed51b3 258{
33f38593
KW
259 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
260
261 if (msgs) {
262 *msgs = NULL;
263 }
7918f24d 264
2d1545e5 265 if (OFFUNI_IS_INVARIANT(uv)) {
4c8cd605 266 *d++ = LATIN1_TO_NATIVE(uv);
d9432125
KW
267 return d;
268 }
facc1dc2 269
3ea68d71 270 if (uv <= MAX_UTF8_TWO_BYTE) {
facc1dc2
KW
271 *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
272 *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
3ea68d71
KW
273 return d;
274 }
d9432125 275
ba6ed43c
KW
276 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
277 * below, the 16 is for start bytes E0-EF (which are all the possible ones
278 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
279 * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
280 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
281 * 0x800-0xFFFF on ASCII */
282 if (uv < (16 * (1U << (2 * SHIFT)))) {
283 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
284 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
285 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
286
287#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
288 aren't tested here */
289 /* The most likely code points in this range are below the surrogates.
290 * Do an extra test to quickly exclude those. */
291 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
292 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
293 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
294 {
33f38593 295 HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
8ee1cdcb
KW
296 }
297 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
33f38593 298 HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
760c7c2f 299 }
ba6ed43c
KW
300 }
301#endif
302 return d;
303 }
304
305 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
306 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
307 * happen starting with 4-byte characters on ASCII platforms. We unify the
308 * code for these with EBCDIC, even though some of them require 5-bytes on
309 * those, because khw believes the code saving is worth the very slight
310 * performance hit on these high EBCDIC code points. */
311
312 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
d22ec717
KW
313 if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
314 Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
a5bf80e0 315 }
33f38593
KW
316 if ( (flags & UNICODE_WARN_SUPER)
317 || ( (flags & UNICODE_WARN_PERL_EXTENDED)
0a8a1a5b 318 && UNICODE_IS_PERL_EXTENDED(uv)))
a5bf80e0 319 {
33f38593
KW
320 const char * format = super_cp_format;
321 U32 category = packWARN(WARN_NON_UNICODE);
322 U32 flag = UNICODE_GOT_SUPER;
323
324 /* Choose the more dire applicable warning */
325 if (UNICODE_IS_PERL_EXTENDED(uv)) {
326 format = perl_extended_cp_format;
327 if (flags & (UNICODE_WARN_PERL_EXTENDED
328 |UNICODE_DISALLOW_PERL_EXTENDED))
329 {
330 flag = UNICODE_GOT_PERL_EXTENDED;
331 }
332 }
a5bf80e0 333
33f38593
KW
334 if (msgs) {
335 *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),
336 category, flag);
337 }
338 else {
339 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), format, uv);
340 }
a5bf80e0 341 }
56576a04 342 if ( (flags & UNICODE_DISALLOW_SUPER)
0a8a1a5b
KW
343 || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
344 && UNICODE_IS_PERL_EXTENDED(uv)))
a5bf80e0
KW
345 {
346 return NULL;
347 }
348 }
ba6ed43c 349 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
33f38593 350 HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
507b9800 351 }
d9432125 352
ba6ed43c
KW
353 /* Test for and handle 4-byte result. In the test immediately below, the
354 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
355 * characters). The 3 is for 3 continuation bytes; these each contribute
356 * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
357 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
358 * 0x1_0000-0x1F_FFFF on ASCII */
359 if (uv < (8 * (1U << (3 * SHIFT)))) {
360 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
361 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
362 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
363 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
364
365#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
366 characters. The end-plane non-characters for EBCDIC were
367 handled just above */
368 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
33f38593 369 HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
d528804a 370 }
ba6ed43c 371 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
33f38593 372 HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
ba6ed43c
KW
373 }
374#endif
375
376 return d;
377 }
378
379 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
380 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
381 * format. The unrolled version above turns out to not save all that much
382 * time, and at these high code points (well above the legal Unicode range
383 * on ASCII platforms, and well above anything in common use in EBCDIC),
384 * khw believes that less code outweighs slight performance gains. */
385
d9432125 386 {
5aaebcb3 387 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
388 U8 *p = d+len-1;
389 while (p > d) {
957a9e81
KW
390 *p-- = I8_TO_NATIVE_UTF8((uv & MASK) | MARK);
391 uv >>= SHIFT;
1d72bdf6 392 }
4c8cd605 393 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
394 return d+len;
395 }
a0ed51b3 396}
a5bf80e0 397
646ca15d 398/*
07693fe6
KW
399=for apidoc uvchr_to_utf8
400
bcb1a2d4 401Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 402of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
403C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
404the byte after the end of the new character. In other words,
07693fe6
KW
405
406 d = uvchr_to_utf8(d, uv);
407
408is the recommended wide native character-aware way of saying
409
410 *(d++) = uv;
411
d22ec717
KW
412This function accepts any code point from 0..C<IV_MAX> as input.
413C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
760c7c2f
KW
414
415It is possible to forbid or warn on non-Unicode code points, or those that may
416be problematic by using L</uvchr_to_utf8_flags>.
de69f3af 417
07693fe6
KW
418=cut
419*/
420
de69f3af
KW
421/* This is also a macro */
422PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
423
07693fe6
KW
424U8 *
425Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
426{
de69f3af 427 return uvchr_to_utf8(d, uv);
07693fe6
KW
428}
429
de69f3af
KW
430/*
431=for apidoc uvchr_to_utf8_flags
432
433Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 434of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
435C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
436the byte after the end of the new character. In other words,
de69f3af
KW
437
438 d = uvchr_to_utf8_flags(d, uv, flags);
439
440or, in most cases,
441
442 d = uvchr_to_utf8_flags(d, uv, 0);
443
444This is the Unicode-aware way of saying
445
446 *(d++) = uv;
447
d22ec717
KW
448If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
449input. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
760c7c2f
KW
450
451Specifying C<flags> can further restrict what is allowed and not warned on, as
452follows:
de69f3af 453
796b6530 454If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
7ee537e6
KW
455the function will raise a warning, provided UTF8 warnings are enabled. If
456instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
457NULL. If both flags are set, the function will both warn and return NULL.
de69f3af 458
760c7c2f
KW
459Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
460affect how the function handles a Unicode non-character.
93e6dbd6 461
760c7c2f
KW
462And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
463affect the handling of code points that are above the Unicode maximum of
4640x10FFFF. Languages other than Perl may not be able to accept files that
465contain these.
93e6dbd6
KW
466
467The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
468the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
ecc1615f
KW
469three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
470allowed inputs to the strict UTF-8 traditionally defined by Unicode.
471Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
472C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
473above-Unicode and surrogate flags, but not the non-character ones, as
474defined in
475L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
476See L<perlunicode/Noncharacter code points>.
93e6dbd6 477
57ff5f59
KW
478Extremely high code points were never specified in any standard, and require an
479extension to UTF-8 to express, which Perl does. It is likely that programs
480written in something other than Perl would not be able to read files that
481contain these; nor would Perl understand files written by something that uses a
482different extension. For these reasons, there is a separate set of flags that
483can warn and/or disallow these extremely high code points, even if other
484above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
485and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
486L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
487treat all above-Unicode code points, including these, as malformations. (Note
488that the Unicode standard considers anything above 0x10FFFF to be illegal, but
489there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
490
491A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
492retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly,
493C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
7c4a22ed
KW
494C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because on EBCDIC
495platforms,these flags can apply to code points that actually do fit in 31 bits.
496The new names accurately describe the situation in all cases.
de69f3af 497
de69f3af
KW
498=cut
499*/
500
501/* This is also a macro */
502PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
503
07693fe6
KW
504U8 *
505Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
506{
de69f3af 507 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
508}
509
57ff5f59
KW
510#ifndef UV_IS_QUAD
511
e050c007
KW
512STATIC int
513S_is_utf8_cp_above_31_bits(const U8 * const s,
514 const U8 * const e,
515 const bool consider_overlongs)
83dc0f42
KW
516{
517 /* Returns TRUE if the first code point represented by the Perl-extended-
518 * UTF-8-encoded string starting at 's', and looking no further than 'e -
519 * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
520 *
521 * The function handles the case where the input bytes do not include all
522 * the ones necessary to represent a full character. That is, they may be
523 * the intial bytes of the representation of a code point, but possibly
524 * the final ones necessary for the complete representation may be beyond
525 * 'e - 1'.
526 *
e050c007
KW
527 * The function also can handle the case where the input is an overlong
528 * sequence. If 'consider_overlongs' is 0, the function assumes the
529 * input is not overlong, without checking, and will return based on that
530 * assumption. If this parameter is 1, the function will go to the trouble
531 * of figuring out if it actually evaluates to above or below 31 bits.
83dc0f42 532 *
e050c007 533 * The sequence is otherwise assumed to be well-formed, without checking.
83dc0f42
KW
534 */
535
e050c007
KW
536 const STRLEN len = e - s;
537 int is_overlong;
538
539 PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
540
541 assert(! UTF8_IS_INVARIANT(*s) && e > s);
542
83dc0f42
KW
543#ifdef EBCDIC
544
e050c007 545 PERL_UNUSED_ARG(consider_overlongs);
83dc0f42 546
e050c007
KW
547 /* On the EBCDIC code pages we handle, only the native start byte 0xFE can
548 * mean a 32-bit or larger code point (0xFF is an invariant). 0xFE can
549 * also be the start byte for a 31-bit code point; we need at least 2
550 * bytes, and maybe up through 8 bytes, to determine that. (It can also be
551 * the start byte for an overlong sequence, but for 30-bit or smaller code
552 * points, so we don't have to worry about overlongs on EBCDIC.) */
553 if (*s != 0xFE) {
554 return 0;
555 }
83dc0f42 556
e050c007
KW
557 if (len == 1) {
558 return -1;
559 }
83dc0f42 560
e050c007 561#else
83dc0f42 562
e050c007
KW
563 /* On ASCII, FE and FF are the only start bytes that can evaluate to
564 * needing more than 31 bits. */
565 if (LIKELY(*s < 0xFE)) {
566 return 0;
567 }
83dc0f42 568
e050c007
KW
569 /* What we have left are FE and FF. Both of these require more than 31
570 * bits unless they are for overlongs. */
571 if (! consider_overlongs) {
572 return 1;
573 }
83dc0f42 574
e050c007
KW
575 /* Here, we have FE or FF. If the input isn't overlong, it evaluates to
576 * above 31 bits. But we need more than one byte to discern this, so if
577 * passed just the start byte, it could be an overlong evaluating to
578 * smaller */
579 if (len == 1) {
580 return -1;
581 }
83dc0f42 582
e050c007
KW
583 /* Having excluded len==1, and knowing that FE and FF are both valid start
584 * bytes, we can call the function below to see if the sequence is
585 * overlong. (We don't need the full generality of the called function,
586 * but for these huge code points, speed shouldn't be a consideration, and
587 * the compiler does have enough information, since it's static to this
588 * file, to optimize to just the needed parts.) */
589 is_overlong = is_utf8_overlong_given_start_byte_ok(s, len);
83dc0f42 590
e050c007
KW
591 /* If it isn't overlong, more than 31 bits are required. */
592 if (is_overlong == 0) {
593 return 1;
594 }
83dc0f42 595
e050c007
KW
596 /* If it is indeterminate if it is overlong, return that */
597 if (is_overlong < 0) {
598 return -1;
599 }
600
601 /* Here is overlong. Such a sequence starting with FE is below 31 bits, as
602 * the max it can be is 2**31 - 1 */
603 if (*s == 0xFE) {
604 return 0;
83dc0f42
KW
605 }
606
e050c007
KW
607#endif
608
609 /* Here, ASCII and EBCDIC rejoin:
610 * On ASCII: We have an overlong sequence starting with FF
611 * On EBCDIC: We have a sequence starting with FE. */
612
613 { /* For C89, use a block so the declaration can be close to its use */
614
615#ifdef EBCDIC
616
5f995336
KW
617 /* U+7FFFFFFF (2 ** 31 - 1)
618 * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
619 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
620 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
621 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
622 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
623 * U+80000000 (2 ** 31):
624 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
625 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
626 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
627 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
e050c007
KW
628 *
629 * and since we know that *s = \xfe, any continuation sequcence
630 * following it that is gt the below is above 31 bits
631 [0] [1] [2] [3] [4] [5] [6] */
632 const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
633
634#else
635
636 /* FF overlong for U+7FFFFFFF (2 ** 31 - 1)
637 * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF
638 * FF overlong for U+80000000 (2 ** 31):
639 * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80
640 * and since we know that *s = \xff, any continuation sequcence
641 * following it that is gt the below is above 30 bits
642 [0] [1] [2] [3] [4] [5] [6] */
643 const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
5f995336 644
83dc0f42
KW
645
646#endif
e050c007
KW
647 const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
648 const STRLEN cmp_len = MIN(conts_len, len - 1);
649
650 /* Now compare the continuation bytes in s with the ones we have
651 * compiled in that are for the largest 30 bit code point. If we have
652 * enough bytes available to determine the answer, or the bytes we do
653 * have differ from them, we can compare the two to get a definitive
654 * answer (Note that in UTF-EBCDIC, the two lowest possible
655 * continuation bytes are \x41 and \x42.) */
656 if (cmp_len >= conts_len || memNE(s + 1,
657 conts_for_highest_30_bit,
658 cmp_len))
659 {
660 return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len));
661 }
83dc0f42 662
e050c007
KW
663 /* Here, all the bytes we have are the same as the highest 30-bit code
664 * point, but we are missing so many bytes that we can't make the
665 * determination */
666 return -1;
667 }
83dc0f42
KW
668}
669
57ff5f59
KW
670#endif
671
d6be65ae 672PERL_STATIC_INLINE int
12a4bed3
KW
673S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
674{
d6be65ae
KW
675 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
676 * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
677 * it isn't, and -1 if there isn't enough information to tell. This last
678 * return value can happen if the sequence is incomplete, missing some
679 * trailing bytes that would form a complete character. If there are
680 * enough bytes to make a definitive decision, this function does so.
681 * Usually 2 bytes sufficient.
682 *
683 * Overlongs can occur whenever the number of continuation bytes changes.
684 * That means whenever the number of leading 1 bits in a start byte
685 * increases from the next lower start byte. That happens for start bytes
686 * C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following illegal
687 * start bytes have already been excluded, so don't need to be tested here;
12a4bed3
KW
688 * ASCII platforms: C0, C1
689 * EBCDIC platforms C0, C1, C2, C3, C4, E0
d6be65ae 690 */
12a4bed3
KW
691
692 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
693 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
694
695 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
696 assert(len > 1 && UTF8_IS_START(*s));
697
698 /* Each platform has overlongs after the start bytes given above (expressed
699 * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
700 * the logic is the same, except the E0 overlong has already been excluded
701 * on EBCDIC platforms. The values below were found by manually
702 * inspecting the UTF-8 patterns. See the tables in utf8.h and
703 * utfebcdic.h. */
704
705# ifdef EBCDIC
706# define F0_ABOVE_OVERLONG 0xB0
707# define F8_ABOVE_OVERLONG 0xA8
708# define FC_ABOVE_OVERLONG 0xA4
709# define FE_ABOVE_OVERLONG 0xA2
710# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
711 /* I8(0xfe) is FF */
712# else
713
714 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
d6be65ae 715 return 1;
12a4bed3
KW
716 }
717
718# define F0_ABOVE_OVERLONG 0x90
719# define F8_ABOVE_OVERLONG 0x88
720# define FC_ABOVE_OVERLONG 0x84
721# define FE_ABOVE_OVERLONG 0x82
722# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
723# endif
724
725
726 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
727 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
728 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
729 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
730 {
d6be65ae 731 return 1;
12a4bed3
KW
732 }
733
b0b342d4 734 /* Check for the FF overlong */
d6be65ae 735 return isFF_OVERLONG(s, len);
b0b342d4
KW
736}
737
8d6204cc 738PERL_STATIC_INLINE int
b0b342d4
KW
739S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
740{
8d6204cc
KW
741 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
742 * 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if
743 * it isn't, and -1 if there isn't enough information to tell. This last
744 * return value can happen if the sequence is incomplete, missing some
745 * trailing bytes that would form a complete character. If there are
746 * enough bytes to make a definitive decision, this function does so. */
747
b0b342d4 748 PERL_ARGS_ASSERT_ISFF_OVERLONG;
12a4bed3 749
8d6204cc
KW
750 /* To be an FF overlong, all the available bytes must match */
751 if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
752 MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
753 {
754 return 0;
755 }
756
757 /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
758 * be there; what comes after them doesn't matter. See tables in utf8.h,
b0b342d4 759 * utfebcdic.h. */
8d6204cc
KW
760 if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
761 return 1;
762 }
12a4bed3 763
8d6204cc
KW
764 /* The missing bytes could cause the result to go one way or the other, so
765 * the result is indeterminate */
766 return -1;
12a4bed3
KW
767}
768
d22ec717 769#if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */
a77c906e
KW
770# ifdef EBCDIC /* Actually is I8 */
771# define HIGHEST_REPRESENTABLE_UTF8 \
d22ec717 772 "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
a77c906e
KW
773# else
774# define HIGHEST_REPRESENTABLE_UTF8 \
d22ec717 775 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
a77c906e
KW
776# endif
777#endif
778
c285bbc4 779PERL_STATIC_INLINE int
e050c007
KW
780S_does_utf8_overflow(const U8 * const s,
781 const U8 * e,
782 const bool consider_overlongs)
a77c906e 783{
c285bbc4 784 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
d22ec717
KW
785 * 'e' - 1 would overflow an IV on this platform; that is if it represents
786 * a code point larger than the highest representable code point. It
787 * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
788 * enough information to tell. This last return value can happen if the
789 * sequence is incomplete, missing some trailing bytes that would form a
790 * complete character. If there are enough bytes to make a definitive
791 * decision, this function does so.
c285bbc4 792 *
e050c007
KW
793 * If 'consider_overlongs' is TRUE, the function checks for the possibility
794 * that the sequence is an overlong that doesn't overflow. Otherwise, it
795 * assumes the sequence is not an overlong. This can give different
796 * results only on ASCII 32-bit platforms.
797 *
c285bbc4
KW
798 * (For ASCII platforms, we could use memcmp() because we don't have to
799 * convert each byte to I8, but it's very rare input indeed that would
800 * approach overflow, so the loop below will likely only get executed once.)
801 *
802 * 'e' - 1 must not be beyond a full character. */
a77c906e 803
a77c906e
KW
804
805 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
806 assert(s <= e && s + UTF8SKIP(s) >= e);
807
d22ec717
KW
808#if ! defined(UV_IS_QUAD)
809
810 return is_utf8_cp_above_31_bits(s, e, consider_overlongs);
811
812#else
813
814 PERL_UNUSED_ARG(consider_overlongs);
815
816 {
817 const STRLEN len = e - s;
818 const U8 *x;
819 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
820
821 for (x = s; x < e; x++, y++) {
822
823 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
824 continue;
825 }
826
827 /* If this byte is larger than the corresponding highest UTF-8
828 * byte, the sequence overflow; otherwise the byte is less than,
829 * and so the sequence doesn't overflow */
830 return NATIVE_UTF8_TO_I8(*x) > *y;
831
832 }
833
834 /* Got to the end and all bytes are the same. If the input is a whole
835 * character, it doesn't overflow. And if it is a partial character,
836 * there's not enough information to tell */
837 if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
838 return -1;
839 }
840
841 return 0;
842 }
843
844#endif
845
846}
847
848#if 0
849
850/* This is the portions of the above function that deal with UV_MAX instead of
851 * IV_MAX. They are left here in case we want to combine them so that internal
852 * uses can have larger code points. The only logic difference is that the
853 * 32-bit EBCDIC platform is treate like the 64-bit, and the 32-bit ASCII has
854 * different logic.
855 */
856
857/* Anything larger than this will overflow the word if it were converted into a UV */
858#if defined(UV_IS_QUAD)
859# ifdef EBCDIC /* Actually is I8 */
860# define HIGHEST_REPRESENTABLE_UTF8 \
861 "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
862# else
863# define HIGHEST_REPRESENTABLE_UTF8 \
864 "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
865# endif
866#else /* 32-bit */
867# ifdef EBCDIC
868# define HIGHEST_REPRESENTABLE_UTF8 \
869 "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
870# else
871# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
872# endif
873#endif
874
a77c906e
KW
875#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
876
877 /* On 32 bit ASCII machines, many overlongs that start with FF don't
878 * overflow */
e050c007 879 if (consider_overlongs && isFF_OVERLONG(s, len) > 0) {
c285bbc4
KW
880
881 /* To be such an overlong, the first bytes of 's' must match
882 * FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80". If we
883 * don't have any additional bytes available, the sequence, when
884 * completed might or might not fit in 32 bits. But if we have that
885 * next byte, we can tell for sure. If it is <= 0x83, then it does
886 * fit. */
887 if (len <= sizeof(FF_OVERLONG_PREFIX) - 1) {
888 return -1;
889 }
890
891 return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83;
a77c906e
KW
892 }
893
d22ec717
KW
894/* Starting with the #else, the rest of the function is identical except
895 * 1. we need to move the 'len' declaration to be global to the function
896 * 2. the endif move to just after the UNUSED_ARG.
897 * An empty endif is given just below to satisfy the preprocessor
898 */
a77c906e
KW
899#endif
900
d22ec717 901#endif
a77c906e 902
12a4bed3
KW
903#undef F0_ABOVE_OVERLONG
904#undef F8_ABOVE_OVERLONG
905#undef FC_ABOVE_OVERLONG
906#undef FE_ABOVE_OVERLONG
907#undef FF_OVERLONG_PREFIX
908
35f8c9bd 909STRLEN
edc2c47a 910Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
35f8c9bd 911{
2b479609 912 STRLEN len;
12a4bed3 913 const U8 *x;
35f8c9bd 914
2b479609
KW
915 /* A helper function that should not be called directly.
916 *
917 * This function returns non-zero if the string beginning at 's' and
918 * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
919 * code point; otherwise it returns 0. The examination stops after the
920 * first code point in 's' is validated, not looking at the rest of the
921 * input. If 'e' is such that there are not enough bytes to represent a
922 * complete code point, this function will return non-zero anyway, if the
923 * bytes it does have are well-formed UTF-8 as far as they go, and aren't
924 * excluded by 'flags'.
925 *
926 * A non-zero return gives the number of bytes required to represent the
927 * code point. Be aware that if the input is for a partial character, the
928 * return will be larger than 'e - s'.
929 *
930 * This function assumes that the code point represented is UTF-8 variant.
56576a04
KW
931 * The caller should have excluded the possibility of it being invariant
932 * before calling this function.
2b479609
KW
933 *
934 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
935 * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
936 * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
937 * disallowed by the flags. If the input is only for a partial character,
938 * the function will return non-zero if there is any sequence of
939 * well-formed UTF-8 that, when appended to the input sequence, could
940 * result in an allowed code point; otherwise it returns 0. Non characters
941 * cannot be determined based on partial character input. But many of the
942 * other excluded types can be determined with just the first one or two
943 * bytes.
944 *
945 */
946
947 PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
948
949 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 950 |UTF8_DISALLOW_PERL_EXTENDED)));
2b479609 951 assert(! UTF8_IS_INVARIANT(*s));
35f8c9bd 952
2b479609 953 /* A variant char must begin with a start byte */
35f8c9bd
KW
954 if (UNLIKELY(! UTF8_IS_START(*s))) {
955 return 0;
956 }
957
edc2c47a
KW
958 /* Examine a maximum of a single whole code point */
959 if (e - s > UTF8SKIP(s)) {
960 e = s + UTF8SKIP(s);
961 }
962
2b479609
KW
963 len = e - s;
964
965 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
966 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
35f8c9bd 967
56576a04
KW
968 /* Here, we are disallowing some set of largish code points, and the
969 * first byte indicates the sequence is for a code point that could be
970 * in the excluded set. We generally don't have to look beyond this or
971 * the second byte to see if the sequence is actually for one of the
972 * excluded classes. The code below is derived from this table:
973 *
2b479609
KW
974 * UTF-8 UTF-EBCDIC I8
975 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
976 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
977 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
978 *
56576a04
KW
979 * Keep in mind that legal continuation bytes range between \x80..\xBF
980 * for UTF-8, and \xA0..\xBF for I8. Anything above those aren't
981 * continuation bytes. Hence, we don't have to test the upper edge
982 * because if any of those is encountered, the sequence is malformed,
983 * and would fail elsewhere in this function.
984 *
985 * The code here likewise assumes that there aren't other
986 * malformations; again the function should fail elsewhere because of
987 * these. For example, an overlong beginning with FC doesn't actually
988 * have to be a super; it could actually represent a small code point,
989 * even U+0000. But, since overlongs (and other malformations) are
990 * illegal, the function should return FALSE in either case.
2b479609
KW
991 */
992
993#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
994# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
19794540 995# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
2b479609 996
19794540
KW
997# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
998 /* B6 and B7 */ \
999 && ((s1) & 0xFE ) == 0xB6)
57ff5f59 1000# define isUTF8_PERL_EXTENDED(s) (*s == I8_TO_NATIVE_UTF8(0xFF))
2b479609
KW
1001#else
1002# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
19794540
KW
1003# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
1004# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
57ff5f59 1005# define isUTF8_PERL_EXTENDED(s) (*s >= 0xFE)
2b479609
KW
1006#endif
1007
1008 if ( (flags & UTF8_DISALLOW_SUPER)
ddb65933
KW
1009 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
1010 {
2b479609
KW
1011 return 0; /* Above Unicode */
1012 }
1013
d044b7a7 1014 if ( (flags & UTF8_DISALLOW_PERL_EXTENDED)
57ff5f59 1015 && UNLIKELY(isUTF8_PERL_EXTENDED(s)))
2b479609 1016 {
57ff5f59 1017 return 0;
2b479609
KW
1018 }
1019
1020 if (len > 1) {
1021 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
1022
1023 if ( (flags & UTF8_DISALLOW_SUPER)
19794540 1024 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
2b479609
KW
1025 {
1026 return 0; /* Above Unicode */
1027 }
1028
1029 if ( (flags & UTF8_DISALLOW_SURROGATE)
19794540 1030 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
2b479609
KW
1031 {
1032 return 0; /* Surrogate */
1033 }
1034
1035 if ( (flags & UTF8_DISALLOW_NONCHAR)
1036 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
1037 {
1038 return 0; /* Noncharacter code point */
1039 }
1040 }
1041 }
1042
1043 /* Make sure that all that follows are continuation bytes */
35f8c9bd
KW
1044 for (x = s + 1; x < e; x++) {
1045 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
1046 return 0;
1047 }
1048 }
1049
af13dd8a 1050 /* Here is syntactically valid. Next, make sure this isn't the start of an
12a4bed3 1051 * overlong. */
d6be65ae 1052 if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
12a4bed3 1053 return 0;
af13dd8a
KW
1054 }
1055
12a4bed3
KW
1056 /* And finally, that the code point represented fits in a word on this
1057 * platform */
e050c007
KW
1058 if (0 < does_utf8_overflow(s, e,
1059 0 /* Don't consider overlongs */
1060 ))
1061 {
12a4bed3 1062 return 0;
35f8c9bd
KW
1063 }
1064
2b479609 1065 return UTF8SKIP(s);
35f8c9bd
KW
1066}
1067
7e2f38b2 1068char *
63ab03b3 1069Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
7cf8d05d
KW
1070{
1071 /* Returns a mortalized C string that is a displayable copy of the 'len'
63ab03b3 1072 * bytes starting at 'start'. 'format' gives how to display each byte.
7e2f38b2
KW
1073 * Currently, there are only two formats, so it is currently a bool:
1074 * 0 \xab
1075 * 1 ab (that is a space between two hex digit bytes)
1076 */
7cf8d05d
KW
1077
1078 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
1079 trailing NUL */
63ab03b3
KW
1080 const U8 * s = start;
1081 const U8 * const e = start + len;
7cf8d05d
KW
1082 char * output;
1083 char * d;
1084
1085 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
1086
1087 Newx(output, output_len, char);
1088 SAVEFREEPV(output);
1089
1090 d = output;
63ab03b3 1091 for (s = start; s < e; s++) {
7cf8d05d
KW
1092 const unsigned high_nibble = (*s & 0xF0) >> 4;
1093 const unsigned low_nibble = (*s & 0x0F);
1094
7e2f38b2 1095 if (format) {
63ab03b3
KW
1096 if (s > start) {
1097 *d++ = ' ';
1098 }
7e2f38b2
KW
1099 }
1100 else {
1101 *d++ = '\\';
1102 *d++ = 'x';
1103 }
7cf8d05d
KW
1104
1105 if (high_nibble < 10) {
1106 *d++ = high_nibble + '0';
1107 }
1108 else {
1109 *d++ = high_nibble - 10 + 'a';
1110 }
1111
1112 if (low_nibble < 10) {
1113 *d++ = low_nibble + '0';
1114 }
1115 else {
1116 *d++ = low_nibble - 10 + 'a';
1117 }
1118 }
1119
1120 *d = '\0';
1121 return output;
1122}
1123
806547a7 1124PERL_STATIC_INLINE char *
7cf8d05d
KW
1125S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
1126
421da25c 1127 /* Max number of bytes to print */
3cc6a05e 1128 STRLEN print_len,
7cf8d05d
KW
1129
1130 /* Which one is the non-continuation */
1131 const STRLEN non_cont_byte_pos,
1132
1133 /* How many bytes should there be? */
1134 const STRLEN expect_len)
806547a7
KW
1135{
1136 /* Return the malformation warning text for an unexpected continuation
1137 * byte. */
1138
7cf8d05d 1139 const char * const where = (non_cont_byte_pos == 1)
806547a7 1140 ? "immediately"
7cf8d05d
KW
1141 : Perl_form(aTHX_ "%d bytes",
1142 (int) non_cont_byte_pos);
421da25c
KW
1143 const U8 * x = s + non_cont_byte_pos;
1144 const U8 * e = s + print_len;
806547a7
KW
1145
1146 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
1147
7cf8d05d
KW
1148 /* We don't need to pass this parameter, but since it has already been
1149 * calculated, it's likely faster to pass it; verify under DEBUGGING */
1150 assert(expect_len == UTF8SKIP(s));
1151
421da25c
KW
1152 /* As a defensive coding measure, don't output anything past a NUL. Such
1153 * bytes shouldn't be in the middle of a malformation, and could mark the
1154 * end of the allocated string, and what comes after is undefined */
1155 for (; x < e; x++) {
1156 if (*x == '\0') {
1157 x++; /* Output this particular NUL */
1158 break;
1159 }
1160 }
1161
7cf8d05d
KW
1162 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
1163 " %s after start byte 0x%02x; need %d bytes, got %d)",
1164 malformed_text,
421da25c 1165 _byte_dump_string(s, x - s, 0),
7cf8d05d
KW
1166 *(s + non_cont_byte_pos),
1167 where,
1168 *s,
1169 (int) expect_len,
1170 (int) non_cont_byte_pos);
806547a7
KW
1171}
1172
35f8c9bd
KW
1173/*
1174
de69f3af 1175=for apidoc utf8n_to_uvchr
378516de
KW
1176
1177THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af 1178Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
67e989fb 1179
9041c2e3 1180Bottom level UTF-8 decode routine.
de69f3af 1181Returns the native code point value of the first character in the string C<s>,
746afd53
KW
1182which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
1183C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
1184the length, in bytes, of that character.
949cf498
KW
1185
1186The value of C<flags> determines the behavior when C<s> does not point to a
2b5e7bc2
KW
1187well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
1188causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
1189is the next possible position in C<s> that could begin a non-malformed
1190character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
1191is raised. Some UTF-8 input sequences may contain multiple malformations.
1192This function tries to find every possible one in each call, so multiple
56576a04 1193warnings can be raised for the same sequence.
949cf498
KW
1194
1195Various ALLOW flags can be set in C<flags> to allow (and not warn on)
1196individual types of malformations, such as the sequence being overlong (that
1197is, when there is a shorter sequence that can express the same code point;
1198overlong sequences are expressly forbidden in the UTF-8 standard due to
1199potential security issues). Another malformation example is the first byte of
1200a character not being a legal first byte. See F<utf8.h> for the list of such
94953955
KW
1201flags. Even if allowed, this function generally returns the Unicode
1202REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
1203F<utf8.h> to override this behavior for the overlong malformations, but don't
1204do that except for very specialized purposes.
949cf498 1205
796b6530 1206The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
1207flags) malformation is found. If this flag is set, the routine assumes that
1208the caller will raise a warning, and this function will silently just set
d088425d
KW
1209C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
1210
75200dff 1211Note that this API requires disambiguation between successful decoding a C<NUL>
796b6530 1212character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
111fa700
KW
1213in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
1214be set to 1. To disambiguate, upon a zero return, see if the first byte of
1215C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
f9380377 1216error. Or you can use C<L</utf8n_to_uvchr_error>>.
949cf498
KW
1217
1218Certain code points are considered problematic. These are Unicode surrogates,
746afd53 1219Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 1220By default these are considered regular code points, but certain situations
ecc1615f
KW
1221warrant special handling for them, which can be specified using the C<flags>
1222parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
1223three classes are treated as malformations and handled as such. The flags
1224C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
1225C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
1226disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
1227restricts the allowed inputs to the strict UTF-8 traditionally defined by
1228Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
1229definition given by
1230L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1231The difference between traditional strictness and C9 strictness is that the
1232latter does not forbid non-character code points. (They are still discouraged,
1233however.) For more discussion see L<perlunicode/Noncharacter code points>.
1234
1235The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
1236C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
796b6530
KW
1237C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
1238raised for their respective categories, but otherwise the code points are
1239considered valid (not malformations). To get a category to both be treated as
1240a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 1241(But note that warnings are not raised if lexically disabled nor if
796b6530 1242C<UTF8_CHECK_ONLY> is also specified.)
949cf498 1243
57ff5f59
KW
1244Extremely high code points were never specified in any standard, and require an
1245extension to UTF-8 to express, which Perl does. It is likely that programs
1246written in something other than Perl would not be able to read files that
1247contain these; nor would Perl understand files written by something that uses a
1248different extension. For these reasons, there is a separate set of flags that
1249can warn and/or disallow these extremely high code points, even if other
1250above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
1251C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
1252L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
1253above-Unicode code points, including these, as malformations.
1254(Note that the Unicode standard considers anything above 0x10FFFF to be
1255illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
1256(2**31 -1))
1257
1258A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
1259retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly,
1260C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
1261C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags
1262can apply to code points that actually do fit in 31 bits. This happens on
1263EBCDIC platforms, and sometimes when the L<overlong
1264malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
1265describe the situation in all cases.
1266
ab8e6d41 1267
949cf498
KW
1268All other code points corresponding to Unicode characters, including private
1269use and those yet to be assigned, are never considered malformed and never
1270warn.
67e989fb 1271
37607a96 1272=cut
f9380377
KW
1273
1274Also implemented as a macro in utf8.h
1275*/
1276
1277UV
1278Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
1279 STRLEN curlen,
1280 STRLEN *retlen,
1281 const U32 flags)
1282{
1283 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
1284
1285 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
1286}
1287
1288/*
1289
1290=for apidoc utf8n_to_uvchr_error
1291
1292THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1293Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
1294
1295This function is for code that needs to know what the precise malformation(s)
37657a5b
KW
1296are when an error is found. If you also need to know the generated warning
1297messages, use L</utf8n_to_uvchr_msgs>() instead.
f9380377
KW
1298
1299It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1300all the others, C<errors>. If this parameter is 0, this function behaves
1301identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
1302to a C<U32> variable, which this function sets to indicate any errors found.
1303Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
1304C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
1305of these bits will be set if a malformation is found, even if the input
7a65503b 1306C<flags> parameter indicates that the given malformation is allowed; those
f9380377
KW
1307exceptions are noted:
1308
1309=over 4
1310
57ff5f59 1311=item C<UTF8_GOT_PERL_EXTENDED>
f9380377 1312
57ff5f59
KW
1313The input sequence is not standard UTF-8, but a Perl extension. This bit is
1314set only if the input C<flags> parameter contains either the
1315C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
1316
1317Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
1318and so some extension must be used to express them. Perl uses a natural
1319extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
1320extension to represent even higher ones, so that any code point that fits in a
132164-bit word can be represented. Text using these extensions is not likely to
1322be portable to non-Perl code. We lump both of these extensions together and
1323refer to them as Perl extended UTF-8. There exist other extensions that people
1324have invented, incompatible with Perl's.
1325
1326On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
1327extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
1328than on ASCII. Prior to that, code points 2**31 and higher were simply
1329unrepresentable, and a different, incompatible method was used to represent
1330code points between 2**30 and 2**31 - 1.
1331
1332On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
1333Perl extended UTF-8 is used.
1334
1335In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
1336may use for backward compatibility. That name is misleading, as this flag may
1337be set when the code point actually does fit in 31 bits. This happens on
1338EBCDIC platforms, and sometimes when the L<overlong
1339malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately
1340describes the situation in all cases.
f9380377
KW
1341
1342=item C<UTF8_GOT_CONTINUATION>
1343
1344The input sequence was malformed in that the first byte was a a UTF-8
1345continuation byte.
1346
1347=item C<UTF8_GOT_EMPTY>
1348
1349The input C<curlen> parameter was 0.
1350
1351=item C<UTF8_GOT_LONG>
1352
1353The input sequence was malformed in that there is some other sequence that
1354evaluates to the same code point, but that sequence is shorter than this one.
1355
fecaf136
KW
1356Until Unicode 3.1, it was legal for programs to accept this malformation, but
1357it was discovered that this created security issues.
1358
f9380377
KW
1359=item C<UTF8_GOT_NONCHAR>
1360
1361The code point represented by the input UTF-8 sequence is for a Unicode
1362non-character code point.
1363This bit is set only if the input C<flags> parameter contains either the
1364C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1365
1366=item C<UTF8_GOT_NON_CONTINUATION>
1367
1368The input sequence was malformed in that a non-continuation type byte was found
1369in a position where only a continuation type one should be.
1370
1371=item C<UTF8_GOT_OVERFLOW>
1372
1373The input sequence was malformed in that it is for a code point that is not
d22ec717 1374representable in the number of bits available in an IV on the current platform.
f9380377
KW
1375
1376=item C<UTF8_GOT_SHORT>
1377
1378The input sequence was malformed in that C<curlen> is smaller than required for
1379a complete sequence. In other words, the input is for a partial character
1380sequence.
1381
1382=item C<UTF8_GOT_SUPER>
1383
1384The input sequence was malformed in that it is for a non-Unicode code point;
1385that is, one above the legal Unicode maximum.
1386This bit is set only if the input C<flags> parameter contains either the
1387C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1388
1389=item C<UTF8_GOT_SURROGATE>
1390
1391The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
1392code point.
1393This bit is set only if the input C<flags> parameter contains either the
1394C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1395
1396=back
1397
133551d8
KW
1398To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1399flag to suppress any warnings, and then examine the C<*errors> return.
1400
f9380377 1401=cut
37657a5b
KW
1402
1403Also implemented as a macro in utf8.h
37607a96 1404*/
67e989fb 1405
a0ed51b3 1406UV
f9380377 1407Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
37657a5b
KW
1408 STRLEN curlen,
1409 STRLEN *retlen,
1410 const U32 flags,
1411 U32 * errors)
1412{
1413 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1414
1415 return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
1416}
1417
1418/*
1419
1420=for apidoc utf8n_to_uvchr_msgs
1421
1422THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1423Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
1424
1425This function is for code that needs to know what the precise malformation(s)
1426are when an error is found, and wants the corresponding warning and/or error
1427messages to be returned to the caller rather than be displayed. All messages
1428that would have been displayed if all lexcial warnings are enabled will be
1429returned.
1430
1431It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
1432placed after all the others, C<msgs>. If this parameter is 0, this function
1433behaves identically to C<L</utf8n_to_uvchr_error>>. Otherwise, C<msgs> should
1434be a pointer to an C<AV *> variable, in which this function creates a new AV to
1435contain any appropriate messages. The elements of the array are ordered so
1436that the first message that would have been displayed is in the 0th element,
1437and so on. Each element is a hash with three key-value pairs, as follows:
1438
1439=over 4
1440
1441=item C<text>
1442
1443The text of the message as a C<SVpv>.
1444
1445=item C<warn_categories>
1446
1447The warning category (or categories) packed into a C<SVuv>.
1448
1449=item C<flag>
1450
1451A single flag bit associated with this message, in a C<SVuv>.
1452The bit corresponds to some bit in the C<*errors> return value,
1453such as C<UTF8_GOT_LONG>.
1454
1455=back
1456
1457It's important to note that specifying this parameter as non-null will cause
1458any warnings this function would otherwise generate to be suppressed, and
1459instead be placed in C<*msgs>. The caller can check the lexical warnings state
1460(or not) when choosing what to do with the returned messages.
1461
1462If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
1463no AV is created.
1464
1465The caller, of course, is responsible for freeing any returned AV.
1466
1467=cut
1468*/
1469
1470UV
1471Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
1472 STRLEN curlen,
1473 STRLEN *retlen,
1474 const U32 flags,
1475 U32 * errors,
1476 AV ** msgs)
a0ed51b3 1477{
d4c19fe8 1478 const U8 * const s0 = s;
2b9519f0 1479 const U8 * send = s0 + curlen;
5af9f822
KW
1480 U32 possible_problems; /* A bit is set here for each potential problem
1481 found as we go along */
1482 UV uv;
1483 STRLEN expectlen; /* How long should this sequence be? */
1484 STRLEN avail_len; /* When input is too short, gives what that is */
1485 U32 discard_errors; /* Used to save branches when 'errors' is NULL; this
1486 gets set and discarded */
a0dbb045 1487
2b5e7bc2
KW
1488 /* The below are used only if there is both an overlong malformation and a
1489 * too short one. Otherwise the first two are set to 's0' and 'send', and
1490 * the third not used at all */
5af9f822 1491 U8 * adjusted_s0;
e9f2c446
KW
1492 U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1493 routine; see [perl #130921] */
5af9f822 1494 UV uv_so_far;
2b9519f0
KW
1495 UV state = 0;
1496
37657a5b 1497 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
f9380377 1498
5af9f822
KW
1499 /* Measurements show that this dfa is somewhat faster than the regular code
1500 * below, so use it first, dropping down for the non-normal cases. */
1501
1502#define PERL_UTF8_DECODE_REJECT 1
1503
1504 while (s < send && LIKELY(state != PERL_UTF8_DECODE_REJECT)) {
1505 UV type = strict_utf8_dfa_tab[*s];
1506
1507 uv = (state == 0)
1508 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1509 : UTF8_ACCUMULATE(uv, *s);
1510 state = strict_utf8_dfa_tab[256 + state + type];
1511
1512 if (state == 0) {
1513 if (retlen) {
1514 *retlen = s - s0 + 1;
1515 }
1516 if (errors) {
1517 *errors = 0;
1518 }
1519 if (msgs) {
1520 *msgs = NULL;
1521 }
1522
1523 return uv;
1524 }
1525
1526 s++;
1527 }
1528
1529 /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
1530 * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
1531 * syllables that the dfa doesn't properly handle. Quickly dispose of the
1532 * final case. */
1533
1534#ifndef EBCDIC
1535
1536 /* Each of the affected Hanguls starts with \xED */
1537
1538 if (is_HANGUL_ED_utf8_safe(s0, send)) {
1539 if (retlen) {
1540 *retlen = 3;
1541 }
1542 if (errors) {
1543 *errors = 0;
1544 }
1545 if (msgs) {
1546 *msgs = NULL;
1547 }
1548
1549 return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
1550 | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
1551 | (s0[2] & UTF_CONTINUATION_MASK);
1552 }
1553
1554#endif
1555
1556 /* In conjunction with the exhaustive tests that can be enabled in
1557 * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
1558 * what it is intended to do, and that no flaws in it are masked by
1559 * dropping down and executing the code below
1560 assert(! isUTF8_CHAR(s0, send)
1561 || UTF8_IS_SURROGATE(s0, send)
1562 || UTF8_IS_SUPER(s0, send)
1563 || UTF8_IS_NONCHAR(s0,send));
1564 */
1565
1566 s = s0;
1567 uv = *s0;
1568 possible_problems = 0;
1569 expectlen = 0;
1570 avail_len = 0;
1571 discard_errors = 0;
1572 adjusted_s0 = (U8 *) s0;
1573 uv_so_far = 0;
1574
f9380377
KW
1575 if (errors) {
1576 *errors = 0;
1577 }
1578 else {
1579 errors = &discard_errors;
1580 }
a0dbb045 1581
eb83ed87
KW
1582 /* The order of malformation tests here is important. We should consume as
1583 * few bytes as possible in order to not skip any valid character. This is
1584 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1585 * http://unicode.org/reports/tr36 for more discussion as to why. For
1586 * example, once we've done a UTF8SKIP, we can tell the expected number of
1587 * bytes, and could fail right off the bat if the input parameters indicate
1588 * that there are too few available. But it could be that just that first
1589 * byte is garbled, and the intended character occupies fewer bytes. If we
1590 * blindly assumed that the first byte is correct, and skipped based on
1591 * that number, we could skip over a valid input character. So instead, we
1592 * always examine the sequence byte-by-byte.
1593 *
1594 * We also should not consume too few bytes, otherwise someone could inject
1595 * things. For example, an input could be deliberately designed to
1596 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 1597 * returning to the caller C<*retlen> pointing to the very next byte (one
eb83ed87
KW
1598 * which is actually part of of the overflowing sequence), that could look
1599 * legitimate to the caller, which could discard the initial partial
2b5e7bc2
KW
1600 * sequence and process the rest, inappropriately.
1601 *
1602 * Some possible input sequences are malformed in more than one way. This
1603 * function goes to lengths to try to find all of them. This is necessary
1604 * for correctness, as the inputs may allow one malformation but not
1605 * another, and if we abandon searching for others after finding the
1606 * allowed one, we could allow in something that shouldn't have been.
1607 */
eb83ed87 1608
b5b9af04 1609 if (UNLIKELY(curlen == 0)) {
2b5e7bc2
KW
1610 possible_problems |= UTF8_GOT_EMPTY;
1611 curlen = 0;
5a48568d 1612 uv = UNICODE_REPLACEMENT;
2b5e7bc2 1613 goto ready_to_handle_errors;
0c443dc2
JH
1614 }
1615
eb83ed87
KW
1616 expectlen = UTF8SKIP(s);
1617
1618 /* A well-formed UTF-8 character, as the vast majority of calls to this
1619 * function will be for, has this expected length. For efficiency, set
1620 * things up here to return it. It will be overriden only in those rare
1621 * cases where a malformation is found */
1622 if (retlen) {
1623 *retlen = expectlen;
1624 }
1625
eb83ed87 1626 /* A continuation character can't start a valid sequence */
b5b9af04 1627 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
2b5e7bc2
KW
1628 possible_problems |= UTF8_GOT_CONTINUATION;
1629 curlen = 1;
1630 uv = UNICODE_REPLACEMENT;
1631 goto ready_to_handle_errors;
ba210ebe 1632 }
9041c2e3 1633
dcd27b3c 1634 /* Here is not a continuation byte, nor an invariant. The only thing left
ddb65933
KW
1635 * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
1636 * because it excludes start bytes like \xC0 that always lead to
1637 * overlongs.) */
dcd27b3c 1638
534752c1
KW
1639 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1640 * that indicate the number of bytes in the character's whole UTF-8
1641 * sequence, leaving just the bits that are part of the value. */
1642 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
ba210ebe 1643
e308b348
KW
1644 /* Setup the loop end point, making sure to not look past the end of the
1645 * input string, and flag it as too short if the size isn't big enough. */
e308b348
KW
1646 if (UNLIKELY(curlen < expectlen)) {
1647 possible_problems |= UTF8_GOT_SHORT;
1648 avail_len = curlen;
e308b348
KW
1649 }
1650 else {
2b9519f0 1651 send = (U8*) s0 + expectlen;
e308b348 1652 }
e308b348 1653
eb83ed87 1654 /* Now, loop through the remaining bytes in the character's sequence,
e308b348 1655 * accumulating each into the working value as we go. */
eb83ed87 1656 for (s = s0 + 1; s < send; s++) {
b5b9af04 1657 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
8850bf83 1658 uv = UTF8_ACCUMULATE(uv, *s);
2b5e7bc2
KW
1659 continue;
1660 }
1661
1662 /* Here, found a non-continuation before processing all expected bytes.
1663 * This byte indicates the beginning of a new character, so quit, even
1664 * if allowing this malformation. */
2b5e7bc2 1665 possible_problems |= UTF8_GOT_NON_CONTINUATION;
e308b348 1666 break;
eb83ed87
KW
1667 } /* End of loop through the character's bytes */
1668
1669 /* Save how many bytes were actually in the character */
1670 curlen = s - s0;
1671
2b5e7bc2
KW
1672 /* Note that there are two types of too-short malformation. One is when
1673 * there is actual wrong data before the normal termination of the
1674 * sequence. The other is that the sequence wasn't complete before the end
1675 * of the data we are allowed to look at, based on the input 'curlen'.
1676 * This means that we were passed data for a partial character, but it is
1677 * valid as far as we saw. The other is definitely invalid. This
1678 * distinction could be important to a caller, so the two types are kept
15b010f0
KW
1679 * separate.
1680 *
1681 * A convenience macro that matches either of the too-short conditions. */
1682# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1683
1684 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1685 uv_so_far = uv;
1686 uv = UNICODE_REPLACEMENT;
1687 }
2b5e7bc2 1688
08e73697
KW
1689 /* Check for overflow. The algorithm requires us to not look past the end
1690 * of the current character, even if partial, so the upper limit is 's' */
e050c007
KW
1691 if (UNLIKELY(0 < does_utf8_overflow(s0, s,
1692 1 /* Do consider overlongs */
1693 )))
1694 {
2b5e7bc2
KW
1695 possible_problems |= UTF8_GOT_OVERFLOW;
1696 uv = UNICODE_REPLACEMENT;
eb83ed87 1697 }
eb83ed87 1698
2b5e7bc2
KW
1699 /* Check for overlong. If no problems so far, 'uv' is the correct code
1700 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1701 * we must look at the UTF-8 byte sequence itself to see if it is for an
1702 * overlong */
1703 if ( ( LIKELY(! possible_problems)
1704 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
56576a04 1705 || ( UNLIKELY(possible_problems)
2b5e7bc2
KW
1706 && ( UNLIKELY(! UTF8_IS_START(*s0))
1707 || ( curlen > 1
d6be65ae 1708 && UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0,
08e73697 1709 s - s0))))))
2f8f112e 1710 {
2b5e7bc2
KW
1711 possible_problems |= UTF8_GOT_LONG;
1712
abc28b54 1713 if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
56576a04 1714
abc28b54
KW
1715 /* The calculation in the 'true' branch of this 'if'
1716 * below won't work if overflows, and isn't needed
1717 * anyway. Further below we handle all overflow
1718 * cases */
1719 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1720 {
2b5e7bc2
KW
1721 UV min_uv = uv_so_far;
1722 STRLEN i;
1723
1724 /* Here, the input is both overlong and is missing some trailing
1725 * bytes. There is no single code point it could be for, but there
1726 * may be enough information present to determine if what we have
1727 * so far is for an unallowed code point, such as for a surrogate.
56576a04
KW
1728 * The code further below has the intelligence to determine this,
1729 * but just for non-overlong UTF-8 sequences. What we do here is
1730 * calculate the smallest code point the input could represent if
1731 * there were no too short malformation. Then we compute and save
1732 * the UTF-8 for that, which is what the code below looks at
1733 * instead of the raw input. It turns out that the smallest such
1734 * code point is all we need. */
2b5e7bc2
KW
1735 for (i = curlen; i < expectlen; i++) {
1736 min_uv = UTF8_ACCUMULATE(min_uv,
1737 I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
1738 }
1739
e9f2c446 1740 adjusted_s0 = temp_char_buf;
57ff5f59 1741 (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
2b5e7bc2 1742 }
eb83ed87
KW
1743 }
1744
56576a04
KW
1745 /* Here, we have found all the possible problems, except for when the input
1746 * is for a problematic code point not allowed by the input parameters. */
1747
06188866
KW
1748 /* uv is valid for overlongs */
1749 if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
1750
1751 /* isn't problematic if < this */
1752 && uv >= UNICODE_SURROGATE_FIRST)
2b5e7bc2 1753 || ( UNLIKELY(possible_problems)
d60baaa7
KW
1754
1755 /* if overflow, we know without looking further
1756 * precisely which of the problematic types it is,
1757 * and we deal with those in the overflow handling
1758 * code */
1759 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
57ff5f59
KW
1760 && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
1761 || UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
760c7c2f
KW
1762 && ((flags & ( UTF8_DISALLOW_NONCHAR
1763 |UTF8_DISALLOW_SURROGATE
1764 |UTF8_DISALLOW_SUPER
d044b7a7 1765 |UTF8_DISALLOW_PERL_EXTENDED
760c7c2f
KW
1766 |UTF8_WARN_NONCHAR
1767 |UTF8_WARN_SURROGATE
1768 |UTF8_WARN_SUPER
d22ec717 1769 |UTF8_WARN_PERL_EXTENDED))))
eb83ed87 1770 {
2b5e7bc2
KW
1771 /* If there were no malformations, or the only malformation is an
1772 * overlong, 'uv' is valid */
1773 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1774 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1775 possible_problems |= UTF8_GOT_SURROGATE;
1776 }
1777 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
1778 possible_problems |= UTF8_GOT_SUPER;
1779 }
1780 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1781 possible_problems |= UTF8_GOT_NONCHAR;
1782 }
1783 }
1784 else { /* Otherwise, need to look at the source UTF-8, possibly
1785 adjusted to be non-overlong */
1786
1787 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1788 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
ea5ced44 1789 {
2b5e7bc2
KW
1790 possible_problems |= UTF8_GOT_SUPER;
1791 }
1792 else if (curlen > 1) {
1793 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
1794 NATIVE_UTF8_TO_I8(*adjusted_s0),
1795 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
ea5ced44 1796 {
2b5e7bc2 1797 possible_problems |= UTF8_GOT_SUPER;
ea5ced44 1798 }
2b5e7bc2
KW
1799 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
1800 NATIVE_UTF8_TO_I8(*adjusted_s0),
1801 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1802 {
1803 possible_problems |= UTF8_GOT_SURROGATE;
ea5ced44
KW
1804 }
1805 }
c0236afe 1806
2b5e7bc2
KW
1807 /* We need a complete well-formed UTF-8 character to discern
1808 * non-characters, so can't look for them here */
1809 }
1810 }
949cf498 1811
2b5e7bc2
KW
1812 ready_to_handle_errors:
1813
1814 /* At this point:
1815 * curlen contains the number of bytes in the sequence that
1816 * this call should advance the input by.
e308b348
KW
1817 * avail_len gives the available number of bytes passed in, but
1818 * only if this is less than the expected number of
1819 * bytes, based on the code point's start byte.
2b5e7bc2
KW
1820 * possible_problems' is 0 if there weren't any problems; otherwise a bit
1821 * is set in it for each potential problem found.
1822 * uv contains the code point the input sequence
1823 * represents; or if there is a problem that prevents
1824 * a well-defined value from being computed, it is
1825 * some subsitute value, typically the REPLACEMENT
1826 * CHARACTER.
1827 * s0 points to the first byte of the character
56576a04
KW
1828 * s points to just after were we left off processing
1829 * the character
1830 * send points to just after where that character should
1831 * end, based on how many bytes the start byte tells
1832 * us should be in it, but no further than s0 +
1833 * avail_len
2b5e7bc2 1834 */
eb83ed87 1835
2b5e7bc2
KW
1836 if (UNLIKELY(possible_problems)) {
1837 bool disallowed = FALSE;
1838 const U32 orig_problems = possible_problems;
1839
37657a5b
KW
1840 if (msgs) {
1841 *msgs = NULL;
1842 }
1843
2b5e7bc2
KW
1844 while (possible_problems) { /* Handle each possible problem */
1845 UV pack_warn = 0;
1846 char * message = NULL;
37657a5b 1847 U32 this_flag_bit = 0;
2b5e7bc2
KW
1848
1849 /* Each 'if' clause handles one problem. They are ordered so that
1850 * the first ones' messages will be displayed before the later
6c64cd9d
KW
1851 * ones; this is kinda in decreasing severity order. But the
1852 * overlong must come last, as it changes 'uv' looked at by the
1853 * others */
2b5e7bc2
KW
1854 if (possible_problems & UTF8_GOT_OVERFLOW) {
1855
56576a04
KW
1856 /* Overflow means also got a super and are using Perl's
1857 * extended UTF-8, but we handle all three cases here */
2b5e7bc2 1858 possible_problems
d044b7a7 1859 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
f9380377
KW
1860 *errors |= UTF8_GOT_OVERFLOW;
1861
1862 /* But the API says we flag all errors found */
1863 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1864 *errors |= UTF8_GOT_SUPER;
1865 }
ddb65933 1866 if (flags
d044b7a7 1867 & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
ddb65933 1868 {
d044b7a7 1869 *errors |= UTF8_GOT_PERL_EXTENDED;
f9380377 1870 }
2b5e7bc2 1871
d60baaa7 1872 /* Disallow if any of the three categories say to */
56576a04 1873 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
d60baaa7 1874 || (flags & ( UTF8_DISALLOW_SUPER
d044b7a7 1875 |UTF8_DISALLOW_PERL_EXTENDED)))
d60baaa7
KW
1876 {
1877 disallowed = TRUE;
1878 }
1879
d22ec717
KW
1880 /* Likewise, warn if any say to */
1881 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
1882 || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
d60baaa7 1883 {
2b5e7bc2 1884
ddb65933
KW
1885 /* The warnings code explicitly says it doesn't handle the
1886 * case of packWARN2 and two categories which have
1887 * parent-child relationship. Even if it works now to
1888 * raise the warning if either is enabled, it wouldn't
1889 * necessarily do so in the future. We output (only) the
56576a04 1890 * most dire warning */
ddb65933 1891 if (! (flags & UTF8_CHECK_ONLY)) {
37657a5b 1892 if (msgs || ckWARN_d(WARN_UTF8)) {
ddb65933
KW
1893 pack_warn = packWARN(WARN_UTF8);
1894 }
37657a5b 1895 else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
ddb65933
KW
1896 pack_warn = packWARN(WARN_NON_UNICODE);
1897 }
1898 if (pack_warn) {
1899 message = Perl_form(aTHX_ "%s: %s (overflows)",
1900 malformed_text,
05b9033b 1901 _byte_dump_string(s0, curlen, 0));
37657a5b 1902 this_flag_bit = UTF8_GOT_OVERFLOW;
ddb65933 1903 }
2b5e7bc2
KW
1904 }
1905 }
1906 }
1907 else if (possible_problems & UTF8_GOT_EMPTY) {
1908 possible_problems &= ~UTF8_GOT_EMPTY;
f9380377 1909 *errors |= UTF8_GOT_EMPTY;
2b5e7bc2
KW
1910
1911 if (! (flags & UTF8_ALLOW_EMPTY)) {
d1f8d421
KW
1912
1913 /* This so-called malformation is now treated as a bug in
1914 * the caller. If you have nothing to decode, skip calling
1915 * this function */
1916 assert(0);
1917
2b5e7bc2 1918 disallowed = TRUE;
37657a5b
KW
1919 if ( (msgs
1920 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1921 {
2b5e7bc2
KW
1922 pack_warn = packWARN(WARN_UTF8);
1923 message = Perl_form(aTHX_ "%s (empty string)",
1924 malformed_text);
37657a5b 1925 this_flag_bit = UTF8_GOT_EMPTY;
2b5e7bc2
KW
1926 }
1927 }
1928 }
1929 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1930 possible_problems &= ~UTF8_GOT_CONTINUATION;
f9380377 1931 *errors |= UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1932
1933 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1934 disallowed = TRUE;
37657a5b
KW
1935 if (( msgs
1936 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1937 {
2b5e7bc2
KW
1938 pack_warn = packWARN(WARN_UTF8);
1939 message = Perl_form(aTHX_
1940 "%s: %s (unexpected continuation byte 0x%02x,"
1941 " with no preceding start byte)",
1942 malformed_text,
7e2f38b2 1943 _byte_dump_string(s0, 1, 0), *s0);
37657a5b 1944 this_flag_bit = UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1945 }
1946 }
1947 }
2b5e7bc2
KW
1948 else if (possible_problems & UTF8_GOT_SHORT) {
1949 possible_problems &= ~UTF8_GOT_SHORT;
f9380377 1950 *errors |= UTF8_GOT_SHORT;
2b5e7bc2
KW
1951
1952 if (! (flags & UTF8_ALLOW_SHORT)) {
1953 disallowed = TRUE;
37657a5b
KW
1954 if (( msgs
1955 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1956 {
2b5e7bc2
KW
1957 pack_warn = packWARN(WARN_UTF8);
1958 message = Perl_form(aTHX_
56576a04
KW
1959 "%s: %s (too short; %d byte%s available, need %d)",
1960 malformed_text,
1961 _byte_dump_string(s0, send - s0, 0),
1962 (int)avail_len,
1963 avail_len == 1 ? "" : "s",
1964 (int)expectlen);
37657a5b 1965 this_flag_bit = UTF8_GOT_SHORT;
2b5e7bc2
KW
1966 }
1967 }
ba210ebe 1968
2b5e7bc2 1969 }
e308b348
KW
1970 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1971 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1972 *errors |= UTF8_GOT_NON_CONTINUATION;
1973
1974 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1975 disallowed = TRUE;
37657a5b
KW
1976 if (( msgs
1977 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1978 {
99a765e9
KW
1979
1980 /* If we don't know for sure that the input length is
1981 * valid, avoid as much as possible reading past the
1982 * end of the buffer */
1983 int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
1984 ? s - s0
1985 : send - s0;
e308b348
KW
1986 pack_warn = packWARN(WARN_UTF8);
1987 message = Perl_form(aTHX_ "%s",
1988 unexpected_non_continuation_text(s0,
99a765e9 1989 printlen,
e308b348
KW
1990 s - s0,
1991 (int) expectlen));
37657a5b 1992 this_flag_bit = UTF8_GOT_NON_CONTINUATION;
e308b348
KW
1993 }
1994 }
1995 }
2b5e7bc2
KW
1996 else if (possible_problems & UTF8_GOT_SURROGATE) {
1997 possible_problems &= ~UTF8_GOT_SURROGATE;
1998
f9380377
KW
1999 if (flags & UTF8_WARN_SURROGATE) {
2000 *errors |= UTF8_GOT_SURROGATE;
2001
2002 if ( ! (flags & UTF8_CHECK_ONLY)
37657a5b 2003 && (msgs || ckWARN_d(WARN_SURROGATE)))
f9380377 2004 {
2b5e7bc2
KW
2005 pack_warn = packWARN(WARN_SURROGATE);
2006
2007 /* These are the only errors that can occur with a
2008 * surrogate when the 'uv' isn't valid */
2009 if (orig_problems & UTF8_GOT_TOO_SHORT) {
2010 message = Perl_form(aTHX_
2011 "UTF-16 surrogate (any UTF-8 sequence that"
2012 " starts with \"%s\" is for a surrogate)",
7e2f38b2 2013 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
2014 }
2015 else {
c94c2f39 2016 message = Perl_form(aTHX_ surrogate_cp_format, uv);
2b5e7bc2 2017 }
37657a5b 2018 this_flag_bit = UTF8_GOT_SURROGATE;
f9380377 2019 }
2b5e7bc2 2020 }
ba210ebe 2021
2b5e7bc2
KW
2022 if (flags & UTF8_DISALLOW_SURROGATE) {
2023 disallowed = TRUE;
f9380377 2024 *errors |= UTF8_GOT_SURROGATE;
2b5e7bc2
KW
2025 }
2026 }
2027 else if (possible_problems & UTF8_GOT_SUPER) {
2028 possible_problems &= ~UTF8_GOT_SUPER;
949cf498 2029
f9380377
KW
2030 if (flags & UTF8_WARN_SUPER) {
2031 *errors |= UTF8_GOT_SUPER;
2032
2033 if ( ! (flags & UTF8_CHECK_ONLY)
37657a5b 2034 && (msgs || ckWARN_d(WARN_NON_UNICODE)))
f9380377 2035 {
2b5e7bc2
KW
2036 pack_warn = packWARN(WARN_NON_UNICODE);
2037
2038 if (orig_problems & UTF8_GOT_TOO_SHORT) {
2039 message = Perl_form(aTHX_
2040 "Any UTF-8 sequence that starts with"
2041 " \"%s\" is for a non-Unicode code point,"
2042 " may not be portable",
7e2f38b2 2043 _byte_dump_string(s0, curlen, 0));
2b5e7bc2
KW
2044 }
2045 else {
c94c2f39 2046 message = Perl_form(aTHX_ super_cp_format, uv);
2b5e7bc2 2047 }
37657a5b 2048 this_flag_bit = UTF8_GOT_SUPER;
f9380377 2049 }
2b5e7bc2 2050 }
ba210ebe 2051
57ff5f59
KW
2052 /* Test for Perl's extended UTF-8 after the regular SUPER ones,
2053 * and before possibly bailing out, so that the more dire
2054 * warning will override the regular one. */
2055 if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
2b5e7bc2 2056 if ( ! (flags & UTF8_CHECK_ONLY)
d044b7a7 2057 && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
37657a5b 2058 && (msgs || ckWARN_d(WARN_NON_UNICODE)))
2b5e7bc2 2059 {
db0f09e6 2060 pack_warn = packWARN(WARN_NON_UNICODE);
2b5e7bc2 2061
57ff5f59
KW
2062 /* If it is an overlong that evaluates to a code point
2063 * that doesn't have to use the Perl extended UTF-8, it
2064 * still used it, and so we output a message that
2065 * doesn't refer to the code point. The same is true
2066 * if there was a SHORT malformation where the code
2067 * point is not valid. In that case, 'uv' will have
2068 * been set to the REPLACEMENT CHAR, and the message
2069 * below without the code point in it will be selected
2070 * */
2071 if (UNICODE_IS_PERL_EXTENDED(uv)) {
2b5e7bc2 2072 message = Perl_form(aTHX_
57ff5f59 2073 perl_extended_cp_format, uv);
2b5e7bc2
KW
2074 }
2075 else {
2076 message = Perl_form(aTHX_
57ff5f59
KW
2077 "Any UTF-8 sequence that starts with"
2078 " \"%s\" is a Perl extension, and"
2079 " so is not portable",
2080 _byte_dump_string(s0, curlen, 0));
2b5e7bc2 2081 }
37657a5b 2082 this_flag_bit = UTF8_GOT_PERL_EXTENDED;
2b5e7bc2
KW
2083 }
2084
d044b7a7
KW
2085 if (flags & ( UTF8_WARN_PERL_EXTENDED
2086 |UTF8_DISALLOW_PERL_EXTENDED))
ddb65933 2087 {
d044b7a7 2088 *errors |= UTF8_GOT_PERL_EXTENDED;
f9380377 2089
d044b7a7 2090 if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
f9380377
KW
2091 disallowed = TRUE;
2092 }
2b5e7bc2
KW
2093 }
2094 }
eb83ed87 2095
2b5e7bc2 2096 if (flags & UTF8_DISALLOW_SUPER) {
f9380377 2097 *errors |= UTF8_GOT_SUPER;
2b5e7bc2
KW
2098 disallowed = TRUE;
2099 }
2b5e7bc2
KW
2100 }
2101 else if (possible_problems & UTF8_GOT_NONCHAR) {
2102 possible_problems &= ~UTF8_GOT_NONCHAR;
ba210ebe 2103
f9380377
KW
2104 if (flags & UTF8_WARN_NONCHAR) {
2105 *errors |= UTF8_GOT_NONCHAR;
2106
2107 if ( ! (flags & UTF8_CHECK_ONLY)
37657a5b 2108 && (msgs || ckWARN_d(WARN_NONCHAR)))
f9380377 2109 {
2b5e7bc2
KW
2110 /* The code above should have guaranteed that we don't
2111 * get here with errors other than overlong */
2112 assert (! (orig_problems
2113 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
2114
2115 pack_warn = packWARN(WARN_NONCHAR);
c94c2f39 2116 message = Perl_form(aTHX_ nonchar_cp_format, uv);
37657a5b 2117 this_flag_bit = UTF8_GOT_NONCHAR;
f9380377 2118 }
2b5e7bc2 2119 }
5b311467 2120
2b5e7bc2
KW
2121 if (flags & UTF8_DISALLOW_NONCHAR) {
2122 disallowed = TRUE;
f9380377 2123 *errors |= UTF8_GOT_NONCHAR;
2b5e7bc2 2124 }
6c64cd9d
KW
2125 }
2126 else if (possible_problems & UTF8_GOT_LONG) {
2127 possible_problems &= ~UTF8_GOT_LONG;
2128 *errors |= UTF8_GOT_LONG;
2129
2130 if (flags & UTF8_ALLOW_LONG) {
2131
2132 /* We don't allow the actual overlong value, unless the
2133 * special extra bit is also set */
2134 if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
2135 & ~UTF8_ALLOW_LONG)))
2136 {
2137 uv = UNICODE_REPLACEMENT;
2138 }
2139 }
2140 else {
2141 disallowed = TRUE;
2142
37657a5b
KW
2143 if (( msgs
2144 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
2145 {
6c64cd9d
KW
2146 pack_warn = packWARN(WARN_UTF8);
2147
2148 /* These error types cause 'uv' to be something that
2149 * isn't what was intended, so can't use it in the
2150 * message. The other error types either can't
2151 * generate an overlong, or else the 'uv' is valid */
2152 if (orig_problems &
2153 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
2154 {
2155 message = Perl_form(aTHX_
2156 "%s: %s (any UTF-8 sequence that starts"
2157 " with \"%s\" is overlong which can and"
2158 " should be represented with a"
2159 " different, shorter sequence)",
2160 malformed_text,
2161 _byte_dump_string(s0, send - s0, 0),
2162 _byte_dump_string(s0, curlen, 0));
2163 }
2164 else {
2165 U8 tmpbuf[UTF8_MAXBYTES+1];
1be62ab9
KW
2166 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
2167 uv, 0);
d819dc50
KW
2168 /* Don't use U+ for non-Unicode code points, which
2169 * includes those in the Latin1 range */
2170 const char * preface = ( uv > PERL_UNICODE_MAX
2171#ifdef EBCDIC
2172 || uv <= 0xFF
2173#endif
2174 )
2175 ? "0x"
2176 : "U+";
6c64cd9d
KW
2177 message = Perl_form(aTHX_
2178 "%s: %s (overlong; instead use %s to represent"
2179 " %s%0*" UVXf ")",
2180 malformed_text,
2181 _byte_dump_string(s0, send - s0, 0),
2182 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
2183 preface,
2184 ((uv < 256) ? 2 : 4), /* Field width of 2 for
2185 small code points */
1be62ab9 2186 UNI_TO_NATIVE(uv));
6c64cd9d 2187 }
37657a5b 2188 this_flag_bit = UTF8_GOT_LONG;
6c64cd9d
KW
2189 }
2190 }
2b5e7bc2
KW
2191 } /* End of looking through the possible flags */
2192
2193 /* Display the message (if any) for the problem being handled in
2194 * this iteration of the loop */
2195 if (message) {
37657a5b 2196 if (msgs) {
37657a5b
KW
2197 assert(this_flag_bit);
2198
2199 if (*msgs == NULL) {
2200 *msgs = newAV();
2201 }
2202
bb07812e
KW
2203 av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
2204 pack_warn,
2205 this_flag_bit)));
37657a5b
KW
2206 }
2207 else if (PL_op)
2b5e7bc2
KW
2208 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
2209 OP_DESC(PL_op));
2210 else
2211 Perl_warner(aTHX_ pack_warn, "%s", message);
2212 }
ddb65933 2213 } /* End of 'while (possible_problems)' */
a0dbb045 2214
2b5e7bc2
KW
2215 /* Since there was a possible problem, the returned length may need to
2216 * be changed from the one stored at the beginning of this function.
2217 * Instead of trying to figure out if that's needed, just do it. */
2218 if (retlen) {
2219 *retlen = curlen;
2220 }
a0dbb045 2221
2b5e7bc2
KW
2222 if (disallowed) {
2223 if (flags & UTF8_CHECK_ONLY && retlen) {
2224 *retlen = ((STRLEN) -1);
2225 }
2226 return 0;
2227 }
eb83ed87 2228 }
ba210ebe 2229
2b5e7bc2 2230 return UNI_TO_NATIVE(uv);
a0ed51b3
LW
2231}
2232
8e84507e 2233/*
ec5f19d0
KW
2234=for apidoc utf8_to_uvchr_buf
2235
2236Returns the native code point of the first character in the string C<s> which
2237is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 2238C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 2239
524080c4
KW
2240If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2241enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 2242C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 2243(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 2244C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 2245the next possible position in C<s> that could begin a non-malformed character.
de69f3af 2246See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 2247returned.
ec5f19d0
KW
2248
2249=cut
52be2536
KW
2250
2251Also implemented as a macro in utf8.h
2252
ec5f19d0
KW
2253*/
2254
2255
2256UV
2257Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2258{
7f974d7e
KW
2259 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
2260
ec5f19d0
KW
2261 assert(s < send);
2262
2263 return utf8n_to_uvchr(s, send - s, retlen,
ddb65933 2264 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
ec5f19d0
KW
2265}
2266
52be2536
KW
2267/* This is marked as deprecated
2268 *
ec5f19d0
KW
2269=for apidoc utf8_to_uvuni_buf
2270
de69f3af
KW
2271Only in very rare circumstances should code need to be dealing in Unicode
2272(as opposed to native) code points. In those few cases, use
07dfe0a4
KW
2273C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead. If you
2274are not absolutely sure this is one of those cases, then assume it isn't and
2275use plain C<utf8_to_uvchr_buf> instead.
4f83cdcd
KW
2276
2277Returns the Unicode (not-native) code point of the first character in the
2278string C<s> which
ec5f19d0
KW
2279is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
2280C<retlen> will be set to the length, in bytes, of that character.
2281
524080c4
KW
2282If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2283enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
2284NULL) to -1. If those warnings are off, the computed value if well-defined (or
2285the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
2286is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
2287next possible position in C<s> that could begin a non-malformed character.
de69f3af 2288See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0
KW
2289
2290=cut
2291*/
2292
2293UV
2294Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2295{
2296 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
2297
2298 assert(send > s);
2299
5962d97e 2300 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
ec5f19d0
KW
2301}
2302
b76347f2 2303/*
87cea99e 2304=for apidoc utf8_length
b76347f2 2305
b2e7ed74
KW
2306Returns the number of characters in the sequence of UTF-8-encoded bytes starting
2307at C<s> and ending at the byte just before C<e>. If <s> and <e> point to the
2308same place, it returns 0 with no warning raised.
2309
2310If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
2311and returns the number of valid characters.
b76347f2
JH
2312
2313=cut
2314*/
2315
2316STRLEN
35a4481c 2317Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
2318{
2319 STRLEN len = 0;
2320
7918f24d
NC
2321 PERL_ARGS_ASSERT_UTF8_LENGTH;
2322
8850bf83
JH
2323 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
2324 * the bitops (especially ~) can create illegal UTF-8.
2325 * In other words: in Perl UTF-8 is not just for Unicode. */
2326
a3b680e6
AL
2327 if (e < s)
2328 goto warn_and_return;
b76347f2 2329 while (s < e) {
4cbf4130 2330 s += UTF8SKIP(s);
8e91ec7f
AV
2331 len++;
2332 }
2333
2334 if (e != s) {
2335 len--;
2336 warn_and_return:
9b387841
NC
2337 if (PL_op)
2338 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2339 "%s in %s", unees, OP_DESC(PL_op));
2340 else
61a12c31 2341 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
2342 }
2343
2344 return len;
2345}
2346
b06226ff 2347/*
fed3ba5d
NC
2348=for apidoc bytes_cmp_utf8
2349
a1433954 2350Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
2351sequence of characters (stored as UTF-8)
2352in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
2353equal, -1 or -2 if the first string is less than the second string, +1 or +2
2354if the first string is greater than the second string.
2355
2356-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
2357longer string. -2 or +2 is returned if
2358there was a difference between characters
fed3ba5d
NC
2359within the strings.
2360
2361=cut
2362*/
2363
2364int
2365Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
2366{
2367 const U8 *const bend = b + blen;
2368 const U8 *const uend = u + ulen;
2369
2370 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
2371
2372 while (b < bend && u < uend) {
2373 U8 c = *u++;
2374 if (!UTF8_IS_INVARIANT(c)) {
2375 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2376 if (u < uend) {
2377 U8 c1 = *u++;
2378 if (UTF8_IS_CONTINUATION(c1)) {
a62b247b 2379 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
fed3ba5d 2380 } else {
2b5e7bc2 2381 /* diag_listed_as: Malformed UTF-8 character%s */
fed3ba5d 2382 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
56576a04
KW
2383 "%s %s%s",
2384 unexpected_non_continuation_text(u - 2, 2, 1, 2),
2385 PL_op ? " in " : "",
2386 PL_op ? OP_DESC(PL_op) : "");
fed3ba5d
NC
2387 return -2;
2388 }
2389 } else {
2390 if (PL_op)
2391 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2392 "%s in %s", unees, OP_DESC(PL_op));
2393 else
61a12c31 2394 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
2395 return -2; /* Really want to return undef :-) */
2396 }
2397 } else {
2398 return -2;
2399 }
2400 }
2401 if (*b != c) {
2402 return *b < c ? -2 : +2;
2403 }
2404 ++b;
2405 }
2406
2407 if (b == bend && u == uend)
2408 return 0;
2409
2410 return b < bend ? +1 : -1;
2411}
2412
2413/*
87cea99e 2414=for apidoc utf8_to_bytes
6940069f 2415
3bc0c78c 2416Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
a1433954 2417Unlike L</bytes_to_utf8>, this over-writes the original string, and
09af0336 2418updates C<*lenp> to contain the new length.
3bc0c78c
KW
2419Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
2420
2421Upon successful return, the number of variants in the string can be computed by
23b37b12
KW
2422having saved the value of C<*lenp> before the call, and subtracting the
2423after-call value of C<*lenp> from it.
6940069f 2424
a1433954 2425If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 2426
6940069f
GS
2427=cut
2428*/
2429
2430U8 *
09af0336 2431Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
6940069f 2432{
9fe0d3c2 2433 U8 * first_variant;
246fae53 2434
7918f24d 2435 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 2436 PERL_UNUSED_CONTEXT;
7918f24d 2437
9fe0d3c2 2438 /* This is a no-op if no variants at all in the input */
09af0336 2439 if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
9fe0d3c2
KW
2440 return s;
2441 }
2442
2443 {
3c5aa262 2444 U8 * const save = s;
09af0336 2445 U8 * const send = s + *lenp;
3c5aa262
KW
2446 U8 * d;
2447
2448 /* Nothing before the first variant needs to be changed, so start the real
2449 * work there */
2450 s = first_variant;
2451 while (s < send) {
2452 if (! UTF8_IS_INVARIANT(*s)) {
2453 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
09af0336 2454 *lenp = ((STRLEN) -1);
3c5aa262
KW
2455 return 0;
2456 }
2457 s++;
d59937ca
KW
2458 }
2459 s++;
dcad2880 2460 }
dcad2880 2461
3c5aa262
KW
2462 /* Is downgradable, so do it */
2463 d = s = first_variant;
2464 while (s < send) {
2465 U8 c = *s++;
2466 if (! UVCHR_IS_INVARIANT(c)) {
2467 /* Then it is two-byte encoded */
2468 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2469 s++;
2470 }
2471 *d++ = c;
2472 }
2473 *d = '\0';
09af0336 2474 *lenp = d - save;
3c5aa262
KW
2475
2476 return save;
9fe0d3c2 2477 }
6940069f
GS
2478}
2479
2480/*
87cea99e 2481=for apidoc bytes_from_utf8
f9a63242 2482
09af0336 2483Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
41ae6089 2484byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
4f3d592d
KW
2485actually encoded in UTF-8.
2486
2487Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2488the input string.
2489
41ae6089
KW
2490Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2491not expressible in native byte encoding. In these cases, C<*is_utf8p> and
09af0336 2492C<*lenp> are unchanged, and the return value is the original C<s>.
4f3d592d 2493
41ae6089 2494Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
4f3d592d 2495newly created string containing a downgraded copy of C<s>, and whose length is
9ff99fb3
KW
2496returned in C<*lenp>, updated. The new string is C<NUL>-terminated. The
2497caller is responsible for arranging for the memory used by this string to get
2498freed.
f9a63242 2499
3bc0c78c 2500Upon successful return, the number of variants in the string can be computed by
23b37b12
KW
2501having saved the value of C<*lenp> before the call, and subtracting the
2502after-call value of C<*lenp> from it.
3bc0c78c 2503
37607a96 2504=cut
976c1b08
KW
2505
2506There is a macro that avoids this function call, but this is retained for
2507anyone who calls it with the Perl_ prefix */
f9a63242
JH
2508
2509U8 *
41ae6089 2510Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
f9a63242 2511{
7918f24d 2512 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 2513 PERL_UNUSED_CONTEXT;
f9a63242 2514
976c1b08
KW
2515 return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2516}
2517
2518/*
2519No = here because currently externally undocumented
2520for apidoc bytes_from_utf8_loc
2521
2522Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where
2523to store the location of the first character in C<"s"> that cannot be
2524converted to non-UTF8.
2525
2526If that parameter is C<NULL>, this function behaves identically to
2527C<bytes_from_utf8>.
2528
2529Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2530C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2531
2532Otherwise, the function returns a newly created C<NUL>-terminated string
2533containing the non-UTF8 equivalent of the convertible first portion of
2534C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>.
2535If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2536and C<*first_non_downgradable> is set to C<NULL>.
2537
2538Otherwise, C<*first_non_downgradable> set to point to the first byte of the
2539first character in the original string that wasn't converted. C<*is_utf8p> is
2540unchanged. Note that the new string may have length 0.
2541
2542Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2543C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2544converts as many characters in it as possible stopping at the first one it
385b74be 2545finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is
976c1b08
KW
2546set to point to that. The function returns the portion that could be converted
2547in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2548not including the terminating C<NUL>. If the very first character in the
2549original could not be converted, C<*lenp> will be 0, and the new string will
2550contain just a single C<NUL>. If the entire input string was converted,
2551C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2552
2553Upon successful return, the number of variants in the converted portion of the
2554string can be computed by having saved the value of C<*lenp> before the call,
2555and subtracting the after-call value of C<*lenp> from it.
2556
2557=cut
2558
2559
2560*/
2561
2562U8 *
2563Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2564{
2565 U8 *d;
2566 const U8 *original = s;
2567 U8 *converted_start;
2568 const U8 *send = s + *lenp;
f9a63242 2569
976c1b08 2570 PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
170a1c22 2571
976c1b08
KW
2572 if (! *is_utf8p) {
2573 if (first_unconverted) {
2574 *first_unconverted = NULL;
2575 }
2576
2577 return (U8 *) original;
2578 }
2579
2580 Newx(d, (*lenp) + 1, U8);
2581
2582 converted_start = d;
7299a045
KW
2583 while (s < send) {
2584 U8 c = *s++;
2585 if (! UTF8_IS_INVARIANT(c)) {
976c1b08
KW
2586
2587 /* Then it is multi-byte encoded. If the code point is above 0xFF,
2588 * have to stop now */
2589 if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2590 if (first_unconverted) {
2591 *first_unconverted = s - 1;
2592 goto finish_and_return;
2593 }
2594 else {
2595 Safefree(converted_start);
2596 return (U8 *) original;
2597 }
2598 }
2599
7299a045
KW
2600 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2601 s++;
38af28cf 2602 }
7299a045
KW
2603 *d++ = c;
2604 }
170a1c22 2605
976c1b08
KW
2606 /* Here, converted the whole of the input */
2607 *is_utf8p = FALSE;
2608 if (first_unconverted) {
2609 *first_unconverted = NULL;
170a1c22 2610 }
976c1b08
KW
2611
2612 finish_and_return:
46a08a6f
KW
2613 *d = '\0';
2614 *lenp = d - converted_start;
976c1b08
KW
2615
2616 /* Trim unused space */
2617 Renew(converted_start, *lenp + 1, U8);
2618
2619 return converted_start;
f9a63242
JH
2620}
2621
2622/*
87cea99e 2623=for apidoc bytes_to_utf8
6940069f 2624
09af0336 2625Converts a string C<s> of length C<*lenp> bytes from the native encoding into
ff97e5cf 2626UTF-8.
09af0336 2627Returns a pointer to the newly-created string, and sets C<*lenp> to
9ff99fb3
KW
2628reflect the new length in bytes. The caller is responsible for arranging for
2629the memory used by this string to get freed.
6940069f 2630
3bc0c78c 2631Upon successful return, the number of variants in the string can be computed by
23b37b12 2632having saved the value of C<*lenp> before the call, and subtracting it from the
3bc0c78c
KW
2633after-call value of C<*lenp>.
2634
75200dff 2635A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
2636
2637If you want to convert to UTF-8 from encodings other than
2638the native (Latin1 or EBCDIC),
a1433954 2639see L</sv_recode_to_utf8>().
c9ada85f 2640
497711e7 2641=cut
6940069f
GS
2642*/
2643
2644U8*
09af0336 2645Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
6940069f 2646{
09af0336 2647 const U8 * const send = s + (*lenp);
6940069f
GS
2648 U8 *d;
2649 U8 *dst;
7918f24d
NC
2650
2651 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 2652 PERL_UNUSED_CONTEXT;
6940069f 2653
09af0336 2654 Newx(d, (*lenp) * 2 + 1, U8);
6940069f
GS
2655 dst = d;
2656
2657 while (s < send) {
55d09dc8
KW
2658 append_utf8_from_native_byte(*s, &d);
2659 s++;
6940069f 2660 }
2e11cf67 2661
6940069f 2662 *d = '\0';
09af0336 2663 *lenp = d-dst;
2e11cf67
KW
2664
2665 /* Trim unused space */
2666 Renew(dst, *lenp + 1, U8);
2667
6940069f
GS
2668 return dst;
2669}
2670
a0ed51b3 2671/*
624504c5
KW
2672 * Convert native (big-endian) UTF-16 to UTF-8. For reversed (little-endian),
2673 * use utf16_to_utf8_reversed().
a0ed51b3 2674 *
624504c5
KW
2675 * UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes.
2676 * UTF-8 requires 1-3 bytes for every code point below 0x1000; otherwise 4 bytes.
2677 * UTF-EBCDIC requires 1-4 bytes for every code point below 0x1000; otherwise 4-5 bytes.
2678 *
2679 * These functions don't check for overflow. The worst case is every code
2680 * point in the input is 2 bytes, and requires 4 bytes on output. (If the code
2681 * is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.) Therefore the
2682 * destination must be pre-extended to 2 times the source length.
2683 *
2684 * Do not use in-place. We optimize for native, for obvious reasons. */
a0ed51b3
LW
2685
2686U8*
dea0fc0b 2687Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 2688{
dea0fc0b
JH
2689 U8* pend;
2690 U8* dstart = d;
2691
7918f24d
NC
2692 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2693
dea0fc0b 2694 if (bytelen & 1)
56576a04
KW
2695 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf,
2696 (UV)bytelen);
dea0fc0b
JH
2697
2698 pend = p + bytelen;
2699
a0ed51b3 2700 while (p < pend) {
dea0fc0b
JH
2701 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
2702 p += 2;
2d1545e5 2703 if (OFFUNI_IS_INVARIANT(uv)) {
56d37426 2704 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
2705 continue;
2706 }
56d37426
KW
2707 if (uv <= MAX_UTF8_TWO_BYTE) {
2708 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
2709 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
2710 continue;
2711 }
ffd0a9d3 2712
46956fad
KW
2713#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2714#define LAST_HIGH_SURROGATE 0xDBFF
2715#define FIRST_LOW_SURROGATE 0xDC00
2716#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
ffd0a9d3 2717#define FIRST_IN_PLANE1 0x10000
e23c50db
KW
2718
2719 /* This assumes that most uses will be in the first Unicode plane, not
2720 * needing surrogates */
2721 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
2722 && uv <= UNICODE_SURROGATE_LAST))
2723 {
2724 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2725 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2726 }
2727 else {
01ea242b 2728 UV low = (p[0] << 8) + p[1];
e23c50db
KW
2729 if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
2730 || UNLIKELY(low > LAST_LOW_SURROGATE))
2731 {
01ea242b 2732 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db
KW
2733 }
2734 p += 2;
46956fad 2735 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
ffd0a9d3 2736 + (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1;
01ea242b 2737 }
a0ed51b3 2738 }
56d37426
KW
2739#ifdef EBCDIC
2740 d = uvoffuni_to_utf8_flags(d, uv, 0);
2741#else
ffd0a9d3 2742 if (uv < FIRST_IN_PLANE1) {
eb160463
GS
2743 *d++ = (U8)(( uv >> 12) | 0xe0);
2744 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2745 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2746 continue;
2747 }
2748 else {
eb160463
GS
2749 *d++ = (U8)(( uv >> 18) | 0xf0);
2750 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
2751 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2752 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2753 continue;
2754 }
56d37426 2755#endif
a0ed51b3 2756 }
dea0fc0b 2757 *newlen = d - dstart;
a0ed51b3
LW
2758 return d;
2759}
2760
2761/* Note: this one is slightly destructive of the source. */
2762
2763U8*
dea0fc0b 2764Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
2765{
2766 U8* s = (U8*)p;
d4c19fe8 2767 U8* const send = s + bytelen;
7918f24d
NC
2768
2769 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2770
e0ea5e2d 2771 if (bytelen & 1)
147e3846 2772 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
e0ea5e2d
NC
2773 (UV)bytelen);
2774
a0ed51b3 2775 while (s < send) {
d4c19fe8 2776 const U8 tmp = s[0];
a0ed51b3
LW
2777 s[0] = s[1];
2778 s[1] = tmp;
2779 s += 2;
2780 }
dea0fc0b 2781 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
2782}
2783
922e8cb4
KW
2784bool
2785Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2786{
dc31b55c 2787 return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
922e8cb4
KW
2788}
2789
f9ae8fb6
JD
2790/* Internal function so we can deprecate the external one, and call
2791 this one from other deprecated functions in this file */
2792
f2645549
KW
2793bool
2794Perl__is_utf8_idstart(pTHX_ const U8 *p)
61b19385 2795{
f2645549 2796 PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
61b19385
KW
2797
2798 if (*p == '_')
2799 return TRUE;
af0d7385
KW
2800 return is_utf8_common(p, NULL,
2801 "This is buggy if this gets used",
2802 PL_utf8_idstart);
61b19385
KW
2803}
2804
5092f92a 2805bool
eba68aa0
KW
2806Perl__is_uni_perl_idcont(pTHX_ UV c)
2807{
c12658c9 2808 return _invlist_contains_cp(PL_utf8_perl_idcont, c);
eba68aa0
KW
2809}
2810
2811bool
f91dcd13
KW
2812Perl__is_uni_perl_idstart(pTHX_ UV c)
2813{
c12658c9 2814 return _invlist_contains_cp(PL_utf8_perl_idstart, c);
f91dcd13
KW
2815}
2816
3a4c58c9 2817UV
56576a04
KW
2818Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2819 const char S_or_s)
3a4c58c9
KW
2820{
2821 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2822 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
2823 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2824 * either "SS" or "Ss". Which one to use is passed into the routine in
2825 * 'S_or_s' to avoid a test */
2826
2827 UV converted = toUPPER_LATIN1_MOD(c);
2828
2829 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2830
2831 assert(S_or_s == 'S' || S_or_s == 's');
2832
6f2d5cbc 2833 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 2834 characters in this range */
3a4c58c9
KW
2835 *p = (U8) converted;
2836 *lenp = 1;
2837 return converted;
2838 }
2839
2840 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2841 * which it maps to one of them, so as to only have to have one check for
2842 * it in the main case */
2843 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2844 switch (c) {
2845 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2846 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2847 break;
2848 case MICRO_SIGN:
2849 converted = GREEK_CAPITAL_LETTER_MU;
2850 break;
79e064b9
KW
2851#if UNICODE_MAJOR_VERSION > 2 \
2852 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2853 && UNICODE_DOT_DOT_VERSION >= 8)
3a4c58c9
KW
2854 case LATIN_SMALL_LETTER_SHARP_S:
2855 *(p)++ = 'S';
2856 *p = S_or_s;
2857 *lenp = 2;
2858 return 'S';
79e064b9 2859#endif
3a4c58c9 2860 default:
56576a04
KW
2861 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
2862 " '%c' to map to '%c'",
2863 c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
e5964223 2864 NOT_REACHED; /* NOTREACHED */
3a4c58c9
KW
2865 }
2866 }
2867
2868 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2869 *p = UTF8_TWO_BYTE_LO(converted);
2870 *lenp = 2;
2871
2872 return converted;
2873}
2874
fe63c520
KW
2875/* If compiled on an early Unicode version, there may not be auxiliary tables
2876 * */
2877#ifndef HAS_UC_AUX_TABLES
2878# define UC_AUX_TABLE_ptrs NULL
2879# define UC_AUX_TABLE_lengths NULL
2880#endif
2881#ifndef HAS_TC_AUX_TABLES
2882# define TC_AUX_TABLE_ptrs NULL
2883# define TC_AUX_TABLE_lengths NULL
2884#endif
2885#ifndef HAS_LC_AUX_TABLES
2886# define LC_AUX_TABLE_ptrs NULL
2887# define LC_AUX_TABLE_lengths NULL
2888#endif
2889#ifndef HAS_CF_AUX_TABLES
2890# define CF_AUX_TABLE_ptrs NULL
2891# define CF_AUX_TABLE_lengths NULL
2892#endif
2893#ifndef HAS_UC_AUX_TABLES
2894# define UC_AUX_TABLE_ptrs NULL
2895# define UC_AUX_TABLE_lengths NULL
2896#endif
2897
50bda2c3
KW
2898/* Call the function to convert a UTF-8 encoded character to the specified case.
2899 * Note that there may be more than one character in the result.
6fa2f9bc
KW
2900 * 's' is a pointer to the first byte of the input character
2901 * 'd' will be set to the first byte of the string of changed characters. It
50bda2c3 2902 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
6fa2f9bc 2903 * 'lenp' will be set to the length in bytes of the string of changed characters
50bda2c3 2904 *
56576a04 2905 * The functions return the ordinal of the first character in the string of
6fa2f9bc 2906 * 'd' */
56576a04 2907#define CALL_UPPER_CASE(uv, s, d, lenp) \
8946fcd9
KW
2908 _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper, \
2909 Uppercase_Mapping_invmap, \
2910 UC_AUX_TABLE_ptrs, \
2911 UC_AUX_TABLE_lengths, \
2912 "uppercase")
56576a04 2913#define CALL_TITLE_CASE(uv, s, d, lenp) \
8946fcd9
KW
2914 _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle, \
2915 Titlecase_Mapping_invmap, \
2916 TC_AUX_TABLE_ptrs, \
2917 TC_AUX_TABLE_lengths, \
2918 "titlecase")
56576a04 2919#define CALL_LOWER_CASE(uv, s, d, lenp) \
8946fcd9
KW
2920 _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower, \
2921 Lowercase_Mapping_invmap, \
2922 LC_AUX_TABLE_ptrs, \
2923 LC_AUX_TABLE_lengths, \
2924 "lowercase")
2925
50bda2c3 2926
b9992569
KW
2927/* This additionally has the input parameter 'specials', which if non-zero will
2928 * cause this to use the specials hash for folding (meaning get full case
50bda2c3 2929 * folding); otherwise, when zero, this implies a simple case fold */
56576a04 2930#define CALL_FOLD_CASE(uv, s, d, lenp, specials) \
8946fcd9
KW
2931 (specials) \
2932 ? _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold, \
2933 Case_Folding_invmap, \
2934 CF_AUX_TABLE_ptrs, \
2935 CF_AUX_TABLE_lengths, \
2936 "foldcase") \
2937 : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold, \
2938 Simple_Case_Folding_invmap, \
2939 NULL, NULL, \
2940 "foldcase")
c3fd2246 2941
84afefe6
JH
2942UV
2943Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2944{
a1433954
KW
2945 /* Convert the Unicode character whose ordinal is <c> to its uppercase
2946 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2947 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
2948 * the changed version may be longer than the original character.
2949 *
2950 * The ordinal of the first character of the changed version is returned
2951 * (but note, as explained above, that there may be more.) */
2952
7918f24d
NC
2953 PERL_ARGS_ASSERT_TO_UNI_UPPER;
2954
3a4c58c9
KW
2955 if (c < 256) {
2956 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2957 }
2958
0ebc6274 2959 uvchr_to_utf8(p, c);
b9992569 2960 return CALL_UPPER_CASE(c, p, p, lenp);
a0ed51b3
LW
2961}
2962
84afefe6
JH
2963UV
2964Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2965{
7918f24d
NC
2966 PERL_ARGS_ASSERT_TO_UNI_TITLE;
2967
3a4c58c9
KW
2968 if (c < 256) {
2969 return _to_upper_title_latin1((U8) c, p, lenp, 's');
2970 }
2971
0ebc6274 2972 uvchr_to_utf8(p, c);
b9992569 2973 return CALL_TITLE_CASE(c, p, p, lenp);
a0ed51b3
LW
2974}
2975
afc16117 2976STATIC U8
eaf412bf 2977S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
afc16117
KW
2978{
2979 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2980 * those, converting the result to UTF-8. Since the result is always just
a1433954 2981 * one character, we allow <p> to be NULL */
afc16117
KW
2982
2983 U8 converted = toLOWER_LATIN1(c);
2984
eaf412bf
KW
2985 PERL_UNUSED_ARG(dummy);
2986
afc16117 2987 if (p != NULL) {
6f2d5cbc 2988 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
2989 *p = converted;
2990 *lenp = 1;
2991 }
2992 else {
430c9760
KW
2993 /* Result is known to always be < 256, so can use the EIGHT_BIT
2994 * macros */
2995 *p = UTF8_EIGHT_BIT_HI(converted);
2996 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
afc16117
KW
2997 *lenp = 2;
2998 }
2999 }
3000 return converted;
3001}
3002
84afefe6
JH
3003UV
3004Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 3005{
7918f24d
NC
3006 PERL_ARGS_ASSERT_TO_UNI_LOWER;
3007
afc16117 3008 if (c < 256) {
eaf412bf 3009 return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
bca00c02
KW
3010 }
3011
afc16117 3012 uvchr_to_utf8(p, c);
b9992569 3013 return CALL_LOWER_CASE(c, p, p, lenp);
a0ed51b3
LW
3014}
3015
84afefe6 3016UV
7c0ab950 3017Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 3018{
51910141 3019 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 3020 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 3021 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
3022 *
3023 * Not to be used for locale folds
51910141 3024 */
f673fad4 3025
a1dde8de
KW
3026 UV converted;
3027
3028 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
3029
1ca267a5
KW
3030 assert (! (flags & FOLD_FLAGS_LOCALE));
3031
659a7c2d 3032 if (UNLIKELY(c == MICRO_SIGN)) {
a1dde8de
KW
3033 converted = GREEK_SMALL_LETTER_MU;
3034 }
9b63e895
KW
3035#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
3036 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
3037 || UNICODE_DOT_DOT_VERSION > 0)
659a7c2d
KW
3038 else if ( (flags & FOLD_FLAGS_FULL)
3039 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
3040 {
1ca267a5
KW
3041 /* If can't cross 127/128 boundary, can't return "ss"; instead return
3042 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
3043 * under those circumstances. */
3044 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
3045 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
3046 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3047 p, *lenp, U8);
3048 return LATIN_SMALL_LETTER_LONG_S;
3049 }
3050 else {
4f489194
KW
3051 *(p)++ = 's';
3052 *p = 's';
3053 *lenp = 2;
3054 return 's';
1ca267a5 3055 }
a1dde8de 3056 }
9b63e895 3057#endif
a1dde8de
KW
3058 else { /* In this range the fold of all other characters is their lower
3059 case */
3060 converted = toLOWER_LATIN1(c);
3061 }
3062
6f2d5cbc 3063 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
3064 *p = (U8) converted;
3065 *lenp = 1;
3066 }
3067 else {
3068 *(p)++ = UTF8_TWO_BYTE_HI(converted);
3069 *p = UTF8_TWO_BYTE_LO(converted);
3070 *lenp = 2;
3071 }
3072
3073 return converted;
3074}
3075
3076UV
31f05a37 3077Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 3078{
4b593389 3079
a0270393
KW
3080 /* Not currently externally documented, and subject to change
3081 * <flags> bits meanings:
3082 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
3083 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3084 * locale are to be used.
a0270393
KW
3085 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3086 */
4b593389 3087
36bb2ab6 3088 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 3089
780fcc9f 3090 if (flags & FOLD_FLAGS_LOCALE) {
8b7358b9
KW
3091 /* Treat a UTF-8 locale as not being in locale at all, except for
3092 * potentially warning */
3093 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
780fcc9f
KW
3094 if (IN_UTF8_CTYPE_LOCALE) {
3095 flags &= ~FOLD_FLAGS_LOCALE;
3096 }
3097 else {
e7b7ac46 3098 goto needs_full_generality;
780fcc9f 3099 }
31f05a37
KW
3100 }
3101
a1dde8de 3102 if (c < 256) {
e7b7ac46 3103 return _to_fold_latin1((U8) c, p, lenp,
31f05a37 3104 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
3105 }
3106
2f306ab9 3107 /* Here, above 255. If no special needs, just use the macro */
a0270393
KW
3108 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
3109 uvchr_to_utf8(p, c);
b9992569 3110 return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
a0270393 3111 }
567b353c 3112 else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
a0270393
KW
3113 the special flags. */
3114 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
3115
3116 needs_full_generality:
a0270393 3117 uvchr_to_utf8(utf8_c, c);
56576a04
KW
3118 return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c),
3119 p, lenp, flags);
a0270393 3120 }
84afefe6
JH
3121}
3122
26483009 3123PERL_STATIC_INLINE bool
5141f98e 3124S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
f25ce844 3125 const char *const swashname, SV* const invlist)
bde6a22d 3126{
ea317ccb
KW
3127 /* returns a boolean giving whether or not the UTF8-encoded character that
3128 * starts at <p> is in the swash indicated by <swashname>. <swash>
3129 * contains a pointer to where the swash indicated by <swashname>
3130 * is to be stored; which this routine will do, so that future calls will
f25ce844
KW
3131 * look at <*swash> and only generate a swash if it is not null. <invlist>
3132 * is NULL or an inversion list that defines the swash. If not null, it
3133 * saves time during initialization of the swash.
ea317ccb
KW
3134 *
3135 * Note that it is assumed that the buffer length of <p> is enough to
3136 * contain all the bytes that comprise the character. Thus, <*p> should
3137 * have been checked before this call for mal-formedness enough to assure
3138 * that. */
3139
7918f24d
NC
3140 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
3141
492a624f 3142 /* The API should have included a length for the UTF-8 character in <p>,
28123549 3143 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
3144 * as far as there being enough bytes available in it to accommodate the
3145 * character without reading beyond the end, and pass that number on to the
3146 * validating routine */
6302f837 3147 if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
86ae6e94 3148 _force_out_malformed_utf8_message(p, p + UTF8SKIP(p),
99a765e9 3149 _UTF8_NO_CONFIDENCE_IN_CURLEN,
86ae6e94
KW
3150 1 /* Die */ );
3151 NOT_REACHED; /* NOTREACHED */
28123549 3152 }
86ae6e94 3153
dc31b55c
KW
3154 if (invlist) {
3155 return _invlist_contains_cp(invlist, valid_utf8_to_uvchr(p, NULL));
3156 }
3157
af0d7385
KW
3158 assert(swash);
3159
87367d5f
KW
3160 if (!*swash) {
3161 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
f25ce844
KW
3162 *swash = _core_swash_init("utf8",
3163
3164 /* Only use the name if there is no inversion
3165 * list; otherwise will go out to disk */
3166 (invlist) ? "" : swashname,
3167
3168 &PL_sv_undef, 1, 0, invlist, &flags);
87367d5f 3169 }
28123549 3170
bde6a22d
NC
3171 return swash_fetch(*swash, p, TRUE) != 0;
3172}
3173
da8c1a98 3174PERL_STATIC_INLINE bool
56576a04
KW
3175S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e,
3176 SV **swash, const char *const swashname,
3177 SV* const invlist)
da8c1a98
KW
3178{
3179 /* returns a boolean giving whether or not the UTF8-encoded character that
3180 * starts at <p>, and extending no further than <e - 1> is in the swash
3181 * indicated by <swashname>. <swash> contains a pointer to where the swash
3182 * indicated by <swashname> is to be stored; which this routine will do, so
3183 * that future calls will look at <*swash> and only generate a swash if it
3184 * is not null. <invlist> is NULL or an inversion list that defines the
3185 * swash. If not null, it saves time during initialization of the swash.
3186 */
3187
3188 PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
3189
3190 if (! isUTF8_CHAR(p, e)) {
3191 _force_out_malformed_utf8_message(p, e, 0, 1);
3192 NOT_REACHED; /* NOTREACHED */
3193 }
3194
dc31b55c
KW
3195 if (invlist) {
3196 return _invlist_contains_cp(invlist, valid_utf8_to_uvchr(p, NULL));
3197 }
3198
af0d7385
KW
3199 assert(swash);
3200
da8c1a98
KW
3201 if (!*swash) {
3202 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
3203 *swash = _core_swash_init("utf8",
3204
3205 /* Only use the name if there is no inversion
3206 * list; otherwise will go out to disk */
3207 (invlist) ? "" : swashname,
3208
3209 &PL_sv_undef, 1, 0, invlist, &flags);
3210 }
3211
3212 return swash_fetch(*swash, p, TRUE) != 0;
3213}
3214
34aeb2e9
KW
3215STATIC void
3216S_warn_on_first_deprecated_use(pTHX_ const char * const name,
3217 const char * const alternative,
3218 const bool use_locale,
3219 const char * const file,
3220 const unsigned line)
3221{
3222 const char * key;
3223
3224 PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
3225
3226 if (ckWARN_d(WARN_DEPRECATED)) {
3227
3228 key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
3229 if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
3230 if (! PL_seen_deprecated_macro) {
3231 PL_seen_deprecated_macro = newHV();
3232 }
3233 if (! hv_store(PL_seen_deprecated_macro, key,
3234 strlen(key), &PL_sv_undef, 0))
3235 {
3236 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3237 }
3238
c44e9413 3239 if (instr(file, "mathoms.c")) {
607313a1
KW
3240 Perl_warner(aTHX_ WARN_DEPRECATED,
3241 "In %s, line %d, starting in Perl v5.30, %s()"
3242 " will be removed. Avoid this message by"
3243 " converting to use %s().\n",
3244 file, line, name, alternative);
3245 }
3246 else {
34aeb2e9
KW
3247 Perl_warner(aTHX_ WARN_DEPRECATED,
3248 "In %s, line %d, starting in Perl v5.30, %s() will"
3249 " require an additional parameter. Avoid this"
3250 " message by converting to use %s().\n",
3251 file, line, name, alternative);
607313a1 3252 }
34aeb2e9
KW
3253 }
3254 }
3255}
3256
bde6a22d 3257bool
34aeb2e9 3258Perl__is_utf8_FOO(pTHX_ U8 classnum,
be99e2c2 3259 const U8 * const p,
34aeb2e9
KW
3260 const char * const name,
3261 const char * const alternative,
3262 const bool use_utf8,
3263 const bool use_locale,
3264 const char * const file,
3265 const unsigned line)
922e8cb4 3266{
922e8cb4
KW
3267 PERL_ARGS_ASSERT__IS_UTF8_FOO;
3268
34aeb2e9
KW
3269 warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
3270
3271 if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
34aeb2e9
KW
3272
3273 switch (classnum) {
3274 case _CC_WORDCHAR:
3275 case _CC_DIGIT:
3276 case _CC_ALPHA:
3277 case _CC_LOWER:
3278 case _CC_UPPER:
3279 case _CC_PUNCT:
3280 case _CC_PRINT:
3281 case _CC_ALPHANUMERIC:
3282 case _CC_GRAPH:
3283 case _CC_CASED:
3284
3285 return is_utf8_common(p,
af0d7385
KW
3286 NULL,
3287 "This is buggy if this gets used",
34aeb2e9
KW
3288 PL_XPosix_ptrs[classnum]);
3289
3290 case _CC_SPACE:
3291 return is_XPERLSPACE_high(p);
3292 case _CC_BLANK:
3293 return is_HORIZWS_high(p);
3294 case _CC_XDIGIT:
3295 return is_XDIGIT_high(p);
3296 case _CC_CNTRL:
3297 return 0;
3298 case _CC_ASCII:
3299 return 0;
3300 case _CC_VERTSPACE:
3301 return is_VERTWS_high(p);
3302 case _CC_IDFIRST:
af0d7385
KW
3303 return is_utf8_common(p, NULL,
3304 "This is buggy if this gets used",
3305 PL_utf8_perl_idstart);
34aeb2e9 3306 case _CC_IDCONT:
af0d7385
KW
3307 return is_utf8_common(p, NULL,
3308 "This is buggy if this gets used",
3309 PL_utf8_perl_idcont);
34aeb2e9
KW
3310 }
3311 }
3312
3313 /* idcont is the same as wordchar below 256 */
3314 if (classnum == _CC_IDCONT) {
3315 classnum = _CC_WORDCHAR;
3316 }
3317 else if (classnum == _CC_IDFIRST) {
3318 if (*p == '_') {
3319 return TRUE;
3320 }
3321 classnum = _CC_ALPHA;
3322 }
3323
3324 if (! use_locale) {
3325 if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
3326 return _generic_isCC(*p, classnum);
3327 }
922e8cb4 3328
34aeb2e9
KW
3329 return _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )), classnum);
3330 }
3331 else {
3332 if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
3333 return isFOO_lc(classnum, *p);
3334 }
3335
3336 return isFOO_lc(classnum, EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )));
3337 }
3338
3339 NOT_REACHED; /* NOTREACHED */
922e8cb4
KW
3340}
3341
3342bool
da8c1a98
KW
3343Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
3344 const U8 * const e)
3345{
3346 PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
3347
af0d7385
KW
3348 return is_utf8_common_with_len(p, e, NULL,
3349 "This is buggy if this gets used",
da8c1a98
KW
3350 PL_XPosix_ptrs[classnum]);
3351}
3352
3353bool
da8c1a98
KW
3354Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
3355{
da8c1a98
KW
3356 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
3357
af0d7385
KW
3358 return is_utf8_common_with_len(p, e, NULL,
3359 "This is buggy if this gets used",
3360 PL_utf8_perl_idstart);
da8c1a98
KW
3361}
3362
3363bool
f2645549 3364Perl__is_utf8_xidstart(pTHX_ const U8 *p)
c11ff943 3365{
f2645549 3366 PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
c11ff943
KW
3367
3368 if (*p == '_')
3369 return TRUE;
f25ce844 3370 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
c11ff943
KW
3371}
3372
3373bool
da8c1a98
KW
3374Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
3375{
da8c1a98
KW
3376 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
3377
af0d7385
KW
3378 return is_utf8_common_with_len(p, e, NULL,
3379 "This is buggy if this gets used",
3380 PL_utf8_perl_idcont);
da8c1a98
KW
3381}
3382
3383bool
f2645549 3384Perl__is_utf8_idcont(pTHX_ const U8 *p)
82686b01 3385{
f2645549 3386 PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
7918f24d 3387
f25ce844 3388 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
a0ed51b3
LW
3389}
3390
3391bool
f2645549 3392Perl__is_utf8_xidcont(pTHX_ const U8 *p)
c11ff943 3393{
f2645549 3394 PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
c11ff943 3395
5062866a 3396 return is_utf8_common(p, &PL_utf8_xidcont, "XIdContinue", NULL);
c11ff943
KW
3397}
3398
3399bool
7dbf68d2
KW
3400Perl__is_utf8_mark(pTHX_ const U8 *p)
3401{
7dbf68d2
KW
3402 PERL_ARGS_ASSERT__IS_UTF8_MARK;
3403
f25ce844 3404 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
7dbf68d2
KW
3405}
3406
6a4a25f4 3407STATIC UV
30613bdc
KW
3408S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
3409 U8* ustrp, STRLEN *lenp,
341bb5b7 3410 SV *invlist, const int * const invmap,
e39a4130 3411 const unsigned int * const * const aux_tables,
30613bdc
KW
3412 const U8 * const aux_table_lengths,
3413 const char * const normal)
b9992569 3414{
0134edef 3415 STRLEN len = 0;
7918f24d 3416
30613bdc
KW
3417 /* Change the case of code point 'uv1' whose UTF-8 representation (assumed
3418 * by this routine to be valid) begins at 'p'. 'normal' is a string to use
3419 * to name the new case in any generated messages, as a fallback if the
3420 * operation being used is not available. The new case is given by the
3421 * data structures in the remaining arguments.
3422 *
3423 * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
3424 * entire changed case string, and the return value is the first code point
3425 * in that string */
3426
b9992569 3427 PERL_ARGS_ASSERT__TO_UTF8_CASE;
7918f24d 3428
36eaa811
KW
3429 /* For code points that don't change case, we already know that the output
3430 * of this function is the unchanged input, so we can skip doing look-ups
3431 * for them. Unfortunately the case-changing code points are scattered
3432 * around. But there are some long consecutive ranges where there are no
3433 * case changing code points. By adding tests, we can eliminate the lookup
3434 * for all the ones in such ranges. This is currently done here only for
3435 * just a few cases where the scripts are in common use in modern commerce
3436 * (and scripts adjacent to those which can be included without additional
3437 * tests). */
3438
3439 if (uv1 >= 0x0590) {
3440 /* This keeps from needing further processing the code points most
3441 * likely to be used in the following non-cased scripts: Hebrew,
3442 * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
3443 * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
3444 * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
3445 if (uv1 < 0x10A0) {
3446 goto cases_to_self;
3447 }
3448
3449 /* The following largish code point ranges also don't have case
3450 * changes, but khw didn't think they warranted extra tests to speed
3451 * them up (which would slightly slow down everything else above them):
3452 * 1100..139F Hangul Jamo, Ethiopic
3453 * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
3454 * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
3455 * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
3456 * Combining Diacritical Marks Extended, Balinese,
3457 * Sundanese, Batak, Lepcha, Ol Chiki
3458 * 2000..206F General Punctuation
3459 */
3460
3461 if (uv1 >= 0x2D30) {
3462
3463 /* This keeps the from needing further processing the code points
3464 * most likely to be used in the following non-cased major scripts:
3465 * CJK, Katakana, Hiragana, plus some less-likely scripts.
3466 *
3467 * (0x2D30 above might have to be changed to 2F00 in the unlikely
3468 * event that Unicode eventually allocates the unused block as of
3469 * v8.0 2FE0..2FEF to code points that are cased. khw has verified
3470 * that the test suite will start having failures to alert you
3471 * should that happen) */
3472 if (uv1 < 0xA640) {
3473 goto cases_to_self;
3474 }
3475
3476 if (uv1 >= 0xAC00) {
3477 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
5af9bc97
KW
3478 if (ckWARN_d(WARN_SURROGATE)) {
3479 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3480 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
56576a04
KW
3481 "Operation \"%s\" returns its argument for"
3482 " UTF-16 surrogate U+%04" UVXf, desc, uv1);
5af9bc97
KW
3483 }
3484 goto cases_to_self;
3485 }
36eaa811
KW
3486
3487 /* AC00..FAFF Catches Hangul syllables and private use, plus
3488 * some others */
3489 if (uv1 < 0xFB00) {
3490 goto cases_to_self;
36eaa811
KW
3491 }
3492
5af9bc97 3493 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
d22ec717
KW
3494 if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) {
3495 Perl_croak(aTHX_ cp_above_legal_max, uv1,
3496 MAX_EXTERNALLY_LEGAL_CP);
5af9bc97
KW
3497 }
3498 if (ckWARN_d(WARN_NON_UNICODE)) {
3499 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3500 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
56576a04
KW
3501 "Operation \"%s\" returns its argument for"
3502 " non-Unicode code point 0x%04" UVXf, desc, uv1);
5af9bc97
KW
3503 }
3504 goto cases_to_self;
3505 }
3bfc1e70
KW
3506#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
3507 if (UNLIKELY(uv1
3508 > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
3509 {
3510
56576a04
KW
3511 /* As of Unicode 10.0, this means we avoid swash creation
3512 * for anything beyond high Plane 1 (below emojis) */
3bfc1e70
KW
3513 goto cases_to_self;
3514 }
3515#endif
36eaa811
KW
3516 }
3517 }
9ae3ac1a 3518
36eaa811 3519 /* Note that non-characters are perfectly legal, so no warning should
8946fcd9 3520 * be given. */
9ae3ac1a
KW
3521 }
3522
8946fcd9
KW
3523 {
3524 unsigned int i;
e39a4130 3525 const unsigned int * cp_list;
8946fcd9
KW
3526 U8 * d;
3527 SSize_t index = _invlist_search(invlist, uv1);
3528 IV base = invmap[index];
0134edef 3529
30613bdc
KW
3530 /* The data structures are set up so that if 'base' is non-negative,
3531 * the case change is 1-to-1; and if 0, the change is to itself */
8946fcd9
KW
3532 if (base >= 0) {
3533 IV lc;
b08cf34e 3534
8946fcd9
KW
3535 if (base == 0) {
3536 goto cases_to_self;
4a8240a3 3537 }
4a8240a3 3538
30613bdc 3539 /* This computes, e.g. lc(H) as 'H - A + a', using the lc table */
8946fcd9
KW
3540 lc = base + uv1 - invlist_array(invlist)[index];
3541 *lenp = uvchr_to_utf8(ustrp, lc) - ustrp;
3542 return lc;
3543 }
1feea2c7 3544
30613bdc
KW
3545 /* Here 'base' is negative. That means the mapping is 1-to-many, and
3546 * requires an auxiliary table look up. abs(base) gives the index into
3547 * a list of such tables which points to the proper aux table. And a
3548 * parallel list gives the length of each corresponding aux table. */
8946fcd9 3549 cp_list = aux_tables[-base];
30613bdc
KW
3550
3551 /* Create the string of UTF-8 from the mapped-to code points */
8946fcd9
KW
3552 d = ustrp;
3553 for (i = 0; i < aux_table_lengths[-base]; i++) {
3554 d = uvchr_to_utf8(d, cp_list[i]);
cbe07460 3555 }
8946fcd9
KW
3556 *d = '\0';
3557 *lenp = d - ustrp;
3558
3559 return cp_list[0];
cbe07460
KW
3560 }
3561
3562 /* Here, there was no mapping defined, which means that the code point maps
3563 * to itself. Return the inputs */
e24dfe9c 3564 cases_to_self:
bfdf22ec 3565 len = UTF8SKIP(p);
ca9fab46
KW
3566 if (p != ustrp) { /* Don't copy onto itself */
3567 Copy(p, ustrp, len, U8);
3568 }
0134edef 3569
2a37f04d
JH
3570 if (lenp)
3571 *lenp = len;
3572
f4cd282c 3573 return uv1;
cbe07460 3574
a0ed51b3
LW
3575}
3576
b74fe592 3577Size_t
e39a4130
KW
3578Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to,
3579 const unsigned int ** remaining_folds_to)
b74fe592
KW
3580{
3581 /* Returns the count of the number of code points that fold to the input
3582 * 'cp' (besides itself).
3583 *
3584 * If the return is 0, there is nothing else that folds to it, and
3585 * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
3586 *
3587 * If the return is 1, '*first_folds_to' is set to the single code point,
3588 * and '*remaining_folds_to' is set to NULL.
3589 *
3590 * Otherwise, '*first_folds_to' is set to a code point, and
3591 * '*remaining_fold_to' is set to an array that contains the others. The
3592 * length of this array is the returned count minus 1.
3593 *
3594 * The reason for this convolution is to avoid having to deal with
3595 * allocating and freeing memory. The lists are already constructed, so
3596 * the return can point to them, but single code points aren't, so would
3597 * need to be constructed if we didn't employ something like this API */
3598
3599 SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
3600 int base = _Perl_IVCF_invmap[index];
3601
3602 PERL_ARGS_ASSERT__INVERSE_FOLDS;
3603
3604 if (base == 0) { /* No fold */
3605 *first_folds_to = 0;
3606 *remaining_folds_to = NULL;
3607 return 0;
3608 }
3609
3610#ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */
3611
3612 assert(base > 0);
3613
3614#else
3615
3616 if (UNLIKELY(base < 0)) { /* Folds to more than one character */
3617
3618 /* The data structure is set up so that the absolute value of 'base' is
3619 * an index into a table of pointers to arrays, with the array
3620 * corresponding to the index being the list of code points that fold
3621 * to 'cp', and the parallel array containing the length of the list
3622 * array */
3623 *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
3624 *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes
3625 *first_folds_to
3626 */
3627 return IVCF_AUX_TABLE_lengths[-base];
3628 }
3629
3630#endif
3631
3632 /* Only the single code point. This works like 'fc(G) = G - A + a' */
3633 *first_folds_to = base + cp - invlist_array(PL_utf8_foldclosures)[index];
3634 *remaining_folds_to = NULL;
3635 return 1;
3636}
3637
051a06d4 3638STATIC UV
56576a04
KW
3639S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
3640 U8* const ustrp, STRLEN *lenp)
051a06d4 3641{
4a4088c4 3642 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
3643 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
3644 * result contains a character that crosses the 255/256 boundary, disallow
3645 * the change, and return the original code point. See L<perlfunc/lc> for
3646 * why;
051a06d4 3647 *
a1433954
KW
3648 * p points to the original string whose case was changed; assumed
3649 * by this routine to be well-formed
051a06d4 3650 * result the code point of the first character in the changed-case string
56576a04
KW
3651 * ustrp points to the changed-case string (<result> represents its
3652 * first char)
051a06d4
KW
3653 * lenp points to the length of <ustrp> */
3654
3655 UV original; /* To store the first code point of <p> */
3656
3657 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3658
a4f12ed7 3659 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
3660
3661 /* We know immediately if the first character in the string crosses the
5e45c680 3662 * boundary, so can skip testing */
051a06d4
KW
3663 if (result > 255) {
3664
3665 /* Look at every character in the result; if any cross the
3666 * boundary, the whole thing is disallowed */
3667 U8* s = ustrp + UTF8SKIP(ustrp);
3668 U8* e = ustrp + *lenp;
3669 while (s < e) {
a4f12ed7 3670 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
3671 goto bad_crossing;
3672 }
3673 s += UTF8SKIP(s);
3674 }
3675
613abc6d
KW
3676 /* Here, no characters crossed, result is ok as-is, but we warn. */
3677 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
051a06d4
KW
3678 return result;
3679 }
3680
7b52d656 3681 bad_crossing:
051a06d4
KW
3682
3683 /* Failed, have to return the original */
4b88fb76 3684 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
3685
3686 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3687 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
56576a04
KW
3688 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
3689 " locale; resolved to \"\\x{%" UVXf "}\".",
357aadde 3690 OP_DESC(PL_op),
ab0b796c
KW
3691 original,
3692 original);
051a06d4
KW
3693 Copy(p, ustrp, *lenp, char);
3694 return original;
3695}
3696
607313a1
KW
3697STATIC U32
3698S_check_and_deprecate(pTHX_ const U8 *p,
3699 const U8 **e,
3700 const unsigned int type, /* See below */
3701 const bool use_locale, /* Is this a 'LC_'
3702 macro call? */
3703 const char * const file,
3704 const unsigned line)
3705{
3706 /* This is a temporary function to deprecate the unsafe calls to the case
3707 * changing macros and functions. It keeps all the special stuff in just
3708 * one place.
3709 *
3710 * It updates *e with the pointer to the end of the input string. If using
3711 * the old-style macros, *e is NULL on input, and so this function assumes
3712 * the input string is long enough to hold the entire UTF-8 sequence, and
3713 * sets *e accordingly, but it then returns a flag to pass the
3714 * utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
3715 * using the full length if possible.
3716 *
3717 * It also does the assert that *e > p when *e is not NULL. This should be
3718 * migrated to the callers when this function gets deleted.
3719 *
3720 * The 'type' parameter is used for the caller to specify which case
3721 * changing function this is called from: */
3722
3723# define DEPRECATE_TO_UPPER 0
3724# define DEPRECATE_TO_TITLE 1
3725# define DEPRECATE_TO_LOWER 2
3726# define DEPRECATE_TO_FOLD 3
3727
3728 U32 utf8n_flags = 0;
3729 const char * name;
3730 const char * alternative;
3731
3732 PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
3733
3734 if (*e == NULL) {
3735 utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
3736 *e = p + UTF8SKIP(p);
3737
3738 /* For mathoms.c calls, we use the function name we know is stored
c44e9413 3739 * there. It could be part of a larger path */
607313a1