3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4 * by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
20 * as is the custom in the West, if you wish to be answered?'
21 * --Gandalf, addressing Théoden's door wardens
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
32 #define PERL_IN_UTF8_C
34 #include "invlist_inline.h"
36 static const char malformed_text[] = "Malformed UTF-8 character";
37 static const char unees[] =
38 "Malformed UTF-8 character (unexpected end of string)";
41 These are various utility functions for manipulating UTF8-encoded
42 strings. For the uninitiated, this is a method of representing arbitrary
43 Unicode characters as a variable number of bytes, in such a way that
44 characters in the ASCII range are unmodified, and a zero byte never appears
45 within non-zero characters.
49 Perl__force_out_malformed_utf8_message(pTHX_
50 const U8 *const p, /* First byte in UTF-8 sequence */
51 const U8 * const e, /* Final byte in sequence (may include
53 const U32 flags, /* Flags to pass to utf8n_to_uvchr(),
54 usually 0, or some DISALLOW flags */
55 const bool die_here) /* If TRUE, this function does not return */
57 /* This core-only function is to be called when a malformed UTF-8 character
58 * is found, in order to output the detailed information about the
59 * malformation before dieing. The reason it exists is for the occasions
60 * when such a malformation is fatal, but warnings might be turned off, so
61 * that normally they would not be actually output. This ensures that they
62 * do get output. Because a sequence may be malformed in more than one
63 * way, multiple messages may be generated, so we can't make them fatal, as
64 * that would cause the first one to die.
66 * Instead we pretend -W was passed to perl, then die afterwards. The
67 * flexibility is here to return to the caller so they can finish up and
71 PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
77 PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
80 PL_curcop->cop_warnings = pWARN_ALL;
83 (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
88 Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
89 " be called only when there are errors found");
93 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
98 S_new_msg_hv(pTHX_ const char * const message, /* The message text */
99 U32 categories, /* Packed warning categories */
100 U32 flag) /* Flag associated with this message */
102 /* Creates, populates, and returns an HV* that describes an error message
103 * for the translators between UTF8 and code point */
105 SV* msg_sv = newSVpv(message, 0);
106 SV* category_sv = newSVuv(categories);
107 SV* flag_bit_sv = newSVuv(flag);
109 HV* msg_hv = newHV();
111 PERL_ARGS_ASSERT_NEW_MSG_HV;
113 (void) hv_stores(msg_hv, "text", msg_sv);
114 (void) hv_stores(msg_hv, "warn_categories", category_sv);
115 (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv);
121 =for apidoc uvoffuni_to_utf8_flags
123 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
124 Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or
125 L<perlapi/uvchr_to_utf8_flags>>.
127 This function is like them, but the input is a strict Unicode
128 (as opposed to native) code point. Only in very rare circumstances should code
129 not be using the native code point.
131 For details, see the description for L<perlapi/uvchr_to_utf8_flags>.
137 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
139 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
141 return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
144 /* All these formats take a single UV code point argument */
145 const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
146 const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
147 " is not recommended for open interchange";
148 const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
149 " may not be portable";
151 /* Use shorter names internally in this file */
152 #define SHIFT UTF_ACCUMULATION_SHIFT
154 #define MARK UTF_CONTINUATION_MARK
155 #define MASK UTF_CONTINUATION_MASK
158 =for apidoc uvchr_to_utf8_flags_msgs
160 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
162 Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
164 This function is for code that wants any warning and/or error messages to be
165 returned to the caller rather than be displayed. All messages that would have
166 been displayed if all lexical warnings are enabled will be returned.
168 It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
169 placed after all the others, C<msgs>. If this parameter is 0, this function
170 behaves identically to C<L</uvchr_to_utf8_flags>>. Otherwise, C<msgs> should
171 be a pointer to an C<HV *> variable, in which this function creates a new HV to
172 contain any appropriate messages. The hash has three key-value pairs, as
179 The text of the message as a C<SVpv>.
181 =item C<warn_categories>
183 The warning category (or categories) packed into a C<SVuv>.
187 A single flag bit associated with this message, in a C<SVuv>.
188 The bit corresponds to some bit in the C<*errors> return value,
189 such as C<UNICODE_GOT_SURROGATE>.
193 It's important to note that specifying this parameter as non-null will cause
194 any warnings this function would otherwise generate to be suppressed, and
195 instead be placed in C<*msgs>. The caller can check the lexical warnings state
196 (or not) when choosing what to do with the returned messages.
198 The caller, of course, is responsible for freeing any returned HV.
203 /* Undocumented; we don't want people using this. Instead they should use
204 * uvchr_to_utf8_flags_msgs() */
206 Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, UV flags, HV** msgs)
209 UV shifted_uv = input_uv;
210 STRLEN utf8_skip = OFFUNISKIP(input_uv);
212 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
220 *d++ = LATIN1_TO_NATIVE(input_uv);
224 if ( UNLIKELY(input_uv > MAX_LEGAL_CP
225 && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX))))
227 Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, /* Hex output */
231 if ((flags & (UNICODE_WARN_PERL_EXTENDED|UNICODE_WARN_SUPER))) {
232 U32 category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
233 const char * format = PL_extended_cp_format;
235 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
237 UNICODE_GOT_PERL_EXTENDED);
240 Perl_ck_warner_d(aTHX_ category, format, input_uv);
243 /* Don't output a 2nd msg */
244 flags &= ~UNICODE_WARN_SUPER;
247 if (flags & UNICODE_DISALLOW_PERL_EXTENDED) {
251 p = d + utf8_skip - 1;
252 while (p >= d + 6 + ONE_IF_EBCDIC_ZERO_IF_NOT) {
253 *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
254 shifted_uv >>= SHIFT;
259 case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:
260 d[5 + ONE_IF_EBCDIC_ZERO_IF_NOT]
261 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
262 shifted_uv >>= SHIFT;
265 case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:
266 d[4 + ONE_IF_EBCDIC_ZERO_IF_NOT]
267 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
268 shifted_uv >>= SHIFT;
271 case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
272 if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) {
273 if (flags & UNICODE_WARN_SUPER) {
274 U32 category = packWARN(WARN_NON_UNICODE);
275 const char * format = super_cp_format;
278 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
283 Perl_ck_warner_d(aTHX_ category, format, input_uv);
286 if (flags & UNICODE_DISALLOW_SUPER) {
290 if ( (flags & UNICODE_DISALLOW_SUPER)
291 || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
292 && UNICODE_IS_PERL_EXTENDED(input_uv)))
298 d[3 + ONE_IF_EBCDIC_ZERO_IF_NOT]
299 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
300 shifted_uv >>= SHIFT;
303 case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
304 if (isUNICODE_POSSIBLY_PROBLEMATIC(input_uv)) {
305 if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) {
306 if (flags & UNICODE_WARN_NONCHAR) {
307 U32 category = packWARN(WARN_NONCHAR);
308 const char * format = nonchar_cp_format;
310 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
312 UNICODE_GOT_NONCHAR);
315 Perl_ck_warner_d(aTHX_ category, format, input_uv);
318 if (flags & UNICODE_DISALLOW_NONCHAR) {
322 else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
323 if (flags & UNICODE_WARN_SURROGATE) {
324 U32 category = packWARN(WARN_SURROGATE);
325 const char * format = surrogate_cp_format;
327 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
329 UNICODE_GOT_SURROGATE);
332 Perl_ck_warner_d(aTHX_ category, format, input_uv);
335 if (flags & UNICODE_DISALLOW_SURROGATE) {
341 d[2 + ONE_IF_EBCDIC_ZERO_IF_NOT]
342 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
343 shifted_uv >>= SHIFT;
349 d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
350 shifted_uv >>= SHIFT;
357 d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
358 shifted_uv >>= SHIFT;
359 d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip))
360 | UTF_START_MARK(utf8_skip));
364 return d + utf8_skip;
368 =for apidoc uvchr_to_utf8
370 Adds the UTF-8 representation of the native code point C<uv> to the end
371 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
372 C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
373 the byte after the end of the new character. In other words,
375 d = uvchr_to_utf8(d, uv);
377 is the recommended wide native character-aware way of saying
381 This function accepts any code point from 0..C<IV_MAX> as input.
382 C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
384 It is possible to forbid or warn on non-Unicode code points, or those that may
385 be problematic by using L</uvchr_to_utf8_flags>.
390 /* This is also a macro */
391 PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
394 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
396 return uvchr_to_utf8(d, uv);
400 =for apidoc uvchr_to_utf8_flags
402 Adds the UTF-8 representation of the native code point C<uv> to the end
403 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
404 C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
405 the byte after the end of the new character. In other words,
407 d = uvchr_to_utf8_flags(d, uv, flags);
411 d = uvchr_to_utf8_flags(d, uv, 0);
413 This is the Unicode-aware way of saying
417 If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
418 input. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
420 Specifying C<flags> can further restrict what is allowed and not warned on, as
423 If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
424 the function will raise a warning, provided UTF8 warnings are enabled. If
425 instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
426 NULL. If both flags are set, the function will both warn and return NULL.
428 Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
429 affect how the function handles a Unicode non-character.
431 And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
432 affect the handling of code points that are above the Unicode maximum of
433 0x10FFFF. Languages other than Perl may not be able to accept files that
436 The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
437 the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
438 three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
439 allowed inputs to the strict UTF-8 traditionally defined by Unicode.
440 Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
441 C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
442 above-Unicode and surrogate flags, but not the non-character ones, as
444 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
445 See L<perlunicode/Noncharacter code points>.
447 Extremely high code points were never specified in any standard, and require an
448 extension to UTF-8 to express, which Perl does. It is likely that programs
449 written in something other than Perl would not be able to read files that
450 contain these; nor would Perl understand files written by something that uses a
451 different extension. For these reasons, there is a separate set of flags that
452 can warn and/or disallow these extremely high code points, even if other
453 above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
454 and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
455 C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
456 treat all above-Unicode code points, including these, as malformations. (Note
457 that the Unicode standard considers anything above 0x10FFFF to be illegal, but
458 there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
460 A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
461 retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly,
462 C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
463 C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because on EBCDIC
464 platforms,these flags can apply to code points that actually do fit in 31 bits.
465 The new names accurately describe the situation in all cases.
467 =for apidoc Amnh||UNICODE_DISALLOW_ABOVE_31_BIT
468 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE
469 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_INTERCHANGE
470 =for apidoc Amnh||UNICODE_DISALLOW_NONCHAR
471 =for apidoc Amnh||UNICODE_DISALLOW_PERL_EXTENDED
472 =for apidoc Amnh||UNICODE_DISALLOW_SUPER
473 =for apidoc Amnh||UNICODE_DISALLOW_SURROGATE
474 =for apidoc Amnh||UNICODE_WARN_ABOVE_31_BIT
475 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_C9_INTERCHANGE
476 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_INTERCHANGE
477 =for apidoc Amnh||UNICODE_WARN_NONCHAR
478 =for apidoc Amnh||UNICODE_WARN_PERL_EXTENDED
479 =for apidoc Amnh||UNICODE_WARN_SUPER
480 =for apidoc Amnh||UNICODE_WARN_SURROGATE
485 /* This is also a macro */
486 PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
489 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
491 return uvchr_to_utf8_flags(d, uv, flags);
494 PERL_STATIC_INLINE int
495 S_is_utf8_overlong(const U8 * const s, const STRLEN len)
497 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
498 * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
499 * it isn't, and -1 if there isn't enough information to tell. This last
500 * return value can happen if the sequence is incomplete, missing some
501 * trailing bytes that would form a complete character. If there are
502 * enough bytes to make a definitive decision, this function does so.
503 * Usually 2 bytes are sufficient.
505 * Overlongs can occur whenever the number of continuation bytes changes.
506 * That means whenever the number of leading 1 bits in a start byte
507 * increases from the next lower start byte. That happens for start bytes
508 * C0, E0, F0, F8, FC, FE, and FF.
511 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG;
513 /* Each platform has overlongs after the start bytes given above (expressed
514 * in I8 for EBCDIC). The values below were found by manually inspecting
515 * the UTF-8 patterns. See the tables in utf8.h and utfebcdic.h. */
517 switch (NATIVE_UTF8_TO_I8(s[0])) {
519 assert(UTF8_IS_START(s[0]));
535 return (len < 2) ? -1 : s[1] < 0xA0;
541 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x10;
545 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x08;
549 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x04;
553 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x02;
555 return isFF_overlong(s, len);
559 PERL_STATIC_INLINE int
560 S_isFF_overlong(const U8 * const s, const STRLEN len)
562 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
563 * 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if
564 * it isn't, and -1 if there isn't enough information to tell. This last
565 * return value can happen if the sequence is incomplete, missing some
566 * trailing bytes that would form a complete character. If there are
567 * enough bytes to make a definitive decision, this function does so. */
569 PERL_ARGS_ASSERT_ISFF_OVERLONG;
572 /* This works on all three EBCDIC code pages traditionally supported by
574 # define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
576 # define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
579 /* To be an FF overlong, all the available bytes must match */
580 if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
581 MIN(len, STRLENs(FF_OVERLONG_PREFIX)))))
586 /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
587 * be there; what comes after them doesn't matter. See tables in utf8.h,
589 if (len >= STRLENs(FF_OVERLONG_PREFIX)) {
593 /* The missing bytes could cause the result to go one way or the other, so
594 * the result is indeterminate */
598 /* At some point we may want to allow core to use up to UV_MAX */
600 #ifdef EBCDIC /* Actually is I8 */
601 # if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1, UV_MAX 2**64-1 */
602 # define HIGHEST_REPRESENTABLE_UTF "\xFF\xA7"
603 /* UV_MAX "\xFF\xAF" */
604 # else /* These assume IV_MAX is 2**31-1, UV_MAX 2**32-1 */
605 # define HIGHEST_REPRESENTABLE_UTF "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1"
606 /* UV_MAX "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3" */
609 # if defined(UV_IS_QUAD)
610 # define HIGHEST_REPRESENTABLE_UTF "\xFF\x80\x87"
611 /* UV_MAX "\xFF\x80" */
613 # define HIGHEST_REPRESENTABLE_UTF "\xFD"
614 /* UV_MAX "\xFE\x83" */
618 PERL_STATIC_INLINE int
619 S_does_utf8_overflow(const U8 * const s,
621 const bool consider_overlongs)
623 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
624 * 'e' - 1 would overflow an IV on this platform; that is if it represents
625 * a code point larger than the highest representable code point. It
626 * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
627 * enough information to tell. This last return value can happen if the
628 * sequence is incomplete, missing some trailing bytes that would form a
629 * complete character. If there are enough bytes to make a definitive
630 * decision, this function does so.
632 * If 'consider_overlongs' is TRUE, the function checks for the possibility
633 * that the sequence is an overlong that doesn't overflow. Otherwise, it
634 * assumes the sequence is not an overlong. This can give different
635 * results only on ASCII 32-bit platforms.
637 * (For ASCII platforms, we could use memcmp() because we don't have to
638 * convert each byte to I8, but it's very rare input indeed that would
639 * approach overflow, so the loop below will likely only get executed once.)
642 const STRLEN len = e - s;
644 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF;
647 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
649 for (x = s; x < e; x++, y++) {
651 /* 'y' is set up to not include the trailing bytes that are all the
652 * maximum possible continuation byte. So when we reach the end of 'y'
653 * (known to be NUL terminated), it is impossible for 'x' to contain
654 * bytes larger than those omitted bytes, and therefore 'x' can't
660 /* If this byte is less than the corresponding highest non-overflowing
661 * UTF-8, the sequence doesn't overflow */
662 if (NATIVE_UTF8_TO_I8(*x) < *y) {
666 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
667 goto overflows_if_not_overlong;
671 /* Got to the end, and all bytes are the same. If the input is a whole
672 * character, it doesn't overflow. And if it is a partial character,
673 * there's not enough information to tell */
674 return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1;
676 overflows_if_not_overlong:
678 /* Here, a well-formed sequence overflows. If we are assuming
679 * well-formedness, return that it overflows. */
680 if (! consider_overlongs) {
684 /* Here, it could be the overlong malformation, and might not actually
685 * overflow if you were to calculate it out.
687 * See if it actually is overlong */
688 is_overlong = is_utf8_overlong(s, len);
690 /* If it isn't overlong, is well-formed, so overflows */
691 if (is_overlong == 0) {
695 /* Not long enough to determine */
696 if (is_overlong < 0) {
700 /* Here, it appears to overflow, but it is also overlong */
702 #if 6 * UTF_CONTINUATION_BYTE_INFO_BITS <= IVSIZE * CHARBITS
704 /* On many platforms, it is impossible for an overlong to overflow. For
705 * these, no further work is necessary: we can return immediately that this
706 * overlong that is an apparent overflow actually isn't
708 * To see why, note that a length_N sequence can represent as overlongs all
709 * the code points representable by shorter length sequences, but no
710 * higher. If it could represent a higher code point without being an
711 * overlong, we wouldn't have had to increase the sequence length!
713 * The highest possible start byte is FF; the next highest is FE. The
714 * highest code point representable as an overlong on the platform is thus
715 * the highest code point representable by a non-overlong sequence whose
716 * start byte is FE. If that value doesn't overflow the platform's word
717 * size, overlongs can't overflow.
719 * FE consists of 7 bytes total; the FE start byte contributes 0 bits of
720 * information (the high 7 bits, all ones, say that the sequence is 7 bytes
721 * long, and the bottom, zero, bit is s placeholder. That leaves the 6
722 * continuation bytes to contribute UTF_CONTINUATION_BYTE_INFO_BITS each.
723 If that number of bits doesn't exceed the word size, it can't overflow. */
729 /* In practice, only a 32-bit ASCII box gets here. The FE start byte can
730 * represent, as an overlong, the highest code point representable by an FD
731 * start byte, which is 5*6 continuation bytes of info plus one bit from
732 * the start byte, or 31 bits. That doesn't overflow. More explicitly:
733 * \xFD\xBF\xBF\xBF\xBF\xBF evaluates to 0x7FFFFFFF = 2*31 - 1.
735 * That means only the FF start byte can have an overflowing overlong. */
740 /* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that
741 * evaluates to 2**31, so overflows an IV. For a UV it's
742 * \xff\x80\x80\x80\x80\x80\x80\x83 = 2**32 */
743 # define OVERFLOWS "\xff\x80\x80\x80\x80\x80\x80\x82"
745 if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS)) { /* Not enough info */
749 # define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0)
751 return strnGE((const char *) s, OVERFLOWS, STRLENs(OVERFLOWS));
758 Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
760 SSize_t len, full_len;
762 /* An internal helper function.
765 * 's' is a string, which is known to be syntactically valid UTF-8 as far
766 * as (e - 1); e > s must hold.
767 * 'e' This function is allowed to look at any byte from 's'...'e-1', but
768 * nowhere else. The function has to cope as best it can if that
769 * sequence does not form a full character.
770 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
771 * accepted by L</utf8n_to_uvchr>. If non-zero, this function returns
772 * 0 if it determines the input will match something disallowed.
774 * The return is the number of bytes required to represent the code point
775 * if it isn't disallowed by 'flags'; 0 otherwise. Be aware that if the
776 * input is for a partial character, a successful return will be larger
779 * If *s..*(e-1) is only for a partial character, the function will return
780 * non-zero if there is any sequence of well-formed UTF-8 that, when
781 * appended to the input sequence, could result in an allowed code point;
782 * otherwise it returns 0. Non characters cannot be determined based on
783 * partial character input. But many of the other excluded types can be
784 * determined with just the first one or two bytes.
788 PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_;
791 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
792 |UTF8_DISALLOW_PERL_EXTENDED)));
794 full_len = UTF8SKIP(s);
797 if (len > full_len) {
805 default: /* Extended */
806 if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
812 case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */
813 case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */
815 if (flags & UTF8_DISALLOW_SUPER) {
816 return 0; /* Above Unicode */
821 case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
822 is_super = ( UNLIKELY(NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_)
824 && NATIVE_UTF8_TO_I8(s[0]) == UTF_START_BYTE_110000_
825 && NATIVE_UTF8_TO_I8(s[1])
826 >= UTF_FIRST_CONT_BYTE_110000_));
828 if (flags & UTF8_DISALLOW_SUPER) {
832 else if ( (flags & UTF8_DISALLOW_NONCHAR)
834 && UNLIKELY(is_LARGER_NON_CHARS_utf8(s)))
841 case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
843 if (! isUTF8_POSSIBLY_PROBLEMATIC(s[0]) || len < 2) {
847 if ( (flags & UTF8_DISALLOW_SURROGATE)
848 && UNLIKELY(is_SURROGATE_utf8(s)))
850 return 0; /* Surrogate */
853 if ( (flags & UTF8_DISALLOW_NONCHAR)
855 && UNLIKELY(is_SHORTER_NON_CHARS_utf8(s)))
862 /* The lower code points don't have any disallowable characters */
875 Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
876 const bool require_partial)
878 /* This is called to determine if the UTF-8 sequence starting at s0 and
879 * continuing for up to one full character of bytes, but looking no further
880 * than 'e - 1', is legal. *s0 must be 0xFF (or whatever the native
881 * equivalent of FF in I8 on EBCDIC platforms is). This marks it as being
882 * for the largest code points recognized by Perl, the ones that require
883 * the most UTF-8 bytes per character to represent (somewhat less than
884 * twice the size of the next longest kind). This sequence will only ever
885 * be Perl extended UTF-8.
887 * The routine returns 0 if the sequence is not fully valid, syntactically
888 * or semantically. That means it checks that everything following the
889 * start byte is a continuation byte, and that it doesn't overflow, nor is
890 * an overlong representation.
892 * If 'require_partial' is FALSE, the routine returns non-zero only if the
893 * input (as far as 'e-1') is a full character. The return is the count of
894 * the bytes in the character.
896 * If 'require_partial' is TRUE, the routine returns non-zero only if the
897 * input as far as 'e-1' is a partial, not full character, with no
898 * malformations found before position 'e'. The return is either just
901 const U8 *s = s0 + 1;
904 PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_;
907 assert(*s0 == I8_TO_NATIVE_UTF8(0xFF));
909 send = s + MIN(UTF8_MAXBYTES - 1, e - s);
911 if (! UTF8_IS_CONTINUATION(*s)) {
918 if (0 < does_utf8_overflow(s0, e,
919 FALSE /* Don't consider_overlongs */
924 if (0 < isFF_overlong(s0, e - s0)) {
928 /* Here, the character is valid as far as it got. Check if got a partial
930 if (s - s0 < UTF8_MAXBYTES) {
931 return (require_partial) ? 1 : 0;
934 /* Here, got a full character */
935 return (require_partial) ? 0 : UTF8_MAXBYTES;
939 Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
941 /* Returns a mortalized C string that is a displayable copy of the 'len'
942 * bytes starting at 'start'. 'format' gives how to display each byte.
943 * Currently, there are only two formats, so it is currently a bool:
945 * 1 ab (that is a space between two hex digit bytes)
948 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
950 const U8 * s = start;
951 const U8 * const e = start + len;
955 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
957 Newx(output, output_len, char);
961 for (s = start; s < e; s++) {
962 const unsigned high_nibble = (*s & 0xF0) >> 4;
963 const unsigned low_nibble = (*s & 0x0F);
975 if (high_nibble < 10) {
976 *d++ = high_nibble + '0';
979 *d++ = high_nibble - 10 + 'a';
982 if (low_nibble < 10) {
983 *d++ = low_nibble + '0';
986 *d++ = low_nibble - 10 + 'a';
994 PERL_STATIC_INLINE char *
995 S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
997 /* Max number of bytes to print */
1000 /* Which one is the non-continuation */
1001 const STRLEN non_cont_byte_pos,
1003 /* How many bytes should there be? */
1004 const STRLEN expect_len)
1006 /* Return the malformation warning text for an unexpected continuation
1009 const char * const where = (non_cont_byte_pos == 1)
1011 : Perl_form(aTHX_ "%d bytes",
1012 (int) non_cont_byte_pos);
1013 const U8 * x = s + non_cont_byte_pos;
1014 const U8 * e = s + print_len;
1016 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
1018 /* We don't need to pass this parameter, but since it has already been
1019 * calculated, it's likely faster to pass it; verify under DEBUGGING */
1020 assert(expect_len == UTF8SKIP(s));
1022 /* As a defensive coding measure, don't output anything past a NUL. Such
1023 * bytes shouldn't be in the middle of a malformation, and could mark the
1024 * end of the allocated string, and what comes after is undefined */
1025 for (; x < e; x++) {
1027 x++; /* Output this particular NUL */
1032 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
1033 " %s after start byte 0x%02x; need %d bytes, got %d)",
1035 _byte_dump_string(s, x - s, 0),
1036 *(s + non_cont_byte_pos),
1040 (int) non_cont_byte_pos);
1045 =for apidoc utf8n_to_uvchr
1047 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1048 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1051 Bottom level UTF-8 decode routine.
1052 Returns the native code point value of the first character in the string C<s>,
1053 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
1054 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
1055 the length, in bytes, of that character.
1057 The value of C<flags> determines the behavior when C<s> does not point to a
1058 well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
1059 causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
1060 is the next possible position in C<s> that could begin a non-malformed
1061 character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
1062 is raised. Some UTF-8 input sequences may contain multiple malformations.
1063 This function tries to find every possible one in each call, so multiple
1064 warnings can be raised for the same sequence.
1066 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
1067 individual types of malformations, such as the sequence being overlong (that
1068 is, when there is a shorter sequence that can express the same code point;
1069 overlong sequences are expressly forbidden in the UTF-8 standard due to
1070 potential security issues). Another malformation example is the first byte of
1071 a character not being a legal first byte. See F<utf8.h> for the list of such
1072 flags. Even if allowed, this function generally returns the Unicode
1073 REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
1074 F<utf8.h> to override this behavior for the overlong malformations, but don't
1075 do that except for very specialized purposes.
1077 The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
1078 flags) malformation is found. If this flag is set, the routine assumes that
1079 the caller will raise a warning, and this function will silently just set
1080 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
1082 Note that this API requires disambiguation between successful decoding a C<NUL>
1083 character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
1084 in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
1085 be set to 1. To disambiguate, upon a zero return, see if the first byte of
1086 C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
1087 error. Or you can use C<L</utf8n_to_uvchr_error>>.
1089 Certain code points are considered problematic. These are Unicode surrogates,
1090 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
1091 By default these are considered regular code points, but certain situations
1092 warrant special handling for them, which can be specified using the C<flags>
1093 parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
1094 three classes are treated as malformations and handled as such. The flags
1095 C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
1096 C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
1097 disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
1098 restricts the allowed inputs to the strict UTF-8 traditionally defined by
1099 Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
1101 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
1102 The difference between traditional strictness and C9 strictness is that the
1103 latter does not forbid non-character code points. (They are still discouraged,
1104 however.) For more discussion see L<perlunicode/Noncharacter code points>.
1106 The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
1107 C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
1108 C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
1109 raised for their respective categories, but otherwise the code points are
1110 considered valid (not malformations). To get a category to both be treated as
1111 a malformation and raise a warning, specify both the WARN and DISALLOW flags.
1112 (But note that warnings are not raised if lexically disabled nor if
1113 C<UTF8_CHECK_ONLY> is also specified.)
1115 Extremely high code points were never specified in any standard, and require an
1116 extension to UTF-8 to express, which Perl does. It is likely that programs
1117 written in something other than Perl would not be able to read files that
1118 contain these; nor would Perl understand files written by something that uses a
1119 different extension. For these reasons, there is a separate set of flags that
1120 can warn and/or disallow these extremely high code points, even if other
1121 above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
1122 C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
1123 C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
1124 above-Unicode code points, including these, as malformations.
1125 (Note that the Unicode standard considers anything above 0x10FFFF to be
1126 illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
1129 A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
1130 retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly,
1131 C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
1132 C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags
1133 can apply to code points that actually do fit in 31 bits. This happens on
1134 EBCDIC platforms, and sometimes when the L<overlong
1135 malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
1136 describe the situation in all cases.
1139 All other code points corresponding to Unicode characters, including private
1140 use and those yet to be assigned, are never considered malformed and never
1143 =for apidoc Amnh||UTF8_CHECK_ONLY
1144 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1145 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
1146 =for apidoc Amnh||UTF8_DISALLOW_SURROGATE
1147 =for apidoc Amnh||UTF8_DISALLOW_NONCHAR
1148 =for apidoc Amnh||UTF8_DISALLOW_SUPER
1149 =for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE
1150 =for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE
1151 =for apidoc Amnh||UTF8_WARN_SURROGATE
1152 =for apidoc Amnh||UTF8_WARN_NONCHAR
1153 =for apidoc Amnh||UTF8_WARN_SUPER
1154 =for apidoc Amnh||UTF8_WARN_PERL_EXTENDED
1155 =for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED
1159 Also implemented as a macro in utf8.h
1163 Perl_utf8n_to_uvchr(const U8 *s,
1168 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
1170 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
1175 =for apidoc utf8n_to_uvchr_error
1177 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1178 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1181 This function is for code that needs to know what the precise malformation(s)
1182 are when an error is found. If you also need to know the generated warning
1183 messages, use L</utf8n_to_uvchr_msgs>() instead.
1185 It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1186 all the others, C<errors>. If this parameter is 0, this function behaves
1187 identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
1188 to a C<U32> variable, which this function sets to indicate any errors found.
1189 Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
1190 C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
1191 of these bits will be set if a malformation is found, even if the input
1192 C<flags> parameter indicates that the given malformation is allowed; those
1193 exceptions are noted:
1197 =item C<UTF8_GOT_PERL_EXTENDED>
1199 The input sequence is not standard UTF-8, but a Perl extension. This bit is
1200 set only if the input C<flags> parameter contains either the
1201 C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
1203 Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
1204 and so some extension must be used to express them. Perl uses a natural
1205 extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
1206 extension to represent even higher ones, so that any code point that fits in a
1207 64-bit word can be represented. Text using these extensions is not likely to
1208 be portable to non-Perl code. We lump both of these extensions together and
1209 refer to them as Perl extended UTF-8. There exist other extensions that people
1210 have invented, incompatible with Perl's.
1212 On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
1213 extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
1214 than on ASCII. Prior to that, code points 2**31 and higher were simply
1215 unrepresentable, and a different, incompatible method was used to represent
1216 code points between 2**30 and 2**31 - 1.
1218 On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
1219 Perl extended UTF-8 is used.
1221 In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
1222 may use for backward compatibility. That name is misleading, as this flag may
1223 be set when the code point actually does fit in 31 bits. This happens on
1224 EBCDIC platforms, and sometimes when the L<overlong
1225 malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately
1226 describes the situation in all cases.
1228 =item C<UTF8_GOT_CONTINUATION>
1230 The input sequence was malformed in that the first byte was a UTF-8
1233 =item C<UTF8_GOT_EMPTY>
1235 The input C<curlen> parameter was 0.
1237 =item C<UTF8_GOT_LONG>
1239 The input sequence was malformed in that there is some other sequence that
1240 evaluates to the same code point, but that sequence is shorter than this one.
1242 Until Unicode 3.1, it was legal for programs to accept this malformation, but
1243 it was discovered that this created security issues.
1245 =item C<UTF8_GOT_NONCHAR>
1247 The code point represented by the input UTF-8 sequence is for a Unicode
1248 non-character code point.
1249 This bit is set only if the input C<flags> parameter contains either the
1250 C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1252 =item C<UTF8_GOT_NON_CONTINUATION>
1254 The input sequence was malformed in that a non-continuation type byte was found
1255 in a position where only a continuation type one should be. See also
1256 C<L</UTF8_GOT_SHORT>>.
1258 =item C<UTF8_GOT_OVERFLOW>
1260 The input sequence was malformed in that it is for a code point that is not
1261 representable in the number of bits available in an IV on the current platform.
1263 =item C<UTF8_GOT_SHORT>
1265 The input sequence was malformed in that C<curlen> is smaller than required for
1266 a complete sequence. In other words, the input is for a partial character
1270 C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
1271 sequence. The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always
1272 that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
1273 sequence was looked at. If no other flags are present, it means that the
1274 sequence was valid as far as it went. Depending on the application, this could
1275 mean one of three things:
1281 The C<curlen> length parameter passed in was too small, and the function was
1282 prevented from examining all the necessary bytes.
1286 The buffer being looked at is based on reading data, and the data received so
1287 far stopped in the middle of a character, so that the next read will
1288 read the remainder of this character. (It is up to the caller to deal with the
1289 split bytes somehow.)
1293 This is a real error, and the partial sequence is all we're going to get.
1297 =item C<UTF8_GOT_SUPER>
1299 The input sequence was malformed in that it is for a non-Unicode code point;
1300 that is, one above the legal Unicode maximum.
1301 This bit is set only if the input C<flags> parameter contains either the
1302 C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1304 =item C<UTF8_GOT_SURROGATE>
1306 The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
1308 This bit is set only if the input C<flags> parameter contains either the
1309 C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1313 To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1314 flag to suppress any warnings, and then examine the C<*errors> return.
1316 =for apidoc Amnh||UTF8_GOT_PERL_EXTENDED
1317 =for apidoc Amnh||UTF8_GOT_CONTINUATION
1318 =for apidoc Amnh||UTF8_GOT_EMPTY
1319 =for apidoc Amnh||UTF8_GOT_LONG
1320 =for apidoc Amnh||UTF8_GOT_NONCHAR
1321 =for apidoc Amnh||UTF8_GOT_NON_CONTINUATION
1322 =for apidoc Amnh||UTF8_GOT_OVERFLOW
1323 =for apidoc Amnh||UTF8_GOT_SHORT
1324 =for apidoc Amnh||UTF8_GOT_SUPER
1325 =for apidoc Amnh||UTF8_GOT_SURROGATE
1329 Also implemented as a macro in utf8.h
1333 Perl_utf8n_to_uvchr_error(const U8 *s,
1339 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1341 return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
1346 =for apidoc utf8n_to_uvchr_msgs
1348 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1349 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1352 This function is for code that needs to know what the precise malformation(s)
1353 are when an error is found, and wants the corresponding warning and/or error
1354 messages to be returned to the caller rather than be displayed. All messages
1355 that would have been displayed if all lexical warnings are enabled will be
1358 It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
1359 placed after all the others, C<msgs>. If this parameter is 0, this function
1360 behaves identically to C<L</utf8n_to_uvchr_error>>. Otherwise, C<msgs> should
1361 be a pointer to an C<AV *> variable, in which this function creates a new AV to
1362 contain any appropriate messages. The elements of the array are ordered so
1363 that the first message that would have been displayed is in the 0th element,
1364 and so on. Each element is a hash with three key-value pairs, as follows:
1370 The text of the message as a C<SVpv>.
1372 =item C<warn_categories>
1374 The warning category (or categories) packed into a C<SVuv>.
1378 A single flag bit associated with this message, in a C<SVuv>.
1379 The bit corresponds to some bit in the C<*errors> return value,
1380 such as C<UTF8_GOT_LONG>.
1384 It's important to note that specifying this parameter as non-null will cause
1385 any warnings this function would otherwise generate to be suppressed, and
1386 instead be placed in C<*msgs>. The caller can check the lexical warnings state
1387 (or not) when choosing what to do with the returned messages.
1389 If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
1392 The caller, of course, is responsible for freeing any returned AV.
1398 Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
1405 const U8 * const s0 = s;
1406 const U8 * send = s0 + curlen;
1407 U32 possible_problems; /* A bit is set here for each potential problem
1408 found as we go along */
1410 STRLEN expectlen; /* How long should this sequence be? */
1411 STRLEN avail_len; /* When input is too short, gives what that is */
1412 U32 discard_errors; /* Used to save branches when 'errors' is NULL; this
1413 gets set and discarded */
1415 /* The below are used only if there is both an overlong malformation and a
1416 * too short one. Otherwise the first two are set to 's0' and 'send', and
1417 * the third not used at all */
1419 U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1420 routine; see [perl #130921] */
1424 PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
1426 /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
1427 * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
1428 * syllables that the dfa doesn't properly handle. Quickly dispose of the
1431 /* Each of the affected Hanguls starts with \xED */
1433 if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */
1444 return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
1445 | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
1446 | (s0[2] & UTF_CONTINUATION_MASK);
1449 /* In conjunction with the exhaustive tests that can be enabled in
1450 * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
1451 * what it is intended to do, and that no flaws in it are masked by
1452 * dropping down and executing the code below
1453 assert(! isUTF8_CHAR(s0, send)
1454 || UTF8_IS_SURROGATE(s0, send)
1455 || UTF8_IS_SUPER(s0, send)
1456 || UTF8_IS_NONCHAR(s0,send));
1460 possible_problems = 0;
1464 adjusted_s0 = (U8 *) s0;
1471 errors = &discard_errors;
1474 /* The order of malformation tests here is important. We should consume as
1475 * few bytes as possible in order to not skip any valid character. This is
1476 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1477 * https://unicode.org/reports/tr36 for more discussion as to why. For
1478 * example, once we've done a UTF8SKIP, we can tell the expected number of
1479 * bytes, and could fail right off the bat if the input parameters indicate
1480 * that there are too few available. But it could be that just that first
1481 * byte is garbled, and the intended character occupies fewer bytes. If we
1482 * blindly assumed that the first byte is correct, and skipped based on
1483 * that number, we could skip over a valid input character. So instead, we
1484 * always examine the sequence byte-by-byte.
1486 * We also should not consume too few bytes, otherwise someone could inject
1487 * things. For example, an input could be deliberately designed to
1488 * overflow, and if this code bailed out immediately upon discovering that,
1489 * returning to the caller C<*retlen> pointing to the very next byte (one
1490 * which is actually part of the overflowing sequence), that could look
1491 * legitimate to the caller, which could discard the initial partial
1492 * sequence and process the rest, inappropriately.
1494 * Some possible input sequences are malformed in more than one way. This
1495 * function goes to lengths to try to find all of them. This is necessary
1496 * for correctness, as the inputs may allow one malformation but not
1497 * another, and if we abandon searching for others after finding the
1498 * allowed one, we could allow in something that shouldn't have been.
1501 if (UNLIKELY(curlen == 0)) {
1502 possible_problems |= UTF8_GOT_EMPTY;
1504 uv = UNICODE_REPLACEMENT;
1505 goto ready_to_handle_errors;
1508 /* We now know we can examine the first byte of the input */
1509 expectlen = UTF8SKIP(s);
1512 /* A well-formed UTF-8 character, as the vast majority of calls to this
1513 * function will be for, has this expected length. For efficiency, set
1514 * things up here to return it. It will be overridden only in those rare
1515 * cases where a malformation is found */
1517 *retlen = expectlen;
1520 /* A continuation character can't start a valid sequence */
1521 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
1522 possible_problems |= UTF8_GOT_CONTINUATION;
1524 uv = UNICODE_REPLACEMENT;
1525 goto ready_to_handle_errors;
1528 /* Here is not a continuation byte, nor an invariant. The only thing left
1529 * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
1530 * because it excludes start bytes like \xC0 that always lead to
1533 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1534 * that indicate the number of bytes in the character's whole UTF-8
1535 * sequence, leaving just the bits that are part of the value. */
1536 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
1538 /* Setup the loop end point, making sure to not look past the end of the
1539 * input string, and flag it as too short if the size isn't big enough. */
1540 if (UNLIKELY(curlen < expectlen)) {
1541 possible_problems |= UTF8_GOT_SHORT;
1545 send = (U8*) s0 + expectlen;
1548 /* Now, loop through the remaining bytes in the character's sequence,
1549 * accumulating each into the working value as we go. */
1550 for (s = s0 + 1; s < send; s++) {
1551 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
1552 uv = UTF8_ACCUMULATE(uv, *s);
1556 /* Here, found a non-continuation before processing all expected bytes.
1557 * This byte indicates the beginning of a new character, so quit, even
1558 * if allowing this malformation. */
1559 possible_problems |= UTF8_GOT_NON_CONTINUATION;
1561 } /* End of loop through the character's bytes */
1563 /* Save how many bytes were actually in the character */
1566 /* Note that there are two types of too-short malformation. One is when
1567 * there is actual wrong data before the normal termination of the
1568 * sequence. The other is that the sequence wasn't complete before the end
1569 * of the data we are allowed to look at, based on the input 'curlen'.
1570 * This means that we were passed data for a partial character, but it is
1571 * valid as far as we saw. The other is definitely invalid. This
1572 * distinction could be important to a caller, so the two types are kept
1575 * A convenience macro that matches either of the too-short conditions. */
1576 # define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1578 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1580 uv = UNICODE_REPLACEMENT;
1583 /* Check for overflow. The algorithm requires us to not look past the end
1584 * of the current character, even if partial, so the upper limit is 's' */
1585 if (UNLIKELY(0 < does_utf8_overflow(s0, s,
1586 1 /* Do consider overlongs */
1589 possible_problems |= UTF8_GOT_OVERFLOW;
1590 uv = UNICODE_REPLACEMENT;
1593 /* Check for overlong. If no problems so far, 'uv' is the correct code
1594 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1595 * we must look at the UTF-8 byte sequence itself to see if it is for an
1597 if ( ( LIKELY(! possible_problems)
1598 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1599 || ( UNLIKELY(possible_problems)
1600 && ( UNLIKELY(! UTF8_IS_START(*s0))
1601 || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))))
1603 possible_problems |= UTF8_GOT_LONG;
1605 if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
1607 /* The calculation in the 'true' branch of this 'if'
1608 * below won't work if overflows, and isn't needed
1609 * anyway. Further below we handle all overflow
1611 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1613 UV min_uv = uv_so_far;
1616 /* Here, the input is both overlong and is missing some trailing
1617 * bytes. There is no single code point it could be for, but there
1618 * may be enough information present to determine if what we have
1619 * so far is for an unallowed code point, such as for a surrogate.
1620 * The code further below has the intelligence to determine this,
1621 * but just for non-overlong UTF-8 sequences. What we do here is
1622 * calculate the smallest code point the input could represent if
1623 * there were no too short malformation. Then we compute and save
1624 * the UTF-8 for that, which is what the code below looks at
1625 * instead of the raw input. It turns out that the smallest such
1626 * code point is all we need. */
1627 for (i = curlen; i < expectlen; i++) {
1628 min_uv = UTF8_ACCUMULATE(min_uv,
1629 I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE));
1632 adjusted_s0 = temp_char_buf;
1633 (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1637 /* Here, we have found all the possible problems, except for when the input
1638 * is for a problematic code point not allowed by the input parameters. */
1640 /* uv is valid for overlongs */
1641 if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
1642 && isUNICODE_POSSIBLY_PROBLEMATIC(uv))
1643 || ( UNLIKELY(possible_problems)
1645 /* if overflow, we know without looking further
1646 * precisely which of the problematic types it is,
1647 * and we deal with those in the overflow handling
1649 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
1650 && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
1651 || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))))
1652 && ((flags & ( UTF8_DISALLOW_NONCHAR
1653 |UTF8_DISALLOW_SURROGATE
1654 |UTF8_DISALLOW_SUPER
1655 |UTF8_DISALLOW_PERL_EXTENDED
1657 |UTF8_WARN_SURROGATE
1659 |UTF8_WARN_PERL_EXTENDED))))
1661 /* If there were no malformations, or the only malformation is an
1662 * overlong, 'uv' is valid */
1663 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1664 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1665 possible_problems |= UTF8_GOT_SURROGATE;
1667 else if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
1668 possible_problems |= UTF8_GOT_SUPER;
1670 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1671 possible_problems |= UTF8_GOT_NONCHAR;
1674 else { /* Otherwise, need to look at the source UTF-8, possibly
1675 adjusted to be non-overlong */
1677 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1678 > UTF_START_BYTE_110000_))
1680 possible_problems |= UTF8_GOT_SUPER;
1682 else if (curlen > 1) {
1683 if (UNLIKELY( NATIVE_UTF8_TO_I8(*adjusted_s0)
1684 == UTF_START_BYTE_110000_
1685 && NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1))
1686 >= UTF_FIRST_CONT_BYTE_110000_))
1688 possible_problems |= UTF8_GOT_SUPER;
1690 else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) {
1691 possible_problems |= UTF8_GOT_SURROGATE;
1695 /* We need a complete well-formed UTF-8 character to discern
1696 * non-characters, so can't look for them here */
1700 ready_to_handle_errors:
1703 * curlen contains the number of bytes in the sequence that
1704 * this call should advance the input by.
1705 * avail_len gives the available number of bytes passed in, but
1706 * only if this is less than the expected number of
1707 * bytes, based on the code point's start byte.
1708 * possible_problems is 0 if there weren't any problems; otherwise a bit
1709 * is set in it for each potential problem found.
1710 * uv contains the code point the input sequence
1711 * represents; or if there is a problem that prevents
1712 * a well-defined value from being computed, it is
1713 * some substitute value, typically the REPLACEMENT
1715 * s0 points to the first byte of the character
1716 * s points to just after where we left off processing
1718 * send points to just after where that character should
1719 * end, based on how many bytes the start byte tells
1720 * us should be in it, but no further than s0 +
1724 if (UNLIKELY(possible_problems)) {
1725 bool disallowed = FALSE;
1726 const U32 orig_problems = possible_problems;
1732 while (possible_problems) { /* Handle each possible problem */
1734 char * message = NULL;
1735 U32 this_flag_bit = 0;
1737 /* Each 'if' clause handles one problem. They are ordered so that
1738 * the first ones' messages will be displayed before the later
1739 * ones; this is kinda in decreasing severity order. But the
1740 * overlong must come last, as it changes 'uv' looked at by the
1742 if (possible_problems & UTF8_GOT_OVERFLOW) {
1744 /* Overflow means also got a super and are using Perl's
1745 * extended UTF-8, but we handle all three cases here */
1747 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
1748 *errors |= UTF8_GOT_OVERFLOW;
1750 /* But the API says we flag all errors found */
1751 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1752 *errors |= UTF8_GOT_SUPER;
1755 & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
1757 *errors |= UTF8_GOT_PERL_EXTENDED;
1760 /* Disallow if any of the three categories say to */
1761 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
1762 || (flags & ( UTF8_DISALLOW_SUPER
1763 |UTF8_DISALLOW_PERL_EXTENDED)))
1768 /* Likewise, warn if any say to */
1769 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
1770 || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
1773 /* The warnings code explicitly says it doesn't handle the
1774 * case of packWARN2 and two categories which have
1775 * parent-child relationship. Even if it works now to
1776 * raise the warning if either is enabled, it wouldn't
1777 * necessarily do so in the future. We output (only) the
1778 * most dire warning */
1779 if (! (flags & UTF8_CHECK_ONLY)) {
1780 if (msgs || ckWARN_d(WARN_UTF8)) {
1781 pack_warn = packWARN(WARN_UTF8);
1783 else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
1784 pack_warn = packWARN(WARN_NON_UNICODE);
1787 message = Perl_form(aTHX_ "%s: %s (overflows)",
1789 _byte_dump_string(s0, curlen, 0));
1790 this_flag_bit = UTF8_GOT_OVERFLOW;
1795 else if (possible_problems & UTF8_GOT_EMPTY) {
1796 possible_problems &= ~UTF8_GOT_EMPTY;
1797 *errors |= UTF8_GOT_EMPTY;
1799 if (! (flags & UTF8_ALLOW_EMPTY)) {
1801 /* This so-called malformation is now treated as a bug in
1802 * the caller. If you have nothing to decode, skip calling
1808 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1810 pack_warn = packWARN(WARN_UTF8);
1811 message = Perl_form(aTHX_ "%s (empty string)",
1813 this_flag_bit = UTF8_GOT_EMPTY;
1817 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1818 possible_problems &= ~UTF8_GOT_CONTINUATION;
1819 *errors |= UTF8_GOT_CONTINUATION;
1821 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1824 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1826 pack_warn = packWARN(WARN_UTF8);
1827 message = Perl_form(aTHX_
1828 "%s: %s (unexpected continuation byte 0x%02x,"
1829 " with no preceding start byte)",
1831 _byte_dump_string(s0, 1, 0), *s0);
1832 this_flag_bit = UTF8_GOT_CONTINUATION;
1836 else if (possible_problems & UTF8_GOT_SHORT) {
1837 possible_problems &= ~UTF8_GOT_SHORT;
1838 *errors |= UTF8_GOT_SHORT;
1840 if (! (flags & UTF8_ALLOW_SHORT)) {
1843 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1845 pack_warn = packWARN(WARN_UTF8);
1846 message = Perl_form(aTHX_
1847 "%s: %s (too short; %d byte%s available, need %d)",
1849 _byte_dump_string(s0, send - s0, 0),
1851 avail_len == 1 ? "" : "s",
1853 this_flag_bit = UTF8_GOT_SHORT;
1858 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1859 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1860 *errors |= UTF8_GOT_NON_CONTINUATION;
1862 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1865 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1868 /* If we don't know for sure that the input length is
1869 * valid, avoid as much as possible reading past the
1870 * end of the buffer */
1871 int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
1873 : (int) (send - s0);
1874 pack_warn = packWARN(WARN_UTF8);
1875 message = Perl_form(aTHX_ "%s",
1876 unexpected_non_continuation_text(s0,
1880 this_flag_bit = UTF8_GOT_NON_CONTINUATION;
1884 else if (possible_problems & UTF8_GOT_SURROGATE) {
1885 possible_problems &= ~UTF8_GOT_SURROGATE;
1887 if (flags & UTF8_WARN_SURROGATE) {
1888 *errors |= UTF8_GOT_SURROGATE;
1890 if ( ! (flags & UTF8_CHECK_ONLY)
1891 && (msgs || ckWARN_d(WARN_SURROGATE)))
1893 pack_warn = packWARN(WARN_SURROGATE);
1895 /* These are the only errors that can occur with a
1896 * surrogate when the 'uv' isn't valid */
1897 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1898 message = Perl_form(aTHX_
1899 "UTF-16 surrogate (any UTF-8 sequence that"
1900 " starts with \"%s\" is for a surrogate)",
1901 _byte_dump_string(s0, curlen, 0));
1904 message = Perl_form(aTHX_ surrogate_cp_format, uv);
1906 this_flag_bit = UTF8_GOT_SURROGATE;
1910 if (flags & UTF8_DISALLOW_SURROGATE) {
1912 *errors |= UTF8_GOT_SURROGATE;
1915 else if (possible_problems & UTF8_GOT_SUPER) {
1916 possible_problems &= ~UTF8_GOT_SUPER;
1918 if (flags & UTF8_WARN_SUPER) {
1919 *errors |= UTF8_GOT_SUPER;
1921 if ( ! (flags & UTF8_CHECK_ONLY)
1922 && (msgs || ckWARN_d(WARN_NON_UNICODE)))
1924 pack_warn = packWARN(WARN_NON_UNICODE);
1926 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1927 message = Perl_form(aTHX_
1928 "Any UTF-8 sequence that starts with"
1929 " \"%s\" is for a non-Unicode code point,"
1930 " may not be portable",
1931 _byte_dump_string(s0, curlen, 0));
1934 message = Perl_form(aTHX_ super_cp_format, uv);
1936 this_flag_bit = UTF8_GOT_SUPER;
1940 /* Test for Perl's extended UTF-8 after the regular SUPER ones,
1941 * and before possibly bailing out, so that the more dire
1942 * warning will override the regular one. */
1943 if (UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) {
1944 if ( ! (flags & UTF8_CHECK_ONLY)
1945 && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
1946 && (msgs || ( ckWARN_d(WARN_NON_UNICODE)
1947 || ckWARN(WARN_PORTABLE))))
1949 pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
1951 /* If it is an overlong that evaluates to a code point
1952 * that doesn't have to use the Perl extended UTF-8, it
1953 * still used it, and so we output a message that
1954 * doesn't refer to the code point. The same is true
1955 * if there was a SHORT malformation where the code
1956 * point is not valid. In that case, 'uv' will have
1957 * been set to the REPLACEMENT CHAR, and the message
1958 * below without the code point in it will be selected
1960 if (UNICODE_IS_PERL_EXTENDED(uv)) {
1961 message = Perl_form(aTHX_
1962 PL_extended_cp_format, uv);
1965 message = Perl_form(aTHX_
1966 "Any UTF-8 sequence that starts with"
1967 " \"%s\" is a Perl extension, and"
1968 " so is not portable",
1969 _byte_dump_string(s0, curlen, 0));
1971 this_flag_bit = UTF8_GOT_PERL_EXTENDED;
1974 if (flags & ( UTF8_WARN_PERL_EXTENDED
1975 |UTF8_DISALLOW_PERL_EXTENDED))
1977 *errors |= UTF8_GOT_PERL_EXTENDED;
1979 if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
1985 if (flags & UTF8_DISALLOW_SUPER) {
1986 *errors |= UTF8_GOT_SUPER;
1990 else if (possible_problems & UTF8_GOT_NONCHAR) {
1991 possible_problems &= ~UTF8_GOT_NONCHAR;
1993 if (flags & UTF8_WARN_NONCHAR) {
1994 *errors |= UTF8_GOT_NONCHAR;
1996 if ( ! (flags & UTF8_CHECK_ONLY)
1997 && (msgs || ckWARN_d(WARN_NONCHAR)))
1999 /* The code above should have guaranteed that we don't
2000 * get here with errors other than overlong */
2001 assert (! (orig_problems
2002 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
2004 pack_warn = packWARN(WARN_NONCHAR);
2005 message = Perl_form(aTHX_ nonchar_cp_format, uv);
2006 this_flag_bit = UTF8_GOT_NONCHAR;
2010 if (flags & UTF8_DISALLOW_NONCHAR) {
2012 *errors |= UTF8_GOT_NONCHAR;
2015 else if (possible_problems & UTF8_GOT_LONG) {
2016 possible_problems &= ~UTF8_GOT_LONG;
2017 *errors |= UTF8_GOT_LONG;
2019 if (flags & UTF8_ALLOW_LONG) {
2021 /* We don't allow the actual overlong value, unless the
2022 * special extra bit is also set */
2023 if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
2024 & ~UTF8_ALLOW_LONG)))
2026 uv = UNICODE_REPLACEMENT;
2033 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
2035 pack_warn = packWARN(WARN_UTF8);
2037 /* These error types cause 'uv' to be something that
2038 * isn't what was intended, so can't use it in the
2039 * message. The other error types either can't
2040 * generate an overlong, or else the 'uv' is valid */
2042 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
2044 message = Perl_form(aTHX_
2045 "%s: %s (any UTF-8 sequence that starts"
2046 " with \"%s\" is overlong which can and"
2047 " should be represented with a"
2048 " different, shorter sequence)",
2050 _byte_dump_string(s0, send - s0, 0),
2051 _byte_dump_string(s0, curlen, 0));
2054 U8 tmpbuf[UTF8_MAXBYTES+1];
2055 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
2057 /* Don't use U+ for non-Unicode code points, which
2058 * includes those in the Latin1 range */
2059 const char * preface = ( UNICODE_IS_SUPER(uv)
2066 message = Perl_form(aTHX_
2067 "%s: %s (overlong; instead use %s to represent"
2070 _byte_dump_string(s0, send - s0, 0),
2071 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
2073 ((uv < 256) ? 2 : 4), /* Field width of 2 for
2074 small code points */
2077 this_flag_bit = UTF8_GOT_LONG;
2080 } /* End of looking through the possible flags */
2082 /* Display the message (if any) for the problem being handled in
2083 * this iteration of the loop */
2086 assert(this_flag_bit);
2088 if (*msgs == NULL) {
2092 av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
2097 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
2100 Perl_warner(aTHX_ pack_warn, "%s", message);
2102 } /* End of 'while (possible_problems)' */
2104 /* Since there was a possible problem, the returned length may need to
2105 * be changed from the one stored at the beginning of this function.
2106 * Instead of trying to figure out if it has changed, just do it. */
2112 if (flags & UTF8_CHECK_ONLY && retlen) {
2113 *retlen = ((STRLEN) -1);
2119 return UNI_TO_NATIVE(uv);
2123 =for apidoc utf8_to_uvchr_buf
2125 Returns the native code point of the first character in the string C<s> which
2126 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
2127 C<*retlen> will be set to the length, in bytes, of that character.
2129 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2130 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
2131 C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
2132 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
2133 C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
2134 the next possible position in C<s> that could begin a non-malformed character.
2135 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
2140 Also implemented as a macro in utf8.h
2146 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2148 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
2150 return utf8_to_uvchr_buf_helper(s, send, retlen);
2154 =for apidoc utf8_length
2156 Returns the number of characters in the sequence of UTF-8-encoded bytes starting
2157 at C<s> and ending at the byte just before C<e>. If <s> and <e> point to the
2158 same place, it returns 0 with no warning raised.
2160 If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
2161 and returns the number of valid characters.
2167 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
2171 PERL_ARGS_ASSERT_UTF8_LENGTH;
2173 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
2174 * the bitops (especially ~) can create illegal UTF-8.
2175 * In other words: in Perl UTF-8 is not just for Unicode. */
2178 Ptrdiff_t expected_byte_count = UTF8SKIP(s);
2180 if (UNLIKELY(e - s < expected_byte_count)) {
2181 goto warn_and_return;
2185 s += expected_byte_count;
2188 if (LIKELY(e == s)) {
2192 /* Here, s > e on entry */
2196 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2197 "%s in %s", unees, OP_DESC(PL_op));
2199 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2205 =for apidoc bytes_cmp_utf8
2207 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
2208 sequence of characters (stored as UTF-8)
2209 in C<u>, C<ulen>. Returns 0 if they are
2210 equal, -1 or -2 if the first string is less than the second string, +1 or +2
2211 if the first string is greater than the second string.
2213 -1 or +1 is returned if the shorter string was identical to the start of the
2214 longer string. -2 or +2 is returned if
2215 there was a difference between characters
2222 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
2224 const U8 *const bend = b + blen;
2225 const U8 *const uend = u + ulen;
2227 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
2229 while (b < bend && u < uend) {
2231 if (!UTF8_IS_INVARIANT(c)) {
2232 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2235 if (UTF8_IS_CONTINUATION(c1)) {
2236 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
2238 /* diag_listed_as: Malformed UTF-8 character%s */
2239 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2241 unexpected_non_continuation_text(u - 2, 2, 1, 2),
2242 PL_op ? " in " : "",
2243 PL_op ? OP_DESC(PL_op) : "");
2248 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2249 "%s in %s", unees, OP_DESC(PL_op));
2251 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2252 return -2; /* Really want to return undef :-) */
2259 return *b < c ? -2 : +2;
2264 if (b == bend && u == uend)
2267 return b < bend ? +1 : -1;
2271 =for apidoc utf8_to_bytes
2273 Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
2274 Unlike L</bytes_to_utf8>, this over-writes the original string, and
2275 updates C<*lenp> to contain the new length.
2276 Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
2278 Upon successful return, the number of variants in the string can be computed by
2279 having saved the value of C<*lenp> before the call, and subtracting the
2280 after-call value of C<*lenp> from it.
2282 If you need a copy of the string, see L</bytes_from_utf8>.
2288 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2292 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
2293 PERL_UNUSED_CONTEXT;
2295 /* This is a no-op if no variants at all in the input */
2296 if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
2300 /* Nothing before 'first_variant' needs to be changed, so start the real
2303 U8 * const save = s;
2304 U8 * const send = s + *lenp;
2307 #ifndef EBCDIC /* The below relies on the bit patterns of UTF-8 */
2309 /* There is some start-up/tear-down overhead with this, so no real gain
2310 * unless the string is long enough. The current value is just a
2312 if (*lenp > 5 * PERL_WORDSIZE) {
2314 /* First, go through the string a word at-a-time to verify that it is
2315 * downgradable. If it contains any start byte besides C2 and C3, then
2318 const PERL_UINTMAX_T C0_mask = PERL_COUNT_MULTIPLIER * 0xC0;
2319 const PERL_UINTMAX_T C2_mask = PERL_COUNT_MULTIPLIER * 0xC2;
2320 const PERL_UINTMAX_T FE_mask = PERL_COUNT_MULTIPLIER * 0xFE;
2322 /* Points to the first byte >=s which is positioned at a word boundary.
2323 * If s is on a word boundary, it is s, otherwise it is the first byte
2324 * of the next word. */
2325 U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
2326 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK);
2328 /* Here there is at least a full word beyond the first word boundary.
2329 * Process up to that boundary. */
2330 while (s < partial_word_end) {
2331 if (! UTF8_IS_INVARIANT(*s)) {
2332 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2333 *lenp = ((STRLEN) -1);
2341 /* Adjust back down any overshoot */
2342 s = partial_word_end;
2344 /* Process per-word */
2347 PERL_UINTMAX_T C2_C3_start_bytes;
2349 /* First find the bytes that are start bytes. ANDing with
2350 * C0C0...C0 causes any start byte to become C0; any other byte
2351 * becomes something else. Then XORing with C0 causes any start
2352 * byte to become 0; all other bytes non-zero. */
2353 PERL_UINTMAX_T start_bytes
2354 = ((* (PERL_UINTMAX_T *) s) & C0_mask) ^ C0_mask;
2356 /* These shifts causes the most significant bit to be set to 1 for
2357 * any bytes in the word that aren't completely 0. Hence after
2358 * these, only the start bytes have 0 in their msb */
2359 start_bytes |= start_bytes << 1;
2360 start_bytes |= start_bytes << 2;
2361 start_bytes |= start_bytes << 4;
2363 /* When we complement, then AND with 8080...80, the start bytes
2364 * will have 1 in their msb, and all other bits are 0 */
2365 start_bytes = ~ start_bytes & PERL_VARIANTS_WORD_MASK;
2367 /* Now repeat the procedure, but look for bytes that match only
2369 C2_C3_start_bytes = ((* (PERL_UINTMAX_T *) s) & FE_mask)
2371 C2_C3_start_bytes |= C2_C3_start_bytes << 1;
2372 C2_C3_start_bytes |= C2_C3_start_bytes << 2;
2373 C2_C3_start_bytes |= C2_C3_start_bytes << 4;
2374 C2_C3_start_bytes = ~ C2_C3_start_bytes
2375 & PERL_VARIANTS_WORD_MASK;
2377 /* Here, start_bytes has a 1 in the msb of each byte that has a
2379 * C2_C3_start_bytes has a 1 in the msb of each byte that has a
2380 * start_byte of C2 or C3
2381 * If they're not equal, there are start bytes that aren't C2
2382 * nor C3, hence this is not downgradable */
2383 if (start_bytes != C2_C3_start_bytes) {
2384 *lenp = ((STRLEN) -1);
2389 } while (s + PERL_WORDSIZE <= send);
2391 /* If the final byte was a start byte, it means that the character
2392 * straddles two words, so back off one to start looking below at the
2393 * first byte of the character */
2394 if (s > first_variant && UTF8_IS_START(*(s-1))) {
2401 /* Do the straggler bytes beyond the final word boundary (or all bytes
2402 * in the case of EBCDIC) */
2404 if (! UTF8_IS_INVARIANT(*s)) {
2405 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2406 *lenp = ((STRLEN) -1);
2414 /* Here, we passed the tests above. For the EBCDIC case, everything
2415 * was well-formed and can be downgraded to non-UTF8. For non-EBCDIC,
2416 * it means only that all start bytes were C2 or C3, hence any
2417 * well-formed sequences are downgradable. But we didn't test, for
2418 * example, that there weren't two C2's in a row. That means that in
2419 * the loop below, we have to be sure things are well-formed. Because
2420 * this is very very likely, and we don't care about having speedy
2421 * handling of malformed input, the loop proceeds as if well formed,
2422 * and should a malformed one come along, it undoes what it already has
2425 d = s = first_variant;
2430 if (UVCHR_IS_INVARIANT(*s)) {
2435 /* Here it is two-byte encoded. */
2436 if ( LIKELY(UTF8_IS_DOWNGRADEABLE_START(*s))
2437 && LIKELY(UTF8_IS_CONTINUATION((s[1]))))
2439 U8 first_byte = *s++;
2440 *d++ = EIGHT_BIT_UTF8_TO_NATIVE(first_byte, *s);
2445 /* Here, it is malformed. This shouldn't happen on EBCDIC, and on
2446 * ASCII platforms, we know that the only start bytes in the text
2447 * are C2 and C3, and the code above has made sure that it doesn't
2448 * end with a start byte. That means the only malformations that
2449 * are possible are a start byte without a continuation (either
2450 * followed by another start byte or an invariant) or an unexpected
2453 * We have to undo all we've done before, back down to the first
2454 * UTF-8 variant. Note that each 2-byte variant we've done so far
2455 * (converted to single byte) slides things to the left one byte,
2456 * and so we have bytes that haven't been written over.
2458 * Here, 'd' points to the next position to overwrite, and 's'
2459 * points to the first invalid byte. That means 'd's contents
2460 * haven't been changed yet, nor has anything else beyond it in the
2461 * string. In restoring to the original contents, we don't need to
2462 * do anything past (d-1).
2464 * In particular, the bytes from 'd' to 's' have not been changed.
2465 * This loop uses a new variable 's1' (to avoid confusing 'source'
2466 * and 'destination') set to 'd', and moves 's' and 's1' in lock
2467 * step back so that afterwards, 's1' points to the first changed
2468 * byte that will be the source for the first byte (or bytes) at
2469 * 's' that need to be changed back. Note that s1 can expand to
2474 if (! UVCHR_IS_INVARIANT(*s1)) {
2480 /* Do the changing back */
2481 while (s1 >= first_variant) {
2482 if (UVCHR_IS_INVARIANT(*s1)) {
2486 *s-- = UTF8_EIGHT_BIT_LO(*s1);
2487 *s-- = UTF8_EIGHT_BIT_HI(*s1);
2492 *lenp = ((STRLEN) -1);
2504 =for apidoc bytes_from_utf8
2506 Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
2507 byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
2508 actually encoded in UTF-8.
2510 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2513 Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2514 not expressible in native byte encoding. In these cases, C<*is_utf8p> and
2515 C<*lenp> are unchanged, and the return value is the original C<s>.
2517 Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
2518 newly created string containing a downgraded copy of C<s>, and whose length is
2519 returned in C<*lenp>, updated. The new string is C<NUL>-terminated. The
2520 caller is responsible for arranging for the memory used by this string to get
2523 Upon successful return, the number of variants in the string can be computed by
2524 having saved the value of C<*lenp> before the call, and subtracting the
2525 after-call value of C<*lenp> from it.
2529 There is a macro that avoids this function call, but this is retained for
2530 anyone who calls it with the Perl_ prefix */
2533 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
2535 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
2536 PERL_UNUSED_CONTEXT;
2538 return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2542 =for apidoc bytes_from_utf8_loc
2544 Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer
2545 to where to store the location of the first character in C<"s"> that cannot be
2546 converted to non-UTF8.
2548 If that parameter is C<NULL>, this function behaves identically to
2551 Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2552 C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2554 Otherwise, the function returns a newly created C<NUL>-terminated string
2555 containing the non-UTF8 equivalent of the convertible first portion of
2556 C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>.
2557 If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2558 and C<*first_non_downgradable> is set to C<NULL>.
2560 Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
2561 first character in the original string that wasn't converted. C<*is_utf8p> is
2562 unchanged. Note that the new string may have length 0.
2564 Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2565 C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2566 converts as many characters in it as possible stopping at the first one it
2567 finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is
2568 set to point to that. The function returns the portion that could be converted
2569 in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2570 not including the terminating C<NUL>. If the very first character in the
2571 original could not be converted, C<*lenp> will be 0, and the new string will
2572 contain just a single C<NUL>. If the entire input string was converted,
2573 C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2575 Upon successful return, the number of variants in the converted portion of the
2576 string can be computed by having saved the value of C<*lenp> before the call,
2577 and subtracting the after-call value of C<*lenp> from it.
2585 Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2588 const U8 *original = s;
2589 U8 *converted_start;
2590 const U8 *send = s + *lenp;
2592 PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
2595 if (first_unconverted) {
2596 *first_unconverted = NULL;
2599 return (U8 *) original;
2602 Newx(d, (*lenp) + 1, U8);
2604 converted_start = d;
2607 if (! UTF8_IS_INVARIANT(c)) {
2609 /* Then it is multi-byte encoded. If the code point is above 0xFF,
2610 * have to stop now */
2611 if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2612 if (first_unconverted) {
2613 *first_unconverted = s - 1;
2614 goto finish_and_return;
2617 Safefree(converted_start);
2618 return (U8 *) original;
2622 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2628 /* Here, converted the whole of the input */
2630 if (first_unconverted) {
2631 *first_unconverted = NULL;
2636 *lenp = d - converted_start;
2638 /* Trim unused space */
2639 Renew(converted_start, *lenp + 1, U8);
2641 return converted_start;
2645 =for apidoc bytes_to_utf8
2647 Converts a string C<s> of length C<*lenp> bytes from the native encoding into
2649 Returns a pointer to the newly-created string, and sets C<*lenp> to
2650 reflect the new length in bytes. The caller is responsible for arranging for
2651 the memory used by this string to get freed.
2653 Upon successful return, the number of variants in the string can be computed by
2654 having saved the value of C<*lenp> before the call, and subtracting it from the
2655 after-call value of C<*lenp>.
2657 A C<NUL> character will be written after the end of the string.
2659 If you want to convert to UTF-8 from encodings other than
2660 the native (Latin1 or EBCDIC),
2661 see L</sv_recode_to_utf8>().
2667 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
2669 const U8 * const send = s + (*lenp);
2673 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
2674 PERL_UNUSED_CONTEXT;
2676 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
2677 Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
2681 append_utf8_from_native_byte(*s, &d);
2692 * Convert native UTF-16 to UTF-8. Called via the more public functions
2693 * utf16_to_utf8() for big-endian and utf16_to_utf8_reversed() for
2696 * 'p' is the UTF-16 input string, passed as a pointer to U8.
2697 * 'bytelen' is its length (must be even)
2698 * 'd' is the pointer to the destination buffer. The caller must ensure that
2699 * the space is large enough. The maximum expansion factor is 2 times
2700 * 'bytelen'. 1.5 if never going to run on an EBCDIC box.
2701 * '*newlen' will contain the number of bytes this function filled of 'd'.
2702 * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2703 * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE
2705 * The expansion factor is because UTF-16 requires 2 bytes for every code point
2706 * below 0x10000; otherwise 4 bytes. UTF-8 requires 1-3 bytes for every code
2707 * point below 0x1000; otherwise 4 bytes. UTF-EBCDIC requires 1-4 bytes for
2708 * every code point below 0x1000; otherwise 4-5 bytes.
2710 * The worst case is where every code point is below U+10000, hence requiring 2
2711 * UTF-16 bytes, but is U+0800 or higher on ASCII platforms, requiring 3 UTF-8
2712 * bytes; or >= U+4000 on EBCDIC requiring 4 UTF-8 bytes.
2714 * Do not use in-place. */
2717 Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen,
2718 const bool high_byte, /* Which of next two bytes is
2720 const bool low_byte)
2725 PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE;
2728 Perl_croak(aTHX_ "panic: utf16_to_utf8%s: odd bytelen %" UVuf,
2729 ((high_byte == 0) ? "" : "_reversed"), (UV)bytelen);
2734 /* Next 16 bits is what we want. (The bool is cast to U8 because on
2735 * platforms where a bool is implemented as a signed char, a compiler
2736 * warning may be generated) */
2737 U32 uv = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2740 /* If it's a surrogate, we find the uv that the surrogate pair encodes.
2742 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
2744 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2745 #define LAST_HIGH_SURROGATE 0xDBFF
2746 #define FIRST_LOW_SURROGATE 0xDC00
2747 #define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
2748 #define FIRST_IN_PLANE1 0x10000
2750 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2751 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2754 U32 low_surrogate = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2755 if (UNLIKELY(! inRANGE(low_surrogate, FIRST_LOW_SURROGATE,
2756 LAST_LOW_SURROGATE)))
2758 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2763 /* Here uv is the high surrogate. Combine with low surrogate
2764 * just computed to form the actual U32 code point.
2766 * From https://unicode.org/faq/utf_bom.html#utf16-4 */
2767 uv = FIRST_IN_PLANE1 + (uv << 10) - (FIRST_HIGH_SURROGATE << 10)
2768 + low_surrogate - FIRST_LOW_SURROGATE;
2772 /* Here, 'uv' is the real U32 we want to find the UTF-8 of */
2773 d = uvchr_to_utf8(d, uv);
2776 *newlen = d - dstart;
2781 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2783 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2785 return utf16_to_utf8(p, d, bytelen, newlen);
2789 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2791 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2793 return utf16_to_utf8_reversed(p, d, bytelen, newlen);
2797 * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for
2798 * big-endian and utf8_to_utf16_reversed() for little-endian,
2800 * 's' is the UTF-8 input string, passed as a pointer to U8.
2801 * 'bytelen' is its length
2802 * 'd' is the pointer to the destination buffer, currently passed as U8 *. The
2803 * caller must ensure that the space is large enough. The maximum
2804 * expansion factor is 2 times 'bytelen'. This happens when the input is
2805 * entirely single-byte ASCII, expanding to two-byte UTF-16.
2806 * '*newlen' will contain the number of bytes this function filled of 'd'.
2807 * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2808 * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE
2810 * Do not use in-place. */
2812 Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen,
2813 const bool high_byte, /* Which of next two bytes
2815 const bool low_byte)
2820 PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE;
2826 UV uv = utf8n_to_uvchr(s, send - s, &retlen,
2827 /* No surrogates nor above-Unicode */
2828 UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
2830 /* The modern method is to keep going with malformed input,
2831 * substituting the REPLACEMENT CHARACTER */
2832 if (UNLIKELY(uv == 0 && *s != '\0')) {
2833 uv = UNICODE_REPLACEMENT;
2836 if (uv >= FIRST_IN_PLANE1) { /* Requires a surrogate pair */
2838 /* From https://unicode.org/faq/utf_bom.html#utf16-4 */
2839 U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10)
2840 + FIRST_HIGH_SURROGATE;
2842 /* (The bool is cast to U8 because on platforms where a bool is
2843 * implemented as a signed char, a compiler warning may be
2845 d[(U8) high_byte] = high_surrogate >> 8;
2846 d[(U8) low_byte] = high_surrogate & nBIT_MASK(8);
2849 /* The low surrogate is the lower 10 bits plus the offset */
2850 uv &= nBIT_MASK(10);
2851 uv += FIRST_LOW_SURROGATE;
2853 /* Drop down to output the low surrogate like it were a
2857 d[(U8) high_byte] = uv >> 8;
2858 d[(U8) low_byte] = uv & nBIT_MASK(8);
2864 *newlen = d - dstart;
2869 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2871 return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
2875 Perl__is_uni_perl_idcont(pTHX_ UV c)
2877 return _invlist_contains_cp(PL_utf8_perl_idcont, c);
2881 Perl__is_uni_perl_idstart(pTHX_ UV c)
2883 return _invlist_contains_cp(PL_utf8_perl_idstart, c);
2887 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2890 /* We have the latin1-range values compiled into the core, so just use
2891 * those, converting the result to UTF-8. The only difference between upper
2892 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2893 * either "SS" or "Ss". Which one to use is passed into the routine in
2894 * 'S_or_s' to avoid a test */
2896 UV converted = toUPPER_LATIN1_MOD(c);
2898 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2900 assert(S_or_s == 'S' || S_or_s == 's');
2902 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
2903 characters in this range */
2904 *p = (U8) converted;
2909 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2910 * which it maps to one of them, so as to only have to have one check for
2911 * it in the main case */
2912 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2914 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2915 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2918 converted = GREEK_CAPITAL_LETTER_MU;
2920 #if UNICODE_MAJOR_VERSION > 2 \
2921 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2922 && UNICODE_DOT_DOT_VERSION >= 8)
2923 case LATIN_SMALL_LETTER_SHARP_S:
2930 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
2931 " '%c' to map to '%c'",
2932 c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
2933 NOT_REACHED; /* NOTREACHED */
2937 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2938 *p = UTF8_TWO_BYTE_LO(converted);
2944 /* If compiled on an early Unicode version, there may not be auxiliary tables
2946 #ifndef HAS_UC_AUX_TABLES
2947 # define UC_AUX_TABLE_ptrs NULL
2948 # define UC_AUX_TABLE_lengths NULL
2950 #ifndef HAS_TC_AUX_TABLES
2951 # define TC_AUX_TABLE_ptrs NULL
2952 # define TC_AUX_TABLE_lengths NULL
2954 #ifndef HAS_LC_AUX_TABLES
2955 # define LC_AUX_TABLE_ptrs NULL
2956 # define LC_AUX_TABLE_lengths NULL
2958 #ifndef HAS_CF_AUX_TABLES
2959 # define CF_AUX_TABLE_ptrs NULL
2960 # define CF_AUX_TABLE_lengths NULL
2963 /* Call the function to convert a UTF-8 encoded character to the specified case.
2964 * Note that there may be more than one character in the result.
2965 * 's' is a pointer to the first byte of the input character
2966 * 'd' will be set to the first byte of the string of changed characters. It
2967 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2968 * 'lenp' will be set to the length in bytes of the string of changed characters
2970 * The functions return the ordinal of the first character in the string of
2972 #define CALL_UPPER_CASE(uv, s, d, lenp) \
2973 _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper, \
2974 Uppercase_Mapping_invmap, \
2975 UC_AUX_TABLE_ptrs, \
2976 UC_AUX_TABLE_lengths, \
2978 #define CALL_TITLE_CASE(uv, s, d, lenp) \
2979 _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle, \
2980 Titlecase_Mapping_invmap, \
2981 TC_AUX_TABLE_ptrs, \
2982 TC_AUX_TABLE_lengths, \
2984 #define CALL_LOWER_CASE(uv, s, d, lenp) \
2985 _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower, \
2986 Lowercase_Mapping_invmap, \
2987 LC_AUX_TABLE_ptrs, \
2988 LC_AUX_TABLE_lengths, \
2992 /* This additionally has the input parameter 'specials', which if non-zero will
2993 * cause this to use the specials hash for folding (meaning get full case
2994 * folding); otherwise, when zero, this implies a simple case fold */
2995 #define CALL_FOLD_CASE(uv, s, d, lenp, specials) \
2997 ? _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold, \
2998 Case_Folding_invmap, \
2999 CF_AUX_TABLE_ptrs, \
3000 CF_AUX_TABLE_lengths, \
3002 : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold, \
3003 Simple_Case_Folding_invmap, \
3008 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
3010 /* Convert the Unicode character whose ordinal is <c> to its uppercase
3011 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
3012 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
3013 * the changed version may be longer than the original character.
3015 * The ordinal of the first character of the changed version is returned
3016 * (but note, as explained above, that there may be more.) */
3018 PERL_ARGS_ASSERT_TO_UNI_UPPER;
3021 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
3024 return CALL_UPPER_CASE(c, NULL, p, lenp);
3028 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
3030 PERL_ARGS_ASSERT_TO_UNI_TITLE;
3033 return _to_upper_title_latin1((U8) c, p, lenp, 's');
3036 return CALL_TITLE_CASE(c, NULL, p, lenp);
3040 S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
3042 /* We have the latin1-range values compiled into the core, so just use
3043 * those, converting the result to UTF-8. Since the result is always just
3044 * one character, we allow <p> to be NULL */
3046 U8 converted = toLOWER_LATIN1(c);
3048 PERL_UNUSED_ARG(dummy);
3051 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
3056 /* Result is known to always be < 256, so can use the EIGHT_BIT
3058 *p = UTF8_EIGHT_BIT_HI(converted);
3059 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
3067 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
3069 PERL_ARGS_ASSERT_TO_UNI_LOWER;
3072 return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
3075 return CALL_LOWER_CASE(c, NULL, p, lenp);
3079 Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
3081 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
3082 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3083 * FOLD_FLAGS_FULL iff full folding is to be used;
3085 * Not to be used for locale folds
3090 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
3092 assert (! (flags & FOLD_FLAGS_LOCALE));
3094 if (UNLIKELY(c == MICRO_SIGN)) {
3095 converted = GREEK_SMALL_LETTER_MU;
3097 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
3098 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
3099 || UNICODE_DOT_DOT_VERSION > 0)
3100 else if ( (flags & FOLD_FLAGS_FULL)
3101 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
3103 /* If can't cross 127/128 boundary, can't return "ss"; instead return
3104 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
3105 * under those circumstances. */
3106 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
3107 *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
3108 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3110 return LATIN_SMALL_LETTER_LONG_S;
3120 else { /* In this range the fold of all other characters is their lower
3122 converted = toLOWER_LATIN1(c);
3125 if (UVCHR_IS_INVARIANT(converted)) {
3126 *p = (U8) converted;
3130 *(p)++ = UTF8_TWO_BYTE_HI(converted);
3131 *p = UTF8_TWO_BYTE_LO(converted);
3139 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
3142 /* Not currently externally documented, and subject to change
3143 * <flags> bits meanings:
3144 * FOLD_FLAGS_FULL iff full folding is to be used;
3145 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3146 * locale are to be used.
3147 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3150 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
3152 if (flags & FOLD_FLAGS_LOCALE) {
3153 /* Treat a non-Turkic UTF-8 locale as not being in locale at all,
3154 * except for potentially warning */
3155 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3156 if (IN_UTF8_CTYPE_LOCALE && ! IN_UTF8_TURKIC_LOCALE) {
3157 flags &= ~FOLD_FLAGS_LOCALE;
3160 goto needs_full_generality;
3165 return _to_fold_latin1((U8) c, p, lenp,
3166 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
3169 /* Here, above 255. If no special needs, just use the macro */
3170 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
3171 return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
3173 else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
3174 the special flags. */
3175 U8 utf8_c[UTF8_MAXBYTES + 1];
3177 needs_full_generality:
3178 uvchr_to_utf8(utf8_c, c);
3179 return _toFOLD_utf8_flags(utf8_c, utf8_c + C_ARRAY_LENGTH(utf8_c),
3184 PERL_STATIC_INLINE bool
3185 S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e,
3188 /* returns a boolean giving whether or not the UTF8-encoded character that
3189 * starts at <p>, and extending no further than <e - 1> is in the inversion
3190 * list <invlist>. */
3192 UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
3194 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
3196 if (cp == 0 && (p >= e || *p != '\0')) {
3197 _force_out_malformed_utf8_message(p, e, 0, 1);
3198 NOT_REACHED; /* NOTREACHED */
3202 return _invlist_contains_cp(invlist, cp);
3205 #if 0 /* Not currently used, but may be needed in the future */
3206 PERLVAR(I, seen_deprecated_macro, HV *)
3209 S_warn_on_first_deprecated_use(pTHX_ const char * const name,
3210 const char * const alternative,
3211 const bool use_locale,
3212 const char * const file,
3213 const unsigned line)
3217 PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
3219 if (ckWARN_d(WARN_DEPRECATED)) {
3221 key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
3222 if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
3223 if (! PL_seen_deprecated_macro) {
3224 PL_seen_deprecated_macro = newHV();
3226 if (! hv_store(PL_seen_deprecated_macro, key,
3227 strlen(key), &PL_sv_undef, 0))
3229 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3232 if (instr(file, "mathoms.c")) {
3233 Perl_warner(aTHX_ WARN_DEPRECATED,
3234 "In %s, line %d, starting in Perl v5.32, %s()"
3235 " will be removed. Avoid this message by"
3236 " converting to use %s().\n",
3237 file, line, name, alternative);
3240 Perl_warner(aTHX_ WARN_DEPRECATED,
3241 "In %s, line %d, starting in Perl v5.32, %s() will"
3242 " require an additional parameter. Avoid this"
3243 " message by converting to use %s().\n",
3244 file, line, name, alternative);
3252 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
3254 PERL_ARGS_ASSERT__IS_UTF8_FOO;
3256 return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]);
3260 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e)
3262 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
3264 return is_utf8_common(p, e, PL_utf8_perl_idstart);
3268 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
3270 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
3272 return is_utf8_common(p, e, PL_utf8_perl_idcont);
3276 S_to_case_cp_list(pTHX_
3278 const U32 ** const remaining_list,
3279 Size_t * remaining_count,
3280 SV *invlist, const I32 * const invmap,
3281 const U32 * const * const aux_tables,
3282 const U8 * const aux_table_lengths,
3283 const char * const normal)
3288 /* Calculate the changed case of code point 'original'. The first code
3289 * point of the changed case is returned.
3291 * If 'remaining_count' is not NULL, *remaining_count will be set to how
3292 * many *other* code points are in the changed case. If non-zero and
3293 * 'remaining_list' is also not NULL, *remaining_list will be set to point
3294 * to a non-modifiable array containing the second and potentially third
3295 * code points in the changed case. (Unicode guarantees a maximum of 3.)
3296 * Note that this means that *remaining_list is undefined unless there are
3297 * multiple code points, and the caller has chosen to find out how many by
3298 * making 'remaining_count' not NULL.
3300 * 'normal' is a string to use to name the new case in any generated
3301 * messages, as a fallback if the operation being used is not available.
3303 * The casing to use is given by the data structures in the remaining
3307 PERL_ARGS_ASSERT_TO_CASE_CP_LIST;
3309 /* 'index' is guaranteed to be non-negative, as this is an inversion map
3310 * that covers all possible inputs. See [perl #133365] */
3311 index = _invlist_search(invlist, original);
3312 base = invmap[index];
3314 /* Most likely, the case change will contain just a single code point */
3315 if (remaining_count) {
3316 *remaining_count = 0;
3319 if (LIKELY(base == 0)) { /* 0 => original was unchanged by casing */
3321 /* At this bottom level routine is where we warn about illegal code
3323 if (isUNICODE_POSSIBLY_PROBLEMATIC(original)) {
3324 if (UNLIKELY(UNICODE_IS_SURROGATE(original))) {
3325 if (ckWARN_d(WARN_SURROGATE)) {
3326 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3327 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3328 "Operation \"%s\" returns its argument for"
3329 " UTF-16 surrogate U+%04" UVXf, desc, original);
3332 else if (UNLIKELY(UNICODE_IS_SUPER(original))) {
3333 if (UNLIKELY(original > MAX_LEGAL_CP)) {
3334 Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, original));
3336 if (ckWARN_d(WARN_NON_UNICODE)) {
3337 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3338 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
3339 "Operation \"%s\" returns its argument for"
3340 " non-Unicode code point 0x%04" UVXf, desc, original);
3344 /* Note that non-characters are perfectly legal, so no warning
3345 * should be given. */
3351 if (LIKELY(base > 0)) { /* means original mapped to a single code point,
3352 different from itself */
3353 return base + original - invlist_array(invlist)[index];
3356 /* Here 'base' is negative. That means the mapping is 1-to-many, and
3357 * requires an auxiliary table look up. abs(base) gives the index into a
3358 * list of such tables which points to the proper aux table. And a
3359 * parallel list gives the length of each corresponding aux table. Skip
3360 * the first entry in the *remaining returns, as it is returned by the
3363 if (remaining_count) {
3364 *remaining_count = (Size_t) (aux_table_lengths[base] - 1);
3366 if (remaining_list) {
3367 *remaining_list = aux_tables[base] + 1;
3371 return (UV) aux_tables[base][0];
3375 S__to_utf8_case(pTHX_ const UV original, const U8 *p,
3376 U8* ustrp, STRLEN *lenp,
3377 SV *invlist, const I32 * const invmap,
3378 const U32 * const * const aux_tables,
3379 const U8 * const aux_table_lengths,
3380 const char * const normal)
3382 /* Change the case of code point 'original'. If 'p' is non-NULL, it points to
3383 * the beginning of the (assumed to be valid) UTF-8 representation of
3384 * 'original'. 'normal' is a string to use to name the new case in any
3385 * generated messages, as a fallback if the operation being used is not
3386 * available. The new case is given by the data structures in the
3387 * remaining arguments.
3389 * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
3390 * entire changed case string, and the return value is the first code point
3393 * Note that the <ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes
3394 * since the changed version may be longer than the original character. */
3396 const U32 * remaining_list;
3397 Size_t remaining_count;
3398 UV first = to_case_cp_list(original,
3399 &remaining_list, &remaining_count,
3401 aux_tables, aux_table_lengths,
3404 PERL_ARGS_ASSERT__TO_UTF8_CASE;
3406 /* If the code point maps to itself and we already have its representation,
3407 * copy it instead of recalculating */
3408 if (original == first && p) {
3409 *lenp = UTF8SKIP(p);
3411 if (p != ustrp) { /* Don't copy onto itself */
3412 Copy(p, ustrp, *lenp, U8);
3419 d = uvchr_to_utf8(d, first);
3421 for (i = 0; i < remaining_count; i++) {
3422 d = uvchr_to_utf8(d, remaining_list[i]);
3433 Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to,
3434 const U32 ** remaining_folds_to)
3436 /* Returns the count of the number of code points that fold to the input
3437 * 'cp' (besides itself).
3439 * If the return is 0, there is nothing else that folds to it, and
3440 * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
3442 * If the return is 1, '*first_folds_to' is set to the single code point,
3443 * and '*remaining_folds_to' is set to NULL.
3445 * Otherwise, '*first_folds_to' is set to a code point, and
3446 * '*remaining_fold_to' is set to an array that contains the others. The
3447 * length of this array is the returned count minus 1.
3449 * The reason for this convolution is to avoid having to deal with
3450 * allocating and freeing memory. The lists are already constructed, so
3451 * the return can point to them, but single code points aren't, so would
3452 * need to be constructed if we didn't employ something like this API
3454 * The code points returned by this function are all legal Unicode, which
3455 * occupy at most 21 bits, and so a U32 is sufficient, and the lists are
3456 * constructed with this size (to save space and memory), and we return
3457 * pointers, so they must be this size */
3459 /* 'index' is guaranteed to be non-negative, as this is an inversion map
3460 * that covers all possible inputs. See [perl #133365] */
3461 SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
3462 I32 base = _Perl_IVCF_invmap[index];
3464 PERL_ARGS_ASSERT__INVERSE_FOLDS;
3466 if (base == 0) { /* No fold */
3467 *first_folds_to = 0;
3468 *remaining_folds_to = NULL;
3472 #ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */
3478 if (UNLIKELY(base < 0)) { /* Folds to more than one character */
3480 /* The data structure is set up so that the absolute value of 'base' is
3481 * an index into a table of pointers to arrays, with the array
3482 * corresponding to the index being the list of code points that fold
3483 * to 'cp', and the parallel array containing the length of the list
3485 *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
3486 *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1;
3487 /* +1 excludes first_folds_to */
3488 return IVCF_AUX_TABLE_lengths[-base];
3493 /* Only the single code point. This works like 'fc(G) = G - A + a' */
3494 *first_folds_to = (U32) (base + cp
3495 - invlist_array(PL_utf8_foldclosures)[index]);
3496 *remaining_folds_to = NULL;
3501 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
3502 U8* const ustrp, STRLEN *lenp)
3504 /* This is called when changing the case of a UTF-8-encoded character above
3505 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
3506 * result contains a character that crosses the 255/256 boundary, disallow
3507 * the change, and return the original code point. See L<perlfunc/lc> for
3510 * p points to the original string whose case was changed; assumed
3511 * by this routine to be well-formed
3512 * result the code point of the first character in the changed-case string
3513 * ustrp points to the changed-case string (<result> represents its
3515 * lenp points to the length of <ustrp> */
3517 UV original; /* To store the first code point of <p> */
3519 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3521 assert(UTF8_IS_ABOVE_LATIN1(*p));
3523 /* We know immediately if the first character in the string crosses the
3524 * boundary, so can skip testing */
3527 /* Look at every character in the result; if any cross the
3528 * boundary, the whole thing is disallowed */
3529 U8* s = ustrp + UTF8SKIP(ustrp);
3530 U8* e = ustrp + *lenp;
3532 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
3538 /* Here, no characters crossed, result is ok as-is, but we warn. */
3539 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
3545 /* Failed, have to return the original */
3546 original = valid_utf8_to_uvchr(p, lenp);
3548 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3549 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3550 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
3551 " locale; resolved to \"\\x{%" UVXf "}\".",
3555 Copy(p, ustrp, *lenp, char);
3560 S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
3561 U8 * ustrp, STRLEN *lenp)
3563 /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
3564 * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3565 * Otherwise, it returns the first code point of the Turkic foldcased
3566 * sequence, and the entire sequence will be stored in *ustrp. ustrp will
3567 * contain *lenp bytes
3569 * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3570 * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3573 PERL_ARGS_ASSERT_TURKIC_FC;
3576 if (UNLIKELY(*p == 'I')) {
3578 ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
3579 ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
3580 return LATIN_SMALL_LETTER_DOTLESS_I;
3583 if (UNLIKELY(memBEGINs(p, e - p,
3584 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
3595 S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
3596 U8 * ustrp, STRLEN *lenp)
3598 /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
3599 * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3600 * Otherwise, it returns the first code point of the Turkic lowercased
3601 * sequence, and the entire sequence will be stored in *ustrp. ustrp will
3602 * contain *lenp bytes */
3604 PERL_ARGS_ASSERT_TURKIC_LC;
3607 /* A 'I' requires context as to what to do */
3608 if (UNLIKELY(*p0 == 'I')) {
3609 const U8 * p = p0 + 1;
3611 /* According to the Unicode SpecialCasing.txt file, a capital 'I'
3612 * modified by a dot above lowercases to 'i' even in turkic locales. */
3616 if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
3622 /* For the dot above to modify the 'I', it must be part of a
3623 * combining sequence immediately following the 'I', and no other
3624 * modifier with a ccc of 230 may intervene */
3625 cp = utf8_to_uvchr_buf(p, e, NULL);
3626 if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
3630 /* Here the combining sequence continues */
3635 /* In all other cases the lc is the same as the fold */
3636 return turkic_fc(p0, e, ustrp, lenp);
3640 S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
3641 U8 * ustrp, STRLEN *lenp)
3643 /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
3644 * from p0..e-1 according to Turkic rules is the same as for non-Turkic.
3645 * Otherwise, it returns the first code point of the Turkic upper or
3646 * title-cased sequence, and the entire sequence will be stored in *ustrp.
3647 * ustrp will contain *lenp bytes
3649 * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3650 * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3653 PERL_ARGS_ASSERT_TURKIC_UC;
3658 ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3659 ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3660 return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
3663 if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
3672 /* The process for changing the case is essentially the same for the four case
3673 * change types, except there are complications for folding. Otherwise the
3674 * difference is only which case to change to. To make sure that they all do
3675 * the same thing, the bodies of the functions are extracted out into the
3676 * following two macros. The functions are written with the same variable
3677 * names, and these are known and used inside these macros. It would be
3678 * better, of course, to have inline functions to do it, but since different
3679 * macros are called, depending on which case is being changed to, this is not
3680 * feasible in C (to khw's knowledge). Two macros are created so that the fold
3681 * function can start with the common start macro, then finish with its special
3682 * handling; while the other three cases can just use the common end macro.
3684 * The algorithm is to use the proper (passed in) macro or function to change
3685 * the case for code points that are below 256. The macro is used if using
3686 * locale rules for the case change; the function if not. If the code point is
3687 * above 255, it is computed from the input UTF-8, and another macro is called
3688 * to do the conversion. If necessary, the output is converted to UTF-8. If
3689 * using a locale, we have to check that the change did not cross the 255/256
3690 * boundary, see check_locale_boundary_crossing() for further details.
3692 * The macros are split with the correct case change for the below-256 case
3693 * stored into 'result', and in the middle of an else clause for the above-255
3694 * case. At that point in the 'else', 'result' is not the final result, but is
3695 * the input code point calculated from the UTF-8. The fold code needs to
3696 * realize all this and take it from there.
3698 * To deal with Turkic locales, the function specified by the parameter
3699 * 'turkic' is called when appropriate.
3701 * If you read the two macros as sequential, it's easier to understand what's
3703 #define CASE_CHANGE_BODY_START(locale_flags, libc_change_function, L1_func, \
3704 L1_func_extra_param, turkic) \
3706 if (flags & (locale_flags)) { \
3707 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \
3708 if (IN_UTF8_CTYPE_LOCALE) { \
3709 if (UNLIKELY(IN_UTF8_TURKIC_LOCALE)) { \
3710 UV ret = turkic(p, e, ustrp, lenp); \
3711 if (ret) return ret; \
3714 /* Otherwise, treat a UTF-8 locale as not being in locale at \
3716 flags &= ~(locale_flags); \
3720 if (UTF8_IS_INVARIANT(*p)) { \
3721 if (flags & (locale_flags)) { \
3722 result = libc_change_function(*p); \
3725 return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
3728 else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \
3729 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); \
3730 if (flags & (locale_flags)) { \
3731 result = libc_change_function(c); \
3734 return L1_func(c, ustrp, lenp, L1_func_extra_param); \
3737 else { /* malformed UTF-8 or ord above 255 */ \
3738 STRLEN len_result; \
3739 result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
3740 if (len_result == (STRLEN) -1) { \
3741 _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ ); \
3744 #define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
3745 result = change_macro(result, p, ustrp, lenp); \
3747 if (flags & (locale_flags)) { \
3748 result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
3753 /* Here, used locale rules. Convert back to UTF-8 */ \
3754 if (UTF8_IS_INVARIANT(result)) { \
3755 *ustrp = (U8) result; \
3759 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); \
3760 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \
3766 /* Not currently externally documented, and subject to change:
3767 * <flags> is set iff the rules from the current underlying locale are to
3771 Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
3779 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
3781 /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
3782 /* 2nd char of uc(U+DF) is 'S' */
3783 CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 'S',
3785 CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
3788 /* Not currently externally documented, and subject to change:
3789 * <flags> is set iff the rules from the current underlying locale are to be
3790 * used. Since titlecase is not defined in POSIX, for other than a
3791 * UTF-8 locale, uppercase is used instead for code points < 256.
3795 Perl__to_utf8_title_flags(pTHX_ const U8 *p,
3803 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
3805 /* 2nd char of ucfirst(U+DF) is 's' */
3806 CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 's',
3808 CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
3811 /* Not currently externally documented, and subject to change:
3812 * <flags> is set iff the rules from the current underlying locale are to
3817 Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
3825 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
3827 CASE_CHANGE_BODY_START(~0, tolower, to_lower_latin1, 0 /* 0 is dummy */,
3829 CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
3832 /* Not currently externally documented, and subject to change,
3834 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3835 * locale are to be used.
3836 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
3837 * otherwise simple folds
3838 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
3843 Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
3851 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
3853 /* These are mutually exclusive */
3854 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
3856 assert(p != ustrp); /* Otherwise overwrites */
3858 CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, tolower, _to_fold_latin1,
3859 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)),
3862 result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
3864 if (flags & FOLD_FLAGS_LOCALE) {
3866 # define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
3867 # ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3868 # define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3870 /* Special case these two characters, as what normally gets
3871 * returned under locale doesn't work */
3872 if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
3874 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3875 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3876 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3877 "resolved to \"\\x{17F}\\x{17F}\".");
3882 if (memBEGINs((char *) p, e - p, LONG_S_T))
3884 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3885 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3886 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3887 "resolved to \"\\x{FB06}\".");
3888 goto return_ligature_st;
3891 #if UNICODE_MAJOR_VERSION == 3 \
3892 && UNICODE_DOT_VERSION == 0 \
3893 && UNICODE_DOT_DOT_VERSION == 1
3894 # define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3896 /* And special case this on this Unicode version only, for the same
3897 * reaons the other two are special cased. They would cross the
3898 * 255/256 boundary which is forbidden under /l, and so the code
3899 * wouldn't catch that they are equivalent (which they are only in
3901 else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
3902 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3903 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3904 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3905 "resolved to \"\\x{0131}\".");
3906 goto return_dotless_i;
3910 return check_locale_boundary_crossing(p, result, ustrp, lenp);
3912 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3916 /* This is called when changing the case of a UTF-8-encoded
3917 * character above the ASCII range, and the result should not
3918 * contain an ASCII character. */
3920 UV original; /* To store the first code point of <p> */
3922 /* Look at every character in the result; if any cross the
3923 * boundary, the whole thing is disallowed */
3925 U8* send = ustrp + *lenp;
3928 /* Crossed, have to return the original */
3929 original = valid_utf8_to_uvchr(p, lenp);
3931 /* But in these instances, there is an alternative we can
3932 * return that is valid */
3933 if (original == LATIN_SMALL_LETTER_SHARP_S
3934 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3935 || original == LATIN_CAPITAL_LETTER_SHARP_S
3940 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3941 goto return_ligature_st;
3943 #if UNICODE_MAJOR_VERSION == 3 \
3944 && UNICODE_DOT_VERSION == 0 \
3945 && UNICODE_DOT_DOT_VERSION == 1
3947 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3948 goto return_dotless_i;
3951 Copy(p, ustrp, *lenp, char);
3957 /* Here, no characters crossed, result is ok as-is */
3962 /* Here, used locale rules. Convert back to UTF-8 */
3963 if (UTF8_IS_INVARIANT(result)) {
3964 *ustrp = (U8) result;
3968 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3969 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
3976 /* Certain folds to 'ss' are prohibited by the options, but they do allow
3977 * folds to a string of two of these characters. By returning this
3978 * instead, then, e.g.,
3979 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3982 *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
3983 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3985 return LATIN_SMALL_LETTER_LONG_S;
3988 /* Two folds to 'st' are prohibited by the options; instead we pick one and
3989 * have the other one fold to it */
3991 *lenp = STRLENs(LATIN_SMALL_LIGATURE_ST_UTF8);
3992 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
3993 return LATIN_SMALL_LIGATURE_ST;
3995 #if UNICODE_MAJOR_VERSION == 3 \
3996 && UNICODE_DOT_VERSION == 0 \
3997 && UNICODE_DOT_DOT_VERSION == 1
4000 *lenp = STRLENs(LATIN_SMALL_LETTER_DOTLESS_I_UTF8);
4001 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
4002 return LATIN_SMALL_LETTER_DOTLESS_I;
4009 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
4011 /* May change: warns if surrogates, non-character code points, or
4012 * non-Unicode code points are in 's' which has length 'len' bytes.
4013 * Returns TRUE if none found; FALSE otherwise. The only other validity
4014 * check is to make sure that this won't exceed the string's length nor
4017 const U8* const e = s + len;
4020 PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
4023 if (UTF8SKIP(s) > len) {
4024 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
4025 "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
4028 if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
4029 if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
4030 if ( ckWARN_d(WARN_NON_UNICODE)
4031 || UNLIKELY(0 < does_utf8_overflow(s, s + len,
4032 0 /* Don't consider overlongs */
4035 /* A side effect of this function will be to warn */
4036 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
4040 else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
4041 if (ckWARN_d(WARN_SURROGATE)) {
4042 /* This has a different warning than the one the called
4043 * function would output, so can't just call it, unlike we
4044 * do for the non-chars and above-unicodes */
4045 UV uv = utf8_to_uvchr_buf(s, e, NULL);
4046 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
4047 "Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
4052 else if ( UNLIKELY(UTF8_IS_NONCHAR(s, e))
4053 && (ckWARN_d(WARN_NONCHAR)))
4055 /* A side effect of this function will be to warn */
4056 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
4067 =for apidoc pv_uni_display
4069 Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string
4070 C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes
4071 long (if longer, the rest is truncated and C<"..."> will be appended).
4073 The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
4074 C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
4075 to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
4076 (C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
4077 C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
4078 C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
4080 Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a
4081 backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set.
4083 The pointer to the PV of the C<dsv> is returned.
4085 See also L</sv_uni_display>.
4087 =for apidoc Amnh||UNI_DISPLAY_BACKSLASH
4088 =for apidoc Amnh||UNI_DISPLAY_BACKSPACE
4089 =for apidoc Amnh||UNI_DISPLAY_ISPRINT
4090 =for apidoc Amnh||UNI_DISPLAY_QQ
4091 =for apidoc Amnh||UNI_DISPLAY_REGEX
4095 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim,
4101 PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
4105 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
4109 if (pvlim && SvCUR(dsv) >= pvlim) {
4113 u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
4115 const U8 c = (U8) u;
4116 if (flags & UNI_DISPLAY_BACKSLASH) {
4117 if ( isMNEMONIC_CNTRL(c)
4119 || (flags & UNI_DISPLAY_BACKSPACE)))
4121 const char * mnemonic = cntrl_to_mnemonic(c);
4122 sv_catpvn(dsv, mnemonic, strlen(mnemonic));
4125 else if (c == '\\') {
4126 sv_catpvs(dsv, "\\\\");
4130 /* isPRINT() is the locale-blind version. */
4131 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
4132 const char string = c;
4133 sv_catpvn(dsv, &string, 1);
4138 Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u);
4141 sv_catpvs(dsv, "...");
4147 =for apidoc sv_uni_display
4149 Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
4150 the displayable version being at most C<pvlim> bytes long
4151 (if longer, the rest is truncated and "..." will be appended).
4153 The C<flags> argument is as in L</pv_uni_display>().
4155 The pointer to the PV of the C<dsv> is returned.
4160 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4162 const char * const ptr =
4163 isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
4165 PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4167 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
4168 SvCUR(ssv), pvlim, flags);
4172 =for apidoc foldEQ_utf8
4174 Returns true if the leading portions of the strings C<s1> and C<s2> (either or
4175 both of which may be in UTF-8) are the same case-insensitively; false
4176 otherwise. How far into the strings to compare is determined by other input
4179 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
4180 otherwise it is assumed to be in native 8-bit encoding. Correspondingly for
4181 C<u2> with respect to C<s2>.
4183 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for
4184 fold equality. In other words, C<s1>+C<l1> will be used as a goal to reach.
4185 The scan will not be considered to be a match unless the goal is reached, and
4186 scanning won't continue past that goal. Correspondingly for C<l2> with respect
4189 If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that
4190 pointer is considered an end pointer to the position 1 byte past the maximum
4191 point in C<s1> beyond which scanning will not continue under any circumstances.
4192 (This routine assumes that UTF-8 encoded input strings are not malformed;
4193 malformed input can cause it to read past C<pe1>). This means that if both
4194 C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match
4195 will never be successful because it can never
4196 get as far as its goal (and in fact is asserted against). Correspondingly for
4197 C<pe2> with respect to C<s2>.
4199 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4200 C<l2> must be non-zero), and if both do, both have to be
4201 reached for a successful match. Also, if the fold of a character is multiple
4202 characters, all of them must be matched (see tr21 reference below for
4205 Upon a successful match, if C<pe1> is non-C<NULL>,
4206 it will be set to point to the beginning of the I<next> character of C<s1>
4207 beyond what was matched. Correspondingly for C<pe2> and C<s2>.
4209 For case-insensitiveness, the "casefolding" of Unicode is used
4210 instead of upper/lowercasing both the characters, see
4211 L<https://www.unicode.org/reports/tr21/> (Case Mappings).
4213 =for apidoc Cmnh||FOLDEQ_UTF8_NOMIX_ASCII
4214 =for apidoc Cmnh||FOLDEQ_LOCALE
4215 =for apidoc Cmnh||FOLDEQ_S1_ALREADY_FOLDED
4216 =for apidoc Cmnh||FOLDEQ_S1_FOLDS_SANE
4217 =for apidoc Cmnh||FOLDEQ_S2_ALREADY_FOLDED
4218 =for apidoc Cmnh||FOLDEQ_S2_FOLDS_SANE
4222 /* A flags parameter has been added which may change, and hence isn't
4223 * externally documented. Currently it is:
4224 * 0 for as-documented above
4225 * FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4226 ASCII one, to not match
4227 * FOLDEQ_LOCALE is set iff the rules from the current underlying
4228 * locale are to be used.
4229 * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
4230 * routine. This allows that step to be skipped.
4231 * Currently, this requires s1 to be encoded as UTF-8
4232 * (u1 must be true), which is asserted for.
4233 * FOLDEQ_S1_FOLDS_SANE With either NOMIX_ASCII or LOCALE, no folds may
4234 * cross certain boundaries. Hence, the caller should
4235 * let this function do the folding instead of
4236 * pre-folding. This code contains an assertion to
4237 * that effect. However, if the caller knows what
4238 * it's doing, it can pass this flag to indicate that,
4239 * and the assertion is skipped.
4240 * FOLDEQ_S2_ALREADY_FOLDED Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
4241 * to s2, and s2 doesn't have to be UTF-8 encoded.
4242 * This introduces an asymmetry to save a few branches
4243 * in a loop. Currently, this is not a problem, as
4244 * never are both inputs pre-folded. Simply call this
4245 * function with the pre-folded one as the second
4247 * FOLDEQ_S2_FOLDS_SANE
4251 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
4252 const char *s2, char **pe2, UV l2, bool u2,
4255 const U8 *p1 = (const U8*)s1; /* Point to current char */
4256 const U8 *p2 = (const U8*)s2;
4257 const U8 *g1 = NULL; /* goal for s1 */
4258 const U8 *g2 = NULL;
4259 const U8 *e1 = NULL; /* Don't scan s1 past this */
4260 U8 *f1 = NULL; /* Point to current folded */
4261 const U8 *e2 = NULL;
4263 STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
4264 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4265 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
4266 U8 flags_for_folder = FOLD_FLAGS_FULL;
4268 PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
4270 assert( ! ( (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
4271 && (( (flags & FOLDEQ_S1_ALREADY_FOLDED)
4272 && !(flags & FOLDEQ_S1_FOLDS_SANE))
4273 || ( (flags & FOLDEQ_S2_ALREADY_FOLDED)
4274 && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
4275 /* The algorithm is to trial the folds without regard to the flags on
4276 * the first line of the above assert(), and then see if the result
4277 * violates them. This means that the inputs can't be pre-folded to a
4278 * violating result, hence the assert. This could be changed, with the
4279 * addition of extra tests here for the already-folded case, which would
4280 * slow it down. That cost is more than any possible gain for when these
4281 * flags are specified, as the flags indicate /il or /iaa matching which
4282 * is less common than /iu, and I (khw) also believe that real-world /il
4283 * and /iaa matches are most likely to involve code points 0-255, and this
4284 * function only under rare conditions gets called for 0-255. */
4286 if (flags & FOLDEQ_LOCALE) {
4287 if (IN_UTF8_CTYPE_LOCALE) {
4288 if (UNLIKELY(IN_UTF8_TURKIC_LOCALE)) {
4289 flags_for_folder |= FOLD_FLAGS_LOCALE;
4292 flags &= ~FOLDEQ_LOCALE;
4296 flags_for_folder |= FOLD_FLAGS_LOCALE;
4299 if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
4300 flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
4308 g1 = (const U8*)s1 + l1;
4316 g2 = (const U8*)s2 + l2;
4319 /* Must have at least one goal */
4324 /* Will never match if goal is out-of-bounds */
4325 assert(! e1 || e1 >= g1);
4327 /* Here, there isn't an end pointer, or it is beyond the goal. We
4328 * only go as far as the goal */
4332 assert(e1); /* Must have an end for looking at s1 */
4335 /* Same for goal for s2 */
4337 assert(! e2 || e2 >= g2);
4344 /* If both operands are already folded, we could just do a memEQ on the
4345 * whole strings at once, but it would be better if the caller realized
4346 * this and didn't even call us */
4348 /* Look through both strings, a character at a time */
4349 while (p1 < e1 && p2 < e2) {
4351 /* If at the beginning of a new character in s1, get its fold to use
4352 * and the length of the fold. */
4354 if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4360 if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
4362 /* We have to forbid mixing ASCII with non-ASCII if the
4363 * flags so indicate. And, we can short circuit having to
4364 * call the general functions for this common ASCII case,
4365 * all of whose non-locale folds are also ASCII, and hence
4366 * UTF-8 invariants, so the UTF8ness of the strings is not
4368 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4372 *foldbuf1 = toFOLD(*p1);
4375 _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
4377 else { /* Not UTF-8, get UTF-8 fold */
4378 _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
4384 if (n2 == 0) { /* Same for s2 */
4385 if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4387 /* Point to the already-folded character. But for non-UTF-8
4388 * variants, convert to UTF-8 for the algorithm below */
4389 if (UTF8_IS_INVARIANT(*p2)) {
4398 foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
4399 foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
4405 if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
4406 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4410 *foldbuf2 = toFOLD(*p2);
4413 _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
4416 _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
4422 /* Here f1 and f2 point to the beginning of the strings to compare.
4423 * These strings are the folds of the next character from each input
4424 * string, stored in UTF-8. */
4426 /* While there is more to look for in both folds, see if they
4427 * continue to match */
4429 U8 fold_length = UTF8SKIP(f1);
4430 if (fold_length != UTF8SKIP(f2)
4431 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4432 function call for single
4434 || memNE((char*)f1, (char*)f2, fold_length))
4436 return 0; /* mismatch */
4439 /* Here, they matched, advance past them */
4446 /* When reach the end of any fold, advance the input past it */
4448 p1 += u1 ? UTF8SKIP(p1) : 1;
4451 p2 += u2 ? UTF8SKIP(p2) : 1;
4453 } /* End of loop through both strings */
4455 /* A match is defined by each scan that specified an explicit length
4456 * reaching its final goal, and the other not having matched a partial
4457 * character (which can happen when the fold of a character is more than one
4459 if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
4463 /* Successful match. Set output pointers */
4474 * ex: set ts=8 sts=4 sw=4 et: