3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4 * by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
20 * as is the custom in the West, if you wish to be answered?'
21 * --Gandalf, addressing Théoden's door wardens
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
32 #define PERL_IN_UTF8_C
34 #include "invlist_inline.h"
36 static const char malformed_text[] = "Malformed UTF-8 character";
37 static const char unees[] =
38 "Malformed UTF-8 character (unexpected end of string)";
41 These are various utility functions for manipulating UTF8-encoded
42 strings. For the uninitiated, this is a method of representing arbitrary
43 Unicode characters as a variable number of bytes, in such a way that
44 characters in the ASCII range are unmodified, and a zero byte never appears
45 within non-zero characters.
48 /* helper for Perl__force_out_malformed_utf8_message(). Like
49 * SAVECOMPILEWARNINGS(), but works with PL_curcop rather than
53 S_restore_cop_warnings(pTHX_ void *p)
55 free_and_set_cop_warnings(PL_curcop, (STRLEN*) p);
60 Perl__force_out_malformed_utf8_message(pTHX_
61 const U8 *const p, /* First byte in UTF-8 sequence */
62 const U8 * const e, /* Final byte in sequence (may include
64 const U32 flags, /* Flags to pass to utf8n_to_uvchr(),
65 usually 0, or some DISALLOW flags */
66 const bool die_here) /* If TRUE, this function does not return */
68 /* This core-only function is to be called when a malformed UTF-8 character
69 * is found, in order to output the detailed information about the
70 * malformation before dieing. The reason it exists is for the occasions
71 * when such a malformation is fatal, but warnings might be turned off, so
72 * that normally they would not be actually output. This ensures that they
73 * do get output. Because a sequence may be malformed in more than one
74 * way, multiple messages may be generated, so we can't make them fatal, as
75 * that would cause the first one to die.
77 * Instead we pretend -W was passed to perl, then die afterwards. The
78 * flexibility is here to return to the caller so they can finish up and
82 PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
88 PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
90 /* this is like SAVECOMPILEWARNINGS() except with PL_curcop rather
91 * than PL_compiling */
92 SAVEDESTRUCTOR_X(S_restore_cop_warnings,
93 (void*)PL_curcop->cop_warnings);
94 PL_curcop->cop_warnings = pWARN_ALL;
97 (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
102 Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
103 " be called only when there are errors found");
107 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
112 S_new_msg_hv(pTHX_ const char * const message, /* The message text */
113 U32 categories, /* Packed warning categories */
114 U32 flag) /* Flag associated with this message */
116 /* Creates, populates, and returns an HV* that describes an error message
117 * for the translators between UTF8 and code point */
119 SV* msg_sv = newSVpv(message, 0);
120 SV* category_sv = newSVuv(categories);
121 SV* flag_bit_sv = newSVuv(flag);
123 HV* msg_hv = newHV();
125 PERL_ARGS_ASSERT_NEW_MSG_HV;
127 (void) hv_stores(msg_hv, "text", msg_sv);
128 (void) hv_stores(msg_hv, "warn_categories", category_sv);
129 (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv);
135 =for apidoc uvoffuni_to_utf8_flags
137 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
138 Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or
139 L<perlapi/uvchr_to_utf8_flags>>.
141 This function is like them, but the input is a strict Unicode
142 (as opposed to native) code point. Only in very rare circumstances should code
143 not be using the native code point.
145 For details, see the description for L<perlapi/uvchr_to_utf8_flags>.
151 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
153 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
155 return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
158 /* All these formats take a single UV code point argument */
159 const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
160 const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
161 " is not recommended for open interchange";
162 const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
163 " may not be portable";
165 /* Use shorter names internally in this file */
166 #define SHIFT UTF_ACCUMULATION_SHIFT
168 #define MARK UTF_CONTINUATION_MARK
169 #define MASK UTF_CONTINUATION_MASK
172 =for apidoc uvchr_to_utf8_flags_msgs
174 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
176 Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
178 This function is for code that wants any warning and/or error messages to be
179 returned to the caller rather than be displayed. All messages that would have
180 been displayed if all lexical warnings are enabled will be returned.
182 It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
183 placed after all the others, C<msgs>. If this parameter is 0, this function
184 behaves identically to C<L</uvchr_to_utf8_flags>>. Otherwise, C<msgs> should
185 be a pointer to an C<HV *> variable, in which this function creates a new HV to
186 contain any appropriate messages. The hash has three key-value pairs, as
193 The text of the message as a C<SVpv>.
195 =item C<warn_categories>
197 The warning category (or categories) packed into a C<SVuv>.
201 A single flag bit associated with this message, in a C<SVuv>.
202 The bit corresponds to some bit in the C<*errors> return value,
203 such as C<UNICODE_GOT_SURROGATE>.
207 It's important to note that specifying this parameter as non-null will cause
208 any warnings this function would otherwise generate to be suppressed, and
209 instead be placed in C<*msgs>. The caller can check the lexical warnings state
210 (or not) when choosing what to do with the returned messages.
212 The caller, of course, is responsible for freeing any returned HV.
217 /* Undocumented; we don't want people using this. Instead they should use
218 * uvchr_to_utf8_flags_msgs() */
220 Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, UV flags, HV** msgs)
223 UV shifted_uv = input_uv;
224 STRLEN utf8_skip = OFFUNISKIP(input_uv);
226 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
234 *d++ = LATIN1_TO_NATIVE(input_uv);
238 if ( UNLIKELY(input_uv > MAX_LEGAL_CP
239 && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX))))
241 Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, /* Hex output */
245 if ((flags & (UNICODE_WARN_PERL_EXTENDED|UNICODE_WARN_SUPER))) {
246 U32 category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
247 const char * format = PL_extended_cp_format;
249 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
251 UNICODE_GOT_PERL_EXTENDED);
254 Perl_ck_warner_d(aTHX_ category, format, input_uv);
257 /* Don't output a 2nd msg */
258 flags &= ~UNICODE_WARN_SUPER;
261 if (flags & UNICODE_DISALLOW_PERL_EXTENDED) {
265 p = d + utf8_skip - 1;
266 while (p >= d + 6 + ONE_IF_EBCDIC_ZERO_IF_NOT) {
267 *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
268 shifted_uv >>= SHIFT;
273 case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:
274 d[5 + ONE_IF_EBCDIC_ZERO_IF_NOT]
275 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
276 shifted_uv >>= SHIFT;
279 case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:
280 d[4 + ONE_IF_EBCDIC_ZERO_IF_NOT]
281 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
282 shifted_uv >>= SHIFT;
285 case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
286 if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) {
287 if (flags & UNICODE_WARN_SUPER) {
288 U32 category = packWARN(WARN_NON_UNICODE);
289 const char * format = super_cp_format;
292 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
297 Perl_ck_warner_d(aTHX_ category, format, input_uv);
300 if (flags & UNICODE_DISALLOW_SUPER) {
304 if ( (flags & UNICODE_DISALLOW_SUPER)
305 || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
306 && UNICODE_IS_PERL_EXTENDED(input_uv)))
312 d[3 + ONE_IF_EBCDIC_ZERO_IF_NOT]
313 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
314 shifted_uv >>= SHIFT;
317 case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
318 if (isUNICODE_POSSIBLY_PROBLEMATIC(input_uv)) {
319 if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) {
320 if (flags & UNICODE_WARN_NONCHAR) {
321 U32 category = packWARN(WARN_NONCHAR);
322 const char * format = nonchar_cp_format;
324 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
326 UNICODE_GOT_NONCHAR);
329 Perl_ck_warner_d(aTHX_ category, format, input_uv);
332 if (flags & UNICODE_DISALLOW_NONCHAR) {
336 else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
337 if (flags & UNICODE_WARN_SURROGATE) {
338 U32 category = packWARN(WARN_SURROGATE);
339 const char * format = surrogate_cp_format;
341 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
343 UNICODE_GOT_SURROGATE);
346 Perl_ck_warner_d(aTHX_ category, format, input_uv);
349 if (flags & UNICODE_DISALLOW_SURROGATE) {
355 d[2 + ONE_IF_EBCDIC_ZERO_IF_NOT]
356 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
357 shifted_uv >>= SHIFT;
363 d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
364 shifted_uv >>= SHIFT;
371 d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
372 shifted_uv >>= SHIFT;
373 d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip))
374 | UTF_START_MARK(utf8_skip));
378 return d + utf8_skip;
382 =for apidoc uvchr_to_utf8
384 Adds the UTF-8 representation of the native code point C<uv> to the end
385 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
386 C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
387 the byte after the end of the new character. In other words,
389 d = uvchr_to_utf8(d, uv);
391 is the recommended wide native character-aware way of saying
395 This function accepts any code point from 0..C<IV_MAX> as input.
396 C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
398 It is possible to forbid or warn on non-Unicode code points, or those that may
399 be problematic by using L</uvchr_to_utf8_flags>.
404 /* This is also a macro */
405 PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
408 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
410 return uvchr_to_utf8(d, uv);
414 =for apidoc uvchr_to_utf8_flags
416 Adds the UTF-8 representation of the native code point C<uv> to the end
417 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
418 C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
419 the byte after the end of the new character. In other words,
421 d = uvchr_to_utf8_flags(d, uv, flags);
425 d = uvchr_to_utf8_flags(d, uv, 0);
427 This is the Unicode-aware way of saying
431 If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
432 input. C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
434 Specifying C<flags> can further restrict what is allowed and not warned on, as
437 If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
438 the function will raise a warning, provided UTF8 warnings are enabled. If
439 instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
440 NULL. If both flags are set, the function will both warn and return NULL.
442 Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
443 affect how the function handles a Unicode non-character.
445 And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
446 affect the handling of code points that are above the Unicode maximum of
447 0x10FFFF. Languages other than Perl may not be able to accept files that
450 The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
451 the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
452 three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
453 allowed inputs to the strict UTF-8 traditionally defined by Unicode.
454 Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
455 C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
456 above-Unicode and surrogate flags, but not the non-character ones, as
458 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
459 See L<perlunicode/Noncharacter code points>.
461 Extremely high code points were never specified in any standard, and require an
462 extension to UTF-8 to express, which Perl does. It is likely that programs
463 written in something other than Perl would not be able to read files that
464 contain these; nor would Perl understand files written by something that uses a
465 different extension. For these reasons, there is a separate set of flags that
466 can warn and/or disallow these extremely high code points, even if other
467 above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
468 and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
469 C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
470 treat all above-Unicode code points, including these, as malformations. (Note
471 that the Unicode standard considers anything above 0x10FFFF to be illegal, but
472 there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
474 A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
475 retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly,
476 C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
477 C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because on EBCDIC
478 platforms,these flags can apply to code points that actually do fit in 31 bits.
479 The new names accurately describe the situation in all cases.
481 =for apidoc Amnh||UNICODE_DISALLOW_ABOVE_31_BIT
482 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE
483 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_INTERCHANGE
484 =for apidoc Amnh||UNICODE_DISALLOW_NONCHAR
485 =for apidoc Amnh||UNICODE_DISALLOW_PERL_EXTENDED
486 =for apidoc Amnh||UNICODE_DISALLOW_SUPER
487 =for apidoc Amnh||UNICODE_DISALLOW_SURROGATE
488 =for apidoc Amnh||UNICODE_WARN_ABOVE_31_BIT
489 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_C9_INTERCHANGE
490 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_INTERCHANGE
491 =for apidoc Amnh||UNICODE_WARN_NONCHAR
492 =for apidoc Amnh||UNICODE_WARN_PERL_EXTENDED
493 =for apidoc Amnh||UNICODE_WARN_SUPER
494 =for apidoc Amnh||UNICODE_WARN_SURROGATE
499 /* This is also a macro */
500 PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
503 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
505 return uvchr_to_utf8_flags(d, uv, flags);
508 PERL_STATIC_INLINE int
509 S_is_utf8_overlong(const U8 * const s, const STRLEN len)
511 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
512 * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
513 * it isn't, and -1 if there isn't enough information to tell. This last
514 * return value can happen if the sequence is incomplete, missing some
515 * trailing bytes that would form a complete character. If there are
516 * enough bytes to make a definitive decision, this function does so.
517 * Usually 2 bytes are sufficient.
519 * Overlongs can occur whenever the number of continuation bytes changes.
520 * That means whenever the number of leading 1 bits in a start byte
521 * increases from the next lower start byte. That happens for start bytes
522 * C0, E0, F0, F8, FC, FE, and FF.
525 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG;
527 /* Each platform has overlongs after the start bytes given above (expressed
528 * in I8 for EBCDIC). The values below were found by manually inspecting
529 * the UTF-8 patterns. See the tables in utf8.h and utfebcdic.h. */
531 switch (NATIVE_UTF8_TO_I8(s[0])) {
533 assert(UTF8_IS_START(s[0]));
549 return (len < 2) ? -1 : s[1] < 0xA0;
555 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x10;
559 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x08;
563 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x04;
567 : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x02;
569 return isFF_overlong(s, len);
573 PERL_STATIC_INLINE int
574 S_isFF_overlong(const U8 * const s, const STRLEN len)
576 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
577 * 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if
578 * it isn't, and -1 if there isn't enough information to tell. This last
579 * return value can happen if the sequence is incomplete, missing some
580 * trailing bytes that would form a complete character. If there are
581 * enough bytes to make a definitive decision, this function does so. */
583 PERL_ARGS_ASSERT_ISFF_OVERLONG;
586 /* This works on all three EBCDIC code pages traditionally supported by
588 # define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
590 # define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
593 /* To be an FF overlong, all the available bytes must match */
594 if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
595 MIN(len, STRLENs(FF_OVERLONG_PREFIX)))))
600 /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
601 * be there; what comes after them doesn't matter. See tables in utf8.h,
603 if (len >= STRLENs(FF_OVERLONG_PREFIX)) {
607 /* The missing bytes could cause the result to go one way or the other, so
608 * the result is indeterminate */
612 /* At some point we may want to allow core to use up to UV_MAX */
614 #ifdef EBCDIC /* Actually is I8 */
615 # if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1, UV_MAX 2**64-1 */
616 # define HIGHEST_REPRESENTABLE_UTF "\xFF\xA7"
617 /* UV_MAX "\xFF\xAF" */
618 # else /* These assume IV_MAX is 2**31-1, UV_MAX 2**32-1 */
619 # define HIGHEST_REPRESENTABLE_UTF "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1"
620 /* UV_MAX "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3" */
623 # if defined(UV_IS_QUAD)
624 # define HIGHEST_REPRESENTABLE_UTF "\xFF\x80\x87"
625 /* UV_MAX "\xFF\x80" */
627 # define HIGHEST_REPRESENTABLE_UTF "\xFD"
628 /* UV_MAX "\xFE\x83" */
632 PERL_STATIC_INLINE int
633 S_does_utf8_overflow(const U8 * const s,
635 const bool consider_overlongs)
637 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
638 * 'e' - 1 would overflow an IV on this platform; that is if it represents
639 * a code point larger than the highest representable code point. It
640 * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
641 * enough information to tell. This last return value can happen if the
642 * sequence is incomplete, missing some trailing bytes that would form a
643 * complete character. If there are enough bytes to make a definitive
644 * decision, this function does so.
646 * If 'consider_overlongs' is TRUE, the function checks for the possibility
647 * that the sequence is an overlong that doesn't overflow. Otherwise, it
648 * assumes the sequence is not an overlong. This can give different
649 * results only on ASCII 32-bit platforms.
651 * (For ASCII platforms, we could use memcmp() because we don't have to
652 * convert each byte to I8, but it's very rare input indeed that would
653 * approach overflow, so the loop below will likely only get executed once.)
656 const STRLEN len = e - s;
658 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF;
661 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
663 for (x = s; x < e; x++, y++) {
665 /* 'y' is set up to not include the trailing bytes that are all the
666 * maximum possible continuation byte. So when we reach the end of 'y'
667 * (known to be NUL terminated), it is impossible for 'x' to contain
668 * bytes larger than those omitted bytes, and therefore 'x' can't
674 /* If this byte is less than the corresponding highest non-overflowing
675 * UTF-8, the sequence doesn't overflow */
676 if (NATIVE_UTF8_TO_I8(*x) < *y) {
680 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
681 goto overflows_if_not_overlong;
685 /* Got to the end, and all bytes are the same. If the input is a whole
686 * character, it doesn't overflow. And if it is a partial character,
687 * there's not enough information to tell */
688 return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1;
690 overflows_if_not_overlong:
692 /* Here, a well-formed sequence overflows. If we are assuming
693 * well-formedness, return that it overflows. */
694 if (! consider_overlongs) {
698 /* Here, it could be the overlong malformation, and might not actuallly
699 * overflow if you were to calculate it out.
701 * See if it actually is overlong */
702 is_overlong = is_utf8_overlong(s, len);
704 /* If it isn't overlong, is well-formed, so overflows */
705 if (is_overlong == 0) {
709 /* Not long enough to determine */
710 if (is_overlong < 0) {
714 /* Here, it appears to overflow, but it is also overlong */
716 #if 6 * UTF_CONTINUATION_BYTE_INFO_BITS <= IVSIZE * CHARBITS
718 /* On many platforms, it is impossible for an overlong to overflow. For
719 * these, no further work is necessary: we can return immediately that this
720 * overlong that is an apparent overflow actually isn't
722 * To see why, note that a length_N sequence can represent as overlongs all
723 * the code points representable by shorter length sequences, but no
724 * higher. If it could represent a higher code point without being an
725 * overlong, we wouldn't have had to increase the sequence length!
727 * The highest possible start byte is FF; the next highest is FE. The
728 * highest code point representable as an overlong on the platform is thus
729 * the highest code point representable by a non-overlong sequence whose
730 * start byte is FE. If that value doesn't overflow the platform's word
731 * size, overlongs can't overflow.
733 * FE consists of 7 bytes total; the FE start byte contributes 0 bits of
734 * information (the high 7 bits, all ones, say that the sequence is 7 bytes
735 * long, and the bottom, zero, bit is s placeholder. That leaves the 6
736 * continuation bytes to contribute UTF_CONTINUATION_BYTE_INFO_BITS each.
737 If that number of bits doesn't exceed the word size, it can't overflow. */
743 /* In practice, only a 32-bit ASCII box gets here. The FE start byte can
744 * represent, as an overlong, the highest code point representable by an FD
745 * start byte, which is 5*6 continuation bytes of info plus one bit from
746 * the start byte, or 31 bits. That doesn't overflow. More explicitly:
747 * \xFD\xBF\xBF\xBF\xBF\xBF evaluates to 0x7FFFFFFF = 2*31 - 1.
749 * That means only the FF start byte can have an overflowing overlong. */
754 /* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that
755 * evaluates to 2**31, so overflows an IV. For a UV it's
756 * \xff\x80\x80\x80\x80\x80\x80\x83 = 2**32 */
757 # define OVERFLOWS "\xff\x80\x80\x80\x80\x80\x80\x82"
759 if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS)) { /* Not enough info */
763 # define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0)
765 return strnGE((const char *) s, OVERFLOWS, STRLENs(OVERFLOWS));
772 Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
774 SSize_t len, full_len;
776 /* An internal helper function.
779 * 's' is a string, which is known to be syntactically valid UTF-8 as far
780 * as (e - 1); e > s must hold.
781 * 'e' This function is allowed to look at any byte from 's'...'e-1', but
782 * nowhere else. The function has to cope as best it can if that
783 * sequence does not form a full character.
784 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
785 * accepted by L</utf8n_to_uvchr>. If non-zero, this function returns
786 * 0 if it determines the input will match something disallowed.
788 * The return is the number of bytes required to represent the code point
789 * if it isn't disallowed by 'flags'; 0 otherwise. Be aware that if the
790 * input is for a partial character, a successful return will be larger
793 * If *s..*(e-1) is only for a partial character, the function will return
794 * non-zero if there is any sequence of well-formed UTF-8 that, when
795 * appended to the input sequence, could result in an allowed code point;
796 * otherwise it returns 0. Non characters cannot be determined based on
797 * partial character input. But many of the other excluded types can be
798 * determined with just the first one or two bytes.
802 PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_;
805 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
806 |UTF8_DISALLOW_PERL_EXTENDED)));
808 full_len = UTF8SKIP(s);
811 if (len > full_len) {
819 default: /* Extended */
820 if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
826 case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */
827 case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */
829 if (flags & UTF8_DISALLOW_SUPER) {
830 return 0; /* Above Unicode */
835 case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
836 is_super = ( UNLIKELY(NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_)
838 && NATIVE_UTF8_TO_I8(s[0]) == UTF_START_BYTE_110000_
839 && NATIVE_UTF8_TO_I8(s[1])
840 >= UTF_FIRST_CONT_BYTE_110000_));
842 if (flags & UTF8_DISALLOW_SUPER) {
846 else if ( (flags & UTF8_DISALLOW_NONCHAR)
848 && UNLIKELY(is_LARGER_NON_CHARS_utf8(s)))
855 case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
857 if (! isUTF8_POSSIBLY_PROBLEMATIC(s[0]) || len < 2) {
861 if ( (flags & UTF8_DISALLOW_SURROGATE)
862 && UNLIKELY(is_SURROGATE_utf8(s)))
864 return 0; /* Surrogate */
867 if ( (flags & UTF8_DISALLOW_NONCHAR)
869 && UNLIKELY(is_SHORTER_NON_CHARS_utf8(s)))
876 /* The lower code points don't have any disallowable characters */
889 Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
890 const bool require_partial)
892 /* This is called to determine if the UTF-8 sequence starting at s0 and
893 * continuing for up to one full character of bytes, but looking no further
894 * than 'e - 1', is legal. *s0 must be 0xFF (or whatever the native
895 * equivalent of FF in I8 on EBCDIC platforms is). This marks it as being
896 * for the largest code points recognized by Perl, the ones that require
897 * the most UTF-8 bytes per character to represent (somewhat less than
898 * twice the size of the next longest kind). This sequence will only ever
899 * be Perl extended UTF-8.
901 * The routine returns 0 if the sequence is not fully valid, syntactically
902 * or semantically. That means it checks that everything following the
903 * start byte is a continuation byte, and that it doesn't overflow, nor is
904 * an overlong representation.
906 * If 'require_partial' is FALSE, the routine returns non-zero only if the
907 * input (as far as 'e-1') is a full character. The return is the count of
908 * the bytes in the character.
910 * If 'require_partial' is TRUE, the routine returns non-zero only if the
911 * input as far as 'e-1' is a partial, not full character, with no
912 * malformations found before position 'e'. The return is either just
915 const U8 *s = s0 + 1;
918 PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_;
921 assert(*s0 == I8_TO_NATIVE_UTF8(0xFF));
923 send = s + MIN(UTF8_MAXBYTES - 1, e - s);
925 if (! UTF8_IS_CONTINUATION(*s)) {
932 if (0 < does_utf8_overflow(s0, e,
933 FALSE /* Don't consider_overlongs */
938 if (0 < isFF_overlong(s0, e - s0)) {
942 /* Here, the character is valid as far as it got. Check if got a partial
944 if (s - s0 < UTF8_MAXBYTES) {
945 return (require_partial) ? 1 : 0;
948 /* Here, got a full character */
949 return (require_partial) ? 0 : UTF8_MAXBYTES;
953 Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
955 /* Returns a mortalized C string that is a displayable copy of the 'len'
956 * bytes starting at 'start'. 'format' gives how to display each byte.
957 * Currently, there are only two formats, so it is currently a bool:
959 * 1 ab (that is a space between two hex digit bytes)
962 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
964 const U8 * s = start;
965 const U8 * const e = start + len;
969 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
971 Newx(output, output_len, char);
975 for (s = start; s < e; s++) {
976 const unsigned high_nibble = (*s & 0xF0) >> 4;
977 const unsigned low_nibble = (*s & 0x0F);
989 if (high_nibble < 10) {
990 *d++ = high_nibble + '0';
993 *d++ = high_nibble - 10 + 'a';
996 if (low_nibble < 10) {
997 *d++ = low_nibble + '0';
1000 *d++ = low_nibble - 10 + 'a';
1008 PERL_STATIC_INLINE char *
1009 S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
1011 /* Max number of bytes to print */
1014 /* Which one is the non-continuation */
1015 const STRLEN non_cont_byte_pos,
1017 /* How many bytes should there be? */
1018 const STRLEN expect_len)
1020 /* Return the malformation warning text for an unexpected continuation
1023 const char * const where = (non_cont_byte_pos == 1)
1025 : Perl_form(aTHX_ "%d bytes",
1026 (int) non_cont_byte_pos);
1027 const U8 * x = s + non_cont_byte_pos;
1028 const U8 * e = s + print_len;
1030 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
1032 /* We don't need to pass this parameter, but since it has already been
1033 * calculated, it's likely faster to pass it; verify under DEBUGGING */
1034 assert(expect_len == UTF8SKIP(s));
1036 /* As a defensive coding measure, don't output anything past a NUL. Such
1037 * bytes shouldn't be in the middle of a malformation, and could mark the
1038 * end of the allocated string, and what comes after is undefined */
1039 for (; x < e; x++) {
1041 x++; /* Output this particular NUL */
1046 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
1047 " %s after start byte 0x%02x; need %d bytes, got %d)",
1049 _byte_dump_string(s, x - s, 0),
1050 *(s + non_cont_byte_pos),
1054 (int) non_cont_byte_pos);
1059 =for apidoc utf8n_to_uvchr
1061 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1062 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1065 Bottom level UTF-8 decode routine.
1066 Returns the native code point value of the first character in the string C<s>,
1067 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
1068 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
1069 the length, in bytes, of that character.
1071 The value of C<flags> determines the behavior when C<s> does not point to a
1072 well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
1073 causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
1074 is the next possible position in C<s> that could begin a non-malformed
1075 character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
1076 is raised. Some UTF-8 input sequences may contain multiple malformations.
1077 This function tries to find every possible one in each call, so multiple
1078 warnings can be raised for the same sequence.
1080 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
1081 individual types of malformations, such as the sequence being overlong (that
1082 is, when there is a shorter sequence that can express the same code point;
1083 overlong sequences are expressly forbidden in the UTF-8 standard due to
1084 potential security issues). Another malformation example is the first byte of
1085 a character not being a legal first byte. See F<utf8.h> for the list of such
1086 flags. Even if allowed, this function generally returns the Unicode
1087 REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
1088 F<utf8.h> to override this behavior for the overlong malformations, but don't
1089 do that except for very specialized purposes.
1091 The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
1092 flags) malformation is found. If this flag is set, the routine assumes that
1093 the caller will raise a warning, and this function will silently just set
1094 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
1096 Note that this API requires disambiguation between successful decoding a C<NUL>
1097 character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
1098 in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
1099 be set to 1. To disambiguate, upon a zero return, see if the first byte of
1100 C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
1101 error. Or you can use C<L</utf8n_to_uvchr_error>>.
1103 Certain code points are considered problematic. These are Unicode surrogates,
1104 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
1105 By default these are considered regular code points, but certain situations
1106 warrant special handling for them, which can be specified using the C<flags>
1107 parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
1108 three classes are treated as malformations and handled as such. The flags
1109 C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
1110 C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
1111 disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
1112 restricts the allowed inputs to the strict UTF-8 traditionally defined by
1113 Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
1115 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
1116 The difference between traditional strictness and C9 strictness is that the
1117 latter does not forbid non-character code points. (They are still discouraged,
1118 however.) For more discussion see L<perlunicode/Noncharacter code points>.
1120 The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
1121 C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
1122 C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
1123 raised for their respective categories, but otherwise the code points are
1124 considered valid (not malformations). To get a category to both be treated as
1125 a malformation and raise a warning, specify both the WARN and DISALLOW flags.
1126 (But note that warnings are not raised if lexically disabled nor if
1127 C<UTF8_CHECK_ONLY> is also specified.)
1129 Extremely high code points were never specified in any standard, and require an
1130 extension to UTF-8 to express, which Perl does. It is likely that programs
1131 written in something other than Perl would not be able to read files that
1132 contain these; nor would Perl understand files written by something that uses a
1133 different extension. For these reasons, there is a separate set of flags that
1134 can warn and/or disallow these extremely high code points, even if other
1135 above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
1136 C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
1137 C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
1138 above-Unicode code points, including these, as malformations.
1139 (Note that the Unicode standard considers anything above 0x10FFFF to be
1140 illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
1143 A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
1144 retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly,
1145 C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
1146 C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags
1147 can apply to code points that actually do fit in 31 bits. This happens on
1148 EBCDIC platforms, and sometimes when the L<overlong
1149 malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
1150 describe the situation in all cases.
1153 All other code points corresponding to Unicode characters, including private
1154 use and those yet to be assigned, are never considered malformed and never
1157 =for apidoc Amnh||UTF8_CHECK_ONLY
1158 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1159 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
1160 =for apidoc Amnh||UTF8_DISALLOW_SURROGATE
1161 =for apidoc Amnh||UTF8_DISALLOW_NONCHAR
1162 =for apidoc Amnh||UTF8_DISALLOW_SUPER
1163 =for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE
1164 =for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE
1165 =for apidoc Amnh||UTF8_WARN_SURROGATE
1166 =for apidoc Amnh||UTF8_WARN_NONCHAR
1167 =for apidoc Amnh||UTF8_WARN_SUPER
1168 =for apidoc Amnh||UTF8_WARN_PERL_EXTENDED
1169 =for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED
1173 Also implemented as a macro in utf8.h
1177 Perl_utf8n_to_uvchr(const U8 *s,
1182 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
1184 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
1189 =for apidoc utf8n_to_uvchr_error
1191 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1192 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1195 This function is for code that needs to know what the precise malformation(s)
1196 are when an error is found. If you also need to know the generated warning
1197 messages, use L</utf8n_to_uvchr_msgs>() instead.
1199 It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1200 all the others, C<errors>. If this parameter is 0, this function behaves
1201 identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
1202 to a C<U32> variable, which this function sets to indicate any errors found.
1203 Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
1204 C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
1205 of these bits will be set if a malformation is found, even if the input
1206 C<flags> parameter indicates that the given malformation is allowed; those
1207 exceptions are noted:
1211 =item C<UTF8_GOT_PERL_EXTENDED>
1213 The input sequence is not standard UTF-8, but a Perl extension. This bit is
1214 set only if the input C<flags> parameter contains either the
1215 C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
1217 Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
1218 and so some extension must be used to express them. Perl uses a natural
1219 extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
1220 extension to represent even higher ones, so that any code point that fits in a
1221 64-bit word can be represented. Text using these extensions is not likely to
1222 be portable to non-Perl code. We lump both of these extensions together and
1223 refer to them as Perl extended UTF-8. There exist other extensions that people
1224 have invented, incompatible with Perl's.
1226 On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
1227 extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
1228 than on ASCII. Prior to that, code points 2**31 and higher were simply
1229 unrepresentable, and a different, incompatible method was used to represent
1230 code points between 2**30 and 2**31 - 1.
1232 On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
1233 Perl extended UTF-8 is used.
1235 In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
1236 may use for backward compatibility. That name is misleading, as this flag may
1237 be set when the code point actually does fit in 31 bits. This happens on
1238 EBCDIC platforms, and sometimes when the L<overlong
1239 malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately
1240 describes the situation in all cases.
1242 =item C<UTF8_GOT_CONTINUATION>
1244 The input sequence was malformed in that the first byte was a UTF-8
1247 =item C<UTF8_GOT_EMPTY>
1249 The input C<curlen> parameter was 0.
1251 =item C<UTF8_GOT_LONG>
1253 The input sequence was malformed in that there is some other sequence that
1254 evaluates to the same code point, but that sequence is shorter than this one.
1256 Until Unicode 3.1, it was legal for programs to accept this malformation, but
1257 it was discovered that this created security issues.
1259 =item C<UTF8_GOT_NONCHAR>
1261 The code point represented by the input UTF-8 sequence is for a Unicode
1262 non-character code point.
1263 This bit is set only if the input C<flags> parameter contains either the
1264 C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1266 =item C<UTF8_GOT_NON_CONTINUATION>
1268 The input sequence was malformed in that a non-continuation type byte was found
1269 in a position where only a continuation type one should be. See also
1270 C<L</UTF8_GOT_SHORT>>.
1272 =item C<UTF8_GOT_OVERFLOW>
1274 The input sequence was malformed in that it is for a code point that is not
1275 representable in the number of bits available in an IV on the current platform.
1277 =item C<UTF8_GOT_SHORT>
1279 The input sequence was malformed in that C<curlen> is smaller than required for
1280 a complete sequence. In other words, the input is for a partial character
1284 C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
1285 sequence. The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always
1286 that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
1287 sequence was looked at. If no other flags are present, it means that the
1288 sequence was valid as far as it went. Depending on the application, this could
1289 mean one of three things:
1295 The C<curlen> length parameter passed in was too small, and the function was
1296 prevented from examining all the necessary bytes.
1300 The buffer being looked at is based on reading data, and the data received so
1301 far stopped in the middle of a character, so that the next read will
1302 read the remainder of this character. (It is up to the caller to deal with the
1303 split bytes somehow.)
1307 This is a real error, and the partial sequence is all we're going to get.
1311 =item C<UTF8_GOT_SUPER>
1313 The input sequence was malformed in that it is for a non-Unicode code point;
1314 that is, one above the legal Unicode maximum.
1315 This bit is set only if the input C<flags> parameter contains either the
1316 C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1318 =item C<UTF8_GOT_SURROGATE>
1320 The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
1322 This bit is set only if the input C<flags> parameter contains either the
1323 C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1327 To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1328 flag to suppress any warnings, and then examine the C<*errors> return.
1330 =for apidoc Amnh||UTF8_GOT_PERL_EXTENDED
1331 =for apidoc Amnh||UTF8_GOT_CONTINUATION
1332 =for apidoc Amnh||UTF8_GOT_EMPTY
1333 =for apidoc Amnh||UTF8_GOT_LONG
1334 =for apidoc Amnh||UTF8_GOT_NONCHAR
1335 =for apidoc Amnh||UTF8_GOT_NON_CONTINUATION
1336 =for apidoc Amnh||UTF8_GOT_OVERFLOW
1337 =for apidoc Amnh||UTF8_GOT_SHORT
1338 =for apidoc Amnh||UTF8_GOT_SUPER
1339 =for apidoc Amnh||UTF8_GOT_SURROGATE
1343 Also implemented as a macro in utf8.h
1347 Perl_utf8n_to_uvchr_error(const U8 *s,
1353 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1355 return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
1360 =for apidoc utf8n_to_uvchr_msgs
1362 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1363 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1366 This function is for code that needs to know what the precise malformation(s)
1367 are when an error is found, and wants the corresponding warning and/or error
1368 messages to be returned to the caller rather than be displayed. All messages
1369 that would have been displayed if all lexical warnings are enabled will be
1372 It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
1373 placed after all the others, C<msgs>. If this parameter is 0, this function
1374 behaves identically to C<L</utf8n_to_uvchr_error>>. Otherwise, C<msgs> should
1375 be a pointer to an C<AV *> variable, in which this function creates a new AV to
1376 contain any appropriate messages. The elements of the array are ordered so
1377 that the first message that would have been displayed is in the 0th element,
1378 and so on. Each element is a hash with three key-value pairs, as follows:
1384 The text of the message as a C<SVpv>.
1386 =item C<warn_categories>
1388 The warning category (or categories) packed into a C<SVuv>.
1392 A single flag bit associated with this message, in a C<SVuv>.
1393 The bit corresponds to some bit in the C<*errors> return value,
1394 such as C<UTF8_GOT_LONG>.
1398 It's important to note that specifying this parameter as non-null will cause
1399 any warnings this function would otherwise generate to be suppressed, and
1400 instead be placed in C<*msgs>. The caller can check the lexical warnings state
1401 (or not) when choosing what to do with the returned messages.
1403 If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
1406 The caller, of course, is responsible for freeing any returned AV.
1412 Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
1419 const U8 * const s0 = s;
1420 const U8 * send = s0 + curlen;
1421 U32 possible_problems; /* A bit is set here for each potential problem
1422 found as we go along */
1424 STRLEN expectlen; /* How long should this sequence be? */
1425 STRLEN avail_len; /* When input is too short, gives what that is */
1426 U32 discard_errors; /* Used to save branches when 'errors' is NULL; this
1427 gets set and discarded */
1429 /* The below are used only if there is both an overlong malformation and a
1430 * too short one. Otherwise the first two are set to 's0' and 'send', and
1431 * the third not used at all */
1433 U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1434 routine; see [perl #130921] */
1438 PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
1440 /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
1441 * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
1442 * syllables that the dfa doesn't properly handle. Quickly dispose of the
1445 /* Each of the affected Hanguls starts with \xED */
1447 if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */
1458 return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
1459 | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
1460 | (s0[2] & UTF_CONTINUATION_MASK);
1463 /* In conjunction with the exhaustive tests that can be enabled in
1464 * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
1465 * what it is intended to do, and that no flaws in it are masked by
1466 * dropping down and executing the code below
1467 assert(! isUTF8_CHAR(s0, send)
1468 || UTF8_IS_SURROGATE(s0, send)
1469 || UTF8_IS_SUPER(s0, send)
1470 || UTF8_IS_NONCHAR(s0,send));
1474 possible_problems = 0;
1478 adjusted_s0 = (U8 *) s0;
1485 errors = &discard_errors;
1488 /* The order of malformation tests here is important. We should consume as
1489 * few bytes as possible in order to not skip any valid character. This is
1490 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1491 * https://unicode.org/reports/tr36 for more discussion as to why. For
1492 * example, once we've done a UTF8SKIP, we can tell the expected number of
1493 * bytes, and could fail right off the bat if the input parameters indicate
1494 * that there are too few available. But it could be that just that first
1495 * byte is garbled, and the intended character occupies fewer bytes. If we
1496 * blindly assumed that the first byte is correct, and skipped based on
1497 * that number, we could skip over a valid input character. So instead, we
1498 * always examine the sequence byte-by-byte.
1500 * We also should not consume too few bytes, otherwise someone could inject
1501 * things. For example, an input could be deliberately designed to
1502 * overflow, and if this code bailed out immediately upon discovering that,
1503 * returning to the caller C<*retlen> pointing to the very next byte (one
1504 * which is actually part of the overflowing sequence), that could look
1505 * legitimate to the caller, which could discard the initial partial
1506 * sequence and process the rest, inappropriately.
1508 * Some possible input sequences are malformed in more than one way. This
1509 * function goes to lengths to try to find all of them. This is necessary
1510 * for correctness, as the inputs may allow one malformation but not
1511 * another, and if we abandon searching for others after finding the
1512 * allowed one, we could allow in something that shouldn't have been.
1515 if (UNLIKELY(curlen == 0)) {
1516 possible_problems |= UTF8_GOT_EMPTY;
1518 uv = UNICODE_REPLACEMENT;
1519 goto ready_to_handle_errors;
1522 /* We now know we can examine the first byte of the input */
1523 expectlen = UTF8SKIP(s);
1526 /* A well-formed UTF-8 character, as the vast majority of calls to this
1527 * function will be for, has this expected length. For efficiency, set
1528 * things up here to return it. It will be overriden only in those rare
1529 * cases where a malformation is found */
1531 *retlen = expectlen;
1534 /* A continuation character can't start a valid sequence */
1535 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
1536 possible_problems |= UTF8_GOT_CONTINUATION;
1538 uv = UNICODE_REPLACEMENT;
1539 goto ready_to_handle_errors;
1542 /* Here is not a continuation byte, nor an invariant. The only thing left
1543 * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
1544 * because it excludes start bytes like \xC0 that always lead to
1547 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1548 * that indicate the number of bytes in the character's whole UTF-8
1549 * sequence, leaving just the bits that are part of the value. */
1550 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
1552 /* Setup the loop end point, making sure to not look past the end of the
1553 * input string, and flag it as too short if the size isn't big enough. */
1554 if (UNLIKELY(curlen < expectlen)) {
1555 possible_problems |= UTF8_GOT_SHORT;
1559 send = (U8*) s0 + expectlen;
1562 /* Now, loop through the remaining bytes in the character's sequence,
1563 * accumulating each into the working value as we go. */
1564 for (s = s0 + 1; s < send; s++) {
1565 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
1566 uv = UTF8_ACCUMULATE(uv, *s);
1570 /* Here, found a non-continuation before processing all expected bytes.
1571 * This byte indicates the beginning of a new character, so quit, even
1572 * if allowing this malformation. */
1573 possible_problems |= UTF8_GOT_NON_CONTINUATION;
1575 } /* End of loop through the character's bytes */
1577 /* Save how many bytes were actually in the character */
1580 /* Note that there are two types of too-short malformation. One is when
1581 * there is actual wrong data before the normal termination of the
1582 * sequence. The other is that the sequence wasn't complete before the end
1583 * of the data we are allowed to look at, based on the input 'curlen'.
1584 * This means that we were passed data for a partial character, but it is
1585 * valid as far as we saw. The other is definitely invalid. This
1586 * distinction could be important to a caller, so the two types are kept
1589 * A convenience macro that matches either of the too-short conditions. */
1590 # define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1592 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1594 uv = UNICODE_REPLACEMENT;
1597 /* Check for overflow. The algorithm requires us to not look past the end
1598 * of the current character, even if partial, so the upper limit is 's' */
1599 if (UNLIKELY(0 < does_utf8_overflow(s0, s,
1600 1 /* Do consider overlongs */
1603 possible_problems |= UTF8_GOT_OVERFLOW;
1604 uv = UNICODE_REPLACEMENT;
1607 /* Check for overlong. If no problems so far, 'uv' is the correct code
1608 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1609 * we must look at the UTF-8 byte sequence itself to see if it is for an
1611 if ( ( LIKELY(! possible_problems)
1612 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1613 || ( UNLIKELY(possible_problems)
1614 && ( UNLIKELY(! UTF8_IS_START(*s0))
1615 || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))))
1617 possible_problems |= UTF8_GOT_LONG;
1619 if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
1621 /* The calculation in the 'true' branch of this 'if'
1622 * below won't work if overflows, and isn't needed
1623 * anyway. Further below we handle all overflow
1625 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1627 UV min_uv = uv_so_far;
1630 /* Here, the input is both overlong and is missing some trailing
1631 * bytes. There is no single code point it could be for, but there
1632 * may be enough information present to determine if what we have
1633 * so far is for an unallowed code point, such as for a surrogate.
1634 * The code further below has the intelligence to determine this,
1635 * but just for non-overlong UTF-8 sequences. What we do here is
1636 * calculate the smallest code point the input could represent if
1637 * there were no too short malformation. Then we compute and save
1638 * the UTF-8 for that, which is what the code below looks at
1639 * instead of the raw input. It turns out that the smallest such
1640 * code point is all we need. */
1641 for (i = curlen; i < expectlen; i++) {
1642 min_uv = UTF8_ACCUMULATE(min_uv,
1643 I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE));
1646 adjusted_s0 = temp_char_buf;
1647 (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1651 /* Here, we have found all the possible problems, except for when the input
1652 * is for a problematic code point not allowed by the input parameters. */
1654 /* uv is valid for overlongs */
1655 if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
1656 && isUNICODE_POSSIBLY_PROBLEMATIC(uv))
1657 || ( UNLIKELY(possible_problems)
1659 /* if overflow, we know without looking further
1660 * precisely which of the problematic types it is,
1661 * and we deal with those in the overflow handling
1663 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
1664 && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
1665 || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))))
1666 && ((flags & ( UTF8_DISALLOW_NONCHAR
1667 |UTF8_DISALLOW_SURROGATE
1668 |UTF8_DISALLOW_SUPER
1669 |UTF8_DISALLOW_PERL_EXTENDED
1671 |UTF8_WARN_SURROGATE
1673 |UTF8_WARN_PERL_EXTENDED))))
1675 /* If there were no malformations, or the only malformation is an
1676 * overlong, 'uv' is valid */
1677 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1678 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1679 possible_problems |= UTF8_GOT_SURROGATE;
1681 else if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
1682 possible_problems |= UTF8_GOT_SUPER;
1684 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1685 possible_problems |= UTF8_GOT_NONCHAR;
1688 else { /* Otherwise, need to look at the source UTF-8, possibly
1689 adjusted to be non-overlong */
1691 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1692 > UTF_START_BYTE_110000_))
1694 possible_problems |= UTF8_GOT_SUPER;
1696 else if (curlen > 1) {
1697 if (UNLIKELY( NATIVE_UTF8_TO_I8(*adjusted_s0)
1698 == UTF_START_BYTE_110000_
1699 && NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1))
1700 >= UTF_FIRST_CONT_BYTE_110000_))
1702 possible_problems |= UTF8_GOT_SUPER;
1704 else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) {
1705 possible_problems |= UTF8_GOT_SURROGATE;
1709 /* We need a complete well-formed UTF-8 character to discern
1710 * non-characters, so can't look for them here */
1714 ready_to_handle_errors:
1717 * curlen contains the number of bytes in the sequence that
1718 * this call should advance the input by.
1719 * avail_len gives the available number of bytes passed in, but
1720 * only if this is less than the expected number of
1721 * bytes, based on the code point's start byte.
1722 * possible_problems is 0 if there weren't any problems; otherwise a bit
1723 * is set in it for each potential problem found.
1724 * uv contains the code point the input sequence
1725 * represents; or if there is a problem that prevents
1726 * a well-defined value from being computed, it is
1727 * some subsitute value, typically the REPLACEMENT
1729 * s0 points to the first byte of the character
1730 * s points to just after where we left off processing
1732 * send points to just after where that character should
1733 * end, based on how many bytes the start byte tells
1734 * us should be in it, but no further than s0 +
1738 if (UNLIKELY(possible_problems)) {
1739 bool disallowed = FALSE;
1740 const U32 orig_problems = possible_problems;
1746 while (possible_problems) { /* Handle each possible problem */
1748 char * message = NULL;
1749 U32 this_flag_bit = 0;
1751 /* Each 'if' clause handles one problem. They are ordered so that
1752 * the first ones' messages will be displayed before the later
1753 * ones; this is kinda in decreasing severity order. But the
1754 * overlong must come last, as it changes 'uv' looked at by the
1756 if (possible_problems & UTF8_GOT_OVERFLOW) {
1758 /* Overflow means also got a super and are using Perl's
1759 * extended UTF-8, but we handle all three cases here */
1761 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
1762 *errors |= UTF8_GOT_OVERFLOW;
1764 /* But the API says we flag all errors found */
1765 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1766 *errors |= UTF8_GOT_SUPER;
1769 & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
1771 *errors |= UTF8_GOT_PERL_EXTENDED;
1774 /* Disallow if any of the three categories say to */
1775 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
1776 || (flags & ( UTF8_DISALLOW_SUPER
1777 |UTF8_DISALLOW_PERL_EXTENDED)))
1782 /* Likewise, warn if any say to */
1783 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
1784 || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
1787 /* The warnings code explicitly says it doesn't handle the
1788 * case of packWARN2 and two categories which have
1789 * parent-child relationship. Even if it works now to
1790 * raise the warning if either is enabled, it wouldn't
1791 * necessarily do so in the future. We output (only) the
1792 * most dire warning */
1793 if (! (flags & UTF8_CHECK_ONLY)) {
1794 if (msgs || ckWARN_d(WARN_UTF8)) {
1795 pack_warn = packWARN(WARN_UTF8);
1797 else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
1798 pack_warn = packWARN(WARN_NON_UNICODE);
1801 message = Perl_form(aTHX_ "%s: %s (overflows)",
1803 _byte_dump_string(s0, curlen, 0));
1804 this_flag_bit = UTF8_GOT_OVERFLOW;
1809 else if (possible_problems & UTF8_GOT_EMPTY) {
1810 possible_problems &= ~UTF8_GOT_EMPTY;
1811 *errors |= UTF8_GOT_EMPTY;
1813 if (! (flags & UTF8_ALLOW_EMPTY)) {
1815 /* This so-called malformation is now treated as a bug in
1816 * the caller. If you have nothing to decode, skip calling
1822 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1824 pack_warn = packWARN(WARN_UTF8);
1825 message = Perl_form(aTHX_ "%s (empty string)",
1827 this_flag_bit = UTF8_GOT_EMPTY;
1831 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1832 possible_problems &= ~UTF8_GOT_CONTINUATION;
1833 *errors |= UTF8_GOT_CONTINUATION;
1835 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1838 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1840 pack_warn = packWARN(WARN_UTF8);
1841 message = Perl_form(aTHX_
1842 "%s: %s (unexpected continuation byte 0x%02x,"
1843 " with no preceding start byte)",
1845 _byte_dump_string(s0, 1, 0), *s0);
1846 this_flag_bit = UTF8_GOT_CONTINUATION;
1850 else if (possible_problems & UTF8_GOT_SHORT) {
1851 possible_problems &= ~UTF8_GOT_SHORT;
1852 *errors |= UTF8_GOT_SHORT;
1854 if (! (flags & UTF8_ALLOW_SHORT)) {
1857 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1859 pack_warn = packWARN(WARN_UTF8);
1860 message = Perl_form(aTHX_
1861 "%s: %s (too short; %d byte%s available, need %d)",
1863 _byte_dump_string(s0, send - s0, 0),
1865 avail_len == 1 ? "" : "s",
1867 this_flag_bit = UTF8_GOT_SHORT;
1872 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1873 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1874 *errors |= UTF8_GOT_NON_CONTINUATION;
1876 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1879 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1882 /* If we don't know for sure that the input length is
1883 * valid, avoid as much as possible reading past the
1884 * end of the buffer */
1885 int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
1887 : (int) (send - s0);
1888 pack_warn = packWARN(WARN_UTF8);
1889 message = Perl_form(aTHX_ "%s",
1890 unexpected_non_continuation_text(s0,
1894 this_flag_bit = UTF8_GOT_NON_CONTINUATION;
1898 else if (possible_problems & UTF8_GOT_SURROGATE) {
1899 possible_problems &= ~UTF8_GOT_SURROGATE;
1901 if (flags & UTF8_WARN_SURROGATE) {
1902 *errors |= UTF8_GOT_SURROGATE;
1904 if ( ! (flags & UTF8_CHECK_ONLY)
1905 && (msgs || ckWARN_d(WARN_SURROGATE)))
1907 pack_warn = packWARN(WARN_SURROGATE);
1909 /* These are the only errors that can occur with a
1910 * surrogate when the 'uv' isn't valid */
1911 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1912 message = Perl_form(aTHX_
1913 "UTF-16 surrogate (any UTF-8 sequence that"
1914 " starts with \"%s\" is for a surrogate)",
1915 _byte_dump_string(s0, curlen, 0));
1918 message = Perl_form(aTHX_ surrogate_cp_format, uv);
1920 this_flag_bit = UTF8_GOT_SURROGATE;
1924 if (flags & UTF8_DISALLOW_SURROGATE) {
1926 *errors |= UTF8_GOT_SURROGATE;
1929 else if (possible_problems & UTF8_GOT_SUPER) {
1930 possible_problems &= ~UTF8_GOT_SUPER;
1932 if (flags & UTF8_WARN_SUPER) {
1933 *errors |= UTF8_GOT_SUPER;
1935 if ( ! (flags & UTF8_CHECK_ONLY)
1936 && (msgs || ckWARN_d(WARN_NON_UNICODE)))
1938 pack_warn = packWARN(WARN_NON_UNICODE);
1940 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1941 message = Perl_form(aTHX_
1942 "Any UTF-8 sequence that starts with"
1943 " \"%s\" is for a non-Unicode code point,"
1944 " may not be portable",
1945 _byte_dump_string(s0, curlen, 0));
1948 message = Perl_form(aTHX_ super_cp_format, uv);
1950 this_flag_bit = UTF8_GOT_SUPER;
1954 /* Test for Perl's extended UTF-8 after the regular SUPER ones,
1955 * and before possibly bailing out, so that the more dire
1956 * warning will override the regular one. */
1957 if (UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) {
1958 if ( ! (flags & UTF8_CHECK_ONLY)
1959 && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
1960 && (msgs || ( ckWARN_d(WARN_NON_UNICODE)
1961 || ckWARN(WARN_PORTABLE))))
1963 pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
1965 /* If it is an overlong that evaluates to a code point
1966 * that doesn't have to use the Perl extended UTF-8, it
1967 * still used it, and so we output a message that
1968 * doesn't refer to the code point. The same is true
1969 * if there was a SHORT malformation where the code
1970 * point is not valid. In that case, 'uv' will have
1971 * been set to the REPLACEMENT CHAR, and the message
1972 * below without the code point in it will be selected
1974 if (UNICODE_IS_PERL_EXTENDED(uv)) {
1975 message = Perl_form(aTHX_
1976 PL_extended_cp_format, uv);
1979 message = Perl_form(aTHX_
1980 "Any UTF-8 sequence that starts with"
1981 " \"%s\" is a Perl extension, and"
1982 " so is not portable",
1983 _byte_dump_string(s0, curlen, 0));
1985 this_flag_bit = UTF8_GOT_PERL_EXTENDED;
1988 if (flags & ( UTF8_WARN_PERL_EXTENDED
1989 |UTF8_DISALLOW_PERL_EXTENDED))
1991 *errors |= UTF8_GOT_PERL_EXTENDED;
1993 if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
1999 if (flags & UTF8_DISALLOW_SUPER) {
2000 *errors |= UTF8_GOT_SUPER;
2004 else if (possible_problems & UTF8_GOT_NONCHAR) {
2005 possible_problems &= ~UTF8_GOT_NONCHAR;
2007 if (flags & UTF8_WARN_NONCHAR) {
2008 *errors |= UTF8_GOT_NONCHAR;
2010 if ( ! (flags & UTF8_CHECK_ONLY)
2011 && (msgs || ckWARN_d(WARN_NONCHAR)))
2013 /* The code above should have guaranteed that we don't
2014 * get here with errors other than overlong */
2015 assert (! (orig_problems
2016 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
2018 pack_warn = packWARN(WARN_NONCHAR);
2019 message = Perl_form(aTHX_ nonchar_cp_format, uv);
2020 this_flag_bit = UTF8_GOT_NONCHAR;
2024 if (flags & UTF8_DISALLOW_NONCHAR) {
2026 *errors |= UTF8_GOT_NONCHAR;
2029 else if (possible_problems & UTF8_GOT_LONG) {
2030 possible_problems &= ~UTF8_GOT_LONG;
2031 *errors |= UTF8_GOT_LONG;
2033 if (flags & UTF8_ALLOW_LONG) {
2035 /* We don't allow the actual overlong value, unless the
2036 * special extra bit is also set */
2037 if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
2038 & ~UTF8_ALLOW_LONG)))
2040 uv = UNICODE_REPLACEMENT;
2047 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
2049 pack_warn = packWARN(WARN_UTF8);
2051 /* These error types cause 'uv' to be something that
2052 * isn't what was intended, so can't use it in the
2053 * message. The other error types either can't
2054 * generate an overlong, or else the 'uv' is valid */
2056 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
2058 message = Perl_form(aTHX_
2059 "%s: %s (any UTF-8 sequence that starts"
2060 " with \"%s\" is overlong which can and"
2061 " should be represented with a"
2062 " different, shorter sequence)",
2064 _byte_dump_string(s0, send - s0, 0),
2065 _byte_dump_string(s0, curlen, 0));
2068 U8 tmpbuf[UTF8_MAXBYTES+1];
2069 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
2071 /* Don't use U+ for non-Unicode code points, which
2072 * includes those in the Latin1 range */
2073 const char * preface = ( UNICODE_IS_SUPER(uv)
2080 message = Perl_form(aTHX_
2081 "%s: %s (overlong; instead use %s to represent"
2084 _byte_dump_string(s0, send - s0, 0),
2085 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
2087 ((uv < 256) ? 2 : 4), /* Field width of 2 for
2088 small code points */
2091 this_flag_bit = UTF8_GOT_LONG;
2094 } /* End of looking through the possible flags */
2096 /* Display the message (if any) for the problem being handled in
2097 * this iteration of the loop */
2100 assert(this_flag_bit);
2102 if (*msgs == NULL) {
2106 av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
2111 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
2114 Perl_warner(aTHX_ pack_warn, "%s", message);
2116 } /* End of 'while (possible_problems)' */
2118 /* Since there was a possible problem, the returned length may need to
2119 * be changed from the one stored at the beginning of this function.
2120 * Instead of trying to figure out if it has changed, just do it. */
2126 if (flags & UTF8_CHECK_ONLY && retlen) {
2127 *retlen = ((STRLEN) -1);
2133 return UNI_TO_NATIVE(uv);
2137 =for apidoc utf8_to_uvchr_buf
2139 Returns the native code point of the first character in the string C<s> which
2140 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
2141 C<*retlen> will be set to the length, in bytes, of that character.
2143 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2144 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
2145 C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
2146 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
2147 C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
2148 the next possible position in C<s> that could begin a non-malformed character.
2149 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
2154 Also implemented as a macro in utf8.h
2160 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2162 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
2164 return utf8_to_uvchr_buf_helper(s, send, retlen);
2168 =for apidoc utf8_length
2170 Returns the number of characters in the sequence of UTF-8-encoded bytes starting
2171 at C<s> and ending at the byte just before C<e>. If <s> and <e> point to the
2172 same place, it returns 0 with no warning raised.
2174 If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
2175 and returns the number of valid characters.
2181 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
2185 PERL_ARGS_ASSERT_UTF8_LENGTH;
2187 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
2188 * the bitops (especially ~) can create illegal UTF-8.
2189 * In other words: in Perl UTF-8 is not just for Unicode. */
2192 Ptrdiff_t expected_byte_count = UTF8SKIP(s);
2194 if (UNLIKELY(e - s < expected_byte_count)) {
2195 goto warn_and_return;
2199 s += expected_byte_count;
2202 if (LIKELY(e == s)) {
2206 /* Here, s > e on entry */
2210 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2211 "%s in %s", unees, OP_DESC(PL_op));
2213 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2219 =for apidoc bytes_cmp_utf8
2221 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
2222 sequence of characters (stored as UTF-8)
2223 in C<u>, C<ulen>. Returns 0 if they are
2224 equal, -1 or -2 if the first string is less than the second string, +1 or +2
2225 if the first string is greater than the second string.
2227 -1 or +1 is returned if the shorter string was identical to the start of the
2228 longer string. -2 or +2 is returned if
2229 there was a difference between characters
2236 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
2238 const U8 *const bend = b + blen;
2239 const U8 *const uend = u + ulen;
2241 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
2243 while (b < bend && u < uend) {
2245 if (!UTF8_IS_INVARIANT(c)) {
2246 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2249 if (UTF8_IS_CONTINUATION(c1)) {
2250 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
2252 /* diag_listed_as: Malformed UTF-8 character%s */
2253 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2255 unexpected_non_continuation_text(u - 2, 2, 1, 2),
2256 PL_op ? " in " : "",
2257 PL_op ? OP_DESC(PL_op) : "");
2262 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2263 "%s in %s", unees, OP_DESC(PL_op));
2265 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2266 return -2; /* Really want to return undef :-) */
2273 return *b < c ? -2 : +2;
2278 if (b == bend && u == uend)
2281 return b < bend ? +1 : -1;
2285 =for apidoc utf8_to_bytes
2287 Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
2288 Unlike L</bytes_to_utf8>, this over-writes the original string, and
2289 updates C<*lenp> to contain the new length.
2290 Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
2292 Upon successful return, the number of variants in the string can be computed by
2293 having saved the value of C<*lenp> before the call, and subtracting the
2294 after-call value of C<*lenp> from it.
2296 If you need a copy of the string, see L</bytes_from_utf8>.
2302 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2306 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
2307 PERL_UNUSED_CONTEXT;
2309 /* This is a no-op if no variants at all in the input */
2310 if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
2315 U8 * const save = s;
2316 U8 * const send = s + *lenp;
2319 /* Nothing before the first variant needs to be changed, so start the real
2323 if (! UTF8_IS_INVARIANT(*s)) {
2324 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2325 *lenp = ((STRLEN) -1);
2333 /* Is downgradable, so do it */
2334 d = s = first_variant;
2337 if (! UVCHR_IS_INVARIANT(c)) {
2338 /* Then it is two-byte encoded */
2339 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2352 =for apidoc bytes_from_utf8
2354 Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
2355 byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
2356 actually encoded in UTF-8.
2358 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2361 Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2362 not expressible in native byte encoding. In these cases, C<*is_utf8p> and
2363 C<*lenp> are unchanged, and the return value is the original C<s>.
2365 Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
2366 newly created string containing a downgraded copy of C<s>, and whose length is
2367 returned in C<*lenp>, updated. The new string is C<NUL>-terminated. The
2368 caller is responsible for arranging for the memory used by this string to get
2371 Upon successful return, the number of variants in the string can be computed by
2372 having saved the value of C<*lenp> before the call, and subtracting the
2373 after-call value of C<*lenp> from it.
2377 There is a macro that avoids this function call, but this is retained for
2378 anyone who calls it with the Perl_ prefix */
2381 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
2383 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
2384 PERL_UNUSED_CONTEXT;
2386 return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2390 =for apidoc bytes_from_utf8_loc
2392 Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer
2393 to where to store the location of the first character in C<"s"> that cannot be
2394 converted to non-UTF8.
2396 If that parameter is C<NULL>, this function behaves identically to
2399 Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2400 C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2402 Otherwise, the function returns a newly created C<NUL>-terminated string
2403 containing the non-UTF8 equivalent of the convertible first portion of
2404 C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>.
2405 If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2406 and C<*first_non_downgradable> is set to C<NULL>.
2408 Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
2409 first character in the original string that wasn't converted. C<*is_utf8p> is
2410 unchanged. Note that the new string may have length 0.
2412 Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2413 C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2414 converts as many characters in it as possible stopping at the first one it
2415 finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is
2416 set to point to that. The function returns the portion that could be converted
2417 in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2418 not including the terminating C<NUL>. If the very first character in the
2419 original could not be converted, C<*lenp> will be 0, and the new string will
2420 contain just a single C<NUL>. If the entire input string was converted,
2421 C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2423 Upon successful return, the number of variants in the converted portion of the
2424 string can be computed by having saved the value of C<*lenp> before the call,
2425 and subtracting the after-call value of C<*lenp> from it.
2433 Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2436 const U8 *original = s;
2437 U8 *converted_start;
2438 const U8 *send = s + *lenp;
2440 PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
2443 if (first_unconverted) {
2444 *first_unconverted = NULL;
2447 return (U8 *) original;
2450 Newx(d, (*lenp) + 1, U8);
2452 converted_start = d;
2455 if (! UTF8_IS_INVARIANT(c)) {
2457 /* Then it is multi-byte encoded. If the code point is above 0xFF,
2458 * have to stop now */
2459 if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2460 if (first_unconverted) {
2461 *first_unconverted = s - 1;
2462 goto finish_and_return;
2465 Safefree(converted_start);
2466 return (U8 *) original;
2470 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2476 /* Here, converted the whole of the input */
2478 if (first_unconverted) {
2479 *first_unconverted = NULL;
2484 *lenp = d - converted_start;
2486 /* Trim unused space */
2487 Renew(converted_start, *lenp + 1, U8);
2489 return converted_start;
2493 =for apidoc bytes_to_utf8
2495 Converts a string C<s> of length C<*lenp> bytes from the native encoding into
2497 Returns a pointer to the newly-created string, and sets C<*lenp> to
2498 reflect the new length in bytes. The caller is responsible for arranging for
2499 the memory used by this string to get freed.
2501 Upon successful return, the number of variants in the string can be computed by
2502 having saved the value of C<*lenp> before the call, and subtracting it from the
2503 after-call value of C<*lenp>.
2505 A C<NUL> character will be written after the end of the string.
2507 If you want to convert to UTF-8 from encodings other than
2508 the native (Latin1 or EBCDIC),
2509 see L</sv_recode_to_utf8>().
2515 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
2517 const U8 * const send = s + (*lenp);
2521 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
2522 PERL_UNUSED_CONTEXT;
2524 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
2525 Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
2529 append_utf8_from_native_byte(*s, &d);
2540 * Convert native UTF-16 to UTF-8. Called via the more public functions
2541 * utf16_to_utf8() for big-endian and utf16_to_utf8_reversed() for
2544 * 'p' is the UTF-16 input string, passed as a pointer to U8.
2545 * 'bytelen' is its length (must be even)
2546 * 'd' is the pointer to the destination buffer. The caller must ensure that
2547 * the space is large enough. The maximum expansion factor is 2 times
2548 * 'bytelen'. 1.5 if never going to run on an EBCDIC box.
2549 * '*newlen' will contain the number of bytes this function filled of 'd'.
2550 * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2551 * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE
2553 * The expansion factor is because UTF-16 requires 2 bytes for every code point
2554 * below 0x10000; otherwise 4 bytes. UTF-8 requires 1-3 bytes for every code
2555 * point below 0x1000; otherwise 4 bytes. UTF-EBCDIC requires 1-4 bytes for
2556 * every code point below 0x1000; otherwise 4-5 bytes.
2558 * The worst case is where every code point is below U+10000, hence requiring 2
2559 * UTF-16 bytes, but is U+0800 or higher on ASCII platforms, requiring 3 UTF-8
2560 * bytes; or >= U+4000 on EBCDIC requiring 4 UTF-8 bytes.
2562 * Do not use in-place. */
2565 Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen,
2566 const bool high_byte, /* Which of next two bytes is
2568 const bool low_byte)
2573 PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE;
2576 Perl_croak(aTHX_ "panic: utf16_to_utf8%s: odd bytelen %" UVuf,
2577 ((high_byte == 0) ? "" : "_reversed"), (UV)bytelen);
2582 /* Next 16 bits is what we want. (The bool is cast to U8 because on
2583 * platforms where a bool is implemented as a signed char, a compiler
2584 * warning may be generated) */
2585 U32 uv = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2588 /* If it's a surrogate, we find the uv that the surrogate pair encodes.
2590 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
2592 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2593 #define LAST_HIGH_SURROGATE 0xDBFF
2594 #define FIRST_LOW_SURROGATE 0xDC00
2595 #define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
2596 #define FIRST_IN_PLANE1 0x10000
2598 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2599 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2602 U32 low_surrogate = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2603 if (UNLIKELY(! inRANGE(low_surrogate, FIRST_LOW_SURROGATE,
2604 LAST_LOW_SURROGATE)))
2606 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2611 /* Here uv is the high surrogate. Combine with low surrogate
2612 * just computed to form the actual U32 code point.
2614 * From https://unicode.org/faq/utf_bom.html#utf16-4 */
2615 uv = FIRST_IN_PLANE1 + (uv << 10) - (FIRST_HIGH_SURROGATE << 10)
2616 + low_surrogate - FIRST_LOW_SURROGATE;
2620 /* Here, 'uv' is the real U32 we want to find the UTF-8 of */
2621 d = uvchr_to_utf8(d, uv);
2624 *newlen = d - dstart;
2629 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2631 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2633 return utf16_to_utf8(p, d, bytelen, newlen);
2637 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2639 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2641 return utf16_to_utf8_reversed(p, d, bytelen, newlen);
2645 * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for
2646 * big-endian and utf8_to_utf16_reversed() for little-endian,
2648 * 's' is the UTF-8 input string, passed as a pointer to U8.
2649 * 'bytelen' is its length
2650 * 'd' is the pointer to the destination buffer, currently passed as U8 *. The
2651 * caller must ensure that the space is large enough. The maximum
2652 * expansion factor is 2 times 'bytelen'. This happens when the input is
2653 * entirely single-byte ASCII, expanding to two-byte UTF-16.
2654 * '*newlen' will contain the number of bytes this function filled of 'd'.
2655 * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2656 * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE
2658 * Do not use in-place. */
2660 Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen,
2661 const bool high_byte, /* Which of next two bytes
2663 const bool low_byte)
2668 PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE;
2674 UV uv = utf8n_to_uvchr(s, send - s, &retlen,
2675 /* No surrogates nor above-Unicode */
2676 UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
2678 /* The modern method is to keep going with malformed input,
2679 * substituting the REPLACEMENT CHARACTER */
2680 if (UNLIKELY(uv == 0 && *s != '\0')) {
2681 uv = UNICODE_REPLACEMENT;
2684 if (uv >= FIRST_IN_PLANE1) { /* Requires a surrogate pair */
2686 /* From https://unicode.org/faq/utf_bom.html#utf16-4 */
2687 U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10)
2688 + FIRST_HIGH_SURROGATE;
2690 /* (The bool is cast to U8 because on platforms where a bool is
2691 * implemented as a signed char, a compiler warning may be
2693 d[(U8) high_byte] = high_surrogate >> 8;
2694 d[(U8) low_byte] = high_surrogate & nBIT_MASK(8);
2697 /* The low surrogate is the lower 10 bits plus the offset */
2698 uv &= nBIT_MASK(10);
2699 uv += FIRST_LOW_SURROGATE;
2701 /* Drop down to output the low surrogate like it were a
2705 d[(U8) high_byte] = uv >> 8;
2706 d[(U8) low_byte] = uv & nBIT_MASK(8);
2712 *newlen = d - dstart;
2717 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2719 return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
2723 Perl__is_uni_perl_idcont(pTHX_ UV c)
2725 return _invlist_contains_cp(PL_utf8_perl_idcont, c);
2729 Perl__is_uni_perl_idstart(pTHX_ UV c)
2731 return _invlist_contains_cp(PL_utf8_perl_idstart, c);
2735 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2738 /* We have the latin1-range values compiled into the core, so just use
2739 * those, converting the result to UTF-8. The only difference between upper
2740 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2741 * either "SS" or "Ss". Which one to use is passed into the routine in
2742 * 'S_or_s' to avoid a test */
2744 UV converted = toUPPER_LATIN1_MOD(c);
2746 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2748 assert(S_or_s == 'S' || S_or_s == 's');
2750 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
2751 characters in this range */
2752 *p = (U8) converted;
2757 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2758 * which it maps to one of them, so as to only have to have one check for
2759 * it in the main case */
2760 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2762 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2763 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2766 converted = GREEK_CAPITAL_LETTER_MU;
2768 #if UNICODE_MAJOR_VERSION > 2 \
2769 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2770 && UNICODE_DOT_DOT_VERSION >= 8)
2771 case LATIN_SMALL_LETTER_SHARP_S:
2778 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
2779 " '%c' to map to '%c'",
2780 c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
2781 NOT_REACHED; /* NOTREACHED */
2785 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2786 *p = UTF8_TWO_BYTE_LO(converted);
2792 /* If compiled on an early Unicode version, there may not be auxiliary tables
2794 #ifndef HAS_UC_AUX_TABLES
2795 # define UC_AUX_TABLE_ptrs NULL
2796 # define UC_AUX_TABLE_lengths NULL
2798 #ifndef HAS_TC_AUX_TABLES
2799 # define TC_AUX_TABLE_ptrs NULL
2800 # define TC_AUX_TABLE_lengths NULL
2802 #ifndef HAS_LC_AUX_TABLES
2803 # define LC_AUX_TABLE_ptrs NULL
2804 # define LC_AUX_TABLE_lengths NULL
2806 #ifndef HAS_CF_AUX_TABLES
2807 # define CF_AUX_TABLE_ptrs NULL
2808 # define CF_AUX_TABLE_lengths NULL
2811 /* Call the function to convert a UTF-8 encoded character to the specified case.
2812 * Note that there may be more than one character in the result.
2813 * 's' is a pointer to the first byte of the input character
2814 * 'd' will be set to the first byte of the string of changed characters. It
2815 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2816 * 'lenp' will be set to the length in bytes of the string of changed characters
2818 * The functions return the ordinal of the first character in the string of
2820 #define CALL_UPPER_CASE(uv, s, d, lenp) \
2821 _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper, \
2822 Uppercase_Mapping_invmap, \
2823 UC_AUX_TABLE_ptrs, \
2824 UC_AUX_TABLE_lengths, \
2826 #define CALL_TITLE_CASE(uv, s, d, lenp) \
2827 _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle, \
2828 Titlecase_Mapping_invmap, \
2829 TC_AUX_TABLE_ptrs, \
2830 TC_AUX_TABLE_lengths, \
2832 #define CALL_LOWER_CASE(uv, s, d, lenp) \
2833 _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower, \
2834 Lowercase_Mapping_invmap, \
2835 LC_AUX_TABLE_ptrs, \
2836 LC_AUX_TABLE_lengths, \
2840 /* This additionally has the input parameter 'specials', which if non-zero will
2841 * cause this to use the specials hash for folding (meaning get full case
2842 * folding); otherwise, when zero, this implies a simple case fold */
2843 #define CALL_FOLD_CASE(uv, s, d, lenp, specials) \
2845 ? _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold, \
2846 Case_Folding_invmap, \
2847 CF_AUX_TABLE_ptrs, \
2848 CF_AUX_TABLE_lengths, \
2850 : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold, \
2851 Simple_Case_Folding_invmap, \
2856 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
2858 /* Convert the Unicode character whose ordinal is <c> to its uppercase
2859 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2860 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
2861 * the changed version may be longer than the original character.
2863 * The ordinal of the first character of the changed version is returned
2864 * (but note, as explained above, that there may be more.) */
2866 PERL_ARGS_ASSERT_TO_UNI_UPPER;
2869 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2872 return CALL_UPPER_CASE(c, NULL, p, lenp);
2876 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
2878 PERL_ARGS_ASSERT_TO_UNI_TITLE;
2881 return _to_upper_title_latin1((U8) c, p, lenp, 's');
2884 return CALL_TITLE_CASE(c, NULL, p, lenp);
2888 S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
2890 /* We have the latin1-range values compiled into the core, so just use
2891 * those, converting the result to UTF-8. Since the result is always just
2892 * one character, we allow <p> to be NULL */
2894 U8 converted = toLOWER_LATIN1(c);
2896 PERL_UNUSED_ARG(dummy);
2899 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
2904 /* Result is known to always be < 256, so can use the EIGHT_BIT
2906 *p = UTF8_EIGHT_BIT_HI(converted);
2907 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
2915 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
2917 PERL_ARGS_ASSERT_TO_UNI_LOWER;
2920 return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
2923 return CALL_LOWER_CASE(c, NULL, p, lenp);
2927 Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
2929 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
2930 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2931 * FOLD_FLAGS_FULL iff full folding is to be used;
2933 * Not to be used for locale folds
2938 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
2940 assert (! (flags & FOLD_FLAGS_LOCALE));
2942 if (UNLIKELY(c == MICRO_SIGN)) {
2943 converted = GREEK_SMALL_LETTER_MU;
2945 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
2946 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
2947 || UNICODE_DOT_DOT_VERSION > 0)
2948 else if ( (flags & FOLD_FLAGS_FULL)
2949 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
2951 /* If can't cross 127/128 boundary, can't return "ss"; instead return
2952 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
2953 * under those circumstances. */
2954 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
2955 *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
2956 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2958 return LATIN_SMALL_LETTER_LONG_S;
2968 else { /* In this range the fold of all other characters is their lower
2970 converted = toLOWER_LATIN1(c);
2973 if (UVCHR_IS_INVARIANT(converted)) {
2974 *p = (U8) converted;
2978 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2979 *p = UTF8_TWO_BYTE_LO(converted);
2987 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
2990 /* Not currently externally documented, and subject to change
2991 * <flags> bits meanings:
2992 * FOLD_FLAGS_FULL iff full folding is to be used;
2993 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2994 * locale are to be used.
2995 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2998 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
3000 if (flags & FOLD_FLAGS_LOCALE) {
3001 /* Treat a non-Turkic UTF-8 locale as not being in locale at all,
3002 * except for potentially warning */
3003 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3004 if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) {
3005 flags &= ~FOLD_FLAGS_LOCALE;
3008 goto needs_full_generality;
3013 return _to_fold_latin1((U8) c, p, lenp,
3014 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
3017 /* Here, above 255. If no special needs, just use the macro */
3018 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
3019 return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
3021 else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
3022 the special flags. */
3023 U8 utf8_c[UTF8_MAXBYTES + 1];
3025 needs_full_generality:
3026 uvchr_to_utf8(utf8_c, c);
3027 return _toFOLD_utf8_flags(utf8_c, utf8_c + C_ARRAY_LENGTH(utf8_c),
3032 PERL_STATIC_INLINE bool
3033 S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e,
3036 /* returns a boolean giving whether or not the UTF8-encoded character that
3037 * starts at <p>, and extending no further than <e - 1> is in the inversion
3038 * list <invlist>. */
3040 UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
3042 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
3044 if (cp == 0 && (p >= e || *p != '\0')) {
3045 _force_out_malformed_utf8_message(p, e, 0, 1);
3046 NOT_REACHED; /* NOTREACHED */
3050 return _invlist_contains_cp(invlist, cp);
3053 #if 0 /* Not currently used, but may be needed in the future */
3054 PERLVAR(I, seen_deprecated_macro, HV *)
3057 S_warn_on_first_deprecated_use(pTHX_ const char * const name,
3058 const char * const alternative,
3059 const bool use_locale,
3060 const char * const file,
3061 const unsigned line)
3065 PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
3067 if (ckWARN_d(WARN_DEPRECATED)) {
3069 key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
3070 if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
3071 if (! PL_seen_deprecated_macro) {
3072 PL_seen_deprecated_macro = newHV();
3074 if (! hv_store(PL_seen_deprecated_macro, key,
3075 strlen(key), &PL_sv_undef, 0))
3077 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3080 if (instr(file, "mathoms.c")) {
3081 Perl_warner(aTHX_ WARN_DEPRECATED,
3082 "In %s, line %d, starting in Perl v5.32, %s()"
3083 " will be removed. Avoid this message by"
3084 " converting to use %s().\n",
3085 file, line, name, alternative);
3088 Perl_warner(aTHX_ WARN_DEPRECATED,
3089 "In %s, line %d, starting in Perl v5.32, %s() will"
3090 " require an additional parameter. Avoid this"
3091 " message by converting to use %s().\n",
3092 file, line, name, alternative);
3100 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
3102 PERL_ARGS_ASSERT__IS_UTF8_FOO;
3104 return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]);
3108 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e)
3110 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
3112 return is_utf8_common(p, e, PL_utf8_perl_idstart);
3116 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
3118 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
3120 return is_utf8_common(p, e, PL_utf8_perl_idcont);
3124 S_to_case_cp_list(pTHX_
3126 const U32 ** const remaining_list,
3127 Size_t * remaining_count,
3128 SV *invlist, const I32 * const invmap,
3129 const U32 * const * const aux_tables,
3130 const U8 * const aux_table_lengths,
3131 const char * const normal)
3136 /* Calculate the changed case of code point 'original'. The first code
3137 * point of the changed case is returned.
3139 * If 'remaining_count' is not NULL, *remaining_count will be set to how
3140 * many *other* code points are in the changed case. If non-zero and
3141 * 'remaining_list' is also not NULL, *remaining_list will be set to point
3142 * to a non-modifiable array containing the second and potentially third
3143 * code points in the changed case. (Unicode guarantees a maximum of 3.)
3144 * Note that this means that *remaining_list is undefined unless there are
3145 * multiple code points, and the caller has chosen to find out how many by
3146 * making 'remaining_count' not NULL.
3148 * 'normal' is a string to use to name the new case in any generated
3149 * messages, as a fallback if the operation being used is not available.
3151 * The casing to use is given by the data structures in the remaining
3155 PERL_ARGS_ASSERT_TO_CASE_CP_LIST;
3157 /* 'index' is guaranteed to be non-negative, as this is an inversion map
3158 * that covers all possible inputs. See [perl #133365] */
3159 index = _invlist_search(invlist, original);
3160 base = invmap[index];
3162 /* Most likely, the case change will contain just a single code point */
3163 if (remaining_count) {
3164 *remaining_count = 0;
3167 if (LIKELY(base == 0)) { /* 0 => original was unchanged by casing */
3169 /* At this bottom level routine is where we warn about illegal code
3171 if (isUNICODE_POSSIBLY_PROBLEMATIC(original)) {
3172 if (UNLIKELY(UNICODE_IS_SURROGATE(original))) {
3173 if (ckWARN_d(WARN_SURROGATE)) {
3174 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3175 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3176 "Operation \"%s\" returns its argument for"
3177 " UTF-16 surrogate U+%04" UVXf, desc, original);
3180 else if (UNLIKELY(UNICODE_IS_SUPER(original))) {
3181 if (UNLIKELY(original > MAX_LEGAL_CP)) {
3182 Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, original));
3184 if (ckWARN_d(WARN_NON_UNICODE)) {
3185 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3186 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
3187 "Operation \"%s\" returns its argument for"
3188 " non-Unicode code point 0x%04" UVXf, desc, original);
3192 /* Note that non-characters are perfectly legal, so no warning
3193 * should be given. */
3199 if (LIKELY(base > 0)) { /* means original mapped to a single code point,
3200 different from itself */
3201 return base + original - invlist_array(invlist)[index];
3204 /* Here 'base' is negative. That means the mapping is 1-to-many, and
3205 * requires an auxiliary table look up. abs(base) gives the index into a
3206 * list of such tables which points to the proper aux table. And a
3207 * parallel list gives the length of each corresponding aux table. Skip
3208 * the first entry in the *remaining returns, as it is returned by the
3211 if (remaining_count) {
3212 *remaining_count = (Size_t) (aux_table_lengths[base] - 1);
3214 if (remaining_list) {
3215 *remaining_list = aux_tables[base] + 1;
3219 return (UV) aux_tables[base][0];
3223 S__to_utf8_case(pTHX_ const UV original, const U8 *p,
3224 U8* ustrp, STRLEN *lenp,
3225 SV *invlist, const I32 * const invmap,
3226 const U32 * const * const aux_tables,
3227 const U8 * const aux_table_lengths,
3228 const char * const normal)
3230 /* Change the case of code point 'original'. If 'p' is non-NULL, it points to
3231 * the beginning of the (assumed to be valid) UTF-8 representation of
3232 * 'original'. 'normal' is a string to use to name the new case in any
3233 * generated messages, as a fallback if the operation being used is not
3234 * available. The new case is given by the data structures in the
3235 * remaining arguments.
3237 * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
3238 * entire changed case string, and the return value is the first code point
3241 * Note that the <ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes
3242 * since the changed version may be longer than the original character. */
3244 const U32 * remaining_list;
3245 Size_t remaining_count;
3246 UV first = to_case_cp_list(original,
3247 &remaining_list, &remaining_count,
3249 aux_tables, aux_table_lengths,
3252 PERL_ARGS_ASSERT__TO_UTF8_CASE;
3254 /* If the code point maps to itself and we already have its representation,
3255 * copy it instead of recalculating */
3256 if (original == first && p) {
3257 *lenp = UTF8SKIP(p);
3259 if (p != ustrp) { /* Don't copy onto itself */
3260 Copy(p, ustrp, *lenp, U8);
3267 d = uvchr_to_utf8(d, first);
3269 for (i = 0; i < remaining_count; i++) {
3270 d = uvchr_to_utf8(d, remaining_list[i]);
3281 Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to,
3282 const U32 ** remaining_folds_to)
3284 /* Returns the count of the number of code points that fold to the input
3285 * 'cp' (besides itself).
3287 * If the return is 0, there is nothing else that folds to it, and
3288 * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
3290 * If the return is 1, '*first_folds_to' is set to the single code point,
3291 * and '*remaining_folds_to' is set to NULL.
3293 * Otherwise, '*first_folds_to' is set to a code point, and
3294 * '*remaining_fold_to' is set to an array that contains the others. The
3295 * length of this array is the returned count minus 1.
3297 * The reason for this convolution is to avoid having to deal with
3298 * allocating and freeing memory. The lists are already constructed, so
3299 * the return can point to them, but single code points aren't, so would
3300 * need to be constructed if we didn't employ something like this API
3302 * The code points returned by this function are all legal Unicode, which
3303 * occupy at most 21 bits, and so a U32 is sufficient, and the lists are
3304 * constructed with this size (to save space and memory), and we return
3305 * pointers, so they must be this size */
3307 /* 'index' is guaranteed to be non-negative, as this is an inversion map
3308 * that covers all possible inputs. See [perl #133365] */
3309 SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
3310 I32 base = _Perl_IVCF_invmap[index];
3312 PERL_ARGS_ASSERT__INVERSE_FOLDS;
3314 if (base == 0) { /* No fold */
3315 *first_folds_to = 0;
3316 *remaining_folds_to = NULL;
3320 #ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */
3326 if (UNLIKELY(base < 0)) { /* Folds to more than one character */
3328 /* The data structure is set up so that the absolute value of 'base' is
3329 * an index into a table of pointers to arrays, with the array
3330 * corresponding to the index being the list of code points that fold
3331 * to 'cp', and the parallel array containing the length of the list
3333 *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
3334 *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1;
3335 /* +1 excludes first_folds_to */
3336 return IVCF_AUX_TABLE_lengths[-base];
3341 /* Only the single code point. This works like 'fc(G) = G - A + a' */
3342 *first_folds_to = (U32) (base + cp
3343 - invlist_array(PL_utf8_foldclosures)[index]);
3344 *remaining_folds_to = NULL;
3349 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
3350 U8* const ustrp, STRLEN *lenp)
3352 /* This is called when changing the case of a UTF-8-encoded character above
3353 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
3354 * result contains a character that crosses the 255/256 boundary, disallow
3355 * the change, and return the original code point. See L<perlfunc/lc> for
3358 * p points to the original string whose case was changed; assumed
3359 * by this routine to be well-formed
3360 * result the code point of the first character in the changed-case string
3361 * ustrp points to the changed-case string (<result> represents its
3363 * lenp points to the length of <ustrp> */
3365 UV original; /* To store the first code point of <p> */
3367 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3369 assert(UTF8_IS_ABOVE_LATIN1(*p));
3371 /* We know immediately if the first character in the string crosses the
3372 * boundary, so can skip testing */
3375 /* Look at every character in the result; if any cross the
3376 * boundary, the whole thing is disallowed */
3377 U8* s = ustrp + UTF8SKIP(ustrp);
3378 U8* e = ustrp + *lenp;
3380 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
3386 /* Here, no characters crossed, result is ok as-is, but we warn. */
3387 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
3393 /* Failed, have to return the original */
3394 original = valid_utf8_to_uvchr(p, lenp);
3396 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3397 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3398 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
3399 " locale; resolved to \"\\x{%" UVXf "}\".",
3403 Copy(p, ustrp, *lenp, char);
3408 S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
3409 U8 * ustrp, STRLEN *lenp)
3411 /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
3412 * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3413 * Otherwise, it returns the first code point of the Turkic foldcased
3414 * sequence, and the entire sequence will be stored in *ustrp. ustrp will
3415 * contain *lenp bytes
3417 * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3418 * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3421 PERL_ARGS_ASSERT_TURKIC_FC;
3424 if (UNLIKELY(*p == 'I')) {
3426 ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
3427 ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
3428 return LATIN_SMALL_LETTER_DOTLESS_I;
3431 if (UNLIKELY(memBEGINs(p, e - p,
3432 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
3443 S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
3444 U8 * ustrp, STRLEN *lenp)
3446 /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
3447 * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3448 * Otherwise, it returns the first code point of the Turkic lowercased
3449 * sequence, and the entire sequence will be stored in *ustrp. ustrp will
3450 * contain *lenp bytes */
3452 PERL_ARGS_ASSERT_TURKIC_LC;
3455 /* A 'I' requires context as to what to do */
3456 if (UNLIKELY(*p0 == 'I')) {
3457 const U8 * p = p0 + 1;
3459 /* According to the Unicode SpecialCasing.txt file, a capital 'I'
3460 * modified by a dot above lowercases to 'i' even in turkic locales. */
3464 if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
3470 /* For the dot above to modify the 'I', it must be part of a
3471 * combining sequence immediately following the 'I', and no other
3472 * modifier with a ccc of 230 may intervene */
3473 cp = utf8_to_uvchr_buf(p, e, NULL);
3474 if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
3478 /* Here the combining sequence continues */
3483 /* In all other cases the lc is the same as the fold */
3484 return turkic_fc(p0, e, ustrp, lenp);
3488 S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
3489 U8 * ustrp, STRLEN *lenp)
3491 /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
3492 * from p0..e-1 according to Turkic rules is the same as for non-Turkic.
3493 * Otherwise, it returns the first code point of the Turkic upper or
3494 * title-cased sequence, and the entire sequence will be stored in *ustrp.
3495 * ustrp will contain *lenp bytes
3497 * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3498 * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3501 PERL_ARGS_ASSERT_TURKIC_UC;
3506 ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3507 ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3508 return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
3511 if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
3520 /* The process for changing the case is essentially the same for the four case
3521 * change types, except there are complications for folding. Otherwise the
3522 * difference is only which case to change to. To make sure that they all do
3523 * the same thing, the bodies of the functions are extracted out into the
3524 * following two macros. The functions are written with the same variable
3525 * names, and these are known and used inside these macros. It would be
3526 * better, of course, to have inline functions to do it, but since different
3527 * macros are called, depending on which case is being changed to, this is not
3528 * feasible in C (to khw's knowledge). Two macros are created so that the fold
3529 * function can start with the common start macro, then finish with its special
3530 * handling; while the other three cases can just use the common end macro.
3532 * The algorithm is to use the proper (passed in) macro or function to change
3533 * the case for code points that are below 256. The macro is used if using
3534 * locale rules for the case change; the function if not. If the code point is
3535 * above 255, it is computed from the input UTF-8, and another macro is called
3536 * to do the conversion. If necessary, the output is converted to UTF-8. If
3537 * using a locale, we have to check that the change did not cross the 255/256
3538 * boundary, see check_locale_boundary_crossing() for further details.
3540 * The macros are split with the correct case change for the below-256 case
3541 * stored into 'result', and in the middle of an else clause for the above-255
3542 * case. At that point in the 'else', 'result' is not the final result, but is
3543 * the input code point calculated from the UTF-8. The fold code needs to
3544 * realize all this and take it from there.
3546 * To deal with Turkic locales, the function specified by the parameter
3547 * 'turkic' is called when appropriate.
3549 * If you read the two macros as sequential, it's easier to understand what's
3551 #define CASE_CHANGE_BODY_START(locale_flags, libc_change_function, L1_func, \
3552 L1_func_extra_param, turkic) \
3554 if (flags & (locale_flags)) { \
3555 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
3556 if (IN_UTF8_CTYPE_LOCALE) { \
3557 if (UNLIKELY(PL_in_utf8_turkic_locale)) { \
3558 UV ret = turkic(p, e, ustrp, lenp); \
3559 if (ret) return ret; \
3562 /* Otherwise, treat a UTF-8 locale as not being in locale at \
3564 flags &= ~(locale_flags); \
3568 if (UTF8_IS_INVARIANT(*p)) { \
3569 if (flags & (locale_flags)) { \
3570 result = libc_change_function(*p); \
3573 return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
3576 else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \
3577 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); \
3578 if (flags & (locale_flags)) { \
3579 result = libc_change_function(c); \
3582 return L1_func(c, ustrp, lenp, L1_func_extra_param); \
3585 else { /* malformed UTF-8 or ord above 255 */ \
3586 STRLEN len_result; \
3587 result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
3588 if (len_result == (STRLEN) -1) { \
3589 _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ ); \
3592 #define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
3593 result = change_macro(result, p, ustrp, lenp); \
3595 if (flags & (locale_flags)) { \
3596 result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
3601 /* Here, used locale rules. Convert back to UTF-8 */ \
3602 if (UTF8_IS_INVARIANT(result)) { \
3603 *ustrp = (U8) result; \
3607 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); \
3608 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \
3614 /* Not currently externally documented, and subject to change:
3615 * <flags> is set iff the rules from the current underlying locale are to
3619 Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
3627 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
3629 /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
3630 /* 2nd char of uc(U+DF) is 'S' */
3631 CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 'S',
3633 CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
3636 /* Not currently externally documented, and subject to change:
3637 * <flags> is set iff the rules from the current underlying locale are to be
3638 * used. Since titlecase is not defined in POSIX, for other than a
3639 * UTF-8 locale, uppercase is used instead for code points < 256.
3643 Perl__to_utf8_title_flags(pTHX_ const U8 *p,
3651 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
3653 /* 2nd char of ucfirst(U+DF) is 's' */
3654 CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 's',
3656 CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
3659 /* Not currently externally documented, and subject to change:
3660 * <flags> is set iff the rules from the current underlying locale are to
3665 Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
3673 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
3675 CASE_CHANGE_BODY_START(~0, tolower, to_lower_latin1, 0 /* 0 is dummy */,
3677 CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
3680 /* Not currently externally documented, and subject to change,
3682 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3683 * locale are to be used.
3684 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
3685 * otherwise simple folds
3686 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
3691 Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
3699 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
3701 /* These are mutually exclusive */
3702 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
3704 assert(p != ustrp); /* Otherwise overwrites */
3706 CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, tolower, _to_fold_latin1,
3707 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)),
3710 result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
3712 if (flags & FOLD_FLAGS_LOCALE) {
3714 # define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
3715 # ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3716 # define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3718 /* Special case these two characters, as what normally gets
3719 * returned under locale doesn't work */
3720 if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
3722 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3723 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3724 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3725 "resolved to \"\\x{17F}\\x{17F}\".");
3730 if (memBEGINs((char *) p, e - p, LONG_S_T))
3732 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3733 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3734 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3735 "resolved to \"\\x{FB06}\".");
3736 goto return_ligature_st;
3739 #if UNICODE_MAJOR_VERSION == 3 \
3740 && UNICODE_DOT_VERSION == 0 \
3741 && UNICODE_DOT_DOT_VERSION == 1
3742 # define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3744 /* And special case this on this Unicode version only, for the same
3745 * reaons the other two are special cased. They would cross the
3746 * 255/256 boundary which is forbidden under /l, and so the code
3747 * wouldn't catch that they are equivalent (which they are only in
3749 else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
3750 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3751 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3752 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3753 "resolved to \"\\x{0131}\".");
3754 goto return_dotless_i;
3758 return check_locale_boundary_crossing(p, result, ustrp, lenp);
3760 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3764 /* This is called when changing the case of a UTF-8-encoded
3765 * character above the ASCII range, and the result should not
3766 * contain an ASCII character. */
3768 UV original; /* To store the first code point of <p> */
3770 /* Look at every character in the result; if any cross the
3771 * boundary, the whole thing is disallowed */
3773 U8* send = ustrp + *lenp;
3776 /* Crossed, have to return the original */
3777 original = valid_utf8_to_uvchr(p, lenp);
3779 /* But in these instances, there is an alternative we can
3780 * return that is valid */
3781 if (original == LATIN_SMALL_LETTER_SHARP_S
3782 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3783 || original == LATIN_CAPITAL_LETTER_SHARP_S
3788 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3789 goto return_ligature_st;
3791 #if UNICODE_MAJOR_VERSION == 3 \
3792 && UNICODE_DOT_VERSION == 0 \
3793 && UNICODE_DOT_DOT_VERSION == 1
3795 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3796 goto return_dotless_i;
3799 Copy(p, ustrp, *lenp, char);
3805 /* Here, no characters crossed, result is ok as-is */
3810 /* Here, used locale rules. Convert back to UTF-8 */
3811 if (UTF8_IS_INVARIANT(result)) {
3812 *ustrp = (U8) result;
3816 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3817 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
3824 /* Certain folds to 'ss' are prohibited by the options, but they do allow
3825 * folds to a string of two of these characters. By returning this
3826 * instead, then, e.g.,
3827 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3830 *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
3831 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3833 return LATIN_SMALL_LETTER_LONG_S;
3836 /* Two folds to 'st' are prohibited by the options; instead we pick one and
3837 * have the other one fold to it */
3839 *lenp = STRLENs(LATIN_SMALL_LIGATURE_ST_UTF8);
3840 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
3841 return LATIN_SMALL_LIGATURE_ST;
3843 #if UNICODE_MAJOR_VERSION == 3 \
3844 && UNICODE_DOT_VERSION == 0 \
3845 && UNICODE_DOT_DOT_VERSION == 1
3848 *lenp = STRLENs(LATIN_SMALL_LETTER_DOTLESS_I_UTF8);
3849 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
3850 return LATIN_SMALL_LETTER_DOTLESS_I;
3857 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
3859 /* May change: warns if surrogates, non-character code points, or
3860 * non-Unicode code points are in 's' which has length 'len' bytes.
3861 * Returns TRUE if none found; FALSE otherwise. The only other validity
3862 * check is to make sure that this won't exceed the string's length nor
3865 const U8* const e = s + len;
3868 PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
3871 if (UTF8SKIP(s) > len) {
3872 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
3873 "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
3876 if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
3877 if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
3878 if ( ckWARN_d(WARN_NON_UNICODE)
3879 || UNLIKELY(0 < does_utf8_overflow(s, s + len,
3880 0 /* Don't consider overlongs */
3883 /* A side effect of this function will be to warn */
3884 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
3888 else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
3889 if (ckWARN_d(WARN_SURROGATE)) {
3890 /* This has a different warning than the one the called
3891 * function would output, so can't just call it, unlike we
3892 * do for the non-chars and above-unicodes */
3893 UV uv = utf8_to_uvchr_buf(s, e, NULL);
3894 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3895 "Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
3900 else if ( UNLIKELY(UTF8_IS_NONCHAR(s, e))
3901 && (ckWARN_d(WARN_NONCHAR)))
3903 /* A side effect of this function will be to warn */
3904 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
3915 =for apidoc pv_uni_display
3917 Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string
3918 C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes
3919 long (if longer, the rest is truncated and C<"..."> will be appended).
3921 The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
3922 C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
3923 to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
3924 (C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
3925 C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
3926 C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
3928 Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a
3929 backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set.
3931 The pointer to the PV of the C<dsv> is returned.
3933 See also L</sv_uni_display>.
3935 =for apidoc Amnh||UNI_DISPLAY_BACKSLASH
3936 =for apidoc Amnh||UNI_DISPLAY_BACKSPACE
3937 =for apidoc Amnh||UNI_DISPLAY_ISPRINT
3938 =for apidoc Amnh||UNI_DISPLAY_QQ
3939 =for apidoc Amnh||UNI_DISPLAY_REGEX
3943 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim,
3949 PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
3953 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
3957 if (pvlim && SvCUR(dsv) >= pvlim) {
3961 u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
3963 const U8 c = (U8) u;
3964 if (flags & UNI_DISPLAY_BACKSLASH) {
3965 if ( isMNEMONIC_CNTRL(c)
3967 || (flags & UNI_DISPLAY_BACKSPACE)))
3969 const char * mnemonic = cntrl_to_mnemonic(c);
3970 sv_catpvn(dsv, mnemonic, strlen(mnemonic));
3973 else if (c == '\\') {
3974 sv_catpvs(dsv, "\\\\");
3978 /* isPRINT() is the locale-blind version. */
3979 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
3980 const char string = c;
3981 sv_catpvn(dsv, &string, 1);
3986 Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u);
3989 sv_catpvs(dsv, "...");
3995 =for apidoc sv_uni_display
3997 Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
3998 the displayable version being at most C<pvlim> bytes long
3999 (if longer, the rest is truncated and "..." will be appended).
4001 The C<flags> argument is as in L</pv_uni_display>().
4003 The pointer to the PV of the C<dsv> is returned.
4008 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4010 const char * const ptr =
4011 isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
4013 PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4015 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
4016 SvCUR(ssv), pvlim, flags);
4020 =for apidoc foldEQ_utf8
4022 Returns true if the leading portions of the strings C<s1> and C<s2> (either or
4023 both of which may be in UTF-8) are the same case-insensitively; false
4024 otherwise. How far into the strings to compare is determined by other input
4027 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
4028 otherwise it is assumed to be in native 8-bit encoding. Correspondingly for
4029 C<u2> with respect to C<s2>.
4031 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for
4032 fold equality. In other words, C<s1>+C<l1> will be used as a goal to reach.
4033 The scan will not be considered to be a match unless the goal is reached, and
4034 scanning won't continue past that goal. Correspondingly for C<l2> with respect
4037 If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that
4038 pointer is considered an end pointer to the position 1 byte past the maximum
4039 point in C<s1> beyond which scanning will not continue under any circumstances.
4040 (This routine assumes that UTF-8 encoded input strings are not malformed;
4041 malformed input can cause it to read past C<pe1>). This means that if both
4042 C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match
4043 will never be successful because it can never
4044 get as far as its goal (and in fact is asserted against). Correspondingly for
4045 C<pe2> with respect to C<s2>.
4047 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4048 C<l2> must be non-zero), and if both do, both have to be
4049 reached for a successful match. Also, if the fold of a character is multiple
4050 characters, all of them must be matched (see tr21 reference below for
4053 Upon a successful match, if C<pe1> is non-C<NULL>,
4054 it will be set to point to the beginning of the I<next> character of C<s1>
4055 beyond what was matched. Correspondingly for C<pe2> and C<s2>.
4057 For case-insensitiveness, the "casefolding" of Unicode is used
4058 instead of upper/lowercasing both the characters, see
4059 L<https://www.unicode.org/reports/tr21/> (Case Mappings).
4061 =for apidoc Cmnh||FOLDEQ_UTF8_NOMIX_ASCII
4062 =for apidoc Cmnh||FOLDEQ_LOCALE
4063 =for apidoc Cmnh||FOLDEQ_S1_ALREADY_FOLDED
4064 =for apidoc Cmnh||FOLDEQ_S1_FOLDS_SANE
4065 =for apidoc Cmnh||FOLDEQ_S2_ALREADY_FOLDED
4066 =for apidoc Cmnh||FOLDEQ_S2_FOLDS_SANE
4070 /* A flags parameter has been added which may change, and hence isn't
4071 * externally documented. Currently it is:
4072 * 0 for as-documented above
4073 * FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4074 ASCII one, to not match
4075 * FOLDEQ_LOCALE is set iff the rules from the current underlying
4076 * locale are to be used.
4077 * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
4078 * routine. This allows that step to be skipped.
4079 * Currently, this requires s1 to be encoded as UTF-8
4080 * (u1 must be true), which is asserted for.
4081 * FOLDEQ_S1_FOLDS_SANE With either NOMIX_ASCII or LOCALE, no folds may
4082 * cross certain boundaries. Hence, the caller should
4083 * let this function do the folding instead of
4084 * pre-folding. This code contains an assertion to
4085 * that effect. However, if the caller knows what
4086 * it's doing, it can pass this flag to indicate that,
4087 * and the assertion is skipped.
4088 * FOLDEQ_S2_ALREADY_FOLDED Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
4089 * to s2, and s2 doesn't have to be UTF-8 encoded.
4090 * This introduces an asymmetry to save a few branches
4091 * in a loop. Currently, this is not a problem, as
4092 * never are both inputs pre-folded. Simply call this
4093 * function with the pre-folded one as the second
4095 * FOLDEQ_S2_FOLDS_SANE
4099 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
4100 const char *s2, char **pe2, UV l2, bool u2,
4103 const U8 *p1 = (const U8*)s1; /* Point to current char */
4104 const U8 *p2 = (const U8*)s2;
4105 const U8 *g1 = NULL; /* goal for s1 */
4106 const U8 *g2 = NULL;
4107 const U8 *e1 = NULL; /* Don't scan s1 past this */
4108 U8 *f1 = NULL; /* Point to current folded */
4109 const U8 *e2 = NULL;
4111 STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
4112 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4113 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
4114 U8 flags_for_folder = FOLD_FLAGS_FULL;
4116 PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
4118 assert( ! ( (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
4119 && (( (flags & FOLDEQ_S1_ALREADY_FOLDED)
4120 && !(flags & FOLDEQ_S1_FOLDS_SANE))
4121 || ( (flags & FOLDEQ_S2_ALREADY_FOLDED)
4122 && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
4123 /* The algorithm is to trial the folds without regard to the flags on
4124 * the first line of the above assert(), and then see if the result
4125 * violates them. This means that the inputs can't be pre-folded to a
4126 * violating result, hence the assert. This could be changed, with the
4127 * addition of extra tests here for the already-folded case, which would
4128 * slow it down. That cost is more than any possible gain for when these
4129 * flags are specified, as the flags indicate /il or /iaa matching which
4130 * is less common than /iu, and I (khw) also believe that real-world /il
4131 * and /iaa matches are most likely to involve code points 0-255, and this
4132 * function only under rare conditions gets called for 0-255. */
4134 if (flags & FOLDEQ_LOCALE) {
4135 if (IN_UTF8_CTYPE_LOCALE) {
4136 if (UNLIKELY(PL_in_utf8_turkic_locale)) {
4137 flags_for_folder |= FOLD_FLAGS_LOCALE;
4140 flags &= ~FOLDEQ_LOCALE;
4144 flags_for_folder |= FOLD_FLAGS_LOCALE;
4147 if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
4148 flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
4156 g1 = (const U8*)s1 + l1;
4164 g2 = (const U8*)s2 + l2;
4167 /* Must have at least one goal */
4172 /* Will never match if goal is out-of-bounds */
4173 assert(! e1 || e1 >= g1);
4175 /* Here, there isn't an end pointer, or it is beyond the goal. We
4176 * only go as far as the goal */
4180 assert(e1); /* Must have an end for looking at s1 */
4183 /* Same for goal for s2 */
4185 assert(! e2 || e2 >= g2);
4192 /* If both operands are already folded, we could just do a memEQ on the
4193 * whole strings at once, but it would be better if the caller realized
4194 * this and didn't even call us */
4196 /* Look through both strings, a character at a time */
4197 while (p1 < e1 && p2 < e2) {
4199 /* If at the beginning of a new character in s1, get its fold to use
4200 * and the length of the fold. */
4202 if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4208 if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
4210 /* We have to forbid mixing ASCII with non-ASCII if the
4211 * flags so indicate. And, we can short circuit having to
4212 * call the general functions for this common ASCII case,
4213 * all of whose non-locale folds are also ASCII, and hence
4214 * UTF-8 invariants, so the UTF8ness of the strings is not
4216 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4220 *foldbuf1 = toFOLD(*p1);
4223 _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
4225 else { /* Not UTF-8, get UTF-8 fold */
4226 _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
4232 if (n2 == 0) { /* Same for s2 */
4233 if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4235 /* Point to the already-folded character. But for non-UTF-8
4236 * variants, convert to UTF-8 for the algorithm below */
4237 if (UTF8_IS_INVARIANT(*p2)) {
4246 foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
4247 foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
4253 if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
4254 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4258 *foldbuf2 = toFOLD(*p2);
4261 _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
4264 _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
4270 /* Here f1 and f2 point to the beginning of the strings to compare.
4271 * These strings are the folds of the next character from each input
4272 * string, stored in UTF-8. */
4274 /* While there is more to look for in both folds, see if they
4275 * continue to match */
4277 U8 fold_length = UTF8SKIP(f1);
4278 if (fold_length != UTF8SKIP(f2)
4279 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4280 function call for single
4282 || memNE((char*)f1, (char*)f2, fold_length))
4284 return 0; /* mismatch */
4287 /* Here, they matched, advance past them */
4294 /* When reach the end of any fold, advance the input past it */
4296 p1 += u1 ? UTF8SKIP(p1) : 1;
4299 p2 += u2 ? UTF8SKIP(p2) : 1;
4301 } /* End of loop through both strings */
4303 /* A match is defined by each scan that specified an explicit length
4304 * reaching its final goal, and the other not having matched a partial
4305 * character (which can happen when the fold of a character is more than one
4307 if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
4311 /* Successful match. Set output pointers */
4322 * ex: set ts=8 sts=4 sw=4 et: