static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
-/* strlen() of a literal string constant. We might want this more general,
- * but using it in just this file for now. A problem with more generality is
- * the compiler warnings about comparing unlike signs */
-#define STRLENs(s) (sizeof("" s "") - 1)
-
/*
These are various utility functions for manipulating UTF8-encoded
strings. For the uninitiated, this is a method of representing arbitrary
within non-zero characters.
*/
-/* helper for Perl__force_out_malformed_utf8_message(). Like
- * SAVECOMPILEWARNINGS(), but works with PL_curcop rather than
- * PL_compiling */
-
-static void
-S_restore_cop_warnings(pTHX_ void *p)
-{
- free_and_set_cop_warnings(PL_curcop, (STRLEN*) p);
-}
-
-
void
Perl__force_out_malformed_utf8_message(pTHX_
const U8 *const p, /* First byte in UTF-8 sequence */
PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
if (PL_curcop) {
- /* this is like SAVECOMPILEWARNINGS() except with PL_curcop rather
- * than PL_compiling */
- SAVEDESTRUCTOR_X(S_restore_cop_warnings,
- (void*)PL_curcop->cop_warnings);
+ SAVECURCOPWARNINGS();
PL_curcop->cop_warnings = pWARN_ALL;
}
const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
" may not be portable";
-#define HANDLE_UNICODE_SURROGATE(uv, flags, msgs) \
- STMT_START { \
- if (flags & UNICODE_WARN_SURROGATE) { \
- U32 category = packWARN(WARN_SURROGATE); \
- const char * format = surrogate_cp_format; \
- if (msgs) { \
- *msgs = new_msg_hv(Perl_form(aTHX_ format, uv), \
- category, \
- UNICODE_GOT_SURROGATE); \
- } \
- else { \
- Perl_ck_warner_d(aTHX_ category, format, uv); \
- } \
- } \
- if (flags & UNICODE_DISALLOW_SURROGATE) { \
- return NULL; \
- } \
- } STMT_END;
-
-#define HANDLE_UNICODE_NONCHAR(uv, flags, msgs) \
- STMT_START { \
- if (flags & UNICODE_WARN_NONCHAR) { \
- U32 category = packWARN(WARN_NONCHAR); \
- const char * format = nonchar_cp_format; \
- if (msgs) { \
- *msgs = new_msg_hv(Perl_form(aTHX_ format, uv), \
- category, \
- UNICODE_GOT_NONCHAR); \
- } \
- else { \
- Perl_ck_warner_d(aTHX_ category, format, uv); \
- } \
- } \
- if (flags & UNICODE_DISALLOW_NONCHAR) { \
- return NULL; \
- } \
- } STMT_END;
-
/* Use shorter names internally in this file */
#define SHIFT UTF_ACCUMULATION_SHIFT
#undef MARK
/* Undocumented; we don't want people using this. Instead they should use
* uvchr_to_utf8_flags_msgs() */
U8 *
-Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs)
+Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, UV flags, HV** msgs)
{
+ U8 *p;
+ UV shifted_uv = input_uv;
+ STRLEN utf8_skip = OFFUNISKIP(input_uv);
+
PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
if (msgs) {
*msgs = NULL;
}
- if (OFFUNI_IS_INVARIANT(uv)) {
- *d++ = LATIN1_TO_NATIVE(uv);
+ switch (utf8_skip) {
+ case 1:
+ *d++ = LATIN1_TO_NATIVE(input_uv);
return d;
- }
- if (uv <= MAX_UTF8_TWO_BYTE) {
- *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
- *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
- return d;
- }
+ default:
+ if ( UNLIKELY(input_uv > MAX_LEGAL_CP
+ && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX))))
+ {
+ Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, /* Hex output */
+ NULL, 0, input_uv));
+ }
- /* Not 2-byte; test for and handle 3-byte result. In the test immediately
- * below, the 16 is for start bytes E0-EF (which are all the possible ones
- * for 3 byte characters). The 2 is for 2 continuation bytes; these each
- * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
- * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
- * 0x800-0xFFFF on ASCII */
- if (uv < (16 * (1U << (2 * SHIFT)))) {
- *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
- *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
- *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
-
-#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
- aren't tested here */
- /* The most likely code points in this range are below the surrogates.
- * Do an extra test to quickly exclude those. */
- if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
- if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
- || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
- {
- HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
+ if ((flags & (UNICODE_WARN_PERL_EXTENDED|UNICODE_WARN_SUPER))) {
+ U32 category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
+ const char * format = PL_extended_cp_format;
+ if (msgs) {
+ *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
+ category,
+ UNICODE_GOT_PERL_EXTENDED);
}
- else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
- HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
+ else {
+ Perl_ck_warner_d(aTHX_ category, format, input_uv);
}
+
+ /* Don't output a 2nd msg */
+ flags &= ~UNICODE_WARN_SUPER;
}
-#endif
- return d;
- }
- /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
- * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
- * happen starting with 4-byte characters on ASCII platforms. We unify the
- * code for these with EBCDIC, even though some of them require 5-bytes on
- * those, because khw believes the code saving is worth the very slight
- * performance hit on these high EBCDIC code points. */
+ if (flags & UNICODE_DISALLOW_PERL_EXTENDED) {
+ return NULL;
+ }
- if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
- if (UNLIKELY( uv > MAX_LEGAL_CP
- && ! (flags & UNICODE_ALLOW_ABOVE_IV_MAX)))
- {
- Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv));
+ p = d + utf8_skip - 1;
+ while (p >= d + 6 + ONE_IF_EBCDIC_ZERO_IF_NOT) {
+ *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+ shifted_uv >>= SHIFT;
}
- if ( (flags & UNICODE_WARN_SUPER)
- || ( (flags & UNICODE_WARN_PERL_EXTENDED)
- && UNICODE_IS_PERL_EXTENDED(uv)))
- {
- const char * format = super_cp_format;
- U32 category = packWARN(WARN_NON_UNICODE);
- U32 flag = UNICODE_GOT_SUPER;
-
- /* Choose the more dire applicable warning */
- if (UNICODE_IS_PERL_EXTENDED(uv)) {
- format = PL_extended_cp_format;
- category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
- if (flags & (UNICODE_WARN_PERL_EXTENDED
- |UNICODE_DISALLOW_PERL_EXTENDED))
- {
- flag = UNICODE_GOT_PERL_EXTENDED;
+
+ /* FALLTHROUGH */
+
+ case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:
+ d[5 + ONE_IF_EBCDIC_ZERO_IF_NOT]
+ = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+ shifted_uv >>= SHIFT;
+ /* FALLTHROUGH */
+
+ case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:
+ d[4 + ONE_IF_EBCDIC_ZERO_IF_NOT]
+ = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+ shifted_uv >>= SHIFT;
+ /* FALLTHROUGH */
+
+ case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
+ if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) {
+ if (flags & UNICODE_WARN_SUPER) {
+ U32 category = packWARN(WARN_NON_UNICODE);
+ const char * format = super_cp_format;
+
+ if (msgs) {
+ *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
+ category,
+ UNICODE_GOT_SUPER);
+ }
+ else {
+ Perl_ck_warner_d(aTHX_ category, format, input_uv);
}
- }
- if (msgs) {
- *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),
- category, flag);
+ if (flags & UNICODE_DISALLOW_SUPER) {
+ return NULL;
+ }
}
- else if ( ckWARN_d(WARN_NON_UNICODE)
- || ( (flag & UNICODE_GOT_PERL_EXTENDED)
- && ckWARN(WARN_PORTABLE)))
+ if ( (flags & UNICODE_DISALLOW_SUPER)
+ || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
+ && UNICODE_IS_PERL_EXTENDED(input_uv)))
{
- Perl_warner(aTHX_ category, format, uv);
+ return NULL;
}
}
- if ( (flags & UNICODE_DISALLOW_SUPER)
- || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED)
- && UNICODE_IS_PERL_EXTENDED(uv)))
- {
- return NULL;
- }
- }
- else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
- HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
- }
- /* Test for and handle 4-byte result. In the test immediately below, the
- * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
- * characters). The 3 is for 3 continuation bytes; these each contribute
- * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
- * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
- * 0x1_0000-0x1F_FFFF on ASCII */
- if (uv < (8 * (1U << (3 * SHIFT)))) {
- *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
- *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
- *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
- *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
-
-#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
- characters. The end-plane non-characters for EBCDIC were
- handled just above */
- if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
- HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
- }
- else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
- HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
+ d[3 + ONE_IF_EBCDIC_ZERO_IF_NOT]
+ = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+ shifted_uv >>= SHIFT;
+ /* FALLTHROUGH */
+
+ case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
+ if (isUNICODE_POSSIBLY_PROBLEMATIC(input_uv)) {
+ if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) {
+ if (flags & UNICODE_WARN_NONCHAR) {
+ U32 category = packWARN(WARN_NONCHAR);
+ const char * format = nonchar_cp_format;
+ if (msgs) {
+ *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
+ category,
+ UNICODE_GOT_NONCHAR);
+ }
+ else {
+ Perl_ck_warner_d(aTHX_ category, format, input_uv);
+ }
+ }
+ if (flags & UNICODE_DISALLOW_NONCHAR) {
+ return NULL;
+ }
+ }
+ else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
+ if (flags & UNICODE_WARN_SURROGATE) {
+ U32 category = packWARN(WARN_SURROGATE);
+ const char * format = surrogate_cp_format;
+ if (msgs) {
+ *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
+ category,
+ UNICODE_GOT_SURROGATE);
+ }
+ else {
+ Perl_ck_warner_d(aTHX_ category, format, input_uv);
+ }
+ }
+ if (flags & UNICODE_DISALLOW_SURROGATE) {
+ return NULL;
+ }
+ }
}
-#endif
- return d;
- }
+ d[2 + ONE_IF_EBCDIC_ZERO_IF_NOT]
+ = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+ shifted_uv >>= SHIFT;
+ /* FALLTHROUGH */
- /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
- * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
- * format. The unrolled version above turns out to not save all that much
- * time, and at these high code points (well above the legal Unicode range
- * on ASCII platforms, and well above anything in common use in EBCDIC),
- * khw believes that less code outweighs slight performance gains. */
+#ifdef EBCDIC
- {
- STRLEN len = OFFUNISKIP(uv);
- U8 *p = d+len-1;
- while (p > d) {
- *p-- = I8_TO_NATIVE_UTF8((uv & MASK) | MARK);
- uv >>= SHIFT;
- }
- *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
- return d+len;
+ case 3:
+ d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+ shifted_uv >>= SHIFT;
+ /* FALLTHROUGH */
+
+#endif
+
+ /* FALLTHROUGH */
+ case 2:
+ d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+ shifted_uv >>= SHIFT;
+ d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip))
+ | UTF_START_MARK(utf8_skip));
+ break;
}
+
+ return d + utf8_skip;
}
/*
return uvchr_to_utf8_flags(d, uv, flags);
}
-#ifndef UV_IS_QUAD
-
-STATIC int
-S_is_utf8_cp_above_31_bits(const U8 * const s,
- const U8 * const e,
- const bool consider_overlongs)
-{
- /* Returns TRUE if the first code point represented by the Perl-extended-
- * UTF-8-encoded string starting at 's', and looking no further than 'e -
- * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
- *
- * The function handles the case where the input bytes do not include all
- * the ones necessary to represent a full character. That is, they may be
- * the intial bytes of the representation of a code point, but possibly
- * the final ones necessary for the complete representation may be beyond
- * 'e - 1'.
- *
- * The function also can handle the case where the input is an overlong
- * sequence. If 'consider_overlongs' is 0, the function assumes the
- * input is not overlong, without checking, and will return based on that
- * assumption. If this parameter is 1, the function will go to the trouble
- * of figuring out if it actually evaluates to above or below 31 bits.
- *
- * The sequence is otherwise assumed to be well-formed, without checking.
- */
-
- const STRLEN len = e - s;
- int is_overlong;
-
- PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
-
- assert(! UTF8_IS_INVARIANT(*s) && e > s);
-
-#ifdef EBCDIC
-
- PERL_UNUSED_ARG(consider_overlongs);
-
- /* On the EBCDIC code pages we handle, only the native start byte 0xFE can
- * mean a 32-bit or larger code point (0xFF is an invariant). 0xFE can
- * also be the start byte for a 31-bit code point; we need at least 2
- * bytes, and maybe up through 8 bytes, to determine that. (It can also be
- * the start byte for an overlong sequence, but for 30-bit or smaller code
- * points, so we don't have to worry about overlongs on EBCDIC.) */
- if (*s != 0xFE) {
- return 0;
- }
-
- if (len == 1) {
- return -1;
- }
-
-#else
-
- /* On ASCII, FE and FF are the only start bytes that can evaluate to
- * needing more than 31 bits. */
- if (LIKELY(*s < 0xFE)) {
- return 0;
- }
-
- /* What we have left are FE and FF. Both of these require more than 31
- * bits unless they are for overlongs. */
- if (! consider_overlongs) {
- return 1;
- }
-
- /* Here, we have FE or FF. If the input isn't overlong, it evaluates to
- * above 31 bits. But we need more than one byte to discern this, so if
- * passed just the start byte, it could be an overlong evaluating to
- * smaller */
- if (len == 1) {
- return -1;
- }
-
- /* Having excluded len==1, and knowing that FE and FF are both valid start
- * bytes, we can call the function below to see if the sequence is
- * overlong. (We don't need the full generality of the called function,
- * but for these huge code points, speed shouldn't be a consideration, and
- * the compiler does have enough information, since it's static to this
- * file, to optimize to just the needed parts.) */
- is_overlong = is_utf8_overlong(s, len);
-
- /* If it isn't overlong, more than 31 bits are required. */
- if (is_overlong == 0) {
- return 1;
- }
-
- /* If it is indeterminate if it is overlong, return that */
- if (is_overlong < 0) {
- return -1;
- }
-
- /* Here is overlong. Such a sequence starting with FE is below 31 bits, as
- * the max it can be is 2**31 - 1 */
- if (*s == 0xFE) {
- return 0;
- }
-
-#endif
-
- /* Here, ASCII and EBCDIC rejoin:
- * On ASCII: We have an overlong sequence starting with FF
- * On EBCDIC: We have a sequence starting with FE. */
-
- { /* For C89, use a block so the declaration can be close to its use */
-
-#ifdef EBCDIC
-
- /* U+7FFFFFFF (2 ** 31 - 1)
- * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
- * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
- * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
- * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
- * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
- * U+80000000 (2 ** 31):
- * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
- * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
- * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
- * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
- *
- * and since we know that *s = \xfe, any continuation sequcence
- * following it that is gt the below is above 31 bits
- [0] [1] [2] [3] [4] [5] [6] */
- const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
-
-#else
-
- /* FF overlong for U+7FFFFFFF (2 ** 31 - 1)
- * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF
- * FF overlong for U+80000000 (2 ** 31):
- * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80
- * and since we know that *s = \xff, any continuation sequcence
- * following it that is gt the below is above 30 bits
- [0] [1] [2] [3] [4] [5] [6] */
- const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
-
-
-#endif
- const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
- const STRLEN cmp_len = MIN(conts_len, len - 1);
-
- /* Now compare the continuation bytes in s with the ones we have
- * compiled in that are for the largest 30 bit code point. If we have
- * enough bytes available to determine the answer, or the bytes we do
- * have differ from them, we can compare the two to get a definitive
- * answer (Note that in UTF-EBCDIC, the two lowest possible
- * continuation bytes are \x41 and \x42.) */
- if (cmp_len >= conts_len || memNE(s + 1,
- conts_for_highest_30_bit,
- cmp_len))
- {
- return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len));
- }
-
- /* Here, all the bytes we have are the same as the highest 30-bit code
- * point, but we are missing so many bytes that we can't make the
- * determination */
- return -1;
- }
-}
-
-#endif
-
PERL_STATIC_INLINE int
S_is_utf8_overlong(const U8 * const s, const STRLEN len)
{
* convert each byte to I8, but it's very rare input indeed that would
* approach overflow, so the loop below will likely only get executed once.)
*
- * 'e' - 1 must not be beyond a full character. */
-
+ */
+ const STRLEN len = e - s;
+ const U8 *x;
+ const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF;
+ int is_overlong = 0;
PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
- assert(s <= e && s + UTF8SKIP(s) >= e);
-#if ! defined(UV_IS_QUAD)
+ for (x = s; x < e; x++, y++) {
- return is_utf8_cp_above_31_bits(s, e, consider_overlongs);
+ /* 'y' is set up to not include the trailing bytes that are all the
+ * maximum possible continuation byte. So when we reach the end of 'y'
+ * (known to be NUL terminated), it is impossible for 'x' to contain
+ * bytes larger than those omitted bytes, and therefore 'x' can't
+ * overflow */
+ if (*y == '\0') {
+ return 0;
+ }
-#else
+ /* If this byte is less than the corresponding highest non-overflowing
+ * UTF-8, the sequence doesn't overflow */
+ if (NATIVE_UTF8_TO_I8(*x) < *y) {
+ return 0;
+ }
- PERL_UNUSED_ARG(consider_overlongs);
+ if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
+ goto overflows_if_not_overlong;
+ }
+ }
- {
- const STRLEN len = e - s;
- const U8 *x;
- const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF;
-
- for (x = s; x < e; x++, y++) {
-
- /* 'y' is set up to not include the trailing bytes that are all the
- * maximum possible continuation byte. So when we reach the end of
- * 'y' (known to be NUL terminated), it is impossible for 'x' to
- * contain bytes larger than those omitted bytes, and therefore 'x'
- * can't overflow */
- if (*y == '\0') {
- return 0;
- }
+ /* Got to the end, and all bytes are the same. If the input is a whole
+ * character, it doesn't overflow. And if it is a partial character,
+ * there's not enough information to tell */
+ return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1;
- if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
- continue;
- }
+ overflows_if_not_overlong:
- /* If this byte is larger than the corresponding highest UTF-8
- * byte, the sequence overflow; otherwise the byte is less than,
- * and so the sequence doesn't overflow */
- return NATIVE_UTF8_TO_I8(*x) > *y;
+ /* Here, a well-formed sequence overflows. If we are assuming
+ * well-formedness, return that it overflows. */
+ if (! consider_overlongs) {
+ return 1;
+ }
- }
+ /* Here, it could be the overlong malformation, and might not actually
+ * overflow if you were to calculate it out.
+ *
+ * See if it actually is overlong */
+ is_overlong = is_utf8_overlong(s, len);
- /* Got to the end and all bytes are the same. If the input is a whole
- * character, it doesn't overflow. And if it is a partial character,
- * there's not enough information to tell */
- if (len < STRLENs(HIGHEST_REPRESENTABLE_UTF)) {
- return -1;
- }
+ /* If it isn't overlong, is well-formed, so overflows */
+ if (is_overlong == 0) {
+ return 1;
+ }
+
+ /* Not long enough to determine */
+ if (is_overlong < 0) {
+ return -1;
+ }
+
+ /* Here, it appears to overflow, but it is also overlong */
+
+#if 6 * UTF_CONTINUATION_BYTE_INFO_BITS <= IVSIZE * CHARBITS
+
+ /* On many platforms, it is impossible for an overlong to overflow. For
+ * these, no further work is necessary: we can return immediately that this
+ * overlong that is an apparent overflow actually isn't
+ *
+ * To see why, note that a length_N sequence can represent as overlongs all
+ * the code points representable by shorter length sequences, but no
+ * higher. If it could represent a higher code point without being an
+ * overlong, we wouldn't have had to increase the sequence length!
+ *
+ * The highest possible start byte is FF; the next highest is FE. The
+ * highest code point representable as an overlong on the platform is thus
+ * the highest code point representable by a non-overlong sequence whose
+ * start byte is FE. If that value doesn't overflow the platform's word
+ * size, overlongs can't overflow.
+ *
+ * FE consists of 7 bytes total; the FE start byte contributes 0 bits of
+ * information (the high 7 bits, all ones, say that the sequence is 7 bytes
+ * long, and the bottom, zero, bit is s placeholder. That leaves the 6
+ * continuation bytes to contribute UTF_CONTINUATION_BYTE_INFO_BITS each.
+ If that number of bits doesn't exceed the word size, it can't overflow. */
+
+ return 0;
+
+#else
+ /* In practice, only a 32-bit ASCII box gets here. The FE start byte can
+ * represent, as an overlong, the highest code point representable by an FD
+ * start byte, which is 5*6 continuation bytes of info plus one bit from
+ * the start byte, or 31 bits. That doesn't overflow. More explicitly:
+ * \xFD\xBF\xBF\xBF\xBF\xBF evaluates to 0x7FFFFFFF = 2*31 - 1.
+ *
+ * That means only the FF start byte can have an overflowing overlong. */
+ if (*s < 0xFF) {
return 0;
}
+ /* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that
+ * evaluates to 2**31, so overflows an IV. For a UV it's
+ * \xff\x80\x80\x80\x80\x80\x80\x83 = 2**32 */
+# define OVERFLOWS "\xff\x80\x80\x80\x80\x80\x80\x82"
+
+ if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS)) { /* Not enough info */
+ return -1;
+ }
+
+# define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0)
+
+ return strnGE((const char *) s, OVERFLOWS, STRLENs(OVERFLOWS));
+
#endif
}
-#undef FF_OVERLONG_PREFIX
-
STRLEN
-Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
+Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
{
- STRLEN len;
- const U8 *x;
+ SSize_t len, full_len;
- /* A helper function that should not be called directly.
- *
- * This function returns non-zero if the string beginning at 's' and
- * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
- * code point; otherwise it returns 0. The examination stops after the
- * first code point in 's' is validated, not looking at the rest of the
- * input. If 'e' is such that there are not enough bytes to represent a
- * complete code point, this function will return non-zero anyway, if the
- * bytes it does have are well-formed UTF-8 as far as they go, and aren't
- * excluded by 'flags'.
- *
- * A non-zero return gives the number of bytes required to represent the
- * code point. Be aware that if the input is for a partial character, the
- * return will be larger than 'e - s'.
- *
- * This function assumes that the code point represented is UTF-8 variant.
- * The caller should have excluded the possibility of it being invariant
- * before calling this function.
+ /* An internal helper function.
*
+ * On input:
+ * 's' is a string, which is known to be syntactically valid UTF-8 as far
+ * as (e - 1); e > s must hold.
+ * 'e' This function is allowed to look at any byte from 's'...'e-1', but
+ * nowhere else. The function has to cope as best it can if that
+ * sequence does not form a full character.
* 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
- * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
- * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
- * disallowed by the flags. If the input is only for a partial character,
- * the function will return non-zero if there is any sequence of
- * well-formed UTF-8 that, when appended to the input sequence, could
- * result in an allowed code point; otherwise it returns 0. Non characters
- * cannot be determined based on partial character input. But many of the
- * other excluded types can be determined with just the first one or two
- * bytes.
+ * accepted by L</utf8n_to_uvchr>. If non-zero, this function returns
+ * 0 if it determines the input will match something disallowed.
+ * On output:
+ * The return is the number of bytes required to represent the code point
+ * if it isn't disallowed by 'flags'; 0 otherwise. Be aware that if the
+ * input is for a partial character, a successful return will be larger
+ * than 'e - s'.
+ *
+ * If *s..*(e-1) is only for a partial character, the function will return
+ * non-zero if there is any sequence of well-formed UTF-8 that, when
+ * appended to the input sequence, could result in an allowed code point;
+ * otherwise it returns 0. Non characters cannot be determined based on
+ * partial character input. But many of the other excluded types can be
+ * determined with just the first one or two bytes.
*
*/
- PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER;
+ PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_;
+ assert(e > s);
assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
|UTF8_DISALLOW_PERL_EXTENDED)));
- assert(! UTF8_IS_INVARIANT(*s));
- /* A variant char must begin with a start byte */
- if (UNLIKELY(! UTF8_IS_START(*s))) {
- return 0;
- }
+ full_len = UTF8SKIP(s);
- /* Examine a maximum of a single whole code point */
- if (e - s > UTF8SKIP(s)) {
- e = s + UTF8SKIP(s);
+ len = e - s;
+ if (len > full_len) {
+ e = s + full_len;
+ len = full_len;
}
- len = e - s;
+ switch (full_len) {
+ bool is_super;
- if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
- const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
+ default: /* Extended */
+ if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
+ return 0;
+ }
- /* Here, we are disallowing some set of largish code points, and the
- * first byte indicates the sequence is for a code point that could be
- * in the excluded set. We generally don't have to look beyond this or
- * the second byte to see if the sequence is actually for one of the
- * excluded classes. The code below is derived from this table:
- *
- * UTF-8 UTF-EBCDIC I8
- * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
- * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
- * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
- *
- * Keep in mind that legal continuation bytes range between \x80..\xBF
- * for UTF-8, and \xA0..\xBF for I8. Anything above those aren't
- * continuation bytes. Hence, we don't have to test the upper edge
- * because if any of those is encountered, the sequence is malformed,
- * and would fail elsewhere in this function.
- *
- * The code here likewise assumes that there aren't other
- * malformations; again the function should fail elsewhere because of
- * these. For example, an overlong beginning with FC doesn't actually
- * have to be a super; it could actually represent a small code point,
- * even U+0000. But, since overlongs (and other malformations) are
- * illegal, the function should return FALSE in either case.
- */
-
-#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
-# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
-# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
-
-# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
- /* B6 and B7 */ \
- && ((s1) & 0xFE ) == 0xB6)
-#else
-# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
-# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
-# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
-#endif
+ /* FALLTHROUGH */
- if ( (flags & UTF8_DISALLOW_SUPER)
- && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
- {
- return 0; /* Above Unicode */
+ case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */
+ case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT: /* above Unicode */
+
+ if (flags & UTF8_DISALLOW_SUPER) {
+ return 0; /* Above Unicode */
}
- if ( (flags & UTF8_DISALLOW_PERL_EXTENDED)
- && UNLIKELY(UTF8_IS_PERL_EXTENDED(s)))
+ return full_len;
+
+ case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
+ is_super = ( UNLIKELY(NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_)
+ || ( len > 1
+ && NATIVE_UTF8_TO_I8(s[0]) == UTF_START_BYTE_110000_
+ && NATIVE_UTF8_TO_I8(s[1])
+ >= UTF_FIRST_CONT_BYTE_110000_));
+ if (is_super) {
+ if (flags & UTF8_DISALLOW_SUPER) {
+ return 0;
+ }
+ }
+ else if ( (flags & UTF8_DISALLOW_NONCHAR)
+ && len == full_len
+ && UNLIKELY(is_LARGER_NON_CHARS_utf8(s)))
{
return 0;
}
- if (len > 1) {
- const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+ return full_len;
- if ( (flags & UTF8_DISALLOW_SUPER)
- && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
- {
- return 0; /* Above Unicode */
- }
+ case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
- if ( (flags & UTF8_DISALLOW_SURROGATE)
- && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
- {
- return 0; /* Surrogate */
- }
+ if (! isUTF8_POSSIBLY_PROBLEMATIC(s[0]) || len < 2) {
+ return full_len;
+ }
- if ( (flags & UTF8_DISALLOW_NONCHAR)
- && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
- {
- return 0; /* Noncharacter code point */
- }
+ if ( (flags & UTF8_DISALLOW_SURROGATE)
+ && UNLIKELY(is_SURROGATE_utf8(s)))
+ {
+ return 0; /* Surrogate */
}
+
+ if ( (flags & UTF8_DISALLOW_NONCHAR)
+ && len == full_len
+ && UNLIKELY(is_SHORTER_NON_CHARS_utf8(s)))
+ {
+ return 0;
+ }
+
+ return full_len;
+
+ /* The lower code points don't have any disallowable characters */
+#ifdef EBCDIC
+ case 3:
+ return full_len;
+#endif
+
+ case 2:
+ case 1:
+ return full_len;
}
+}
+
+Size_t
+Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
+ const bool require_partial)
+{
+ /* This is called to determine if the UTF-8 sequence starting at s0 and
+ * continuing for up to one full character of bytes, but looking no further
+ * than 'e - 1', is legal. *s0 must be 0xFF (or whatever the native
+ * equivalent of FF in I8 on EBCDIC platforms is). This marks it as being
+ * for the largest code points recognized by Perl, the ones that require
+ * the most UTF-8 bytes per character to represent (somewhat less than
+ * twice the size of the next longest kind). This sequence will only ever
+ * be Perl extended UTF-8.
+ *
+ * The routine returns 0 if the sequence is not fully valid, syntactically
+ * or semantically. That means it checks that everything following the
+ * start byte is a continuation byte, and that it doesn't overflow, nor is
+ * an overlong representation.
+ *
+ * If 'require_partial' is FALSE, the routine returns non-zero only if the
+ * input (as far as 'e-1') is a full character. The return is the count of
+ * the bytes in the character.
+ *
+ * If 'require_partial' is TRUE, the routine returns non-zero only if the
+ * input as far as 'e-1' is a partial, not full character, with no
+ * malformations found before position 'e'. The return is either just
+ * FALSE, or TRUE. */
+
+ const U8 *s = s0 + 1;
+ const U8 *send = e;
+
+ PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_;
+
+ assert(s0 < e);
+ assert(*s0 == I8_TO_NATIVE_UTF8(0xFF));
- /* Make sure that all that follows are continuation bytes */
- for (x = s + 1; x < e; x++) {
- if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
+ send = s + MIN(UTF8_MAXBYTES - 1, e - s);
+ while (s < send) {
+ if (! UTF8_IS_CONTINUATION(*s)) {
return 0;
}
+
+ s++;
}
- /* Here is syntactically valid. Next, make sure this isn't the start of an
- * overlong. */
- if (is_utf8_overlong(s, len) > 0) {
+ if (0 < does_utf8_overflow(s0, e,
+ FALSE /* Don't consider_overlongs */
+ )) {
return 0;
}
- /* And finally, that the code point represented fits in a word on this
- * platform */
- if (0 < does_utf8_overflow(s, e,
- 0 /* Don't consider overlongs */
- ))
- {
+ if (0 < isFF_overlong(s0, e - s0)) {
return 0;
}
- return UTF8SKIP(s);
+ /* Here, the character is valid as far as it got. Check if got a partial
+ * character */
+ if (s - s0 < UTF8_MAXBYTES) {
+ return (require_partial) ? 1 : 0;
+ }
+
+ /* Here, got a full character */
+ return (require_partial) ? 0 : UTF8_MAXBYTES;
}
char *
*/
s = s0;
- uv = *s0;
possible_problems = 0;
expectlen = 0;
avail_len = 0;
goto ready_to_handle_errors;
}
+ /* We now know we can examine the first byte of the input */
expectlen = UTF8SKIP(s);
+ uv = *s;
/* A well-formed UTF-8 character, as the vast majority of calls to this
* function will be for, has this expected length. For efficiency, set
- * things up here to return it. It will be overriden only in those rare
+ * things up here to return it. It will be overridden only in those rare
* cases where a malformation is found */
if (retlen) {
*retlen = expectlen;
/* uv is valid for overlongs */
if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
-
- /* isn't problematic if < this */
- && uv >= UNICODE_SURROGATE_FIRST)
+ && isUNICODE_POSSIBLY_PROBLEMATIC(uv))
|| ( UNLIKELY(possible_problems)
/* if overflow, we know without looking further
if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
possible_problems |= UTF8_GOT_SURROGATE;
}
- else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
+ else if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
possible_problems |= UTF8_GOT_SUPER;
}
else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
adjusted to be non-overlong */
if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
- >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
+ > UTF_START_BYTE_110000_))
{
possible_problems |= UTF8_GOT_SUPER;
}
else if (curlen > 1) {
- if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
- NATIVE_UTF8_TO_I8(*adjusted_s0),
- NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
+ if (UNLIKELY( NATIVE_UTF8_TO_I8(*adjusted_s0)
+ == UTF_START_BYTE_110000_
+ && NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1))
+ >= UTF_FIRST_CONT_BYTE_110000_))
{
possible_problems |= UTF8_GOT_SUPER;
}
- else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
- NATIVE_UTF8_TO_I8(*adjusted_s0),
- NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
- {
+ else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) {
possible_problems |= UTF8_GOT_SURROGATE;
}
}
* avail_len gives the available number of bytes passed in, but
* only if this is less than the expected number of
* bytes, based on the code point's start byte.
- * possible_problems' is 0 if there weren't any problems; otherwise a bit
+ * possible_problems is 0 if there weren't any problems; otherwise a bit
* is set in it for each potential problem found.
* uv contains the code point the input sequence
* represents; or if there is a problem that prevents
* a well-defined value from being computed, it is
- * some subsitute value, typically the REPLACEMENT
+ * some substitute value, typically the REPLACEMENT
* CHARACTER.
* s0 points to the first byte of the character
- * s points to just after were we left off processing
+ * s points to just after where we left off processing
* the character
* send points to just after where that character should
* end, based on how many bytes the start byte tells
uv, 0);
/* Don't use U+ for non-Unicode code points, which
* includes those in the Latin1 range */
- const char * preface = ( uv > PERL_UNICODE_MAX
+ const char * preface = ( UNICODE_IS_SUPER(uv)
#ifdef EBCDIC
- || uv <= 0xFF
+ || uv <= 0xFF
#endif
- )
+ )
? "0x"
: "U+";
message = Perl_form(aTHX_
/* Since there was a possible problem, the returned length may need to
* be changed from the one stored at the beginning of this function.
- * Instead of trying to figure out if that's needed, just do it. */
+ * Instead of trying to figure out if it has changed, just do it. */
if (retlen) {
*retlen = curlen;
}
return utf8_to_uvchr_buf_helper(s, send, retlen);
}
-/* This is marked as deprecated
- *
-=for apidoc utf8_to_uvuni_buf
-
-Only in very rare circumstances should code need to be dealing in Unicode
-(as opposed to native) code points. In those few cases, use
-C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|perlapi/utf8_to_uvchr_buf>> instead.
-If you are not absolutely sure this is one of those cases, then assume it isn't
-and use plain C<utf8_to_uvchr_buf> instead.
-
-Returns the Unicode (not-native) code point of the first character in the
-string C<s> which
-is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
-C<retlen> will be set to the length, in bytes, of that character.
-
-If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
-enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
-NULL) to -1. If those warnings are off, the computed value if well-defined (or
-the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
-is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
-next possible position in C<s> that could begin a non-malformed character.
-See L<perlapi/utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
-returned.
-
-=cut
-*/
-
-UV
-Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
-{
- PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
-
- assert(send > s);
-
- return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
-}
-
/*
=for apidoc utf8_length
return s;
}
- {
- U8 * const save = s;
- U8 * const send = s + *lenp;
- U8 * d;
-
- /* Nothing before the first variant needs to be changed, so start the real
- * work there */
- s = first_variant;
- while (s < send) {
+ /* Nothing before 'first_variant' needs to be changed, so start the real
+ * work there */
+
+ U8 * const save = s;
+ U8 * const send = s + *lenp;
+ U8 * d;
+
+#ifndef EBCDIC /* The below relies on the bit patterns of UTF-8 */
+
+ /* There is some start-up/tear-down overhead with this, so no real gain
+ * unless the string is long enough. The current value is just a
+ * guess. */
+ if (*lenp > 5 * PERL_WORDSIZE) {
+
+ /* First, go through the string a word at-a-time to verify that it is
+ * downgradable. If it contains any start byte besides C2 and C3, then
+ * it isn't. */
+
+ const PERL_UINTMAX_T C0_mask = PERL_COUNT_MULTIPLIER * 0xC0;
+ const PERL_UINTMAX_T C2_mask = PERL_COUNT_MULTIPLIER * 0xC2;
+ const PERL_UINTMAX_T FE_mask = PERL_COUNT_MULTIPLIER * 0xFE;
+
+ /* Points to the first byte >=s which is positioned at a word boundary.
+ * If s is on a word boundary, it is s, otherwise it is the first byte
+ * of the next word. */
+ U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
+ - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK);
+
+ /* Here there is at least a full word beyond the first word boundary.
+ * Process up to that boundary. */
+ while (s < partial_word_end) {
if (! UTF8_IS_INVARIANT(*s)) {
if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
*lenp = ((STRLEN) -1);
- return 0;
+ return NULL;
}
s++;
}
s++;
}
- /* Is downgradable, so do it */
- d = s = first_variant;
- while (s < send) {
- U8 c = *s++;
- if (! UVCHR_IS_INVARIANT(c)) {
- /* Then it is two-byte encoded */
- c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
- s++;
+ /* Adjust back down any overshoot */
+ s = partial_word_end;
+
+ /* Process per-word */
+ do {
+
+ PERL_UINTMAX_T C2_C3_start_bytes;
+
+ /* First find the bytes that are start bytes. ANDing with
+ * C0C0...C0 causes any start byte to become C0; any other byte
+ * becomes something else. Then XORing with C0 causes any start
+ * byte to become 0; all other bytes non-zero. */
+ PERL_UINTMAX_T start_bytes
+ = ((* (PERL_UINTMAX_T *) s) & C0_mask) ^ C0_mask;
+
+ /* These shifts causes the most significant bit to be set to 1 for
+ * any bytes in the word that aren't completely 0. Hence after
+ * these, only the start bytes have 0 in their msb */
+ start_bytes |= start_bytes << 1;
+ start_bytes |= start_bytes << 2;
+ start_bytes |= start_bytes << 4;
+
+ /* When we complement, then AND with 8080...80, the start bytes
+ * will have 1 in their msb, and all other bits are 0 */
+ start_bytes = ~ start_bytes & PERL_VARIANTS_WORD_MASK;
+
+ /* Now repeat the procedure, but look for bytes that match only
+ * C2-C3. */
+ C2_C3_start_bytes = ((* (PERL_UINTMAX_T *) s) & FE_mask)
+ ^ C2_mask;
+ C2_C3_start_bytes |= C2_C3_start_bytes << 1;
+ C2_C3_start_bytes |= C2_C3_start_bytes << 2;
+ C2_C3_start_bytes |= C2_C3_start_bytes << 4;
+ C2_C3_start_bytes = ~ C2_C3_start_bytes
+ & PERL_VARIANTS_WORD_MASK;
+
+ /* Here, start_bytes has a 1 in the msb of each byte that has a
+ * start_byte; And
+ * C2_C3_start_bytes has a 1 in the msb of each byte that has a
+ * start_byte of C2 or C3
+ * If they're not equal, there are start bytes that aren't C2
+ * nor C3, hence this is not downgradable */
+ if (start_bytes != C2_C3_start_bytes) {
+ *lenp = ((STRLEN) -1);
+ return NULL;
}
- *d++ = c;
+
+ s += PERL_WORDSIZE;
+ } while (s + PERL_WORDSIZE <= send);
+
+ /* If the final byte was a start byte, it means that the character
+ * straddles two words, so back off one to start looking below at the
+ * first byte of the character */
+ if (s > first_variant && UTF8_IS_START(*(s-1))) {
+ s--;
+ }
+ }
+
+#endif
+
+ /* Do the straggler bytes beyond the final word boundary (or all bytes
+ * in the case of EBCDIC) */
+ while (s < send) {
+ if (! UTF8_IS_INVARIANT(*s)) {
+ if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
+ *lenp = ((STRLEN) -1);
+ return NULL;
+ }
+ s++;
+ }
+ s++;
+ }
+
+ /* Here, we passed the tests above. For the EBCDIC case, everything
+ * was well-formed and can be downgraded to non-UTF8. For non-EBCDIC,
+ * it means only that all start bytes were C2 or C3, hence any
+ * well-formed sequences are downgradable. But we didn't test, for
+ * example, that there weren't two C2's in a row. That means that in
+ * the loop below, we have to be sure things are well-formed. Because
+ * this is very very likely, and we don't care about having speedy
+ * handling of malformed input, the loop proceeds as if well formed,
+ * and should a malformed one come along, it undoes what it already has
+ * done */
+
+ d = s = first_variant;
+
+ while (s < send) {
+ U8 * s1;
+
+ if (UVCHR_IS_INVARIANT(*s)) {
+ *d++ = *s++;
+ continue;
+ }
+
+ /* Here it is two-byte encoded. */
+ if ( LIKELY(UTF8_IS_DOWNGRADEABLE_START(*s))
+ && LIKELY(UTF8_IS_CONTINUATION((s[1]))))
+ {
+ U8 first_byte = *s++;
+ *d++ = EIGHT_BIT_UTF8_TO_NATIVE(first_byte, *s);
+ s++;
+ continue;
+ }
+
+ /* Here, it is malformed. This shouldn't happen on EBCDIC, and on
+ * ASCII platforms, we know that the only start bytes in the text
+ * are C2 and C3, and the code above has made sure that it doesn't
+ * end with a start byte. That means the only malformations that
+ * are possible are a start byte without a continuation (either
+ * followed by another start byte or an invariant) or an unexpected
+ * continuation.
+ *
+ * We have to undo all we've done before, back down to the first
+ * UTF-8 variant. Note that each 2-byte variant we've done so far
+ * (converted to single byte) slides things to the left one byte,
+ * and so we have bytes that haven't been written over.
+ *
+ * Here, 'd' points to the next position to overwrite, and 's'
+ * points to the first invalid byte. That means 'd's contents
+ * haven't been changed yet, nor has anything else beyond it in the
+ * string. In restoring to the original contents, we don't need to
+ * do anything past (d-1).
+ *
+ * In particular, the bytes from 'd' to 's' have not been changed.
+ * This loop uses a new variable 's1' (to avoid confusing 'source'
+ * and 'destination') set to 'd', and moves 's' and 's1' in lock
+ * step back so that afterwards, 's1' points to the first changed
+ * byte that will be the source for the first byte (or bytes) at
+ * 's' that need to be changed back. Note that s1 can expand to
+ * two bytes */
+ s1 = d;
+ while (s >= d) {
+ s--;
+ if (! UVCHR_IS_INVARIANT(*s1)) {
+ s--;
+ }
+ s1--;
}
- *d = '\0';
- *lenp = d - save;
- return save;
+ /* Do the changing back */
+ while (s1 >= first_variant) {
+ if (UVCHR_IS_INVARIANT(*s1)) {
+ *s-- = *s1--;
+ }
+ else {
+ *s-- = UTF8_EIGHT_BIT_LO(*s1);
+ *s-- = UTF8_EIGHT_BIT_HI(*s1);
+ s1--;
+ }
+ }
+
+ *lenp = ((STRLEN) -1);
+ return NULL;
}
+
+ /* Success! */
+ *d = '\0';
+ *lenp = d - save;
+
+ return save;
}
/*
}
/*
- * Convert native (big-endian) UTF-16 to UTF-8. For reversed (little-endian),
- * use utf16_to_utf8_reversed().
+ * Convert native UTF-16 to UTF-8. Called via the more public functions
+ * utf16_to_utf8() for big-endian and utf16_to_utf8_reversed() for
+ * little-endian,
*
- * UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes.
- * UTF-8 requires 1-3 bytes for every code point below 0x1000; otherwise 4 bytes.
- * UTF-EBCDIC requires 1-4 bytes for every code point below 0x1000; otherwise 4-5 bytes.
+ * 'p' is the UTF-16 input string, passed as a pointer to U8.
+ * 'bytelen' is its length (must be even)
+ * 'd' is the pointer to the destination buffer. The caller must ensure that
+ * the space is large enough. The maximum expansion factor is 2 times
+ * 'bytelen'. 1.5 if never going to run on an EBCDIC box.
+ * '*newlen' will contain the number of bytes this function filled of 'd'.
+ * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
+ * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE
*
- * These functions don't check for overflow. The worst case is every code
- * point in the input is 2 bytes, and requires 4 bytes on output. (If the code
- * is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.) Therefore the
- * destination must be pre-extended to 2 times the source length.
+ * The expansion factor is because UTF-16 requires 2 bytes for every code point
+ * below 0x10000; otherwise 4 bytes. UTF-8 requires 1-3 bytes for every code
+ * point below 0x1000; otherwise 4 bytes. UTF-EBCDIC requires 1-4 bytes for
+ * every code point below 0x1000; otherwise 4-5 bytes.
*
- * Do not use in-place. We optimize for native, for obvious reasons. */
+ * The worst case is where every code point is below U+10000, hence requiring 2
+ * UTF-16 bytes, but is U+0800 or higher on ASCII platforms, requiring 3 UTF-8
+ * bytes; or >= U+4000 on EBCDIC requiring 4 UTF-8 bytes.
+ *
+ * Do not use in-place. */
U8*
-Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
+Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen,
+ const bool high_byte, /* Which of next two bytes is
+ high order */
+ const bool low_byte)
{
U8* pend;
U8* dstart = d;
- PERL_ARGS_ASSERT_UTF16_TO_UTF8;
+ PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE;
if (bytelen & 1)
- Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf,
- (UV)bytelen);
-
+ Perl_croak(aTHX_ "panic: utf16_to_utf8%s: odd bytelen %" UVuf,
+ ((high_byte == 0) ? "" : "_reversed"), (UV)bytelen);
pend = p + bytelen;
while (p < pend) {
- UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
+
+ /* Next 16 bits is what we want. (The bool is cast to U8 because on
+ * platforms where a bool is implemented as a signed char, a compiler
+ * warning may be generated) */
+ U32 uv = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
p += 2;
- if (OFFUNI_IS_INVARIANT(uv)) {
- *d++ = LATIN1_TO_NATIVE((U8) uv);
- continue;
- }
- if (uv <= MAX_UTF8_TWO_BYTE) {
- *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
- *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
- continue;
- }
+
+ /* If it's a surrogate, we find the uv that the surrogate pair encodes.
+ * */
+ if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
#define LAST_HIGH_SURROGATE 0xDBFF
#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
#define FIRST_IN_PLANE1 0x10000
- /* This assumes that most uses will be in the first Unicode plane, not
- * needing surrogates */
- if (UNLIKELY(inRANGE(uv, UNICODE_SURROGATE_FIRST,
- UNICODE_SURROGATE_LAST)))
- {
if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
}
else {
- UV low = (p[0] << 8) + p[1];
- if (UNLIKELY(! inRANGE(low, FIRST_LOW_SURROGATE,
- LAST_LOW_SURROGATE)))
+ U32 low_surrogate = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
+ if (UNLIKELY(! inRANGE(low_surrogate, FIRST_LOW_SURROGATE,
+ LAST_LOW_SURROGATE)))
{
Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
}
+
p += 2;
- uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
- + (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1;
+
+ /* Here uv is the high surrogate. Combine with low surrogate
+ * just computed to form the actual U32 code point.
+ *
+ * From https://unicode.org/faq/utf_bom.html#utf16-4 */
+ uv = FIRST_IN_PLANE1 + (uv << 10) - (FIRST_HIGH_SURROGATE << 10)
+ + low_surrogate - FIRST_LOW_SURROGATE;
}
}
-#ifdef EBCDIC
- d = uvoffuni_to_utf8_flags(d, uv, 0);
-#else
- if (uv < FIRST_IN_PLANE1) {
- *d++ = (U8)(( uv >> 12) | 0xe0);
- *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
- *d++ = (U8)(( uv & 0x3f) | 0x80);
- continue;
- }
- else {
- *d++ = (U8)(( uv >> 18) | 0xf0);
- *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
- *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
- *d++ = (U8)(( uv & 0x3f) | 0x80);
- continue;
- }
-#endif
+
+ /* Here, 'uv' is the real U32 we want to find the UTF-8 of */
+ d = uvchr_to_utf8(d, uv);
}
+
*newlen = d - dstart;
return d;
}
-/* Note: this one is slightly destructive of the source. */
+U8*
+Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
+{
+ PERL_ARGS_ASSERT_UTF16_TO_UTF8;
+
+ return utf16_to_utf8(p, d, bytelen, newlen);
+}
U8*
Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
{
- U8* s = (U8*)p;
- U8* const send = s + bytelen;
-
PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
- if (bytelen & 1)
- Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
- (UV)bytelen);
+ return utf16_to_utf8_reversed(p, d, bytelen, newlen);
+}
+
+/*
+ * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for
+ * big-endian and utf8_to_utf16_reversed() for little-endian,
+ *
+ * 's' is the UTF-8 input string, passed as a pointer to U8.
+ * 'bytelen' is its length
+ * 'd' is the pointer to the destination buffer, currently passed as U8 *. The
+ * caller must ensure that the space is large enough. The maximum
+ * expansion factor is 2 times 'bytelen'. This happens when the input is
+ * entirely single-byte ASCII, expanding to two-byte UTF-16.
+ * '*newlen' will contain the number of bytes this function filled of 'd'.
+ * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
+ * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE
+ *
+ * Do not use in-place. */
+U8*
+Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen,
+ const bool high_byte, /* Which of next two bytes
+ is high order */
+ const bool low_byte)
+{
+ U8* send;
+ U8* dstart = d;
+
+ PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE;
+
+ send = s + bytelen;
while (s < send) {
- const U8 tmp = s[0];
- s[0] = s[1];
- s[1] = tmp;
- s += 2;
+ STRLEN retlen;
+ UV uv = utf8n_to_uvchr(s, send - s, &retlen,
+ /* No surrogates nor above-Unicode */
+ UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
+
+ /* The modern method is to keep going with malformed input,
+ * substituting the REPLACEMENT CHARACTER */
+ if (UNLIKELY(uv == 0 && *s != '\0')) {
+ uv = UNICODE_REPLACEMENT;
+ }
+
+ if (uv >= FIRST_IN_PLANE1) { /* Requires a surrogate pair */
+
+ /* From https://unicode.org/faq/utf_bom.html#utf16-4 */
+ U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10)
+ + FIRST_HIGH_SURROGATE;
+
+ /* (The bool is cast to U8 because on platforms where a bool is
+ * implemented as a signed char, a compiler warning may be
+ * generated) */
+ d[(U8) high_byte] = high_surrogate >> 8;
+ d[(U8) low_byte] = high_surrogate & nBIT_MASK(8);
+ d += 2;
+
+ /* The low surrogate is the lower 10 bits plus the offset */
+ uv &= nBIT_MASK(10);
+ uv += FIRST_LOW_SURROGATE;
+
+ /* Drop down to output the low surrogate like it were a
+ * non-surrogate */
+ }
+
+ d[(U8) high_byte] = uv >> 8;
+ d[(U8) low_byte] = uv & nBIT_MASK(8);
+ d += 2;
+
+ s += retlen;
}
- return utf16_to_utf8(p, d, bytelen, newlen);
+
+ *newlen = d - dstart;
+ return d;
}
bool
# define CF_AUX_TABLE_ptrs NULL
# define CF_AUX_TABLE_lengths NULL
#endif
-#ifndef HAS_UC_AUX_TABLES
-# define UC_AUX_TABLE_ptrs NULL
-# define UC_AUX_TABLE_lengths NULL
-#endif
/* Call the function to convert a UTF-8 encoded character to the specified case.
* Note that there may be more than one character in the result.
if (flags & FOLD_FLAGS_LOCALE) {
/* Treat a non-Turkic UTF-8 locale as not being in locale at all,
* except for potentially warning */
- _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
- if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) {
+ CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
+ if (IN_UTF8_CTYPE_LOCALE && ! IN_UTF8_TURKIC_LOCALE) {
flags &= ~FOLD_FLAGS_LOCALE;
}
else {
}
STATIC UV
-S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
- U8* ustrp, STRLEN *lenp,
- SV *invlist, const I32 * const invmap,
- const U32 * const * const aux_tables,
- const U8 * const aux_table_lengths,
- const char * const normal)
+S_to_case_cp_list(pTHX_
+ const UV original,
+ const U32 ** const remaining_list,
+ Size_t * remaining_count,
+ SV *invlist, const I32 * const invmap,
+ const U32 * const * const aux_tables,
+ const U8 * const aux_table_lengths,
+ const char * const normal)
{
- STRLEN len = 0;
+ SSize_t index;
+ I32 base;
- /* Change the case of code point 'uv1' whose UTF-8 representation (assumed
- * by this routine to be valid) begins at 'p'. 'normal' is a string to use
- * to name the new case in any generated messages, as a fallback if the
- * operation being used is not available. The new case is given by the
- * data structures in the remaining arguments.
+ /* Calculate the changed case of code point 'original'. The first code
+ * point of the changed case is returned.
*
- * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
- * entire changed case string, and the return value is the first code point
- * in that string */
+ * If 'remaining_count' is not NULL, *remaining_count will be set to how
+ * many *other* code points are in the changed case. If non-zero and
+ * 'remaining_list' is also not NULL, *remaining_list will be set to point
+ * to a non-modifiable array containing the second and potentially third
+ * code points in the changed case. (Unicode guarantees a maximum of 3.)
+ * Note that this means that *remaining_list is undefined unless there are
+ * multiple code points, and the caller has chosen to find out how many by
+ * making 'remaining_count' not NULL.
+ *
+ * 'normal' is a string to use to name the new case in any generated
+ * messages, as a fallback if the operation being used is not available.
+ *
+ * The casing to use is given by the data structures in the remaining
+ * arguments.
+ */
- PERL_ARGS_ASSERT__TO_UTF8_CASE;
+ PERL_ARGS_ASSERT_TO_CASE_CP_LIST;
- /* For code points that don't change case, we already know that the output
- * of this function is the unchanged input, so we can skip doing look-ups
- * for them. Unfortunately the case-changing code points are scattered
- * around. But there are some long consecutive ranges where there are no
- * case changing code points. By adding tests, we can eliminate the lookup
- * for all the ones in such ranges. This is currently done here only for
- * just a few cases where the scripts are in common use in modern commerce
- * (and scripts adjacent to those which can be included without additional
- * tests). */
-
- if (uv1 >= 0x0590) {
- /* This keeps from needing further processing the code points most
- * likely to be used in the following non-cased scripts: Hebrew,
- * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
- * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
- * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
- if (uv1 < 0x10A0) {
- goto cases_to_self;
- }
+ /* 'index' is guaranteed to be non-negative, as this is an inversion map
+ * that covers all possible inputs. See [perl #133365] */
+ index = _invlist_search(invlist, original);
+ base = invmap[index];
- /* The following largish code point ranges also don't have case
- * changes, but khw didn't think they warranted extra tests to speed
- * them up (which would slightly slow down everything else above them):
- * 1100..139F Hangul Jamo, Ethiopic
- * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
- * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
- * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
- * Combining Diacritical Marks Extended, Balinese,
- * Sundanese, Batak, Lepcha, Ol Chiki
- * 2000..206F General Punctuation
- */
-
- if (uv1 >= 0x2D30) {
-
- /* This keeps the from needing further processing the code points
- * most likely to be used in the following non-cased major scripts:
- * CJK, Katakana, Hiragana, plus some less-likely scripts.
- *
- * (0x2D30 above might have to be changed to 2F00 in the unlikely
- * event that Unicode eventually allocates the unused block as of
- * v8.0 2FE0..2FEF to code points that are cased. khw has verified
- * that the test suite will start having failures to alert you
- * should that happen) */
- if (uv1 < 0xA640) {
- goto cases_to_self;
- }
+ /* Most likely, the case change will contain just a single code point */
+ if (remaining_count) {
+ *remaining_count = 0;
+ }
- if (uv1 >= 0xAC00) {
- if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
- if (ckWARN_d(WARN_SURROGATE)) {
- const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
- Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
- "Operation \"%s\" returns its argument for"
- " UTF-16 surrogate U+%04" UVXf, desc, uv1);
- }
- goto cases_to_self;
- }
+ if (LIKELY(base == 0)) { /* 0 => original was unchanged by casing */
- /* AC00..FAFF Catches Hangul syllables and private use, plus
- * some others */
- if (uv1 < 0xFB00) {
- goto cases_to_self;
+ /* At this bottom level routine is where we warn about illegal code
+ * points */
+ if (isUNICODE_POSSIBLY_PROBLEMATIC(original)) {
+ if (UNLIKELY(UNICODE_IS_SURROGATE(original))) {
+ if (ckWARN_d(WARN_SURROGATE)) {
+ const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
+ Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
+ "Operation \"%s\" returns its argument for"
+ " UTF-16 surrogate U+%04" UVXf, desc, original);
}
-
- if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
- if (UNLIKELY(uv1 > MAX_LEGAL_CP)) {
- Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv1));
- }
- if (ckWARN_d(WARN_NON_UNICODE)) {
- const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
- Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
- "Operation \"%s\" returns its argument for"
- " non-Unicode code point 0x%04" UVXf, desc, uv1);
- }
- goto cases_to_self;
+ }
+ else if (UNLIKELY(UNICODE_IS_SUPER(original))) {
+ if (UNLIKELY(original > MAX_LEGAL_CP)) {
+ Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, original));
}
-#ifdef HIGHEST_CASE_CHANGING_CP
- if (UNLIKELY(uv1 > HIGHEST_CASE_CHANGING_CP)) {
-
- goto cases_to_self;
+ if (ckWARN_d(WARN_NON_UNICODE)) {
+ const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
+ Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
+ "Operation \"%s\" returns its argument for"
+ " non-Unicode code point 0x%04" UVXf, desc, original);
}
-#endif
}
+
+ /* Note that non-characters are perfectly legal, so no warning
+ * should be given. */
}
- /* Note that non-characters are perfectly legal, so no warning should
- * be given. */
+ return original;
}
- {
- unsigned int i;
- const U32 * cp_list;
- U8 * d;
-
- /* 'index' is guaranteed to be non-negative, as this is an inversion
- * map that covers all possible inputs. See [perl #133365] */
- SSize_t index = _invlist_search(invlist, uv1);
- I32 base = invmap[index];
-
- /* The data structures are set up so that if 'base' is non-negative,
- * the case change is 1-to-1; and if 0, the change is to itself */
- if (base >= 0) {
- IV lc;
-
- if (base == 0) {
- goto cases_to_self;
- }
+ if (LIKELY(base > 0)) { /* means original mapped to a single code point,
+ different from itself */
+ return base + original - invlist_array(invlist)[index];
+ }
- /* This computes, e.g. lc(H) as 'H - A + a', using the lc table */
- lc = base + uv1 - invlist_array(invlist)[index];
- *lenp = uvchr_to_utf8(ustrp, lc) - ustrp;
- return lc;
+ /* Here 'base' is negative. That means the mapping is 1-to-many, and
+ * requires an auxiliary table look up. abs(base) gives the index into a
+ * list of such tables which points to the proper aux table. And a
+ * parallel list gives the length of each corresponding aux table. Skip
+ * the first entry in the *remaining returns, as it is returned by the
+ * function. */
+ base = -base;
+ if (remaining_count) {
+ *remaining_count = (Size_t) (aux_table_lengths[base] - 1);
+
+ if (remaining_list) {
+ *remaining_list = aux_tables[base] + 1;
}
+ }
- /* Here 'base' is negative. That means the mapping is 1-to-many, and
- * requires an auxiliary table look up. abs(base) gives the index into
- * a list of such tables which points to the proper aux table. And a
- * parallel list gives the length of each corresponding aux table. */
- cp_list = aux_tables[-base];
+ return (UV) aux_tables[base][0];
+}
- /* Create the string of UTF-8 from the mapped-to code points */
- d = ustrp;
- for (i = 0; i < aux_table_lengths[-base]; i++) {
- d = uvchr_to_utf8(d, cp_list[i]);
- }
- *d = '\0';
- *lenp = d - ustrp;
+STATIC UV
+S__to_utf8_case(pTHX_ const UV original, const U8 *p,
+ U8* ustrp, STRLEN *lenp,
+ SV *invlist, const I32 * const invmap,
+ const U32 * const * const aux_tables,
+ const U8 * const aux_table_lengths,
+ const char * const normal)
+{
+ /* Change the case of code point 'original'. If 'p' is non-NULL, it points to
+ * the beginning of the (assumed to be valid) UTF-8 representation of
+ * 'original'. 'normal' is a string to use to name the new case in any
+ * generated messages, as a fallback if the operation being used is not
+ * available. The new case is given by the data structures in the
+ * remaining arguments.
+ *
+ * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
+ * entire changed case string, and the return value is the first code point
+ * in that string
+ *
+ * Note that the <ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes
+ * since the changed version may be longer than the original character. */
- return cp_list[0];
- }
+ const U32 * remaining_list;
+ Size_t remaining_count;
+ UV first = to_case_cp_list(original,
+ &remaining_list, &remaining_count,
+ invlist, invmap,
+ aux_tables, aux_table_lengths,
+ normal);
+
+ PERL_ARGS_ASSERT__TO_UTF8_CASE;
+
+ /* If the code point maps to itself and we already have its representation,
+ * copy it instead of recalculating */
+ if (original == first && p) {
+ *lenp = UTF8SKIP(p);
- /* Here, there was no mapping defined, which means that the code point maps
- * to itself. Return the inputs */
- cases_to_self:
- if (p) {
- len = UTF8SKIP(p);
if (p != ustrp) { /* Don't copy onto itself */
- Copy(p, ustrp, len, U8);
+ Copy(p, ustrp, *lenp, U8);
}
- *lenp = len;
}
else {
- *lenp = uvchr_to_utf8(ustrp, uv1) - ustrp;
- }
+ U8 * d = ustrp;
+ Size_t i;
- return uv1;
+ d = uvchr_to_utf8(d, first);
+
+ for (i = 0; i < remaining_count; i++) {
+ d = uvchr_to_utf8(d, remaining_list[i]);
+ }
+
+ *d = '\0';
+ *lenp = d - ustrp;
+ }
+ return first;
}
Size_t
*
* If you read the two macros as sequential, it's easier to understand what's
* going on. */
-#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \
+#define CASE_CHANGE_BODY_START(locale_flags, libc_change_function, L1_func, \
L1_func_extra_param, turkic) \
\
if (flags & (locale_flags)) { \
- _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
+ CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \
if (IN_UTF8_CTYPE_LOCALE) { \
- if (UNLIKELY(PL_in_utf8_turkic_locale)) { \
+ if (UNLIKELY(IN_UTF8_TURKIC_LOCALE)) { \
UV ret = turkic(p, e, ustrp, lenp); \
if (ret) return ret; \
} \
\
if (UTF8_IS_INVARIANT(*p)) { \
if (flags & (locale_flags)) { \
- result = LC_L1_change_macro(*p); \
+ result = libc_change_function(*p); \
} \
else { \
return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \
U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); \
if (flags & (locale_flags)) { \
- result = LC_L1_change_macro(c); \
+ result = libc_change_function(c); \
} \
else { \
return L1_func(c, ustrp, lenp, L1_func_extra_param); \
/* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
/* 2nd char of uc(U+DF) is 'S' */
- CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S',
+ CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 'S',
turkic_uc);
CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
}
PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
/* 2nd char of ucfirst(U+DF) is 's' */
- CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's',
+ CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 's',
turkic_uc);
CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
}
PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
- CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */,
+ CASE_CHANGE_BODY_START(~0, tolower, to_lower_latin1, 0 /* 0 is dummy */,
turkic_lc);
CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
}
assert(p != ustrp); /* Otherwise overwrites */
- CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
+ CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, tolower, _to_fold_latin1,
((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)),
turkic_fc);
if (flags & FOLDEQ_LOCALE) {
if (IN_UTF8_CTYPE_LOCALE) {
- if (UNLIKELY(PL_in_utf8_turkic_locale)) {
+ if (UNLIKELY(IN_UTF8_TURKIC_LOCALE)) {
flags_for_folder |= FOLD_FLAGS_LOCALE;
}
else {