X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/8d6204cca6dfeab549e87d3b40fa1900a8ca8a0e..5af9f82224b62a56edca6981b8638328d12ba98a:/utf8.c diff --git a/utf8.c b/utf8.c index c05866a..8920982 100644 --- a/utf8.c +++ b/utf8.c @@ -32,14 +32,18 @@ #define PERL_IN_UTF8_C #include "perl.h" #include "invlist_inline.h" +#include "uni_keywords.h" static const char malformed_text[] = "Malformed UTF-8 character"; static const char unees[] = "Malformed UTF-8 character (unexpected end of string)"; + +/* Be sure to synchronize this message with the similar one in regcomp.c */ static const char cp_above_legal_max[] = - "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf ". This will be fatal in Perl 5.28"; + "Use of code point 0x%" UVXf " is not allowed; the" + " permissible max is 0x%" UVXf; -#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX)) +#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX)) /* =head1 Unicode Support @@ -100,6 +104,29 @@ Perl__force_out_malformed_utf8_message(pTHX_ } } +STATIC HV * +S_new_msg_hv(pTHX_ const char * const message, /* The message text */ + U32 categories, /* Packed warning categories */ + U32 flag) /* Flag associated with this message */ +{ + /* Creates, populates, and returns an HV* that describes an error message + * for the translators between UTF8 and code point */ + + SV* msg_sv = newSVpv(message, 0); + SV* category_sv = newSVuv(categories); + SV* flag_bit_sv = newSVuv(flag); + + HV* msg_hv = newHV(); + + PERL_ARGS_ASSERT_NEW_MSG_HV; + + (void) hv_stores(msg_hv, "text", msg_sv); + (void) hv_stores(msg_hv, "warn_categories", category_sv); + (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv); + + return msg_hv; +} + /* =for apidoc uvoffuni_to_utf8_flags @@ -116,6 +143,14 @@ For details, see the description for L. =cut */ +U8 * +Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) +{ + PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS; + + return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL); +} + /* All these formats take a single UV code point argument */ const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf; const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf @@ -126,22 +161,38 @@ const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \ " Unicode, requires a Perl extension," \ " and so is not portable"; -#define HANDLE_UNICODE_SURROGATE(uv, flags) \ +#define HANDLE_UNICODE_SURROGATE(uv, flags, msgs) \ STMT_START { \ if (flags & UNICODE_WARN_SURROGATE) { \ - Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \ - surrogate_cp_format, uv); \ + U32 category = packWARN(WARN_SURROGATE); \ + const char * format = surrogate_cp_format; \ + if (msgs) { \ + *msgs = new_msg_hv(Perl_form(aTHX_ format, uv), \ + category, \ + UNICODE_GOT_SURROGATE); \ + } \ + else { \ + Perl_ck_warner_d(aTHX_ category, format, uv); \ + } \ } \ if (flags & UNICODE_DISALLOW_SURROGATE) { \ return NULL; \ } \ } STMT_END; -#define HANDLE_UNICODE_NONCHAR(uv, flags) \ +#define HANDLE_UNICODE_NONCHAR(uv, flags, msgs) \ STMT_START { \ if (flags & UNICODE_WARN_NONCHAR) { \ - Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \ - nonchar_cp_format, uv); \ + U32 category = packWARN(WARN_NONCHAR); \ + const char * format = nonchar_cp_format; \ + if (msgs) { \ + *msgs = new_msg_hv(Perl_form(aTHX_ format, uv), \ + category, \ + UNICODE_GOT_NONCHAR); \ + } \ + else { \ + Perl_ck_warner_d(aTHX_ category, format, uv); \ + } \ } \ if (flags & UNICODE_DISALLOW_NONCHAR) { \ return NULL; \ @@ -154,10 +205,62 @@ const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \ #define MARK UTF_CONTINUATION_MARK #define MASK UTF_CONTINUATION_MASK +/* +=for apidoc uvchr_to_utf8_flags_msgs + +THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. + +Most code should use C()> rather than call this directly. + +This function is for code that wants any warning and/or error messages to be +returned to the caller rather than be displayed. All messages that would have +been displayed if all lexical warnings are enabled will be returned. + +It is just like C> but it takes an extra parameter +placed after all the others, C. If this parameter is 0, this function +behaves identically to C>. Otherwise, C should +be a pointer to an C variable, in which this function creates a new HV to +contain any appropriate messages. The hash has three key-value pairs, as +follows: + +=over 4 + +=item C + +The text of the message as a C. + +=item C + +The warning category (or categories) packed into a C. + +=item C + +A single flag bit associated with this message, in a C. +The bit corresponds to some bit in the C<*errors> return value, +such as C. + +=back + +It's important to note that specifying this parameter as non-null will cause +any warnings this function would otherwise generate to be suppressed, and +instead be placed in C<*msgs>. The caller can check the lexical warnings state +(or not) when choosing what to do with the returned messages. + +The caller, of course, is responsible for freeing any returned HV. + +=cut +*/ + +/* Undocumented; we don't want people using this. Instead they should use + * uvchr_to_utf8_flags_msgs() */ U8 * -Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) +Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs) { - PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS; + PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS; + + if (msgs) { + *msgs = NULL; + } if (OFFUNI_IS_INVARIANT(uv)) { *d++ = LATIN1_TO_NATIVE(uv); @@ -189,10 +292,10 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) { - HANDLE_UNICODE_NONCHAR(uv, flags); + HANDLE_UNICODE_NONCHAR(uv, flags, msgs); } else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) { - HANDLE_UNICODE_SURROGATE(uv, flags); + HANDLE_UNICODE_SURROGATE(uv, flags, msgs); } } #endif @@ -207,23 +310,34 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) * performance hit on these high EBCDIC code points. */ if (UNLIKELY(UNICODE_IS_SUPER(uv))) { - if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), - cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP); + if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) { + Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP); } - if ( (flags & UNICODE_WARN_SUPER) - || ( (flags & UNICODE_WARN_PERL_EXTENDED) + if ( (flags & UNICODE_WARN_SUPER) + || ( (flags & UNICODE_WARN_PERL_EXTENDED) && UNICODE_IS_PERL_EXTENDED(uv))) { - Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), + const char * format = super_cp_format; + U32 category = packWARN(WARN_NON_UNICODE); + U32 flag = UNICODE_GOT_SUPER; + + /* Choose the more dire applicable warning */ + if (UNICODE_IS_PERL_EXTENDED(uv)) { + format = perl_extended_cp_format; + if (flags & (UNICODE_WARN_PERL_EXTENDED + |UNICODE_DISALLOW_PERL_EXTENDED)) + { + flag = UNICODE_GOT_PERL_EXTENDED; + } + } - /* Choose the more dire applicable warning */ - (UNICODE_IS_PERL_EXTENDED(uv)) - ? perl_extended_cp_format - : super_cp_format, - uv); + if (msgs) { + *msgs = new_msg_hv(Perl_form(aTHX_ format, uv), + category, flag); + } + else { + Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), format, uv); + } } if ( (flags & UNICODE_DISALLOW_SUPER) || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED) @@ -233,7 +347,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) } } else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) { - HANDLE_UNICODE_NONCHAR(uv, flags); + HANDLE_UNICODE_NONCHAR(uv, flags, msgs); } /* Test for and handle 4-byte result. In the test immediately below, the @@ -252,10 +366,10 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) characters. The end-plane non-characters for EBCDIC were handled just above */ if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) { - HANDLE_UNICODE_NONCHAR(uv, flags); + HANDLE_UNICODE_NONCHAR(uv, flags, msgs); } else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) { - HANDLE_UNICODE_SURROGATE(uv, flags); + HANDLE_UNICODE_SURROGATE(uv, flags, msgs); } #endif @@ -273,8 +387,8 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) STRLEN len = OFFUNISKIP(uv); U8 *p = d+len-1; while (p > d) { - *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK); - uv >>= UTF_ACCUMULATION_SHIFT; + *p-- = I8_TO_NATIVE_UTF8((uv & MASK) | MARK); + uv >>= SHIFT; } *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len)); return d+len; @@ -295,9 +409,8 @@ is the recommended wide native character-aware way of saying *(d++) = uv; -This function accepts any UV as input, but very high code points (above -C on the platform) will raise a deprecation warning. This is -typically 0x7FFF_FFFF in a 32-bit word. +This function accepts any code point from 0..C as input. +C is typically 0x7FFF_FFFF in a 32-bit word. It is possible to forbid or warn on non-Unicode code points, or those that may be problematic by using L. @@ -332,9 +445,8 @@ This is the Unicode-aware way of saying *(d++) = uv; -If C is 0, this function accepts any UV as input, but very high code -points (above C for the platform) will raise a deprecation warning. -This is typically 0x7FFF_FFFF in a 32-bit word. +If C is 0, this function accepts any code point from 0..C as +input. C is typically 0x7FFF_FFFF in a 32-bit word. Specifying C can further restrict what is allowed and not warned on, as follows: @@ -379,11 +491,9 @@ there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1)) A somewhat misleadingly named synonym for C is retained for backward compatibility: C. Similarly, C is usable instead of the more accurately named -C. The names are misleading because these -flags can apply to code points that actually do fit in 31 bits. This happens -on EBCDIC platforms, and sometimes when the L> is also present. The new names accurately -describe the situation in all cases. +C. The names are misleading because on EBCDIC +platforms,these flags can apply to code points that actually do fit in 31 bits. +The new names accurately describe the situation in all cases. =cut */ @@ -399,8 +509,10 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) #ifndef UV_IS_QUAD -PERL_STATIC_INLINE bool -S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e) +STATIC int +S_is_utf8_cp_above_31_bits(const U8 * const s, + const U8 * const e, + const bool consider_overlongs) { /* Returns TRUE if the first code point represented by the Perl-extended- * UTF-8-encoded string starting at 's', and looking no further than 'e - @@ -412,166 +524,170 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e) * the final ones necessary for the complete representation may be beyond * 'e - 1'. * - * The function assumes that the sequence is well-formed UTF-8 as far as it - * goes, and is for a UTF-8 variant code point. If the sequence is - * incomplete, the function returns FALSE if there is any well-formed - * UTF-8 byte sequence that can complete it in such a way that a code point - * < 2**31 is produced; otherwise it returns TRUE. - * - * Getting this exactly right is slightly tricky, and has to be done in - * several places in this file, so is centralized here. It is based on the - * following table: + * The function also can handle the case where the input is an overlong + * sequence. If 'consider_overlongs' is 0, the function assumes the + * input is not overlong, without checking, and will return based on that + * assumption. If this parameter is 1, the function will go to the trouble + * of figuring out if it actually evaluates to above or below 31 bits. * - * U+7FFFFFFF (2 ** 31 - 1) - * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF - * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73 - * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72 - * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75 - * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF - * U+80000000 (2 ** 31): - * ASCII: \xFE\x82\x80\x80\x80\x80\x80 - * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13 - * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 - * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 - * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 - * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0 + * The sequence is otherwise assumed to be well-formed, without checking. */ -#ifdef EBCDIC - - /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */ - const U8 prefix[] = "\x41\x41\x41\x41\x41\x41\x42"; - const STRLEN prefix_len = sizeof(prefix) - 1; const STRLEN len = e - s; - const STRLEN cmp_len = MIN(prefix_len, len - 1); - -#else - - PERL_UNUSED_ARG(e); - -#endif + int is_overlong; PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS; - assert(! UTF8_IS_INVARIANT(*s)); - -#ifndef EBCDIC + assert(! UTF8_IS_INVARIANT(*s) && e > s); - /* Technically, a start byte of FE can be for a code point that fits into - * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong - * malformation. */ - return (*s >= 0xFE); +#ifdef EBCDIC -#else + PERL_UNUSED_ARG(consider_overlongs); /* On the EBCDIC code pages we handle, only the native start byte 0xFE can - * mean a 32-bit or larger code point (0xFF is an invariant). For 0xFE, we - * need at least 2 bytes, and maybe up through 8 bytes, to be sure that the - * value is above 31 bits. */ - if (*s != 0xFE || len == 1) { - return FALSE; + * mean a 32-bit or larger code point (0xFF is an invariant). 0xFE can + * also be the start byte for a 31-bit code point; we need at least 2 + * bytes, and maybe up through 8 bytes, to determine that. (It can also be + * the start byte for an overlong sequence, but for 30-bit or smaller code + * points, so we don't have to worry about overlongs on EBCDIC.) */ + if (*s != 0xFE) { + return 0; } - /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are - * \x41 and \x42. */ - return cBOOL(memGT(s + 1, prefix, cmp_len)); + if (len == 1) { + return -1; + } -#endif +#else -} + /* On ASCII, FE and FF are the only start bytes that can evaluate to + * needing more than 31 bits. */ + if (LIKELY(*s < 0xFE)) { + return 0; + } -#endif + /* What we have left are FE and FF. Both of these require more than 31 + * bits unless they are for overlongs. */ + if (! consider_overlongs) { + return 1; + } -/* Anything larger than this will overflow the word if it were converted into a UV */ -#if defined(UV_IS_QUAD) -# ifdef EBCDIC /* Actually is I8 */ -# define HIGHEST_REPRESENTABLE_UTF8 \ - "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" -# else -# define HIGHEST_REPRESENTABLE_UTF8 \ - "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" -# endif -#else /* 32-bit */ -# ifdef EBCDIC -# define HIGHEST_REPRESENTABLE_UTF8 \ - "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF" -# else -# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF" -# endif -#endif + /* Here, we have FE or FF. If the input isn't overlong, it evaluates to + * above 31 bits. But we need more than one byte to discern this, so if + * passed just the start byte, it could be an overlong evaluating to + * smaller */ + if (len == 1) { + return -1; + } -PERL_STATIC_INLINE bool -S_does_utf8_overflow(const U8 * const s, const U8 * e) -{ - const U8 *x; - const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8; + /* Having excluded len==1, and knowing that FE and FF are both valid start + * bytes, we can call the function below to see if the sequence is + * overlong. (We don't need the full generality of the called function, + * but for these huge code points, speed shouldn't be a consideration, and + * the compiler does have enough information, since it's static to this + * file, to optimize to just the needed parts.) */ + is_overlong = is_utf8_overlong_given_start_byte_ok(s, len); -#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC) + /* If it isn't overlong, more than 31 bits are required. */ + if (is_overlong == 0) { + return 1; + } - const STRLEN len = e - s; + /* If it is indeterminate if it is overlong, return that */ + if (is_overlong < 0) { + return -1; + } + + /* Here is overlong. Such a sequence starting with FE is below 31 bits, as + * the max it can be is 2**31 - 1 */ + if (*s == 0xFE) { + return 0; + } #endif - /* Returns a boolean as to if this UTF-8 string would overflow a UV on this - * platform, that is if it represents a code point larger than the highest - * representable code point. (For ASCII platforms, we could use memcmp() - * because we don't have to convert each byte to I8, but it's very rare - * input indeed that would approach overflow, so the loop below will likely - * only get executed once. - * - * 'e' must not be beyond a full character. If it is less than a full - * character, the function returns FALSE if there is any input beyond 'e' - * that could result in a non-overflowing code point */ + /* Here, ASCII and EBCDIC rejoin: + * On ASCII: We have an overlong sequence starting with FF + * On EBCDIC: We have a sequence starting with FE. */ - PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW; - assert(s <= e && s + UTF8SKIP(s) >= e); + { /* For C89, use a block so the declaration can be close to its use */ -#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC) +#ifdef EBCDIC - /* On 32 bit ASCII machines, many overlongs that start with FF don't - * overflow */ + /* U+7FFFFFFF (2 ** 31 - 1) + * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13 + * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73 + * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72 + * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75 + * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF + * U+80000000 (2 ** 31): + * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 + * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 + * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 + * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0 + * + * and since we know that *s = \xfe, any continuation sequcence + * following it that is gt the below is above 31 bits + [0] [1] [2] [3] [4] [5] [6] */ + const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42"; - if (isFF_OVERLONG(s, len) > 0) { - const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84"; - return memGE(s, max_32_bit_overlong, - MIN(len, sizeof(max_32_bit_overlong) - 1)); - } +#else -#endif + /* FF overlong for U+7FFFFFFF (2 ** 31 - 1) + * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF + * FF overlong for U+80000000 (2 ** 31): + * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80 + * and since we know that *s = \xff, any continuation sequcence + * following it that is gt the below is above 30 bits + [0] [1] [2] [3] [4] [5] [6] */ + const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81"; - for (x = s; x < e; x++, y++) { - if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) { - continue; +#endif + const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1; + const STRLEN cmp_len = MIN(conts_len, len - 1); + + /* Now compare the continuation bytes in s with the ones we have + * compiled in that are for the largest 30 bit code point. If we have + * enough bytes available to determine the answer, or the bytes we do + * have differ from them, we can compare the two to get a definitive + * answer (Note that in UTF-EBCDIC, the two lowest possible + * continuation bytes are \x41 and \x42.) */ + if (cmp_len >= conts_len || memNE(s + 1, + conts_for_highest_30_bit, + cmp_len)) + { + return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len)); } - /* If this byte is larger than the corresponding highest UTF-8 byte, - * the sequence overflow; otherwise the byte is less than, and so the - * sequence doesn't overflow */ - return NATIVE_UTF8_TO_I8(*x) > *y; - + /* Here, all the bytes we have are the same as the highest 30-bit code + * point, but we are missing so many bytes that we can't make the + * determination */ + return -1; } - - /* Got to the end and all bytes are the same. If the input is a whole - * character, it doesn't overflow. And if it is a partial character, - * there's not enough information to tell, so assume doesn't overflow */ - return FALSE; } -PERL_STATIC_INLINE bool +#endif + +PERL_STATIC_INLINE int S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len) { - /* Overlongs can occur whenever the number of continuation bytes - * changes. That means whenever the number of leading 1 bits in a start - * byte increases from the next lower start byte. That happens for start - * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following - * illegal start bytes have already been excluded, so don't need to be - * tested here; + /* Returns an int indicating whether or not the UTF-8 sequence from 's' to + * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if + * it isn't, and -1 if there isn't enough information to tell. This last + * return value can happen if the sequence is incomplete, missing some + * trailing bytes that would form a complete character. If there are + * enough bytes to make a definitive decision, this function does so. + * Usually 2 bytes sufficient. + * + * Overlongs can occur whenever the number of continuation bytes changes. + * That means whenever the number of leading 1 bits in a start byte + * increases from the next lower start byte. That happens for start bytes + * C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following illegal + * start bytes have already been excluded, so don't need to be tested here; * ASCII platforms: C0, C1 * EBCDIC platforms C0, C1, C2, C3, C4, E0 - * - * At least a second byte is required to determine if other sequences will - * be an overlong. */ + */ const U8 s0 = NATIVE_UTF8_TO_I8(s[0]); const U8 s1 = NATIVE_UTF8_TO_I8(s[1]); @@ -596,7 +712,7 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len) # else if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) { - return TRUE; + return 1; } # define F0_ABOVE_OVERLONG 0x90 @@ -612,11 +728,11 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len) || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG)) || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG))) { - return TRUE; + return 1; } /* Check for the FF overlong */ - return isFF_OVERLONG(s, len) > 0; + return isFF_OVERLONG(s, len); } PERL_STATIC_INLINE int @@ -650,6 +766,140 @@ S_isFF_OVERLONG(const U8 * const s, const STRLEN len) return -1; } +#if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */ +# ifdef EBCDIC /* Actually is I8 */ +# define HIGHEST_REPRESENTABLE_UTF8 \ + "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" +# else +# define HIGHEST_REPRESENTABLE_UTF8 \ + "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" +# endif +#endif + +PERL_STATIC_INLINE int +S_does_utf8_overflow(const U8 * const s, + const U8 * e, + const bool consider_overlongs) +{ + /* Returns an int indicating whether or not the UTF-8 sequence from 's' to + * 'e' - 1 would overflow an IV on this platform; that is if it represents + * a code point larger than the highest representable code point. It + * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't + * enough information to tell. This last return value can happen if the + * sequence is incomplete, missing some trailing bytes that would form a + * complete character. If there are enough bytes to make a definitive + * decision, this function does so. + * + * If 'consider_overlongs' is TRUE, the function checks for the possibility + * that the sequence is an overlong that doesn't overflow. Otherwise, it + * assumes the sequence is not an overlong. This can give different + * results only on ASCII 32-bit platforms. + * + * (For ASCII platforms, we could use memcmp() because we don't have to + * convert each byte to I8, but it's very rare input indeed that would + * approach overflow, so the loop below will likely only get executed once.) + * + * 'e' - 1 must not be beyond a full character. */ + + + PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW; + assert(s <= e && s + UTF8SKIP(s) >= e); + +#if ! defined(UV_IS_QUAD) + + return is_utf8_cp_above_31_bits(s, e, consider_overlongs); + +#else + + PERL_UNUSED_ARG(consider_overlongs); + + { + const STRLEN len = e - s; + const U8 *x; + const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8; + + for (x = s; x < e; x++, y++) { + + if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) { + continue; + } + + /* If this byte is larger than the corresponding highest UTF-8 + * byte, the sequence overflow; otherwise the byte is less than, + * and so the sequence doesn't overflow */ + return NATIVE_UTF8_TO_I8(*x) > *y; + + } + + /* Got to the end and all bytes are the same. If the input is a whole + * character, it doesn't overflow. And if it is a partial character, + * there's not enough information to tell */ + if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) { + return -1; + } + + return 0; + } + +#endif + +} + +#if 0 + +/* This is the portions of the above function that deal with UV_MAX instead of + * IV_MAX. They are left here in case we want to combine them so that internal + * uses can have larger code points. The only logic difference is that the + * 32-bit EBCDIC platform is treate like the 64-bit, and the 32-bit ASCII has + * different logic. + */ + +/* Anything larger than this will overflow the word if it were converted into a UV */ +#if defined(UV_IS_QUAD) +# ifdef EBCDIC /* Actually is I8 */ +# define HIGHEST_REPRESENTABLE_UTF8 \ + "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" +# else +# define HIGHEST_REPRESENTABLE_UTF8 \ + "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" +# endif +#else /* 32-bit */ +# ifdef EBCDIC +# define HIGHEST_REPRESENTABLE_UTF8 \ + "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF" +# else +# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF" +# endif +#endif + +#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC) + + /* On 32 bit ASCII machines, many overlongs that start with FF don't + * overflow */ + if (consider_overlongs && isFF_OVERLONG(s, len) > 0) { + + /* To be such an overlong, the first bytes of 's' must match + * FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80". If we + * don't have any additional bytes available, the sequence, when + * completed might or might not fit in 32 bits. But if we have that + * next byte, we can tell for sure. If it is <= 0x83, then it does + * fit. */ + if (len <= sizeof(FF_OVERLONG_PREFIX) - 1) { + return -1; + } + + return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83; + } + +/* Starting with the #else, the rest of the function is identical except + * 1. we need to move the 'len' declaration to be global to the function + * 2. the endif move to just after the UNUSED_ARG. + * An empty endif is given just below to satisfy the preprocessor + */ +#endif + +#endif + #undef F0_ABOVE_OVERLONG #undef F8_ABOVE_OVERLONG #undef FC_ABOVE_OVERLONG @@ -799,13 +1049,16 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) /* Here is syntactically valid. Next, make sure this isn't the start of an * overlong. */ - if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) { + if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) { return 0; } /* And finally, that the code point represented fits in a word on this * platform */ - if (does_utf8_overflow(s, e)) { + if (0 < does_utf8_overflow(s, e, + 0 /* Don't consider overlongs */ + )) + { return 0; } @@ -813,10 +1066,10 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) } char * -Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format) +Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format) { /* Returns a mortalized C string that is a displayable copy of the 'len' - * bytes starting at 's'. 'format' gives how to display each byte. + * bytes starting at 'start'. 'format' gives how to display each byte. * Currently, there are only two formats, so it is currently a bool: * 0 \xab * 1 ab (that is a space between two hex digit bytes) @@ -824,7 +1077,8 @@ Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format) const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a trailing NUL */ - const U8 * const e = s + len; + const U8 * s = start; + const U8 * const e = start + len; char * output; char * d; @@ -834,12 +1088,14 @@ Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format) SAVEFREEPV(output); d = output; - for (; s < e; s++) { + for (s = start; s < e; s++) { const unsigned high_nibble = (*s & 0xF0) >> 4; const unsigned low_nibble = (*s & 0x0F); if (format) { - *d++ = ' '; + if (s > start) { + *d++ = ' '; + } } else { *d++ = '\\'; @@ -868,7 +1124,7 @@ Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format) PERL_STATIC_INLINE char * S_unexpected_non_continuation_text(pTHX_ const U8 * const s, - /* How many bytes to print */ + /* Max number of bytes to print */ STRLEN print_len, /* Which one is the non-continuation */ @@ -884,6 +1140,8 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s, ? "immediately" : Perl_form(aTHX_ "%d bytes", (int) non_cont_byte_pos); + const U8 * x = s + non_cont_byte_pos; + const U8 * e = s + print_len; PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT; @@ -891,10 +1149,20 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s, * calculated, it's likely faster to pass it; verify under DEBUGGING */ assert(expect_len == UTF8SKIP(s)); + /* As a defensive coding measure, don't output anything past a NUL. Such + * bytes shouldn't be in the middle of a malformation, and could mark the + * end of the allocated string, and what comes after is undefined */ + for (; x < e; x++) { + if (*x == '\0') { + x++; /* Output this particular NUL */ + break; + } + } + return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x," " %s after start byte 0x%02x; need %d bytes, got %d)", malformed_text, - _byte_dump_string(s, print_len, 0), + _byte_dump_string(s, x - s, 0), *(s + non_cont_byte_pos), where, *s, @@ -996,10 +1264,6 @@ EBCDIC platforms, and sometimes when the L> is also present. The new names accurately describe the situation in all cases. -It is now deprecated to have very high code points (above C on the -platforms) and this function will raise a deprecation warning for these (unless -such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1) -in a 32-bit word. All other code points corresponding to Unicode characters, including private use and those yet to be assigned, are never considered malformed and never @@ -1029,7 +1293,8 @@ THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. Most code should use L() rather than call this directly. This function is for code that needs to know what the precise malformation(s) -are when an error is found. +are when an error is found. If you also need to know the generated warning +messages, use L() instead. It is like C> but it takes an extra parameter placed after all the others, C. If this parameter is 0, this function behaves @@ -1106,7 +1371,7 @@ in a position where only a continuation type one should be. =item C The input sequence was malformed in that it is for a code point that is not -representable in the number of bits available in a UV on the current platform. +representable in the number of bits available in an IV on the current platform. =item C @@ -1134,37 +1399,178 @@ To do your own error handling, call this function with the C flag to suppress any warnings, and then examine the C<*errors> return. =cut + +Also implemented as a macro in utf8.h */ UV Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, - STRLEN curlen, - STRLEN *retlen, - const U32 flags, - U32 * errors) + STRLEN curlen, + STRLEN *retlen, + const U32 flags, + U32 * errors) +{ + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR; + + return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL); +} + +/* + +=for apidoc utf8n_to_uvchr_msgs + +THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. +Most code should use L() rather than call this directly. + +This function is for code that needs to know what the precise malformation(s) +are when an error is found, and wants the corresponding warning and/or error +messages to be returned to the caller rather than be displayed. All messages +that would have been displayed if all lexcial warnings are enabled will be +returned. + +It is just like C> but it takes an extra parameter +placed after all the others, C. If this parameter is 0, this function +behaves identically to C>. Otherwise, C should +be a pointer to an C variable, in which this function creates a new AV to +contain any appropriate messages. The elements of the array are ordered so +that the first message that would have been displayed is in the 0th element, +and so on. Each element is a hash with three key-value pairs, as follows: + +=over 4 + +=item C + +The text of the message as a C. + +=item C + +The warning category (or categories) packed into a C. + +=item C + +A single flag bit associated with this message, in a C. +The bit corresponds to some bit in the C<*errors> return value, +such as C. + +=back + +It's important to note that specifying this parameter as non-null will cause +any warnings this function would otherwise generate to be suppressed, and +instead be placed in C<*msgs>. The caller can check the lexical warnings state +(or not) when choosing what to do with the returned messages. + +If the flag C is passed, no warnings are generated, and hence +no AV is created. + +The caller, of course, is responsible for freeing any returned AV. + +=cut +*/ + +UV +Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s, + STRLEN curlen, + STRLEN *retlen, + const U32 flags, + U32 * errors, + AV ** msgs) { const U8 * const s0 = s; - U8 * send = NULL; /* (initialized to silence compilers' wrong - warning) */ - U32 possible_problems = 0; /* A bit is set here for each potential problem - found as we go along */ - UV uv = *s; - STRLEN expectlen = 0; /* How long should this sequence be? - (initialized to silence compilers' wrong - warning) */ - STRLEN avail_len = 0; /* When input is too short, gives what that is */ - U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL; - this gets set and discarded */ + const U8 * send = s0 + curlen; + U32 possible_problems; /* A bit is set here for each potential problem + found as we go along */ + UV uv; + STRLEN expectlen; /* How long should this sequence be? */ + STRLEN avail_len; /* When input is too short, gives what that is */ + U32 discard_errors; /* Used to save branches when 'errors' is NULL; this + gets set and discarded */ /* The below are used only if there is both an overlong malformation and a * too short one. Otherwise the first two are set to 's0' and 'send', and * the third not used at all */ - U8 * adjusted_s0 = (U8 *) s0; + U8 * adjusted_s0; U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this routine; see [perl #130921] */ - UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */ + UV uv_so_far; + UV state = 0; - PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR; + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; + + /* Measurements show that this dfa is somewhat faster than the regular code + * below, so use it first, dropping down for the non-normal cases. */ + +#define PERL_UTF8_DECODE_REJECT 1 + + while (s < send && LIKELY(state != PERL_UTF8_DECODE_REJECT)) { + UV type = strict_utf8_dfa_tab[*s]; + + uv = (state == 0) + ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s)) + : UTF8_ACCUMULATE(uv, *s); + state = strict_utf8_dfa_tab[256 + state + type]; + + if (state == 0) { + if (retlen) { + *retlen = s - s0 + 1; + } + if (errors) { + *errors = 0; + } + if (msgs) { + *msgs = NULL; + } + + return uv; + } + + s++; + } + + /* Here, is one of: a) malformed; b) a problematic code point (surrogate, + * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul + * syllables that the dfa doesn't properly handle. Quickly dispose of the + * final case. */ + +#ifndef EBCDIC + + /* Each of the affected Hanguls starts with \xED */ + + if (is_HANGUL_ED_utf8_safe(s0, send)) { + if (retlen) { + *retlen = 3; + } + if (errors) { + *errors = 0; + } + if (msgs) { + *msgs = NULL; + } + + return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT)) + | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT) + | (s0[2] & UTF_CONTINUATION_MASK); + } + +#endif + + /* In conjunction with the exhaustive tests that can be enabled in + * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely + * what it is intended to do, and that no flaws in it are masked by + * dropping down and executing the code below + assert(! isUTF8_CHAR(s0, send) + || UTF8_IS_SURROGATE(s0, send) + || UTF8_IS_SUPER(s0, send) + || UTF8_IS_NONCHAR(s0,send)); + */ + + s = s0; + uv = *s0; + possible_problems = 0; + expectlen = 0; + avail_len = 0; + discard_errors = 0; + adjusted_s0 = (U8 *) s0; + uv_so_far = 0; if (errors) { *errors = 0; @@ -1217,11 +1623,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, *retlen = expectlen; } - /* An invariant is trivially well-formed */ - if (UTF8_IS_INVARIANT(uv)) { - return uv; - } - /* A continuation character can't start a valid sequence */ if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) { possible_problems |= UTF8_GOT_CONTINUATION; @@ -1242,14 +1643,12 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, /* Setup the loop end point, making sure to not look past the end of the * input string, and flag it as too short if the size isn't big enough. */ - send = (U8*) s0; if (UNLIKELY(curlen < expectlen)) { possible_problems |= UTF8_GOT_SHORT; avail_len = curlen; - send += curlen; } else { - send += expectlen; + send = (U8*) s0 + expectlen; } /* Now, loop through the remaining bytes in the character's sequence, @@ -1289,7 +1688,10 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, /* Check for overflow. The algorithm requires us to not look past the end * of the current character, even if partial, so the upper limit is 's' */ - if (UNLIKELY(does_utf8_overflow(s0, s))) { + if (UNLIKELY(0 < does_utf8_overflow(s0, s, + 1 /* Do consider overlongs */ + ))) + { possible_problems |= UTF8_GOT_OVERFLOW; uv = UNICODE_REPLACEMENT; } @@ -1303,7 +1705,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, || ( UNLIKELY(possible_problems) && ( UNLIKELY(! UTF8_IS_START(*s0)) || ( curlen > 1 - && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0, + && UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0, s - s0)))))) { possible_problems |= UTF8_GOT_LONG; @@ -1364,14 +1766,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, |UTF8_WARN_NONCHAR |UTF8_WARN_SURROGATE |UTF8_WARN_SUPER - |UTF8_WARN_PERL_EXTENDED)) - /* In case of a malformation, 'uv' is not valid, and has - * been changed to something in the Unicode range. - * Currently we don't output a deprecation message if there - * is already a malformation, so we don't have to special - * case the test immediately below */ - || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)))) + |UTF8_WARN_PERL_EXTENDED)))) { /* If there were no malformations, or the only malformation is an * overlong, 'uv' is valid */ @@ -1442,9 +1837,14 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, bool disallowed = FALSE; const U32 orig_problems = possible_problems; + if (msgs) { + *msgs = NULL; + } + while (possible_problems) { /* Handle each possible problem */ UV pack_warn = 0; char * message = NULL; + U32 this_flag_bit = 0; /* Each 'if' clause handles one problem. They are ordered so that * the first ones' messages will be displayed before the later @@ -1477,11 +1877,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, disallowed = TRUE; } - /* Likewise, warn if any say to, plus if deprecation warnings - * are on, because this code point is above IV_MAX */ - if ( ckWARN_d(WARN_DEPRECATED) - || ! (flags & UTF8_ALLOW_OVERFLOW) - || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED))) + /* Likewise, warn if any say to */ + if ( ! (flags & UTF8_ALLOW_OVERFLOW) + || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED))) { /* The warnings code explicitly says it doesn't handle the @@ -1491,16 +1889,17 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, * necessarily do so in the future. We output (only) the * most dire warning */ if (! (flags & UTF8_CHECK_ONLY)) { - if (ckWARN_d(WARN_UTF8)) { + if (msgs || ckWARN_d(WARN_UTF8)) { pack_warn = packWARN(WARN_UTF8); } - else if (ckWARN_d(WARN_NON_UNICODE)) { + else if (msgs || ckWARN_d(WARN_NON_UNICODE)) { pack_warn = packWARN(WARN_NON_UNICODE); } if (pack_warn) { message = Perl_form(aTHX_ "%s: %s (overflows)", malformed_text, _byte_dump_string(s0, curlen, 0)); + this_flag_bit = UTF8_GOT_OVERFLOW; } } } @@ -1517,10 +1916,13 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, assert(0); disallowed = TRUE; - if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + if ( (msgs + || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) + { pack_warn = packWARN(WARN_UTF8); message = Perl_form(aTHX_ "%s (empty string)", malformed_text); + this_flag_bit = UTF8_GOT_EMPTY; } } } @@ -1530,13 +1932,16 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, if (! (flags & UTF8_ALLOW_CONTINUATION)) { disallowed = TRUE; - if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + if (( msgs + || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) + { pack_warn = packWARN(WARN_UTF8); message = Perl_form(aTHX_ "%s: %s (unexpected continuation byte 0x%02x," " with no preceding start byte)", malformed_text, _byte_dump_string(s0, 1, 0), *s0); + this_flag_bit = UTF8_GOT_CONTINUATION; } } } @@ -1546,7 +1951,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, if (! (flags & UTF8_ALLOW_SHORT)) { disallowed = TRUE; - if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + if (( msgs + || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) + { pack_warn = packWARN(WARN_UTF8); message = Perl_form(aTHX_ "%s: %s (too short; %d byte%s available, need %d)", @@ -1555,6 +1962,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, (int)avail_len, avail_len == 1 ? "" : "s", (int)expectlen); + this_flag_bit = UTF8_GOT_SHORT; } } @@ -1565,7 +1973,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) { disallowed = TRUE; - if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + if (( msgs + || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) + { /* If we don't know for sure that the input length is * valid, avoid as much as possible reading past the @@ -1579,6 +1989,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, printlen, s - s0, (int) expectlen)); + this_flag_bit = UTF8_GOT_NON_CONTINUATION; } } } @@ -1589,7 +2000,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, *errors |= UTF8_GOT_SURROGATE; if ( ! (flags & UTF8_CHECK_ONLY) - && ckWARN_d(WARN_SURROGATE)) + && (msgs || ckWARN_d(WARN_SURROGATE))) { pack_warn = packWARN(WARN_SURROGATE); @@ -1604,6 +2015,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, else { message = Perl_form(aTHX_ surrogate_cp_format, uv); } + this_flag_bit = UTF8_GOT_SURROGATE; } } @@ -1619,7 +2031,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, *errors |= UTF8_GOT_SUPER; if ( ! (flags & UTF8_CHECK_ONLY) - && ckWARN_d(WARN_NON_UNICODE)) + && (msgs || ckWARN_d(WARN_NON_UNICODE))) { pack_warn = packWARN(WARN_NON_UNICODE); @@ -1633,6 +2045,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, else { message = Perl_form(aTHX_ super_cp_format, uv); } + this_flag_bit = UTF8_GOT_SUPER; } } @@ -1642,7 +2055,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) { if ( ! (flags & UTF8_CHECK_ONLY) && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER)) - && ckWARN_d(WARN_NON_UNICODE)) + && (msgs || ckWARN_d(WARN_NON_UNICODE))) { pack_warn = packWARN(WARN_NON_UNICODE); @@ -1666,6 +2079,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, " so is not portable", _byte_dump_string(s0, curlen, 0)); } + this_flag_bit = UTF8_GOT_PERL_EXTENDED; } if (flags & ( UTF8_WARN_PERL_EXTENDED @@ -1683,21 +2097,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, *errors |= UTF8_GOT_SUPER; disallowed = TRUE; } - - /* The deprecated warning overrides any non-deprecated one. If - * there are other problems, a deprecation message is not - * really helpful, so don't bother to raise it in that case. - * This also keeps the code from having to handle the case - * where 'uv' is not valid. */ - if ( ! (orig_problems - & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW)) - && UNLIKELY(uv > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - message = Perl_form(aTHX_ cp_above_legal_max, - uv, MAX_NON_DEPRECATED_CP); - pack_warn = packWARN(WARN_DEPRECATED); - } } else if (possible_problems & UTF8_GOT_NONCHAR) { possible_problems &= ~UTF8_GOT_NONCHAR; @@ -1706,7 +2105,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, *errors |= UTF8_GOT_NONCHAR; if ( ! (flags & UTF8_CHECK_ONLY) - && ckWARN_d(WARN_NONCHAR)) + && (msgs || ckWARN_d(WARN_NONCHAR))) { /* The code above should have guaranteed that we don't * get here with errors other than overlong */ @@ -1715,6 +2114,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, pack_warn = packWARN(WARN_NONCHAR); message = Perl_form(aTHX_ nonchar_cp_format, uv); + this_flag_bit = UTF8_GOT_NONCHAR; } } @@ -1740,7 +2140,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, else { disallowed = TRUE; - if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + if (( msgs + || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY)) + { pack_warn = packWARN(WARN_UTF8); /* These error types cause 'uv' to be something that @@ -1763,9 +2165,15 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, U8 tmpbuf[UTF8_MAXBYTES+1]; const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf, uv, 0); - const char * preface = (uv <= PERL_UNICODE_MAX) - ? "U+" - : "0x"; + /* Don't use U+ for non-Unicode code points, which + * includes those in the Latin1 range */ + const char * preface = ( uv > PERL_UNICODE_MAX +#ifdef EBCDIC + || uv <= 0xFF +#endif + ) + ? "0x" + : "U+"; message = Perl_form(aTHX_ "%s: %s (overlong; instead use %s to represent" " %s%0*" UVXf ")", @@ -1775,8 +2183,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, preface, ((uv < 256) ? 2 : 4), /* Field width of 2 for small code points */ - uv); + UNI_TO_NATIVE(uv)); } + this_flag_bit = UTF8_GOT_LONG; } } } /* End of looking through the possible flags */ @@ -1784,7 +2193,18 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, /* Display the message (if any) for the problem being handled in * this iteration of the loop */ if (message) { - if (PL_op) + if (msgs) { + assert(this_flag_bit); + + if (*msgs == NULL) { + *msgs = newAV(); + } + + av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message, + pack_warn, + this_flag_bit))); + } + else if (PL_op) Perl_warner(aTHX_ pack_warn, "%s in %s", message, OP_DESC(PL_op)); else @@ -1826,9 +2246,6 @@ the next possible position in C that could begin a non-malformed character. See L for details on when the REPLACEMENT CHARACTER is returned. -Code points above the platform's C will raise a deprecation warning, -unless those are turned off. - =cut Also implemented as a macro in utf8.h @@ -1853,7 +2270,9 @@ Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) Only in very rare circumstances should code need to be dealing in Unicode (as opposed to native) code points. In those few cases, use -C> instead. +C> instead. If you +are not absolutely sure this is one of those cases, then assume it isn't and +use plain C instead. Returns the Unicode (not-native) code point of the first character in the string C which @@ -1868,9 +2287,6 @@ is set (if C isn't NULL) so that (S + C<*retlen>>) is the next possible position in C that could begin a non-malformed character. See L for details on when the REPLACEMENT CHARACTER is returned. -Code points above the platform's C will raise a deprecation warning, -unless those are turned off. - =cut */ @@ -1881,16 +2297,18 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) assert(send > s); - /* Call the low level routine, asking for checks */ return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen)); } /* =for apidoc utf8_length -Return the length of the UTF-8 char encoded string C in characters. -Stops at C (inclusive). If C s> or if the scan would end -up past C, croaks. +Returns the number of characters in the sequence of UTF-8-encoded bytes starting +at C and ending at the byte just before C. If and point to the +same place, it returns 0 with no warning raised. + +If C s> or if the scan would end up past C, it raises a UTF8 warning +and returns the number of valid characters. =cut */ @@ -2075,7 +2493,9 @@ C<*lenp> are unchanged, and the return value is the original C. Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a newly created string containing a downgraded copy of C, and whose length is -returned in C<*lenp>, updated. The new string is C-terminated. +returned in C<*lenp>, updated. The new string is C-terminated. The +caller is responsible for arranging for the memory used by this string to get +freed. Upon successful return, the number of variants in the string can be computed by having saved the value of C<*lenp> before the call, and subtracting the @@ -2190,8 +2610,8 @@ Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** f } finish_and_return: - *d = '\0'; - *lenp = d - converted_start; + *d = '\0'; + *lenp = d - converted_start; /* Trim unused space */ Renew(converted_start, *lenp + 1, U8); @@ -2205,7 +2625,8 @@ Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** f Converts a string C of length C<*lenp> bytes from the native encoding into UTF-8. Returns a pointer to the newly-created string, and sets C<*lenp> to -reflect the new length in bytes. +reflect the new length in bytes. The caller is responsible for arranging for +the memory used by this string to get freed. Upon successful return, the number of variants in the string can be computed by having saved the value of C<*lenp> before the call, and subtracting it from the @@ -2237,16 +2658,30 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) append_utf8_from_native_byte(*s, &d); s++; } + *d = '\0'; *lenp = d-dst; + + /* Trim unused space */ + Renew(dst, *lenp + 1, U8); + return dst; } /* - * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8. + * Convert native (big-endian) UTF-16 to UTF-8. For reversed (little-endian), + * use utf16_to_utf8_reversed(). + * + * UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes. + * UTF-8 requires 1-3 bytes for every code point below 0x1000; otherwise 4 bytes. + * UTF-EBCDIC requires 1-4 bytes for every code point below 0x1000; otherwise 4-5 bytes. + * + * These functions don't check for overflow. The worst case is every code + * point in the input is 2 bytes, and requires 4 bytes on output. (If the code + * is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.) Therefore the + * destination must be pre-extended to 2 times the source length. * - * Destination must be pre-extended to 3/2 source. Do not use in-place. - * We optimize for native, for obvious reasons. */ + * Do not use in-place. We optimize for native, for obvious reasons. */ U8* Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) @@ -2274,10 +2709,12 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv)); continue; } + #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST #define LAST_HIGH_SURROGATE 0xDBFF #define FIRST_LOW_SURROGATE 0xDC00 #define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST +#define FIRST_IN_PLANE1 0x10000 /* This assumes that most uses will be in the first Unicode plane, not * needing surrogates */ @@ -2296,13 +2733,13 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) } p += 2; uv = ((uv - FIRST_HIGH_SURROGATE) << 10) - + (low - FIRST_LOW_SURROGATE) + 0x10000; + + (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1; } } #ifdef EBCDIC d = uvoffuni_to_utf8_flags(d, uv, 0); #else - if (uv < 0x10000) { + if (uv < FIRST_IN_PLANE1) { *d++ = (U8)(( uv >> 12) | 0xe0); *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); *d++ = (U8)(( uv & 0x3f) | 0x80); @@ -2347,9 +2784,7 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) bool Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) { - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf)); + return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c); } /* Internal function so we can deprecate the external one, and call @@ -2362,23 +2797,21 @@ Perl__is_utf8_idstart(pTHX_ const U8 *p) if (*p == '_') return TRUE; - return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL); + return is_utf8_common(p, NULL, + "This is buggy if this gets used", + PL_utf8_idstart); } bool Perl__is_uni_perl_idcont(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf)); + return _invlist_contains_cp(PL_utf8_perl_idcont, c); } bool Perl__is_uni_perl_idstart(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf)); + return _invlist_contains_cp(PL_utf8_perl_idstart, c); } UV @@ -2439,27 +2872,72 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, return converted; } +/* If compiled on an early Unicode version, there may not be auxiliary tables + * */ +#ifndef HAS_UC_AUX_TABLES +# define UC_AUX_TABLE_ptrs NULL +# define UC_AUX_TABLE_lengths NULL +#endif +#ifndef HAS_TC_AUX_TABLES +# define TC_AUX_TABLE_ptrs NULL +# define TC_AUX_TABLE_lengths NULL +#endif +#ifndef HAS_LC_AUX_TABLES +# define LC_AUX_TABLE_ptrs NULL +# define LC_AUX_TABLE_lengths NULL +#endif +#ifndef HAS_CF_AUX_TABLES +# define CF_AUX_TABLE_ptrs NULL +# define CF_AUX_TABLE_lengths NULL +#endif +#ifndef HAS_UC_AUX_TABLES +# define UC_AUX_TABLE_ptrs NULL +# define UC_AUX_TABLE_lengths NULL +#endif + /* Call the function to convert a UTF-8 encoded character to the specified case. * Note that there may be more than one character in the result. - * INP is a pointer to the first byte of the input character - * OUTP will be set to the first byte of the string of changed characters. It + * 's' is a pointer to the first byte of the input character + * 'd' will be set to the first byte of the string of changed characters. It * needs to have space for UTF8_MAXBYTES_CASE+1 bytes - * LENP will be set to the length in bytes of the string of changed characters + * 'lenp' will be set to the length in bytes of the string of changed characters * * The functions return the ordinal of the first character in the string of - * OUTP */ + * 'd' */ #define CALL_UPPER_CASE(uv, s, d, lenp) \ - _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "") + _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper, \ + Uppercase_Mapping_invmap, \ + UC_AUX_TABLE_ptrs, \ + UC_AUX_TABLE_lengths, \ + "uppercase") #define CALL_TITLE_CASE(uv, s, d, lenp) \ - _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "") + _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle, \ + Titlecase_Mapping_invmap, \ + TC_AUX_TABLE_ptrs, \ + TC_AUX_TABLE_lengths, \ + "titlecase") #define CALL_LOWER_CASE(uv, s, d, lenp) \ - _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "") + _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower, \ + Lowercase_Mapping_invmap, \ + LC_AUX_TABLE_ptrs, \ + LC_AUX_TABLE_lengths, \ + "lowercase") + /* This additionally has the input parameter 'specials', which if non-zero will * cause this to use the specials hash for folding (meaning get full case * folding); otherwise, when zero, this implies a simple case fold */ #define CALL_FOLD_CASE(uv, s, d, lenp, specials) \ -_to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL) + (specials) \ + ? _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold, \ + Case_Folding_invmap, \ + CF_AUX_TABLE_ptrs, \ + CF_AUX_TABLE_lengths, \ + "foldcase") \ + : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold, \ + Simple_Case_Folding_invmap, \ + NULL, NULL, \ + "foldcase") UV Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) @@ -2536,8 +3014,7 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) } UV -Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, - const unsigned int flags) +Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags) { /* Corresponds to to_lower_latin1(); bits meanings: * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited @@ -2549,7 +3026,6 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, UV converted; PERL_ARGS_ASSERT__TO_FOLD_LATIN1; - PERL_UNUSED_CONTEXT; assert (! (flags & FOLD_FLAGS_LOCALE)); @@ -2612,12 +3088,13 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS; if (flags & FOLD_FLAGS_LOCALE) { - /* Treat a UTF-8 locale as not being in locale at all */ + /* Treat a UTF-8 locale as not being in locale at all, except for + * potentially warning */ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (IN_UTF8_CTYPE_LOCALE) { flags &= ~FOLD_FLAGS_LOCALE; } else { - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; goto needs_full_generality; } } @@ -2674,6 +3151,12 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, NOT_REACHED; /* NOTREACHED */ } + if (invlist) { + return _invlist_contains_cp(invlist, valid_utf8_to_uvchr(p, NULL)); + } + + assert(swash); + if (!*swash) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; *swash = _core_swash_init("utf8", @@ -2709,6 +3192,12 @@ S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e, NOT_REACHED; /* NOTREACHED */ } + if (invlist) { + return _invlist_contains_cp(invlist, valid_utf8_to_uvchr(p, NULL)); + } + + assert(swash); + if (!*swash) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; *swash = _core_swash_init("utf8", @@ -2794,8 +3283,8 @@ Perl__is_utf8_FOO(pTHX_ U8 classnum, case _CC_CASED: return is_utf8_common(p, - &PL_utf8_swash_ptrs[classnum], - swash_property_names[classnum], + NULL, + "This is buggy if this gets used", PL_XPosix_ptrs[classnum]); case _CC_SPACE: @@ -2811,19 +3300,13 @@ Perl__is_utf8_FOO(pTHX_ U8 classnum, case _CC_VERTSPACE: return is_VERTWS_high(p); case _CC_IDFIRST: - if (! PL_utf8_perl_idstart) { - PL_utf8_perl_idstart - = _new_invlist_C_array(_Perl_IDStart_invlist); - } - return is_utf8_common(p, &PL_utf8_perl_idstart, - "_Perl_IDStart", NULL); + return is_utf8_common(p, NULL, + "This is buggy if this gets used", + PL_utf8_perl_idstart); case _CC_IDCONT: - if (! PL_utf8_perl_idcont) { - PL_utf8_perl_idcont - = _new_invlist_C_array(_Perl_IDCont_invlist); - } - return is_utf8_common(p, &PL_utf8_perl_idcont, - "_Perl_IDCont", NULL); + return is_utf8_common(p, NULL, + "This is buggy if this gets used", + PL_utf8_perl_idcont); } } @@ -2862,27 +3345,19 @@ Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p, { PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN; - assert(classnum < _FIRST_NON_SWASH_CC); - - return is_utf8_common_with_len(p, - e, - &PL_utf8_swash_ptrs[classnum], - swash_property_names[classnum], + return is_utf8_common_with_len(p, e, NULL, + "This is buggy if this gets used", PL_XPosix_ptrs[classnum]); } bool Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e) { - SV* invlist = NULL; - PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN; - if (! PL_utf8_perl_idstart) { - invlist = _new_invlist_C_array(_Perl_IDStart_invlist); - } - return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart, - "_Perl_IDStart", invlist); + return is_utf8_common_with_len(p, e, NULL, + "This is buggy if this gets used", + PL_utf8_perl_idstart); } bool @@ -2898,15 +3373,11 @@ Perl__is_utf8_xidstart(pTHX_ const U8 *p) bool Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e) { - SV* invlist = NULL; - PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN; - if (! PL_utf8_perl_idcont) { - invlist = _new_invlist_C_array(_Perl_IDCont_invlist); - } - return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont, - "_Perl_IDCont", invlist); + return is_utf8_common_with_len(p, e, NULL, + "This is buggy if this gets used", + PL_utf8_perl_idcont); } bool @@ -2922,7 +3393,7 @@ Perl__is_utf8_xidcont(pTHX_ const U8 *p) { PERL_ARGS_ASSERT__IS_UTF8_XIDCONT; - return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL); + return is_utf8_common(p, &PL_utf8_xidcont, "XIdContinue", NULL); } bool @@ -2933,13 +3404,26 @@ Perl__is_utf8_mark(pTHX_ const U8 *p) return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL); } - /* change namve uv1 to 'from' */ STATIC UV -S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, - SV **swashp, const char *normal, const char *special) +S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, + U8* ustrp, STRLEN *lenp, + SV *invlist, const int * const invmap, + const unsigned int * const * const aux_tables, + const U8 * const aux_table_lengths, + const char * const normal) { STRLEN len = 0; + /* Change the case of code point 'uv1' whose UTF-8 representation (assumed + * by this routine to be valid) begins at 'p'. 'normal' is a string to use + * to name the new case in any generated messages, as a fallback if the + * operation being used is not available. The new case is given by the + * data structures in the remaining arguments. + * + * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the + * entire changed case string, and the return value is the first code point + * in that string */ + PERL_ARGS_ASSERT__TO_UTF8_CASE; /* For code points that don't change case, we already know that the output @@ -3004,15 +3488,12 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, * some others */ if (uv1 < 0xFB00) { goto cases_to_self; - } if (UNLIKELY(UNICODE_IS_SUPER(uv1))) { - if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), - cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP); + if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) { + Perl_croak(aTHX_ cp_above_legal_max, uv1, + MAX_EXTERNALLY_LEGAL_CP); } if (ckWARN_d(WARN_NON_UNICODE)) { const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; @@ -3036,62 +3517,46 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, } /* Note that non-characters are perfectly legal, so no warning should - * be given. There are so few of them, that it isn't worth the extra - * tests to avoid swash creation */ + * be given. */ } - if (!*swashp) /* load on-demand */ - *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, - 4, 0, NULL, NULL); + { + unsigned int i; + const unsigned int * cp_list; + U8 * d; + SSize_t index = _invlist_search(invlist, uv1); + IV base = invmap[index]; - if (special) { - /* It might be "special" (sometimes, but not always, - * a multicharacter mapping) */ - HV *hv = NULL; - SV **svp; + /* The data structures are set up so that if 'base' is non-negative, + * the case change is 1-to-1; and if 0, the change is to itself */ + if (base >= 0) { + IV lc; - /* If passed in the specials name, use that; otherwise use any - * given in the swash */ - if (*special != '\0') { - hv = get_hv(special, 0); - } - else { - svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0); - if (svp) { - hv = MUTABLE_HV(SvRV(*svp)); + if (base == 0) { + goto cases_to_self; } - } - if (hv - && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE)) - && (*svp)) - { - const char *s; - - s = SvPV_const(*svp, len); - if (len == 1) - /* EIGHTBIT */ - len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp; - else { - Copy(s, ustrp, len, U8); - } - } - } - - if (!len && *swashp) { - const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */); + /* This computes, e.g. lc(H) as 'H - A + a', using the lc table */ + lc = base + uv1 - invlist_array(invlist)[index]; + *lenp = uvchr_to_utf8(ustrp, lc) - ustrp; + return lc; + } - if (uv2) { - /* It was "normal" (a single character mapping). */ - len = uvchr_to_utf8(ustrp, uv2) - ustrp; - } - } + /* Here 'base' is negative. That means the mapping is 1-to-many, and + * requires an auxiliary table look up. abs(base) gives the index into + * a list of such tables which points to the proper aux table. And a + * parallel list gives the length of each corresponding aux table. */ + cp_list = aux_tables[-base]; - if (len) { - if (lenp) { - *lenp = len; + /* Create the string of UTF-8 from the mapped-to code points */ + d = ustrp; + for (i = 0; i < aux_table_lengths[-base]; i++) { + d = uvchr_to_utf8(d, cp_list[i]); } - return valid_utf8_to_uvchr(ustrp, 0); + *d = '\0'; + *lenp = d - ustrp; + + return cp_list[0]; } /* Here, there was no mapping defined, which means that the code point maps @@ -3102,11 +3567,72 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, Copy(p, ustrp, len, U8); } - if (lenp) - *lenp = len; + if (lenp) + *lenp = len; + + return uv1; + +} + +Size_t +Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to, + const unsigned int ** remaining_folds_to) +{ + /* Returns the count of the number of code points that fold to the input + * 'cp' (besides itself). + * + * If the return is 0, there is nothing else that folds to it, and + * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL. + * + * If the return is 1, '*first_folds_to' is set to the single code point, + * and '*remaining_folds_to' is set to NULL. + * + * Otherwise, '*first_folds_to' is set to a code point, and + * '*remaining_fold_to' is set to an array that contains the others. The + * length of this array is the returned count minus 1. + * + * The reason for this convolution is to avoid having to deal with + * allocating and freeing memory. The lists are already constructed, so + * the return can point to them, but single code points aren't, so would + * need to be constructed if we didn't employ something like this API */ + + SSize_t index = _invlist_search(PL_utf8_foldclosures, cp); + int base = _Perl_IVCF_invmap[index]; + + PERL_ARGS_ASSERT__INVERSE_FOLDS; + + if (base == 0) { /* No fold */ + *first_folds_to = 0; + *remaining_folds_to = NULL; + return 0; + } + +#ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */ - return uv1; + assert(base > 0); + +#else + + if (UNLIKELY(base < 0)) { /* Folds to more than one character */ + /* The data structure is set up so that the absolute value of 'base' is + * an index into a table of pointers to arrays, with the array + * corresponding to the index being the list of code points that fold + * to 'cp', and the parallel array containing the length of the list + * array */ + *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0]; + *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes + *first_folds_to + */ + return IVCF_AUX_TABLE_lengths[-base]; + } + +#endif + + /* Only the single code point. This works like 'fc(G) = G - A + a' */ + *first_folds_to = base + cp - invlist_array(PL_utf8_foldclosures)[index]; + *remaining_folds_to = NULL; + return 1; } STATIC UV @@ -3133,7 +3659,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, assert(UTF8_IS_ABOVE_LATIN1(*p)); /* We know immediately if the first character in the string crosses the - * boundary, so can skip */ + * boundary, so can skip testing */ if (result > 255) { /* Look at every character in the result; if any cross the @@ -3278,13 +3804,11 @@ S_check_and_deprecate(pTHX_ const U8 *p, L1_func_extra_param) \ \ if (flags & (locale_flags)) { \ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ /* Treat a UTF-8 locale as not being in locale at all */ \ if (IN_UTF8_CTYPE_LOCALE) { \ flags &= ~(locale_flags); \ } \ - else { \ - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ - } \ } \ \ if (UTF8_IS_INVARIANT(*p)) { \ @@ -3296,13 +3820,12 @@ S_check_and_deprecate(pTHX_ const U8 *p, } \ } \ else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \ + U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); \ if (flags & (locale_flags)) { \ - result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \ - *(p+1))); \ + result = LC_L1_change_macro(c); \ } \ else { \ - return L1_func(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), \ - ustrp, lenp, L1_func_extra_param); \ + return L1_func(c, ustrp, lenp, L1_func_extra_param); \ } \ } \ else { /* malformed UTF-8 or ord above 255 */ \ @@ -3476,17 +3999,12 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, if (flags & FOLD_FLAGS_LOCALE) { # define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 - const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1; - # ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8 # define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8 - const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1; - /* Special case these two characters, as what normally gets * returned under locale doesn't work */ - if (UTF8SKIP(p) == cap_sharp_s_len - && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len)) + if (memEQs((char *) p, UTF8SKIP(p), CAP_SHARP_S)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), @@ -3496,8 +4014,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, } else #endif - if (UTF8SKIP(p) == long_s_t_len - && memEQ((char *) p, LONG_S_T, long_s_t_len)) + if (memEQs((char *) p, UTF8SKIP(p), LONG_S_T)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), @@ -3516,9 +4033,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, * 255/256 boundary which is forbidden under /l, and so the code * wouldn't catch that they are equivalent (which they are only in * this release) */ - else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1 - && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1)) - { + else if (memEQs((char *) p, UTF8SKIP(p), DOTTED_I)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; " @@ -3706,11 +4221,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, SV* retval = &PL_sv_undef; HV* swash_hv = NULL; - const int invlist_swash_boundary = - (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST) - ? 512 /* Based on some benchmarking, but not extensive, see commit - message */ - : -1; /* Never return just an inversion list */ + const bool use_invlist= (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST); assert(listsv != &PL_sv_undef || strNE(name, "") || invlist); assert(! invlist || minbits == 1); @@ -3877,7 +4388,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, /* Here, there is no swash already. Set up a minimal one, if * we are going to return a swash */ - if ((int) _invlist_len(invlist) > invlist_swash_boundary) { + if (! use_invlist) { swash_hv = newHV(); retval = newRV_noinc(MUTABLE_SV(swash_hv)); } @@ -3888,9 +4399,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, /* Here, we have computed the union of all the passed-in data. It may * be that there was an inversion list in the swash which didn't get * touched; otherwise save the computed one */ - if (! invlist_in_swash_is_valid - && (int) _invlist_len(swash_invlist) > invlist_swash_boundary) - { + if (! invlist_in_swash_is_valid && ! use_invlist) { if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist)) { Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); @@ -3903,8 +4412,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, /* The result is immutable. Forbid attempts to change it. */ SvREADONLY_on(swash_invlist); - /* Use the inversion list stand-alone if small enough */ - if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) { + if (use_invlist) { SvREFCNT_dec(retval); if (!swash_invlist_unclaimed) SvREFCNT_inc_simple_void_NN(swash_invlist); @@ -4540,320 +5048,6 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span) return swatch; } -HV* -Perl__swash_inversion_hash(pTHX_ SV* const swash) -{ - - /* Subject to change or removal. For use only in regcomp.c and regexec.c - * Can't be used on a property that is subject to user override, as it - * relies on the value of SPECIALS in the swash which would be set by - * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set - * for overridden properties - * - * Returns a hash which is the inversion and closure of a swash mapping. - * For example, consider the input lines: - * 004B 006B - * 004C 006C - * 212A 006B - * - * The returned hash would have two keys, the UTF-8 for 006B and the UTF-8 for - * 006C. The value for each key is an array. For 006C, the array would - * have two elements, the UTF-8 for itself, and for 004C. For 006B, there - * would be three elements in its array, the UTF-8 for 006B, 004B and 212A. - * - * Note that there are no elements in the hash for 004B, 004C, 212A. The - * keys are only code points that are folded-to, so it isn't a full closure. - * - * Essentially, for any code point, it gives all the code points that map to - * it, or the list of 'froms' for that point. - * - * Currently it ignores any additions or deletions from other swashes, - * looking at just the main body of the swash, and if there are SPECIALS - * in the swash, at that hash - * - * The specials hash can be extra code points, and most likely consists of - * maps from single code points to multiple ones (each expressed as a string - * of UTF-8 characters). This function currently returns only 1-1 mappings. - * However consider this possible input in the specials hash: - * "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074 - * "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074 - * - * Both FB05 and FB06 map to the same multi-char sequence, which we don't - * currently handle. But it also means that FB05 and FB06 are equivalent in - * a 1-1 mapping which we should handle, and this relationship may not be in - * the main table. Therefore this function examines all the multi-char - * sequences and adds the 1-1 mappings that come out of that. - * - * XXX This function was originally intended to be multipurpose, but its - * only use is quite likely to remain for constructing the inversion of - * the CaseFolding (//i) property. If it were more general purpose for - * regex patterns, it would have to do the FB05/FB06 game for simple folds, - * because certain folds are prohibited under /iaa and /il. As an example, - * in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both - * equivalent under /i. But under /iaa and /il, the folds to 'i' are - * prohibited, so we would not figure out that they fold to each other. - * Code could be written to automatically figure this out, similar to the - * code that does this for multi-character folds, but this is the only case - * where something like this is ever likely to happen, as all the single - * char folds to the 0-255 range are now quite settled. Instead there is a - * little special code that is compiled only for this Unicode version. This - * is smaller and didn't require much coding time to do. But this makes - * this routine strongly tied to being used just for CaseFolding. If ever - * it should be generalized, this would have to be fixed */ - - U8 *l, *lend; - STRLEN lcur; - HV *const hv = MUTABLE_HV(SvRV(swash)); - - /* The string containing the main body of the table. This will have its - * assertion fail if the swash has been converted to its inversion list */ - SV** const listsvp = hv_fetchs(hv, "LIST", FALSE); - - SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); - SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE); - SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE); - /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/ - const U8* const typestr = (U8*)SvPV_nolen(*typesvp); - const STRLEN bits = SvUV(*bitssvp); - const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */ - const UV none = SvUV(*nonesvp); - SV **specials_p = hv_fetchs(hv, "SPECIALS", 0); - - HV* ret = newHV(); - - PERL_ARGS_ASSERT__SWASH_INVERSION_HASH; - - /* Must have at least 8 bits to get the mappings */ - if (bits != 8 && bits != 16 && bits != 32) { - Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %" - UVuf, (UV)bits); - } - - if (specials_p) { /* It might be "special" (sometimes, but not always, a - mapping to more than one character */ - - /* Construct an inverse mapping hash for the specials */ - HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p)); - HV * specials_inverse = newHV(); - char *char_from; /* the lhs of the map */ - I32 from_len; /* its byte length */ - char *char_to; /* the rhs of the map */ - I32 to_len; /* its byte length */ - SV *sv_to; /* and in a sv */ - AV* from_list; /* list of things that map to each 'to' */ - - hv_iterinit(specials_hv); - - /* The keys are the characters (in UTF-8) that map to the corresponding - * UTF-8 string value. Iterate through the list creating the inverse - * list. */ - while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) { - SV** listp; - if (! SvPOK(sv_to)) { - Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() " - "unexpectedly is not a string, flags=%lu", - (unsigned long)SvFLAGS(sv_to)); - } - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %" UVXf ", First char of to is %" UVXf "\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/ - - /* Each key in the inverse list is a mapped-to value, and the key's - * hash value is a list of the strings (each in UTF-8) that map to - * it. Those strings are all one character long */ - if ((listp = hv_fetch(specials_inverse, - SvPVX(sv_to), - SvCUR(sv_to), 0))) - { - from_list = (AV*) *listp; - } - else { /* No entry yet for it: create one */ - from_list = newAV(); - if (! hv_store(specials_inverse, - SvPVX(sv_to), - SvCUR(sv_to), - (SV*) from_list, 0)) - { - Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); - } - } - - /* Here have the list associated with this 'to' (perhaps newly - * created and empty). Just add to it. Note that we ASSUME that - * the input is guaranteed to not have duplications, so we don't - * check for that. Duplications just slow down execution time. */ - av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE)); - } - - /* Here, 'specials_inverse' contains the inverse mapping. Go through - * it looking for cases like the FB05/FB06 examples above. There would - * be an entry in the hash like - * 'st' => [ FB05, FB06 ] - * In this example we will create two lists that get stored in the - * returned hash, 'ret': - * FB05 => [ FB05, FB06 ] - * FB06 => [ FB05, FB06 ] - * - * Note that there is nothing to do if the array only has one element. - * (In the normal 1-1 case handled below, we don't have to worry about - * two lists, as everything gets tied to the single list that is - * generated for the single character 'to'. But here, we are omitting - * that list, ('st' in the example), so must have multiple lists.) */ - while ((from_list = (AV *) hv_iternextsv(specials_inverse, - &char_to, &to_len))) - { - if (av_tindex_skip_len_mg(from_list) > 0) { - SSize_t i; - - /* We iterate over all combinations of i,j to place each code - * point on each list */ - for (i = 0; i <= av_tindex_skip_len_mg(from_list); i++) { - SSize_t j; - AV* i_list = newAV(); - SV** entryp = av_fetch(from_list, i, FALSE); - if (entryp == NULL) { - Perl_croak(aTHX_ "panic: av_fetch() unexpectedly" - " failed"); - } - if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) { - Perl_croak(aTHX_ "panic: unexpected entry for %s", - SvPVX(*entryp)); - } - if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp), - (SV*) i_list, FALSE)) - { - Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); - } - - /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/ - for (j = 0; j <= av_tindex_skip_len_mg(from_list); j++) { - entryp = av_fetch(from_list, j, FALSE); - if (entryp == NULL) { - Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); - } - - /* When i==j this adds itself to the list */ - av_push(i_list, newSVuv(utf8_to_uvchr_buf( - (U8*) SvPVX(*entryp), - (U8*) SvPVX(*entryp) + SvCUR(*entryp), - 0))); - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/ - } - } - } - } - SvREFCNT_dec(specials_inverse); /* done with it */ - } /* End of specials */ - - /* read $swash->{LIST} */ - -#if UNICODE_MAJOR_VERSION == 3 \ - && UNICODE_DOT_VERSION == 0 \ - && UNICODE_DOT_DOT_VERSION == 1 - - /* For this version only U+130 and U+131 are equivalent under qr//i. Add a - * rule so that things work under /iaa and /il */ - - SV * mod_listsv = sv_mortalcopy(*listsvp); - sv_catpv(mod_listsv, "130\t130\t131\n"); - l = (U8*)SvPV(mod_listsv, lcur); - -#else - - l = (U8*)SvPV(*listsvp, lcur); - -#endif - - lend = l + lcur; - - /* Go through each input line */ - while (l < lend) { - UV min, max, val; - UV inverse; - l = swash_scan_list_line(l, lend, &min, &max, &val, - cBOOL(octets), typestr); - if (l > lend) { - break; - } - - /* Each element in the range is to be inverted */ - for (inverse = min; inverse <= max; inverse++) { - AV* list; - SV** listp; - IV i; - bool found_key = FALSE; - bool found_inverse = FALSE; - - /* The key is the inverse mapping */ - char key[UTF8_MAXBYTES+1]; - char* key_end = (char *) uvchr_to_utf8((U8*) key, val); - STRLEN key_len = key_end - key; - - /* Get the list for the map */ - if ((listp = hv_fetch(ret, key, key_len, FALSE))) { - list = (AV*) *listp; - } - else { /* No entry yet for it: create one */ - list = newAV(); - if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) { - Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); - } - } - - /* Look through list to see if this inverse mapping already is - * listed, or if there is a mapping to itself already */ - for (i = 0; i <= av_tindex_skip_len_mg(list); i++) { - SV** entryp = av_fetch(list, i, FALSE); - SV* entry; - UV uv; - if (entryp == NULL) { - Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); - } - entry = *entryp; - uv = SvUV(entry); - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %" UVXf " contains %" UVXf "\n", val, uv));*/ - if (uv == val) { - found_key = TRUE; - } - if (uv == inverse) { - found_inverse = TRUE; - } - - /* No need to continue searching if found everything we are - * looking for */ - if (found_key && found_inverse) { - break; - } - } - - /* Make sure there is a mapping to itself on the list */ - if (! found_key) { - av_push(list, newSVuv(val)); - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, val, val));*/ - } - - - /* Simply add the value to the list */ - if (! found_inverse) { - av_push(list, newSVuv(inverse)); - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, inverse, val));*/ - } - - /* swatch_get() increments the value of val for each element in the - * range. That makes more compact tables possible. You can - * express the capitalization, for example, of all consecutive - * letters with a single line: 0061\t007A\t0041 This maps 0061 to - * 0041, 0062 to 0042, etc. I (khw) have never understood 'none', - * and it's not documented; it appears to be used only in - * implementing tr//; I copied the semantics from swatch_get(), just - * in case */ - if (!none || val < none) { - ++val; - } - } - } - - return ret; -} - SV* Perl__swash_to_invlist(pTHX_ SV* const swash) { @@ -4933,6 +5127,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) /* Get the 0th element, which is needed to setup the inversion list * */ while (isSPACE(*l)) l++; + after_atou = (char *) lend; if (!grok_atoUV((const char *)l, &element0, &after_atou)) { Perl_croak(aTHX_ "panic: Expecting a valid 0th element for" " inversion list"); @@ -4949,6 +5144,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) " elements than available", elements); } while (isSPACE(*l)) l++; + after_atou = (char *) lend; if (!grok_atoUV((const char *)l, other_elements_ptr++, &after_atou)) { @@ -4965,7 +5161,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) * size based on worst possible case, which is each line in the input * creates 2 elements in the inversion list: 1) the beginning of a * range in the list; 2) the beginning of a range not in the list. */ - while ((loc = (strchr(loc, '\n'))) != NULL) { + while ((loc = (char *) memchr(loc, '\n', lend - (U8 *) loc)) != NULL) { elements += 2; loc++; } @@ -5110,10 +5306,8 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) /* May change: warns if surrogates, non-character code points, or * non-Unicode code points are in 's' which has length 'len' bytes. * Returns TRUE if none found; FALSE otherwise. The only other validity - * check is to make sure that this won't exceed the string's length. - * - * Code points above the platform's C will raise a deprecation - * warning, unless those are turned off. */ + * check is to make sure that this won't exceed the string's length nor + * overflow */ const U8* const e = s + len; bool ok = TRUE; @@ -5129,22 +5323,10 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) { if (UNLIKELY(UTF8_IS_SUPER(s, e))) { if ( ckWARN_d(WARN_NON_UNICODE) - || ( ckWARN_d(WARN_DEPRECATED) -#ifndef UV_IS_QUAD - && UNLIKELY(is_utf8_cp_above_31_bits(s, e)) -#else /* Below is 64-bit words */ - /* 2**63 and up meet these conditions provided we have - * a 64-bit word. */ -# ifdef EBCDIC - && *s == 0xFE - && NATIVE_UTF8_TO_I8(s[1]) >= 0xA8 -# else - && *s == 0xFF - /* s[1] being above 0x80 overflows */ - && s[2] >= 0x88 -# endif -#endif - )) { + || UNLIKELY(0 < does_utf8_overflow(s, s + len, + 0 /* Don't consider overlongs */ + ))) + { /* A side effect of this function will be to warn */ (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER); ok = FALSE; @@ -5558,7 +5740,7 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) { PERL_ARGS_ASSERT_UVUNI_TO_UTF8; - return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0); + return uvoffuni_to_utf8_flags(d, uv, 0); } /* @@ -5609,6 +5791,588 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) return uvoffuni_to_utf8_flags(d, uv, flags); } +void +Perl_init_uniprops(pTHX) +{ + /* Set up the inversion list global variables */ + + PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASCII]); + PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXALNUM]); + PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXALPHA]); + PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXBLANK]); + PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_CASED]); + PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXCNTRL]); + PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXDIGIT]); + PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXGRAPH]); + PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXLOWER]); + PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXPRINT]); + PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXPUNCT]); + PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXSPACE]); + PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXUPPER]); + PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_VERTSPACE]); + PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXWORD]); + PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXXDIGIT]); + + PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASCII]); + PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXALNUM]); + PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXALPHA]); + PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXBLANK]); + PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_CASED]); + PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXCNTRL]); + PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXDIGIT]); + PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXGRAPH]); + PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXLOWER]); + PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXPRINT]); + PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXPUNCT]); + PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXSPACE]); + PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXUPPER]); + PL_Posix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_VERTSPACE]); + PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXWORD]); + PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXXDIGIT]); + + PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist); + PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist); + PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist); + PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist); + PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist); + + PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); + PL_Latin1 = _new_invlist_C_array(Latin1_invlist); + PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); + + PL_Assigned_invlist = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASSIGNED]); + + PL_utf8_perl_idstart = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_IDSTART]); + PL_utf8_perl_idcont = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_IDCONT]); + + PL_utf8_charname_begin = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_CHARNAME_BEGIN]); + PL_utf8_charname_continue = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_CHARNAME_CONTINUE]); + + PL_utf8_foldable = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_ANY_FOLDS]); + PL_HasMultiCharFold = _new_invlist_C_array(PL_uni_prop_ptrs[ + PL__PERL_FOLDS_TO_MULTI_CHAR]); + PL_NonL1NonFinalFold = _new_invlist_C_array( + NonL1_Perl_Non_Final_Folds_invlist); + + PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist); + PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist); + PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist); + PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist); + PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist); + PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist); +} + +SV * +Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t len, const bool to_fold, bool * invert) +{ + /* Parse the interior meat of \p{} passed to this in 'name' with length 'len', + * and return an inversion list if a property with 'name' is found, or NULL + * if not. 'name' point to the input with leading and trailing space trimmed. + * 'to_fold' indicates if /i is in effect. + * + * When the return is an inversion list, '*invert' will be set to a boolean + * indicating if it should be inverted or not + * + * This currently doesn't handle all cases. A NULL return indicates the + * caller should try a different approach + */ + + char* lookup_name; + bool stricter = FALSE; + bool is_nv_type = FALSE; /* nv= or numeric_value=, or possibly one + of the cjk numeric properties (though + it requires extra effort to compile + them) */ + unsigned int i; + unsigned int j = 0, lookup_len; + int equals_pos = -1; /* Where the '=' is found, or negative if none */ + int slash_pos = -1; /* Where the '/' is found, or negative if none */ + int table_index = 0; + bool starts_with_In_or_Is = FALSE; + Size_t lookup_offset = 0; + + PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING; + + /* The input will be modified into 'lookup_name' */ + Newx(lookup_name, len, char); + SAVEFREEPV(lookup_name); + + /* Parse the input. */ + for (i = 0; i < len; i++) { + char cur = name[i]; + + /* These characters can be freely ignored in most situations. Later it + * may turn out we shouldn't have ignored them, and we have to reparse, + * but we don't have enough information yet to make that decision */ + if (cur == '-' || cur == '_' || isSPACE(cur)) { + continue; + } + + /* Case differences are also ignored. Our lookup routine assumes + * everything is lowercase */ + if (isUPPER(cur)) { + lookup_name[j++] = toLOWER(cur); + continue; + } + + /* A double colon is either an error, or a package qualifier to a + * subroutine user-defined property; neither of which do we currently + * handle + * + * But a single colon is a synonym for '=' */ + if (cur == ':') { + if (i < len - 1 && name[i+1] == ':') { + return NULL; + } + cur = '='; + } + + /* Otherwise, this character is part of the name. */ + lookup_name[j++] = cur; + + /* Only the equals sign needs further processing */ + if (cur == '=') { + equals_pos = j; /* Note where it occurred in the input */ + break; + } + } + + /* Here, we are either done with the whole property name, if it was simple; + * or are positioned just after the '=' if it is compound. */ + + if (equals_pos >= 0) { + assert(! stricter); /* We shouldn't have set this yet */ + + /* Space immediately after the '=' is ignored */ + i++; + for (; i < len; i++) { + if (! isSPACE(name[i])) { + break; + } + } + + /* Certain properties need special handling. They may optionally be + * prefixed by 'is'. Ignore that prefix for the purposes of checking + * if this is one of those properties */ + if (memBEGINPs(lookup_name, len, "is")) { + lookup_offset = 2; + } + + /* Then check if it is one of these properties. This is hard-coded + * because easier this way, and the list is unlikely to change. There + * are several properties like this in the Unihan DB, which is unlikely + * to be compiled, and they all end with 'numeric'. The interiors + * aren't checked for the precise property. This would stop working if + * a cjk property were to be created that ended with 'numeric' and + * wasn't a numeric type */ + is_nv_type = memEQs(lookup_name + lookup_offset, + j - 1 - lookup_offset, "numericvalue") + || memEQs(lookup_name + lookup_offset, + j - 1 - lookup_offset, "nv") + || ( memENDPs(lookup_name + lookup_offset, + j - 1 - lookup_offset, "numeric") + && ( memBEGINPs(lookup_name + lookup_offset, + j - 1 - lookup_offset, "cjk") + || memBEGINPs(lookup_name + lookup_offset, + j - 1 - lookup_offset, "k"))); + if ( is_nv_type + || memEQs(lookup_name + lookup_offset, + j - 1 - lookup_offset, "canonicalcombiningclass") + || memEQs(lookup_name + lookup_offset, + j - 1 - lookup_offset, "ccc") + || memEQs(lookup_name + lookup_offset, + j - 1 - lookup_offset, "age") + || memEQs(lookup_name + lookup_offset, + j - 1 - lookup_offset, "in") + || memEQs(lookup_name + lookup_offset, + j - 1 - lookup_offset, "presentin")) + { + unsigned int k; + + /* What makes these properties special is that the stuff after the + * '=' is a number. Therefore, we can't throw away '-' + * willy-nilly, as those could be a minus sign. Other stricter + * rules also apply. However, these properties all can have the + * rhs not be a number, in which case they contain at least one + * alphabetic. In those cases, the stricter rules don't apply. + * But the numeric type properties can have the alphas [Ee] to + * signify an exponent, and it is still a number with stricter + * rules. So look for an alpha that signifys not-strict */ + stricter = TRUE; + for (k = i; k < len; k++) { + if ( isALPHA(name[k]) + && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E'))) + { + stricter = FALSE; + break; + } + } + } + + if (stricter) { + + /* A number may have a leading '+' or '-'. The latter is retained + * */ + if (name[i] == '+') { + i++; + } + else if (name[i] == '-') { + lookup_name[j++] = '-'; + i++; + } + + /* Skip leading zeros including single underscores separating the + * zeros, or between the final leading zero and the first other + * digit */ + for (; i < len - 1; i++) { + if ( name[i] != '0' + && (name[i] != '_' || ! isDIGIT(name[i+1]))) + { + break; + } + } + } + } + else { /* No '=' */ + + /* We are now in a position to determine if this property should have + * been parsed using stricter rules. Only a few are like that, and + * unlikely to change. */ + if ( memBEGINPs(lookup_name, j, "perl") + && memNEs(lookup_name + 4, j - 4, "space") + && memNEs(lookup_name + 4, j - 4, "word")) + { + stricter = TRUE; + + /* We set the inputs back to 0 and the code below will reparse, + * using strict */ + i = j = 0; + } + } + + /* Here, we have either finished the property, or are positioned to parse + * the remainder, and we know if stricter rules apply. Finish out, if not + * already done */ + for (; i < len; i++) { + char cur = name[i]; + + /* In all instances, case differences are ignored, and we normalize to + * lowercase */ + if (isUPPER(cur)) { + lookup_name[j++] = toLOWER(cur); + continue; + } + + /* An underscore is skipped, but not under strict rules unless it + * separates two digits */ + if (cur == '_') { + if ( stricter + && ( i == 0 || (int) i == equals_pos || i == len- 1 + || ! isDIGIT(name[i-1]) || ! isDIGIT(name[i+1]))) + { + lookup_name[j++] = '_'; + } + continue; + } + + /* Hyphens are skipped except under strict */ + if (cur == '-' && ! stricter) { + continue; + } + + /* XXX Bug in documentation. It says white space skipped adjacent to + * non-word char. Maybe we should, but shouldn't skip it next to a dot + * in a number */ + if (isSPACE(cur) && ! stricter) { + continue; + } + + lookup_name[j++] = cur; + + /* Unless this is a non-trailing slash, we are done with it */ + if (i >= len - 1 || cur != '/') { + continue; + } + + slash_pos = j; + + /* A slash in the 'numeric value' property indicates that what follows + * is a denominator. It can have a leading '+' and '0's that should be + * skipped. But we have never allowed a negative denominator, so treat + * a minus like every other character. (No need to rule out a second + * '/', as that won't match anything anyway */ + if (is_nv_type) { + i++; + if (i < len && name[i] == '+') { + i++; + } + + /* Skip leading zeros including underscores separating digits */ + for (; i < len - 1; i++) { + if ( name[i] != '0' + && (name[i] != '_' || ! isDIGIT(name[i+1]))) + { + break; + } + } + + /* Store the first real character in the denominator */ + lookup_name[j++] = name[i]; + } + } + + /* Here are completely done parsing the input 'name', and 'lookup_name' + * contains a copy, normalized. + * + * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and + * different from without the underscores. */ + if ( ( UNLIKELY(memEQs(lookup_name, j, "l")) + || UNLIKELY(memEQs(lookup_name, j, "gc=l"))) + && UNLIKELY(name[len-1] == '_')) + { + lookup_name[j++] = '&'; + } + else if (len > 2 && name[0] == 'I' && ( name[1] == 'n' || name[1] == 's')) + { + + /* Also, if the original input began with 'In' or 'Is', it could be a + * subroutine call instead of a property names, which currently isn't + * handled by this function. Subroutine calls can't happen if there is + * an '=' in the name */ + if (equals_pos < 0 && get_cvn_flags(name, len, GV_NOTQUAL) != NULL) { + return NULL; + } + + starts_with_In_or_Is = TRUE; + } + + lookup_len = j; /* Use a more mnemonic name starting here */ + + /* Get the index into our pointer table of the inversion list corresponding + * to the property */ + table_index = match_uniprop((U8 *) lookup_name, lookup_len); + + /* If it didn't find the property */ + if (table_index == 0) { + + /* If didn't find the property, we try again stripping off any initial + * 'In' or 'Is' */ + if (starts_with_In_or_Is) { + lookup_name += 2; + lookup_len -= 2; + equals_pos -= 2; + slash_pos -= 2; + + table_index = match_uniprop((U8 *) lookup_name, lookup_len); + } + + if (table_index == 0) { + char * canonical; + + /* If not found, and not a numeric type property, isn't a legal + * property */ + if (! is_nv_type) { + return NULL; + } + + /* But the numeric type properties need more work to decide. What + * we do is make sure we have the number in canonical form and look + * that up. */ + + if (slash_pos < 0) { /* No slash */ + + /* When it isn't a rational, take the input, convert it to a + * NV, then create a canonical string representation of that + * NV. */ + + NV value; + + /* Get the value */ + if (my_atof3(lookup_name + equals_pos, &value, + lookup_len - equals_pos) + != lookup_name + lookup_len) + { + return NULL; + } + + /* If the value is an integer, the canonical value is integral */ + if (Perl_ceil(value) == value) { + canonical = Perl_form(aTHX_ "%.*s%.0" NVff, + equals_pos, lookup_name, value); + } + else { /* Otherwise, it is %e with a known precision */ + canonical = Perl_form(aTHX_ "%.*s%.*" NVef, + equals_pos, lookup_name, + PL_E_FORMAT_PRECISION, value); + } + } + else { /* Has a slash. Create a rational in canonical form */ + UV numerator, denominator, gcd, trial; + const char * end_ptr; + const char * sign = ""; + + /* We can't just find the numerator, denominator, and do the + * division, then use the method above, because that is + * inexact. And the input could be a rational that is within + * epsilon (given our precision) of a valid rational, and would + * then incorrectly compare valid. + * + * We're only interested in the part after the '=' */ + const char * this_lookup_name = lookup_name + equals_pos; + lookup_len -= equals_pos; + slash_pos -= equals_pos; + + /* Handle any leading minus */ + if (this_lookup_name[0] == '-') { + sign = "-"; + this_lookup_name++; + lookup_len--; + slash_pos--; + } + + /* Convert the numerator to numeric */ + end_ptr = this_lookup_name + slash_pos; + if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) { + return NULL; + } + + /* It better have included all characters before the slash */ + if (*end_ptr != '/') { + return NULL; + } + + /* Set to look at just the denominator */ + this_lookup_name += slash_pos; + lookup_len -= slash_pos; + end_ptr = this_lookup_name + lookup_len; + + /* Convert the denominator to numeric */ + if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) { + return NULL; + } + + /* It better be the rest of the characters, and don't divide by + * 0 */ + if ( end_ptr != this_lookup_name + lookup_len + || denominator == 0) + { + return NULL; + } + + /* Get the greatest common denominator using + http://en.wikipedia.org/wiki/Euclidean_algorithm */ + gcd = numerator; + trial = denominator; + while (trial != 0) { + UV temp = trial; + trial = gcd % trial; + gcd = temp; + } + + /* If already in lowest possible terms, we have already tried + * looking this up */ + if (gcd == 1) { + return NULL; + } + + /* Reduce the rational, which should put it in canonical form. + * Then look it up */ + numerator /= gcd; + denominator /= gcd; + + canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf, + equals_pos, lookup_name, sign, numerator, denominator); + } + + /* Here, we have the number in canonical form. Try that */ + table_index = match_uniprop((U8 *) canonical, strlen(canonical)); + if (table_index == 0) { + return NULL; + } + } + } + + /* The return is an index into a table of ptrs. A negative return + * signifies that the real index is the absolute value, but the result + * needs to be inverted */ + if (table_index < 0) { + *invert = TRUE; + table_index = -table_index; + } + else { + *invert = FALSE; + } + + /* Out-of band indices indicate a deprecated property. The proper index is + * modulo it with the table size. And dividing by the table size yields + * an offset into a table constructed to contain the corresponding warning + * message */ + if (table_index > MAX_UNI_KEYWORD_INDEX) { + Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX; + table_index %= MAX_UNI_KEYWORD_INDEX; + Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), + "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s", + (int) len, name, deprecated_property_msgs[warning_offset]); + } + + /* In a few properties, a different property is used under /i. These are + * unlikely to change, so are hard-coded here. */ + if (to_fold) { + if ( table_index == PL_XPOSIXUPPER + || table_index == PL_XPOSIXLOWER + || table_index == PL_TITLE) + { + table_index = PL_CASED; + } + else if ( table_index == PL_UPPERCASELETTER + || table_index == PL_LOWERCASELETTER +#ifdef PL_TITLECASELETTER /* Missing from early Unicodes */ + || table_index == PL_TITLECASELETTER +#endif + ) { + table_index = PL_CASEDLETTER; + } + else if ( table_index == PL_POSIXUPPER + || table_index == PL_POSIXLOWER) + { + table_index = PL_POSIXALPHA; + } + } + + /* Create and return the inversion list */ + return _new_invlist_C_array(PL_uni_prop_ptrs[table_index]); +} + +/* +=for apidoc utf8_to_uvchr + +Returns the native code point of the first character in the string C +which is assumed to be in UTF-8 encoding; C will be set to the +length, in bytes, of that character. + +Some, but not all, UTF-8 malformations are detected, and in fact, some +malformed input could cause reading beyond the end of the input buffer, which +is why this function is deprecated. Use L instead. + +If C points to one of the detected malformations, and UTF8 warnings are +enabled, zero is returned and C<*retlen> is set (if C isn't +C) to -1. If those warnings are off, the computed value if well-defined (or +the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> +is set (if C isn't NULL) so that (S + C<*retlen>>) is the +next possible position in C that could begin a non-malformed character. +See L for details on when the REPLACEMENT CHARACTER is returned. + +=cut +*/ + +UV +Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) +{ + PERL_ARGS_ASSERT_UTF8_TO_UVCHR; + + return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen); +} + /* * ex: set ts=8 sts=4 sw=4 et: */