X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/df6bd76f98839f1564209d4723f1b8599e14b2f9..a192978fac97e3535f3d8ae6857c8156871bb916:/utf8.c diff --git a/utf8.c b/utf8.c index 7809e7b..7a45ca9 100644 --- a/utf8.c +++ b/utf8.c @@ -37,11 +37,6 @@ static const char malformed_text[] = "Malformed UTF-8 character"; static const char unees[] = "Malformed UTF-8 character (unexpected end of string)"; -/* Be sure to synchronize this message with the similar one in regcomp.c */ -static const char cp_above_legal_max[] = - "Use of code point 0x%" UVXf " is not allowed; the" - " permissible max is 0x%" UVXf; - /* =head1 Unicode Support These are various utility functions for manipulating UTF8-encoded @@ -60,9 +55,7 @@ within non-zero characters. static void S_restore_cop_warnings(pTHX_ void *p) { - if (!specialWARN(PL_curcop->cop_warnings)) - PerlMemShared_free(PL_curcop->cop_warnings); - PL_curcop->cop_warnings = (STRLEN*)p; + free_and_set_cop_warnings(PL_curcop, (STRLEN*) p); } @@ -171,9 +164,6 @@ const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf " is not recommended for open interchange"; const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode," " may not be portable"; -const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \ - " Unicode, requires a Perl extension," \ - " and so is not portable"; #define HANDLE_UNICODE_SURROGATE(uv, flags, msgs) \ STMT_START { \ @@ -327,7 +317,7 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs) if (UNLIKELY( uv > MAX_LEGAL_CP && ! (flags & UNICODE_ALLOW_ABOVE_IV_MAX))) { - Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_LEGAL_CP); + Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv)); } if ( (flags & UNICODE_WARN_SUPER) || ( (flags & UNICODE_WARN_PERL_EXTENDED) @@ -339,7 +329,8 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs) /* Choose the more dire applicable warning */ if (UNICODE_IS_PERL_EXTENDED(uv)) { - format = perl_extended_cp_format; + format = PL_extended_cp_format; + category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE); if (flags & (UNICODE_WARN_PERL_EXTENDED |UNICODE_DISALLOW_PERL_EXTENDED)) { @@ -351,8 +342,11 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs) *msgs = new_msg_hv(Perl_form(aTHX_ format, uv), category, flag); } - else { - Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), format, uv); + else if ( ckWARN_d(WARN_NON_UNICODE) + || ( (flag & UNICODE_GOT_PERL_EXTENDED) + && ckWARN(WARN_PORTABLE))) + { + Perl_warner(aTHX_ category, format, uv); } } if ( (flags & UNICODE_DISALLOW_SUPER) @@ -1286,6 +1280,20 @@ All other code points corresponding to Unicode characters, including private use and those yet to be assigned, are never considered malformed and never warn. +=for apidoc Amnh||UTF8_CHECK_ONLY +=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE +=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE +=for apidoc Amnh||UTF8_DISALLOW_SURROGATE +=for apidoc Amnh||UTF8_DISALLOW_NONCHAR +=for apidoc Amnh||UTF8_DISALLOW_SUPER +=for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE +=for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE +=for apidoc Amnh||UTF8_WARN_SURROGATE +=for apidoc Amnh||UTF8_WARN_NONCHAR +=for apidoc Amnh||UTF8_WARN_SUPER +=for apidoc Amnh||UTF8_WARN_PERL_EXTENDED +=for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED + =cut Also implemented as a macro in utf8.h @@ -1359,7 +1367,7 @@ describes the situation in all cases. =item C -The input sequence was malformed in that the first byte was a a UTF-8 +The input sequence was malformed in that the first byte was a UTF-8 continuation byte. =item C @@ -1613,7 +1621,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * things. For example, an input could be deliberately designed to * overflow, and if this code bailed out immediately upon discovering that, * returning to the caller C<*retlen> pointing to the very next byte (one - * which is actually part of of the overflowing sequence), that could look + * which is actually part of the overflowing sequence), that could look * legitimate to the caller, which could discard the initial partial * sequence and process the rest, inappropriately. * @@ -1860,7 +1868,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, } while (possible_problems) { /* Handle each possible problem */ - UV pack_warn = 0; + U32 pack_warn = 0; char * message = NULL; U32 this_flag_bit = 0; @@ -1999,8 +2007,8 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * valid, avoid as much as possible reading past the * end of the buffer */ int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN) - ? s - s0 - : send - s0; + ? (int) (s - s0) + : (int) (send - s0); pack_warn = packWARN(WARN_UTF8); message = Perl_form(aTHX_ "%s", unexpected_non_continuation_text(s0, @@ -2073,9 +2081,10 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) { if ( ! (flags & UTF8_CHECK_ONLY) && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER)) - && (msgs || ckWARN_d(WARN_NON_UNICODE))) + && (msgs || ( ckWARN_d(WARN_NON_UNICODE) + || ckWARN(WARN_PORTABLE)))) { - pack_warn = packWARN(WARN_NON_UNICODE); + pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE); /* If it is an overlong that evaluates to a code point * that doesn't have to use the Perl extended UTF-8, it @@ -2088,7 +2097,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * */ if (UNICODE_IS_PERL_EXTENDED(uv)) { message = Perl_form(aTHX_ - perl_extended_cp_format, uv); + PL_extended_cp_format, uv); } else { message = Perl_form(aTHX_ @@ -2702,7 +2711,7 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) * Do not use in-place. We optimize for native, for obvious reasons. */ U8* -Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) +Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen) { U8* pend; U8* dstart = d; @@ -2736,16 +2745,16 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) /* This assumes that most uses will be in the first Unicode plane, not * needing surrogates */ - if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST - && uv <= UNICODE_SURROGATE_LAST)) + if (UNLIKELY(inRANGE(uv, UNICODE_SURROGATE_FIRST, + UNICODE_SURROGATE_LAST))) { if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) { Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); } else { UV low = (p[0] << 8) + p[1]; - if ( UNLIKELY(low < FIRST_LOW_SURROGATE) - || UNLIKELY(low > LAST_LOW_SURROGATE)) + if (UNLIKELY(! inRANGE(low, FIRST_LOW_SURROGATE, + LAST_LOW_SURROGATE))) { Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); } @@ -2779,7 +2788,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) /* Note: this one is slightly destructive of the source. */ U8* -Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) +Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen) { U8* s = (U8*)p; U8* const send = s + bytelen; @@ -3223,8 +3232,8 @@ Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e) STATIC UV S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, - SV *invlist, const int * const invmap, - const unsigned int * const * const aux_tables, + SV *invlist, const I32 * const invmap, + const U32 * const * const aux_tables, const U8 * const aux_table_lengths, const char * const normal) { @@ -3308,8 +3317,7 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, if (UNLIKELY(UNICODE_IS_SUPER(uv1))) { if (UNLIKELY(uv1 > MAX_LEGAL_CP)) { - Perl_croak(aTHX_ cp_above_legal_max, uv1, - MAX_LEGAL_CP); + Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv1)); } if (ckWARN_d(WARN_NON_UNICODE)) { const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; @@ -3336,13 +3344,13 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, { unsigned int i; - const unsigned int * cp_list; + const U32 * cp_list; U8 * d; /* 'index' is guaranteed to be non-negative, as this is an inversion * map that covers all possible inputs. See [perl #133365] */ SSize_t index = _invlist_search(invlist, uv1); - IV base = invmap[index]; + I32 base = invmap[index]; /* The data structures are set up so that if 'base' is non-negative, * the case change is 1-to-1; and if 0, the change is to itself */ @@ -3395,8 +3403,8 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, } Size_t -Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to, - const unsigned int ** remaining_folds_to) +Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to, + const U32 ** remaining_folds_to) { /* Returns the count of the number of code points that fold to the input * 'cp' (besides itself). @@ -3414,13 +3422,18 @@ Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to, * The reason for this convolution is to avoid having to deal with * allocating and freeing memory. The lists are already constructed, so * the return can point to them, but single code points aren't, so would - * need to be constructed if we didn't employ something like this API */ + * need to be constructed if we didn't employ something like this API + * + * The code points returned by this function are all legal Unicode, which + * occupy at most 21 bits, and so a U32 is sufficient, and the lists are + * constructed with this size (to save space and memory), and we return + * pointers, so they must be this size */ dVAR; /* 'index' is guaranteed to be non-negative, as this is an inversion map * that covers all possible inputs. See [perl #133365] */ SSize_t index = _invlist_search(PL_utf8_foldclosures, cp); - int base = _Perl_IVCF_invmap[index]; + I32 base = _Perl_IVCF_invmap[index]; PERL_ARGS_ASSERT__INVERSE_FOLDS; @@ -3444,16 +3457,16 @@ Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to, * to 'cp', and the parallel array containing the length of the list * array */ *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0]; - *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes - *first_folds_to - */ + *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; + /* +1 excludes first_folds_to */ return IVCF_AUX_TABLE_lengths[-base]; } #endif /* Only the single code point. This works like 'fc(G) = G - A + a' */ - *first_folds_to = base + cp - invlist_array(PL_utf8_foldclosures)[index]; + *first_folds_to = (U32) (base + cp + - invlist_array(PL_utf8_foldclosures)[index]); *remaining_folds_to = NULL; return 1; } @@ -3609,7 +3622,7 @@ S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e, * ustrp will contain *lenp bytes * * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER - * I WITH DOT ABOVE form a case pair, as do 'I' and and LATIN SMALL LETTER + * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER * DOTLESS I */ PERL_ARGS_ASSERT_TURKIC_UC; @@ -3726,7 +3739,7 @@ S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e, return result; /* Not currently externally documented, and subject to change: - * is set iff iff the rules from the current underlying locale are to + * is set iff the rules from the current underlying locale are to * be used. */ UV @@ -3773,7 +3786,7 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, } /* Not currently externally documented, and subject to change: - * is set iff iff the rules from the current underlying locale are to + * is set iff the rules from the current underlying locale are to * be used. */ @@ -4032,9 +4045,9 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) /* =for apidoc pv_uni_display -Build to the scalar C a displayable version of the string C, -length C, the displayable version being at most C bytes long -(if longer, the rest is truncated and C<"..."> will be appended). +Build to the scalar C a displayable version of the UTF-8 encoded string +C, length C, the displayable version being at most C bytes +long (if longer, the rest is truncated and C<"..."> will be appended). The C argument can have C set to display Cable characters as themselves, C @@ -4043,6 +4056,9 @@ to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">) C (and its alias C) have both C and C turned on. +Additionally, there is now C which allows C<\b> for a +backspace, but only when C also is set. + The pointer to the PV of the C is returned. See also L. @@ -4061,10 +4077,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, SvUTF8_off(dsv); for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { UV u; - /* This serves double duty as a flag and a character to print after - a \ when flags & UNI_DISPLAY_BACKSLASH is true. - */ - char ok = 0; + bool ok = 0; if (pvlim && SvCUR(dsv) >= pvlim) { truncated++; @@ -4074,27 +4087,19 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, if (u < 256) { const unsigned char c = (unsigned char)u & 0xFF; if (flags & UNI_DISPLAY_BACKSLASH) { - switch (c) { - case '\n': - ok = 'n'; break; - case '\r': - ok = 'r'; break; - case '\t': - ok = 't'; break; - case '\f': - ok = 'f'; break; - case '\a': - ok = 'a'; break; - case '\\': - ok = '\\'; break; - default: break; - } - if (ok) { - const char string = ok; - sv_catpvs(dsv, "\\"); - sv_catpvn(dsv, &string, 1); - } - } + if ( isMNEMONIC_CNTRL(c) + && ( c != '\b' + || (flags & UNI_DISPLAY_BACKSPACE))) + { + const char * mnemonic = cntrl_to_mnemonic(c); + sv_catpvn(dsv, mnemonic, strlen(mnemonic)); + ok = 1; + } + else if (c == '\\') { + sv_catpvs(dsv, "\\\\"); + ok = 1; + } + } /* isPRINT() is the locale-blind version. */ if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) { const char string = c; @@ -4430,106 +4435,6 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, return 1; } -/* XXX The next two functions should likely be moved to mathoms.c once all - * occurrences of them are removed from the core; some cpan-upstream modules - * still use them */ - -U8 * -Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) -{ - PERL_ARGS_ASSERT_UVUNI_TO_UTF8; - - return uvoffuni_to_utf8_flags(d, uv, 0); -} - -/* -=for apidoc utf8n_to_uvuni - -Instead use L, or rarely, L. - -This function was useful for code that wanted to handle both EBCDIC and -ASCII platforms with Unicode properties, but starting in Perl v5.20, the -distinctions between the platforms have mostly been made invisible to most -code, so this function is quite unlikely to be what you want. If you do need -this precise functionality, use instead -C> -or C>. - -=cut -*/ - -UV -Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) -{ - PERL_ARGS_ASSERT_UTF8N_TO_UVUNI; - - return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags)); -} - -/* -=for apidoc uvuni_to_utf8_flags - -Instead you almost certainly want to use L or -L. - -This function is a deprecated synonym for L, -which itself, while not deprecated, should be used only in isolated -circumstances. These functions were useful for code that wanted to handle -both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl -v5.20, the distinctions between the platforms have mostly been made invisible -to most code, so this function is quite unlikely to be what you want. - -=cut -*/ - -U8 * -Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) -{ - PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS; - - return uvoffuni_to_utf8_flags(d, uv, flags); -} - -/* -=for apidoc utf8_to_uvchr - -Returns the native code point of the first character in the string C -which is assumed to be in UTF-8 encoding; C will be set to the -length, in bytes, of that character. - -Some, but not all, UTF-8 malformations are detected, and in fact, some -malformed input could cause reading beyond the end of the input buffer, which -is why this function is deprecated. Use L instead. - -If C points to one of the detected malformations, and UTF8 warnings are -enabled, zero is returned and C<*retlen> is set (if C isn't -C) to -1. If those warnings are off, the computed value if well-defined (or -the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> -is set (if C isn't NULL) so that (S + C<*retlen>>) is the -next possible position in C that could begin a non-malformed character. -See L for details on when the REPLACEMENT CHARACTER is returned. - -=cut -*/ - -UV -Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) -{ - PERL_ARGS_ASSERT_UTF8_TO_UVCHR; - - /* This function is unsafe if malformed UTF-8 input is given it, which is - * why the function is deprecated. If the first byte of the input - * indicates that there are more bytes remaining in the sequence that forms - * the character than there are in the input buffer, it can read past the - * end. But we can make it safe if the input string happens to be - * NUL-terminated, as many strings in Perl are, by refusing to read past a - * NUL, which is what UTF8_CHK_SKIP() does. A NUL indicates the start of - * the next character anyway. If the input isn't NUL-terminated, the - * function remains unsafe, as it always has been. */ - - return utf8_to_uvchr_buf(s, s + UTF8_CHK_SKIP(s), retlen); -} - /* * ex: set ts=8 sts=4 sw=4 et: */