X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/1b292063004c9bbdfe307aadd8beb72df25e7c2f..a192978fac97e3535f3d8ae6857c8156871bb916:/utf8.c diff --git a/utf8.c b/utf8.c index 7db7dd3..7a45ca9 100644 --- a/utf8.c +++ b/utf8.c @@ -37,11 +37,6 @@ static const char malformed_text[] = "Malformed UTF-8 character"; static const char unees[] = "Malformed UTF-8 character (unexpected end of string)"; -/* Be sure to synchronize this message with the similar one in regcomp.c */ -static const char cp_above_legal_max[] = - "Use of code point 0x%" UVXf " is not allowed; the" - " permissible max is 0x%" UVXf; - /* =head1 Unicode Support These are various utility functions for manipulating UTF8-encoded @@ -60,9 +55,7 @@ within non-zero characters. static void S_restore_cop_warnings(pTHX_ void *p) { - if (!specialWARN(PL_curcop->cop_warnings)) - PerlMemShared_free(PL_curcop->cop_warnings); - PL_curcop->cop_warnings = (STRLEN*)p; + free_and_set_cop_warnings(PL_curcop, (STRLEN*) p); } @@ -171,9 +164,6 @@ const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf " is not recommended for open interchange"; const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode," " may not be portable"; -const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \ - " Unicode, requires a Perl extension," \ - " and so is not portable"; #define HANDLE_UNICODE_SURROGATE(uv, flags, msgs) \ STMT_START { \ @@ -327,7 +317,7 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs) if (UNLIKELY( uv > MAX_LEGAL_CP && ! (flags & UNICODE_ALLOW_ABOVE_IV_MAX))) { - Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_LEGAL_CP); + Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv)); } if ( (flags & UNICODE_WARN_SUPER) || ( (flags & UNICODE_WARN_PERL_EXTENDED) @@ -339,7 +329,8 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs) /* Choose the more dire applicable warning */ if (UNICODE_IS_PERL_EXTENDED(uv)) { - format = perl_extended_cp_format; + format = PL_extended_cp_format; + category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE); if (flags & (UNICODE_WARN_PERL_EXTENDED |UNICODE_DISALLOW_PERL_EXTENDED)) { @@ -351,8 +342,11 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs) *msgs = new_msg_hv(Perl_form(aTHX_ format, uv), category, flag); } - else { - Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), format, uv); + else if ( ckWARN_d(WARN_NON_UNICODE) + || ( (flag & UNICODE_GOT_PERL_EXTENDED) + && ckWARN(WARN_PORTABLE))) + { + Perl_warner(aTHX_ category, format, uv); } } if ( (flags & UNICODE_DISALLOW_SUPER) @@ -1373,7 +1367,7 @@ describes the situation in all cases. =item C -The input sequence was malformed in that the first byte was a a UTF-8 +The input sequence was malformed in that the first byte was a UTF-8 continuation byte. =item C @@ -1627,7 +1621,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * things. For example, an input could be deliberately designed to * overflow, and if this code bailed out immediately upon discovering that, * returning to the caller C<*retlen> pointing to the very next byte (one - * which is actually part of of the overflowing sequence), that could look + * which is actually part of the overflowing sequence), that could look * legitimate to the caller, which could discard the initial partial * sequence and process the rest, inappropriately. * @@ -1874,7 +1868,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, } while (possible_problems) { /* Handle each possible problem */ - UV pack_warn = 0; + U32 pack_warn = 0; char * message = NULL; U32 this_flag_bit = 0; @@ -2013,8 +2007,8 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * valid, avoid as much as possible reading past the * end of the buffer */ int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN) - ? s - s0 - : send - s0; + ? (int) (s - s0) + : (int) (send - s0); pack_warn = packWARN(WARN_UTF8); message = Perl_form(aTHX_ "%s", unexpected_non_continuation_text(s0, @@ -2087,9 +2081,10 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) { if ( ! (flags & UTF8_CHECK_ONLY) && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER)) - && (msgs || ckWARN_d(WARN_NON_UNICODE))) + && (msgs || ( ckWARN_d(WARN_NON_UNICODE) + || ckWARN(WARN_PORTABLE)))) { - pack_warn = packWARN(WARN_NON_UNICODE); + pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE); /* If it is an overlong that evaluates to a code point * that doesn't have to use the Perl extended UTF-8, it @@ -2102,7 +2097,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * */ if (UNICODE_IS_PERL_EXTENDED(uv)) { message = Perl_form(aTHX_ - perl_extended_cp_format, uv); + PL_extended_cp_format, uv); } else { message = Perl_form(aTHX_ @@ -2716,7 +2711,7 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) * Do not use in-place. We optimize for native, for obvious reasons. */ U8* -Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) +Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen) { U8* pend; U8* dstart = d; @@ -2793,7 +2788,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) /* Note: this one is slightly destructive of the source. */ U8* -Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) +Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen) { U8* s = (U8*)p; U8* const send = s + bytelen; @@ -3322,8 +3317,7 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, if (UNLIKELY(UNICODE_IS_SUPER(uv1))) { if (UNLIKELY(uv1 > MAX_LEGAL_CP)) { - Perl_croak(aTHX_ cp_above_legal_max, uv1, - MAX_LEGAL_CP); + Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv1)); } if (ckWARN_d(WARN_NON_UNICODE)) { const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; @@ -3350,13 +3344,13 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, { unsigned int i; - const unsigned int * cp_list; + const U32 * cp_list; U8 * d; /* 'index' is guaranteed to be non-negative, as this is an inversion * map that covers all possible inputs. See [perl #133365] */ SSize_t index = _invlist_search(invlist, uv1); - IV base = invmap[index]; + I32 base = invmap[index]; /* The data structures are set up so that if 'base' is non-negative, * the case change is 1-to-1; and if 0, the change is to itself */ @@ -3463,9 +3457,8 @@ Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to, * to 'cp', and the parallel array containing the length of the list * array */ *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0]; - *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes - *first_folds_to - */ + *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; + /* +1 excludes first_folds_to */ return IVCF_AUX_TABLE_lengths[-base]; } @@ -3629,7 +3622,7 @@ S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e, * ustrp will contain *lenp bytes * * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER - * I WITH DOT ABOVE form a case pair, as do 'I' and and LATIN SMALL LETTER + * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER * DOTLESS I */ PERL_ARGS_ASSERT_TURKIC_UC; @@ -3746,7 +3739,7 @@ S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e, return result; /* Not currently externally documented, and subject to change: - * is set iff iff the rules from the current underlying locale are to + * is set iff the rules from the current underlying locale are to * be used. */ UV @@ -3793,7 +3786,7 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, } /* Not currently externally documented, and subject to change: - * is set iff iff the rules from the current underlying locale are to + * is set iff the rules from the current underlying locale are to * be used. */ @@ -4052,9 +4045,9 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) /* =for apidoc pv_uni_display -Build to the scalar C a displayable version of the string C, -length C, the displayable version being at most C bytes long -(if longer, the rest is truncated and C<"..."> will be appended). +Build to the scalar C a displayable version of the UTF-8 encoded string +C, length C, the displayable version being at most C bytes +long (if longer, the rest is truncated and C<"..."> will be appended). The C argument can have C set to display Cable characters as themselves, C @@ -4063,6 +4056,9 @@ to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">) C (and its alias C) have both C and C turned on. +Additionally, there is now C which allows C<\b> for a +backspace, but only when C also is set. + The pointer to the PV of the C is returned. See also L. @@ -4081,10 +4077,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, SvUTF8_off(dsv); for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { UV u; - /* This serves double duty as a flag and a character to print after - a \ when flags & UNI_DISPLAY_BACKSLASH is true. - */ - char ok = 0; + bool ok = 0; if (pvlim && SvCUR(dsv) >= pvlim) { truncated++; @@ -4094,27 +4087,19 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, if (u < 256) { const unsigned char c = (unsigned char)u & 0xFF; if (flags & UNI_DISPLAY_BACKSLASH) { - switch (c) { - case '\n': - ok = 'n'; break; - case '\r': - ok = 'r'; break; - case '\t': - ok = 't'; break; - case '\f': - ok = 'f'; break; - case '\a': - ok = 'a'; break; - case '\\': - ok = '\\'; break; - default: break; - } - if (ok) { - const char string = ok; - sv_catpvs(dsv, "\\"); - sv_catpvn(dsv, &string, 1); - } - } + if ( isMNEMONIC_CNTRL(c) + && ( c != '\b' + || (flags & UNI_DISPLAY_BACKSPACE))) + { + const char * mnemonic = cntrl_to_mnemonic(c); + sv_catpvn(dsv, mnemonic, strlen(mnemonic)); + ok = 1; + } + else if (c == '\\') { + sv_catpvs(dsv, "\\\\"); + ok = 1; + } + } /* isPRINT() is the locale-blind version. */ if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) { const char string = c; @@ -4450,106 +4435,6 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, return 1; } -/* XXX The next two functions should likely be moved to mathoms.c once all - * occurrences of them are removed from the core; some cpan-upstream modules - * still use them */ - -U8 * -Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) -{ - PERL_ARGS_ASSERT_UVUNI_TO_UTF8; - - return uvoffuni_to_utf8_flags(d, uv, 0); -} - -/* -=for apidoc utf8n_to_uvuni - -Instead use L, or rarely, L. - -This function was useful for code that wanted to handle both EBCDIC and -ASCII platforms with Unicode properties, but starting in Perl v5.20, the -distinctions between the platforms have mostly been made invisible to most -code, so this function is quite unlikely to be what you want. If you do need -this precise functionality, use instead -C> -or C>. - -=cut -*/ - -UV -Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) -{ - PERL_ARGS_ASSERT_UTF8N_TO_UVUNI; - - return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags)); -} - -/* -=for apidoc uvuni_to_utf8_flags - -Instead you almost certainly want to use L or -L. - -This function is a deprecated synonym for L, -which itself, while not deprecated, should be used only in isolated -circumstances. These functions were useful for code that wanted to handle -both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl -v5.20, the distinctions between the platforms have mostly been made invisible -to most code, so this function is quite unlikely to be what you want. - -=cut -*/ - -U8 * -Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) -{ - PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS; - - return uvoffuni_to_utf8_flags(d, uv, flags); -} - -/* -=for apidoc utf8_to_uvchr - -Returns the native code point of the first character in the string C -which is assumed to be in UTF-8 encoding; C will be set to the -length, in bytes, of that character. - -Some, but not all, UTF-8 malformations are detected, and in fact, some -malformed input could cause reading beyond the end of the input buffer, which -is why this function is deprecated. Use L instead. - -If C points to one of the detected malformations, and UTF8 warnings are -enabled, zero is returned and C<*retlen> is set (if C isn't -C) to -1. If those warnings are off, the computed value if well-defined (or -the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> -is set (if C isn't NULL) so that (S + C<*retlen>>) is the -next possible position in C that could begin a non-malformed character. -See L for details on when the REPLACEMENT CHARACTER is returned. - -=cut -*/ - -UV -Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) -{ - PERL_ARGS_ASSERT_UTF8_TO_UVCHR; - - /* This function is unsafe if malformed UTF-8 input is given it, which is - * why the function is deprecated. If the first byte of the input - * indicates that there are more bytes remaining in the sequence that forms - * the character than there are in the input buffer, it can read past the - * end. But we can make it safe if the input string happens to be - * NUL-terminated, as many strings in Perl are, by refusing to read past a - * NUL, which is what UTF8_CHK_SKIP() does. A NUL indicates the start of - * the next character anyway. If the input isn't NUL-terminated, the - * function remains unsafe, as it always has been. */ - - return utf8_to_uvchr_buf(s, s + UTF8_CHK_SKIP(s), retlen); -} - /* * ex: set ts=8 sts=4 sw=4 et: */