X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/4b31b634f714aca0e8d52b4081eadb551c8efe0d..cc54cad05e7c2a2d63591c413ecd0fa0d42e8477:/utf8.c diff --git a/utf8.c b/utf8.c index ed5b027..4949bf6 100644 --- a/utf8.c +++ b/utf8.c @@ -37,7 +37,7 @@ static const char malformed_text[] = "Malformed UTF-8 character"; static const char unees[] = "Malformed UTF-8 character (unexpected end of string)"; static const char cp_above_legal_max[] = - "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf; + "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf ". This will be fatal in Perl 5.28"; #define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX)) @@ -78,7 +78,7 @@ Perl__force_out_malformed_utf8_message(pTHX_ PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE; ENTER; - SAVESPTR(PL_dowarn); + SAVEI8(PL_dowarn); SAVESPTR(PL_curcop); PL_dowarn = G_WARN_ALL_ON|G_WARN_ON; @@ -754,11 +754,15 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) return UTF8SKIP(s); } -STATIC char * -S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len) +char * +Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format) { /* Returns a mortalized C string that is a displayable copy of the 'len' - * bytes starting at 's', each in a \xXY format. */ + * bytes starting at 's'. 'format' gives how to display each byte. + * Currently, there are only two formats, so it is currently a bool: + * 0 \xab + * 1 ab (that is a space between two hex digit bytes) + */ const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a trailing NUL */ @@ -776,8 +780,13 @@ S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len) const unsigned high_nibble = (*s & 0xF0) >> 4; const unsigned low_nibble = (*s & 0x0F); - *d++ = '\\'; - *d++ = 'x'; + if (format) { + *d++ = ' '; + } + else { + *d++ = '\\'; + *d++ = 'x'; + } if (high_nibble < 10) { *d++ = high_nibble + '0'; @@ -827,7 +836,7 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s, return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x," " %s after start byte 0x%02x; need %d bytes, got %d)", malformed_text, - _byte_dump_string(s, print_len), + _byte_dump_string(s, print_len, 0), *(s + non_cont_byte_pos), where, *s, @@ -1070,6 +1079,8 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, U8 * adjusted_s0 = (U8 *) s0; U8 * adjusted_send = NULL; /* (Initialized to silence compilers' wrong warning) */ + U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this + routine; see [perl #130921] */ UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */ PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR; @@ -1179,14 +1190,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, /* Save how many bytes were actually in the character */ curlen = s - s0; - /* A convenience macro that matches either of the too-short conditions. */ -# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) - - if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { - uv_so_far = uv; - uv = UNICODE_REPLACEMENT; - } - /* Note that there are two types of too-short malformation. One is when * there is actual wrong data before the normal termination of the * sequence. The other is that the sequence wasn't complete before the end @@ -1194,7 +1197,15 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, * This means that we were passed data for a partial character, but it is * valid as far as we saw. The other is definitely invalid. This * distinction could be important to a caller, so the two types are kept - * separate. */ + * separate. + * + * A convenience macro that matches either of the too-short conditions. */ +# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) + + if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { + uv_so_far = uv; + uv = UNICODE_REPLACEMENT; + } /* Check for overflow */ if (UNLIKELY(does_utf8_overflow(s0, send))) { @@ -1236,10 +1247,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK)); } - Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8); - SAVEFREEPV((U8 *) adjusted_s0); /* Needed because we may not get - to free it ourselves if - warnings are made fatal */ + adjusted_s0 = temp_char_buf; adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0); } } @@ -1401,7 +1409,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, if (pack_warn) { message = Perl_form(aTHX_ "%s: %s (overflows)", malformed_text, - _byte_dump_string(s0, send - s0)); + _byte_dump_string(s0, send - s0, 0)); } } } @@ -1437,7 +1445,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, "%s: %s (unexpected continuation byte 0x%02x," " with no preceding start byte)", malformed_text, - _byte_dump_string(s0, 1), *s0); + _byte_dump_string(s0, 1, 0), *s0); } } } @@ -1452,7 +1460,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, message = Perl_form(aTHX_ "%s: %s (too short; %d byte%s available, need %d)", malformed_text, - _byte_dump_string(s0, send - s0), + _byte_dump_string(s0, send - s0, 0), (int)avail_len, avail_len == 1 ? "" : "s", (int)expectlen); @@ -1516,8 +1524,8 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, " should be represented with a" " different, shorter sequence)", malformed_text, - _byte_dump_string(s0, send - s0), - _byte_dump_string(s0, curlen)); + _byte_dump_string(s0, send - s0, 0), + _byte_dump_string(s0, curlen, 0)); } else { U8 tmpbuf[UTF8_MAXBYTES+1]; @@ -1527,8 +1535,8 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, "%s: %s (overlong; instead use %s to represent" " U+%0*" UVXf ")", malformed_text, - _byte_dump_string(s0, send - s0), - _byte_dump_string(tmpbuf, e - tmpbuf), + _byte_dump_string(s0, send - s0, 0), + _byte_dump_string(tmpbuf, e - tmpbuf, 0), ((uv < 256) ? 2 : 4), /* Field width of 2 for small code points */ uv); @@ -1553,7 +1561,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, message = Perl_form(aTHX_ "UTF-16 surrogate (any UTF-8 sequence that" " starts with \"%s\" is for a surrogate)", - _byte_dump_string(s0, curlen)); + _byte_dump_string(s0, curlen, 0)); } else { message = Perl_form(aTHX_ @@ -1583,7 +1591,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, "Any UTF-8 sequence that starts with" " \"%s\" is for a non-Unicode code point," " may not be portable", - _byte_dump_string(s0, curlen)); + _byte_dump_string(s0, curlen, 0)); } else { message = Perl_form(aTHX_ @@ -1622,7 +1630,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, "Any UTF-8 sequence that starts with" " \"%s\" is for a non-Unicode code" " point, and is not portable", - _byte_dump_string(s0, curlen)); + _byte_dump_string(s0, curlen, 0)); } else { message = Perl_form(aTHX_ @@ -1748,6 +1756,8 @@ Also implemented as a macro in utf8.h UV Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) { + PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF; + assert(s < send); return utf8n_to_uvchr(s, send - s, retlen, @@ -4541,12 +4551,12 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) while ((from_list = (AV *) hv_iternextsv(specials_inverse, &char_to, &to_len))) { - if (av_tindex_nomg(from_list) > 0) { + if (av_tindex_skip_len_mg(from_list) > 0) { SSize_t i; /* We iterate over all combinations of i,j to place each code * point on each list */ - for (i = 0; i <= av_tindex_nomg(from_list); i++) { + for (i = 0; i <= av_tindex_skip_len_mg(from_list); i++) { SSize_t j; AV* i_list = newAV(); SV** entryp = av_fetch(from_list, i, FALSE); @@ -4563,7 +4573,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) } /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/ - for (j = 0; j <= av_tindex_nomg(from_list); j++) { + for (j = 0; j <= av_tindex_skip_len_mg(from_list); j++) { entryp = av_fetch(from_list, j, FALSE); if (entryp == NULL) { Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); @@ -4639,7 +4649,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) /* Look through list to see if this inverse mapping already is * listed, or if there is a mapping to itself already */ - for (i = 0; i <= av_tindex_nomg(list); i++) { + for (i = 0; i <= av_tindex_skip_len_mg(list); i++) { SV** entryp = av_fetch(list, i, FALSE); SV* entry; UV uv;