static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
static const char cp_above_legal_max[] =
- "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf;
+ "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf ". This will be fatal in Perl 5.28";
#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
ENTER;
- SAVESPTR(PL_dowarn);
+ SAVEI8(PL_dowarn);
SAVESPTR(PL_curcop);
PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
#define MASK UTF_CONTINUATION_MASK
U8 *
-Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
{
PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
return UTF8SKIP(s);
}
-STATIC char *
-S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
+char *
+Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
{
/* Returns a mortalized C string that is a displayable copy of the 'len'
- * bytes starting at 's', each in a \xXY format. */
+ * bytes starting at 's'. 'format' gives how to display each byte.
+ * Currently, there are only two formats, so it is currently a bool:
+ * 0 \xab
+ * 1 ab (that is a space between two hex digit bytes)
+ */
const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
trailing NUL */
const unsigned high_nibble = (*s & 0xF0) >> 4;
const unsigned low_nibble = (*s & 0x0F);
- *d++ = '\\';
- *d++ = 'x';
+ if (format) {
+ *d++ = ' ';
+ }
+ else {
+ *d++ = '\\';
+ *d++ = 'x';
+ }
if (high_nibble < 10) {
*d++ = high_nibble + '0';
return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
" %s after start byte 0x%02x; need %d bytes, got %d)",
malformed_text,
- _byte_dump_string(s, print_len),
+ _byte_dump_string(s, print_len, 0),
*(s + non_cont_byte_pos),
where,
*s,
U8 * adjusted_s0 = (U8 *) s0;
U8 * adjusted_send = NULL; /* (Initialized to silence compilers' wrong
warning) */
+ U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
+ routine; see [perl #130921] */
UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
/* Save how many bytes were actually in the character */
curlen = s - s0;
- /* A convenience macro that matches either of the too-short conditions. */
-# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
-
- if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
- uv_so_far = uv;
- uv = UNICODE_REPLACEMENT;
- }
-
/* Note that there are two types of too-short malformation. One is when
* there is actual wrong data before the normal termination of the
* sequence. The other is that the sequence wasn't complete before the end
* This means that we were passed data for a partial character, but it is
* valid as far as we saw. The other is definitely invalid. This
* distinction could be important to a caller, so the two types are kept
- * separate. */
+ * separate.
+ *
+ * A convenience macro that matches either of the too-short conditions. */
+# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
+
+ if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
+ uv_so_far = uv;
+ uv = UNICODE_REPLACEMENT;
+ }
/* Check for overflow */
if (UNLIKELY(does_utf8_overflow(s0, send))) {
I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
}
- Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8);
- SAVEFREEPV((U8 *) adjusted_s0); /* Needed because we may not get
- to free it ourselves if
- warnings are made fatal */
+ adjusted_s0 = temp_char_buf;
adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
}
}
if (pack_warn) {
message = Perl_form(aTHX_ "%s: %s (overflows)",
malformed_text,
- _byte_dump_string(s0, send - s0));
+ _byte_dump_string(s0, send - s0, 0));
}
}
}
"%s: %s (unexpected continuation byte 0x%02x,"
" with no preceding start byte)",
malformed_text,
- _byte_dump_string(s0, 1), *s0);
+ _byte_dump_string(s0, 1, 0), *s0);
}
}
}
message = Perl_form(aTHX_
"%s: %s (too short; %d byte%s available, need %d)",
malformed_text,
- _byte_dump_string(s0, send - s0),
+ _byte_dump_string(s0, send - s0, 0),
(int)avail_len,
avail_len == 1 ? "" : "s",
(int)expectlen);
" should be represented with a"
" different, shorter sequence)",
malformed_text,
- _byte_dump_string(s0, send - s0),
- _byte_dump_string(s0, curlen));
+ _byte_dump_string(s0, send - s0, 0),
+ _byte_dump_string(s0, curlen, 0));
}
else {
U8 tmpbuf[UTF8_MAXBYTES+1];
"%s: %s (overlong; instead use %s to represent"
" U+%0*" UVXf ")",
malformed_text,
- _byte_dump_string(s0, send - s0),
- _byte_dump_string(tmpbuf, e - tmpbuf),
+ _byte_dump_string(s0, send - s0, 0),
+ _byte_dump_string(tmpbuf, e - tmpbuf, 0),
((uv < 256) ? 2 : 4), /* Field width of 2 for
small code points */
uv);
message = Perl_form(aTHX_
"UTF-16 surrogate (any UTF-8 sequence that"
" starts with \"%s\" is for a surrogate)",
- _byte_dump_string(s0, curlen));
+ _byte_dump_string(s0, curlen, 0));
}
else {
message = Perl_form(aTHX_
"Any UTF-8 sequence that starts with"
" \"%s\" is for a non-Unicode code point,"
" may not be portable",
- _byte_dump_string(s0, curlen));
+ _byte_dump_string(s0, curlen, 0));
}
else {
message = Perl_form(aTHX_
"Any UTF-8 sequence that starts with"
" \"%s\" is for a non-Unicode code"
" point, and is not portable",
- _byte_dump_string(s0, curlen));
+ _byte_dump_string(s0, curlen, 0));
}
else {
message = Perl_form(aTHX_
UV
Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
{
+ PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
+
assert(s < send);
return utf8n_to_uvchr(s, send - s, retlen,
warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
- SV * invlist;
switch (classnum) {
case _CC_WORDCHAR:
return is_VERTWS_high(p);
case _CC_IDFIRST:
if (! PL_utf8_perl_idstart) {
- invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
+ PL_utf8_perl_idstart
+ = _new_invlist_C_array(_Perl_IDStart_invlist);
}
- return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
+ return is_utf8_common(p, &PL_utf8_perl_idstart,
+ "_Perl_IDStart", NULL);
case _CC_IDCONT:
if (! PL_utf8_perl_idcont) {
- invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
+ PL_utf8_perl_idcont
+ = _new_invlist_C_array(_Perl_IDCont_invlist);
}
- return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
+ return is_utf8_common(p, &PL_utf8_perl_idcont,
+ "_Perl_IDCont", NULL);
}
}
while ((from_list = (AV *) hv_iternextsv(specials_inverse,
&char_to, &to_len)))
{
- if (av_tindex_nomg(from_list) > 0) {
+ if (av_tindex_skip_len_mg(from_list) > 0) {
SSize_t i;
/* We iterate over all combinations of i,j to place each code
* point on each list */
- for (i = 0; i <= av_tindex_nomg(from_list); i++) {
+ for (i = 0; i <= av_tindex_skip_len_mg(from_list); i++) {
SSize_t j;
AV* i_list = newAV();
SV** entryp = av_fetch(from_list, i, FALSE);
}
/* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
- for (j = 0; j <= av_tindex_nomg(from_list); j++) {
+ for (j = 0; j <= av_tindex_skip_len_mg(from_list); j++) {
entryp = av_fetch(from_list, j, FALSE);
if (entryp == NULL) {
Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
/* Look through list to see if this inverse mapping already is
* listed, or if there is a mapping to itself already */
- for (i = 0; i <= av_tindex_nomg(list); i++) {
+ for (i = 0; i <= av_tindex_skip_len_mg(list); i++) {
SV** entryp = av_fetch(list, i, FALSE);
SV* entry;
UV uv;
invlist = _new_invlist(0);
}
else {
- while (isSPACE(*l)) l++;
l = (U8 *) after_atou;
/* Get the 0th element, which is needed to setup the inversion list */