static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
static const char cp_above_legal_max[] =
- "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf;
+ "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf ". This will be fatal in Perl 5.28";
#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
ENTER;
- SAVESPTR(PL_dowarn);
+ SAVEI8(PL_dowarn);
SAVESPTR(PL_curcop);
PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
#define MASK UTF_CONTINUATION_MASK
U8 *
-Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
{
PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
return UTF8SKIP(s);
}
-STATIC char *
-S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
+char *
+Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
{
/* Returns a mortalized C string that is a displayable copy of the 'len'
- * bytes starting at 's', each in a \xXY format. */
+ * bytes starting at 's'. 'format' gives how to display each byte.
+ * Currently, there are only two formats, so it is currently a bool:
+ * 0 \xab
+ * 1 ab (that is a space between two hex digit bytes)
+ */
const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
trailing NUL */
const unsigned high_nibble = (*s & 0xF0) >> 4;
const unsigned low_nibble = (*s & 0x0F);
- *d++ = '\\';
- *d++ = 'x';
+ if (format) {
+ *d++ = ' ';
+ }
+ else {
+ *d++ = '\\';
+ *d++ = 'x';
+ }
if (high_nibble < 10) {
*d++ = high_nibble + '0';
return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
" %s after start byte 0x%02x; need %d bytes, got %d)",
malformed_text,
- _byte_dump_string(s, print_len),
+ _byte_dump_string(s, print_len, 0),
*(s + non_cont_byte_pos),
where,
*s,
U8 * adjusted_s0 = (U8 *) s0;
U8 * adjusted_send = NULL; /* (Initialized to silence compilers' wrong
warning) */
+ U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
+ routine; see [perl #130921] */
UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
/* Save how many bytes were actually in the character */
curlen = s - s0;
- /* A convenience macro that matches either of the too-short conditions. */
-# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
-
- if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
- uv_so_far = uv;
- uv = UNICODE_REPLACEMENT;
- }
-
/* Note that there are two types of too-short malformation. One is when
* there is actual wrong data before the normal termination of the
* sequence. The other is that the sequence wasn't complete before the end
* This means that we were passed data for a partial character, but it is
* valid as far as we saw. The other is definitely invalid. This
* distinction could be important to a caller, so the two types are kept
- * separate. */
+ * separate.
+ *
+ * A convenience macro that matches either of the too-short conditions. */
+# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
+
+ if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
+ uv_so_far = uv;
+ uv = UNICODE_REPLACEMENT;
+ }
/* Check for overflow */
if (UNLIKELY(does_utf8_overflow(s0, send))) {
I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
}
- Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8);
- SAVEFREEPV((U8 *) adjusted_s0); /* Needed because we may not get
- to free it ourselves if
- warnings are made fatal */
+ adjusted_s0 = temp_char_buf;
adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
}
}
if (pack_warn) {
message = Perl_form(aTHX_ "%s: %s (overflows)",
malformed_text,
- _byte_dump_string(s0, send - s0));
+ _byte_dump_string(s0, send - s0, 0));
}
}
}
"%s: %s (unexpected continuation byte 0x%02x,"
" with no preceding start byte)",
malformed_text,
- _byte_dump_string(s0, 1), *s0);
+ _byte_dump_string(s0, 1, 0), *s0);
}
}
}
message = Perl_form(aTHX_
"%s: %s (too short; %d byte%s available, need %d)",
malformed_text,
- _byte_dump_string(s0, send - s0),
+ _byte_dump_string(s0, send - s0, 0),
(int)avail_len,
avail_len == 1 ? "" : "s",
(int)expectlen);
" should be represented with a"
" different, shorter sequence)",
malformed_text,
- _byte_dump_string(s0, send - s0),
- _byte_dump_string(s0, curlen));
+ _byte_dump_string(s0, send - s0, 0),
+ _byte_dump_string(s0, curlen, 0));
}
else {
U8 tmpbuf[UTF8_MAXBYTES+1];
"%s: %s (overlong; instead use %s to represent"
" U+%0*" UVXf ")",
malformed_text,
- _byte_dump_string(s0, send - s0),
- _byte_dump_string(tmpbuf, e - tmpbuf),
+ _byte_dump_string(s0, send - s0, 0),
+ _byte_dump_string(tmpbuf, e - tmpbuf, 0),
((uv < 256) ? 2 : 4), /* Field width of 2 for
small code points */
uv);
message = Perl_form(aTHX_
"UTF-16 surrogate (any UTF-8 sequence that"
" starts with \"%s\" is for a surrogate)",
- _byte_dump_string(s0, curlen));
+ _byte_dump_string(s0, curlen, 0));
}
else {
message = Perl_form(aTHX_
"Any UTF-8 sequence that starts with"
" \"%s\" is for a non-Unicode code point,"
" may not be portable",
- _byte_dump_string(s0, curlen));
+ _byte_dump_string(s0, curlen, 0));
}
else {
message = Perl_form(aTHX_
"Any UTF-8 sequence that starts with"
" \"%s\" is for a non-Unicode code"
" point, and is not portable",
- _byte_dump_string(s0, curlen));
+ _byte_dump_string(s0, curlen, 0));
}
else {
message = Perl_form(aTHX_
UV
Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
{
+ PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
+
assert(s < send);
return utf8n_to_uvchr(s, send - s, retlen,
Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
}
+ if (instr(file, "mathoms.c")) {
+ Perl_warner(aTHX_ WARN_DEPRECATED,
+ "In %s, line %d, starting in Perl v5.30, %s()"
+ " will be removed. Avoid this message by"
+ " converting to use %s().\n",
+ file, line, name, alternative);
+ }
+ else {
Perl_warner(aTHX_ WARN_DEPRECATED,
"In %s, line %d, starting in Perl v5.30, %s() will"
" require an additional parameter. Avoid this"
" message by converting to use %s().\n",
file, line, name, alternative);
+ }
}
}
}
warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
- SV * invlist;
switch (classnum) {
case _CC_WORDCHAR:
return is_VERTWS_high(p);
case _CC_IDFIRST:
if (! PL_utf8_perl_idstart) {
- invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
+ PL_utf8_perl_idstart
+ = _new_invlist_C_array(_Perl_IDStart_invlist);
}
- return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
+ return is_utf8_common(p, &PL_utf8_perl_idstart,
+ "_Perl_IDStart", NULL);
case _CC_IDCONT:
if (! PL_utf8_perl_idcont) {
- invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
+ PL_utf8_perl_idcont
+ = _new_invlist_C_array(_Perl_IDCont_invlist);
}
- return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
+ return is_utf8_common(p, &PL_utf8_perl_idcont,
+ "_Perl_IDCont", NULL);
}
}
L</toLOWER_utf8_safe>,
or L</toFOLD_utf8_safe>.
+This function will be removed in Perl v5.28.
+
C<p> contains the pointer to the UTF-8 string encoding
the character that is being converted. This routine assumes that the character
at C<p> is well-formed.
Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
SV **swashp, const char *normal, const char *special)
{
+ STRLEN len_cp;
+ UV cp;
+ const U8 * e = p + UTF8SKIP(p);
+
PERL_ARGS_ASSERT_TO_UTF8_CASE;
- return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special);
+ cp = utf8n_to_uvchr(p, e - p, &len_cp, UTF8_CHECK_ONLY);
+ if (len_cp == (STRLEN) -1) {
+ _force_out_malformed_utf8_message(p, e,
+ _UTF8_NO_CONFIDENCE_IN_CURLEN, 1 /* Die */ );
+ }
+
+ return _to_utf8_case(cp, p, ustrp, lenp, swashp, normal, special);
}
/* change namve uv1 to 'from' */
return original;
}
+STATIC U32
+S_check_and_deprecate(pTHX_ const U8 *p,
+ const U8 **e,
+ const unsigned int type, /* See below */
+ const bool use_locale, /* Is this a 'LC_'
+ macro call? */
+ const char * const file,
+ const unsigned line)
+{
+ /* This is a temporary function to deprecate the unsafe calls to the case
+ * changing macros and functions. It keeps all the special stuff in just
+ * one place.
+ *
+ * It updates *e with the pointer to the end of the input string. If using
+ * the old-style macros, *e is NULL on input, and so this function assumes
+ * the input string is long enough to hold the entire UTF-8 sequence, and
+ * sets *e accordingly, but it then returns a flag to pass the
+ * utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
+ * using the full length if possible.
+ *
+ * It also does the assert that *e > p when *e is not NULL. This should be
+ * migrated to the callers when this function gets deleted.
+ *
+ * The 'type' parameter is used for the caller to specify which case
+ * changing function this is called from: */
+
+# define DEPRECATE_TO_UPPER 0
+# define DEPRECATE_TO_TITLE 1
+# define DEPRECATE_TO_LOWER 2
+# define DEPRECATE_TO_FOLD 3
+
+ U32 utf8n_flags = 0;
+ const char * name;
+ const char * alternative;
+
+ PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
+
+ if (*e == NULL) {
+ utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
+ *e = p + UTF8SKIP(p);
+
+ /* For mathoms.c calls, we use the function name we know is stored
+ * there. It could be part of a larger path */
+ if (type == DEPRECATE_TO_UPPER) {
+ name = instr(file, "mathoms.c")
+ ? "to_utf8_upper"
+ : "toUPPER_utf8";
+ alternative = "toUPPER_utf8_safe";
+ }
+ else if (type == DEPRECATE_TO_TITLE) {
+ name = instr(file, "mathoms.c")
+ ? "to_utf8_title"
+ : "toTITLE_utf8";
+ alternative = "toTITLE_utf8_safe";
+ }
+ else if (type == DEPRECATE_TO_LOWER) {
+ name = instr(file, "mathoms.c")
+ ? "to_utf8_lower"
+ : "toLOWER_utf8";
+ alternative = "toLOWER_utf8_safe";
+ }
+ else if (type == DEPRECATE_TO_FOLD) {
+ name = instr(file, "mathoms.c")
+ ? "to_utf8_fold"
+ : "toFOLD_utf8";
+ alternative = "toFOLD_utf8_safe";
+ }
+ else Perl_croak(aTHX_ "panic: Unexpected case change type");
+
+ warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
+ }
+ else {
+ assert (p < *e);
+ }
+
+ return utf8n_flags;
+}
+
/* The process for changing the case is essentially the same for the four case
* change types, except there are complications for folding. Otherwise the
* difference is only which case to change to. To make sure that they all do
* going on. */
#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \
L1_func_extra_param) \
- if (e == NULL) e = p + UTF8SKIP(p); \
\
if (flags & (locale_flags)) { \
/* Treat a UTF-8 locale as not being in locale at all */ \
STRLEN len_result; \
result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
if (len_result == (STRLEN) -1) { \
- _force_out_malformed_utf8_message(p, e, \
- _UTF8_NO_CONFIDENCE_IN_CURLEN, \
- 1 /* Die */ ); \
+ _force_out_malformed_utf8_message(p, e, utf8n_flags, \
+ 1 /* Die */ ); \
}
#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
* be used. */
UV
-Perl__to_utf8_upper_flags(pTHX_ const U8 *p, const U8 *e, U8* ustrp, STRLEN *lenp, bool flags)
+Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
+ const U8 *e,
+ U8* ustrp,
+ STRLEN *lenp,
+ bool flags,
+ const char * const file,
+ const int line)
{
UV result;
+ const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
+ cBOOL(flags), file, line);
PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
*/
UV
-Perl__to_utf8_title_flags(pTHX_ const U8 *p, const U8 *e, U8* ustrp, STRLEN *lenp, bool flags)
+Perl__to_utf8_title_flags(pTHX_ const U8 *p,
+ const U8 *e,
+ U8* ustrp,
+ STRLEN *lenp,
+ bool flags,
+ const char * const file,
+ const int line)
{
UV result;
+ const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
+ cBOOL(flags), file, line);
PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
*/
UV
-Perl__to_utf8_lower_flags(pTHX_ const U8 *p, const U8 *e, U8* ustrp, STRLEN *lenp, bool flags)
+Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
+ const U8 *e,
+ U8* ustrp,
+ STRLEN *lenp,
+ bool flags,
+ const char * const file,
+ const int line)
{
UV result;
+ const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
+ cBOOL(flags), file, line);
PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
*/
UV
-Perl__to_utf8_fold_flags(pTHX_ const U8 *p, const U8 *e, U8* ustrp, STRLEN *lenp, U8 flags)
+Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
+ const U8 *e,
+ U8* ustrp,
+ STRLEN *lenp,
+ U8 flags,
+ const char * const file,
+ const int line)
{
UV result;
+ const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
+ cBOOL(flags), file, line);
PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
while ((from_list = (AV *) hv_iternextsv(specials_inverse,
&char_to, &to_len)))
{
- if (av_tindex_nomg(from_list) > 0) {
+ if (av_tindex_skip_len_mg(from_list) > 0) {
SSize_t i;
/* We iterate over all combinations of i,j to place each code
* point on each list */
- for (i = 0; i <= av_tindex_nomg(from_list); i++) {
+ for (i = 0; i <= av_tindex_skip_len_mg(from_list); i++) {
SSize_t j;
AV* i_list = newAV();
SV** entryp = av_fetch(from_list, i, FALSE);
}
/* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
- for (j = 0; j <= av_tindex_nomg(from_list); j++) {
+ for (j = 0; j <= av_tindex_skip_len_mg(from_list); j++) {
entryp = av_fetch(from_list, j, FALSE);
if (entryp == NULL) {
Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
/* Look through list to see if this inverse mapping already is
* listed, or if there is a mapping to itself already */
- for (i = 0; i <= av_tindex_nomg(list); i++) {
+ for (i = 0; i <= av_tindex_skip_len_mg(list); i++) {
SV** entryp = av_fetch(list, i, FALSE);
SV* entry;
UV uv;
invlist = _new_invlist(0);
}
else {
- while (isSPACE(*l)) l++;
l = (U8 *) after_atou;
/* Get the 0th element, which is needed to setup the inversion list */