static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
static const char cp_above_legal_max[] =
- "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf;
+ "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf ". This will be fatal in Perl 5.28";
#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
ENTER;
- SAVESPTR(PL_dowarn);
+ SAVEI8(PL_dowarn);
SAVESPTR(PL_curcop);
PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
#define MASK UTF_CONTINUATION_MASK
U8 *
-Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
{
PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
#endif
if ( (flags & UTF8_DISALLOW_SUPER)
- && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) {
+ && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
+ {
return 0; /* Above Unicode */
}
return UTF8SKIP(s);
}
-STATIC char *
-S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
+char *
+Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
{
/* Returns a mortalized C string that is a displayable copy of the 'len'
- * bytes starting at 's', each in a \xXY format. */
+ * bytes starting at 's'. 'format' gives how to display each byte.
+ * Currently, there are only two formats, so it is currently a bool:
+ * 0 \xab
+ * 1 ab (that is a space between two hex digit bytes)
+ */
const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
trailing NUL */
const unsigned high_nibble = (*s & 0xF0) >> 4;
const unsigned low_nibble = (*s & 0x0F);
- *d++ = '\\';
- *d++ = 'x';
+ if (format) {
+ *d++ = ' ';
+ }
+ else {
+ *d++ = '\\';
+ *d++ = 'x';
+ }
if (high_nibble < 10) {
*d++ = high_nibble + '0';
? "immediately"
: Perl_form(aTHX_ "%d bytes",
(int) non_cont_byte_pos);
- unsigned int i;
PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
* calculated, it's likely faster to pass it; verify under DEBUGGING */
assert(expect_len == UTF8SKIP(s));
- /* It is possible that utf8n_to_uvchr() was called incorrectly, with a
- * length that is larger than is actually available in the buffer. If we
- * print all the bytes based on that length, we will read past the buffer
- * end. Often, the strings are NUL terminated, so to lower the chances of
- * this happening, print the malformed bytes only up through any NUL. */
- for (i = 1; i < print_len; i++) {
- if (*(s + i) == '\0') {
- print_len = i + 1; /* +1 gets the NUL printed */
- break;
- }
- }
-
return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
" %s after start byte 0x%02x; need %d bytes, got %d)",
malformed_text,
- _byte_dump_string(s, print_len),
+ _byte_dump_string(s, print_len, 0),
*(s + non_cont_byte_pos),
where,
*s,
overlong sequences are expressly forbidden in the UTF-8 standard due to
potential security issues). Another malformation example is the first byte of
a character not being a legal first byte. See F<utf8.h> for the list of such
-flags. For allowed 0 length strings, this function returns 0; for allowed
-overlong sequences, the computed code point is returned; for all other allowed
-malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
-determinable reasonable value.
+flags. Even if allowed, this function generally returns the Unicode
+REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
+F<utf8.h> to override this behavior for the overlong malformations, but don't
+do that except for very specialized purposes.
The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
flags) malformation is found. If this flag is set, the routine assumes that
U8 * adjusted_s0 = (U8 *) s0;
U8 * adjusted_send = NULL; /* (Initialized to silence compilers' wrong
warning) */
+ U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
+ routine; see [perl #130921] */
UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
if (UNLIKELY(curlen == 0)) {
possible_problems |= UTF8_GOT_EMPTY;
curlen = 0;
- uv = 0; /* XXX It could be argued that this should be
- UNICODE_REPLACEMENT? */
+ uv = UNICODE_REPLACEMENT;
goto ready_to_handle_errors;
}
}
/* Here is not a continuation byte, nor an invariant. The only thing left
- * is a start byte (possibly for an overlong) */
+ * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
+ * because it excludes start bytes like \xC0 that always lead to
+ * overlongs.) */
/* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
* that indicate the number of bytes in the character's whole UTF-8
/* Save how many bytes were actually in the character */
curlen = s - s0;
- /* A convenience macro that matches either of the too-short conditions. */
-# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
-
- if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
- uv_so_far = uv;
- uv = UNICODE_REPLACEMENT;
- }
-
/* Note that there are two types of too-short malformation. One is when
* there is actual wrong data before the normal termination of the
* sequence. The other is that the sequence wasn't complete before the end
* This means that we were passed data for a partial character, but it is
* valid as far as we saw. The other is definitely invalid. This
* distinction could be important to a caller, so the two types are kept
- * separate. */
+ * separate.
+ *
+ * A convenience macro that matches either of the too-short conditions. */
+# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
+
+ if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
+ uv_so_far = uv;
+ uv = UNICODE_REPLACEMENT;
+ }
/* Check for overflow */
if (UNLIKELY(does_utf8_overflow(s0, send))) {
I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
}
- Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8);
- SAVEFREEPV((U8 *) adjusted_s0); /* Needed because we may not get
- to free it ourselves if
- warnings are made fatal */
+ adjusted_s0 = temp_char_buf;
adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
}
}
/* isn't problematic if < this */
if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
|| ( UNLIKELY(possible_problems)
+
+ /* if overflow, we know without looking further
+ * precisely which of the problematic types it is,
+ * and we deal with those in the overflow handling
+ * code */
+ && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
&& isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
&& ((flags & ( UTF8_DISALLOW_NONCHAR
|UTF8_DISALLOW_SURROGATE
if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
*errors |= UTF8_GOT_SUPER;
}
- if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
+ if (flags
+ & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT))
+ {
*errors |= UTF8_GOT_ABOVE_31_BIT;
}
- disallowed = TRUE;
+ /* Disallow if any of the three categories say to */
+ if ( ! (flags & UTF8_ALLOW_OVERFLOW)
+ || (flags & ( UTF8_DISALLOW_SUPER
+ |UTF8_DISALLOW_ABOVE_31_BIT)))
+ {
+ disallowed = TRUE;
+ }
- /* The warnings code explicitly says it doesn't handle the case
- * of packWARN2 and two categories which have parent-child
- * relationship. Even if it works now to raise the warning if
- * either is enabled, it wouldn't necessarily do so in the
- * future. We output (only) the most dire warning*/
- if (! (flags & UTF8_CHECK_ONLY)) {
- if (ckWARN_d(WARN_UTF8)) {
- pack_warn = packWARN(WARN_UTF8);
- }
- else if (ckWARN_d(WARN_NON_UNICODE)) {
- pack_warn = packWARN(WARN_NON_UNICODE);
- }
- if (pack_warn) {
- message = Perl_form(aTHX_ "%s: %s (overflows)",
- malformed_text,
- _byte_dump_string(s0, send - s0));
+
+ /* Likewise, warn if any say to, plus if deprecation warnings
+ * are on, because this code point is above IV_MAX */
+ if ( ckWARN_d(WARN_DEPRECATED)
+ || ! (flags & UTF8_ALLOW_OVERFLOW)
+ || (flags & (UTF8_WARN_SUPER|UTF8_WARN_ABOVE_31_BIT)))
+ {
+
+ /* The warnings code explicitly says it doesn't handle the
+ * case of packWARN2 and two categories which have
+ * parent-child relationship. Even if it works now to
+ * raise the warning if either is enabled, it wouldn't
+ * necessarily do so in the future. We output (only) the
+ * most dire warning*/
+ if (! (flags & UTF8_CHECK_ONLY)) {
+ if (ckWARN_d(WARN_UTF8)) {
+ pack_warn = packWARN(WARN_UTF8);
+ }
+ else if (ckWARN_d(WARN_NON_UNICODE)) {
+ pack_warn = packWARN(WARN_NON_UNICODE);
+ }
+ if (pack_warn) {
+ message = Perl_form(aTHX_ "%s: %s (overflows)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0, 0));
+ }
}
}
}
*errors |= UTF8_GOT_EMPTY;
if (! (flags & UTF8_ALLOW_EMPTY)) {
+
+ /* This so-called malformation is now treated as a bug in
+ * the caller. If you have nothing to decode, skip calling
+ * this function */
+ assert(0);
+
disallowed = TRUE;
if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
pack_warn = packWARN(WARN_UTF8);
"%s: %s (unexpected continuation byte 0x%02x,"
" with no preceding start byte)",
malformed_text,
- _byte_dump_string(s0, 1), *s0);
+ _byte_dump_string(s0, 1, 0), *s0);
}
}
}
message = Perl_form(aTHX_
"%s: %s (too short; %d byte%s available, need %d)",
malformed_text,
- _byte_dump_string(s0, send - s0),
+ _byte_dump_string(s0, send - s0, 0),
(int)avail_len,
avail_len == 1 ? "" : "s",
(int)expectlen);
if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
disallowed = TRUE;
if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+
+ /* If we don't know for sure that the input length is
+ * valid, avoid as much as possible reading past the
+ * end of the buffer */
+ int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
+ ? s - s0
+ : send - s0;
pack_warn = packWARN(WARN_UTF8);
message = Perl_form(aTHX_ "%s",
unexpected_non_continuation_text(s0,
- send - s0,
+ printlen,
s - s0,
(int) expectlen));
}
possible_problems &= ~UTF8_GOT_LONG;
*errors |= UTF8_GOT_LONG;
- if (! (flags & UTF8_ALLOW_LONG)) {
+ if (flags & UTF8_ALLOW_LONG) {
+
+ /* We don't allow the actual overlong value, unless the
+ * special extra bit is also set */
+ if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
+ & ~UTF8_ALLOW_LONG)))
+ {
+ uv = UNICODE_REPLACEMENT;
+ }
+ }
+ else {
disallowed = TRUE;
if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
" should be represented with a"
" different, shorter sequence)",
malformed_text,
- _byte_dump_string(s0, send - s0),
- _byte_dump_string(s0, curlen));
+ _byte_dump_string(s0, send - s0, 0),
+ _byte_dump_string(s0, curlen, 0));
}
else {
U8 tmpbuf[UTF8_MAXBYTES+1];
"%s: %s (overlong; instead use %s to represent"
" U+%0*" UVXf ")",
malformed_text,
- _byte_dump_string(s0, send - s0),
- _byte_dump_string(tmpbuf, e - tmpbuf),
+ _byte_dump_string(s0, send - s0, 0),
+ _byte_dump_string(tmpbuf, e - tmpbuf, 0),
((uv < 256) ? 2 : 4), /* Field width of 2 for
small code points */
uv);
message = Perl_form(aTHX_
"UTF-16 surrogate (any UTF-8 sequence that"
" starts with \"%s\" is for a surrogate)",
- _byte_dump_string(s0, curlen));
+ _byte_dump_string(s0, curlen, 0));
}
else {
message = Perl_form(aTHX_
"Any UTF-8 sequence that starts with"
" \"%s\" is for a non-Unicode code point,"
" may not be portable",
- _byte_dump_string(s0, curlen));
+ _byte_dump_string(s0, curlen, 0));
}
else {
message = Perl_form(aTHX_
"Any UTF-8 sequence that starts with"
" \"%s\" is for a non-Unicode code"
" point, and is not portable",
- _byte_dump_string(s0, curlen));
+ _byte_dump_string(s0, curlen, 0));
}
else {
message = Perl_form(aTHX_
}
}
- if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
+ if (flags & ( UTF8_WARN_ABOVE_31_BIT
+ |UTF8_DISALLOW_ABOVE_31_BIT))
+ {
*errors |= UTF8_GOT_ABOVE_31_BIT;
if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
else
Perl_warner(aTHX_ pack_warn, "%s", message);
}
- } /* End of 'while (possible_problems) {' */
+ } /* End of 'while (possible_problems)' */
/* Since there was a possible problem, the returned length may need to
* be changed from the one stored at the beginning of this function.
UV
Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
{
+ PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
+
assert(s < send);
return utf8n_to_uvchr(s, send - s, retlen,
- ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
/* This is marked as deprecated
uvchr_to_utf8(p, c);
return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
}
- else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
+ else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
the special flags. */
U8 utf8_c[UTF8_MAXBYTES + 1];
needs_full_generality:
uvchr_to_utf8(utf8_c, c);
- return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
+ return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c), p, lenp, flags);
}
}
* validating routine */
if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
_force_out_malformed_utf8_message(p, p + UTF8SKIP(p),
- 0,
+ _UTF8_NO_CONFIDENCE_IN_CURLEN,
1 /* Die */ );
NOT_REACHED; /* NOTREACHED */
}
return swash_fetch(*swash, p, TRUE) != 0;
}
+STATIC void
+S_warn_on_first_deprecated_use(pTHX_ const char * const name,
+ const char * const alternative,
+ const bool use_locale,
+ const char * const file,
+ const unsigned line)
+{
+ const char * key;
+
+ PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
+
+ if (ckWARN_d(WARN_DEPRECATED)) {
+
+ key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
+ if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
+ if (! PL_seen_deprecated_macro) {
+ PL_seen_deprecated_macro = newHV();
+ }
+ if (! hv_store(PL_seen_deprecated_macro, key,
+ strlen(key), &PL_sv_undef, 0))
+ {
+ Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
+ }
+
+ if (instr(file, "mathoms.c")) {
+ Perl_warner(aTHX_ WARN_DEPRECATED,
+ "In %s, line %d, starting in Perl v5.30, %s()"
+ " will be removed. Avoid this message by"
+ " converting to use %s().\n",
+ file, line, name, alternative);
+ }
+ else {
+ Perl_warner(aTHX_ WARN_DEPRECATED,
+ "In %s, line %d, starting in Perl v5.30, %s() will"
+ " require an additional parameter. Avoid this"
+ " message by converting to use %s().\n",
+ file, line, name, alternative);
+ }
+ }
+ }
+}
+
bool
-Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
+Perl__is_utf8_FOO(pTHX_ U8 classnum,
+ const U8 *p,
+ const char * const name,
+ const char * const alternative,
+ const bool use_utf8,
+ const bool use_locale,
+ const char * const file,
+ const unsigned line)
{
PERL_ARGS_ASSERT__IS_UTF8_FOO;
- assert(classnum < _FIRST_NON_SWASH_CC);
+ warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
+
+ if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
+
+ switch (classnum) {
+ case _CC_WORDCHAR:
+ case _CC_DIGIT:
+ case _CC_ALPHA:
+ case _CC_LOWER:
+ case _CC_UPPER:
+ case _CC_PUNCT:
+ case _CC_PRINT:
+ case _CC_ALPHANUMERIC:
+ case _CC_GRAPH:
+ case _CC_CASED:
+
+ return is_utf8_common(p,
+ &PL_utf8_swash_ptrs[classnum],
+ swash_property_names[classnum],
+ PL_XPosix_ptrs[classnum]);
+
+ case _CC_SPACE:
+ return is_XPERLSPACE_high(p);
+ case _CC_BLANK:
+ return is_HORIZWS_high(p);
+ case _CC_XDIGIT:
+ return is_XDIGIT_high(p);
+ case _CC_CNTRL:
+ return 0;
+ case _CC_ASCII:
+ return 0;
+ case _CC_VERTSPACE:
+ return is_VERTWS_high(p);
+ case _CC_IDFIRST:
+ if (! PL_utf8_perl_idstart) {
+ PL_utf8_perl_idstart
+ = _new_invlist_C_array(_Perl_IDStart_invlist);
+ }
+ return is_utf8_common(p, &PL_utf8_perl_idstart,
+ "_Perl_IDStart", NULL);
+ case _CC_IDCONT:
+ if (! PL_utf8_perl_idcont) {
+ PL_utf8_perl_idcont
+ = _new_invlist_C_array(_Perl_IDCont_invlist);
+ }
+ return is_utf8_common(p, &PL_utf8_perl_idcont,
+ "_Perl_IDCont", NULL);
+ }
+ }
+
+ /* idcont is the same as wordchar below 256 */
+ if (classnum == _CC_IDCONT) {
+ classnum = _CC_WORDCHAR;
+ }
+ else if (classnum == _CC_IDFIRST) {
+ if (*p == '_') {
+ return TRUE;
+ }
+ classnum = _CC_ALPHA;
+ }
+
+ if (! use_locale) {
+ if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
+ return _generic_isCC(*p, classnum);
+ }
+
+ return _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )), classnum);
+ }
+ else {
+ if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
+ return isFOO_lc(classnum, *p);
+ }
+
+ return isFOO_lc(classnum, EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )));
+ }
- return is_utf8_common(p,
- &PL_utf8_swash_ptrs[classnum],
- swash_property_names[classnum],
- PL_XPosix_ptrs[classnum]);
+ NOT_REACHED; /* NOTREACHED */
}
bool
}
bool
-Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
-{
- SV* invlist = NULL;
-
- PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
-
- if (! PL_utf8_perl_idstart) {
- invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
- }
- return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
-}
-
-bool
Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
{
SV* invlist = NULL;
}
bool
-Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
-{
- SV* invlist = NULL;
-
- PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
-
- if (! PL_utf8_perl_idcont) {
- invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
- }
- return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
-}
-
-bool
Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
{
SV* invlist = NULL;
/*
=for apidoc to_utf8_case
-Instead use the appropriate one of L</toUPPER_utf8>,
-L</toTITLE_utf8>,
-L</toLOWER_utf8>,
-or L</toFOLD_utf8>.
+Instead use the appropriate one of L</toUPPER_utf8_safe>,
+L</toTITLE_utf8_safe>,
+L</toLOWER_utf8_safe>,
+or L</toFOLD_utf8_safe>.
+
+This function will be removed in Perl v5.28.
C<p> contains the pointer to the UTF-8 string encoding
the character that is being converted. This routine assumes that the character
Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
SV **swashp, const char *normal, const char *special)
{
+ STRLEN len_cp;
+ UV cp;
+ const U8 * e = p + UTF8SKIP(p);
+
PERL_ARGS_ASSERT_TO_UTF8_CASE;
- return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special);
+ cp = utf8n_to_uvchr(p, e - p, &len_cp, UTF8_CHECK_ONLY);
+ if (len_cp == (STRLEN) -1) {
+ _force_out_malformed_utf8_message(p, e,
+ _UTF8_NO_CONFIDENCE_IN_CURLEN, 1 /* Die */ );
+ }
+
+ return _to_utf8_case(cp, p, ustrp, lenp, swashp, normal, special);
}
/* change namve uv1 to 'from' */
return original;
}
+STATIC U32
+S_check_and_deprecate(pTHX_ const U8 *p,
+ const U8 **e,
+ const unsigned int type, /* See below */
+ const bool use_locale, /* Is this a 'LC_'
+ macro call? */
+ const char * const file,
+ const unsigned line)
+{
+ /* This is a temporary function to deprecate the unsafe calls to the case
+ * changing macros and functions. It keeps all the special stuff in just
+ * one place.
+ *
+ * It updates *e with the pointer to the end of the input string. If using
+ * the old-style macros, *e is NULL on input, and so this function assumes
+ * the input string is long enough to hold the entire UTF-8 sequence, and
+ * sets *e accordingly, but it then returns a flag to pass the
+ * utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
+ * using the full length if possible.
+ *
+ * It also does the assert that *e > p when *e is not NULL. This should be
+ * migrated to the callers when this function gets deleted.
+ *
+ * The 'type' parameter is used for the caller to specify which case
+ * changing function this is called from: */
+
+# define DEPRECATE_TO_UPPER 0
+# define DEPRECATE_TO_TITLE 1
+# define DEPRECATE_TO_LOWER 2
+# define DEPRECATE_TO_FOLD 3
+
+ U32 utf8n_flags = 0;
+ const char * name;
+ const char * alternative;
+
+ PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
+
+ if (*e == NULL) {
+ utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
+ *e = p + UTF8SKIP(p);
+
+ /* For mathoms.c calls, we use the function name we know is stored
+ * there. It could be part of a larger path */
+ if (type == DEPRECATE_TO_UPPER) {
+ name = instr(file, "mathoms.c")
+ ? "to_utf8_upper"
+ : "toUPPER_utf8";
+ alternative = "toUPPER_utf8_safe";
+ }
+ else if (type == DEPRECATE_TO_TITLE) {
+ name = instr(file, "mathoms.c")
+ ? "to_utf8_title"
+ : "toTITLE_utf8";
+ alternative = "toTITLE_utf8_safe";
+ }
+ else if (type == DEPRECATE_TO_LOWER) {
+ name = instr(file, "mathoms.c")
+ ? "to_utf8_lower"
+ : "toLOWER_utf8";
+ alternative = "toLOWER_utf8_safe";
+ }
+ else if (type == DEPRECATE_TO_FOLD) {
+ name = instr(file, "mathoms.c")
+ ? "to_utf8_fold"
+ : "toFOLD_utf8";
+ alternative = "toFOLD_utf8_safe";
+ }
+ else Perl_croak(aTHX_ "panic: Unexpected case change type");
+
+ warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
+ }
+ else {
+ assert (p < *e);
+ }
+
+ return utf8n_flags;
+}
+
/* The process for changing the case is essentially the same for the four case
* change types, except there are complications for folding. Otherwise the
* difference is only which case to change to. To make sure that they all do
* going on. */
#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \
L1_func_extra_param) \
+ \
if (flags & (locale_flags)) { \
/* Treat a UTF-8 locale as not being in locale at all */ \
if (IN_UTF8_CTYPE_LOCALE) { \
return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
} \
} \
- else if UTF8_IS_DOWNGRADEABLE_START(*p) { \
+ else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \
if (flags & (locale_flags)) { \
result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \
*(p+1))); \
ustrp, lenp, L1_func_extra_param); \
} \
} \
- else { /* malformed UTF-8 */ \
- result = valid_utf8_to_uvchr(p, NULL); \
+ else { /* malformed UTF-8 or ord above 255 */ \
+ STRLEN len_result; \
+ result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
+ if (len_result == (STRLEN) -1) { \
+ _force_out_malformed_utf8_message(p, e, utf8n_flags, \
+ 1 /* Die */ ); \
+ }
#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
result = change_macro(result, p, ustrp, lenp); \
/*
=for apidoc to_utf8_upper
-Instead use L</toUPPER_utf8>.
+Instead use L</toUPPER_utf8_safe>.
=cut */
* be used. */
UV
-Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
+Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
+ const U8 *e,
+ U8* ustrp,
+ STRLEN *lenp,
+ bool flags,
+ const char * const file,
+ const int line)
{
UV result;
+ const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
+ cBOOL(flags), file, line);
PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
/*
=for apidoc to_utf8_title
-Instead use L</toTITLE_utf8>.
+Instead use L</toTITLE_utf8_safe>.
=cut */
*/
UV
-Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
+Perl__to_utf8_title_flags(pTHX_ const U8 *p,
+ const U8 *e,
+ U8* ustrp,
+ STRLEN *lenp,
+ bool flags,
+ const char * const file,
+ const int line)
{
UV result;
+ const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
+ cBOOL(flags), file, line);
PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
/*
=for apidoc to_utf8_lower
-Instead use L</toLOWER_utf8>.
+Instead use L</toLOWER_utf8_safe>.
=cut */
*/
UV
-Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
+Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
+ const U8 *e,
+ U8* ustrp,
+ STRLEN *lenp,
+ bool flags,
+ const char * const file,
+ const int line)
{
UV result;
+ const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
+ cBOOL(flags), file, line);
PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
/*
=for apidoc to_utf8_fold
-Instead use L</toFOLD_utf8>.
+Instead use L</toFOLD_utf8_safe>.
=cut */
*/
UV
-Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
+Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
+ const U8 *e,
+ U8* ustrp,
+ STRLEN *lenp,
+ U8 flags,
+ const char * const file,
+ const int line)
{
UV result;
+ const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
+ cBOOL(flags), file, line);
PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
while ((from_list = (AV *) hv_iternextsv(specials_inverse,
&char_to, &to_len)))
{
- if (av_tindex_nomg(from_list) > 0) {
+ if (av_tindex_skip_len_mg(from_list) > 0) {
SSize_t i;
/* We iterate over all combinations of i,j to place each code
* point on each list */
- for (i = 0; i <= av_tindex_nomg(from_list); i++) {
+ for (i = 0; i <= av_tindex_skip_len_mg(from_list); i++) {
SSize_t j;
AV* i_list = newAV();
SV** entryp = av_fetch(from_list, i, FALSE);
}
/* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
- for (j = 0; j <= av_tindex_nomg(from_list); j++) {
+ for (j = 0; j <= av_tindex_skip_len_mg(from_list); j++) {
entryp = av_fetch(from_list, j, FALSE);
if (entryp == NULL) {
Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
/* Look through list to see if this inverse mapping already is
* listed, or if there is a mapping to itself already */
- for (i = 0; i <= av_tindex_nomg(list); i++) {
+ for (i = 0; i <= av_tindex_skip_len_mg(list); i++) {
SV** entryp = av_fetch(list, i, FALSE);
SV* entry;
UV uv;
invlist = _new_invlist(0);
}
else {
- while (isSPACE(*l)) l++;
l = (U8 *) after_atou;
/* Get the 0th element, which is needed to setup the inversion list */
*foldbuf1 = toFOLD(*p1);
}
else if (u1) {
- _to_utf8_fold_flags(p1, foldbuf1, &n1, flags_for_folder);
+ _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
}
else { /* Not UTF-8, get UTF-8 fold */
_to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
*foldbuf2 = toFOLD(*p2);
}
else if (u2) {
- _to_utf8_fold_flags(p2, foldbuf2, &n2, flags_for_folder);
+ _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
}
else {
_to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);