Code review showed several places in core where a UTF-8 sequence that
purports to represent a code point below 256 could be malformed, yet be
blindly accepted. Convert these to use the similar macro that also
checks for this.
One place in regexec.c was not converted because it is working on the
pattern, which perl should have generated itself, so it is very unlikely
to be malformed.
I didn't add tests for these, as it would be a pain to figure out how
to trigger them, and this is precautionary, based on code reading
rather than any known field experience.
+ char * e = (char *) t + len;
for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
U8 cur_char = t[i];
if (UTF8_IS_INVARIANT(cur_char)) {
s[d++] = cur_char;
}
for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
U8 cur_char = t[i];
if (UTF8_IS_INVARIANT(cur_char)) {
s[d++] = cur_char;
}
- else if (UTF8_IS_DOWNGRADEABLE_START(cur_char)) {
+ else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
}
else { /* Replace illegal cp with highest collating
s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
}
else { /* Replace illegal cp with highest collating
- else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+ else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, s + len)) {
if (
#ifdef USE_LOCALE_CTYPE
/* In locale, we quote all non-ASCII Latin1 chars.
if (
#ifdef USE_LOCALE_CTYPE
/* In locale, we quote all non-ASCII Latin1 chars.
if ((UTF8_IS_INVARIANT(*s)
&& to_complement ^ cBOOL(_generic_isCC((U8) *s,
classnum)))
if ((UTF8_IS_INVARIANT(*s)
&& to_complement ^ cBOOL(_generic_isCC((U8) *s,
classnum)))
- || (UTF8_IS_DOWNGRADEABLE_START(*s)
+ || ( UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend)
&& to_complement ^ cBOOL(
_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
*(s + 1)),
&& to_complement ^ cBOOL(
_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
*(s + 1)),
- if (! UTF8_IS_DOWNGRADEABLE_START(nextchr)) { /* An above Latin-1 code point */
- _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
+ if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
+ /* An above Latin-1 code point, or malformed */
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
+ reginfo->strend);
goto utf8_posix_above_latin1;
}
goto utf8_posix_above_latin1;
}
- else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
+ else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
if (! (to_complement
^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
*(locinput + 1)),
if (! (to_complement
^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
*(locinput + 1)),
return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
} \
} \
return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
} \
} \
- else if UTF8_IS_DOWNGRADEABLE_START(*p) { \
+ else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, p + UTF8SKIP(p)) { \
if (flags & (locale_flags)) { \
result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \
*(p+1))); \
if (flags & (locale_flags)) { \
result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \
*(p+1))); \