This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Convert some calls to test for malformations
author Karl Williamson <khw@cpan.org>
Fri, 16 Dec 2016 03:11:00 +0000 (20:11 -0700)
committer Karl Williamson <khw@cpan.org>
Sat, 24 Dec 2016 05:36:34 +0000 (22:36 -0700)
Code review showed several places in core where a UTF-8 sequence that
was for a code point below 256 could be malformed, and be blindly
accepted.  Convert these to use the similar macro that does the check.

One place in regexec.c was not converted because it is working on the
pattern, which perl should have generated itself, so very unlikely to be
be malformed.

I didn't add tests for these, as it would be a pain to figure out
how to trigger them, and this is precautionary, based on code
reading rather than any known field experience.

locale.c
pp.c
regexec.c
utf8.c

index 07f599c..b86077f 100644 (file)
--- a/locale.c
+++ b/locale.c
@@ -1723,13 +1723,14 @@ Perl__mem_collxfrm(pTHX_ const char *input_string,
                 {
                     STRLEN i;
                     STRLEN d= 0;
                 {
                     STRLEN i;
                     STRLEN d= 0;
+                    char * e = (char *) t + len;
 
                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
                         U8 cur_char = t[i];
                         if (UTF8_IS_INVARIANT(cur_char)) {
                             s[d++] = cur_char;
                         }
 
                     for (i = 0; i < len; i+= UTF8SKIP(t + i)) {
                         U8 cur_char = t[i];
                         if (UTF8_IS_INVARIANT(cur_char)) {
                             s[d++] = cur_char;
                         }
-                        else if (UTF8_IS_DOWNGRADEABLE_START(cur_char)) {
+                        else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) {
                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
                         }
                         else {  /* Replace illegal cp with highest collating
                             s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]);
                         }
                         else {  /* Replace illegal cp with highest collating
diff --git a/pp.c b/pp.c
index 26b1cb3..9dad252 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -4404,7 +4404,7 @@ PP(pp_quotemeta)
                        to_quote = TRUE;
                    }
                }
                        to_quote = TRUE;
                    }
                }
-               else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+               else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, s + len)) {
                    if (
 #ifdef USE_LOCALE_CTYPE
                    /* In locale, we quote all non-ASCII Latin1 chars.
                    if (
 #ifdef USE_LOCALE_CTYPE
                    /* In locale, we quote all non-ASCII Latin1 chars.
index 8b5caa7..d9898cb 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -2429,7 +2429,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     if ((UTF8_IS_INVARIANT(*s)
                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
                                                                 classnum)))
                     if ((UTF8_IS_INVARIANT(*s)
                          && to_complement ^ cBOOL(_generic_isCC((U8) *s,
                                                                 classnum)))
-                        || (UTF8_IS_DOWNGRADEABLE_START(*s)
+                        || (   UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend)
                             && to_complement ^ cBOOL(
                                 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
                                                                       *(s + 1)),
                             && to_complement ^ cBOOL(
                                 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
                                                                       *(s + 1)),
@@ -6373,8 +6373,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 break;
             }
 
                 break;
             }
 
-            if (! UTF8_IS_DOWNGRADEABLE_START(nextchr)) { /* An above Latin-1 code point */
-                _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
+            if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
+                /* An above Latin-1 code point, or malformed */
+                _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
+                                                       reginfo->strend);
                 goto utf8_posix_above_latin1;
             }
 
                 goto utf8_posix_above_latin1;
             }
 
@@ -6458,7 +6460,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 }
                 locinput++;
             }
                 }
                 locinput++;
             }
-            else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
+            else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
                 if (! (to_complement
                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
                                                                *(locinput + 1)),
                 if (! (to_complement
                        ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
                                                                *(locinput + 1)),
diff --git a/utf8.c b/utf8.c
index 5b98352..7f3ea11 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -3037,7 +3037,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
             return L1_func(*p, ustrp, lenp, L1_func_extra_param);            \
         }                                                                    \
     }                                                                        \
             return L1_func(*p, ustrp, lenp, L1_func_extra_param);            \
         }                                                                    \
     }                                                                        \
-    else if UTF8_IS_DOWNGRADEABLE_START(*p) {                                \
+    else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, p + UTF8SKIP(p)) {            \
         if (flags & (locale_flags)) {                                        \
             result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p,         \
                                                                  *(p+1)));   \
         if (flags & (locale_flags)) {                                        \
             result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p,         \
                                                                  *(p+1)));   \