This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: Remove some read beyond buffer ends
authorKarl Williamson <public@khwilliamson.com>
Wed, 19 Dec 2012 17:28:29 +0000 (10:28 -0700)
committerKarl Williamson <public@khwilliamson.com>
Wed, 19 Dec 2012 18:04:10 +0000 (11:04 -0700)
It turns out that this paradigm used in several places in regexec.c is
wrong:

while (s + (len = UTF8SKIP(s)) <= PL_regeol) {
    ...
    s += len;
}

The reason it is wrong is that PL_regeol points to one past the end, and
this will inevitably end up reading the character at PL_regeol, which is
hence out-of-bounds.  I don't know how many times I've looked at this
code and missed this, but valgrind found it after I tried Dave
Mitchell's hack to remove the trailing NUL for testing purposes.

The way to make sure that there isn't malformed UTF-8 pointing us to
read too far would be:

while (s < PL_regeol && s + (len = UTF8SKIP(s)) <= PL_regeol) {}

This commit converts the one place that uses this safe thing to the
others:

while (s < PL_regeol) {}

In many places (but not all) in this file, we assume that the input is
well formed.  If we're going to be paranoid, it seems to me we should do
it consistently.  So I've converted this one relevant instance of
paranoia to not be, to be in line with the rest of the code.

regexec.c

index cd9e555..bb0c198 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1329,9 +1329,9 @@ STMT_START {                                              \
 
 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
 STMT_START {                                          \
-    while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) {     \
+    while (s < strend) {                              \
        CoDe                                          \
-       s += uskip;                                   \
+       s += UTF8SKIP(s);                             \
     }                                                 \
 } STMT_END
 
@@ -5982,7 +5982,7 @@ NULL
                else if (utf8_target) {
                    int m = ST.max - ST.min;
                    for (ST.maxpos = locinput;
-                        m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
+                        m >0 && ST.maxpos < PL_regeol; m--)
                        ST.maxpos += UTF8SKIP(ST.maxpos);
                }
                else {
@@ -6758,11 +6758,11 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
                 /* When both target and pattern are UTF-8, we have to do
                  * string EQ */
                 while (hardcount < max
-                       && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
-                       && scan_char_len <= STR_LEN(p)
+                       && scan < loceol
+                       && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
                        && memEQ(scan, STRING(p), scan_char_len))
                 {
-                    scan += scan_char_len;
+                    scan += UTF8SKIP(scan);
                     hardcount++;
                 }
             }
@@ -6874,12 +6874,11 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     }
     case ANYOF:
        if (utf8_target) {
-           STRLEN inclasslen;
            while (hardcount < max
-                   && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
+                   && scan < loceol
                   && reginclass(prog, p, (U8*)scan, utf8_target))
            {
-               scan += inclasslen;
+               scan += UTF8SKIP(scan);
                hardcount++;
            }
        } else {