This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: Check for UTF-8 fitting
author Karl Williamson <khw@cpan.org>
Tue, 27 Mar 2018 21:49:06 +0000 (15:49 -0600)
committer Karl Williamson <khw@cpan.org>
Sat, 31 Mar 2018 21:36:46 +0000 (15:36 -0600)
We've been burned before by malformed UTF-8 causing us to read outside
the buffer bounds.  Here is a case I saw during code inspection, and
it's easy to add the buffer end limit.

embed.fnc
embed.h
proto.h
regexec.c

diff --git a/embed.fnc b/embed.fnc
index d2f52d4..580e304 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -2543,7 +2543,7 @@ ERp       |bool   |_is_grapheme   |NN const U8 * strbeg|NN const U8 * s|NN const U8 *stren
 #endif
 
 #if defined(PERL_IN_REGEXEC_C)
-ERs    |bool   |isFOO_utf8_lc  |const U8 classnum|NN const U8* character
+ERs    |bool   |isFOO_utf8_lc  |const U8 classnum|NN const U8* character|NN const U8* e
 ERns   |char * |find_next_ascii|NN char* s|NN const char * send|const bool is_utf8
 ERns   |char * |find_next_non_ascii|NN char* s|NN const char * send|const bool is_utf8
 ERns   |U8 *   |find_next_masked|NN U8 * s                             \
diff --git a/embed.h b/embed.h
index e71262a..97832ba 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define find_next_non_ascii    S_find_next_non_ascii
 #define find_span_end          S_find_span_end
 #define find_span_end_mask     S_find_span_end_mask
-#define isFOO_utf8_lc(a,b)     S_isFOO_utf8_lc(aTHX_ a,b)
+#define isFOO_utf8_lc(a,b,c)   S_isFOO_utf8_lc(aTHX_ a,b,c)
 #define isGCB(a,b,c,d,e)       S_isGCB(aTHX_ a,b,c,d,e)
 #define isLB(a,b,c,d,e,f)      S_isLB(aTHX_ a,b,c,d,e,f)
 #define isSB(a,b,c,d,e,f)      S_isSB(aTHX_ a,b,c,d,e,f)
diff --git a/proto.h b/proto.h
index 91bcd6d..ff049bf 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -5620,10 +5620,10 @@ STATIC U8 *     S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, co
 #define PERL_ARGS_ASSERT_FIND_SPAN_END_MASK    \
        assert(s); assert(send)
 
-STATIC bool    S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
+STATIC bool    S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
                        __attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_ISFOO_UTF8_LC \
-       assert(character)
+       assert(character); assert(e)
 
 STATIC bool    S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
                        __attribute__warn_unused_result__;
diff --git a/regexec.c b/regexec.c
index b4f2f6c..7f0849e 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -494,7 +494,7 @@ Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 #endif
 
 STATIC bool
-S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
+S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
 {
     /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
      * 'character' is a member of the Posix character class given by 'classnum'
@@ -516,7 +516,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
                         EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
     }
 
-    _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
+    _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
 
     switch ((_char_class_number) classnum) {
         case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
@@ -525,7 +525,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
         case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
         default:
             return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
-                                        valid_utf8_to_uvchr(character, NULL));
+                                        utf8_to_uvchr_buf(character, e, NULL));
     }
 
     return FALSE; /* Things like CNTRL are always below 256 */
@@ -2789,7 +2789,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 
     case POSIXL:
         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
-        REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
+        REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
                         to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
         break;
 
@@ -9512,7 +9512,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
        } else {
            while (hardcount < max && scan < loceol
                    && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
-                                                                  (U8 *) scan)))
+                                                                  (U8 *) scan,
+                                                                  (U8 *) loceol)))
             {
                 scan += UTF8SKIP(scan);
                hardcount++;