This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
PATCH: [perl #134329] Use after free in regcomp.c
authorKarl Williamson <khw@cpan.org>
Fri, 23 Aug 2019 18:40:24 +0000 (12:40 -0600)
committerKarl Williamson <khw@cpan.org>
Fri, 30 Aug 2019 14:22:05 +0000 (08:22 -0600)
A compiled regex is composed of nodes, forming a linked list, with
normally a maximum of 16 bits used to specify the offset of the next
link.  For patterns that require more space than this, the nodes that
jump around are replaced with ones that have wider offsets.  Most nodes
are unaffected, as they just contain the offset of the next node, and
that number is always small.  The jump nodes are the ones affected.

When compiling a pattern, the 16 bit mechanism is used, until it
overflows, at which point the pattern is recompiled with the long jumps
instead.

When I rewrote the compiler last year to make it generally one pass, I
noticed a lot of the cases where a node was added didn't check if the
result overflowed (the function that does this returns FALSE in that
case).  I presumed the prior authors knew better, and did not change
things, except to put in a bogus value in the link (offset) field that
should cause a crash if it were used.  That's what's happening in this
ticket.

But seeing this example, it's clear that the return value should be
checked every time, because you can reach the limit at any time.  This
commit changes to do that, and to require the function's return value to
not be ignored, to guard against future changes.

My guess is that the reason it generally worked when there were multiple
passes is that the first pass didn't do anything except count space, and
that at some point before the end of the pass the return value did get
checked, so by the time the nodes were allocated for real, it knew
enough to use the long jumps.

MANIFEST
embed.fnc
proto.h
regcomp.c
t/re/bigfuzzy_not_utf8.t [new file with mode: 0644]

index 8706958..1bc5c81 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -5839,6 +5839,7 @@ t/porting/utils.t         Check that utility scripts still compile
 t/re/alpha_assertions.t                See if things like '(*postive_lookahed:...) work properly
 t/re/anyof.t                   See if bracketed char classes [...] compile properly
 t/re/begin-once.t              Checking that /o freeze a variable in a RegExp
+t/re/bigfuzzy_not_utf8.t       Big and ugly tests not storable as UTF-8
 t/re/charset.t                 See if regex modifiers like /d, /u work properly
 t/re/fold_grind.pl             Core file to see if regex case folding works properly
 t/re/fold_grind_8.t            Wrapper for fold_grind.pl for /l testing with a UTF-8 locale
index c9230aa..c373205 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -2494,7 +2494,7 @@ ES        |void   |reginsert      |NN RExC_state_t *pRExC_state \
                                |const U8 op                                \
                                |const regnode_offset operand               \
                                |const U32 depth
-ES     |bool   |regtail        |NN RExC_state_t * pRExC_state              \
+ESR    |bool   |regtail        |NN RExC_state_t * pRExC_state              \
                                |NN const regnode_offset p                  \
                                |NN const regnode_offset val                \
                                |const U32 depth
@@ -2628,7 +2628,7 @@ ES        |void   |dump_trie_interim_list|NN const struct _reg_trie_data *trie\
 ES     |void   |dump_trie_interim_table|NN const struct _reg_trie_data *trie\
                                |NULLOK HV* widecharmap|NN AV *revcharmap\
                                |U32 next_alloc|U32 depth
-ES     |bool   |regtail_study  |NN RExC_state_t *pRExC_state \
+ESR    |bool   |regtail_study  |NN RExC_state_t *pRExC_state \
                                |NN regnode_offset p|NN const regnode_offset val|U32 depth
 #  endif
 #endif
diff --git a/proto.h b/proto.h
index 00a2ca0..29a1e0c 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -4461,9 +4461,11 @@ PERL_CALLCONV int        Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...);
        assert(fmt)
 STATIC void    S_regdump_extflags(pTHX_ const char *lead, const U32 flags);
 STATIC void    S_regdump_intflags(pTHX_ const char *lead, const U32 flags);
-STATIC bool    S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, const regnode_offset val, U32 depth);
+STATIC bool    S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, const regnode_offset val, U32 depth)
+                       __attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_REGTAIL_STUDY \
        assert(pRExC_state); assert(p); assert(val)
+
 #  endif
 #  if defined(PERL_IN_REGEXEC_C)
 STATIC void    S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8, const char *start, const char *end, const char *blurb);
@@ -5595,9 +5597,11 @@ STATIC regnode_offset    S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 o
 STATIC regnode_offset  S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth);
 #define PERL_ARGS_ASSERT_REGPIECE      \
        assert(pRExC_state); assert(flagp)
-STATIC bool    S_regtail(pTHX_ RExC_state_t * pRExC_state, const regnode_offset p, const regnode_offset val, const U32 depth);
+STATIC bool    S_regtail(pTHX_ RExC_state_t * pRExC_state, const regnode_offset p, const regnode_offset val, const U32 depth)
+                       __attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_REGTAIL       \
        assert(pRExC_state); assert(p); assert(val)
+
 STATIC void    S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, struct scan_data_t *data, SSize_t *minlenp, int is_inf);
 #define PERL_ARGS_ASSERT_SCAN_COMMIT   \
        assert(pRExC_state); assert(data); assert(minlenp)
index abb029f..edd97a8 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -11340,10 +11340,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                         return 0;
                     }
 
-                    REGTAIL(pRExC_state, ret, atomic);
+                    if (! REGTAIL(pRExC_state, ret, atomic)) {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
 
-                    REGTAIL(pRExC_state, atomic,
-                           reg_node(pRExC_state, SRCLOSE));
+                    if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
+                                                                SRCLOSE)))
+                    {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
 
                     RExC_in_script_run = 0;
                     return ret;
@@ -11803,7 +11808,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                                        RExC_flags & RXf_PMf_COMPILETIME
                                       );
                     FLAGS(REGNODE_p(ret)) = 2;
-                    REGTAIL(pRExC_state, ret, eval);
+                    if (! REGTAIL(pRExC_state, ret, eval)) {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                     /* deal with the length of this later - MJD */
                    return ret;
                }
@@ -11856,7 +11863,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
 
                     tail = reg(pRExC_state, 1, &flag, depth+1);
                     RETURN_FAIL_ON_RESTART(flag, flagp);
-                    REGTAIL(pRExC_state, ret, tail);
+                    if (! REGTAIL(pRExC_state, ret, tail)) {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                     goto insert_if;
                 }
                else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
@@ -11948,15 +11957,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                    }
                    nextchar(pRExC_state);
                  insert_if:
-                    REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
+                    if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
+                                                             IFTHEN, 0)))
+                    {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                     br = regbranch(pRExC_state, &flags, 1, depth+1);
                    if (br == 0) {
                         RETURN_FAIL_ON_RESTART(flags,flagp);
                         FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
                               (UV) flags);
                     } else
-                        REGTAIL(pRExC_state, br, reganode(pRExC_state,
-                                                          LONGJMP, 0));
+                    if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
+                                                             LONGJMP, 0)))
+                    {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                    c = UCHARAT(RExC_parse);
                     nextchar(pRExC_state);
                    if (flags&HASWIDTH)
@@ -11973,7 +11989,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                             FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
                                   (UV) flags);
                         }
-                        REGTAIL(pRExC_state, ret, lastbr);
+                        if (! REGTAIL(pRExC_state, ret, lastbr)) {
+                            REQUIRE_BRANCHJ(flagp, 0);
+                        }
                        if (flags&HASWIDTH)
                            *flagp |= HASWIDTH;
                         c = UCHARAT(RExC_parse);
@@ -11988,16 +12006,26 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                             vFAIL("Switch (?(condition)... contains too many branches");
                     }
                    ender = reg_node(pRExC_state, TAIL);
-                    REGTAIL(pRExC_state, br, ender);
+                    if (! REGTAIL(pRExC_state, br, ender)) {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                    if (lastbr) {
-                        REGTAIL(pRExC_state, lastbr, ender);
-                        REGTAIL(pRExC_state, REGNODE_OFFSET(
-                                                NEXTOPER(
-                                                NEXTOPER(REGNODE_p(lastbr)))),
-                                             ender);
+                        if (! REGTAIL(pRExC_state, lastbr, ender)) {
+                            REQUIRE_BRANCHJ(flagp, 0);
+                        }
+                        if (! REGTAIL(pRExC_state,
+                                      REGNODE_OFFSET(
+                                                 NEXTOPER(
+                                                 NEXTOPER(REGNODE_p(lastbr)))),
+                                      ender))
+                        {
+                            REQUIRE_BRANCHJ(flagp, 0);
+                        }
                    }
                    else
-                        REGTAIL(pRExC_state, ret, ender);
+                        if (! REGTAIL(pRExC_state, ret, ender)) {
+                            REQUIRE_BRANCHJ(flagp, 0);
+                        }
 #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
                     RExC_size++; /* XXX WHY do we need this?!!
                                     For large programs it seems to be required
@@ -12147,7 +12175,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
        *flagp |= flags&SIMPLE;
     }
     if (is_open) {                             /* Starts with OPEN. */
-        REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
+        if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
     }
     else if (paren != '?')             /* Not Conditional */
        ret = br;
@@ -12155,12 +12185,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
     lastbr = br;
     while (*RExC_parse == '|') {
        if (RExC_use_BRANCHJ) {
+            bool shut_gcc_up;
+
            ender = reganode(pRExC_state, LONGJMP, 0);
 
             /* Append to the previous. */
-            REGTAIL(pRExC_state,
-                    REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
-                    ender);
+            shut_gcc_up = REGTAIL(pRExC_state,
+                         REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
+                         ender);
+            PERL_UNUSED_VAR(shut_gcc_up);
        }
        nextchar(pRExC_state);
        if (freeze_paren) {
@@ -12271,9 +12304,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                         is_nothing= 0;
                }
                else if (op == BRANCHJ) {
-                    REGTAIL_STUDY(pRExC_state,
-                                  REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
-                                  ender);
+                    bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
+                                        REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
+                                        ender);
+                    PERL_UNUSED_VAR(shut_gcc_up);
                     /* for now we always disable this optimisation * /
                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
                          || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
@@ -12588,7 +12622,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                const regnode_offset w = reg_node(pRExC_state, WHILEM);
 
                FLAGS(REGNODE_p(w)) = 0;
-                REGTAIL(pRExC_state, ret, w);
+                if (!  REGTAIL(pRExC_state, ret, w)) {
+                    REQUIRE_BRANCHJ(flagp, 0);
+                }
                if (RExC_use_BRANCHJ) {
                    reginsert(pRExC_state, LONGJMP, ret, depth+1);
                    reginsert(pRExC_state, NOTHING, ret, depth+1);
@@ -12603,7 +12639,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                if (RExC_use_BRANCHJ)
                     NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
                                                        LONGJMP. */
-                REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
+                if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
+                                                          NOTHING)))
+                {
+                    REQUIRE_BRANCHJ(flagp, 0);
+                }
                 RExC_whilem_seen++;
                 MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
            }
@@ -12675,16 +12715,22 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     if (*RExC_parse == '?') {
        nextchar(pRExC_state);
        reginsert(pRExC_state, MINMOD, ret, depth+1);
-        REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
+        if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
     }
     else if (*RExC_parse == '+') {
         regnode_offset ender;
         nextchar(pRExC_state);
         ender = reg_node(pRExC_state, SUCCEED);
-        REGTAIL(pRExC_state, ret, ender);
+        if (! REGTAIL(pRExC_state, ret, ender)) {
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
         reginsert(pRExC_state, SUSPEND, ret, depth+1);
         ender = reg_node(pRExC_state, TAIL);
-        REGTAIL(pRExC_state, ret, ender);
+        if (! REGTAIL(pRExC_state, ret, ender)) {
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
     }
 
     if (ISMULT2(RExC_parse)) {
@@ -19892,8 +19938,8 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state,
     }
     else {
         if (val - scan > U16_MAX) {
-            /* Since not all callers check the return value, populate this with
-             * something that won't loop and will likely lead to a crash if
+            /* Populate this with something that won't loop and will likely
+             * lead to a crash if the caller ignores the failure return, and
              * execution continues */
             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
             return FALSE;
@@ -20004,6 +20050,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
     }
     else {
         if (val - scan > U16_MAX) {
+            /* Populate this with something that won't loop and will likely
+             * lead to a crash if the caller ignores the failure return, and
+             * execution continues */
             NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
             return FALSE;
         }
diff --git a/t/re/bigfuzzy_not_utf8.t b/t/re/bigfuzzy_not_utf8.t
new file mode 100644 (file)
index 0000000..b4dfd15
Binary files /dev/null and b/t/re/bigfuzzy_not_utf8.t differ