fix Perl #126182, out of memory due to infinite pattern recursion
authorYves Orton <demerphq@gmail.com>
Sun, 6 Mar 2016 12:56:44 +0000 (13:56 +0100)
committerYves Orton <demerphq@gmail.com>
Sun, 6 Mar 2016 13:06:08 +0000 (14:06 +0100)
The way we tracked if pattern recursion was infinite did not work
properly. A pattern like

    "a"=~/(.(?2))((?<=(?=(?1)).))/

would loop forever, slowly eat up all available ram as it added
pattern recursion stack frames.

This patch changes the rules for recursion so that recursively
entering a given pattern "subroutine" twice from the same position
fails the match. This means that where previously we might have
seen fatal exception we will now simply fail. This means that

    "aaabbb"=~/a(?R)?b/

succeeds with $& equal to "aaabbb".

regcomp.c
regcomp.h
regexec.c
regexp.h
t/re/pat.t

index b789ca3..771f217 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -4719,11 +4719,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                              RExC_study_chunk_recursed_bytes, U8);
                     }
                     /* we havent recursed into this paren yet, so recurse into it */
-                   DEBUG_STUDYDATA("set:", data,depth);
+                    DEBUG_STUDYDATA("gosub-set:", data,depth);
                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
                     my_recursed_depth= recursed_depth + 1;
                 } else {
-                   DEBUG_STUDYDATA("inf:", data,depth);
+                    DEBUG_STUDYDATA("gosub-inf:", data,depth);
                     /* some form of infinite recursion, assume infinite length
                      * */
                     if (flags & SCF_DO_SUBSTR) {
@@ -7617,8 +7617,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     if (r->minlen < minlen)
         r->minlen = minlen;
 
-    if (RExC_seen & REG_RECURSE_SEEN )
+    if (RExC_seen & REG_RECURSE_SEEN ) {
         r->intflags |= PREGf_RECURSE_SEEN;
+        Newxz(r->recurse_locinput, r->nparens + 1, char *);
+    }
     if (RExC_seen & REG_GPOS_SEEN)
         r->intflags |= PREGf_GPOS_SEEN;
     if (RExC_seen & REG_LOOKBEHIND_SEEN)
@@ -19041,6 +19043,8 @@ Perl_pregfree2(pTHX_ REGEXP *rx)
 #endif
     Safefree(r->offs);
     SvREFCNT_dec(r->qr_anoncv);
+    if (r->recurse_locinput)
+        Safefree(r->recurse_locinput);
     rx->sv_u.svu_rx = 0;
 }
 
@@ -19124,6 +19128,8 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
 #endif
     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
     SvREFCNT_inc_void(ret->qr_anoncv);
+    if (r->recurse_locinput)
+        Newxz(ret->recurse_locinput,r->nparens + 1,char *);
 
     return ret_x;
 }
@@ -19262,7 +19268,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
 #define SAVEPVN(p,n)   ((p) ? savepvn(p,n) : NULL)
 
 /*
-   re_dup - duplicate a regexp.
+   re_dup_guts - duplicate a regexp.
 
    This routine is expected to clone a given regexp structure. It is only
    compiled under USE_ITHREADS.
@@ -19330,6 +19336,8 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
 
     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
+    if (r->recurse_locinput)
+        Newxz(ret->recurse_locinput,r->nparens + 1,char *);
 
     if (ret->pprivate)
        RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
@@ -19384,6 +19392,7 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
           char, regexp_internal);
     Copy(ri->program, reti->program, len+1, regnode);
 
+
     reti->num_code_blocks = ri->num_code_blocks;
     if (ri->code_blocks) {
        int n;
@@ -19444,7 +19453,7 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
                d->data[i] = ri->data->data[i];
                break;
             default:
-               Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
+                Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
                                                            ri->data->what[i]);
            }
        }
index f16197f..c2e44aa 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -90,8 +90,6 @@
 /* This is the stuff that used to live in regexp.h that was truly
    private to the engine itself. It now lives here. */
 
-
-
  typedef struct regexp_internal {
         int name_list_idx;     /* Optional data index of an array of paren names */
         union {
index 5e188fd..07a80e2 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -3126,6 +3126,9 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
        ));
     }
 
+    if (prog->recurse_locinput)
+        Zero(prog->recurse_locinput,prog->nparens + 1, char *);
+
     /* Simplest case: anchored match need be tried only once, or with
      * MBOL, only at the beginning of each line.
      *
@@ -5184,7 +5187,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
     bool is_utf8_pat = reginfo->is_utf8_pat;
     bool match = FALSE;
 
-
 #ifdef DEBUGGING
     GET_RE_DEBUG_FLAGS_DECL;
 #endif
@@ -6494,11 +6496,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             regexp *re;
             regexp_internal *rei;
             regnode *startpoint;
+            U32 arg;
 
        case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
-           if (cur_eval && cur_eval->locinput==locinput) {
-                if ( EVAL_CLOSE_PAREN_IS( cur_eval, (U32)ARG(scan) ) )
-                    Perl_croak(aTHX_ "Infinite recursion in regex");
+            arg= (U32)ARG(scan);
+            if (cur_eval && cur_eval->locinput == locinput) {
                 if ( ++nochange_depth > max_nochange_depth )
                     Perl_croak(aTHX_ 
                         "Pattern subroutine nesting without pos change"
@@ -6510,7 +6512,32 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             re = rex;
             rei = rexi;
             startpoint = scan + ARG2L(scan);
-            EVAL_CLOSE_PAREN_SET( st, ARG(scan) ); /* ST.close_paren = 1 + ARG(scan) */
+            EVAL_CLOSE_PAREN_SET( st, arg ); /* ST.close_paren = 1 + ARG(scan) */
+            /* Detect infinite recursion
+             *
+             * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
+             * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
+             * So we track the position in the string we are at each time
+             * we recurse and if we try to enter the same routine twice from
+             * the same position we fail. This means that a pattern like
+             * "aaabbb"=~/a(?R)?b/ works as expected and does not throw an
+             * error.
+             */
+            if ( rex->recurse_locinput[arg] == locinput ) {
+                DEBUG_r({
+                    GET_RE_DEBUG_FLAGS_DECL;
+                    DEBUG_EXECUTE_r({
+                        PerlIO_printf(Perl_debug_log,
+                                    "%*s  pattern left-recursion without consuming input always fails...\n",
+                                    REPORT_CODE_OFF + depth*2, "");
+                    });
+                });
+                /* this would be infinite recursion, so we fail */
+                sayNO;
+            } else {
+                ST.prev_recurse_locinput= rex->recurse_locinput[arg];
+                rex->recurse_locinput[arg]= locinput;
+            }
 
             /* Save all the positions seen so far. */
             ST.cp = regcppush(rex, 0, maxopenparen);
@@ -6547,10 +6574,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                n = ARG(scan);
 
                if (rexi->data->what[n] == 'r') { /* code from an external qr */
-                   newcv = (ReANY(
-                                               (REGEXP*)(rexi->data->data[n])
-                                           ))->qr_anoncv
-                                       ;
+                    newcv = (ReANY(
+                                    (REGEXP*)(rexi->data->data[n])
+                            ))->qr_anoncv;
                    nop = (OP*)rexi->data->data[n+1];
                }
                else if (rexi->data->what[n] == 'l') { /* literal code */
@@ -6771,6 +6797,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                startpoint = rei->program + 1;
                 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
                                              * close_paren only for GOSUB */
+                ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
                 /* Save all the seen positions so far. */
                 ST.cp = regcppush(rex, 0, maxopenparen);
                 REGCP_SET(ST.lastcp);
@@ -6812,6 +6839,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
        case EVAL_AB: /* cleanup after a successful (??{A})B */
            /* note: this is called twice; first after popping B, then A */
+            if ( cur_eval && cur_eval->u.eval.close_paren )
+                rex->recurse_locinput[cur_eval->u.eval.close_paren - 1] = cur_eval->u.eval.prev_recurse_locinput;
+
            rex_sv = ST.prev_rex;
             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
            SET_reg_curpm(rex_sv);
@@ -6837,6 +6867,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
        case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
            /* note: this is called twice; first after popping B, then A */
+            if ( cur_eval && cur_eval->u.eval.close_paren )
+                rex->recurse_locinput[cur_eval->u.eval.close_paren - 1] = cur_eval->u.eval.prev_recurse_locinput;
+
            rex_sv = ST.prev_rex;
             is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
            SET_reg_curpm(rex_sv);
@@ -7905,6 +7938,8 @@ NULL
           fake_end:
            if (cur_eval) {
                /* we've just finished A in /(??{A})B/; now continue with B */
+                if ( cur_eval->u.eval.close_paren )
+                    rex->recurse_locinput[cur_eval->u.eval.close_paren - 1] = cur_eval->u.eval.prev_recurse_locinput;
 
                st->u.eval.prev_rex = rex_sv;           /* inner */
 
index 02258fe..09bb615 100644 (file)
--- a/regexp.h
+++ b/regexp.h
@@ -102,6 +102,7 @@ struct reg_code_block {
        const struct regexp_engine* engine;                             \
        REGEXP *mother_re; /* what re is this a lightweight copy of? */ \
        HV *paren_names;   /* Optional hash of paren names */           \
+        /*--------------------------------------------------------*/    \
        /* Information about the match that the perl core uses to */    \
        /* manage things */                                             \
        U32 extflags;   /* Flags used both externally and internally */ \
@@ -116,12 +117,15 @@ struct reg_code_block {
        U32 intflags;   /* Engine Specific Internal flags */            \
        void *pprivate; /* Data private to the regex engine which */    \
                        /* created this object. */                      \
+        /*--------------------------------------------------------*/    \
        /* Data about the last/current match. These are modified */     \
        /* during matching */                                           \
        U32 lastparen;                  /* last open paren matched */   \
        U32 lastcloseparen;             /* last close paren matched */  \
        /* Array of offsets for (@-) and (@+) */                        \
        regexp_paren_pair *offs;                                        \
+        char **recurse_locinput; /* used to detect infinite recursion, XXX: move to internal */ \
+        /*--------------------------------------------------------*/    \
        /* saved or original string so \digit works forever. */         \
        char *subbeg;                                                   \
        SV_SAVED_COPY   /* If non-NULL, SV which is COW from original */\
@@ -130,11 +134,13 @@ struct reg_code_block {
        SSize_t subcoffset; /* suboffset equiv, but in chars (for @-/@+) */ \
        /* Information about the match that isn't often used */         \
         SSize_t maxlen;        /* mininum possible number of chars in string to match */\
+        /*--------------------------------------------------------*/    \
        /* offset from wrapped to the start of precomp */               \
        PERL_BITFIELD32 pre_prefix:4;                                   \
         /* original flags used to compile the pattern, may differ */    \
         /* from extflags in various ways */                             \
         PERL_BITFIELD32 compflags:9;                                    \
+        /*--------------------------------------------------------*/    \
        CV *qr_anoncv   /* the anon sub wrapped round qr/(?{..})/ */
 
 typedef struct regexp {
@@ -657,7 +663,7 @@ typedef struct {
 /* structures for holding and saving the state maintained by regmatch() */
 
 #ifndef MAX_RECURSE_EVAL_NOCHANGE_DEPTH
-#define MAX_RECURSE_EVAL_NOCHANGE_DEPTH 1000
+#define MAX_RECURSE_EVAL_NOCHANGE_DEPTH 10
 #endif
 
 typedef I32 CHECKPOINT;
@@ -749,6 +755,7 @@ typedef struct regmatch_state {
            CHECKPOINT  lastcp;
             U32         close_paren; /* which close bracket is our end (+1) */
            regnode     *B;     /* the node following us  */
+            char        *prev_recurse_locinput;
        } eval;
 
        struct {
index fb4caf6..c35bbff 100644 (file)
@@ -23,7 +23,7 @@ BEGIN {
     skip_all_without_unicode_tables();
 }
 
-plan tests => 776;  # Update this when adding/deleting tests.
+plan tests => 781;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1721,6 +1721,14 @@ EOP
             fresh_perl_is($code, "", {},
                             "perl [#126406] panic");
        }
+        {
+            # [perl #126182] test for infinite pattern recursion
+            ok("aaabbb"=~/a(?R)?b/, "optional self recursion works");
+            ok("aaabbb"=~/a(?R)?b/, "optional self recursion works");
+            ok(not("aa"=~/(?R)a/), "left-recursion fails fast");
+            ok("bbaa"=~/(?&x)(?(DEFINE)(?<x>(?&y)*a)(?<y>(?&x)*b))/,"inter-cyclic optional left recursion works");
+            ok(not("a"=~/(.(?2))((?<=(?=(?1)).))/),"look ahead left-recursion fails fast");
+        }
 } # End of sub run_tests
 
 1;