This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
first step cleaning up regexp recursion "return" logic
authorYves Orton <demerphq@gmail.com>
Sat, 5 Mar 2016 19:40:36 +0000 (20:40 +0100)
committerYves Orton <demerphq@gmail.com>
Sun, 6 Mar 2016 13:02:14 +0000 (14:02 +0100)
When we do a GOSUB/GOSTART we need to know where the recursion
"returns" to the previous position in the pattern. Currently
this is done by comparing cur_eval->u.eval.close_paren to the
the argument of CLOSE parens, and within END blocks (for GOSTART).

However, there is a problem. The state machinery for GOSUB/GOSTART
is shared with EVAL ( implementing /(??{ ... })/ ), which also sets
cur_eval->u.eval.close_paren to 0. This for instance breaks /(?(R)YES|NO)/
by making it return true within a /(??{ ... })/ when arguably it
shouldn't. It also is confusing.

So we change the meaning of close_paren so that 0 means "from EVAL",
and that otherwise the value is "+1", so 1 means GOSTART (trigger for
END), and 2 means "CLOSE1", etc. In order to make this transparent
this patch adds the EVAL_CLOSE_PAREN_IS( cur_eval, EXPR ) macro which
does the right thing:

    ( cur_eval && cur_eval->u.eval.close_paren &&
      ( ( cur_eval->u.eval.close_paren - 1 ) == EXPR ) )

regcomp.c
regexec.c
regexp.h

index 68c599f..edd49a6 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10847,13 +10847,21 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                }
                else if (RExC_parse[0] == 'R') {
                    RExC_parse++;
+                    /* parno == 0 => /(?(R)YES|NO)/  "in any form of recursion OR eval"
+                     * parno == 1 => /(?(R0)YES|NO)/ "in GOSTART (?0) / (?R)"
+                     * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
+                     */
                    parno = 0;
-                   if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
+                    if (RExC_parse[0] == '0') {
+                        parno = 1;
+                        RExC_parse++;
+                    }
+                    else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
                         UV uv;
                         if (grok_atoUV(RExC_parse, &uv, &endptr)
                             && uv <= I32_MAX
                         ) {
-                            parno = (I32)uv;
+                            parno = (I32)uv + 1;
                             RExC_parse = (char*)endptr;
                         }
                         /* else "Switch condition not recognized" below */
@@ -10864,7 +10872,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                             SIZE_ONLY
                             ? REG_RSN_RETURN_NULL
                             : REG_RSN_RETURN_DATA);
-                       parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
+
+                        /* we should only have a false sv_dat when
+                         * SIZE_ONLY is true, and we always have false
+                         * sv_dat when SIZE_ONLY is true.
+                         * reg_scan_name() will VFAIL() if the name is
+                         * unknown when SIZE_ONLY is false, and otherwise
+                         * will return something, and when SIZE_ONLY is
+                         * true, reg_scan_name() just parses the string,
+                         * and doesnt return anything. (in theory) */
+                        assert(SIZE_ONLY ? !sv_dat : sv_dat);
+
+                        if (sv_dat)
+                            parno = 1 + *((I32 *)SvPVX(sv_dat));
                    }
                    ret = reganode(pRExC_state,INSUBP,parno);
                    goto insert_if_check_paren;
index 9b1bac6..66dc245 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -213,7 +213,8 @@ static const char* const non_utf8_target_but_utf8_required
 */
 #define JUMPABLE(rn) (                                                             \
     OP(rn) == OPEN ||                                                              \
-    (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
+    (OP(rn) == CLOSE &&                                                            \
+     !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) ||                                   \
     OP(rn) == EVAL ||                                                              \
     OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
     OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
@@ -6497,7 +6498,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
        case GOSTART: /*  (?R)  */
        case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
            if (cur_eval && cur_eval->locinput==locinput) {
-                if (cur_eval->u.eval.close_paren == (U32)ARG(scan)) 
+                if ( EVAL_CLOSE_PAREN_IS( cur_eval, (U32)ARG(scan) ) )
                     Perl_croak(aTHX_ "Infinite recursion in regex");
                 if ( ++nochange_depth > max_nochange_depth )
                     Perl_croak(aTHX_ 
@@ -6509,12 +6510,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
            re_sv = rex_sv;
             re = rex;
             rei = rexi;
-            if (OP(scan)==GOSUB) {
+            if ( OP(scan) == GOSUB ) {
                 startpoint = scan + ARG2L(scan);
-                ST.close_paren = ARG(scan);
+                ST.close_paren = 1 + ARG(scan);
             } else {
-                startpoint = rei->program+1;
-                ST.close_paren = 0;
+                startpoint = rei->program + 1;
+                ST.close_paren = 1;
             }
 
             /* Save all the positions seen so far. */
@@ -6893,9 +6894,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
            if (n > rex->lastparen)
                rex->lastparen = n;
            rex->lastcloseparen = n;
-            if (cur_eval && cur_eval->u.eval.close_paren == n) {
+            if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
                goto fake_end;
-           }    
+
            break;
 
         case ACCEPT:  /*  (*ACCEPT)  */
@@ -6914,8 +6915,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                             if (n > rex->lastparen)
                                 rex->lastparen = n;
                             rex->lastcloseparen = n;
-                            if ( n == ARG(scan) || (cur_eval &&
-                                cur_eval->u.eval.close_paren == n))
+                            if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
                                 break;
                         }
                     }
@@ -6936,7 +6936,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
         case INSUBP:   /*  (?(R))  */
             n = ARG(scan);
-            sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
+            sw = cur_eval && (n == 0 || cur_eval->u.eval.close_paren == n);
             break;
 
         case DEFINEP:  /*  (?(DEFINE))  */
@@ -7468,8 +7468,7 @@ NULL
                          (IV) ST.count, (IV)ST.alen)
            );
 
-           if (cur_eval && cur_eval->u.eval.close_paren && 
-               cur_eval->u.eval.close_paren == (U32)ST.me->flags) 
+            if (EVAL_CLOSE_PAREN_IS(cur_eval,(U32)ST.me->flags))
                goto fake_end;
                
            {
@@ -7482,9 +7481,9 @@ NULL
        case CURLYM_A_fail: /* just failed to match an A */
            REGCP_UNWIND(ST.cp);
 
+
            if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ 
-               || (cur_eval && cur_eval->u.eval.close_paren &&
-                   cur_eval->u.eval.close_paren == (U32)ST.me->flags))
+                || EVAL_CLOSE_PAREN_IS(cur_eval,(U32)ST.me->flags))
                sayNO;
 
          curlym_do_B: /* execute the B in /A{m,n}B/  */
@@ -7567,8 +7566,8 @@ NULL
                }
                else
                    rex->offs[paren].end = -1;
-               if (cur_eval && cur_eval->u.eval.close_paren &&
-                   cur_eval->u.eval.close_paren == (U32)ST.me->flags) 
+
+                if (EVAL_CLOSE_PAREN_IS(cur_eval,(U32)ST.me->flags))
                {
                    if (ST.count) 
                        goto fake_end;
@@ -7637,8 +7636,8 @@ NULL
                maxopenparen = ST.paren;
            ST.min = ARG1(scan);  /* min to match */
            ST.max = ARG2(scan);  /* max to match */
-           if (cur_eval && cur_eval->u.eval.close_paren &&
-               cur_eval->u.eval.close_paren == (U32)ST.paren) {
+            if (EVAL_CLOSE_PAREN_IS(cur_eval,(U32)ST.paren))
+            {
                ST.min=1;
                ST.max=1;
            }
@@ -7825,10 +7824,8 @@ NULL
                     assert(n == REG_INFTY || locinput == li);
                }
                CURLY_SETPAREN(ST.paren, ST.count);
-               if (cur_eval && cur_eval->u.eval.close_paren && 
-                   cur_eval->u.eval.close_paren == (U32)ST.paren) {
+                if (EVAL_CLOSE_PAREN_IS(cur_eval,(U32)ST.paren))
                    goto fake_end;
-               }
                PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
            }
            NOT_REACHED; /* NOTREACHED */
@@ -7855,10 +7852,8 @@ NULL
                {
                  curly_try_B_min:
                    CURLY_SETPAREN(ST.paren, ST.count);
-                   if (cur_eval && cur_eval->u.eval.close_paren &&
-                       cur_eval->u.eval.close_paren == (U32)ST.paren) {
+                    if (EVAL_CLOSE_PAREN_IS(cur_eval,(U32)ST.paren))
                         goto fake_end;
-                    }
                    PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
                }
            }
@@ -7867,10 +7862,8 @@ NULL
 
           curly_try_B_max:
            /* a successful greedy match: now try to match B */
-            if (cur_eval && cur_eval->u.eval.close_paren &&
-                cur_eval->u.eval.close_paren == (U32)ST.paren) {
+            if (EVAL_CLOSE_PAREN_IS(cur_eval,(U32)ST.paren))
                 goto fake_end;
-            }
            {
                bool could_match = locinput < reginfo->strend;
 
index 5dbab2e..ff44df2 100644 (file)
--- a/regexp.h
+++ b/regexp.h
@@ -747,7 +747,7 @@ typedef struct regmatch_state {
            REGEXP      *prev_rex;
            CHECKPOINT  cp;     /* remember current savestack indexes */
            CHECKPOINT  lastcp;
-           U32        close_paren; /* which close bracket is our end */
+            U32         close_paren; /* which close bracket is our end (+1) */
            regnode     *B;     /* the node following us  */
        } eval;
 
@@ -833,6 +833,13 @@ typedef struct regmatch_state {
     } u;
 } regmatch_state;
 
+#define EVAL_CLOSE_PAREN_IS(cur_eval,expr) \
+(\
+    ( ( cur_eval )                                         ) && \
+    ( ( cur_eval )->u.eval.close_paren                     ) && \
+    ( ( ( cur_eval )->u.eval.close_paren - 1 ) == ( expr ) ) \
+)
+
 /* how many regmatch_state structs to allocate as a single slab.
  * We do it in 4K blocks for efficiency. The "3" is 2 for the next/prev
  * pointers, plus 1 for any mythical malloc overhead. */