Implement variable length lookbehind in regex patterns

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 50a83df..275945c 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -163,6 +163,7 @@ struct RExC_state_t {
      I32                seen_zerolen;
      regnode_offset *open_parens;       /* offsets to open parens */
      regnode_offset *close_parens;      /* offsets to close parens */
+    I32      parens_buf_size;           /* #slots malloced open/close_parens */
      regnode     *end_op;                /* END node in program */
      I32                utf8;           /* whether the pattern is utf8 or not */
      I32                orig_utf8;      /* whether the pattern was originally in utf8 */
@@ -253,6 +254,7 @@ struct RExC_state_t {
  #define RExC_maxlen        (pRExC_state->maxlen)
  #define RExC_npar      (pRExC_state->npar)
  #define RExC_total_parens      (pRExC_state->total_par)
+#define RExC_parens_buf_size   (pRExC_state->parens_buf_size)
  #define RExC_nestroot   (pRExC_state->nestroot)
  #define RExC_seen_zerolen      (pRExC_state->seen_zerolen)
  #define RExC_utf8      (pRExC_state->utf8)
@@ -353,7 +355,7 @@ struct RExC_state_t {
              if (DEPENDS_SEMANTICS) {                                        \
                  set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
                  RExC_uni_semantics = 1;                                     \
-                if (RExC_seen_d_op && LIKELY(RExC_total_parens >= 0)) {     \
+                if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
                      /* No need to restart the parse if we haven't seen      \
                       * anything that differs between /u and /d, and no need \
                       * to restart immediately if we're going to reparse     \
@@ -364,11 +366,10 @@ struct RExC_state_t {
              }                                                               \
      } STMT_END
  
-#define BRANCH_MAX_OFFSET   U16_MAX
  #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
      STMT_START {                                                            \
                  RExC_use_BRANCHJ = 1;                                       \
-                if (LIKELY(RExC_total_parens >= 0)) {                       \
+                if (LIKELY(! IN_PARENS_PASS)) {                             \
                      /* No need to restart the parse immediately if we're    \
                       * going to reparse anyway to count parens */           \
                      *flagp |= RESTART_PARSE;                                \
@@ -376,10 +377,19 @@ struct RExC_state_t {
                  }                                                           \
      } STMT_END
  
+/* Until we have completed the parse, we leave RExC_total_parens at 0 or
+ * less.  After that, it must always be positive, because the whole re is
+ * considered to be surrounded by virtual parens.  Setting it to negative
+ * indicates there is some construct that needs to know the actual number of
+ * parens to be properly handled.  And that means an extra pass will be
+ * required after we've counted them all */
+#define ALL_PARENS_COUNTED (RExC_total_parens > 0)
  #define REQUIRE_PARENS_PASS                                                 \
-    STMT_START {                                                            \
-                    if (RExC_total_parens == 0) RExC_total_parens = -1;     \
+    STMT_START {  /* No-op if have completed a pass */                      \
+                    if (! ALL_PARENS_COUNTED) RExC_total_parens = -1;       \
      } STMT_END
+#define IN_PARENS_PASS (RExC_total_parens < 0)
+
  
  /* This is used to return failure (zero) early from the calling function if
   * various flags in 'flags' are set.  Two flags always cause a return:
@@ -2692,7 +2702,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
          trie_words = newAV();
      });
  
-    re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
+    re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
      assert(re_trie_maxbuff);
      if (!SvIOK(re_trie_maxbuff)) {
          sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
@@ -5976,14 +5986,27 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                                        last, &data_fake, stopparen,
                                        recursed_depth, NULL, f, depth+1);
                  if (scan->flags) {
-                    if (deltanext) {
-                       FAIL("Variable length lookbehind not implemented");
-                    }
-                    else if (minnext > (I32)U8_MAX) {
+                    if (   deltanext < 0
+                        || deltanext > (I32) U8_MAX
+                        || minnext > (I32)U8_MAX
+                        || minnext + deltanext > (I32)U8_MAX)
+                    {
                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
                                (UV)U8_MAX);
                      }
-                    scan->flags = (U8)minnext;
+
+                    /* The 'next_off' field has been repurposed to count the
+                     * additional starting positions to try beyond the initial
+                     * one.  (This leaves it at 0 for non-variable length
+                     * matches to avoid breakage for those not using this
+                     * extension) */
+                    if (deltanext) {
+                        scan->next_off = deltanext;
+                        ckWARNexperimental(RExC_parse,
+                            WARN_EXPERIMENTAL__VLB,
+                            "Variable length lookbehind is experimental");
+                    }
+                    scan->flags = (U8)minnext + deltanext;
                  }
                  if (data) {
                      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
@@ -6068,14 +6091,21 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                                          stopparen, recursed_depth, NULL,
                                          f, depth+1);
                  if (scan->flags) {
-                    if (deltanext) {
-                       FAIL("Variable length lookbehind not implemented");
-                    }
-                    else if (*minnextp > (I32)U8_MAX) {
+                    assert(0);  /* This code has never been tested since this
+                                   is normally not compiled */
+                    if (   deltanext < 0
+                        || deltanext > (I32) U8_MAX
+                        || *minnextp > (I32)U8_MAX
+                        || *minnextp + deltanext > (I32)U8_MAX)
+                    {
                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
                                (UV)U8_MAX);
                      }
-                    scan->flags = (U8)*minnextp;
+
+                    if (deltanext) {
+                        scan->next_off = deltanext;
+                    }
+                    scan->flags = (U8)*minnextp + deltanext;
                  }
  
                  *minnextp += min;
@@ -7658,6 +7688,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
  
      RExC_naughty = 0;
      RExC_npar = 1;
+    RExC_parens_buf_size = 0;
      RExC_emit_start = RExC_rxi->program;
      pRExC_state->code_index = 0;
  
@@ -7667,9 +7698,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      /* Do the parse */
      if (reg(pRExC_state, 0, &flags, 1)) {
  
-        /* Success!, But if RExC_total_parens < 0, we need to redo the parse
-         * knowing how many parens there actually are */
-        if (RExC_total_parens < 0) {
+        /* Success!  But we may need to redo the parse knowing how many parens
+         * there actually are */
+        if (IN_PARENS_PASS) {
              flags |= RESTART_PARSE;
          }
  
@@ -7711,7 +7742,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
              DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
          }
  
-        if (RExC_total_parens > 0) {
+        if (ALL_PARENS_COUNTED) {
              /* Make enough room for all the known parens, and zero it */
              Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
              Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
@@ -8809,7 +8840,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
              /* It might be a forward reference; we can't fail until we
                  * know, by completing the parse to get all the groups, and
                  * then reparsing */
-            if (RExC_total_parens > 0)  {
+            if (ALL_PARENS_COUNTED)  {
                  vFAIL("Reference to nonexistent named group");
              }
              else {
@@ -11001,6 +11032,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
      I32 freeze_paren = 0;
      I32 after_freeze = 0;
      I32 num; /* numeric backreferences */
+    SV * max_open;  /* Max number of unclosed parens */
  
      char * parse_start = RExC_parse; /* MJD */
      char * const oregcomp_parse = RExC_parse;
@@ -11010,6 +11042,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
      PERL_ARGS_ASSERT_REG;
      DEBUG_PARSE("reg ");
  
+
+    max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
+    assert(max_open);
+    if (!SvIOK(max_open)) {
+        sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
+    }
+    if (depth > 4 * SvIV(max_open)) { /* We increase depth by 4 for each open
+                                         paren */
+        vFAIL("Too many nested open parens");
+    }
+
      *flagp = 0;                                /* Tentatively. */
  
      /* Having this true makes it feasible to have a lot fewer tests for the
@@ -11595,7 +11638,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                          /* It might be a forward reference; we can't fail until
                           * we know, by completing the parse to get all the
                           * groups, and then reparsing */
-                        if (RExC_total_parens > 0)  {
+                        if (ALL_PARENS_COUNTED)  {
                              RExC_parse++;
                              vFAIL("Reference to nonexistent group");
                          }
@@ -11621,7 +11664,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                      /* It might be a forward reference; we can't fail until we
                       * know, by completing the parse to get all the groups, and
                       * then reparsing */
-                    if (RExC_total_parens > 0)  {
+                    if (ALL_PARENS_COUNTED)  {
                          if (num >= RExC_total_parens) {
                              RExC_parse++;
                              vFAIL("Reference to nonexistent group");
@@ -11952,34 +11995,47 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
           capturing_parens:
             parno = RExC_npar;
             RExC_npar++;
-            if (RExC_total_parens <= 0) {
+            if (! ALL_PARENS_COUNTED) {
                  /* If we are in our first pass through (and maybe only pass),
                   * we  need to allocate memory for the capturing parentheses
-                 * data structures.  Since we start at npar=1, when it reaches
-                 * 2, for the first time it has something to put in it.  Above
-                 * 2 means we extend what we already have */
-                if (RExC_npar == 2) {
+                 * data structures.
+                 */
+
+                if (!RExC_parens_buf_size) {
+                    /* first guess at number of parens we might encounter */
+                    RExC_parens_buf_size = 10;
+
                      /* setup RExC_open_parens, which holds the address of each
                       * OPEN tag, and to make things simpler for the 0 index the
                       * start of the program - this is used later for offsets */
-                    Newxz(RExC_open_parens, RExC_npar, regnode_offset);
+                    Newxz(RExC_open_parens, RExC_parens_buf_size,
+                            regnode_offset);
                      RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
  
                      /* setup RExC_close_parens, which holds the address of each
                       * CLOSE tag, and to make things simpler for the 0 index
                       * the end of the program - this is used later for offsets
                       * */
-                    Newxz(RExC_close_parens, RExC_npar, regnode_offset);
+                    Newxz(RExC_close_parens, RExC_parens_buf_size,
+                            regnode_offset);
                      /* we dont know where end op starts yet, so we dont need to
                       * set RExC_close_parens[0] like we do RExC_open_parens[0]
                       * above */
                  }
-                else {
-                    Renew(RExC_open_parens, RExC_npar, regnode_offset);
-                    Zero(RExC_open_parens + RExC_npar - 1, 1, regnode_offset);
+                else if (RExC_npar > RExC_parens_buf_size) {
+                    I32 old_size = RExC_parens_buf_size;
+
+                    RExC_parens_buf_size *= 2;
+
+                    Renew(RExC_open_parens, RExC_parens_buf_size,
+                            regnode_offset);
+                    Zero(RExC_open_parens + old_size,
+                            RExC_parens_buf_size - old_size, regnode_offset);
  
-                    Renew(RExC_close_parens, RExC_npar, regnode_offset);
-                    Zero(RExC_close_parens + RExC_npar - 1, 1, regnode_offset);
+                    Renew(RExC_close_parens, RExC_parens_buf_size,
+                            regnode_offset);
+                    Zero(RExC_close_parens + old_size,
+                            RExC_parens_buf_size - old_size, regnode_offset);
                  }
              }
  
@@ -12061,7 +12117,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
              RETURN_FAIL_ON_RESTART(flags, flagp);
              FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
          }
-        REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
+        if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
         lastbr = br;
         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
      }
@@ -12132,7 +12190,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                            (IV)(ender - lastbr)
              );
          );
-        REGTAIL(pRExC_state, lastbr, ender);
+        if (! REGTAIL(pRExC_state, lastbr, ender)) {
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
  
         if (have_branch) {
              char is_nothing= 1;
@@ -12143,9 +12203,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
             for (br = REGNODE_p(ret); br; br = regnext(br)) {
                 const U8 op = PL_regkind[OP(br)];
                 if (op == BRANCH) {
-                    REGTAIL_STUDY(pRExC_state,
-                                  REGNODE_OFFSET(NEXTOPER(br)),
-                                  ender);
+                    if (! REGTAIL_STUDY(pRExC_state,
+                                        REGNODE_OFFSET(NEXTOPER(br)),
+                                        ender))
+                    {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                      if ( OP(NEXTOPER(br)) != NOTHING
                           || regnext(NEXTOPER(br)) != REGNODE_p(ender))
                          is_nothing= 0;
@@ -12212,7 +12275,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
              Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
             Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
             FLAGS(REGNODE_p(ret)) = flag;
-            REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
+            if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
+            {
+                REQUIRE_BRANCHJ(flagp, 0);
+            }
         }
      }
  
@@ -12306,14 +12372,12 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
             /* FIXME adding one for every branch after the first is probably
              * excessive now we have TRIE support. (hv) */
             MARK_NAUGHTY(1);
-            if (     chain > (SSize_t) BRANCH_MAX_OFFSET
-                && ! RExC_use_BRANCHJ)
-            {
+            if (! REGTAIL(pRExC_state, chain, latest)) {
                  /* XXX We could just redo this branch, but figuring out what
-                 * bookkeeping needs to be reset is a pain */
+                 * bookkeeping needs to be reset is a pain, and it's likely
+                 * that other branches that goto END will also be too large */
                  REQUIRE_BRANCHJ(flagp, 0);
              }
-            REGTAIL(pRExC_state, chain, latest);
         }
         chain = latest;
         c++;
@@ -13731,7 +13795,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      /* It might be a forward reference; we can't fail until we
                       * know, by completing the parse to get all the groups, and
                       * then reparsing */
-                    if (RExC_total_parens > 0)  {
+                    if (ALL_PARENS_COUNTED)  {
                          if (num >= RExC_total_parens)  {
                              vFAIL("Reference to nonexistent group");
                          }
@@ -19541,7 +19605,11 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
      src = REGNODE_p(RExC_emit);
      RExC_emit += size;
      dst = REGNODE_p(RExC_emit);
-    if (RExC_open_parens) {
+
+    /* If we are in a "count the parentheses" pass, the numbers are unreliable,
+     * and [perl #133871] shows this can lead to problems, so skip this
+     * realignment of parens until a later pass when they are reliable */
+    if (! IN_PARENS_PASS && RExC_open_parens) {
          int paren;
          /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
          /* remember that RExC_npar is rex->nparens + 1,
@@ -19614,10 +19682,13 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
  }
  
  /*
-- regtail - set the next-pointer at the end of a node chain of p to val.
+- regtail - set the next-pointer at the end of a node chain of p to val.  If
+            that value won't fit in the space available, returns FALSE
+            instead.  (Except that it asserts if the value won't fit in the
+            largest space the regex engine is designed for.)
  - SEE ALSO: regtail_study
  */
-STATIC void
+STATIC bool
  S_regtail(pTHX_ RExC_state_t * pRExC_state,
                  const regnode_offset p,
                  const regnode_offset val,
@@ -19650,11 +19721,21 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state,
      }
  
      if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
+        assert(val - scan <= U32_MAX);
          ARG_SET(REGNODE_p(scan), val - scan);
      }
      else {
+        if (val - scan > U16_MAX) {
+            /* Since not all callers check the return value, populate this with
+             * something that won't loop and will likely lead to a crash if
+             * execution continues */
+            NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
+            return FALSE;
+        }
          NEXT_OFF(REGNODE_p(scan)) = val - scan;
      }
+
+    return TRUE;
  }
  
  #ifdef DEBUGGING
@@ -19671,10 +19752,14 @@ that it is purely analytical.
  Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
  to control which is which.
  
+This used to return a value that callers ignored.  That was a problem because
+it is #ifdef'd to be another function that didn't return a value.  khw has
+changed it so that both now return a pass/fail result.
+
  */
  /* TODO: All four parms should be const */
  
-STATIC U8
+STATIC bool
  S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
                        const regnode_offset val, U32 depth)
  {
@@ -19698,7 +19783,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
             bool unfolded_multi_char;   /* Unexamined in this routine */
              if (join_exact(pRExC_state, scan, &min,
                             &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
-                return EXACT;
+                return TRUE; /* Was return EXACT */
         }
  #endif
          if ( exact ) {
@@ -19748,13 +19833,18 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
          );
      });
      if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
+        assert(val - scan <= U32_MAX);
         ARG_SET(REGNODE_p(scan), val - scan);
      }
      else {
+        if (val - scan > U16_MAX) {
+            NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
+            return FALSE;
+        }
         NEXT_OFF(REGNODE_p(scan)) = val - scan;
      }
  
-    return exact;
+    return TRUE; /* Was 'return exact' */
  }
  #endif
  
@@ -20362,8 +20452,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
          sv_catpv(sv, bounds[FLAGS(o)]);
      }
-    else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
-       Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
+    else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
+       Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
+        if (o->next_off) {
+            Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
+        }
+       Perl_sv_catpvf(aTHX_ sv, "]");
+    }
      else if (OP(o) == SBOL)
          Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");