Use static asserts when comparing sizeof() to a constant

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index bc89365..781399e 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -163,6 +163,7 @@ typedef struct scan_frame {
      regnode *next_regnode;      /* next node to process when last is reached */
      U32 prev_recursed_depth;
      I32 stopparen;              /* what stopparen do we use */
+    bool in_gosub;              /* this or an outer frame is for GOSUB */
  
      struct scan_frame *this_prev_frame; /* this previous frame */
      struct scan_frame *prev_frame;      /* previous frame */
@@ -1497,15 +1498,15 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
                         ? OPTIMIZE_INFTY
                         : (l
                            ? data->last_start_max
+                          /* temporary underflow guard for 5.32 */
+                          : data->pos_delta < 0 ? OPTIMIZE_INFTY
                            : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
                                          ? OPTIMIZE_INFTY
                                          : data->pos_min + data->pos_delta));
          }
  
-        if (data->flags & SF_BEFORE_EOL)
-            data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
-        else
-            data->substrs[i].flags &= ~SF_BEFORE_EOL;
+        data->substrs[i].flags &= ~SF_BEFORE_EOL;
+        data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
          data->substrs[i].minlenp = minlenp;
          data->substrs[i].lookbehind = 0;
      }
@@ -4004,7 +4005,7 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour
   *      character folded sequences.  Since a single character can fold into
   *      such a sequence, the minimum match length for this node is less than
   *      the number of characters in the node.  This routine returns in
- *      *min_subtract how many characters to subtract from the the actual
+ *      *min_subtract how many characters to subtract from the actual
   *      length of the string to get a real minimum match length; it is 0 if
   *      there are no multi-char foldeds.  This delta is used by the caller to
   *      adjust the min length of the match, and the delta between min and max,
@@ -4442,7 +4443,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
                  s++;
              }
          }
-       else {
+       else if (OP(scan) != EXACTFAA_NO_TRIE) {
  
              /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
               * folds that are all Latin1.  As explained in the comments
@@ -4474,7 +4475,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
                      /* EXACTF nodes need to know that the minimum length
                       * changed so that a sharp s in the string can match this
                       * ss in the pattern, but they remain EXACTF nodes, as they
-                     * won't match this unless the target string is is UTF-8,
+                     * won't match this unless the target string is in UTF-8,
                       * which we don't know until runtime.  EXACTFL nodes can't
                       * transform into EXACTFU nodes */
                      if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
@@ -4527,6 +4528,44 @@ S_unwind_scan_frames(pTHX_ const void *p)
      } while (f);
  }
  
+/* Fold the offsets of NOTHING-kind and LONGJMP nodes that follow 'node' on
+   its next-chain into node's own next offset, so they are jumped over.
+   CURLYX nodes are left unchanged. */
+STATIC void
+S_rck_elide_nothing(pTHX_ regnode *node)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
+
+    if (OP(node) != CURLYX) {
+        const int max = (reg_off_by_arg[OP(node)]
+                        ? I32_MAX
+                          /* I32 may be smaller than U16 on CRAYs! */
+                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
+        int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node));
+        int noff;   /* assigned inside the loop condition before it is read */
+        regnode *n = node;
+
+        /* Skip NOTHING and LONGJMP with nonzero offsets, staying below max. */
+        while (
+            (n = regnext(n))
+            && (
+                (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
+                || ((OP(n) == LONGJMP) && (noff = ARG(n)))
+            )
+            && off + noff < max
+        ) {
+            off += noff;
+        }
+        if (reg_off_by_arg[OP(node)])   /* store in the field it was read from */
+            ARG(node) = off;
+        else
+            NEXT_OFF(node) = off;
+    }
+    return;
+}
+
  /* the return from this sub is the minimum length that could possibly match */
  STATIC SSize_t
  S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
@@ -4536,7 +4575,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         I32 stopparen,
                          U32 recursed_depth,
                         regnode_ssc *and_withp,
-                       U32 flags, U32 depth)
+                       U32 flags, U32 depth, bool was_mutate_ok)
                         /* scanp: Start here (read-write). */
                         /* deltap: Write maxlen-minlen here. */
                         /* last: Stop before this one. */
@@ -4608,6 +4647,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                     node length to get a real minimum (because
                                     the folded version may be shorter) */
         bool unfolded_multi_char = FALSE;
+        /* avoid mutating ops if we are anywhere within the recursed or
+         * enframed handling for a GOSUB: the outermost level will handle it.
+         */
+        bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
         /* Peephole optimizer: */
          DEBUG_STUDYDATA("Peep", data, depth, is_inf);
          DEBUG_PEEP("Peep", scan, depth, flags);
@@ -4618,33 +4661,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
           * parsing code, as each (?:..) is handled by a different invocation of
           * reg() -- Yves
           */
-        if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT
-                                          && OP(scan) != LEXACT_REQ8)
+        if (PL_regkind[OP(scan)] == EXACT
+            && OP(scan) != LEXACT
+            && OP(scan) != LEXACT_REQ8
+            && mutate_ok
+        ) {
              join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
                      0, NULL, depth + 1);
+        }
  
          /* Follow the next-chain of the current node and optimize
-           away all the NOTHINGs from it.  */
-        if (OP(scan) != CURLYX) {
-            const int max = (reg_off_by_arg[OP(scan)]
-                            ? I32_MAX
-                              /* I32 may be smaller than U16 on CRAYs! */
-                            : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
-            int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
-            int noff;
-            regnode *n = scan;
-
-            /* Skip NOTHING and LONGJMP. */
-            while (   (n = regnext(n))
-                   && (   (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
-                       || ((OP(n) == LONGJMP) && (noff = ARG(n))))
-                   && off + noff < max)
-                off += noff;
-            if (reg_off_by_arg[OP(scan)])
-                ARG(scan) = off;
-            else
-                NEXT_OFF(scan) = off;
-        }
+           away all the NOTHINGs from it.
+         */
+        rck_elide_nothing(scan);
  
          /* The principal pseudo-switch.  Cannot be a switch, since we look into
           * several different things.  */
@@ -4671,7 +4700,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
              /* DEFINEP study_chunk() recursion */
              (void)study_chunk(pRExC_state, &scan, &minlen,
                                &deltanext, next, &data_fake, stopparen,
-                              recursed_depth, NULL, f, depth+1);
+                              recursed_depth, NULL, f, depth+1, mutate_ok);
  
              scan = next;
          } else
@@ -4739,7 +4768,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                      /* recurse study_chunk() for each BRANCH in an alternation */
                     minnext = study_chunk(pRExC_state, &scan, minlenp,
                                        &deltanext, next, &data_fake, stopparen,
-                                      recursed_depth, NULL, f, depth+1);
+                                      recursed_depth, NULL, f, depth+1,
+                                      mutate_ok);
  
                     if (min1 > minnext)
                         min1 = minnext;
@@ -4806,9 +4836,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     }
                 }
  
-                if (PERL_ENABLE_TRIE_OPTIMISATION &&
-                        OP( startbranch ) == BRANCH )
-                {
+                if (PERL_ENABLE_TRIE_OPTIMISATION
+                    && OP(startbranch) == BRANCH
+                    && mutate_ok
+                ) {
                 /* demq.
  
                     Assuming this was/is a branch we are dealing with: 'scan'
@@ -5261,6 +5292,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                  newframe->stopparen = stopparen;
                  newframe->prev_recursed_depth = recursed_depth;
                  newframe->this_prev_frame= frame;
+                newframe->in_gosub = (
+                    (frame && frame->in_gosub) || OP(scan) == GOSUB
+                );
  
                  DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
                  DEBUG_PEEP("fnew", scan, depth, flags);
@@ -5298,8 +5332,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    offset, later match for variable offset.  */
                 if (data->last_end == -1) { /* Update the start info. */
                     data->last_start_min = data->pos_min;
-                   data->last_start_max = is_inf
-                        ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
+                    data->last_start_max =
+                        is_inf ? OPTIMIZE_INFTY
+                        : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
+                            ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
                 }
                 sv_catpvn(data->last_found, STRING(scan), bytelen);
                 if (UTF)
@@ -5345,8 +5381,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                  &&   isALPHA_A(*s)
                  &&  (         OP(scan) == EXACTFAA
                       || (     OP(scan) == EXACTFU
-                         && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s))))
-            {
+                         && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
+                &&   mutate_ok
+            ) {
                  U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
  
                  OP(scan) = ANYOFM;
@@ -5439,7 +5476,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
  
                  /* This temporary node can now be turned into EXACTFU, and
                   * must, as regexec.c doesn't handle it */
-                if (OP(next) == EXACTFU_S_EDGE) {
+                if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
                      OP(next) = EXACTFU;
                  }
  
@@ -5447,8 +5484,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                      &&   isALPHA_A(* STRING(next))
                      && (         OP(next) == EXACTFAA
                          || (     OP(next) == EXACTFU
-                            && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
-                {
+                            && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
+                    &&   mutate_ok
+                ) {
                      /* These differ in just one bit */
                      U8 mask = ~ ('A' ^ 'a');
  
@@ -5535,7 +5573,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                    (mincount == 0
                                     ? (f & ~SCF_DO_SUBSTR)
                                     : f)
-                                  ,depth+1);
+                                  , depth+1, mutate_ok);
  
                 if (flags & SCF_DO_STCLASS)
                     data->start_class = oclass;
@@ -5581,6 +5619,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                   RExC_precomp)));
                  }
  
+                if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
+                    || min >= SSize_t_MAX - minnext * mincount )
+                {
+                    FAIL("Regexp out of space");
+                }
+
                 min += minnext * mincount;
                 is_inf_internal |= deltanext == OPTIMIZE_INFTY
                           || (maxcount == REG_INFTY && minnext + deltanext > 0);
@@ -5595,7 +5639,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 if (  OP(oscan) == CURLYX && data
                       && data->flags & SF_IN_PAR
                       && !(data->flags & SF_HAS_EVAL)
-                     && !deltanext && minnext == 1 ) {
+                     && !deltanext && minnext == 1
+                      && mutate_ok
+                ) {
                     /* Try to optimize to CURLYN.  */
                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
                     regnode * const nxt1 = nxt;
@@ -5645,10 +5691,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                       && !(data->flags & SF_HAS_EVAL)
                       && !deltanext     /* atom is fixed width */
                       && minnext != 0   /* CURLYM can't handle zero width */
-
                           /* Nor characters whose fold at run-time may be
                            * multi-character */
                        && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
+                      && mutate_ok
                 ) {
                     /* XXXX How to optimize if data == 0? */
                     /* Optimize to a simpler form.  */
@@ -5701,7 +5747,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                          /* recurse study_chunk() on optimised CURLYX => CURLYM */
                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
                                      NULL, stopparen, recursed_depth, NULL, 0,
-                                    depth+1);
+                                    depth+1, mutate_ok);
                     }
                     else
                         oscan->flags = 0;
@@ -5831,11 +5877,7 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                 if (data && (fl & SF_HAS_EVAL))
                     data->flags |= SF_HAS_EVAL;
               optimize_curly_tail:
-               if (OP(oscan) != CURLYX) {
-                   while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
-                          && NEXT_OFF(next))
-                       NEXT_OFF(oscan) += NEXT_OFF(next);
-               }
+               rck_elide_nothing(oscan);
                 continue;
  
             default:
@@ -5966,7 +6008,10 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                                                            (regnode_charclass *) scan);
                     break;
  
-                case NANYOFM:
+                case NANYOFM: /* NANYOFM already contains the inversion of the
+                                 input ANYOF data, so, unlike things like
+                                 NPOSIXA, don't change 'invert' to TRUE */
+                    /* FALLTHROUGH */
                  case ANYOFM:
                    {
                      SV* cp_list = get_ANYOFM_contents(scan);
@@ -6131,7 +6176,8 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                  /* recurse study_chunk() for lookahead body */
                  minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
                                        last, &data_fake, stopparen,
-                                      recursed_depth, NULL, f, depth+1);
+                                      recursed_depth, NULL, f, depth+1,
+                                      mutate_ok);
                  if (scan->flags) {
                      if (   deltanext < 0
                          || deltanext > (I32) U8_MAX
@@ -6236,7 +6282,7 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                  *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
                                          &deltanext, last, &data_fake,
                                          stopparen, recursed_depth, NULL,
-                                        f, depth+1);
+                                        f, depth+1, mutate_ok);
                  if (scan->flags) {
                      assert(0);  /* This code has never been tested since this
                                     is normally not compiled */
@@ -6403,7 +6449,8 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                          /* optimise study_chunk() for TRIE */
                          minnext = study_chunk(pRExC_state, &scan, minlenp,
                              &deltanext, (regnode *)nextbranch, &data_fake,
-                            stopparen, recursed_depth, NULL, f, depth+1);
+                            stopparen, recursed_depth, NULL, f, depth+1,
+                            mutate_ok);
                      }
                      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
                          nextbranch= regnext((regnode*)nextbranch);
@@ -6876,7 +6923,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
          /* we make the assumption here that each op in the list of
           * op_siblings maps to one SV pushed onto the stack,
           * except for code blocks, with have both an OP_NULL and
-         * and OP_CONST.
+         * an OP_CONST.
           * This allows us to match up the list of SVs against the
           * list of OPs to find the next code block.
           *
@@ -6926,7 +6973,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
                  pRExC_state->code_blocks->count -= n;
              n = 0;
          }
-        else  {
+        else {
              /* ... or failing that, try "" overload */
              while (SvAMAGIC(msv)
                      && (sv = AMG_CALLunary(msv, string_amg))
@@ -7369,7 +7416,7 @@ S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
      PERL_ARGS_ASSERT_SET_REGEX_PV;
  
      /* make sure PL_bitcount bounds not exceeded */
-    assert(sizeof(STD_PAT_MODS) <= 8);
+    STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
  
      p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
      SvPOK_on(Rx);
@@ -7463,7 +7510,7 @@ S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
   * length of the pattern.  Patches welcome to improve that guess.  That amount
   * of space is malloc'd and then immediately freed, and then clawed back node
   * by node.  This design is to minimze, to the extent possible, memory churn
- * when doing the the reallocs.
+ * when doing the reallocs.
   *
   * A separate parentheses counting pass may be needed in some cases.
   * (Previously the sizing pass did this.)  Patches welcome to reduce the number
@@ -7842,6 +7889,13 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
  
          /* We have that number in RExC_npar */
          RExC_total_parens = RExC_npar;
+
+        /* XXX For backporting, use long jumps if there is any possibility of
+         * overflow */
+        if (RExC_size > U16_MAX && ! RExC_use_BRANCHJ) {
+            RExC_use_BRANCHJ = TRUE;
+            flags |= RESTART_PARSE;
+        }
      }
      else if (! MUST_RESTART(flags)) {
         ReREFCNT_dec(Rx);
@@ -8191,7 +8245,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
              &data, -1, 0, NULL,
              SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
                            | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
-            0);
+            0, TRUE);
  
  
          CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
@@ -8320,7 +8374,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
              SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
                                                        ? SCF_TRIE_DOING_RESTUDY
                                                        : 0),
-            0);
+            0, TRUE);
  
          CHECK_RESTUDY_GOTO_butfirst(NOOP);
  
@@ -9700,7 +9754,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
       * one of them */
      while (i_a < len_a && i_b < len_b) {
         UV cp;      /* The element to potentially add to the union's array */
-       bool cp_in_set;   /* is it in the the input list's set or not */
+       bool cp_in_set;   /* is it in the input list's set or not */
  
         /* We need to take one or the other of the two inputs for the union.
          * Since we are merging two sorted lists, we take the smaller of the
@@ -10658,7 +10712,7 @@ S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
          fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
  
          /* The only code points that aren't folded in a UTF EXACTFish
-         * node are are the problematic ones in EXACTFL nodes */
+         * node are the problematic ones in EXACTFL nodes */
          if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
              /* We need to check for the possibility that this EXACTFL
               * node begins with a multi-char fold.  Therefore we fold
@@ -11472,6 +11526,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
             bool is_logical = 0;
             const char * const seqstart = RExC_parse;
              const char * endptr;
+            const char non_existent_group_msg[]
+                                            = "Reference to nonexistent group";
+            const char impossible_group[] = "Invalid reference to group";
+
              if (has_intervening_patws) {
                  RExC_parse++;
                  vFAIL("In '(?...)', the '(' and '?' must be adjacent");
@@ -11698,10 +11756,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                      ) {
                          num = (I32)unum;
                          RExC_parse = (char*)endptr;
-                    } else
-                        num = I32_MAX;
+                    }
+                    else {  /* Overflow, or something like that.  Position
+                               beyond all digits for the message */
+                        while (RExC_parse < RExC_end && isDIGIT(*RExC_parse))  {
+                            RExC_parse++;
+                        }
+                        vFAIL(impossible_group);
+                    }
                      if (is_neg) {
-                        /* Some limit for num? */
+                        /* -num is always representable on 1 and 2's complement
+                         * machines */
                          num = -num;
                      }
                  }
@@ -11709,45 +11774,43 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                     vFAIL("Expecting close bracket");
  
                gen_recurse_regop:
-                if ( paren == '-' ) {
+                if (paren == '-' || paren == '+') {
+
+                    /* Don't overflow */
+                    if (UNLIKELY(I32_MAX - RExC_npar < num)) {
+                        RExC_parse++;
+                        vFAIL(impossible_group);
+                    }
+
                      /*
                      Diagram of capture buffer numbering.
                      Top line is the normal capture buffer numbers
                      Bottom line is the negative indexing as from
                      the X (the (?-2))
  
-                    +   1 2    3 4 5 X          6 7
+                        1 2    3 4 5 X   Y      6 7
+                       /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
                         /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
-                    -   5 4    3 2 1 X          x x
+                    -   5 4    3 2 1 X   Y      x x
  
+                    Resolve to absolute group.  Recall that RExC_npar is +1 of
+                    the actual parenthesis group number.  For lookahead, we
+                    have to compensate for that.  Using the above example, when
+                    we get to Y in the parse, num is 2 and RExC_npar is 6.  We
+                    want 7 for +2, and 4 for -2.
                      */
-                    num = RExC_npar + num;
-                    if (num < 1)  {
+                    if ( paren == '+' ) {
+                        num--;
+                    }
  
-                        /* It might be a forward reference; we can't fail until
-                         * we know, by completing the parse to get all the
-                         * groups, and then reparsing */
-                        if (ALL_PARENS_COUNTED)  {
-                            RExC_parse++;
-                            vFAIL("Reference to nonexistent group");
-                        }
-                        else {
-                            REQUIRE_PARENS_PASS;
-                        }
+                    num += RExC_npar;
+
+                    if (paren == '-' && num < 1) {
+                        RExC_parse++;
+                        vFAIL(non_existent_group_msg);
                      }
-                } else if ( paren == '+' ) {
-                    num = RExC_npar + num - 1;
                  }
-                /* We keep track how many GOSUB items we have produced.
-                   To start off the ARG2L() of the GOSUB holds its "id",
-                   which is used later in conjunction with RExC_recurse
-                   to calculate the offset we need to jump for the GOSUB,
-                   which it will store in the final representation.
-                   We have to defer the actual calculation until much later
-                   as the regop may move.
-                 */
  
-                ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
                  if (num >= RExC_npar) {
  
                      /* It might be a forward reference; we can't fail until we
@@ -11756,13 +11819,23 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                      if (ALL_PARENS_COUNTED)  {
                          if (num >= RExC_total_parens) {
                              RExC_parse++;
-                            vFAIL("Reference to nonexistent group");
+                            vFAIL(non_existent_group_msg);
                          }
                      }
                      else {
                          REQUIRE_PARENS_PASS;
                      }
                  }
+
+                /* We keep track how many GOSUB items we have produced.
+                   To start off the ARG2L() of the GOSUB holds its "id",
+                   which is used later in conjunction with RExC_recurse
+                   to calculate the offset we need to jump for the GOSUB,
+                   which it will store in the final representation.
+                   We have to defer the actual calculation until much later
+                   as the regop may move.
+                 */
+                ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
                  RExC_recurse_count++;
                  DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
                      "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
@@ -13527,8 +13600,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  /* SBOL is shared with /^/ so we set the flags so we can tell
                   * /\A/ from /^/ in split. */
                  FLAGS(REGNODE_p(ret)) = 1;
+                *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
              }
-           *flagp |= SIMPLE;
             goto finish_meta_pat;
         case 'G':
              if (RExC_pm_flags & PMf_WILDCARD) {
@@ -13566,8 +13639,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              }
              else {
                  ret = reg_node(pRExC_state, SEOL);
+                *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
              }
-           *flagp |= SIMPLE;
             RExC_seen_zerolen++;                /* Do not optimize RE away */
             goto finish_meta_pat;
         case 'z':
@@ -13577,8 +13650,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              }
              else {
                  ret = reg_node(pRExC_state, EOS);
+                *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
              }
-           *flagp |= SIMPLE;
             RExC_seen_zerolen++;                /* Do not optimize RE away */
             goto finish_meta_pat;
         case 'C':
@@ -15135,9 +15208,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          * have to map that back to the original */
                          if (need_to_fold_loc) {
                              upper_fill = loc_correspondence[s - s_start];
-                            Safefree(locfold_buf);
-                            Safefree(loc_correspondence);
-
                              if (upper_fill == 0) {
                                  FAIL2("panic: loc_correspondence[%d] is 0",
                                        (int) (s - s_start));
@@ -15148,10 +15218,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          }
                          goto reparse;
                      }
-                    else if (need_to_fold_loc) {
-                        Safefree(locfold_buf);
-                        Safefree(loc_correspondence);
-                    }
  
                      /* Here the node consists entirely of non-final multi-char
                       * folds.  (Likely it is all 'f's or all 's's.)  There's no
@@ -15159,6 +15225,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                       * whole thing */
                      len = old_s - s0;
                  }
+
+                if (need_to_fold_loc) {
+                    Safefree(locfold_buf);
+                    Safefree(loc_correspondence);
+                }
             }   /* End of verifying node ends with an appropriate char */
  
              /* We need to start the next node at the character that didn't fit
@@ -16428,7 +16499,10 @@ redo_curchar:
                             /* If more than a single node returned, the nested
                              * parens evaluated to more than just a (?[...]),
                              * which isn't legal */
-                        || node != 1) {
+                        || RExC_emit != orig_emit
+                                      + NODE_STEP_REGNODE
+                                      + regarglen[REGEX_SET])
+                    {
                          vFAIL("Expecting interpolated extended charclass");
                      }
                      resultant_invlist = (SV *) ARGp(REGNODE_p(node));
@@ -16479,6 +16553,8 @@ redo_curchar:
                      goto regclass_failed;
                  }
  
+                assert(current);
+
                  /* regclass() will return with parsing just the \ sequence,
                   * leaving the parse pointer at the next thing to parse */
                  RExC_parse--;
@@ -16516,9 +16592,7 @@ redo_curchar:
                      goto regclass_failed;
                  }
  
-                if (! current) {
-                    break;
-                }
+                assert(current);
  
                  /* function call leaves parse pointing to the ']', except if we
                   * faked it */
@@ -16827,7 +16901,7 @@ redo_curchar:
      if (RExC_sets_depth) {  /* If within a recursive call, return in a special
                                 regnode */
          RExC_parse++;
-        node = regpnode(pRExC_state, REGEX_SET, (void *) final);
+        node = regpnode(pRExC_state, REGEX_SET, final);
      }
      else {
  
@@ -17333,6 +17407,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      PERL_UNUSED_ARG(depth);
  #endif
  
+    assert(! (ret_invlist && allow_mutiple_chars));
  
      /* If wants an inversion list returned, we can't optimize to something
       * else. */
@@ -17702,7 +17777,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      assert(prop_definition || strings);
  
                      if (strings) {
-                        if (! RExC_in_multi_char_class) {
+                        if (ret_invlist) {
+                            if (! prop_definition) {
+                                RExC_parse = e + 1;
+                                vFAIL("Unicode string properties are not implemented in (?[...])");
+                            }
+                            else {
+                                ckWARNreg(e + 1,
+                                    "Using just the single character results"
+                                    " returned by \\p{} in (?[...])");
+                            }
+                        }
+                        else if (! RExC_in_multi_char_class) {
                              if (invert ^ (value == 'P')) {
                                  RExC_parse = e + 1;
                                  vFAIL("Inverting a character class which contains"
@@ -18273,7 +18359,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                       * printable should have each end point be a portable value
                       * for it (preferably like 'A', but we don't warn if it is
                       * a (portable) Unicode name or code point), and the range
-                     * must be be all digits or all letters of the same case.
+                     * must be all digits or all letters of the same case.
                       * Otherwise, the range is non-portable and unclear as to
                       * what it contains */
                      if (             (isPRINT_A(prevvalue) || isPRINT_A(value))
@@ -18931,7 +19017,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      if (ret_invlist) {
          *ret_invlist = cp_list;
  
-        return RExC_emit;
+        return (cp_list) ? RExC_emit : 0;
      }
  
      if (anyof_flags & ANYOF_LOCALE_FLAGS) {
@@ -19272,7 +19358,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                       * points) in the ASCII range, so we can't use it here to
                       * artificially restrict the fold domain, so we check if
                       * the class does or does not match some EXACTFish node.
-                     * Further, if we aren't under /i, and and the folded-to
+                     * Further, if we aren't under /i, and the folded-to
                       * character is part of a multi-character fold, we can't do
                       * this optimization, as the sequence around it could be
                       * that multi-character fold, and we don't here know the
@@ -19939,6 +20025,9 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
              av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
          }
  
+        /* (Note that if any of this changes, the size calculations in
+         * S_optimize_regclass() might need to be updated.) */
+
          if (only_utf8_locale_list) {
              av_store(av, ONLY_LOCALE_MATCHES_INDEX,
                                       SvREFCNT_inc_NN(only_utf8_locale_list));
@@ -20435,10 +20524,10 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
  }
  
  /*
-- regpnode - emit a temporary node with a void* argument
+- regpnode - emit a temporary node with an SV* argument
  */
  STATIC regnode_offset /* Location. */
-S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, void * arg)
+S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
  {
      const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode");
      regnode_offset ptr = ret;
@@ -20601,7 +20690,8 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state,
      PERL_UNUSED_ARG(depth);
  #endif
  
-    /* Find last node. */
+    /* The final node in the chain is the first one with a zero (NULL) next
+     * pointer */
      scan = (regnode_offset) p;
      for (;;) {
         regnode * const temp = regnext(REGNODE_p(scan));
@@ -20619,6 +20709,7 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state,
          scan = REGNODE_OFFSET(temp);
      }
  
+    /* Populate this node's next pointer */
      assert(val >= scan);
      if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
          assert((UV) (val - scan) <= U32_MAX);
@@ -21370,11 +21461,16 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                           : (OP(o) == ANYOFH || OP(o) == ANYOFR)
                             ? 0xFF
                             : lowest;
-            Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
-            if (lowest != highest) {
-                Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+#ifndef EBCDIC
+            if (OP(o) != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
+#endif
+            {
+                Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
+                if (lowest != highest) {
+                    Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+                }
+                Perl_sv_catpvf(aTHX_ sv, ")");
              }
-            Perl_sv_catpvf(aTHX_ sv, ")");
          }
  
          SvREFCNT_dec(unresolved);
@@ -21453,7 +21549,9 @@ SV *
  Perl_re_intuit_string(pTHX_ REGEXP * const r)
  {                              /* Assume that RE_INTUIT is set */
      /* Returns an SV containing a string that must appear in the target for it
-     * to match */
+     * to match, or NULL if nothing is known that must match.
+     *
+     * CAUTION: the SV can be freed during execution of the regex engine */
  
      struct regexp *const prog = ReANY(r);
      DECLARE_AND_GET_RE_DEBUG_FLAGS;
@@ -22646,7 +22744,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
  
          int inverted_bias, as_is_bias;
  
-        /* We will apply our bias to whichever of the the results doesn't have
+        /* We will apply our bias to whichever of the results doesn't have
           * the '^' */
          if (invert) {
              invert = FALSE;
@@ -23476,7 +23574,7 @@ S_handle_user_defined_property(pTHX_
  #    define CUR_CONTEXT      aTHX
  #    define ORIGINAL_CONTEXT save_aTHX
  #  else
-#    define DECLARATION_FOR_GLOBAL_CONTEXT
+#    define DECLARATION_FOR_GLOBAL_CONTEXT    dNOOP
  #    define SWITCH_TO_GLOBAL_CONTEXT          NOOP
  #    define RESTORE_CONTEXT                   NOOP
  #    define CUR_CONTEXT                       NULL
@@ -24973,8 +25071,10 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
                                     where we are now */
      bool found_matches = FALSE; /* Did any name match so far? */
      SV * empty;                 /* For matching zero length names */
-    SV * must;                  /* What substring, if any, must be in a name
-                                   for the subpattern to match */
+    SV * must_sv;               /* Contains the substring, if any, that must be
+                                   in a name for the subpattern to match */
+    const char * must;          /* The PV of 'must_sv' */
+    STRLEN must_len;            /* And its length */
      SV * syllable_name = NULL;  /* For Hangul syllables */
      const char hangul_prefix[] = "HANGUL SYLLABLE ";
      const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
@@ -25039,7 +25139,23 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
  
      /* Compile the subpattern consisting of the name being looked for */
      subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
-    must = re_intuit_string(subpattern_re);
+
+    must_sv = re_intuit_string(subpattern_re);
+    if (must_sv) {
+        /* regexec.c can free the re_intuit_string() return. GH #17734 */
+        must_sv = sv_2mortal(newSVsv(must_sv));
+        must = SvPV(must_sv, must_len);
+    }
+    else {
+        must = "";
+        must_len = 0;
+    }
+
+    /* (Note: 'must' could contain a NUL.  And yet we use strspn() below on it.
+     * This works because the NUL causes the function to return early, thus
+     * showing that there are characters in it other than the acceptable ones,
+     * which is our desired result.) */
+
      prog = ReANY(subpattern_re);
  
      /* If only nothing is matched, skip to where empty names are looked for */
@@ -25049,10 +25165,7 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
  
      /* And match against the string of all names /gc.  Don't even try if it
       * must match a character not found in any name. */
-    if ( ! must
-        || SvCUR(must) == 0
-        || strspn(SvPVX(must), "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()")
-                                                              == SvCUR(must))
+    if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
      {
          while (execute_wildcard(subpattern_re,
                                  cur_pos,
@@ -25192,9 +25305,7 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
       * one of the characters in that isn't in any Hangul syllable. */
      if (    prog->minlen <= (SSize_t) syl_max_len
          &&  prog->maxlen > 0
-        && ( ! must
-            || SvCUR(must) == 0
-            || strspn(SvPVX(must), "\n ABCDEGHIJKLMNOPRSTUWY") == SvCUR(must)))
+        && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
      {
          /* These constants, names, values, and algorithm are adapted from the
           * Unicode standard, version 5.1, section 3.12, and should never
@@ -25289,9 +25400,7 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
           * series */
          if (    prog->minlen <= (SSize_t) SvCUR(algo_name)
              &&  prog->maxlen > 0
-            && ( ! must
-                || SvCUR(must) == 0
-                || strspn(SvPVX(must), legal) == SvCUR(must)))
+            && (strspn(must, legal) == must_len))
          {
              for (j = low; j <= high; j++) { /* For each code point in the series */