This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Rmv #if 0 bitrotted code
[perl5.git] / regcomp.c
index 640c99d..5d78f60 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -370,22 +370,15 @@ struct RExC_state_t {
         RExC_naughty += RExC_naughty / (exp) + (add)
 
 #define        ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
-#define        ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
-       ((*s) == '{' && regcurly(s)))
+#define        ISMULT2(s)      (ISMULT1(*s) || ((*s) == '{' && regcurly(s)))
 
 /*
  * Flags to be passed up and down.
  */
-#define        WORST           0       /* Worst case. */
 #define        HASWIDTH        0x01    /* Known to not match null strings, could match
                                    non-null ones. */
-
-/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
- * character.  (There needs to be a case: in the switch statement in regexec.c
- * for any node marked SIMPLE.)  Note that this is not the same thing as
- * REGNODE_SIMPLE */
-#define        SIMPLE          0x02
-#define        SPSTART         0x04    /* Starts with * or + */
+#define        SIMPLE          0x02    /* Exactly one character wide */
+                                /* (or LNBREAK as a special case) */
 #define POSTPONED      0x08    /* (?1),(?&name), (??{...}) or similar */
 #define TRYAGAIN       0x10    /* Weeded out a declaration. */
 #define RESTART_PARSE   0x20    /* Need to redo the parse */
@@ -1661,7 +1654,6 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
      * returned list must, and will, contain every code point that is a
      * possibility. */
 
-    dVAR;
     SV* invlist = NULL;
     SV* only_utf8_locale_invlist = NULL;
     unsigned int i;
@@ -4534,8 +4526,6 @@ S_unwind_scan_frames(pTHX_ const void *p)
 STATIC void
 S_rck_elide_nothing(pTHX_ regnode *node)
 {
-    dVAR;
-
     PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
 
     if (OP(node) != CURLYX) {
@@ -4584,7 +4574,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        /* recursed: which subroutines have we recursed into */
                        /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
 {
-    dVAR;
     SSize_t final_minlen;
     /* There must be at least this number of characters to match */
     SSize_t min = 0;
@@ -5219,7 +5208,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                      * might result in a minlen of 1 and not of 4,
                      * but this doesn't make us mismatch, just try a bit
                      * harder than we should.
-                     * */
+                     *
+                     * However we must assume this GOSUB is infinite, to
+                     * avoid wrongly applying other optimizations in the
+                     * enclosing scope - see GH 18096, for example.
+                     */
+                    is_inf = is_inf_internal = 1;
                     scan= regnext(scan);
                     continue;
                 }
@@ -5469,6 +5463,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                }
                if (flags & SCF_DO_SUBSTR)
                    data->pos_min++;
+                /* This will bypass the formal 'min += minnext * mincount'
+                 * calculation in the do_curly path, so assumes min width
+                 * of the PLUS payload is exactly one. */
                min++;
                /* FALLTHROUGH */
            case STAR:
@@ -7416,7 +7413,7 @@ S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
     PERL_ARGS_ASSERT_SET_REGEX_PV;
 
     /* make sure PL_bitcount bounds not exceeded */
-    assert(sizeof(STD_PAT_MODS) <= 8);
+    STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
 
     p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
     SvPOK_on(Rx);
@@ -7528,7 +7525,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                    OP *expr, const regexp_engine* eng, REGEXP *old_re,
                     bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
 {
-    dVAR;
     REGEXP *Rx;         /* Capital 'R' means points to a REGEXP */
     STRLEN plen;
     char *exp;
@@ -7889,13 +7885,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
 
         /* We have that number in RExC_npar */
         RExC_total_parens = RExC_npar;
-
-        /* XXX For backporting, use long jumps if there is any possibility of
-         * overflow */
-        if (RExC_size > U16_MAX && ! RExC_use_BRANCHJ) {
-            RExC_use_BRANCHJ = TRUE;
-            flags |= RESTART_PARSE;
-        }
     }
     else if (! MUST_RESTART(flags)) {
        ReREFCNT_dec(Rx);
@@ -8729,9 +8718,9 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
         } else if (flags & RXapif_ONE) {
             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
             av = MUTABLE_AV(SvRV(ret));
-            length = av_tindex(av);
+            length = av_count(av);
            SvREFCNT_dec_NN(ret);
-            return newSViv(length + 1);
+            return newSViv(length);
         } else {
             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
                                                 (int)flags);
@@ -10651,7 +10640,6 @@ Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
 STATIC SV*
 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
 {
-    dVAR;
     const U8 * s = (U8*)STRING(node);
     SSize_t bytelen = STR_LEN(node);
     UV uc;
@@ -11189,7 +11177,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
         vFAIL("Too many nested open parens");
     }
 
-    *flagp = 0;                                /* Tentatively. */
+    *flagp = 0;                                /* Initialize. */
 
     if (RExC_in_lookbehind) {
        RExC_in_lookbehind++;
@@ -12283,7 +12271,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
     }
     else if (paren != '?')             /* Not Conditional */
        ret = br;
-    *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
+    *flagp |= flags & (HASWIDTH | POSTPONED);
     lastbr = br;
     while (*RExC_parse == '|') {
        if (RExC_use_BRANCHJ) {
@@ -12313,7 +12301,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
             REQUIRE_BRANCHJ(flagp, 0);
         }
        lastbr = br;
-       *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
+       *flagp |= flags & (HASWIDTH | POSTPONED);
     }
 
     if (have_branch || paren != ':') {
@@ -12546,7 +12534,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
         }
     }
 
-    *flagp = WORST;                    /* Tentatively. */
+    *flagp = 0;                        /* Initialize. */
 
     skip_to_be_ignored_text(pRExC_state, &RExC_parse,
                             FALSE /* Don't force to /x */ );
@@ -12562,9 +12550,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
        else if (ret == 0)
             ret = latest;
        *flagp |= flags&(HASWIDTH|POSTPONED);
-       if (chain == 0)         /* First piece. */
-           *flagp |= flags&SPSTART;
-       else {
+       if (chain != 0) {
            /* FIXME adding one for every branch after the first is probably
             * excessive now we have TRIE support. (hv) */
            MARK_NAUGHTY(1);
@@ -12638,212 +12624,199 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
         FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
     }
 
+    if (! ISMULT2(RExC_parse)) {
+        *flagp = flags;
+        return(ret);
+    }
+
+    /* Here we know the input is a legal quantifier, including {m,n} */
+
     op = *RExC_parse;
 
-    if (op == '{' && regcurly(RExC_parse)) {
-       maxpos = NULL;
 #ifdef RE_TRACK_PATTERN_OFFSETS
-        parse_start = RExC_parse; /* MJD */
+    parse_start = RExC_parse;
 #endif
-       next = RExC_parse + 1;
-       while (isDIGIT(*next) || *next == ',') {
-           if (*next == ',') {
-               if (maxpos)
-                   break;
-               else
-                   maxpos = next;
-           }
-           next++;
-       }
-       if (*next == '}') {             /* got one */
-            const char* endptr;
-           if (!maxpos)
-               maxpos = next;
-           RExC_parse++;
-            if (isDIGIT(*RExC_parse)) {
-                endptr = RExC_end;
-                if (!grok_atoUV(RExC_parse, &uv, &endptr))
-                    vFAIL("Invalid quantifier in {,}");
-                if (uv >= REG_INFTY)
-                    vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
-                min = (I32)uv;
-            } else {
-                min = 0;
-            }
-           if (*maxpos == ',')
-               maxpos++;
-           else
-               maxpos = RExC_parse;
-            if (isDIGIT(*maxpos)) {
-                endptr = RExC_end;
-                if (!grok_atoUV(maxpos, &uv, &endptr))
-                    vFAIL("Invalid quantifier in {,}");
-                if (uv >= REG_INFTY)
-                    vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
-                max = (I32)uv;
-            } else {
-               max = REG_INFTY;                /* meaning "infinity" */
-            }
-           RExC_parse = next;
-           nextchar(pRExC_state);
-            if (max < min) {    /* If can't match, warn and optimize to fail
-                                   unconditionally */
-                reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
-                ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
-                NEXT_OFF(REGNODE_p(orig_emit)) =
-                                    regarglen[OPFAIL] + NODE_STEP_REGNODE;
-                return ret;
-            }
-            else if (min == max && *RExC_parse == '?')
-            {
-                ckWARN2reg(RExC_parse + 1,
-                           "Useless use of greediness modifier '%c'",
-                           *RExC_parse);
-            }
 
-         do_curly:
-           if ((flags&SIMPLE)) {
-                if (min == 0 && max == REG_INFTY) {
-
-                    /* Going from 0..inf is currently forbidden in wildcard
-                     * subpatterns.  The only reason is to make it harder to
-                     * write patterns that take a long long time to halt, and
-                     * because the use of this construct isn't necessary in
-                     * matching Unicode property values */
-                    if (RExC_pm_flags & PMf_WILDCARD) {
-                        RExC_parse++;
-                        /* diag_listed_as: Use of %s is not allowed in Unicode
-                           property wildcard subpatterns in regex; marked by
-                           <-- HERE in m/%s/ */
-                        vFAIL("Use of quantifier '*' is not allowed in"
-                              " Unicode property wildcard subpatterns");
-                        /* Note, don't need to worry about {0,}, as a '}' isn't
-                         * legal at all in wildcards, so wouldn't get this far
-                         * */
-                    }
-                    reginsert(pRExC_state, STAR, ret, depth+1);
-                    MARK_NAUGHTY(4);
-                    RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
-                    goto nest_check;
-                }
-                if (min == 1 && max == REG_INFTY) {
-                    reginsert(pRExC_state, PLUS, ret, depth+1);
-                    MARK_NAUGHTY(3);
-                    RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
-                    goto nest_check;
-                }
-                MARK_NAUGHTY_EXP(2, 2);
-               reginsert(pRExC_state, CURLY, ret, depth+1);
-                Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
-                Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
-           }
-           else {
-               const regnode_offset w = reg_node(pRExC_state, WHILEM);
+    if (op != '{') {
+        nextchar(pRExC_state);
 
-               FLAGS(REGNODE_p(w)) = 0;
-                if (!  REGTAIL(pRExC_state, ret, w)) {
-                    REQUIRE_BRANCHJ(flagp, 0);
-                }
-               if (RExC_use_BRANCHJ) {
-                   reginsert(pRExC_state, LONGJMP, ret, depth+1);
-                   reginsert(pRExC_state, NOTHING, ret, depth+1);
-                   NEXT_OFF(REGNODE_p(ret)) = 3;       /* Go over LONGJMP. */
-               }
-               reginsert(pRExC_state, CURLYX, ret, depth+1);
-                                /* MJD hk */
-                Set_Node_Offset(REGNODE_p(ret), parse_start+1);
-                Set_Node_Length(REGNODE_p(ret),
-                                op == '{' ? (RExC_parse - parse_start) : 1);
+        *flagp = HASWIDTH;
 
-               if (RExC_use_BRANCHJ)
-                    NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
-                                                       LONGJMP. */
-                if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
-                                                          NOTHING)))
-                {
-                    REQUIRE_BRANCHJ(flagp, 0);
-                }
-                RExC_whilem_seen++;
-                MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
-           }
-           FLAGS(REGNODE_p(ret)) = 0;
-
-           if (min > 0)
-               *flagp = WORST;
-           if (max > 0)
-               *flagp |= HASWIDTH;
-            ARG1_SET(REGNODE_p(ret), (U16)min);
-            ARG2_SET(REGNODE_p(ret), (U16)max);
-            if (max == REG_INFTY)
-                RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
-
-           goto nest_check;
-       }
+        if (op == '*') {
+            min = 0;
+        }
+        else if (op == '+') {
+            min = 1;
+        }
+        else if (op == '?') {
+            min = 0; max = 1;
+        }
     }
+    else {  /* is '{' */
+        const char* endptr;
 
-    if (!ISMULT1(op)) {
-       *flagp = flags;
-       return(ret);
-    }
+        maxpos = NULL;
+        next = RExC_parse + 1;
+        while (isDIGIT(*next) || *next == ',') {
+            if (*next == ',') {
+                if (maxpos)
+                    break;
+                else
+                    maxpos = next;
+            }
+            next++;
+        }
 
-#if 0                          /* Now runtime fix should be reliable. */
+        assert(*next == '}');
 
-    /* if this is reinstated, don't forget to put this back into perldiag:
+        if (!maxpos)
+            maxpos = next;
+        RExC_parse++;
+        if (isDIGIT(*RExC_parse)) {
+            endptr = RExC_end;
+            if (!grok_atoUV(RExC_parse, &uv, &endptr))
+                vFAIL("Invalid quantifier in {,}");
+            if (uv >= REG_INFTY)
+                vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
+            min = (I32)uv;
+        } else {
+            min = 0;
+        }
+        if (*maxpos == ',')
+            maxpos++;
+        else
+            maxpos = RExC_parse;
+        if (isDIGIT(*maxpos)) {
+            endptr = RExC_end;
+            if (!grok_atoUV(maxpos, &uv, &endptr))
+                vFAIL("Invalid quantifier in {,}");
+            if (uv >= REG_INFTY)
+                vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
+            max = (I32)uv;
+        } else {
+            max = REG_INFTY;            /* meaning "infinity" */
+        }
+        RExC_parse = next;
+        nextchar(pRExC_state);
+        if (max < min) {    /* If can't match, warn and optimize to fail
+                               unconditionally */
+            reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
+            ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
+            NEXT_OFF(REGNODE_p(orig_emit)) =
+                                regarglen[OPFAIL] + NODE_STEP_REGNODE;
+            return ret;
+        }
+        else if (min == max && *RExC_parse == '?')
+        {
+            ckWARN2reg(RExC_parse + 1,
+                       "Useless use of greediness modifier '%c'",
+                       *RExC_parse);
+        }
+    }
 
-           =item Regexp *+ operand could be empty at {#} in regex m/%s/
+    if (! (flags & (HASWIDTH|POSTPONED))) {
+        if (max > REG_INFTY/3) {
+            if (origparse[0] == '\\' && origparse[1] == 'K') {
+                vFAIL2utf8f(
+                           "%" UTF8f " is forbidden - matches null string"
+                           " many times",
+                           UTF8fARG(UTF, (RExC_parse >= origparse
+                                         ? RExC_parse - origparse
+                                         : 0),
+                           origparse));
+            } else {
+                ckWARN2reg(RExC_parse,
+                           "%" UTF8f " matches null string many times",
+                           UTF8fARG(UTF, (RExC_parse >= origparse
+                                         ? RExC_parse - origparse
+                                         : 0),
+                           origparse));
+            }
+        }
+    }
 
-          (F) The part of the regexp subject to either the * or + quantifier
-           could match an empty string. The {#} shows in the regular
-           expression about where the problem was discovered.
+    if ((flags&SIMPLE)) {
+        if (min == 0 && max == REG_INFTY) {
 
-    */
+            /* Going from 0..inf is currently forbidden in wildcard
+             * subpatterns.  The only reason is to make it harder to
+             * write patterns that take a long long time to halt, and
+             * because the use of this construct isn't necessary in
+             * matching Unicode property values */
+            if (RExC_pm_flags & PMf_WILDCARD) {
+                RExC_parse++;
+                /* diag_listed_as: Use of %s is not allowed in Unicode
+                   property wildcard subpatterns in regex; marked by
+                   <-- HERE in m/%s/ */
+                vFAIL("Use of quantifier '*' is not allowed in"
+                      " Unicode property wildcard subpatterns");
+                /* Note, don't need to worry about {0,}, as a '}' isn't
+                 * legal at all in wildcards, so wouldn't get this far
+                 * */
+            }
+            reginsert(pRExC_state, STAR, ret, depth+1);
+            MARK_NAUGHTY(4);
+            RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
+            goto nest_check;
+        }
+        if (min == 1 && max == REG_INFTY) {
+            reginsert(pRExC_state, PLUS, ret, depth+1);
+            MARK_NAUGHTY(3);
+            RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
+            goto nest_check;
+        }
+        MARK_NAUGHTY_EXP(2, 2);
+        reginsert(pRExC_state, CURLY, ret, depth+1);
+        Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
+        Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
+    }
+    else {
+        const regnode_offset w = reg_node(pRExC_state, WHILEM);
 
-    if (!(flags&HASWIDTH) && op != '?')
-      vFAIL("Regexp *+ operand could be empty");
-#endif
+        FLAGS(REGNODE_p(w)) = 0;
+        if (!  REGTAIL(pRExC_state, ret, w)) {
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
+        if (RExC_use_BRANCHJ) {
+            reginsert(pRExC_state, LONGJMP, ret, depth+1);
+            reginsert(pRExC_state, NOTHING, ret, depth+1);
+            NEXT_OFF(REGNODE_p(ret)) = 3;        /* Go over LONGJMP. */
+        }
+        reginsert(pRExC_state, CURLYX, ret, depth+1);
+                        /* MJD hk */
+        Set_Node_Offset(REGNODE_p(ret), parse_start+1);
+        Set_Node_Length(REGNODE_p(ret),
+                        op == '{' ? (RExC_parse - parse_start) : 1);
+
+        if (RExC_use_BRANCHJ)
+            NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
+                                               LONGJMP. */
+        if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
+                                                  NOTHING)))
+        {
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
+        RExC_whilem_seen++;
+        MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
+    }
+    FLAGS(REGNODE_p(ret)) = 0;
 
-#ifdef RE_TRACK_PATTERN_OFFSETS
-    parse_start = RExC_parse;
-#endif
-    nextchar(pRExC_state);
+    if (min > 0)
+        *flagp = 0;
+    if (max > 0)
+        *flagp |= HASWIDTH;
+    ARG1_SET(REGNODE_p(ret), (U16)min);
+    ARG2_SET(REGNODE_p(ret), (U16)max);
+    if (max == REG_INFTY)
+        RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
 
-    *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
+    goto nest_check;
 
-    if (op == '*') {
-       min = 0;
-       goto do_curly;
-    }
-    else if (op == '+') {
-       min = 1;
-       goto do_curly;
-    }
-    else if (op == '?') {
-       min = 0; max = 1;
-       goto do_curly;
-    }
   nest_check:
-    if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
-        if (origparse[0] == '\\' && origparse[1] == 'K') {
-            vFAIL2utf8f(
-                       "%" UTF8f " is forbidden - matches null string many times",
-                       UTF8fARG(UTF, (RExC_parse >= origparse
-                                     ? RExC_parse - origparse
-                                     : 0),
-                       origparse));
-            /* NOT-REACHED */
-        } else {
-            ckWARN2reg(RExC_parse,
-                       "%" UTF8f " matches null string many times",
-                       UTF8fARG(UTF, (RExC_parse >= origparse
-                                     ? RExC_parse - origparse
-                                     : 0),
-                       origparse));
-        }
-    }
 
     if (*RExC_parse == '?') {
-       nextchar(pRExC_state);
-       reginsert(pRExC_state, MINMOD, ret, depth+1);
+        nextchar(pRExC_state);
+        reginsert(pRExC_state, MINMOD, ret, depth+1);
         if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
             REQUIRE_BRANCHJ(flagp, 0);
         }
@@ -12863,8 +12836,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     }
 
     if (ISMULT2(RExC_parse)) {
-       RExC_parse++;
-       vFAIL("Nested quantifiers");
+        RExC_parse++;
+        vFAIL("Nested quantifiers");
     }
 
     return(ret);
@@ -13305,7 +13278,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
         FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
             (UV) flags);
     }
-    *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
+    *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
 
     nextchar(pRExC_state);
 
@@ -13468,7 +13441,6 @@ S_backref_value(char *p, char *e)
 STATIC regnode_offset
 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 {
-    dVAR;
     regnode_offset ret = 0;
     I32 flags = 0;
     char *parse_start;
@@ -13477,7 +13449,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
     DECLARE_AND_GET_RE_DEBUG_FLAGS;
 
-    *flagp = WORST;            /* Tentatively. */
+    *flagp = 0;                /* Initialize. */
 
     DEBUG_PARSE("atom");
 
@@ -13555,7 +13527,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
                                                                  (UV) flags);
        }
-       *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
+       *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
        break;
     case '|':
     case ')':
@@ -13614,13 +13586,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             }
            ret = reg_node(pRExC_state, GPOS);
             RExC_seen |= REG_GPOS_SEEN;
-           *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 'K':
             if (!RExC_in_lookbehind && !RExC_in_lookahead) {
                 RExC_seen_zerolen++;
                 ret = reg_node(pRExC_state, KEEPS);
-                *flagp |= SIMPLE;
                 /* XXX:dmq : disabling in-place substitution seems to
                  * be necessary here to avoid cases of memory corruption, as
                  * with: C<$_="x" x 80; s/x\K/y/> -- rgs
@@ -13774,8 +13744,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
            ret = reg_node(pRExC_state, op);
             FLAGS(REGNODE_p(ret)) = flags;
 
-           *flagp |= SIMPLE;
-
            goto finish_meta_pat;
           }
 
@@ -13950,10 +13918,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         num > 9
                         /* any numeric escape < RExC_npar is a backref */
                         && num >= RExC_npar
-                        /* cannot be an octal escape if it starts with 8 */
-                        && *RExC_parse != '8'
-                        /* cannot be an octal escape if it starts with 9 */
-                        && *RExC_parse != '9'
+                        /* cannot be an octal escape if it starts with [89] */
+                        && ! inRANGE(*RExC_parse, '8', '9')
                     ) {
                         /* Probably not meant to be a backref, instead likely
                          * to be an octal character escape, e.g. \35 or \777.
@@ -15212,6 +15178,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                 FAIL2("panic: loc_correspondence[%d] is 0",
                                       (int) (s - s_start));
                             }
+                            Safefree(locfold_buf);
+                            Safefree(loc_correspondence);
                         }
                         else {
                             upper_fill = s - s0;
@@ -15390,7 +15358,6 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
      * sets up the bitmap and any flags, removing those code points from the
      * inversion list, setting it to NULL should it become completely empty */
 
-    dVAR;
 
     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
     assert(PL_regkind[OP(node)] == ANYOF);
@@ -15430,9 +15397,7 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
                    ? end
                    : NUM_ANYOF_CODE_POINTS - 1;
            for (i = start; i <= (int) high; i++) {
-               if (! ANYOF_BITMAP_TEST(node, i)) {
-                   ANYOF_BITMAP_SET(node, i);
-               }
+                ANYOF_BITMAP_SET(node, i);
            }
        }
        invlist_iterfinish(*invlist_ptr);
@@ -16247,7 +16212,7 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
 
             if (   posix_warnings
                 && RExC_warn_text
-                && av_top_index(RExC_warn_text) > -1)
+                && av_count(RExC_warn_text) > 0)
             {
                 *posix_warnings = RExC_warn_text;
             }
@@ -17250,10 +17215,10 @@ S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN c
  *
  * There is a line below that uses the same white space criteria but is outside
  * this macro.  Both here and there must use the same definition */
-#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
+#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p)                  \
     STMT_START {                                                        \
         if (do_skip) {                                                  \
-            while (isBLANK_A(UCHARAT(p)))                               \
+            while (p < stop_p && isBLANK_A(UCHARAT(p)))                 \
             {                                                           \
                 p++;                                                    \
             }                                                           \
@@ -17303,7 +17268,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      * UTF-8
      */
 
-    dVAR;
     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
     IV range = 0;
     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
@@ -17429,7 +17393,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     initial_listsv_len = SvCUR(listsv);
     SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
 
-    SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
+    SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
 
     assert(RExC_parse <= RExC_end);
 
@@ -17438,7 +17402,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         invert = TRUE;
         allow_mutiple_chars = FALSE;
         MARK_NAUGHTY(1);
-        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
+        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
     }
 
     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
@@ -17485,12 +17449,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             output_posix_warnings(pRExC_state, posix_warnings);
         }
 
+        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
+
         if  (RExC_parse >= stop_ptr) {
             break;
         }
 
-        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
-
         if  (UCHARAT(RExC_parse) == ']') {
             break;
         }
@@ -17796,16 +17760,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                             }
 
                             /* For each multi-character string ... */
-                            while (av_tindex(strings) >= 0) {
+                            while (av_count(strings) > 0) {
                                 /* ... Each entry is itself an array of code
                                 * points. */
                                 AV * this_string = (AV *) av_shift( strings);
-                                STRLEN cp_count = av_tindex(this_string) + 1;
+                                STRLEN cp_count = av_count(this_string);
                                 SV * final = newSV(cp_count * 4);
                                 SvPVCLEAR(final);
 
                                 /* Create another string of sequences of \x{...} */
-                                while (av_tindex(this_string) >= 0) {
+                                while (av_count(this_string) > 0) {
                                     SV * character = av_shift(this_string);
                                     UV cp = SvUV(character);
 
@@ -18179,7 +18143,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
            }
        } /* end of namedclass \blah */
 
-        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
+        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
 
         /* If 'range' is set, 'value' is the ending of a range--check its
          * validity.  (If value isn't a single code point in the case of a
@@ -18222,7 +18186,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                 char* next_char_ptr = RExC_parse + 1;
 
                 /* Get the next real char after the '-' */
-                SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
+                SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
 
                 /* If the '-' is at the end of the class (just before the ']',
                  * it is a literal minus; otherwise it is a range */
@@ -18617,7 +18581,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
        ret = reg(pRExC_state, 1, &reg_flags, depth+1);
 
-        *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
+        *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
 
         /* And restore so can parse the rest of the pattern */
         RExC_parse = save_parse;
@@ -20025,6 +19989,9 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
             av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
         }
 
+        /* (Note that if any of this changes, the size calculations in
+         * S_optimize_regclass() might need to be updated.) */
+
         if (only_utf8_locale_list) {
             av_store(av, ONLY_LOCALE_MATCHES_INDEX,
                                      SvREFCNT_inc_NN(only_utf8_locale_list));
@@ -20687,7 +20654,8 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state,
     PERL_UNUSED_ARG(depth);
 #endif
 
-    /* Find last node. */
+    /* The final node in the chain is the first one with a zero (null) next
+     * pointer */
     scan = (regnode_offset) p;
     for (;;) {
        regnode * const temp = regnext(REGNODE_p(scan));
@@ -20705,6 +20673,7 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state,
         scan = REGNODE_OFFSET(temp);
     }
 
+    /* Populate this node's next pointer */
     assert(val >= scan);
     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
         assert((UV) (val - scan) <= U32_MAX);
@@ -21097,7 +21066,6 @@ void
 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
 {
 #ifdef DEBUGGING
-    dVAR;
     int k;
     RXi_GET_DECL(prog, progi);
     DECLARE_AND_GET_RE_DEBUG_FLAGS;
@@ -21815,7 +21783,6 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
                     U32 refcount;
                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
 #ifdef USE_ITHREADS
-                    dVAR;
 #endif
                     OP_REFCNT_LOCK;
                     refcount = --aho->refcount;
@@ -21844,7 +21811,6 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
                    U32 refcount;
                    reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
 #ifdef USE_ITHREADS
-                    dVAR;
 #endif
                     OP_REFCNT_LOCK;
                     refcount = --trie->refcount;
@@ -21880,15 +21846,19 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
 #define SAVEPVN(p, n)  ((p) ? savepvn(p, n) : NULL)
 
 /*
-   re_dup_guts - duplicate a regexp.
+=for apidoc_section REGEXP Functions
+=for apidoc re_dup_guts
+Duplicate a regexp.
 
-   This routine is expected to clone a given regexp structure. It is only
-   compiled under USE_ITHREADS.
+This routine is expected to clone a given regexp structure. It is only
+compiled under USE_ITHREADS.
 
-   After all of the core data stored in struct regexp is duplicated
-   the regexp_engine.dupe method is used to copy any private data
-   stored in the *pprivate pointer. This allows extensions to handle
-   any duplication it needs to do.
+After all of the core data stored in struct regexp is duplicated
+the regexp_engine.dupe method is used to copy any private data
+stored in the *pprivate pointer. This allows extensions to handle
+any duplication they need to do.
+
+=cut
 
    See pregfree() and regfree_internal() if you change anything here.
 */
@@ -21897,7 +21867,6 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
 void
 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
 {
-    dVAR;
     I32 npar;
     const struct regexp *r = ReANY(sstr);
     struct regexp *ret = ReANY(dstr);
@@ -22000,7 +21969,6 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
 void *
 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
 {
-    dVAR;
     struct regexp *const r = ReANY(rx);
     regexp_internal *reti;
     int len;
@@ -22265,9 +22233,11 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
         UV this_end;
         const char * format;
 
-        if (end - start < min_range_count) {
-
-            /* Output chars individually when they occur in short ranges */
+        if (    end - start < min_range_count
+            && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
+        {
+            /* Output a range of up to 3 chars individually, or longer ranges
+             * when printable */
             for (; start <= end; start++) {
                 put_code_point(sv, start);
             }
@@ -22491,7 +22461,6 @@ S_put_charclass_bitmap_innards_common(pTHX_
      * output would have been only the inversion indicator '^', NULL is instead
      * returned. */
 
-    dVAR;
     SV * output;
 
     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
@@ -22598,7 +22567,6 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
      * cases where it can't try inverting, as what actually matches isn't known
      * until runtime, and hence the inversion isn't either. */
 
-    dVAR;
     bool inverting_allowed = ! force_as_is_display;
 
     int i;
@@ -22991,7 +22959,6 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
 void
 Perl_init_uniprops(pTHX)
 {
-    dVAR;
 
 #  ifdef DEBUGGING
     char * dump_len_string;
@@ -23583,7 +23550,6 @@ S_delete_recursion_entry(pTHX_ void *key)
      * properties.  This is a function so it can be set up to be called even if
      * the program unexpectedly quits */
 
-    dVAR;
     SV ** current_entry;
     const STRLEN key_len = strlen((const char *) key);
     DECLARATION_FOR_GLOBAL_CONTEXT;
@@ -23676,7 +23642,6 @@ S_parse_uniprop_string(pTHX_
                                    this */
     const STRLEN level)         /* Recursion level of this call */
 {
-    dVAR;
     char* lookup_name;          /* normalized name for lookup in our tables */
     unsigned lookup_len;        /* Its length */
     enum { Not_Strict = 0,      /* Some properties have stricter name */
@@ -24068,7 +24033,7 @@ S_parse_uniprop_string(pTHX_
                 goto append_name_to_msg;
             }
 
-            lookup_loose = get_cv("_charnames::_loose_regcomp_lookup", 0);
+            lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
             if (! lookup_loose) {
                 Perl_croak(aTHX_
                        "panic: Can't find '_charnames::_loose_regcomp_lookup");
@@ -24697,8 +24662,10 @@ S_parse_uniprop_string(pTHX_
         /* Try again stripping off any initial 'Is'.  This is because we
          * promise that an initial Is is optional.  The same isn't true of
          * names that start with 'In'.  Those can match only blocks, and the
-         * lookup table already has those accounted for. */
-        if (starts_with_Is) {
+         * lookup table already has those accounted for.  The lookup table also
+         * has already accounted for Perl extensions (without an = sign)
+         * starting with 'is'. */
+        if (starts_with_Is && equals_pos >= 0) {
             lookup_name += 2;
             lookup_len -= 2;
             equals_pos -= 2;