perlapi: Remove per-thread section; move to real scns

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 5d78f60..0c8beb0 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -143,13 +143,6 @@ EXTERN_C const struct regexp_engine wild_reg_engine;
  #include "invlist_inline.h"
  #include "unicode_constants.h"
  
-#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
- _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
-#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
- _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
-#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
-#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
-
  #ifndef STATIC
  #define        STATIC  static
  #endif
@@ -242,8 +235,7 @@ struct RExC_state_t {
      U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
                                             through */
      U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
-    I32                in_lookbehind;
-    I32                in_lookahead;
+    I32                in_lookaround;
      I32                contains_locale;
      I32                override_recoding;
      I32         recode_x_to_native;
@@ -330,8 +322,7 @@ struct RExC_state_t {
  #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
  #define RExC_study_chunk_recursed_bytes  \
                                     (pRExC_state->study_chunk_recursed_bytes)
-#define RExC_in_lookbehind     (pRExC_state->in_lookbehind)
-#define RExC_in_lookahead      (pRExC_state->in_lookahead)
+#define RExC_in_lookaround     (pRExC_state->in_lookaround)
  #define RExC_contains_locale   (pRExC_state->contains_locale)
  #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
  
@@ -414,6 +405,11 @@ struct RExC_state_t {
                                       }                                     \
                               } STMT_END
  
+/* /u is to be chosen if we are supposed to use Unicode rules, or if the
+ * pattern is in UTF-8.  This latter condition is in case the outermost rules
+ * are locale.  See GH #17278 */
+#define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF)
+
  /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
   * a flag that indicates we need to override /d with /u as a result of
   * something in the pattern.  It should only be used in regards to calling
@@ -2130,8 +2126,6 @@ S_ssc_clear_locale(regnode_ssc *ssc)
      ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
  }
  
-#define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
-
  STATIC bool
  S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
  {
@@ -2941,11 +2935,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                  /* See if *uc is the beginning of a multi-character fold.  If
                   * so, we decrement the length remaining to look at, to account
                   * for the current character this iteration.  (We can use 'uc'
-                 * instead of the fold returned by TRIE_READ_CHAR because for
-                 * non-UTF, the latin1_safe macro is smart enough to account
-                 * for all the unfolded characters, and because for UTF, the
-                 * string will already have been folded earlier in the
-                 * compilation process */
+                 * instead of the fold returned by TRIE_READ_CHAR because the
+                 * macro is smart enough to account for any unfolded
+                 * characters.) */
                  if (UTF) {
                      if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
                          foldlen -= UTF8SKIP(uc);
@@ -5303,12 +5295,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 continue;
             }
         }
-       else if (   OP(scan) == EXACT
-                 || OP(scan) == LEXACT
-                 || OP(scan) == EXACT_REQ8
-                 || OP(scan) == LEXACT_REQ8
-                 || OP(scan) == EXACTL)
-        {
+       else if (PL_regkind[OP(scan)] == EXACT && ! isEXACTFish(OP(scan))) {
             SSize_t bytelen = STR_LEN(scan), charlen;
             UV uc;
              assert(bytelen);
@@ -5447,11 +5434,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             case PLUS:
                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
                     next = NEXTOPER(scan);
-                   if (   OP(next) == EXACT
-                        || OP(next) == LEXACT
-                        || OP(next) == EXACT_REQ8
-                        || OP(next) == LEXACT_REQ8
-                        || OP(next) == EXACTL
+                   if (   (     PL_regkind[OP(next)] == EXACT
+                            && ! isEXACTFish(OP(next)))
                          || (flags & SCF_DO_STCLASS))
                      {
                         mincount = 1;
@@ -7746,7 +7730,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
  
      rx_flags = orig_rx_flags;
  
-    if (   (UTF || RExC_uni_semantics)
+    if (   toUSE_UNI_CHARSET_NOT_DEPENDS
          && initial_charset == REGEX_DEPENDS_CHARSET)
      {
  
@@ -7777,8 +7761,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
  
      RExC_seen = 0;
      RExC_maxlen = 0;
-    RExC_in_lookbehind = 0;
-    RExC_in_lookahead = 0;
+    RExC_in_lookaround = 0;
      RExC_seen_zerolen = *exp == '^' ? -1 : 0;
      RExC_recode_x_to_native = 0;
      RExC_in_multi_char_class = 0;
@@ -8123,12 +8106,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
          DEBUG_PEEP("first:", first, 0, 0);
          /* Ignore EXACT as we deal with it later. */
         if (PL_regkind[OP(first)] == EXACT) {
-           if (   OP(first) == EXACT
-               || OP(first) == LEXACT
-                || OP(first) == EXACT_REQ8
-                || OP(first) == LEXACT_REQ8
-                || OP(first) == EXACTL)
-            {
+           if (! isEXACTFish(OP(first))) {
                 NOOP;   /* Empty, get anchored substr later. */
              }
             else
@@ -8472,9 +8450,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                   && nop == END)
              RExC_rx->extflags |= RXf_WHITE;
          else if ( RExC_rx->extflags & RXf_SPLIT
-                  && (   fop == EXACT || fop == LEXACT
-                      || fop == EXACT_REQ8 || fop == LEXACT_REQ8
-                      || fop == EXACTL)
+                  && (PL_regkind[fop] == EXACT && ! isEXACTFish(fop))
                    && STR_LEN(first) == 1
                    && *(STRING(first)) == ' '
                    && nop == END )
@@ -10685,8 +10661,8 @@ S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
              /* Some characters match above-Latin1 ones under /i.  This
               * is true of EXACTFL ones when the locale is UTF-8 */
              if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
-                && (! isASCII(uc) || (OP(node) != EXACTFAA
-                                    && OP(node) != EXACTFAA_NO_TRIE)))
+                && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA,
+                                                         EXACTFAA_NO_TRIE)))
              {
                  add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
              }
@@ -10740,12 +10716,8 @@ S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
           * the folded string to be just past any possible multi-char
           * fold.
           *
-         * Unlike the non-UTF-8 case, the macro for determining if a
-         * string is a multi-char fold requires all the characters to
-         * already be folded.  This is because of all the complications
-         * if not.  Note that they are folded anyway, except in EXACTFL
-         * nodes.  Like the non-UTF case above, we punt if the node
-         * begins with a multi-char fold  */
+         * Like the non-UTF case above, we punt if the node begins with a
+         * multi-char fold  */
  
          if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
              invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
@@ -10767,7 +10739,7 @@ S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
                  UV c = (k == 0) ? first_fold : remaining_folds[k-1];
  
                  /* /aa doesn't allow folds between ASCII and non- */
-                if (   (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
+                if (   inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
                      && isASCII(c) != isASCII(fc))
                  {
                      continue;
@@ -10840,7 +10812,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
          RExC_parse++;
          has_use_defaults = TRUE;
          STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
-        cs = (RExC_uni_semantics)
+        cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
               ? REGEX_UNICODE_CHARSET
               : REGEX_DEPENDS_CHARSET;
          set_regex_charset(&RExC_flags, cs);
@@ -10848,7 +10820,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
      else {
          cs = get_regex_charset(RExC_flags);
          if (   cs == REGEX_DEPENDS_CHARSET
-            && RExC_uni_semantics)
+            && (toUSE_UNI_CHARSET_NOT_DEPENDS))
          {
              cs = REGEX_UNICODE_CHARSET;
          }
@@ -10932,7 +10904,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                   * pattern (or target, not known until runtime) are
                   * utf8, or something in the pattern indicates unicode
                   * semantics */
-                cs = (RExC_uni_semantics)
+                cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
                       ? REGEX_UNICODE_CHARSET
                       : REGEX_DEPENDS_CHARSET;
                  has_charset_modifier = DEPENDS_PAT_MOD;
@@ -11158,6 +11130,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
      I32 after_freeze = 0;
      I32 num; /* numeric backreferences */
      SV * max_open;  /* Max number of unclosed parens */
+    I32 was_in_lookaround = RExC_in_lookaround;
  
      char * parse_start = RExC_parse; /* MJD */
      char * const oregcomp_parse = RExC_parse;
@@ -11179,13 +11152,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
  
      *flagp = 0;                                /* Initialize. */
  
-    if (RExC_in_lookbehind) {
-       RExC_in_lookbehind++;
-    }
-    if (RExC_in_lookahead) {
-        RExC_in_lookahead++;
-    }
-
      /* Having this true makes it feasible to have a lot fewer tests for the
       * parse pointer being in scope.  For example, we can write
       *      while(isFOO(*RExC_parse)) RExC_parse++;
@@ -11439,11 +11405,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
  
              lookbehind_alpha_assertions:
                  RExC_seen |= REG_LOOKBEHIND_SEEN;
-                RExC_in_lookbehind++;
                  /*FALLTHROUGH*/
  
              alpha_assertions:
  
+                RExC_in_lookaround++;
                  RExC_seen_zerolen++;
  
                  if (! start_arg) {
@@ -11646,7 +11612,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                 }
  
                  RExC_seen |= REG_LOOKBEHIND_SEEN;
-               RExC_in_lookbehind++;
+               RExC_in_lookaround++;
                 RExC_parse++;
                  if (RExC_parse >= RExC_end) {
                      vFAIL("Sequence (?... not terminated");
@@ -11655,7 +11621,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                  break;
             case '=':           /* (?=...) */
                 RExC_seen_zerolen++;
-                RExC_in_lookahead++;
+                RExC_in_lookaround++;
                  break;
             case '!':           /* (?!...) */
                 RExC_seen_zerolen++;
@@ -11667,6 +11633,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                     nextchar(pRExC_state);
                     return ret;
                 }
+                RExC_in_lookaround++;
                 break;
             case '|':           /* (?|...) */
                 /* branch reset, behave like a (?:...) except that
@@ -12468,7 +12435,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
          /* restore original flags, but keep (?p) and, if we've encountered
           * something in the parse that changes /d rules into /u, keep the /u */
         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
-        if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
+        if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
              set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
          }
         if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
@@ -12487,14 +12454,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
         NOT_REACHED; /* NOTREACHED */
      }
  
-    if (RExC_in_lookbehind) {
-       RExC_in_lookbehind--;
-    }
-    if (RExC_in_lookahead) {
-        RExC_in_lookahead--;
-    }
      if (after_freeze > RExC_npar)
          RExC_npar = after_freeze;
+
+    RExC_in_lookaround = was_in_lookaround;
+    
      return(ret);
  }
  
@@ -12577,6 +12541,30 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
  }
  
  /*
+ - regcurly - a little FSA that accepts {\d+,?\d*}
+    Pulled from reg.c.
+ */
+bool
+Perl_regcurly(const char *s)
+{
+    PERL_ARGS_ASSERT_REGCURLY;  /* presumably asserts 's' is non-NULL; confirm */
+    /* Returns true only if the text at 's' has the shape {m}, {m,} or
+     * {m,n}, where m and n are runs of digits.  Only the shape is checked;
+     * the numbers themselves are not evaluated here. */
+    if (*s++ != '{')            /* must open with a brace */
+       return FALSE;
+    if (!isDIGIT(*s))           /* a minimum is mandatory: "{,n}" is rejected */
+       return FALSE;
+    while (isDIGIT(*s))         /* skip the remaining digits of the minimum */
+       s++;
+    if (*s == ',') {            /* optional comma ... */
+       s++;
+       while (isDIGIT(*s))      /* ... followed by an optional maximum */
+           s++;
+    }
+
+    return *s == '}';           /* accept only if the closing brace is next */
+}
+
+/*
   - regpiece - something followed by possible quantifier * + ? {n,m}
   *
   * Note that the branching code sequences used for ? and the general cases
@@ -12624,96 +12612,108 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
          FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
      }
  
-    if (! ISMULT2(RExC_parse)) {
-        *flagp = flags;
-        return(ret);
-    }
-
-    /* Here we know the input is a legal quantifier, including {m,n} */
-
-    op = *RExC_parse;
-
  #ifdef RE_TRACK_PATTERN_OFFSETS
      parse_start = RExC_parse;
  #endif
  
-    if (op != '{') {
+    op = *RExC_parse;
+    switch (op) {
+
+      case '*':
          nextchar(pRExC_state);
+        min = 0;
+        break;
  
-        *flagp = HASWIDTH;
+      case '+':
+        nextchar(pRExC_state);
+        min = 1;
+        break;
  
-        if (op == '*') {
-            min = 0;
-        }
-        else if (op == '+') {
-            min = 1;
-        }
-        else if (op == '?') {
-            min = 0; max = 1;
-        }
-    }
-    else {  /* is '{' */
-        const char* endptr;
+      case '?':
+        nextchar(pRExC_state);
+        min = 0; max = 1;
+        break;
  
-        maxpos = NULL;
-        next = RExC_parse + 1;
-        while (isDIGIT(*next) || *next == ',') {
-            if (*next == ',') {
-                if (maxpos)
-                    break;
-                else
-                    maxpos = next;
+      case '{':  /* A '{' may or may not indicate a quantifier; call regcurly()
+                    to determine which */
+        if (regcurly(RExC_parse)) {
+            const char* endptr;
+
+            /* Here is a quantifier, parse for min and max values */
+            maxpos = NULL;
+            next = RExC_parse + 1;
+            while (isDIGIT(*next) || *next == ',') {
+                if (*next == ',') {
+                    if (maxpos)
+                        break;
+                    else
+                        maxpos = next;
+                }
+                next++;
              }
-            next++;
-        }
  
-        assert(*next == '}');
+            assert(*next == '}');
  
-        if (!maxpos)
-            maxpos = next;
-        RExC_parse++;
-        if (isDIGIT(*RExC_parse)) {
-            endptr = RExC_end;
-            if (!grok_atoUV(RExC_parse, &uv, &endptr))
-                vFAIL("Invalid quantifier in {,}");
-            if (uv >= REG_INFTY)
-                vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
-            min = (I32)uv;
-        } else {
-            min = 0;
-        }
-        if (*maxpos == ',')
-            maxpos++;
-        else
-            maxpos = RExC_parse;
-        if (isDIGIT(*maxpos)) {
-            endptr = RExC_end;
-            if (!grok_atoUV(maxpos, &uv, &endptr))
-                vFAIL("Invalid quantifier in {,}");
-            if (uv >= REG_INFTY)
-                vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
-            max = (I32)uv;
-        } else {
-            max = REG_INFTY;            /* meaning "infinity" */
-        }
-        RExC_parse = next;
-        nextchar(pRExC_state);
-        if (max < min) {    /* If can't match, warn and optimize to fail
-                               unconditionally */
-            reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
-            ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
-            NEXT_OFF(REGNODE_p(orig_emit)) =
-                                regarglen[OPFAIL] + NODE_STEP_REGNODE;
-            return ret;
-        }
-        else if (min == max && *RExC_parse == '?')
-        {
-            ckWARN2reg(RExC_parse + 1,
-                       "Useless use of greediness modifier '%c'",
-                       *RExC_parse);
-        }
+            if (!maxpos)
+                maxpos = next;
+            RExC_parse++;
+            if (isDIGIT(*RExC_parse)) {
+                endptr = RExC_end;
+                if (!grok_atoUV(RExC_parse, &uv, &endptr))
+                    vFAIL("Invalid quantifier in {,}");
+                if (uv >= REG_INFTY)
+                    vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
+                min = (I32)uv;
+            } else {
+                min = 0;
+            }
+            if (*maxpos == ',')
+                maxpos++;
+            else
+                maxpos = RExC_parse;
+            if (isDIGIT(*maxpos)) {
+                endptr = RExC_end;
+                if (!grok_atoUV(maxpos, &uv, &endptr))
+                    vFAIL("Invalid quantifier in {,}");
+                if (uv >= REG_INFTY)
+                    vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
+                max = (I32)uv;
+            } else {
+                max = REG_INFTY;            /* meaning "infinity" */
+            }
+
+            RExC_parse = next;
+            nextchar(pRExC_state);
+            if (max < min) {    /* If can't match, warn and optimize to fail
+                                   unconditionally */
+                reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
+                ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
+                NEXT_OFF(REGNODE_p(orig_emit)) =
+                                    regarglen[OPFAIL] + NODE_STEP_REGNODE;
+                return ret;
+            }
+            else if (min == max && *RExC_parse == '?')
+            {
+                ckWARN2reg(RExC_parse + 1,
+                           "Useless use of greediness modifier '%c'",
+                           *RExC_parse);
+            }
+
+            break;
+        } /* End of is regcurly() */
+
+        /* Here was a '{', but what followed it didn't form a quantifier. */
+        /* FALLTHROUGH */
+
+      default:
+        *flagp = flags;
+        return(ret);
+        NOT_REACHED; /*NOTREACHED*/
      }
  
+    /* Here we have a quantifier, and have calculated 'min' and 'max'.
+     *
+     * Check and possibly adjust a zero width operand */
      if (! (flags & (HASWIDTH|POSTPONED))) {
          if (max > REG_INFTY/3) {
              if (origparse[0] == '\\' && origparse[1] == 'K') {
@@ -12733,44 +12733,54 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             origparse));
              }
          }
+
+        /* There's no point in trying to match something 0 length more than
+         * once except for extra side effects, which we don't have here since
+         * not POSTPONED */
+        if (max > 1) {
+            max = 1;
+            if (min > max) {
+                min = max;
+            }
+        }
+    }
+
+    /* If this is a code block pass it up */
+    *flagp |= (flags & POSTPONED);
+
+    if (max > 0) {
+        *flagp |= (flags & HASWIDTH);
+        if (max == REG_INFTY)
+            RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
      }
  
+    /* 'SIMPLE' operands don't require full generality */
      if ((flags&SIMPLE)) {
-        if (min == 0 && max == REG_INFTY) {
+        if (max == REG_INFTY) {
+            if (min == 0) {
+                if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
+                    goto min0_maxINF_wildcard_forbidden;
+                }
  
-            /* Going from 0..inf is currently forbidden in wildcard
-             * subpatterns.  The only reason is to make it harder to
-             * write patterns that take a long long time to halt, and
-             * because the use of this construct isn't necessary in
-             * matching Unicode property values */
-            if (RExC_pm_flags & PMf_WILDCARD) {
-                RExC_parse++;
-                /* diag_listed_as: Use of %s is not allowed in Unicode
-                   property wildcard subpatterns in regex; marked by
-                   <-- HERE in m/%s/ */
-                vFAIL("Use of quantifier '*' is not allowed in"
-                      " Unicode property wildcard subpatterns");
-                /* Note, don't need to worry about {0,}, as a '}' isn't
-                 * legal at all in wildcards, so wouldn't get this far
-                 * */
+                reginsert(pRExC_state, STAR, ret, depth+1);
+                MARK_NAUGHTY(4);
+                goto done_main_op;
+            }
+            else if (min == 1) {
+                reginsert(pRExC_state, PLUS, ret, depth+1);
+                MARK_NAUGHTY(3);
+                goto done_main_op;
              }
-            reginsert(pRExC_state, STAR, ret, depth+1);
-            MARK_NAUGHTY(4);
-            RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
-            goto nest_check;
-        }
-        if (min == 1 && max == REG_INFTY) {
-            reginsert(pRExC_state, PLUS, ret, depth+1);
-            MARK_NAUGHTY(3);
-            RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
-            goto nest_check;
          }
+
+        /* Here, SIMPLE, but not the '*' and '+' special cases */
+
          MARK_NAUGHTY_EXP(2, 2);
          reginsert(pRExC_state, CURLY, ret, depth+1);
          Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
          Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
      }
-    else {
+    else {  /* not SIMPLE */
          const regnode_offset w = reg_node(pRExC_state, WHILEM);
  
          FLAGS(REGNODE_p(w)) = 0;
@@ -12799,21 +12809,16 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
          RExC_whilem_seen++;
          MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
      }
+
+    /* Finish up the CURLY/CURLYX case */
      FLAGS(REGNODE_p(ret)) = 0;
  
-    if (min > 0)
-        *flagp = 0;
-    if (max > 0)
-        *flagp |= HASWIDTH;
      ARG1_SET(REGNODE_p(ret), (U16)min);
      ARG2_SET(REGNODE_p(ret), (U16)max);
-    if (max == REG_INFTY)
-        RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
-
-    goto nest_check;
  
-  nest_check:
+  done_main_op:
  
+    /* Process any greediness modifiers */
      if (*RExC_parse == '?') {
          nextchar(pRExC_state);
          reginsert(pRExC_state, MINMOD, ret, depth+1);
@@ -12835,12 +12840,32 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
          }
      }
  
+    /* Forbid extra quantifiers */
      if (ISMULT2(RExC_parse)) {
          RExC_parse++;
          vFAIL("Nested quantifiers");
      }
  
      return(ret);
+
+  min0_maxINF_wildcard_forbidden:
+
+    /* Here we are in a wildcard match, and the minimum match length is 0, and
+     * the max could be infinity.  This is currently forbidden.  The only
+     * reason is to make it harder to write patterns that take a long long time
+     * to halt, and because the use of this construct isn't necessary in
+     * matching Unicode property values */
+    RExC_parse++;
+    /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
+       subpatterns in regex; marked by <-- HERE in m/%s/
+     */
+    vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
+          " subpatterns");
+
+    /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
+     * legal at all in wildcards, so can't get this far */
+
+    NOT_REACHED; /*NOTREACHED*/
  }
  
  STATIC bool
@@ -13572,7 +13597,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  /* SBOL is shared with /^/ so we set the flags so we can tell
                   * /\A/ from /^/ in split. */
                  FLAGS(REGNODE_p(ret)) = 1;
-                *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
              }
             goto finish_meta_pat;
         case 'G':
@@ -13588,7 +13612,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              RExC_seen |= REG_GPOS_SEEN;
             goto finish_meta_pat;
         case 'K':
-            if (!RExC_in_lookbehind && !RExC_in_lookahead) {
+            if (!RExC_in_lookaround) {
                  RExC_seen_zerolen++;
                  ret = reg_node(pRExC_state, KEEPS);
                  /* XXX:dmq : disabling in-place substitution seems to
@@ -13609,7 +13633,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              }
              else {
                  ret = reg_node(pRExC_state, SEOL);
-                *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
              }
             RExC_seen_zerolen++;                /* Do not optimize RE away */
             goto finish_meta_pat;
@@ -13620,7 +13643,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              }
              else {
                  ret = reg_node(pRExC_state, EOS);
-                *flagp |= SIMPLE;   /* Wrong, but too late to fix for 5.32 */
              }
             RExC_seen_zerolen++;                /* Do not optimize RE away */
             goto finish_meta_pat;
@@ -14534,6 +14556,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                       * things */
                      maybe_exactfu = FALSE;
  
+                    /* Although these two characters have folds that are
+                     * locale-problematic, they also have folds to above Latin1
+                     * that aren't a problem.  Doing these now helps at
+                     * runtime. */
+                    if (UNLIKELY(   ender == GREEK_CAPITAL_LETTER_MU
+                                 || ender == LATIN_CAPITAL_LETTER_SHARP_S))
+                    {
+                        goto fold_anyway;
+                    }
+
                      /* Here, we are adding a problematic fold character.
                       * "Problematic" in this context means that its fold isn't
                       * known until runtime.  (The non-problematic code points
@@ -14587,15 +14619,20 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              *(s)++ = (U8) toFOLD(ender);
                          }
                          else {
-                            UV folded = _to_uni_fold_flags(
+                            UV folded;
+
+                          fold_anyway:
+                            folded = _to_uni_fold_flags(
                                      ender,
                                      (U8 *) s,  /* We have allocated extra space
                                                    in 's' so can't run off the
                                                    end */
                                      &added_len,
-                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
-                                                    ? FOLD_FLAGS_NOMIX_ASCII
-                                                    : 0));
+                                    FOLD_FLAGS_FULL
+                                  | ((   ASCII_FOLD_RESTRICTED
+                                      || node_type == EXACTFL)
+                                    ? FOLD_FLAGS_NOMIX_ASCII
+                                    : 0));
                              if (UNLIKELY(len + added_len > max_string_len)) {
                                  overflowed = TRUE;
                                  break;
@@ -14971,12 +15008,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                   *
                   * The solution used here for peeking ahead is to look at that
                   * next character.  If it isn't ASCII punctuation, then it will
-                 * be something that continues in an EXACTish node if there
-                 * were space.  We append the fold of it to s, having reserved
-                 * enough room in s0 for the purpose.  If we can't reasonably
-                 * peek ahead, we instead assume the worst case: that it is
-                 * something that would form the completion of a multi-char
-                 * fold.
+                 * be something that would continue on in an EXACTish node if
+                 * there were space.  We append the fold of it to s, having
+                 * reserved enough room in s0 for the purpose.  If we can't
+                 * reasonably peek ahead, we instead assume the worst case:
+                 * that it is something that would form the completion of a
+                 * multi-char fold.
                   *
                   * If we can't split between s and ender, we work backwards
                   * character-by-character down to s0.  At each current point
@@ -20742,30 +20779,15 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
         }
  #endif
          if ( exact ) {
-            switch (OP(REGNODE_p(scan))) {
-                case LEXACT:
-                case EXACT:
-                case LEXACT_REQ8:
-                case EXACT_REQ8:
-                case EXACTL:
-                case EXACTF:
-                case EXACTFU_S_EDGE:
-                case EXACTFAA_NO_TRIE:
-                case EXACTFAA:
-                case EXACTFU:
-                case EXACTFU_REQ8:
-                case EXACTFLU8:
-                case EXACTFUP:
-                case EXACTFL:
-                        if( exact == PSEUDO )
-                            exact= OP(REGNODE_p(scan));
-                        else if ( exact != OP(REGNODE_p(scan)) )
-                            exact= 0;
-                case NOTHING:
-                    break;
-                default:
+            if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
+                if (exact == PSEUDO )
+                    exact= OP(REGNODE_p(scan));
+                else if (exact != OP(REGNODE_p(scan)) )
                      exact= 0;
              }
+            else if (OP(REGNODE_p(scan)) != NOTHING) {
+                exact= 0;
+            }
          }
          DEBUG_PARSE_r({
              DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
@@ -21846,7 +21868,6 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
  #define SAVEPVN(p, n)  ((p) ? savepvn(p, n) : NULL)
  
  /*
-=for apidoc_section REGEXP Functions
  =for apidoc re_dup_guts
  Duplicate a regexp.
  
@@ -21854,7 +21875,7 @@ This routine is expected to clone a given regexp structure. It is only
  compiled under USE_ITHREADS.
  
  After all of the core data stored in struct regexp is duplicated
-the regexp_engine.dupe method is used to copy any private data
+the C<regexp_engine.dupe> method is used to copy any private data
  stored in the *pprivate pointer. This allows extensions to handle
  any duplication they need to do.
  
@@ -22207,8 +22228,6 @@ S_put_code_point(pTHX_ SV *sv, UV c)
      }
  }
  
-#define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
-
  STATIC void
  S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
  {