PATCH: [perl #133756] Failure to match properly

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 8b2100c..58cb941 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1617,15 +1617,19 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
      }
  
      /* Add in the points from the bit map */
-    for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
-        if (ANYOF_BITMAP_TEST(node, i)) {
-            unsigned int start = i++;
+    if (OP(node) != ANYOFH) {
+        for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+            if (ANYOF_BITMAP_TEST(node, i)) {
+                unsigned int start = i++;
  
-            for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
-                /* empty */
+                for (;    i < NUM_ANYOF_CODE_POINTS
+                       && ANYOF_BITMAP_TEST(node, i); ++i)
+                {
+                    /* empty */
+                }
+                invlist = _add_range_to_invlist(invlist, start, i-1);
+                new_node_has_latin1 = TRUE;
              }
-            invlist = _add_range_to_invlist(invlist, start, i-1);
-            new_node_has_latin1 = TRUE;
          }
      }
  
@@ -3906,7 +3910,7 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour
   *      that a character in the pattern corresponds to at most a single
   *      character in the target string.  (And I do mean character, and not byte
   *      here, unlike other parts of the documentation that have never been
- *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
+ *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
   *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
   *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
   *      EXACTFL nodes, violate the assumption, and they are the only instances
@@ -4335,6 +4339,23 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
             }
  #endif
         }
+
+        if (     STR_LEN(scan) == 1
+            &&   isALPHA_A(* STRING(scan))
+            &&  (         OP(scan) == EXACTFAA
+                 || (     OP(scan) == EXACTFU
+                     && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
+        {
+            U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
+
+            /* Replace a length 1 ASCII fold pair node with an ANYOFM node.
+             * Its mask is the complement of the bit that differs between
+             * upper and lower case, and its argument is the lower code point
+             * of the pair (which the '&' forces) */
+            OP(scan) = ANYOFM;
+            ARG_SET(scan, *STRING(scan) & mask);
+            FLAGS(scan) = mask;
+        }
      }
  
  #ifdef DEBUGGING
@@ -5275,6 +5296,27 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                      OP(next) = EXACTFU;
                  }
  
+                if (     STR_LEN(next) == 1
+                    &&   isALPHA_A(* STRING(next))
+                    && (         OP(next) == EXACTFAA
+                        || (     OP(next) == EXACTFU
+                            && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
+                {
+                    /* These differ in just one bit */
+                    U8 mask = ~ ('A' ^ 'a');
+
+                    assert(isALPHA_A(* STRING(next)));
+
+                    /* Then replace it by an ANYOFM node.  Its mask
+                     * is the complement of the bit that differs
+                     * between upper and lower case, and its argument
+                     * is the lower code point of the pair (which the
+                     * '&' forces) */
+                    OP(next) = ANYOFM;
+                    ARG_SET(next, *STRING(next) & mask);
+                    FLAGS(next) = mask;
+                }
+
                 if (flags & SCF_DO_STCLASS) {
                     mincount = 0;
                     maxcount = REG_INFTY;
@@ -5763,6 +5805,7 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                  case ANYOFD:
                  case ANYOFL:
                  case ANYOFPOSIXL:
+                case ANYOFH:
                  case ANYOF:
                     if (flags & SCF_DO_STCLASS_AND)
                         ssc_and(pRExC_state, data->start_class,
@@ -8765,7 +8808,6 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
  }
  
  #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
-    int num;                                                    \
      if (RExC_lastparse!=RExC_parse) {                           \
          Perl_re_printf( aTHX_  "%s",                            \
              Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse,        \
@@ -8781,16 +8823,15 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
      } else                                                      \
          Perl_re_printf( aTHX_ "%16s","");                       \
                                                                  \
-    num=REG_NODE_NUM(REGNODE_p(RExC_emit));                     \
-    if (RExC_lastnum!=num)                                      \
-       Perl_re_printf( aTHX_ "|%4d", num);                      \
+    if (RExC_lastnum!=RExC_emit)                                \
+       Perl_re_printf( aTHX_ "|%4d", RExC_emit);                \
      else                                                        \
         Perl_re_printf( aTHX_ "|%4s","");                        \
      Perl_re_printf( aTHX_ "|%*s%-4s",                           \
          (int)((depth*2)), "",                                   \
          (funcname)                                              \
      );                                                          \
-    RExC_lastnum=num;                                           \
+    RExC_lastnum=RExC_emit;                                     \
      RExC_lastparse=RExC_parse;                                  \
  })
  
@@ -12006,7 +12047,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                  DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
                      "%*s%*s Setting open paren #%" IVdf " to %d\n",
                      22, "|    |", (int)(depth * 2 + 1), "",
-                    (IV)parno, REG_NODE_NUM(REGNODE_p(ret))));
+                    (IV)parno, ret));
                  RExC_open_parens[parno]= ret;
              }
  
@@ -12095,7 +12136,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                  DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
                          "%*s%*s Setting close paren #%" IVdf " to %d\n",
                          22, "|    |", (int)(depth * 2 + 1), "",
-                        (IV)parno, REG_NODE_NUM(REGNODE_p(ender))));
+                        (IV)parno, ender));
                  RExC_close_parens[parno]= ender;
                 if (RExC_nestroot == parno)
                     RExC_nestroot = 0;
@@ -12129,7 +12170,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                  DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
                      "%*s%*s Setting close paren #0 (END) to %d\n",
                      22, "|    |", (int)(depth * 2 + 1), "",
-                    REG_NODE_NUM(REGNODE_p(ender))));
+                    ender));
  
                  RExC_close_parens[0]= ender;
              }
@@ -12141,9 +12182,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
              regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
              Perl_re_printf( aTHX_  "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
                            SvPV_nolen_const(RExC_mysv1),
-                          (IV)REG_NODE_NUM(REGNODE_p(lastbr)),
+                          (IV)lastbr,
                            SvPV_nolen_const(RExC_mysv2),
-                          (IV)REG_NODE_NUM(REGNODE_p(ender)),
+                          (IV)ender,
                            (IV)(ender - lastbr)
              );
          );
@@ -12191,7 +12232,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                                    SvPV_nolen_const(RExC_mysv1),
                                    (IV)REG_NODE_NUM(ret_as_regnode),
                                    SvPV_nolen_const(RExC_mysv2),
-                                  (IV)REG_NODE_NUM(REGNODE_p(ender)),
+                                  (IV)ender,
                                    (IV)(ender - ret)
                      );
                  );
@@ -12963,169 +13004,6 @@ S_compute_EXACTish(RExC_state_t *pRExC_state)
      return op + EXACTF;
  }
  
-PERL_STATIC_INLINE void
-S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
-                         regnode_offset node, I32* flagp, STRLEN len,
-                         UV code_point, bool downgradable)
-{
-    /* This knows the details about sizing an EXACTish node, setting flags for
-     * it (by setting <*flagp>, and potentially populating it with a single
-     * character.
-     *
-     * If <len> (the length in bytes) is non-zero, this function assumes that
-     * the node has already been populated, and just does the sizing.  In this
-     * case <code_point> should be the final code point that has already been
-     * placed into the node.  This value will be ignored except that under some
-     * circumstances <*flagp> is set based on it.
-     *
-     * If <len> is zero, the function assumes that the node is to contain only
-     * the single character given by <code_point> and calculates what <len>
-     * should be.  It populates the node's STRING with <code_point> or its
-     * fold if folding.
-     *
-     * In both cases <*flagp> is appropriately set
-     *
-     * It knows that under FOLD, the Latin Sharp S and UTF characters above
-     * 255, must be folded (the former only when the rules indicate it can
-     * match 'ss')
-     *
-     * When it does the populating, it looks at the flag 'downgradable'.  If
-     * true with a node that folds, it checks if the single code point
-     * participates in a fold, and if not downgrades the node to an EXACT.
-     * This helps the optimizer */
-
-    bool len_passed_in = cBOOL(len != 0);
-    U8 character[UTF8_MAXBYTES_CASE+1];
-
-    PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
-
-    if (! len_passed_in) {
-        if (UTF) {
-            if (UVCHR_IS_INVARIANT(code_point)) {
-                if (LOC || ! FOLD) {    /* /l defers folding until runtime */
-                    *character = (U8) code_point;
-                }
-                else { /* Here is /i and not /l. */
-                    *character = toFOLD((U8) code_point);
-
-                    /* We can downgrade to an EXACT node if this character
-                     * isn't a folding one.  Note that this assumes that
-                     * nothing above Latin1 folds to some other invariant than
-                     * one of these alphabetics; otherwise we would also have
-                     * to check:
-                     *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
-                     *      || ASCII_FOLD_RESTRICTED))
-                     */
-                    if (downgradable && PL_fold[code_point] == code_point) {
-                        OP(REGNODE_p(node)) = EXACT;
-                    }
-                }
-                len = 1;
-            }
-            else if (FOLD && (   ! LOC
-                              || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
-            {   /* Folding, and ok to do so now */
-                UV folded = _to_uni_fold_flags(
-                                   code_point,
-                                   character,
-                                   &len,
-                                   FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
-                                                      ? FOLD_FLAGS_NOMIX_ASCII
-                                                      : 0));
-                if (downgradable
-                    && folded == code_point /* This quickly rules out many
-                                               cases, avoiding the
-                                               _invlist_contains_cp() overhead
-                                               for those.  */
-                    && ! _invlist_contains_cp(PL_in_some_fold, code_point))
-                {
-                    OP(REGNODE_p(node)) = (LOC)
-                               ? EXACTL
-                               : EXACT;
-                }
-            }
-            else if (code_point <= MAX_UTF8_TWO_BYTE) {
-
-                /* Not folding this cp, and can output it directly */
-                *character = UTF8_TWO_BYTE_HI(code_point);
-                *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
-                len = 2;
-            }
-            else {
-                uvchr_to_utf8( character, code_point);
-                len = UTF8SKIP(character);
-            }
-        } /* Else pattern isn't UTF8.  */
-        else if (! FOLD) {
-            *character = (U8) code_point;
-            len = 1;
-        } /* Else is folded non-UTF8 */
-#if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
-   || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
-                                      || UNICODE_DOT_DOT_VERSION > 0)
-        else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
-#else
-        else if (1) {
-#endif
-            *character = (U8) (DEPENDS_SEMANTICS)
-                              ? toFOLD(code_point)
-                              : (LOC)
-                                ? code_point
-                                : toLOWER_L1(code_point);
-            len = 1;
-
-            /* Can turn into an EXACT node if we know the fold at compile time,
-             * and it folds to itself and doesn't particpate in other folds */
-            if (downgradable
-                && ! LOC
-                && PL_fold_latin1[code_point] == code_point
-                && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
-                    || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
-            {
-                OP(REGNODE_p(node)) = EXACT;
-            }
-        } /* else is Sharp s.  May need to fold it */
-        else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
-            *character = 's';
-            *(character + 1) = 's';
-            len = 2;
-        }
-        else {
-            *character = LATIN_SMALL_LETTER_SHARP_S;
-            len = 1;
-        }
-    }
-
-    if (downgradable) {
-        change_engine_size(pRExC_state, STR_SZ(len));
-    }
-
-    RExC_emit += STR_SZ(len);
-    STR_LEN(REGNODE_p(node)) = len;
-    if (! len_passed_in) {
-        Copy((char *) character, STRING(REGNODE_p(node)), len, char);
-    }
-
-    *flagp |= HASWIDTH;
-
-    /* A single character node is SIMPLE, except for the special-cased SHARP S
-     * under /di. */
-    if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
-#if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
-   || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
-                                      || UNICODE_DOT_DOT_VERSION > 0)
-        && ( code_point != LATIN_SMALL_LETTER_SHARP_S
-            || ! FOLD || ! DEPENDS_SEMANTICS)
-#endif
-    ) {
-        *flagp |= SIMPLE;
-    }
-
-    if (OP(REGNODE_p(node)) == EXACTFL) {
-        RExC_contains_locale = 1;
-    }
-}
-
  STATIC bool
  S_new_regcurly(const char *s, const char *e)
  {
@@ -13431,25 +13309,17 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              /* FALLTHROUGH */
         case 'b':
            {
+            U8 flags = 0;
             regex_charset charset = get_regex_charset(RExC_flags);
  
             RExC_seen_zerolen++;
              RExC_seen |= REG_LOOKBEHIND_SEEN;
             op = BOUND + charset;
  
-            if (op == BOUND) {
-                RExC_seen_d_op = TRUE;
-            }
-            else if (op == BOUNDL) {
-                RExC_contains_locale = 1;
-            }
-
-           ret = reg_node(pRExC_state, op);
-           *flagp |= SIMPLE;
             if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
-                FLAGS(REGNODE_p(ret)) = TRADITIONAL_BOUND;
+                flags = TRADITIONAL_BOUND;
                  if (op > BOUNDA) {  /* /aa is same as /a */
-                    OP(REGNODE_p(ret)) = BOUNDA;
+                    op = BOUNDA;
                  }
              }
              else {
@@ -13485,25 +13355,25 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          {
                              goto bad_bound_type;
                          }
-                        FLAGS(REGNODE_p(ret)) = GCB_BOUND;
+                        flags = GCB_BOUND;
                          break;
                      case 'l':
                          if (length != 2 || *(RExC_parse + 1) != 'b') {
                              goto bad_bound_type;
                          }
-                        FLAGS(REGNODE_p(ret)) = LB_BOUND;
+                        flags = LB_BOUND;
                          break;
                      case 's':
                          if (length != 2 || *(RExC_parse + 1) != 'b') {
                              goto bad_bound_type;
                          }
-                        FLAGS(REGNODE_p(ret)) = SB_BOUND;
+                        flags = SB_BOUND;
                          break;
                      case 'w':
                          if (length != 2 || *(RExC_parse + 1) != 'b') {
                              goto bad_bound_type;
                          }
-                        FLAGS(REGNODE_p(ret)) = WB_BOUND;
+                        flags = WB_BOUND;
                          break;
                      default:
                        bad_bound_type:
@@ -13516,8 +13386,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  RExC_parse = endbrace;
                  REQUIRE_UNI_RULES(flagp, 0);
  
-                if (op >= BOUNDA) {  /* /aa is same as /a */
-                    OP(REGNODE_p(ret)) = BOUNDU;
+                if (op == BOUND) {
+                    op = BOUNDU;
+                }
+                else if (op >= BOUNDA) {  /* /aa is same as /a */
+                    op = BOUNDU;
                      length += 4;
  
                      /* Don't have to worry about UTF-8, in this message because
@@ -13532,9 +13405,22 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  }
             }
  
+            if (op == BOUND) {
+                RExC_seen_d_op = TRUE;
+            }
+            else if (op == BOUNDL) {
+                RExC_contains_locale = 1;
+            }
+
              if (invert) {
-                OP(REGNODE_p(ret)) += NBOUND - BOUND;
+                op += NBOUND - BOUND;
              }
+
+           ret = reg_node(pRExC_state, op);
+            FLAGS(REGNODE_p(ret)) = flags;
+
+           *flagp |= SIMPLE;
+
             goto finish_meta_pat;
            }
  
@@ -13890,7 +13776,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              /* We can convert EXACTF nodes to EXACTFU if they contain only
               * characters that match identically regardless of the target
               * string's UTF8ness.  The reason to do this is that EXACTF is not
-             * trie-able, EXACTFU is.
+             * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
+             * runtime.
               *
               * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
               * contain only above-Latin1 characters (hence must be in UTF8),
@@ -14314,7 +14201,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          goto loopdone;
                      }
  
-                    /* This code point means we can't simplify things */
+                    /* This problematic code point means we can't simplify
+                     * things */
                      maybe_exactfu = FALSE;
  
                      /* Here, we are adding a problematic fold character.
@@ -14489,14 +14377,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
               * identifies, so when it is set to less than the full node, we can
               * skip the rest of this */
              if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
+                PERL_UINT_FAST8_T backup_count = 0;
  
                  const STRLEN full_len = len;
  
                 assert(len >= MAX_NODE_STRING_SIZE);
  
-                /* Here, <s> points to the final byte of the final character.
-                 * Look backwards through the string until find a non-
-                 * problematic character */
+                /* Here, <s> points to just beyond where we have output the
+                 * final character of the node.  Look backwards through the
+                 * string until we find a non-problematic character */
  
                 if (! UTF) {
  
@@ -14505,7 +14394,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          goto loopdone;
                      }
  
-                    while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
+                    while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
+                        backup_count++;
+                    }
                      len = s - s0 + 1;
                 }
                  else {
@@ -14547,6 +14438,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                           * special case the very first byte in the string, so
                           * we don't read outside the string */
                          s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
+                        backup_count++;
                      } /* End of loop backwards through the string */
  
                      /* If there were only problematic characters in the string,
@@ -14570,12 +14462,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  } else {
  
                      /* Here, the node does contain some characters that aren't
-                     * problematic.  If one such is the final character in the
-                     * node, we are done */
-                    if (len == full_len) {
+                     * problematic.  If we didn't have to back up any, then the
+                     * final character in the node is non-problematic, and we
+                     * can take the node as-is */
+                    if (backup_count == 0) {
                          goto loopdone;
                      }
-                    else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
+                    else if (backup_count == 1) {
  
                          /* If the final character is problematic, but the
                           * penultimate is not, back-off that last character to
@@ -14745,6 +14638,11 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
      PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
      assert(PL_regkind[OP(node)] == ANYOF);
  
+    /* There is no bitmap for this node type */
+    if (OP(node) == ANYOFH) {
+        return;
+    }
+
      ANYOF_BITMAP_ZERO(node);
      if (*invlist_ptr) {
  
@@ -16639,7 +16537,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      STRLEN numlen;
      int namedclass = OOB_NAMEDCLASS;
      char *rangebegin = NULL;
-    bool need_class = 0;
      SV *listsv = NULL;
      STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
                                       than just initialized.  */
@@ -16677,10 +16574,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
       * time */
      SV* swash = NULL;          /* Code points that match \p{} \P{} */
  
-    /* Set if a component of this character class is user-defined; just passed
-     * on to the engine */
-    bool has_user_defined_property = FALSE;
-
      /* inversion list of code points this node matches only when the target
       * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
       * /d) */
@@ -16714,7 +16607,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      bool warn_super = ALWAYS_WARN_SUPER;
  
      const char * orig_parse = RExC_parse;
-    bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
  
      /* This variable is used to mark where the end in the input is of something
       * that looks like a POSIX construct but isn't.  During the parse, when
@@ -16729,7 +16621,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                         one.  */
      U8 anyof_flags = 0;   /* flag bits if the node is an ANYOF-type */
      U32 posixl = 0;       /* bit field of posix classes matched under /l */
-    bool use_anyofd = FALSE; /* ? Is this to be an ANYOFD node */
+
+
+/* Flags as to what things aren't knowable until runtime.  (Note that these are
+ * mutually exclusive.) */
+#define HAS_USER_DEFINED_PROPERTY 0x01   /* /u any user-defined properties that
+                                            haven't been defined as of yet */
+#define HAS_D_RUNTIME_DEPENDENCY  0x02   /* /d if the target being matched is
+                                            UTF-8 or not */
+#define HAS_L_RUNTIME_DEPENDENCY   0x04 /* /l what the posix classes match and
+                                            what gets folded */
+    U32 has_runtime_dependency = 0;     /* OR of the above flags */
  
      GET_RE_DEBUG_FLAGS_DECL;
  
@@ -17192,9 +17094,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                          (FOLD) ? "__" : "",
                                          UTF8fARG(UTF, n, name),
                                          (FOLD) ? "_i" : "");
-                        has_user_defined_property = TRUE;
-                        optimizable = FALSE;    /* Will have to leave this an
-                                                   ANYOF node */
+                        has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
  
                          /* We don't know yet what this matches, so have to flag
                           * it */
@@ -17208,12 +17108,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                          if (swash_init_flags
                              & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
                          {
-                            has_user_defined_property = TRUE;
+                            has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
                          }
                      }
                      }
                      if (invlist) {
-                        if (! has_user_defined_property &&
+                        if (! (has_runtime_dependency
+                                                & HAS_USER_DEFINED_PROPERTY) &&
                              /* We warn on matching an above-Unicode code point
                               * if the match would return true, except don't
                               * warn for \p{All}, which has exactly one element
@@ -17388,43 +17289,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  SV* scratch_list = NULL;
  
                  /* What the Posix classes (like \w, [:space:]) match in locale
-                 * isn't knowable under locale until actual match time.  Room
-                 * must be reserved (one time per outer bracketed class) to
-                 * store such classes.  The space will contain a bit for each
-                 * named class that is to be matched against.  This isn't
-                 * needed for \p{} and pseudo-classes, as they are not affected
-                 * by locale, and hence are dealt with separately */
-                if (! need_class) {
-                    need_class = 1;
-                    anyof_flags |= ANYOF_MATCHES_POSIXL;
-
-                    /* We can't change this into some other type of node
-                     * (unless this is the only element, in which case there
-                     * are nodes that mean exactly this) as has runtime
-                     * dependencies */
-                    optimizable = FALSE;
-                }
-
-                /* Coverity thinks it is possible for this to be negative; both
-                 * jhi and khw think it's not, but be safer */
-                assert(! (anyof_flags & ANYOF_MATCHES_POSIXL)
-                       || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
-
-                /* See if it already matches the complement of this POSIX
-                 * class */
-                if (  (anyof_flags & ANYOF_MATCHES_POSIXL)
-                    && POSIXL_TEST(posixl, namedclass + ((namedclass % 2)
-                                                         ? -1
-                                                         : 1)))
-                {
-                    posixl_matches_all = TRUE;
-                    break;  /* No need to continue.  Since it matches both
-                               e.g., \w and \W, it matches everything, and the
-                               bracketed class can be optimized into qr/./s */
-                }
-
-                /* Add this class to those that should be checked at runtime */
+                 * isn't knowable until actual match time.  A
+                 * special node is used for these which has extra space for a
+                 * bitmap, with a bit reserved for each named class that is to
+                 * be matched against.  This isn't needed for \p{} and
+                 * pseudo-classes, as they are not affected by locale, and
+                 * hence are dealt with separately */
                  POSIXL_SET(posixl, namedclass);
+                has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
+                anyof_flags |= ANYOF_MATCHES_POSIXL;
  
                  /* The above-Latin1 characters are not subject to locale rules.
                   * Just add them to the unconditionally-matched list */
@@ -17611,7 +17484,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
           *
           * See [perl #89750] */
          if (FOLD && allow_multi_folds && value == prevvalue) {
-            if (value == LATIN_SMALL_LETTER_SHARP_S
+            if (    value == LATIN_SMALL_LETTER_SHARP_S
                  || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
                                                          value)))
              {
@@ -18118,7 +17991,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
               * the target string is in UTF-8.  But things like \W match all the
               * upper Latin1 characters if the target string is not in UTF-8.
               *
-             * Handle the case where there something like \W separately */
+             * Handle the case with something like \W separately */
              if (nposixes) {
                  SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
  
@@ -18192,9 +18065,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
                  SvREFCNT_dec(nonascii_but_latin1_properties);
  
-                /* Get rid of any characters that we now know are matched
-                 * unconditionally from the conditional list, which may make
-                 * that list empty */
+                /* Get rid of any characters from the conditional list that we
+                 * now know are matched unconditionally, which may make that
+                 * list empty */
                  _invlist_subtract(upper_latin1_only_utf8_matches,
                                    cp_list,
                                    &upper_latin1_only_utf8_matches);
@@ -18278,6 +18151,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              }
          }
          if (only_utf8_locale_list) {
+            has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
              anyof_flags
                   |= ANYOFL_FOLD
                   |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
@@ -18287,6 +18161,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              invlist_iterinit(cp_list);
              if (invlist_iternext(cp_list, &start, &end) && start < 256) {
                  anyof_flags |= ANYOFL_FOLD;
+                has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
              }
              invlist_iterfinish(cp_list);
          }
@@ -18295,20 +18170,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
               && (    upper_latin1_only_utf8_matches
                   || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
      {
-        use_anyofd = TRUE;
          RExC_seen_d_op = TRUE;
-        optimizable = FALSE;
+        has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
      }
  
-    /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
-     * at compile time.  Besides not inverting folded locale now, we can't
-     * invert if there are things such as \w, which aren't known until runtime
-     * */
+    /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
+     * compile time. */
      if (     cp_list
          &&   invert
-        && ! use_anyofd
-        && ! (anyof_flags & (ANYOF_LOCALE_FLAGS))
-       && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
+        && ! has_runtime_dependency)
      {
          _invlist_invert(cp_list);
  
@@ -18318,8 +18188,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              swash = NULL;
          }
  
-       /* Clear the invert flag since have just done it here */
-       invert = FALSE;
+        invert = FALSE;
      }
  
      if (ret_invlist) {
@@ -18329,114 +18198,507 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
          return RExC_emit;
      }
  
+    /* All possible optimizations below still have these characteristics.
+     * (Multi-char folds aren't SIMPLE, but they don't get this far in this
+     * routine) */
+    *flagp |= HASWIDTH|SIMPLE;
+
+    if (anyof_flags & ANYOF_LOCALE_FLAGS) {
+        RExC_contains_locale = 1;
+    }
+
      /* Some character classes are equivalent to other nodes.  Such nodes take
-     * up less room and generally fewer operations to execute than ANYOF nodes.
-     * */
+     * up less room, and some nodes require fewer operations to execute, than
+     * ANYOF nodes.  EXACTish nodes may be joinable with adjacent nodes to
+     * improve efficiency. */
  
      if (optimizable) {
-        int posix_class = -1;   /* Illegal value */
-        UV start, end;
+        PERL_UINT_FAST8_T i;
+        Size_t partial_cp_count = 0;
+        UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
+        UV   end[MAX_FOLD_FROMS+1] = { 0 };
+
+        if (cp_list) { /* Count the code points in enough ranges that we would
+                          see all the ones possible in any fold in this version
+                          of Unicode */
+
+            invlist_iterinit(cp_list);
+            for (i = 0; i <= MAX_FOLD_FROMS; i++) {
+                if (invlist_iternext(cp_list, &start[i], &end[i])) {
+                    partial_cp_count += end[i] - start[i] + 1;
+                }
+            }
+
+            invlist_iterfinish(cp_list);
+        }
  
-        if (UNLIKELY(posixl_matches_all)) {
-            ret = reg_node(pRExC_state, SANY);
+        /* If we know at compile time that this matches every possible code
+         * point, any run-time dependencies don't matter */
+        if (start[0] == 0 && end[0] == UV_MAX) {
+            if (invert) {
+                ret = reganode(pRExC_state, OPFAIL, 0);
+            }
+            else {
+                ret = reg_node(pRExC_state, SANY);
+                MARK_NAUGHTY(1);
+            }
              goto not_anyof;
          }
  
-        if (cp_list && ! invert) {
-            invlist_iterinit(cp_list);
-            if (! invlist_iternext(cp_list, &start, &end)) {
+        /* Similarly, for /l posix classes, if both a class and its
+         * complement match, any run-time dependencies don't matter */
+        if (posixl) {
+            for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
+                                                        namedclass += 2)
+            {
+                if (   POSIXL_TEST(posixl, namedclass)      /* class */
+                    && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
+                {
+                    if (invert) {
+                        ret = reganode(pRExC_state, OPFAIL, 0);
+                    }
+                    else {
+                        ret = reg_node(pRExC_state, SANY);
+                        MARK_NAUGHTY(1);
+                    }
+                    goto not_anyof;
+                }
+            }
+            /* For well-behaved locales, some classes are subsets of others,
+             * so complementing the subset and including the non-complemented
+             * superset should match everything, like [\D[:alnum:]], and
+             * [[:^alpha:][:alnum:]], but some implementations of locales are
+             * buggy, and khw thinks it's a bad idea to have optimization change
+             * behavior, even if it avoids an OS bug in a given case */
+
+#define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
+
+            /* If this is a single posix /l class, can optimize to just that op.
+             * Such a node will not match anything in the Latin1 range, as that
+             * is not determinable until runtime, but will match whatever the
+             * class does outside that range.  (Note that some classes won't
+             * match anything outside the range, like [:ascii:]) */
+            if (    isSINGLE_BIT_SET(posixl)
+                && (partial_cp_count == 0 || start[0] > 255))
+            {
+                U8 classnum;
+                SV * class_above_latin1 = NULL;
+                bool already_inverted;
+                bool are_equivalent;
+
+                /* Compute which bit is set, which is the same thing as, e.g.,
+                 * ANYOF_CNTRL.  From
+                 * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
+                 * */
+                static const int MultiplyDeBruijnBitPosition2[32] =
+                    {
+                    0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+                    31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+                    };
+
+                namedclass = MultiplyDeBruijnBitPosition2[(posixl
+                                                          * 0x077CB531U) >> 27];
+                classnum = namedclass_to_classnum(namedclass);
+
+                /* The named classes are such that the inverted number is one
+                 * larger than the non-inverted one */
+                already_inverted = namedclass
+                                 - classnum_to_namedclass(classnum);
+
+                /* Create an inversion list of the official property, inverted
+                 * if the constructed node list is inverted, and restricted to
+                 * only the above latin1 code points, which are the only ones
+                 * known at compile time */
+                _invlist_intersection_maybe_complement_2nd(
+                                                    PL_AboveLatin1,
+                                                    PL_XPosix_ptrs[classnum],
+                                                    already_inverted,
+                                                    &class_above_latin1);
+                are_equivalent = _invlistEQ(class_above_latin1, cp_list,
+                                                                        FALSE);
+                SvREFCNT_dec_NN(class_above_latin1);
+
+                if (are_equivalent) {
+
+                    /* Resolve the run-time inversion flag with this possibly
+                     * inverted class */
+                    invert = invert ^ already_inverted;
+
+                    ret = reg_node(pRExC_state,
+                                   POSIXL + invert * (NPOSIXL - POSIXL));
+                    FLAGS(REGNODE_p(ret)) = classnum;
+                    goto not_anyof;
+                }
+            }
+        }
+
+        /* khw can't think of any other possible transformation involving
+         * these. */
+        if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
+            goto is_anyof;
+        }
  
-                /* Here, the list is empty.  This happens, for example, when a
-                 * Unicode property that doesn't match anything is the only
-                 * element in the character class (perluniprops.pod notes such
-                 * properties).  */
+        if (! has_runtime_dependency) {
+
+            /* If the list is empty, nothing matches.  This happens, for
+             * example, when a Unicode property that doesn't match anything is
+             * the only element in the character class (perluniprops.pod notes
+             * such properties). */
+            if (partial_cp_count == 0) {
+                assert (! invert);
                  ret = reganode(pRExC_state, OPFAIL, 0);
-                *flagp |= HASWIDTH|SIMPLE;
                  goto not_anyof;
              }
  
-            if (start == end) {    /* The range is a single code point */
-                if (! invlist_iternext(cp_list, &start, &end)
+            /* If matches everything but \n */
+            if (   start[0] == 0 && end[0] == '\n' - 1
+                && start[1] == '\n' + 1 && end[1] == UV_MAX)
+            {
+                assert (! invert);
+                ret = reg_node(pRExC_state, REG_ANY);
+                MARK_NAUGHTY(1);
+                goto not_anyof;
+            }
+        }
  
-                        /* Don't do this optimization if it would require
-                         * changing the pattern to UTF-8 */
-                    && (start < 256 || UTF))
-                {
-                    /* Here, the list contains a single code point.  Can
-                     * optimize into an EXACTish node */
+        /* Next see if can optimize classes that contain just a few code points
+         * into an EXACTish node.  The reason to do this is to let the
+         * optimizer join this node with adjacent EXACTish ones.
+         *
+         * An EXACTFish node can be generated even if not under /i, and vice
+         * versa.  But care must be taken.  An EXACTFish node has to be such
+         * that it only matches precisely the code points in the class, but we
+         * want to generate the least restrictive one that does that, to
+         * increase the odds of being able to join with an adjacent node.  For
+         * example, if the class contains [kK], we have to make it an EXACTFAA
+         * node to prevent the KELVIN SIGN from matching.  Whether we are under
+         * /i or not is irrelevant in this case.  Less obvious is the pattern
+         * qr/[\x{02BC}]n/i.  U+02BC is MODIFIER LETTER APOSTROPHE. That is
+         * supposed to match the single character U+0149 LATIN SMALL LETTER N
+         * PRECEDED BY APOSTROPHE.  And so even though there is no simple fold
+         * that includes \x{02BC}, there is a multi-char fold that does, and so
+         * the node generated for it must be an EXACTFish one.  On the other
+         * hand qr/:/i should generate a plain EXACT node since the colon
+         * participates in no fold whatsoever, and having it EXACT tells the
+         * optimizer the target string cannot match unless it has a colon in
+         * it.
+         *
+         * We don't typically generate an EXACTish node if doing so would
+         * require changing the pattern to UTF-8, as that affects /d and
+         * otherwise is slower.  However, under /i, not changing to UTF-8 can
+         * miss some potential multi-character folds.  We calculate the
+         * EXACTish node, and then decide if something would be missed if we
+         * don't upgrade */
+        if (   ! posixl
+            && ! invert
+
+                /* Only try if there are no more code points in the class than
+                 * in the max possible fold */
+            &&   partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1
+
+            && (start[0] < 256 || UTF || FOLD))
+        {
+            if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
+            {
+                /* We can always make a single code point class into an
+                 * EXACTish node. */
+
+                if (LOC) {
+
+                    /* Here is /l:  Use EXACTL, except /li indicates EXACTFL,
+                     * as that means there is a fold not known until runtime so
+                     * shows as only a single code point here. */
+                    op = (FOLD) ? EXACTFL : EXACTL;
+                }
+                else if (! FOLD) { /* Not /l and not /i */
+                    op = (start[0] < 256) ? EXACT : EXACT_ONLY8;
+                }
+                else if (start[0] < 256) { /* /i, not /l, and the code point is
+                                              small */
+
+                    /* Under /i, it gets a little tricky.  A code point that
+                     * doesn't participate in a fold should be an EXACT node.
+                     * We know this one isn't the result of a simple fold, or
+                     * there'd be more than one code point in the list, but it
+                     * could be part of a multi-character fold.  In that case
+                     * we better not create an EXACT node, as we would wrongly
+                     * be telling the optimizer that this code point must be in
+                     * the target string, and that is wrong.  This is because
+                     * if the sequence around this code point forms a
+                     * multi-char fold, what needs to be in the string could be
+                     * the code point that folds to the sequence.
+                     *
+                     * This handles the case of below-256 code points, as we
+                     * have an easy look up for those.  The next clause handles
+                     * the above-255 ones */
+                    op = IS_IN_SOME_FOLD_L1(start[0])
+                         ? EXACTFU
+                         : EXACT;
+                }
+                else {  /* /i, larger code point.  Since we are under /i, and
+                           have just this code point, we know that it can't
+                           fold to something else, so PL_InMultiCharFold
+                           applies to it */
+                    op = _invlist_contains_cp(PL_InMultiCharFold,
+                                              start[0])
+                         ? EXACTFU_ONLY8
+                         : EXACT_ONLY8;
+                }
+
+                value = start[0];
+            }
+            else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
+                     && _invlist_contains_cp(PL_in_some_fold, start[0]))
+            {
+                /* Here, the only runtime dependency, if any, is from /d, and
+                 * the class matches more than one code point, and the lowest
+                 * code point participates in some fold.  It might be that the
+                 * other code points are /i equivalent to this one, and hence
+                 * they would be representable by an EXACTFish node.  Above, we
+                 * eliminated classes that contain too many code points to be
+                 * EXACTFish, with the test for MAX_FOLD_FROMS
+                 *
+                 * First, special case the ASCII fold pairs, like 'B' and 'b'.
+                 * We do this because we have EXACTFAA at our disposal for the
+                 * ASCII range */
+                if (partial_cp_count == 2 && isASCII(start[0])) {
+
+                    /* The only ASCII characters that participate in folds are
+                     * alphabetics */
+                    assert(isALPHA(start[0]));
+                    if (   end[0] == start[0]   /* First range is a single
+                                                   character, so 2nd exists */
+                        && isALPHA_FOLD_EQ(start[0], start[1]))
+                    {
+
+                        /* Here, is part of an ASCII fold pair */
+
+                        if (   ASCII_FOLD_RESTRICTED
+                            || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
+                        {
+                            /* If the second clause just above was true, it
+                             * means we can't be under /i, or else the list
+                             * would have included more than this fold pair.
+                             * Therefore we have to exclude the possibility of
+                             * whatever else it is that folds to these, by
+                             * using EXACTFAA */
+                            op = EXACTFAA;
+                        }
+                        else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
+
+                            /* Here, there's no simple fold that start[0] is part
+                             * of, but there is a multi-character one.  If we
+                             * are not under /i, we want to exclude that
+                             * possibility; if under /i, we want to include it
+                             * */
+                            op = (FOLD) ? EXACTFU : EXACTFAA;
+                        }
+                        else {
  
-                    value = start;
+                            /* Here, the only possible fold start[0] participates in
+                             * is with start[1].  /i or not isn't relevant */
+                            op = EXACTFU;
+                        }
  
-                    if (! FOLD) {
-                        op = (LOC)
-                             ? EXACTL
-                             : EXACT;
+                        value = toFOLD(start[0]);
+                    }
+                }
+                else if (  ! upper_latin1_only_utf8_matches
+                         || (   _invlist_len(upper_latin1_only_utf8_matches)
+                                                                          == 2
+                             && PL_fold_latin1[
+                               invlist_highest(upper_latin1_only_utf8_matches)]
+                             == start[0]))
+                {
+                    /* Here, the smallest character is non-ascii or there are
+                     * more than 2 code points matched by this node.  Also, we
+                     * either don't have /d UTF-8 dependent matches, or if we
+                     * do, they look like they could be a single character that
+                     * is the fold of the lowest one in the always-match list.
+                     * This test quickly excludes most of the false positives
+                     * when there are /d UTF-8 dependent matches.  These are
+                     * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
+                     * SMALL LETTER A WITH GRAVE iff the target string is
+                     * UTF-8.  (We don't have to worry above about exceeding
+                     * the array bounds of PL_fold_latin1[] because any code
+                     * point in 'upper_latin1_only_utf8_matches' is below 256.)
+                     *
+                     * EXACTFAA would apply only to pairs (hence exactly 2 code
+                     * points) in the ASCII range, so we can't use it here to
+                     * artificially restrict the fold domain, so we check if
+                     * the class does or does not match some EXACTFish node.
+                     * Further, if we aren't under /i, and the folded-to
+                     * character is part of a multi-character fold, we can't do
+                     * this optimization, as the sequence around it could be
+                     * that multi-character fold, and we don't here know the
+                     * context, so we have to assume it is that multi-char
+                     * fold, to prevent potential bugs.
+                     *
+                     * To do the general case, we first find the fold of the
+                     * lowest code point (which may be higher than the lowest
+                     * one), then find everything that folds to it.  (The data
+                     * structure we have only maps from the folded code points,
+                     * so we have to do the earlier step.) */
+
+                    Size_t foldlen;
+                    U8 foldbuf[UTF8_MAXBYTES_CASE];
+                    UV folded = _to_uni_fold_flags(start[0],
+                                                        foldbuf, &foldlen, 0);
+                    unsigned int first_fold;
+                    const unsigned int * remaining_folds;
+                    Size_t folds_to_this_cp_count = _inverse_folds(
+                                                            folded,
+                                                            &first_fold,
+                                                            &remaining_folds);
+                    Size_t folds_count = folds_to_this_cp_count + 1;
+                    SV * fold_list = _new_invlist(folds_count);
+                    unsigned int i;
+
+                    /* If there are UTF-8 dependent matches, create a temporary
+                     * list of what this node matches, including them. */
+                    SV * all_cp_list = NULL;
+                    SV ** use_this_list = &cp_list;
+
+                    if (upper_latin1_only_utf8_matches) {
+                        all_cp_list = _new_invlist(0);
+                        use_this_list = &all_cp_list;
+                        _invlist_union(cp_list,
+                                       upper_latin1_only_utf8_matches,
+                                       use_this_list);
                      }
-                    else if (LOC) {
  
-                        /* A locale node under folding with one code point can
-                         * be an EXACTFL, as its fold won't be calculated until
-                         * runtime */
-                        op = EXACTFL;
+                    /* Having gotten everything that participates in the fold
+                     * containing the lowest code point, we turn that into an
+                     * inversion list, making sure everything is included. */
+                    fold_list = add_cp_to_invlist(fold_list, start[0]);
+                    fold_list = add_cp_to_invlist(fold_list, folded);
+                    fold_list = add_cp_to_invlist(fold_list, first_fold);
+                    for (i = 0; i < folds_to_this_cp_count - 1; i++) {
+                        fold_list = add_cp_to_invlist(fold_list,
+                                                        remaining_folds[i]);
                      }
-                    else {
  
-                        /* Here, we are generally folding, but there is only
-                         * one code point to match.  If we have to, we use an
-                         * EXACT node, but it would be better for joining with
-                         * adjacent nodes in the optimization phase if we used
-                         * the same EXACTFish node that any such are likely to
-                         * be.  We can do this iff the code point doesn't
-                         * participate in any folds.  For example, an EXACTF of
-                         * a colon is the same as an EXACT one, since nothing
-                         * folds to or from a colon. */
-                        if (value < 256) {
-                            if (IS_IN_SOME_FOLD_L1(value)) {
-                                op = EXACT;
-                            }
-                        }
-                        else {
-                            if (_invlist_contains_cp(PL_in_some_fold, value)) {
-                                op = EXACT;
+                    /* If the fold list is identical to what's in this ANYOF
+                     * node, the node can be represented by an EXACTFish one
+                     * instead */
+                    if (_invlistEQ(*use_this_list, fold_list,
+                                   0 /* Don't complement */ )
+                    ) {
+
+                        /* But, we have to be careful, as mentioned above.
+                         * Just the right sequence of characters could match
+                         * this if it is part of a multi-character fold.  That
+                         * IS what we want if we are under /i.  But it ISN'T
+                         * what we want if not under /i, as it could match when
+                         * it shouldn't.  So, when we aren't under /i and this
+                         * character participates in a multi-char fold, we
+                         * don't optimize into an EXACTFish node.  So, for each
+                         * case below we have to check if we are folding
+                         * and if not, if it is not part of a multi-char fold.
+                         * */
+                        if (start[0] > 255) {    /* Highish code point */
+                            if (FOLD || ! _invlist_contains_cp(
+                                            PL_InMultiCharFold, folded))
+                            {
+                                op = (LOC)
+                                     ? EXACTFLU8
+                                     : (ASCII_FOLD_RESTRICTED)
+                                       ? EXACTFAA
+                                       : EXACTFU_ONLY8;
+                                value = folded;
                              }
+                        }   /* Below, the lowest code point < 256 */
+                        else if (    FOLD
+                                 &&  folded == 's'
+                                 &&  DEPENDS_SEMANTICS)
+                        {   /* An EXACTF node containing a single character
+                                's', can be an EXACTFU if it doesn't get
+                                joined with an adjacent 's' */
+                            op = EXACTFU_S_EDGE;
+                            value = folded;
                          }
+                        else if (    FOLD
+                                || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
+                        {
+                            if (upper_latin1_only_utf8_matches) {
+                                op = EXACTF;
  
-                        /* If we haven't found the node type, above, it means
-                         * we can use the prevailing one */
-                        if (op == END) {
-                            op = compute_EXACTish(pRExC_state);
+                                /* We can't use the fold, as that only matches
+                                 * under UTF-8 */
+                                value = start[0];
+                            }
+                            else if (     UNLIKELY(start[0] == MICRO_SIGN)
+                                     && ! UTF)
+                            {   /* EXACTFUP is a special node for this
+                                   character */
+                                op = (ASCII_FOLD_RESTRICTED)
+                                     ? EXACTFAA
+                                     : EXACTFUP;
+                                value = MICRO_SIGN;
+                            }
+                            else if (     ASCII_FOLD_RESTRICTED
+                                     && ! isASCII(start[0]))
+                            {   /* For ASCII under /iaa, we can use EXACTFU
+                                   below */
+                                op = EXACTFAA;
+                                value = folded;
+                            }
+                            else {
+                                op = EXACTFU;
+                                value = folded;
+                            }
                          }
                      }
-                }
-            }   /* End of first range contains just a single code point */
-            else if (start == 0) {
-                if (end == UV_MAX) {
-                    op = SANY;
-                    *flagp |= HASWIDTH|SIMPLE;
-                    MARK_NAUGHTY(1);
-                }
-                else if (end == '\n' - 1
-                        && invlist_iternext(cp_list, &start, &end)
-                        && start == '\n' + 1 && end == UV_MAX)
-                {
-                    op = REG_ANY;
-                    *flagp |= HASWIDTH|SIMPLE;
-                    MARK_NAUGHTY(1);
+
+                    SvREFCNT_dec_NN(fold_list);
+                    SvREFCNT_dec(all_cp_list);
                  }
              }
-            invlist_iterfinish(cp_list);
  
              if (op != END) {
-                ret = reg_node(pRExC_state, op);
-                if (PL_regkind[op] == EXACT) {
-                    alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
-                                            TRUE /* downgradable to EXACT */
-                                            );
+
+                /* Here, we have calculated what EXACTish node we would use.
+                 * But we don't use it if it would require converting the
+                 * pattern to UTF-8, unless not using it could cause us to miss
+                 * some folds (hence be buggy) */
+
+                if (! UTF && value > 255) {
+                    SV * in_multis = NULL;
+
+                    assert(FOLD);
+
+                    /* If there is no code point that is part of a multi-char
+                     * fold, then there aren't any matches, so we don't do this
+                     * optimization.  Otherwise, it could match depending on
+                     * the context around us, so we do upgrade */
+                    _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis);
+                    if (UNLIKELY(_invlist_len(in_multis) != 0)) {
+                        REQUIRE_UTF8(flagp);
+                    }
+                    else {
+                        op = END;
+                    }
+                }
+
+                if (op != END) {
+                    U8 len = (UTF) ? UVCHR_SKIP(value) : 1;
+
+                    ret = regnode_guts(pRExC_state, op, len, "exact");
+                    FILL_NODE(ret, op);
+                    RExC_emit += 1 + STR_SZ(len);
+                    STR_LEN(REGNODE_p(ret)) = len;
+                    if (len == 1) {
+                        *STRING(REGNODE_p(ret)) = value;
+                    }
+                    else {
+                        uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
+                    }
+                    goto not_anyof;
                  }
-                goto not_anyof;
              }
+        }
  
-            {
+        if (! has_runtime_dependency) {
  
              /* See if this can be turned into an ANYOFM node.  Think about the
               * bit patterns in two different bytes.  In some positions, the
@@ -18457,7 +18719,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
               * 0x30.  Any other bytes ANDed yield something else.  So [01],
               * which is a common usage, is optimizable into ANYOFM, and can
               * benefit from the speed up.  We can only do this on UTF-8
-             * invariant bytes, because they don't have the same patterns under
+             * invariant bytes, because they have the same bit patterns under
               * UTF-8 as not. */
              PERL_UINT_FAST8_T inverted = 0;
  #ifdef EBCDIC
@@ -18474,9 +18736,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              }
  
              if (invlist_highest(cp_list) <= max_permissible) {
-                UV this_start, this_end, lowest_cp;
+                UV this_start, this_end;
+                UV lowest_cp = UV_MAX;  /* inited to suppress compiler warn */
                  U8 bits_differing = 0;
-                Size_t cp_count = 0;
+                Size_t full_cp_count = 0;
                  bool first_time = TRUE;
  
                  /* Go through the bytes and find the bit positions that differ
@@ -18509,7 +18772,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                          bits_differing  |= i ^ lowest_cp;
                      }
  
-                    cp_count += this_end - this_start + 1;
+                    full_cp_count += this_end - this_start + 1;
                  }
                  invlist_iterfinish(cp_list);
  
@@ -18526,8 +18789,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                   * a 1 in that position, and another has a 0.  But that would
                   * mean that one of them differs from the lowest code point in
                   * that position, which possibility we've already excluded.  */
-                if (  (inverted || cp_count > 1)
-                    && cp_count == 1U << PL_bitcount[bits_differing])
+                if (  (inverted || full_cp_count > 1)
+                    && full_cp_count == 1U << PL_bitcount[bits_differing])
                  {
                      U8 ANYOFM_mask;
  
@@ -18539,8 +18802,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      /* The argument is the lowest code point */
                      ret = reganode(pRExC_state, op, lowest_cp);
                      FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
-
-                    *flagp |= HASWIDTH|SIMPLE;
                  }
              }
            done_anyofm:
@@ -18552,64 +18813,143 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              if (op != END) {
                  goto not_anyof;
              }
-            }
+        }
  
-            /* Here, didn't find an optimization.  See if this matches any
-             * of the POSIX classes.  The POSIXA ones are about the same speed
-             * as ANYOF ops, but take less room; the ones that have
-             * above-Latin1 code point matches are somewhat faster than ANYOF.
-             * */
+        if (! posixl) {
+            PERL_UINT_FAST8_T type;
+            SV * intersection = NULL;
+            SV* d_invlist = NULL;
  
-            for (posix_class = 0;
-                 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
-                 posix_class++)
-            {
-                int try_inverted;
+            /* See if this matches any of the POSIX classes.  The POSIXA and
+             * POSIXD ones are about the same speed as ANYOF ops, but take less
+             * room; the ones that have above-Latin1 code point matches are
+             * somewhat faster than ANYOF.  */
+
+            for (type = POSIXA; type >= POSIXD; type--) {
+                int posix_class;
  
-                for (try_inverted = 0; try_inverted < 2; try_inverted++)
+                if (type == POSIXL) {   /* But not /l posix classes */
+                    continue;
+                }
+
+                for (posix_class = 0;
+                     posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
+                     posix_class++)
                  {
+                    SV** our_code_points = &cp_list;
+                    SV** official_code_points;
+                    int try_inverted;
+
+                    if (type == POSIXA) {
+                        official_code_points = &PL_Posix_ptrs[posix_class];
+                    }
+                    else {
+                        official_code_points = &PL_XPosix_ptrs[posix_class];
+                    }
+
+                    /* Skip non-existent classes of this type.  e.g. \v only
+                     * has an entry in PL_XPosix_ptrs */
+                    if (! *official_code_points) {
+                        continue;
+                    }
+
+                    /* Try both the regular class, and its inversion */
+                    for (try_inverted = 0; try_inverted < 2; try_inverted++) {
+                        bool this_inverted = invert ^ try_inverted;
+
+                        if (type != POSIXD) {
+
+                            /* A class that isn't /d can't match if we have
+                             * /d dependencies */
+                            if (has_runtime_dependency
+                                                    & HAS_D_RUNTIME_DEPENDENCY)
+                            {
+                                continue;
+                            }
+                        }
+                        else /* is /d */ if (! this_inverted) {
+
+                            /* /d classes don't match anything non-ASCII below
+                             * 256 unconditionally (which cp_list contains) */
+                            _invlist_intersection(cp_list, PL_UpperLatin1,
+                                                           &intersection);
+                            if (_invlist_len(intersection) != 0) {
+                                continue;
+                            }
  
-                    /* Check if matches POSIXA, normal or inverted */
-                    if (PL_Posix_ptrs[posix_class]) {
-                        if (_invlistEQ(cp_list,
-                                       PL_Posix_ptrs[posix_class],
+                            SvREFCNT_dec(d_invlist);
+                            d_invlist = invlist_clone(cp_list, NULL);
+
+                            /* But under UTF-8 it turns into using /u rules.
+                             * Add the things it matches under these conditions
+                             * so that we check below that these are identical
+                             * to what the tested class should match */
+                            if (upper_latin1_only_utf8_matches) {
+                                _invlist_union(
+                                            d_invlist,
+                                            upper_latin1_only_utf8_matches,
+                                            &d_invlist);
+                            }
+                            our_code_points = &d_invlist;
+                        }
+                        else {  /* POSIXD, inverted.  If this doesn't have this
+                                   flag set, it isn't /d. */
+                            if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+                            {
+                                continue;
+                            }
+                            our_code_points = &cp_list;
+                        }
+
+                        /* Here, have weeded out some things.  We want to see
+                         * if the list of characters this node contains
+                         * ('*our_code_points') precisely matches those of the
+                         * class we are currently checking against
+                         * ('*official_code_points'). */
+                        if (_invlistEQ(*our_code_points,
+                                       *official_code_points,
                                         try_inverted))
                          {
+                            /* Here, they precisely match.  Optimize this ANYOF
+                             * node into its equivalent POSIX one of the
+                             * correct type, possibly inverted */
                              ret = reg_node(pRExC_state, (try_inverted)
-                                                        ? NPOSIXA
-                                                        : POSIXA);
-                        FLAGS(REGNODE_p(ret)) = posix_class;
-                            *flagp |= HASWIDTH|SIMPLE;
+                                                        ? type + NPOSIXA
+                                                                - POSIXA
+                                                        : type);
+                            FLAGS(REGNODE_p(ret)) = posix_class;
+                            SvREFCNT_dec(d_invlist);
+                            SvREFCNT_dec(intersection);
                              goto not_anyof;
                          }
                      }
-
-                    /* Check if matches POSIXU, normal or inverted */
-                    if (_invlistEQ(cp_list,
-                                   PL_XPosix_ptrs[posix_class],
-                                   try_inverted))
-                    {
-                        ret = reg_node(pRExC_state, (try_inverted)
-                                                    ? NPOSIXU
-                                                    : POSIXU);
-
-                        FLAGS(REGNODE_p(ret)) = posix_class;
-                        *flagp |= HASWIDTH|SIMPLE;
-                        goto not_anyof;
-                    }
                  }
              }
+            SvREFCNT_dec(d_invlist);
+            SvREFCNT_dec(intersection);
+        }
+
+        /* If we didn't find an optimization and there is no need for a
+         * bitmap, use an ANYOFH node to indicate that */
+        if (     start[0] >= NUM_ANYOF_CODE_POINTS
+            && ! LOC
+            && ! upper_latin1_only_utf8_matches)
+        {
+            op = ANYOFH;
          }
      }   /* End of seeing if can optimize it into a different node */
  
-    /* It's going to be an ANYOF node. */
-    op = (use_anyofd)
-         ? ANYOFD
-         : ((posixl)
-            ? ANYOFPOSIXL
-            : ((LOC)
-               ? ANYOFL
-               : ANYOF));
+  is_anyof: /* It's going to be an ANYOF node. */
+    if (op != ANYOFH) {
+        op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
+             ? ANYOFD
+             : ((posixl)
+                ? ANYOFPOSIXL
+                : ((LOC)
+                   ? ANYOFL
+                   : ANYOF));
+    }
+
      ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
      FILL_NODE(ret, op);        /* We set the argument later */
      RExC_emit += 1 + regarglen[op];
@@ -18663,14 +19003,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                    (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
                     ? listsv : NULL,
                    only_utf8_locale_list,
-                  swash, has_user_defined_property);
-
-    *flagp |= HASWIDTH|SIMPLE;
-
-    if (ANYOF_FLAGS(REGNODE_p(ret)) & ANYOF_LOCALE_FLAGS) {
-        RExC_contains_locale = 1;
-    }
-
+                  swash, cBOOL(has_runtime_dependency
+                                                & HAS_USER_DEFINED_PROPERTY));
      return ret;
  
    not_anyof:
@@ -18680,7 +19014,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
      Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
                                             RExC_parse - orig_parse);;
-    SvREFCNT_dec_NN(cp_list);;
+    SvREFCNT_dec(cp_list);;
      return ret;
  }
  
@@ -19119,6 +19453,9 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
  STATIC void
  S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
  {
+    /* 'size' is the delta to add or subtract from the current memory allocated
+     * to the regex engine being constructed */
+
      PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
  
      RExC_size += size;
@@ -19374,7 +19711,7 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state,
              DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
              regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
              Perl_re_printf( aTHX_  "~ %s (%d) %s %s\n",
-                SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(REGNODE_p(scan)),
+                SvPV_nolen_const(RExC_mysv), scan,
                      (temp == NULL ? "->" : ""),
                      (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
              );
@@ -19465,7 +19802,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
              regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
              Perl_re_printf( aTHX_  "~ %s (%d) -> %s\n",
                  SvPV_nolen_const(RExC_mysv),
-                REG_NODE_NUM(REGNODE_p(scan)),
+                scan,
                  PL_reg_name[exact]);
          });
         if (temp == NULL)
@@ -19478,7 +19815,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
          Perl_re_printf( aTHX_
                        "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
                       SvPV_nolen_const(RExC_mysv),
-                     (IV)REG_NODE_NUM(REGNODE_p(val)),
+                     (IV)val,
                       (IV)(val - scan)
          );
      });
@@ -19956,42 +20293,46 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          /* Ready to start outputting.  First, the initial left bracket */
         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
  
-        /* Then all the things that could fit in the bitmap */
-        do_sep = put_charclass_bitmap_innards(sv,
-                                              ANYOF_BITMAP(o),
-                                              bitmap_range_not_in_bitmap,
-                                              only_utf8_locale_invlist,
-                                              o,
-
-                                              /* Can't try inverting for a
-                                               * better display if there are
-                                               * things that haven't been
-                                               * resolved */
-                                              unresolved != NULL);
-        SvREFCNT_dec(bitmap_range_not_in_bitmap);
-
-        /* If there are user-defined properties which haven't been defined yet,
-         * output them.  If the result is not to be inverted, it is clearest to
-         * output them in a separate [] from the bitmap range stuff.  If the
-         * result is to be complemented, we have to show everything in one [],
-         * as the inversion applies to the whole thing.  Use {braces} to
-         * separate them from anything in the bitmap and anything above the
-         * bitmap. */
-        if (unresolved) {
-            if (inverted) {
-                if (! do_sep) { /* If didn't output anything in the bitmap */
-                    sv_catpvs(sv, "^");
+        if (OP(o) != ANYOFH) {
+            /* Then all the things that could fit in the bitmap */
+            do_sep = put_charclass_bitmap_innards(sv,
+                                                  ANYOF_BITMAP(o),
+                                                  bitmap_range_not_in_bitmap,
+                                                  only_utf8_locale_invlist,
+                                                  o,
+
+                                                  /* Can't try inverting for a
+                                                   * better display if there
+                                                   * are things that haven't
+                                                   * been resolved */
+                                                  unresolved != NULL);
+            SvREFCNT_dec(bitmap_range_not_in_bitmap);
+
+            /* If there are user-defined properties which haven't been defined
+             * yet, output them.  If the result is not to be inverted, it is
+             * clearest to output them in a separate [] from the bitmap range
+             * stuff.  If the result is to be complemented, we have to show
+             * everything in one [], as the inversion applies to the whole
+             * thing.  Use {braces} to separate them from anything in the
+             * bitmap and anything above the bitmap. */
+            if (unresolved) {
+                if (inverted) {
+                    if (! do_sep) { /* If didn't output anything in the bitmap
+                                     */
+                        sv_catpvs(sv, "^");
+                    }
+                    sv_catpvs(sv, "{");
                  }
-                sv_catpvs(sv, "{");
-            }
-            else if (do_sep) {
-                Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
-            }
-            sv_catsv(sv, unresolved);
-            if (inverted) {
-                sv_catpvs(sv, "}");
+                else if (do_sep) {
+                    Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
+                                                      PL_colors[0]);
+                }
+                sv_catsv(sv, unresolved);
+                if (inverted) {
+                    sv_catpvs(sv, "}");
+                }
+                do_sep = ! inverted;
              }
-            do_sep = ! inverted;
          }
  
          /* And, finally, add the above-the-bitmap stuff */
@@ -21593,8 +21934,10 @@ Perl_init_uniprops(pTHX)
      PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
      PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
                                              UNI__PERL_FOLDS_TO_MULTI_CHAR]);
-    PL_InMultiCharFold = _new_invlist_C_array(UNI__PERL_IS_IN_MULTI_CHAR_FOLD_invlist);
-    PL_NonFinalFold = _new_invlist_C_array(UNI__PERL_NON_FINAL_FOLDS_invlist);
+    PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
+                                            UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
+    PL_NonFinalFold = _new_invlist_C_array(uni_prop_ptrs[
+                                            UNI__PERL_NON_FINAL_FOLDS]);
  
      PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
      PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);