Properly handle filled /il regnodes and multi-char folds

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 17a5e43..2c5cbfe 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -357,7 +357,7 @@ struct RExC_state_t {
  /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
   * a flag that indicates we need to override /d with /u as a result of
   * something in the pattern.  It should only be used in regards to calling
- * set_regex_charset() or get_regex_charse() */
+ * set_regex_charset() or get_regex_charset() */
  #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
      STMT_START {                                                            \
              if (DEPENDS_SEMANTICS) {                                        \
@@ -1588,7 +1588,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
      unsigned int i;
      const U32 n = ARG(node);
      bool new_node_has_latin1 = FALSE;
-    const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
+    const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
                        ? 0
                        : ANYOF_FLAGS(node);
  
@@ -1643,7 +1643,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
      }
  
      /* Add in the points from the bit map */
-    if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
+    if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
          for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
              if (ANYOF_BITMAP_TEST(node, i)) {
                  unsigned int start = i++;
@@ -1730,7 +1730,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
       * another SSC or a regular ANYOF class.  Can create false positives. */
  
      SV* anded_cp_list;
-    U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
+    U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
                            ? 0
                            : ANYOF_FLAGS(and_with);
      U8  anded_flags;
@@ -1916,7 +1916,7 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
  
      SV* ored_cp_list;
      U8 ored_flags;
-    U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
+    U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
                           ? 0
                           : ANYOF_FLAGS(or_with);
  
@@ -2142,6 +2142,7 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
      populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
  
      set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
+    SvREFCNT_dec(invlist);
  
      /* Make sure is clone-safe */
      ssc->invlist = NULL;
@@ -5864,6 +5865,7 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                  case ANYOFH:
                  case ANYOFHb:
                  case ANYOFHr:
+                case ANYOFHs:
                  case ANYOF:
                     if (flags & SCF_DO_STCLASS_AND)
                         ssc_and(pRExC_state, data->start_class,
@@ -5889,6 +5891,26 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                      break;
                    }
  
+                case ANYOFR:
+                case ANYOFRb:
+                  {
+                    SV* cp_list = NULL;
+
+                    cp_list = _add_range_to_invlist(cp_list,
+                                        ANYOFRbase(scan),
+                                        ANYOFRbase(scan) + ANYOFRdelta(scan));
+
+                    if (flags & SCF_DO_STCLASS_OR) {
+                        ssc_union(data->start_class, cp_list, invert);
+                    }
+                    else if (flags & SCF_DO_STCLASS_AND) {
+                        ssc_intersection(data->start_class, cp_list, invert);
+                    }
+
+                    SvREFCNT_dec_NN(cp_list);
+                    break;
+                  }
+
                 case NPOSIXL:
                      invert = 1;
                      /* FALLTHROUGH */
@@ -10297,6 +10319,28 @@ Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
  
  #endif
  
+PERL_STATIC_INLINE UV
+S_invlist_lowest(SV* const invlist)
+{
+    /* Returns the lowest code point matched by the inversion list 'invlist'.
+     *
+     * The result is ambiguous: 0 is returned both when the lowest code point
+     * really is 0 and when the list is empty.  A caller to whom that
+     * distinction matters must check for emptiness before calling this
+     * function */
+
+    const UV count = _invlist_len(invlist);
+
+    PERL_ARGS_ASSERT_INVLIST_LOWEST;
+
+    /* An empty list yields 0 without its array being examined */
+    if (count == 0) {
+        return 0;
+    }
+    /* Otherwise the first element of the array is the lowest code point */
+    return invlist_array(invlist)[0];
+}
+
  STATIC SV *
  S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
  {
@@ -13774,7 +13818,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             STRLEN len = 0;
             UV ender = 0;
             char *p;
-           char *s;
+           char *s, *old_s = NULL, *old_old_s = NULL;
             char *s0;
              U32 max_string_len = 255;
  
@@ -13796,20 +13840,20 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              U8 node_type = EXACT;
  
              /* Assume the node will be fully used; the excess is given back at
-             * the end.  Under /i, leave enough extra room so that we won't
-             * overflow the buffer when we fold a character which would end up
-             * overflowing the node.   We can't make any other length
-             * assumptions, as a byte input sequence could shrink down. */
+             * the end.  Under /i, we may need to temporarily add the fold of
+             * an extra character or two at the end to check for splitting
+             * multi-char folds, so allocate extra space for that.   We can't
+             * make any other length assumptions, as a byte input sequence
+             * could shrink down. */
              Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
                                                   + ((! FOLD)
                                                      ? 0
-                                                    : 1 * ((UTF)
+                                                    : 2 * ((UTF)
                                                             ? UTF8_MAXBYTES_CASE
                          /* Max non-UTF-8 expansion is 2 */ : 2)));
  
              bool next_is_quantifier;
              char * oldp = NULL;
-            char * old_oldp = NULL;
  
              /* We can convert EXACTF nodes to EXACTFU if they contain only
               * characters that match identically regardless of the target
@@ -13858,6 +13902,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              p = RExC_parse;
              len = 0;
              s = s0;
+            node_type = EXACT;
+            oldp = NULL;
+            maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
+            maybe_SIMPLE = SIMPLE;
+            requires_utf8_target = FALSE;
+            has_ss = FALSE;
+            has_micro_sign = FALSE;
  
            continue_parse:
  
@@ -13888,8 +13939,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                   * The exceptions override this */
                  Size_t added_len = 1;
  
-                old_oldp = oldp;
                 oldp = p;
+                old_old_s = old_s;
+                old_s = s;
  
                  /* White space has already been ignored */
                  assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
@@ -14343,72 +14395,76 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              }
                          }
                      }
-                    else {
-
-                        /* Here is non-UTF8.  First, see if the character's
-                         * fold differs between /d and /u. */
-                        if (PL_fold[ender] != PL_fold_latin1[ender]) {
-                            maybe_exactfu = FALSE;
+                    else { /* Here is non-UTF8. */
+
+                        /* The fold will be one or (rarely) two characters.
+                         * Check that there's room for at least a single one
+                         * before setting any flags, etc.  Because otherwise an
+                         * overflowing character could cause a flag to be set
+                         * even though it doesn't end up in this node.  (For
+                         * the two character fold, we check again, before
+                         * setting any flags) */
+                        if (UNLIKELY(len + 1 > max_string_len)) {
+                            overflowed = TRUE;
+                            break;
                          }
  
  #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
     || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
                                        || UNICODE_DOT_DOT_VERSION > 0)
  
-                        /* On non-ancient Unicode versions, this includes the
-                         * multi-char fold SHARP S to 'ss' */
-
-                        if (   UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)
-                            || (   isALPHA_FOLD_EQ(ender, 's')
-                                && len > 0
-                                && isALPHA_FOLD_EQ(*(s-1), 's')))
-                        {
-                            /* Here, we have one of the following:
-                             *  a)  a SHARP S.  This folds to 'ss' only under
-                             *      /u rules.  If we are in that situation,
-                             *      fold the SHARP S to 'ss'.
-                             *  b)  'ss'.  When under /u, there's nothing
-                             *      special needed to be done here.  The
-                             *      previous iteration handled the first 's',
-                             *      and this iteration will handle the second.
-                             *      If, on the otherhand it's not /u, we have
-                             *      to exclude the possibility of moving to /u,
-                             *      so that we won't generate an unwanted
-                             *      match, unless, at runtime, the target
-                             *      string is in UTF-8.
-                             * */
+                        /* On non-ancient Unicodes, check for the only possible
+                         * multi-char fold  */
+                        if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
  
+                            /* This potential multi-char fold means the node
+                             * can't be simple (because it could match more
+                             * than a single char).  And in some cases it will
+                             * match 'ss', so set that flag */
+                            maybe_SIMPLE = 0;
                              has_ss = TRUE;
-                            maybe_exactfu = FALSE;  /* Can't generate an
-                                                       EXACTFU node (unless we
-                                                       already are in one) */
-                            if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
-                                maybe_SIMPLE = 0;
-                                if (node_type == EXACTFU) {
-
-                                    if (UNLIKELY(len + 2 > max_string_len)) {
-                                        overflowed = TRUE;
-                                        break;
-                                    }
-
-                                    *(s++) = 's';
  
-                                    /* Let the code below add in the extra 's'
-                                     * */
-                                    ender = 's';
-                                    added_len = 2;
+                            /* It can't change to be an EXACTFU (unless already
+                             * is one).  We fold it iff under /u rules. */
+                            if (node_type != EXACTFU) {
+                                maybe_exactfu = FALSE;
+                            }
+                            else {
+                                if (UNLIKELY(len + 2 > max_string_len)) {
+                                    overflowed = TRUE;
+                                    break;
                                  }
+
+                                *(s++) = 's';
+                                *(s++) = 's';
+                                added_len = 2;
+
+                                goto done_with_this_char;
                              }
                          }
+                        else if (   UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
+                                 && LIKELY(len > 0)
+                                 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
+                        {
+                            /* Also, the sequence 'ss' is special when not
+                             * under /u.  If the target string is UTF-8, it
+                             * should match SHARP S; otherwise it won't.  So,
+                             * here we have to exclude the possibility of this
+                             * node moving to /u.*/
+                            has_ss = TRUE;
+                            maybe_exactfu = FALSE;
+                        }
  #endif
+                        /* Here, the fold will be a single character */
  
-                        else if (UNLIKELY(ender == MICRO_SIGN)) {
+                        if (UNLIKELY(ender == MICRO_SIGN)) {
                              has_micro_sign = TRUE;
                          }
+                        else if (PL_fold[ender] != PL_fold_latin1[ender]) {
  
-                        if (UNLIKELY(len + 1 > max_string_len)) {
-                            overflowed = TRUE;
-                            break;
+                            /* If the character's fold differs between /d and
+                             * /u, this can't change to be an EXACTFU node */
+                            maybe_exactfu = FALSE;
                          }
  
                          *(s++) = (DEPENDS_SEMANTICS)
@@ -14424,6 +14480,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      }
                 } /* End of adding current character to the node */
  
+              done_with_this_char:
+
                  len += added_len;
  
                 if (next_is_quantifier) {
@@ -14512,8 +14570,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
                  goto continue_parse;
              }
-            else if (! LOC) {  /* XXX shouldn't /l assume could be a UTF-8
-                                locale, and prepare for that? */
+            else if (FOLD) {
+                bool splittable = FALSE;
+                bool backed_up = FALSE;
+                char * e;
+                char * s_start;
  
                  /* Here is /i.  Running out of room creates a problem if we are
                   * folding, and the split happens in the middle of a
@@ -14526,188 +14587,403 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                   * things that fold to them) as 'ff' and 'ss' are
                   * multi-character folds.
                   *
+                 * The Unicode standard says that multi character folds consist
+                 * of either two or three characters.  That means we would be
+                 * splitting one if the final character in the node is at the
+                 * beginning of either type, or is the second of a three
+                 * character fold.
+                 *
                   * At this point:
-                 *  old_oldp  points to the beginning in the input of the
-                 *              penultimate character in the node.
-                 *  oldp      points to the beginning in the input of the
-                 *              final character in the node.
-                 *  p         points to the beginning in the input of the
-                 *              next character in the input, the one that won't
-                 *              fit in the node.
+                 *  ender     is the code point of the character that won't fit
+                 *            in the node
+                 *  s         points to just beyond the final byte in the node.
+                 *            It's where we would place ender if there were
+                 *            room, and where in fact we do place ender's fold
+                 *            in the code below, as we've over-allocated space
+                 *            for s0 (hence s) to allow for this
+                 *  e         starts at 's' and advances as we append things.
+                 *  old_s     is the same as 's'.  (If ender had fit, 's' would
+                 *            have been advanced to beyond it).
+                 *  old_old_s points to the beginning byte of the final
+                 *            character in the node
+                 *  p         points to the beginning byte in the input of the
+                 *            character beyond 'ender'.
+                 *  oldp      points to the beginning byte in the input of
+                 *            'ender'.
                   *
-                 * We aren't in the middle of a multi-char fold unless the
-                 * final character in the node can appear in a non-final
-                 * position in such a fold.  Very few characters actually
-                 * participate in multi-character folds, and fewer still can be
-                 * in the non-final position.  But it's complicated to know
-                 * here if that final character is folded or not, so skip this
-                 * check */
-
-                           /* Make sure enough space for final char of node,
-                            * first char of following node, and the fold of the
-                            * following char (so we don't have to worry about
-                            * that fold running off the end */
-                U8 foldbuf[UTF8_MAXBYTES_CASE * 5 + 1];
-                STRLEN fold_len;
-                UV folded;
-                char * const sav_oldp = oldp;
-
-                assert(FOLD);
-
-                /* The Unicode standard says that multi character folds consist
-                 * of either two or three characters.  So we create a buffer
-                 * containing a window of three.  The first is the final
-                 * character in the node (folded), and then the two that begin
-                 * the following node.   But if the first character of the
-                 * following node can't be in a non-final fold position, there
-                 * is no need to look at its successor character.  The macros
-                 * used below to check for multi character folds require folded
-                 * inputs, so we have to fold these.  (The fold of p was likely
-                 * calculated in the loop above, but it hasn't beeen saved, and
-                 * khw thinks it would be too entangled to change to do so) */
-
-                if (UTF || LIKELY(UCHARAT(p) != MICRO_SIGN)) {
-                    folded = _to_uni_fold_flags(ender,
-                                                foldbuf,
-                                                &fold_len,
-                                                FOLD_FLAGS_FULL);
+                 * In the case of /il, we haven't folded anything that could be
+                 * affected by the locale.  That means only above-Latin1
+                 * characters that fold to other above-latin1 characters get
+                 * folded at compile time.  To check where a good place to
+                 * split nodes is, everything in it will have to be folded.
+                 * The boolean 'maybe_exactfu' keeps track in /il if there are
+                 * any unfolded characters in the node. */
+                bool need_to_fold_loc = LOC && ! maybe_exactfu;
+
+                /* If we do need to fold the node, we need a place to store the
+                 * folded copy, and a way to map back to the unfolded original
+                 * */
+                char * locfold_buf;
+                Size_t * loc_correspondence;
+
+                if (! need_to_fold_loc) {   /* The normal case.  Just
+                                               initialize to the actual node */
+                    e = s;
+                    s_start = s0;
+                    s = old_old_s;  /* Point to the beginning of the final char
+                                       that fits in the node */
                  }
                  else {
-                    foldbuf[0] = folded = MICRO_SIGN;
-                    fold_len = 1;
-                }
-
-                /* Here, foldbuf contains the fold of the first character in
-                 * the next node.  We may also need the next one (if there is
-                 * one) to get our third, but if the first character folded to
-                 * more than one, those extra one(s) will serve as the third.
-                 * Also, we don't need a third unless the previous one can
-                 * appear in a non-final position in a fold */
-                if (  ((RExC_end - p) > ((UTF) ? UVCHR_SKIP(ender) : 1))
-                    && (fold_len == 1 || (   UTF
-                                          && UVCHR_SKIP(folded) == fold_len))
-                    &&  UNLIKELY(_invlist_contains_cp(PL_NonFinalFold, folded)))
-                {
-                    if (UTF) {
-                        STRLEN next_fold_len;
  
-                        toFOLD_utf8_safe((U8*) p + UTF8SKIP(p),
-                                         (U8*) RExC_end, foldbuf + fold_len,
-                                         &next_fold_len);
-                        fold_len += next_fold_len;
-                    }
-                    else {
-                        if (UNLIKELY(p[1] == LATIN_SMALL_LETTER_SHARP_S)) {
-                            foldbuf[fold_len] = 's';
+                    /* Here, we have filled a /il node, and there are unfolded
+                     * characters in it.  If the runtime locale turns out to be
+                     * UTF-8, there are possible multi-character folds, just
+                     * like when not under /l.  The node hence can't terminate
+                     * in the middle of such a fold.  To determine this, we
+                     * have to create a folded copy of this node.  That means
+                     * reparsing the node, folding everything assuming a UTF-8
+                     * locale.  (If at runtime it isn't such a locale, the
+                     * actions here wouldn't have been necessary, but we have
+                     * to assume the worst case.)  If we find we need to back
+                     * off the folded string, we do so, and then map that
+                     * position back to the original unfolded node, which then
+                     * gets output, truncated at that spot */
+
+                    char * redo_p = RExC_parse;
+                    char * redo_e;
+                    char * old_redo_e;
+
+                    /* Allow enough space assuming a single byte input folds to
+                     * a single byte output, plus assume that the two unparsed
+                     * characters (that we may need) fold to the largest number
+                     * of bytes possible, plus extra for one more worst case
+                     * scenario.  In the loop below, if we start eating into
+                     * that final spare space, we enlarge this initial space */
+                    Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
+
+                    Newxz(locfold_buf, size, char);
+                    Newxz(loc_correspondence, size, Size_t);
+
+                    /* Redo this node's parse, folding into 'locfold_buf' */
+                    redo_p = RExC_parse;
+                    redo_e = locfold_buf;
+                    while (redo_p <= oldp) {
+
+                        old_redo_e = redo_e;
+                        loc_correspondence[redo_e - locfold_buf]
+                                                        = redo_p - RExC_parse;
+
+                        if (UTF) {
+                            Size_t added_len;
+
+                            (void) _to_utf8_fold_flags((U8 *) redo_p,
+                                                       (U8 *) RExC_end,
+                                                       (U8 *) redo_e,
+                                                       &added_len,
+                                                       FOLD_FLAGS_FULL);
+                            redo_e += added_len;
+                            redo_p += UTF8SKIP(redo_p);
                          }
                          else {
-                            foldbuf[fold_len] = toLOWER_L1(p[1]);
+
+                            /* Note that if this code is run on some ancient
+                             * Unicode versions, SHARP S doesn't fold to 'ss',
+                             * but rather than clutter the code with #ifdef's,
+                             * as is done above, we ignore that possibility.
+                             * This is ok because this code doesn't affect what
+                             * gets matched, but merely where the node gets
+                             * split */
+                            if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
+                                *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
+                            }
+                            else {
+                                *redo_e++ = 's';
+                                *redo_e++ = 's';
+                            }
+                            redo_p++;
+                        }
+
+
+                        /* If we're getting so close to the end that a
+                         * worst-case fold in the next character would cause us
+                         * to overflow, grow the buffers, assuming one output
+                         * byte per remaining input byte, plus room for another
+                         * worst case fold */
+                        if (   redo_p <= oldp
+                            && redo_e > locfold_buf + size
+                                                    - (UTF8_MAXBYTES_CASE + 1))
+                        {
+                            Size_t new_size = size
+                                            + (oldp - redo_p)
+                                            + UTF8_MAXBYTES_CASE + 1;
+                            Ptrdiff_t e_offset = redo_e - locfold_buf;
+
+                            Renew(locfold_buf, new_size, char);
+                            Renew(loc_correspondence, new_size, Size_t);
+                            size = new_size;
+
+                            redo_e = locfold_buf + e_offset;
                          }
-                        fold_len++;
                      }
+
+                    /* Set so that things are in terms of the folded, temporary
+                     * string */
+                    s = old_redo_e;
+                    s_start = locfold_buf;
+                    e = redo_e;
+
                  }
  
-                /* Here foldbuf contains the the fold of p, and if appropriate
-                 * that of the character following p in the input. */
+                /* Here, we have 's', 's_start' and 'e' set up to point to the
+                 * input that goes into the node, folded.
+                 *
+                 * If the final character of the node and the fold of ender
+                 * form the first two characters of a three character fold, we
+                 * need to peek ahead at the next (unparsed) character in the
+                 * input to determine if the three actually do form such a
+                 * fold.  Just looking at that character is not generally
+                 * sufficient, as it could be, for example, an escape sequence
+                 * that evaluates to something else, and it needs to be folded.
+                 *
+                 * khw originally thought to just go through the parse loop one
+                 * extra time, but that doesn't work easily as that iteration
+                 * could cause things to think that the parse is over and to
+                 * goto loopdone.  The character could be a '$' for example, or
+                 * the character beyond could be a quantifier, and other
+                 * glitches as well.
+                 *
+                 * The solution used here for peeking ahead is to look at that
+                 * next character.  If it isn't ASCII punctuation, then it will
+                 * be something that continues in an EXACTish node if there
+                 * were space.  We append the fold of it to s, having reserved
+                 * enough room in s0 for the purpose.  If we can't reasonably
+                 * peek ahead, we instead assume the worst case: that it is
+                 * something that would form the completion of a multi-char
+                 * fold.
+                 *
+                 * If we can't split between s and ender, we work backwards
+                 * character-by-character down to s0.  At each current point
+                 * see if we are at the beginning of a multi-char fold.  If so,
+                 * that means we would be splitting the fold across nodes, and
+                 * so we back up one and try again.
+                 *
+                 * If we're not at the beginning, we still could be at the
+                 * final two characters of a (rare) three character fold.  We
+                 * check if the sequence starting at the character before the
+                 * current position (and including the current and next
+                 * characters) is a three character fold.  If not, the node can
+                 * be split here.  If it is, we have to backup two characters
+                 * and try again.
+                 *
+                 * Otherwise, the node can be split at the current position.
+                 *
+                 * The same logic is used for UTF-8 and non-UTF-8 patterns */
+                if (UTF) {
+                    Size_t added_len;
+
+                    /* Append the fold of ender */
+                    (void) _to_uni_fold_flags(
+                        ender,
+                        (U8 *) e,
+                        &added_len,
+                        FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
+                                        ? FOLD_FLAGS_NOMIX_ASCII
+                                        : 0));
+                    e += added_len;
+
+                    /* 's' and the character folded to by ender may be the
+                     * first two of a three-character fold, in which case the
+                     * node should not be split here.  That may mean examining
+                     * the so-far unparsed character starting at 'p'.  But if
+                     * ender folded to more than one character, we already have
+                     * three characters to look at.  Also, we first check if
+                     * the sequence consisting of s and the next character form
+                     * the first two of some three character fold.  If not,
+                     * there's no need to peek ahead. */
+                    if (   added_len <= UTF8SKIP(e - added_len)
+                        && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
+                    {
+                        /* Here, the two do form the beginning of a potential
+                         * three character fold.  The unexamined character may
+                         * or may not complete it.  Peek at it.  It might be
+                         * something that ends the node or an escape sequence,
+                         * in which case we don't know without a lot of work
+                         * what it evaluates to, so we have to assume the worst
+                         * case: that it does complete the fold, and so we
+                         * can't split here.  All such instances will have
+                         * that character be an ASCII punctuation character,
+                         * like a backslash.  So, for that case, backup one and
+                         * drop down to try at that position */
+                        if (isPUNCT(*p)) {
+                            s = (char *) utf8_hop_back((U8 *) s, -1,
+                                       (U8 *) s_start);
+                            backed_up = TRUE;
+                        }
+                        else {
+                            /* Here, since it's not punctuation, it must be a
+                             * real character, and we can append its fold to
+                             * 'e' (having deliberately reserved enough space
+                             * for this eventuality) and drop down to check if
+                             * the three actually do form a folded sequence */
+                            (void) _to_utf8_fold_flags(
+                                (U8 *) p, (U8 *) RExC_end,
+                                (U8 *) e,
+                                &added_len,
+                                FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
+                                                ? FOLD_FLAGS_NOMIX_ASCII
+                                                : 0));
+                            e += added_len;
+                        }
+                    }
  
-                /* Search backwards until find a place that doesn't split a
-                 * multi-char fold */
-                while (1) {
-                    STRLEN s_len;
-                    char s_fold_buf[UTF8_MAXBYTES_CASE];
-                    char * s_fold = s_fold_buf;
+                    /* Here, we either have three characters available in
+                     * sequence starting at 's', or we have two characters and
+                     * know that the following one can't possibly be part of a
+                     * three character fold.  We go through the node backwards
+                     * until we find a place where we can split it without
+                     * breaking apart a multi-character fold.  At any given
+                     * point we have to worry about if such a fold begins at
+                     * the current 's', and also if a three-character fold
+                     * begins at s-1, (containing s and s+1).  Splitting in
+                     * either case would break apart a fold */
+                    do {
+                        char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
+                                                            (U8 *) s_start);
+
+                        /* If this is a multi-char fold, can't split here.
+                         * Back up one char and try again */
+                        if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
+                            s = prev_s;
+                            backed_up = TRUE;
+                            continue;
+                        }
  
-                    if (s <= s0) {
+                        /* If the two characters beginning at 's' are part of a
+                         * three character fold starting at the character
+                         * before s, we can't split either before or after s.
+                         * Backup two chars and try again */
+                        if (   LIKELY(s > s_start)
+                            && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
+                        {
+                            s = prev_s;
+                            s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
+                            backed_up = TRUE;
+                            continue;
+                        }
  
-                        /* There's no safe place in the node to split.  Quit so
-                         * will take the whole node */
-                        oldp = sav_oldp;
+                        /* Here there's no multi-char fold between s and the
+                         * next character following it.  We can split */
+                        splittable = TRUE;
                          break;
-                    }
  
-                    /* Backup 1 character.  The first time through this moves s
-                     * to point to the final character in the node */
-                    if (UTF) {
-                        s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+                    } while (s > s_start); /* End of loops backing up through the node */
+
+                    /* Here we either couldn't find a place to split the node,
+                     * or else we broke out of the loop setting 'splittable' to
+                     * true.  In the latter case, the place to split is between
+                     * the first and second characters in the sequence starting
+                     * at 's' */
+                    if (splittable) {
+                        s += UTF8SKIP(s);
+                    }
+                }
+                else {  /* Pattern not UTF-8 */
+                    if (   ender != LATIN_SMALL_LETTER_SHARP_S
+                        || ASCII_FOLD_RESTRICTED)
+                    {
+                        *e++ = toLOWER_L1(ender);
                      }
                      else {
-                        s--;
+                        *e++ = 's';
+                        *e++ = 's';
                      }
  
-                    /* 's' may or may not be folded; so make sure it is, and
-                     * use just the final character in its fold (should there
-                     * be more than one */
-                    if (UTF) {
-                        toFOLD_utf8_safe((U8*) s,
-                                         (U8*) s + UTF8SKIP(s),
-                                         (U8 *) s_fold_buf, &s_len);
-                        while (s_fold + UTF8SKIP(s_fold) < s_fold_buf + s_len)
-                        {
-                            s_fold += UTF8SKIP(s_fold);
+                    if (   e - s  <= 1
+                        && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
+                    {
+                        if (isPUNCT(*p)) {
+                            s--;
+                            backed_up = TRUE;
+                        }
+                        else {
+                            if (   UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
+                                || ASCII_FOLD_RESTRICTED)
+                            {
+                                *e++ = toLOWER_L1(ender);
+                            }
+                            else {
+                                *e++ = 's';
+                                *e++ = 's';
+                            }
                          }
-                        s_len = UTF8SKIP(s_fold);
                      }
-                    else {
-                        if (UNLIKELY(UCHARAT(s) == LATIN_SMALL_LETTER_SHARP_S))
-                        {
-                            s_fold_buf[0] = 's';
+
+                    do {
+                        if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
+                            s--;
+                            backed_up = TRUE;
+                            continue;
                          }
-                        else {  /* This works for all other non-UTF-8 folds
-                                 */
-                            s_fold_buf[0] = toLOWER_L1(UCHARAT(s));
+
+                        if (   LIKELY(s > s_start)
+                            && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
+                        {
+                            s -= 2;
+                            backed_up = TRUE;
+                            continue;
                          }
-                        s_len = 1;
+
+                        splittable = TRUE;
+                        break;
+
+                    } while (s > s_start);
+
+                    if (splittable) {
+                        s++;
                      }
+                }
  
-                    /* Unshift this character to the beginning of the buffer,
-                     * No longer needed trailing characters are overwritten.
-                     * */
-                    Move(foldbuf, foldbuf + s_len, sizeof(foldbuf) - s_len, U8);
-                    Copy(s_fold, foldbuf, s_len, U8);
-
-                    /* If this isn't a multi-character fold, we have found a
-                     * splittable place.  If this is the final character in the
-                     * node, that means the node is valid as-is, and can quit.
-                     * Otherwise, we note how much we can fill the node before
-                     * coming to a non-splittable position, and go parse it
-                     * again, stopping there. This is done because we know
-                     * where in the output to stop, but we don't have a map to
-                     * where that is in the input.  One could be created, but
-                     * it seems like overkill for such a rare event as we are
-                     * dealing with here */
-                    if (UTF) {
-                        if (! is_MULTI_CHAR_FOLD_utf8_safe(foldbuf,
-                                                foldbuf + UTF8_MAXBYTES_CASE))
-                        {
-                            upper_fill = s + UTF8SKIP(s) - s0;
-                            if (LIKELY(oldp)) {
-                                break;
+                /* Here, we are done backing up.  If we didn't backup at all
+                 * (the likely case), just proceed */
+                if (backed_up) {
+
+                   /* If we did find a place to split, reparse the entire node
+                    * stopping where we have calculated. */
+                    if (splittable) {
+
+                       /* If we created a temporary folded string under /l, we
+                        * have to map that back to the original */
+                        if (need_to_fold_loc) {
+                            upper_fill = loc_correspondence[s - s_start];
+                            Safefree(locfold_buf);
+                            Safefree(loc_correspondence);
+
+                            if (upper_fill == 0) {
+                                FAIL2("panic: loc_correspondence[%d] is 0",
+                                      (int) (s - s_start));
                              }
-                            goto reparse;
                          }
-                    }
-                    else if (! is_MULTI_CHAR_FOLD_latin1_safe(foldbuf,
-                                                foldbuf + UTF8_MAXBYTES_CASE))
-                    {
-                        upper_fill = s + 1 - s0;
-                        if (LIKELY(oldp)) {
-                            break;
+                        else {
+                            upper_fill = s - s0;
                          }
                          goto reparse;
                      }
+                    else if (need_to_fold_loc) {
+                        Safefree(locfold_buf);
+                        Safefree(loc_correspondence);
+                    }
  
-                    oldp = old_oldp;
-                    old_oldp = NULL;
-
-                } /* End of loop backing up through the node */
                      /* Here the node consists entirely of non-final multi-char
                       * folds.  (Likely it is all 'f's or all 's's.)  There's no
                       * decent place to split it, so give up and just take the
                       * whole thing */
-
+                    len = old_s - s0;
+                }
             }   /* End of verifying node ends with an appropriate char */
  
-                p = oldp;
+            /* We need to start the next node at the character that didn't fit
+             * in this one */
+            p = oldp;
  
            loopdone:   /* Jumped to when encounters something that shouldn't be
                           in the node */
@@ -14869,7 +15145,7 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
      assert(PL_regkind[OP(node)] == ANYOF);
  
      /* There is no bitmap for this node type */
-    if (inRANGE(OP(node), ANYOFH, ANYOFHr)) {
+    if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
          return;
      }
  
@@ -16429,7 +16705,11 @@ redo_curchar:
                               well have generated non-portable code points, but
                               they're valid on this machine */
                      FALSE, /* similarly, no need for strict */
-                    FALSE, /* Require return to be an ANYOF */
+
+                    /* We can optimize into something besides an ANYOF, except
+                     * under /l, which needs to be ANYOF because of runtime
+                     * checks for locale sanity, etc */
+                  ! in_locale,
                      NULL
                  );
  
@@ -16658,6 +16938,22 @@ S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
      UPDATE_WARNINGS_LOC(RExC_parse);
  }
  
+Size_t PERL_STATIC_INLINE
+S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
+{   /* Returns the offset of the first byte where 's1' and 's2' differ */
+    const U8 * const start = s1;    /* Kept to compute the returned offset */
+    const U8 * const send = start + max;  /* Look at no more than 'max' bytes */
+
+    PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
+
+    while (s1 < send && *s1  == *s2) {  /* 's2' is read in step with 's1' */
+        s1++; s2++;
+    }
+
+    return s1 - start;  /* Is 'max' if the first 'max' bytes are identical */
+}
+
+
  STATIC AV *
  S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
  {
@@ -18331,14 +18627,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                   |= ANYOFL_FOLD
                   |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
          }
-        else if (cp_list) { /* Look to see if a 0-255 code point is in list */
-            UV start, end;
-            invlist_iterinit(cp_list);
-            if (invlist_iternext(cp_list, &start, &end) && start < 256) {
-                anyof_flags |= ANYOFL_FOLD;
-                has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
-            }
-            invlist_iterfinish(cp_list);
+        else if (cp_list && invlist_lowest(cp_list) < 256) {
+            /* If nothing is below 256, has no locale dependency; otherwise it
+             * does */
+            anyof_flags |= ANYOFL_FOLD;
+            has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
          }
      }
      else if (   DEPENDS_SEMANTICS
@@ -18383,9 +18676,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
      if (optimizable) {
          PERL_UINT_FAST8_T i;
-        Size_t partial_cp_count = 0;
+        UV partial_cp_count = 0;
          UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
          UV   end[MAX_FOLD_FROMS+1] = { 0 };
+        bool single_range = FALSE;
  
          if (cp_list) { /* Count the code points in enough ranges that we would
                            see all the ones possible in any fold in this version
@@ -18399,6 +18693,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  partial_cp_count += end[i] - start[i] + 1;
              }
  
+            if (i == 1) {
+                single_range = TRUE;
+            }
              invlist_iterfinish(cp_list);
          }
  
@@ -18559,21 +18856,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
           * participates in no fold whatsoever, and having it EXACT tells the
           * optimizer the target string cannot match unless it has a colon in
           * it.
-         *
-         * We don't typically generate an EXACTish node if doing so would
-         * require changing the pattern to UTF-8, as that affects /d and
-         * otherwise is slower.  However, under /i, not changing to UTF-8 can
-         * miss some potential multi-character folds.  We calculate the
-         * EXACTish node, and then decide if something would be missed if we
-         * don't upgrade */
+         */
          if (   ! posixl
              && ! invert
  
                  /* Only try if there are no more code points in the class than
                   * in the max possible fold */
-            &&   partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1
-
-            && (start[0] < 256 || UTF || FOLD))
+            &&   partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1)
          {
              if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
              {
@@ -18582,10 +18871,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
                  if (LOC) {
  
-                    /* Here is /l:  Use EXACTL, except /li indicates EXACTFL,
-                     * as that means there is a fold not known until runtime so
-                     * shows as only a single code point here. */
-                    op = (FOLD) ? EXACTFL : EXACTL;
+                    /* Here is /l:  Use EXACTL, except if there is a fold not
+                     * known until runtime so shows as only a single code point
+                     * here.  For code points above 255, we know which can
+                     * cause problems by having a potential fold to the Latin1
+                     * range. */
+                    if (  ! FOLD
+                        || (     start[0] > 255
+                            && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
+                    {
+                        op = EXACTL;
+                    }
+                    else {
+                        op = EXACTFL;
+                    }
                  }
                  else if (! FOLD) { /* Not /l and not /i */
                      op = (start[0] < 256) ? EXACT : EXACT_REQ8;
@@ -18835,45 +19134,43 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              }
  
              if (op != END) {
+                U8 len;
  
-                /* Here, we have calculated what EXACTish node we would use.
-                 * But we don't use it if it would require converting the
-                 * pattern to UTF-8, unless not using it could cause us to miss
-                 * some folds (hence be buggy) */
-
-                if (! UTF && value > 255) {
-                    SV * in_multis = NULL;
-
-                    assert(FOLD);
-
-                    /* If there is no code point that is part of a multi-char
-                     * fold, then there aren't any matches, so we don't do this
-                     * optimization.  Otherwise, it could match depending on
-                     * the context around us, so we do upgrade */
-                    _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis);
-                    if (UNLIKELY(_invlist_len(in_multis) != 0)) {
+                /* Here, we have calculated what EXACTish node to use.  Have to
+                 * convert to UTF-8 if not already there */
+                if (value > 255) {
+                    if (! UTF) {
+                        SvREFCNT_dec(cp_list);;
                          REQUIRE_UTF8(flagp);
                      }
-                    else {
-                        op = END;
+
+                    /* This is a kludge for the special casing issues with this
+                     * ligature under /aa.  FB05 should fold to FB06, but the
+                     * call above to _to_uni_fold_flags() didn't find this, as
+                     * it didn't use the /aa restriction in order to not miss
+                     * other folds that would be affected.  This is the only
+                     * instance likely to ever be a problem in all of Unicode.
+                     * So special case it. */
+                    if (   value == LATIN_SMALL_LIGATURE_LONG_S_T
+                        && ASCII_FOLD_RESTRICTED)
+                    {
+                        value = LATIN_SMALL_LIGATURE_ST;
                      }
                  }
  
-                if (op != END) {
-                    U8 len = (UTF) ? UVCHR_SKIP(value) : 1;
+                len = (UTF) ? UVCHR_SKIP(value) : 1;
  
-                    ret = regnode_guts(pRExC_state, op, len, "exact");
-                    FILL_NODE(ret, op);
-                    RExC_emit += 1 + STR_SZ(len);
-                    setSTR_LEN(REGNODE_p(ret), len);
-                    if (len == 1) {
-                        *STRING(REGNODE_p(ret)) = (U8) value;
-                    }
-                    else {
-                        uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
-                    }
-                    goto not_anyof;
+                ret = regnode_guts(pRExC_state, op, len, "exact");
+                FILL_NODE(ret, op);
+                RExC_emit += 1 + STR_SZ(len);
+                setSTR_LEN(REGNODE_p(ret), len);
+                if (len == 1) {
+                    *STRINGs(REGNODE_p(ret)) = (U8) value;
+                }
+                else {
+                    uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
                  }
+                goto not_anyof;
              }
          }
  
@@ -19116,6 +19413,52 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              SvREFCNT_dec(intersection);
          }
  
+        /* If it is a single contiguous range, ANYOFR is an efficient regnode,
+         * both in size and speed.  Currently, a 20 bit range base (smallest
+         * code point in the range), and a 12 bit maximum delta are packed into
+         * a 32 bit word.  This allows for using it on all of the Unicode code
+         * points except for the highest plane, which is only for private use
+         * code points.  khw doubts that a bigger delta is likely in real world
+         * applications */
+        if (     single_range
+            && ! has_runtime_dependency
+            &&   anyof_flags == 0
+            &&   start[0] < (1 << ANYOFR_BASE_BITS)
+            &&   end[0] - start[0]
+                    < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
+                                   * CHARBITS - ANYOFR_BASE_BITS))))
+
+        {
+            U8 low_utf8[UTF8_MAXBYTES+1];
+            U8 high_utf8[UTF8_MAXBYTES+1];
+
+            ret = reganode(pRExC_state, ANYOFR,
+                        (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
+
+            /* Place the lowest UTF-8 start byte in the flags field, so as to
+             * allow efficient ruling out at run time of many possible inputs.
+             * */
+            (void) uvchr_to_utf8(low_utf8, start[0]);
+            (void) uvchr_to_utf8(high_utf8, end[0]);
+
+            /* If all code points share the same first byte, this can be an
+             * ANYOFRb.  Otherwise store the lowest UTF-8 start byte which can
+             * quickly rule out many inputs at run-time without having to
+             * compute the code point from UTF-8.  For EBCDIC, we use I8, as
+             * not doing that transformation would not rule out nearly so many
+             * things */
+            if (low_utf8[0] == high_utf8[0]) {
+                OP(REGNODE_p(ret)) = ANYOFRb;
+                ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
+            }
+            else {
+                ANYOF_FLAGS(REGNODE_p(ret))
+                                    = NATIVE_UTF8_TO_I8(low_utf8[0]);
+            }
+
+            goto not_anyof;
+        }
+
          /* If didn't find an optimization and there is no need for a bitmap,
           * optimize to indicate that */
          if (     start[0] >= NUM_ANYOF_CODE_POINTS
@@ -19126,14 +19469,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              U8 low_utf8[UTF8_MAXBYTES+1];
              UV highest_cp = invlist_highest(cp_list);
  
-            op = ANYOFH;
-
              /* Currently the maximum allowed code point by the system is
               * IV_MAX.  Higher ones are reserved for future internal use.  This
               * particular regnode can be used for higher ones, but we can't
               * calculate the code point of those.  IV_MAX suffices though, as
               * it will be a large first byte */
-            (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+            Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
+                           - low_utf8;
  
              /* We store the lowest possible first byte of the UTF-8
               * representation, using the flags field.  This allows for quick
@@ -19142,23 +19484,51 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
               * transformation would not rule out nearly so many things */
              anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
  
+            op = ANYOFH;
+
              /* If the first UTF-8 start byte for the highest code point in the
               * range is suitably small, we may be able to get an upper bound as
               * well */
              if (highest_cp <= IV_MAX) {
                  U8 high_utf8[UTF8_MAXBYTES+1];
-
-                (void) uvchr_to_utf8(high_utf8, highest_cp);
+                Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
+                                - high_utf8;
  
                  /* If the lowest and highest are the same, we can get an exact
-                 * first byte instead of a just minimum.  We signal this with a
-                 * different regnode */
+                 * first byte instead of just a minimum, or even a sequence of
+                 * exact leading bytes.  We signal these with different
+                 * regnodes */
                  if (low_utf8[0] == high_utf8[0]) {
+                    Size_t len = find_first_differing_byte_pos(low_utf8,
+                                                               high_utf8,
+                                                       MIN(low_len, high_len));
  
-                    /* No need to convert to I8 for EBCDIC as this is an exact
-                     * match */
-                    anyof_flags = low_utf8[0];
-                    op = ANYOFHb;
+                    if (len == 1) {
+
+                        /* No need to convert to I8 for EBCDIC as this is an
+                         * exact match */
+                        anyof_flags = low_utf8[0];
+                        op = ANYOFHb;
+                    }
+                    else {
+                        op = ANYOFHs;
+                        ret = regnode_guts(pRExC_state, op,
+                                           regarglen[op] + STR_SZ(len),
+                                           "anyofhs");
+                        FILL_NODE(ret, op);
+                        RExC_emit += 1 + regarglen[op]
+                                   - 1 + STR_SZ(len); /* Replace the [1]
+                                                         element of the struct
+                                                         by the real value */
+                        REGNODE_p(ret)->flags = len;
+                        Copy(low_utf8,  /* Add the common bytes */
+                           ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
+                           len, U8);
+                        NEXT_OFF(REGNODE_p(ret)) = regarglen[op] + STR_SZ(len);
+                        set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
+                                                  NULL, only_utf8_locale_list);
+                        goto not_anyof;
+                    }
                  }
                  else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
                  {
@@ -19244,8 +19614,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
      set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
                    (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
-                   ? listsv : NULL,
+                   ? listsv
+                   : NULL,
                    only_utf8_locale_list);
+    SvREFCNT_dec(cp_list);;
+    SvREFCNT_dec(only_utf8_locale_list);
      return ret;
  
    not_anyof:
@@ -19256,6 +19629,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
                                             RExC_parse - orig_parse);;
      SvREFCNT_dec(cp_list);;
+    SvREFCNT_dec(only_utf8_locale_list);
      return ret;
  }
  
@@ -19296,11 +19670,12 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
         SV *rv;
  
          if (cp_list) {
-            av_store(av, INVLIST_INDEX, cp_list);
+            av_store(av, INVLIST_INDEX, SvREFCNT_inc(cp_list));
          }
  
          if (only_utf8_locale_list) {
-            av_store(av, ONLY_LOCALE_MATCHES_INDEX, only_utf8_locale_list);
+            av_store(av, ONLY_LOCALE_MATCHES_INDEX,
+                                          SvREFCNT_inc(only_utf8_locale_list));
          }
  
          if (runtime_defns) {
@@ -19713,7 +20088,7 @@ STATIC regnode_offset
  S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
  {
      /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
-     * equivalents space.  It aligns and increments RExC_size and RExC_emit
+     * equivalents space.  It aligns and increments RExC_size
       *
       * It returns the regnode's offset into the regex engine program */
  
@@ -20417,6 +20792,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                                                  NULL,
                                                  NULL,
                                                  NULL,
+                                                0,
                                                  FALSE
                                                 );
              sv_catpvs(sv, "]");
@@ -20499,10 +20875,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
      else if (k == LOGICAL)
          /* 2: embedded, otherwise 1 */
         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
-    else if (k == ANYOF) {
-       const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
-                          ? 0
-                          : ANYOF_FLAGS(o);
+    else if (k == ANYOF || k == ANYOFR) {
+        U8 flags;
+        char * bitmap;
+        U32 arg;
          bool do_sep = FALSE;    /* Do we need to separate various components of
                                     the output? */
          /* Set if there is still an unresolved user-defined property */
@@ -20517,7 +20893,18 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          /* And things that aren't in the bitmap, but are small enough to be */
          SV* bitmap_range_not_in_bitmap = NULL;
  
-        const bool inverted = flags & ANYOF_INVERT;
+        bool inverted;
+
+        if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
+            flags = 0;
+            bitmap = NULL;
+            arg = 0;
+        }
+        else {
+            flags = ANYOF_FLAGS(o);
+            bitmap = ANYOF_BITMAP(o);
+            arg = ARG(o);
+        }
  
         if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
              if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
@@ -20528,17 +20915,27 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
              }
          }
  
+        inverted = flags & ANYOF_INVERT;
+
          /* If there is stuff outside the bitmap, get it */
-        if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
-            (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
+        if (arg != ANYOF_ONLY_HAS_BITMAP) {
+            if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
+                nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
+                                            ANYOFRbase(o),
+                                            ANYOFRbase(o) + ANYOFRdelta(o));
+            }
+            else {
+                (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
                                                  &unresolved,
                                                  &only_utf8_locale_invlist,
                                                  &nonbitmap_invlist);
+            }
+
              /* The non-bitmap data may contain stuff that could fit in the
               * bitmap.  This could come from a user-defined property being
               * finally resolved when this call was done; or much more likely
               * because there are matches that require UTF-8 to be valid, and so
-             * aren't in the bitmap.  This is teased apart later */
+             * aren't in the bitmap (or ANYOFR).  This is teased apart later */
              _invlist_intersection(nonbitmap_invlist,
                                    PL_InBitmap,
                                    &bitmap_range_not_in_bitmap);
@@ -20558,19 +20955,26 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          /* Ready to start outputting.  First, the initial left bracket */
         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
  
-        if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+        /* ANYOFH by definition doesn't have anything that will fit inside the
+         * bitmap;  ANYOFR may or may not. */
+        if (  ! inRANGE(OP(o), ANYOFH, ANYOFHr)
+            && (   ! inRANGE(OP(o), ANYOFR, ANYOFRb)
+                ||   ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
+        {
              /* Then all the things that could fit in the bitmap */
              do_sep = put_charclass_bitmap_innards(sv,
-                                                  ANYOF_BITMAP(o),
+                                                  bitmap,
                                                    bitmap_range_not_in_bitmap,
                                                    only_utf8_locale_invlist,
                                                    o,
+                                                  flags,
  
                                                    /* Can't try inverting for a
                                                     * better display if there
                                                     * are things that haven't
                                                     * been resolved */
-                                                  unresolved != NULL);
+                                                  unresolved != NULL
+                                            || inRANGE(OP(o), ANYOFR, ANYOFRb));
              SvREFCNT_dec(bitmap_range_not_in_bitmap);
  
              /* If there are user-defined properties which haven't been defined
@@ -20656,15 +21060,18 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          /* And finally the matching, closing ']' */
         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
  
-        if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+        if (OP(o) == ANYOFHs) {
+            Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
+        }
+        else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
              U8 lowest = (OP(o) != ANYOFHr)
                           ? FLAGS(o)
                           : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
-            U8 highest = (OP(o) == ANYOFHb)
-                         ? lowest
-                         : OP(o) == ANYOFH
+            U8 highest = (OP(o) == ANYOFHr)
+                         ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
+                         : (OP(o) == ANYOFH || OP(o) == ANYOFR)
                             ? 0xFF
-                           : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
+                           : lowest;
              Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
              if (lowest != highest) {
                  Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
@@ -20682,7 +21089,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
              _invlist_invert(cp_list);
          }
  
-        put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
+        put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
  
          SvREFCNT_dec(cp_list);
@@ -21763,6 +22170,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
                                       SV *nonbitmap_invlist,
                                       SV *only_utf8_locale_invlist,
                                       const regnode * const node,
+                                     const U8 flags,
                                       const bool force_as_is_display)
  {
      /* Appends to 'sv' a displayable version of the innards of the bracketed
@@ -21779,6 +22187,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
       *  'node' is the regex pattern ANYOF node.  It is needed only when the
       *      above two parameters are not null, and is passed so that this
       *      routine can tease apart the various reasons for them.
+     *  'flags' is the flags field of 'node'
       *  'force_as_is_display' is TRUE if this routine should definitely NOT try
       *      to invert things to see if that leads to a cleaner display.  If
       *      FALSE, this routine is free to use its judgment about doing this.
@@ -21817,8 +22226,6 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
                                 literally */
      SV* inverted_display;   /* The output string when we invert the inputs */
  
-    U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
-
      bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
                                                     to match? */
      /* We are biased in favor of displaying things without them being inverted,
@@ -22262,9 +22669,6 @@ Perl_init_uniprops(pTHX)
                                              UNI__PERL_FOLDS_TO_MULTI_CHAR]);
      PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
                                              UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
-    PL_NonFinalFold = _new_invlist_C_array(uni_prop_ptrs[
-                                            UNI__PERL_NON_FINAL_FOLDS]);
-
      PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
      PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
      PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);