This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Properly handle filled /il regnodes and multi-char folds
[perl5.git] / regcomp.c
index 7d905b1..2c5cbfe 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -357,7 +357,7 @@ struct RExC_state_t {
 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
  * a flag that indicates we need to override /d with /u as a result of
  * something in the pattern.  It should only be used in regards to calling
- * set_regex_charset() or get_regex_charse() */
+ * set_regex_charset() or get_regex_charset() */
 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
     STMT_START {                                                            \
             if (DEPENDS_SEMANTICS) {                                        \
@@ -1588,7 +1588,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
     unsigned int i;
     const U32 n = ARG(node);
     bool new_node_has_latin1 = FALSE;
-    const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
+    const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
                       ? 0
                       : ANYOF_FLAGS(node);
 
@@ -1643,7 +1643,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
     }
 
     /* Add in the points from the bit map */
-    if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
+    if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
             if (ANYOF_BITMAP_TEST(node, i)) {
                 unsigned int start = i++;
@@ -1730,7 +1730,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
      * another SSC or a regular ANYOF class.  Can create false positives. */
 
     SV* anded_cp_list;
-    U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
+    U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
                           ? 0
                           : ANYOF_FLAGS(and_with);
     U8  anded_flags;
@@ -1916,7 +1916,7 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
 
     SV* ored_cp_list;
     U8 ored_flags;
-    U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
+    U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
                          ? 0
                          : ANYOF_FLAGS(or_with);
 
@@ -2142,6 +2142,7 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
 
     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
+    SvREFCNT_dec(invlist);
 
     /* Make sure is clone-safe */
     ssc->invlist = NULL;
@@ -5864,6 +5865,7 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                 case ANYOFH:
                 case ANYOFHb:
                 case ANYOFHr:
+                case ANYOFHs:
                 case ANYOF:
                    if (flags & SCF_DO_STCLASS_AND)
                        ssc_and(pRExC_state, data->start_class,
@@ -5889,6 +5891,26 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                     break;
                   }
 
+                case ANYOFR:
+                case ANYOFRb:
+                  {
+                    SV* cp_list = NULL;
+
+                    cp_list = _add_range_to_invlist(cp_list,
+                                        ANYOFRbase(scan),
+                                        ANYOFRbase(scan) + ANYOFRdelta(scan));
+
+                    if (flags & SCF_DO_STCLASS_OR) {
+                        ssc_union(data->start_class, cp_list, invert);
+                    }
+                    else if (flags & SCF_DO_STCLASS_AND) {
+                        ssc_intersection(data->start_class, cp_list, invert);
+                    }
+
+                    SvREFCNT_dec_NN(cp_list);
+                    break;
+                  }
+
                case NPOSIXL:
                     invert = 1;
                     /* FALLTHROUGH */
@@ -10297,6 +10319,28 @@ Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
 
 #endif
 
+PERL_STATIC_INLINE UV
+S_invlist_lowest(SV* const invlist)
+{
+    /* Returns the lowest code point that matches an inversion list.  This API
+     * has an ambiguity, as it returns 0 under either the lowest is actually
+     * 0, or if the list is empty.  If this distinction matters to you, check
+     * for emptiness before calling this function */
+
+    UV len = _invlist_len(invlist);
+    UV *array;
+
+    PERL_ARGS_ASSERT_INVLIST_LOWEST;
+
+    if (len == 0) {
+        return 0;
+    }
+
+    array = invlist_array(invlist);
+
+    return array[0];
+}
+
 STATIC SV *
 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
 {
@@ -14526,13 +14570,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
                 goto continue_parse;
             }
-            else if (! LOC) {  /* XXX shouldn't /l assume could be a UTF-8
-                                locale, and prepare for that? */
+            else if (FOLD) {
                 bool splittable = FALSE;
                 bool backed_up = FALSE;
-                char * e = s;
-
-                assert(FOLD);
+                char * e;
+                char * s_start;
 
                 /* Here is /i.  Running out of room creates a problem if we are
                  * folding, and the split happens in the middle of a
@@ -14569,6 +14611,132 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  *  oldp      points to the beginning byte in the input of
                  *            'ender'.
                  *
+                 * In the case of /il, we haven't folded anything that could be
+                 * affected by the locale.  That means only above-Latin1
+                 * characters that fold to other above-latin1 characters get
+                 * folded at compile time.  To check where a good place to
+                 * split nodes is, everything in it will have to be folded.
+                 * The boolean 'maybe_exactfu' keeps track in /il if there are
+                 * any unfolded characters in the node. */
+                bool need_to_fold_loc = LOC && ! maybe_exactfu;
+
+                /* If we do need to fold the node, we need a place to store the
+                 * folded copy, and a way to map back to the unfolded original
+                 * */
+                char * locfold_buf;
+                Size_t * loc_correspondence;
+
+                if (! need_to_fold_loc) {   /* The normal case.  Just
+                                               initialize to the actual node */
+                    e = s;
+                    s_start = s0;
+                    s = old_old_s;  /* Point to the beginning of the final char
+                                       that fits in the node */
+                }
+                else {
+
+                    /* Here, we have filled a /il node, and there are unfolded
+                     * characters in it.  If the runtime locale turns out to be
+                     * UTF-8, there are possible multi-character folds, just
+                     * like when not under /l.  The node hence can't terminate
+                     * in the middle of such a fold.  To determine this, we
+                     * have to create a folded copy of this node.  That means
+                     * reparsing the node, folding everything assuming a UTF-8
+                     * locale.  (If at runtime it isn't such a locale, the
+                     * actions here wouldn't have been necessary, but we have
+                     * to assume the worst case.)  If we find we need to back
+                     * off the folded string, we do so, and then map that
+                     * position back to the original unfolded node, which then
+                     * gets output, truncated at that spot */
+
+                    char * redo_p = RExC_parse;
+                    char * redo_e;
+                    char * old_redo_e;
+
+                    /* Allow enough space assuming a single byte input folds to
+                     * a single byte output, plus assume that the two unparsed
+                     * characters (that we may need) fold to the largest number
+                     * of bytes possible, plus extra for one more worst case
+                     * scenario.  In the loop below, if we start eating into
+                     * that final spare space, we enlarge this initial space */
+                    Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
+
+                    Newxz(locfold_buf, size, char);
+                    Newxz(loc_correspondence, size, Size_t);
+
+                    /* Redo this node's parse, folding into 'locfold_buf' */
+                    redo_p = RExC_parse;
+                    redo_e = locfold_buf;
+                    while (redo_p <= oldp) {
+
+                        old_redo_e = redo_e;
+                        loc_correspondence[redo_e - locfold_buf]
+                                                        = redo_p - RExC_parse;
+
+                        if (UTF) {
+                            Size_t added_len;
+
+                            (void) _to_utf8_fold_flags((U8 *) redo_p,
+                                                       (U8 *) RExC_end,
+                                                       (U8 *) redo_e,
+                                                       &added_len,
+                                                       FOLD_FLAGS_FULL);
+                            redo_e += added_len;
+                            redo_p += UTF8SKIP(redo_p);
+                        }
+                        else {
+
+                            /* Note that if this code is run on some ancient
+                             * Unicode versions, SHARP S doesn't fold to 'ss',
+                             * but rather than clutter the code with #ifdef's,
+                             * as is done above, we ignore that possibility.
+                             * This is ok because this code doesn't affect what
+                             * gets matched, but merely where the node gets
+                             * split */
+                            if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
+                                *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
+                            }
+                            else {
+                                *redo_e++ = 's';
+                                *redo_e++ = 's';
+                            }
+                            redo_p++;
+                        }
+
+
+                        /* If we're getting so close to the end that a
+                         * worst-case fold in the next character would cause us
+                         * to overflow, increase, assuming one byte output byte
+                         * per one byte input one, plus room for another worst
+                         * case fold */
+                        if (   redo_p <= oldp
+                            && redo_e > locfold_buf + size
+                                                    - (UTF8_MAXBYTES_CASE + 1))
+                        {
+                            Size_t new_size = size
+                                            + (oldp - redo_p)
+                                            + UTF8_MAXBYTES_CASE + 1;
+                            Ptrdiff_t e_offset = redo_e - locfold_buf;
+
+                            Renew(locfold_buf, new_size, char);
+                            Renew(loc_correspondence, new_size, Size_t);
+                            size = new_size;
+
+                            redo_e = locfold_buf + e_offset;
+                        }
+                    }
+
+                    /* Set so that things are in terms of the folded, temporary
+                     * string */
+                    s = old_redo_e;
+                    s_start = locfold_buf;
+                    e = redo_e;
+
+                }
+
+                /* Here, we have 's', 's_start' and 'e' set up to point to the
+                 * input that goes into the node, folded.
+                 *
                  * If the final character of the node and the fold of ender
                  * form the first two characters of a three character fold, we
                  * need to peek ahead at the next (unparsed) character in the
@@ -14608,11 +14776,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  * and try again.
                  *
                  * Otherwise, the node can be split at the current position.
-                 */
-                s = old_old_s;  /* Point to the beginning of the final char
-                                   that fits in the node */
-
-                /* The same logic is used for UTF-8 patterns and not */
+                 *
+                 * The same logic is used for UTF-8 patterns and not */
                 if (UTF) {
                     Size_t added_len;
 
@@ -14651,7 +14816,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          * drop down to try at that position */
                         if (isPUNCT(*p)) {
                             s = (char *) utf8_hop_back((U8 *) s, -1,
-                                       (U8 *) s0);
+                                       (U8 *) s_start);
                             backed_up = TRUE;
                         }
                         else {
@@ -14683,7 +14848,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      * either case would break apart a fold */
                     do {
                         char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
-                                                                    (U8 *) s0);
+                                                            (U8 *) s_start);
 
                         /* If is a multi-char fold, can't split here.  Backup
                          * one char and try again */
@@ -14697,11 +14862,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          * three character fold starting at the character
                          * before s, we can't split either before or after s.
                          * Backup two chars and try again */
-                        if (   LIKELY(s > s0)
+                        if (   LIKELY(s > s_start)
                             && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
                         {
                             s = prev_s;
-                            s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+                            s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
                             backed_up = TRUE;
                             continue;
                         }
@@ -14711,7 +14876,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         splittable = TRUE;
                         break;
 
-                    } while (s > s0); /* End of loops backing up through the node */
+                    } while (s > s_start); /* End of loops backing up through the node */
 
                     /* Here we either couldn't find a place to split the node,
                      * or else we broke out of the loop setting 'splittable' to
@@ -14760,7 +14925,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             continue;
                         }
 
-                        if (   LIKELY(s > s0)
+                        if (   LIKELY(s > s_start)
                             && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
                         {
                             s -= 2;
@@ -14771,7 +14936,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         splittable = TRUE;
                         break;
 
-                    } while (s > s0);
+                    } while (s > s_start);
 
                     if (splittable) {
                         s++;
@@ -14785,9 +14950,28 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                    /* If we did find a place to split, reparse the entire node
                     * stopping where we have calculated. */
                     if (splittable) {
-                        upper_fill = s - s0;
+
+                       /* If we created a temporary folded string under /l, we
+                        * have to map that back to the original */
+                        if (need_to_fold_loc) {
+                            upper_fill = loc_correspondence[s - s_start];
+                            Safefree(locfold_buf);
+                            Safefree(loc_correspondence);
+
+                            if (upper_fill == 0) {
+                                FAIL2("panic: loc_correspondence[%d] is 0",
+                                      (int) (s - s_start));
+                            }
+                        }
+                        else {
+                            upper_fill = s - s0;
+                        }
                         goto reparse;
                     }
+                    else if (need_to_fold_loc) {
+                        Safefree(locfold_buf);
+                        Safefree(loc_correspondence);
+                    }
 
                     /* Here the node consists entirely of non-final multi-char
                      * folds.  (Likely it is all 'f's or all 's's.)  There's no
@@ -14961,7 +15145,7 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
     assert(PL_regkind[OP(node)] == ANYOF);
 
     /* There is no bitmap for this node type */
-    if (inRANGE(OP(node), ANYOFH, ANYOFHr)) {
+    if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
         return;
     }
 
@@ -16754,6 +16938,22 @@ S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
     UPDATE_WARNINGS_LOC(RExC_parse);
 }
 
+Size_t PERL_STATIC_INLINE
+S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
+{
+    const U8 * const start = s1;
+    const U8 * const send = start + max;
+
+    PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
+
+    while (s1 < send && *s1  == *s2) {
+        s1++; s2++;
+    }
+
+    return s1 - start;
+}
+
+
 STATIC AV *
 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
 {
@@ -18427,14 +18627,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  |= ANYOFL_FOLD
                  |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
         }
-        else if (cp_list) { /* Look to see if a 0-255 code point is in list */
-            UV start, end;
-            invlist_iterinit(cp_list);
-            if (invlist_iternext(cp_list, &start, &end) && start < 256) {
-                anyof_flags |= ANYOFL_FOLD;
-                has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
-            }
-            invlist_iterfinish(cp_list);
+        else if (cp_list && invlist_lowest(cp_list) < 256) {
+            /* If nothing is below 256, has no locale dependency; otherwise it
+             * does */
+            anyof_flags |= ANYOFL_FOLD;
+            has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
         }
     }
     else if (   DEPENDS_SEMANTICS
@@ -18479,9 +18676,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
     if (optimizable) {
         PERL_UINT_FAST8_T i;
-        Size_t partial_cp_count = 0;
+        UV partial_cp_count = 0;
         UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
         UV   end[MAX_FOLD_FROMS+1] = { 0 };
+        bool single_range = FALSE;
 
         if (cp_list) { /* Count the code points in enough ranges that we would
                           see all the ones possible in any fold in this version
@@ -18495,6 +18693,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                 partial_cp_count += end[i] - start[i] + 1;
             }
 
+            if (i == 1) {
+                single_range = TRUE;
+            }
             invlist_iterfinish(cp_list);
         }
 
@@ -18939,7 +19140,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  * convert to UTF-8 if not already there */
                 if (value > 255) {
                     if (! UTF) {
-
                         SvREFCNT_dec(cp_list);;
                         REQUIRE_UTF8(flagp);
                     }
@@ -18960,17 +19160,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
                 len = (UTF) ? UVCHR_SKIP(value) : 1;
 
-                    ret = regnode_guts(pRExC_state, op, len, "exact");
-                    FILL_NODE(ret, op);
-                    RExC_emit += 1 + STR_SZ(len);
-                    setSTR_LEN(REGNODE_p(ret), len);
-                    if (len == 1) {
-                        *STRING(REGNODE_p(ret)) = (U8) value;
-                    }
-                    else {
-                        uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
-                    }
-                    goto not_anyof;
+                ret = regnode_guts(pRExC_state, op, len, "exact");
+                FILL_NODE(ret, op);
+                RExC_emit += 1 + STR_SZ(len);
+                setSTR_LEN(REGNODE_p(ret), len);
+                if (len == 1) {
+                    *STRINGs(REGNODE_p(ret)) = (U8) value;
+                }
+                else {
+                    uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
+                }
+                goto not_anyof;
             }
         }
 
@@ -19213,6 +19413,52 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             SvREFCNT_dec(intersection);
         }
 
+        /* If it is a single contiguous range, ANYOFR is an efficient regnode,
+         * both in size and speed.  Currently, a 20 bit range base (smallest
+         * code point in the range), and a 12 bit maximum delta are packed into
+         * a 32 bit word.  This allows for using it on all of the Unicode code
+         * points except for the highest plane, which is only for private use
+         * code points.  khw doubts that a bigger delta is likely in real world
+         * applications */
+        if (     single_range
+            && ! has_runtime_dependency
+            &&   anyof_flags == 0
+            &&   start[0] < (1 << ANYOFR_BASE_BITS)
+            &&   end[0] - start[0]
+                    < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
+                                   * CHARBITS - ANYOFR_BASE_BITS))))
+
+        {
+            U8 low_utf8[UTF8_MAXBYTES+1];
+            U8 high_utf8[UTF8_MAXBYTES+1];
+
+            ret = reganode(pRExC_state, ANYOFR,
+                        (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
+
+            /* Place the lowest UTF-8 start byte in the flags field, so as to
+             * allow efficient ruling out at run time of many possible inputs.
+             * */
+            (void) uvchr_to_utf8(low_utf8, start[0]);
+            (void) uvchr_to_utf8(high_utf8, end[0]);
+
+            /* If all code points share the same first byte, this can be an
+             * ANYOFRb.  Otherwise store the lowest UTF-8 start byte which can
+             * quickly rule out many inputs at run-time without having to
+             * compute the code point from UTF-8.  For EBCDIC, we use I8, as
+             * not doing that transformation would not rule out nearly so many
+             * things */
+            if (low_utf8[0] == high_utf8[0]) {
+                OP(REGNODE_p(ret)) = ANYOFRb;
+                ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
+            }
+            else {
+                ANYOF_FLAGS(REGNODE_p(ret))
+                                    = NATIVE_UTF8_TO_I8(low_utf8[0]);
+            }
+
+            goto not_anyof;
+        }
+
         /* If didn't find an optimization and there is no need for a bitmap,
          * optimize to indicate that */
         if (     start[0] >= NUM_ANYOF_CODE_POINTS
@@ -19223,14 +19469,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             U8 low_utf8[UTF8_MAXBYTES+1];
             UV highest_cp = invlist_highest(cp_list);
 
-            op = ANYOFH;
-
             /* Currently the maximum allowed code point by the system is
              * IV_MAX.  Higher ones are reserved for future internal use.  This
              * particular regnode can be used for higher ones, but we can't
              * calculate the code point of those.  IV_MAX suffices though, as
              * it will be a large first byte */
-            (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+            Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
+                           - low_utf8;
 
             /* We store the lowest possible first byte of the UTF-8
              * representation, using the flags field.  This allows for quick
@@ -19239,23 +19484,51 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              * transformation would not rule out nearly so many things */
             anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
 
+            op = ANYOFH;
+
             /* If the first UTF-8 start byte for the highest code point in the
              * range is suitably small, we may be able to get an upper bound as
              * well */
             if (highest_cp <= IV_MAX) {
                 U8 high_utf8[UTF8_MAXBYTES+1];
-
-                (void) uvchr_to_utf8(high_utf8, highest_cp);
+                Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
+                                - high_utf8;
 
                 /* If the lowest and highest are the same, we can get an exact
-                 * first byte instead of a just minimum.  We signal this with a
-                 * different regnode */
+                 * first byte instead of a just minimum or even a sequence of
+                 * exact leading bytes.  We signal these with different
+                 * regnodes */
                 if (low_utf8[0] == high_utf8[0]) {
+                    Size_t len = find_first_differing_byte_pos(low_utf8,
+                                                               high_utf8,
+                                                       MIN(low_len, high_len));
+
+                    if (len == 1) {
 
-                    /* No need to convert to I8 for EBCDIC as this is an exact
-                     * match */
-                    anyof_flags = low_utf8[0];
-                    op = ANYOFHb;
+                        /* No need to convert to I8 for EBCDIC as this is an
+                         * exact match */
+                        anyof_flags = low_utf8[0];
+                        op = ANYOFHb;
+                    }
+                    else {
+                        op = ANYOFHs;
+                        ret = regnode_guts(pRExC_state, op,
+                                           regarglen[op] + STR_SZ(len),
+                                           "anyofhs");
+                        FILL_NODE(ret, op);
+                        RExC_emit += 1 + regarglen[op]
+                                   - 1 + STR_SZ(len); /* Replace the [1]
+                                                         element of the struct
+                                                         by the real value */
+                        REGNODE_p(ret)->flags = len;
+                        Copy(low_utf8,  /* Add the common bytes */
+                           ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
+                           len, U8);
+                        NEXT_OFF(REGNODE_p(ret)) = regarglen[op] + STR_SZ(len);
+                        set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
+                                                  NULL, only_utf8_locale_list);
+                        goto not_anyof;
+                    }
                 }
                 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
                 {
@@ -19341,8 +19614,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
-                   ? listsv : NULL,
+                   ? listsv
+                   : NULL,
                   only_utf8_locale_list);
+    SvREFCNT_dec(cp_list);;
+    SvREFCNT_dec(only_utf8_locale_list);
     return ret;
 
   not_anyof:
@@ -19353,6 +19629,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
                                            RExC_parse - orig_parse);;
     SvREFCNT_dec(cp_list);;
+    SvREFCNT_dec(only_utf8_locale_list);
     return ret;
 }
 
@@ -19393,11 +19670,12 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
        SV *rv;
 
         if (cp_list) {
-            av_store(av, INVLIST_INDEX, cp_list);
+            av_store(av, INVLIST_INDEX, SvREFCNT_inc(cp_list));
         }
 
         if (only_utf8_locale_list) {
-            av_store(av, ONLY_LOCALE_MATCHES_INDEX, only_utf8_locale_list);
+            av_store(av, ONLY_LOCALE_MATCHES_INDEX,
+                                          SvREFCNT_inc(only_utf8_locale_list));
         }
 
         if (runtime_defns) {
@@ -19810,7 +20088,7 @@ STATIC regnode_offset
 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
 {
     /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
-     * equivalents space.  It aligns and increments RExC_size and RExC_emit
+     * equivalents space.  It aligns and increments RExC_size
      *
      * It returns the regnode's offset into the regex engine program */
 
@@ -20597,10 +20875,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
     else if (k == LOGICAL)
         /* 2: embedded, otherwise 1 */
        Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
-    else if (k == ANYOF) {
-       const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
-                          ? 0
-                          : ANYOF_FLAGS(o);
+    else if (k == ANYOF || k == ANYOFR) {
+        U8 flags;
+        char * bitmap;
+        U32 arg;
         bool do_sep = FALSE;    /* Do we need to separate various components of
                                    the output? */
         /* Set if there is still an unresolved user-defined property */
@@ -20615,7 +20893,18 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         /* And things that aren't in the bitmap, but are small enough to be */
         SV* bitmap_range_not_in_bitmap = NULL;
 
-        const bool inverted = flags & ANYOF_INVERT;
+        bool inverted;
+
+        if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
+            flags = 0;
+            bitmap = NULL;
+            arg = 0;
+        }
+        else {
+            flags = ANYOF_FLAGS(o);
+            bitmap = ANYOF_BITMAP(o);
+            arg = ARG(o);
+        }
 
        if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
@@ -20626,17 +20915,27 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
             }
         }
 
+        inverted = flags & ANYOF_INVERT;
+
         /* If there is stuff outside the bitmap, get it */
-        if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
-            (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
+        if (arg != ANYOF_ONLY_HAS_BITMAP) {
+            if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
+                nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
+                                            ANYOFRbase(o),
+                                            ANYOFRbase(o) + ANYOFRdelta(o));
+            }
+            else {
+                (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
                                                 &unresolved,
                                                 &only_utf8_locale_invlist,
                                                 &nonbitmap_invlist);
+            }
+
             /* The non-bitmap data may contain stuff that could fit in the
              * bitmap.  This could come from a user-defined property being
              * finally resolved when this call was done; or much more likely
              * because there are matches that require UTF-8 to be valid, and so
-             * aren't in the bitmap.  This is teased apart later */
+             * aren't in the bitmap (or ANYOFR).  This is teased apart later */
             _invlist_intersection(nonbitmap_invlist,
                                   PL_InBitmap,
                                   &bitmap_range_not_in_bitmap);
@@ -20656,10 +20955,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         /* Ready to start outputting.  First, the initial left bracket */
        Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
 
-        if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+        /* ANYOFH by definition doesn't have anything that will fit inside the
+         * bitmap;  ANYOFR may or may not. */
+        if (  ! inRANGE(OP(o), ANYOFH, ANYOFHr)
+            && (   ! inRANGE(OP(o), ANYOFR, ANYOFRb)
+                ||   ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
+        {
             /* Then all the things that could fit in the bitmap */
             do_sep = put_charclass_bitmap_innards(sv,
-                                                  ANYOF_BITMAP(o),
+                                                  bitmap,
                                                   bitmap_range_not_in_bitmap,
                                                   only_utf8_locale_invlist,
                                                   o,
@@ -20669,7 +20973,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                                                    * better display if there
                                                    * are things that haven't
                                                    * been resolved */
-                                                  unresolved != NULL);
+                                                  unresolved != NULL
+                                            || inRANGE(OP(o), ANYOFR, ANYOFRb));
             SvREFCNT_dec(bitmap_range_not_in_bitmap);
 
             /* If there are user-defined properties which haven't been defined
@@ -20755,15 +21060,18 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         /* And finally the matching, closing ']' */
        Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
 
-        if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+        if (OP(o) == ANYOFHs) {
+            Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
+        }
+        else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
             U8 lowest = (OP(o) != ANYOFHr)
                          ? FLAGS(o)
                          : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
-            U8 highest = (OP(o) == ANYOFHb)
-                         ? lowest
-                         : OP(o) == ANYOFH
+            U8 highest = (OP(o) == ANYOFHr)
+                         ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
+                         : (OP(o) == ANYOFH || OP(o) == ANYOFR)
                            ? 0xFF
-                           : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
+                           : lowest;
             Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
             if (lowest != highest) {
                 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);