This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
(toke|regcomp).c: Use common fcn to handle \0 problems
[perl5.git] / regcomp.c
index c72a444..118e40f 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -114,7 +114,7 @@ typedef struct scan_frame {
 
 /* Certain characters are output as a sequence with the first being a
  * backslash. */
-#define isBACKSLASHED_PUNCT(c)  strchr("-[]\\^", c)
+#define isBACKSLASHED_PUNCT(c)  memCHRs("-[]\\^", c)
 
 
 struct RExC_state_t {
@@ -343,9 +343,14 @@ struct RExC_state_t {
 
 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 #define PBITVAL(paren) (1 << ((paren) & 7))
-#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
-#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
-#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
+#define PAREN_OFFSET(depth) \
+    (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
+#define PAREN_TEST(depth, paren) \
+    (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
+#define PAREN_SET(depth, paren) \
+    (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
+#define PAREN_UNSET(depth, paren) \
+    (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
 
 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
                                      if (!UTF) {                           \
@@ -357,7 +362,7 @@ struct RExC_state_t {
 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
  * a flag that indicates we need to override /d with /u as a result of
  * something in the pattern.  It should only be used in regards to calling
- * set_regex_charset() or get_regex_charse() */
+ * set_regex_charset() or get_regex_charset() */
 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
     STMT_START {                                                            \
             if (DEPENDS_SEMANTICS) {                                        \
@@ -426,6 +431,17 @@ struct RExC_state_t {
 #define _invlist_intersection_complement_2nd(a, b, output) \
                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 
+/* We add a marker if we are deferring expansion of a property that is both
+ * 1) potentially user-defined; and
+ * 2) could also be an official Unicode property.
+ *
+ * Without this marker, any deferred expansion can only be for a user-defined
+ * one.  This marker shouldn't conflict with any character that could be in a
+ * legal name, and is appended to the property's name to indicate this.  There
+ * is a string form and a character form */
+#define DEFERRED_COULD_BE_OFFICIAL_MARKERs  "~"
+#define DEFERRED_COULD_BE_OFFICIAL_MARKERc  '~'
+
 /* About scan_data_t.
 
   During optimisation we recurse through the regexp program performing
@@ -872,11 +888,27 @@ static const scan_data_t zero_scan_data = {
     } STMT_END
 
 /* m is not necessarily a "literal string", in this macro */
-#define reg_warn_non_literal_string(loc, m)                             \
-    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
-                      Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
+#define warn_non_literal_string(loc, packed_warn, m)                    \
+    _WARN_HELPER(loc, packed_warn,                                      \
+                      Perl_warner(aTHX_ packed_warn,                    \
                                        "%s" REPORT_LOCATION,            \
                                   m, REPORT_LOCATION_ARGS(loc)))
+#define reg_warn_non_literal_string(loc, m)                             \
+                warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
+
+#define ckWARN2_non_literal_string(loc, packwarn, m, a1)                    \
+    STMT_START {                                                            \
+                char * format;                                              \
+                Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
+                Newx(format, format_size, char);                            \
+                my_strlcpy(format, m, format_size);                         \
+                my_strlcat(format, REPORT_LOCATION, format_size);           \
+                SAVEFREEPV(format);                                         \
+                _WARN_HELPER(loc, packwarn,                                 \
+                      Perl_ck_warner(aTHX_ packwarn,                        \
+                                        format,                             \
+                                        a1, REPORT_LOCATION_ARGS(loc)));    \
+    } STMT_END
 
 #define        ckWARNreg(loc,m)                                                \
     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
@@ -1588,7 +1620,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
     unsigned int i;
     const U32 n = ARG(node);
     bool new_node_has_latin1 = FALSE;
-    const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
+    const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
                       ? 0
                       : ANYOF_FLAGS(node);
 
@@ -1643,7 +1675,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
     }
 
     /* Add in the points from the bit map */
-    if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
+    if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
             if (ANYOF_BITMAP_TEST(node, i)) {
                 unsigned int start = i++;
@@ -1730,7 +1762,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
      * another SSC or a regular ANYOF class.  Can create false positives. */
 
     SV* anded_cp_list;
-    U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
+    U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
                           ? 0
                           : ANYOF_FLAGS(and_with);
     U8  anded_flags;
@@ -1916,7 +1948,7 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
 
     SV* ored_cp_list;
     U8 ored_flags;
-    U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
+    U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
                          ? 0
                          : ANYOF_FLAGS(or_with);
 
@@ -2142,6 +2174,7 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
 
     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
+    SvREFCNT_dec(invlist);
 
     /* Make sure is clone-safe */
     ssc->invlist = NULL;
@@ -2785,7 +2818,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
         if (OP(noper) == NOTHING) {
             /* skip past a NOTHING at the start of an alternation
              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
+             *
+             * If the next node is not something we are supposed to process
+             * we will just ignore it due to the condition guarding the
+             * next block.
              */
+
             regnode *noper_next= regnext(noper);
             if (noper_next < tail)
                 noper= noper_next;
@@ -3007,6 +3045,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                 regnode *noper_next= regnext(noper);
                 if (noper_next < tail)
                     noper= noper_next;
+                /* If noper does not point at a trieable type, the else
+                 * clause of the following statement undoes this
+                 * assignment. */
             }
 
             if (    noper < tail
@@ -3068,7 +3109,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
                    }
                }
-           }
+            } else {
+                /* If we end up here it is because we skipped past a NOTHING, but did not end up
+                 * on a trieable type. So we need to reset noper back to point at the first regop
+                 * in the branch before we call TRIE_HANDLE_WORD()
+                */
+                noper= NEXTOPER(cur);
+            }
             TRIE_HANDLE_WORD(state);
 
         } /* end second pass */
@@ -3232,6 +3279,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                 regnode *noper_next= regnext(noper);
                 if (noper_next < tail)
                     noper= noper_next;
+                /* If noper does not point at a trieable type, the else
+                 * clause of the following statement undoes this
+                 * assignment. */
             }
 
             if (    noper < tail
@@ -3272,6 +3322,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                     /* charid is now 0 if we dont know the char read, or
                      * nonzero if we do */
                 }
+            } else {
+                /* If we end up here it is because we skipped past a NOTHING, but did not end up
+                 * on a trieable type. So we need to reset noper back to point at the first regop
+                 * in the branch before we call TRIE_HANDLE_WORD().
+                */
+                noper= NEXTOPER(cur);
             }
             accept_state = TRIE_NODENUM( state );
             TRIE_HANDLE_WORD(accept_state);
@@ -3994,11 +4050,6 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour
  *      using /iaa matching will be doing so almost entirely with ASCII
  *      strings, so this should rarely be encountered in practice */
 
-#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags)    \
-    if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT         \
-                                      && OP(scan) != LEXACT_REQ8)  \
-        join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
-
 STATIC U32
 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
                    UV *min_subtract, bool *unfolded_multi_char,
@@ -4387,23 +4438,6 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
            }
 #endif
        }
-
-        if (     STR_LEN(scan) == 1
-            &&   isALPHA_A(* STRING(scan))
-            &&  (         OP(scan) == EXACTFAA
-                 || (     OP(scan) == EXACTFU
-                     && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
-        {
-            U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
-
-            /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
-             * with the mask set to the complement of the bit that differs
-             * between upper and lower case, and the lowest code point of the
-             * pair (which the '&' forces) */
-            OP(scan) = ANYOFM;
-            ARG_SET(scan, *STRING(scan) & mask);
-            FLAGS(scan) = mask;
-        }
     }
 
 #ifdef DEBUGGING
@@ -4463,6 +4497,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
 {
     dVAR;
+    SSize_t final_minlen;
     /* There must be at least this number of characters to match */
     SSize_t min = 0;
     I32 pars = 0, code;
@@ -4506,15 +4541,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             U32 j;
             for ( j = 0 ; j < recursed_depth ; j++ ) {
                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
-                    if (
-                        PAREN_TEST(RExC_study_chunk_recursed +
-                                   ( j * RExC_study_chunk_recursed_bytes), i )
-                        && (
-                            !j ||
-                            !PAREN_TEST(RExC_study_chunk_recursed +
-                                   (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
-                        )
-                    ) {
+                    if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
                         Perl_re_printf( aTHX_ " %d",(int)i);
                         break;
                     }
@@ -4542,7 +4569,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
          * parsing code, as each (?:..) is handled by a different invocation of
          * reg() -- Yves
          */
-        JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
+        if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT
+                                          && OP(scan) != LEXACT_REQ8)
+            join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
+                    0, NULL, depth + 1);
 
         /* Follow the next-chain of the current node and optimize
            away all the NOTHINGs from it.  */
@@ -4782,7 +4812,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     if ( SvIV(re_trie_maxbuff)>=0  ) {
                         regnode *cur;
                         regnode *first = (regnode *)NULL;
-                        regnode *last = (regnode *)NULL;
+                        regnode *prev = (regnode *)NULL;
                         regnode *tail = scan;
                         U8 trietype = 0;
                         U32 count=0;
@@ -4913,7 +4943,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
                                 }
                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
-                                   REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
+                                   REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
                                   PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
                                );
                             });
@@ -4959,7 +4989,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                 } else {
                                     if ( trietype == NOTHING )
                                         trietype = noper_trietype;
-                                    last = cur;
+                                    prev = cur;
                                 }
                                if (first)
                                    count++;
@@ -4969,7 +4999,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                  * noper may either be a triable node which can
                                  * not be tried together with the current trie,
                                  * or a non triable node */
-                                if ( last ) {
+                                if ( prev ) {
                                     /* If last is set and trietype is not
                                      * NOTHING then we have found at least two
                                      * triable branch sequences in a row of a
@@ -4982,7 +5012,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                         make_trie( pRExC_state,
                                                 startbranch, first, cur, tail,
                                                 count, trietype, depth+1 );
-                                    last = NULL; /* note: we clear/update
+                                    prev = NULL; /* note: we clear/update
                                                     first, trietype etc below,
                                                     so we dont do it here */
                                 }
@@ -5011,12 +5041,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
-                               REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
+                               REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
                                PL_reg_name[trietype]
                             );
 
                         });
-                        if ( last && trietype ) {
+                        if ( prev && trietype ) {
                             if ( trietype != NOTHING ) {
                                 /* the last branch of the sequence was part of
                                  * a trie, so we have to construct it here
@@ -5061,9 +5091,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                         OP(opt)= OPTIMIZED;
                                 }
                             }
-                        } /* end if ( last) */
+                        } /* end if ( prev) */
                     } /* TRIE_MAXBUF is non zero */
-
                 } /* do trie */
 
            }
@@ -5117,8 +5146,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
                 if (
                     !recursed_depth
-                    ||
-                    !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
+                    || !PAREN_TEST(recursed_depth - 1, paren)
                 ) {
                     /* it is quite possible that there are more efficient ways
                      * to do this. We maintain a bitmap per level of recursion
@@ -5133,13 +5161,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     if (!recursed_depth) {
                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
                     } else {
-                        Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
-                             RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
+                        Copy(PAREN_OFFSET(recursed_depth - 1),
+                             PAREN_OFFSET(recursed_depth),
                              RExC_study_chunk_recursed_bytes, U8);
                     }
                     /* we havent recursed into this paren yet, so recurse into it */
                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
-                    PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
+                    PAREN_SET(recursed_depth, paren);
                     my_recursed_depth= recursed_depth + 1;
                 } else {
                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
@@ -5204,17 +5232,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                  || OP(scan) == LEXACT_REQ8
                  || OP(scan) == EXACTL)
         {
-           SSize_t l = STR_LEN(scan);
+           SSize_t bytelen = STR_LEN(scan), charlen;
            UV uc;
-            assert(l);
+            assert(bytelen);
            if (UTF) {
                const U8 * const s = (U8*)STRING(scan);
-               uc = utf8_to_uvchr_buf(s, s + l, NULL);
-               l = utf8_length(s, s + l);
+               uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
+               charlen = utf8_length(s, s + bytelen);
            } else {
                uc = *((U8*)STRING(scan));
+                charlen = bytelen;
            }
-           min += l;
+           min += charlen;
            if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
                /* The code below prefers earlier match for fixed
                   offset, later match for variable offset.  */
@@ -5223,7 +5252,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    data->last_start_max = is_inf
                        ? SSize_t_MAX : data->pos_min + data->pos_delta;
                }
-               sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
+               sv_catpvn(data->last_found, STRING(scan), bytelen);
                if (UTF)
                    SvUTF8_on(data->last_found);
                {
@@ -5231,11 +5260,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
                        mg_find(sv, PERL_MAGIC_utf8) : NULL;
                    if (mg && mg->mg_len >= 0)
-                       mg->mg_len += utf8_length((U8*)STRING(scan),
-                                              (U8*)STRING(scan)+STR_LEN(scan));
+                       mg->mg_len += charlen;
                }
-               data->last_end = data->pos_min + l;
-               data->pos_min += l; /* As in the first entry. */
+               data->last_end = data->pos_min + charlen;
+               data->pos_min += charlen; /* As in the first entry. */
                data->flags &= ~SF_BEFORE_EOL;
            }
 
@@ -5257,25 +5285,42 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
        }
         else if (PL_regkind[OP(scan)] == EXACT) {
             /* But OP != EXACT!, so is EXACTFish */
-           SSize_t l = STR_LEN(scan);
+           SSize_t bytelen = STR_LEN(scan), charlen;
             const U8 * s = (U8*)STRING(scan);
 
+            /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
+             * with the mask set to the complement of the bit that differs
+             * between upper and lower case, and the lowest code point of the
+             * pair (which the '&' forces) */
+            if (     bytelen == 1
+                &&   isALPHA_A(*s)
+                &&  (         OP(scan) == EXACTFAA
+                     || (     OP(scan) == EXACTFU
+                         && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s))))
+            {
+                U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
+
+                OP(scan) = ANYOFM;
+                ARG_SET(scan, *s & mask);
+                FLAGS(scan) = mask;
+                /* we're not EXACTFish any more, so restudy */
+                continue;
+            }
+
            /* Search for fixed substrings supports EXACT only. */
            if (flags & SCF_DO_SUBSTR) {
                assert(data);
                 scan_commit(pRExC_state, data, minlenp, is_inf);
            }
-           if (UTF) {
-               l = utf8_length(s, s + l);
-           }
+            charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
            if (unfolded_multi_char) {
                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
            }
-           min += l - min_subtract;
+           min += charlen - min_subtract;
             assert (min >= 0);
             delta += min_subtract;
            if (flags & SCF_DO_SUBSTR) {
-               data->pos_min += l - min_subtract;
+               data->pos_min += charlen - min_subtract;
                if (data->pos_min < 0) {
                     data->pos_min = 0;
                 }
@@ -5745,10 +5790,8 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                continue;
 
            default:
-#ifdef DEBUGGING
                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
                                                                     OP(scan));
-#endif
             case REF:
             case CLUMP:
                if (flags & SCF_DO_SUBSTR) {
@@ -5864,6 +5907,7 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                 case ANYOFH:
                 case ANYOFHb:
                 case ANYOFHr:
+                case ANYOFHs:
                 case ANYOF:
                    if (flags & SCF_DO_STCLASS_AND)
                        ssc_and(pRExC_state, data->start_class,
@@ -5889,6 +5933,26 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                     break;
                   }
 
+                case ANYOFR:
+                case ANYOFRb:
+                  {
+                    SV* cp_list = NULL;
+
+                    cp_list = _add_range_to_invlist(cp_list,
+                                        ANYOFRbase(scan),
+                                        ANYOFRbase(scan) + ANYOFRdelta(scan));
+
+                    if (flags & SCF_DO_STCLASS_OR) {
+                        ssc_union(data->start_class, cp_list, invert);
+                    }
+                    else if (flags & SCF_DO_STCLASS_AND) {
+                        ssc_intersection(data->start_class, cp_list, invert);
+                    }
+
+                    SvREFCNT_dec_NN(cp_list);
+                    break;
+                  }
+
                case NPOSIXL:
                     invert = 1;
                     /* FALLTHROUGH */
@@ -6429,18 +6493,15 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
 
     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
 
-    {
-        SSize_t final_minlen= min < stopmin ? min : stopmin;
-
-        if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
-            if (final_minlen > SSize_t_MAX - delta)
-                RExC_maxlen = SSize_t_MAX;
-            else if (RExC_maxlen < final_minlen + delta)
-                RExC_maxlen = final_minlen + delta;
-        }
-        return final_minlen;
+    final_minlen = min < stopmin
+            ? min : stopmin;
+    if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
+        if (final_minlen > SSize_t_MAX - delta)
+            RExC_maxlen = SSize_t_MAX;
+        else if (RExC_maxlen < final_minlen + delta)
+            RExC_maxlen = final_minlen + delta;
     }
-    NOT_REACHED; /* NOTREACHED */
+    return final_minlen;
 }
 
 STATIC U32
@@ -7398,28 +7459,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
 
     DEBUG_r(if (!PL_colorset) reginitcolors());
 
-    /* Initialize these here instead of as-needed, as is quick and avoids
-     * having to test them each time otherwise */
-    if (! PL_InBitmap) {
-#ifdef DEBUGGING
-        char * dump_len_string;
-#endif
-
-        /* This is calculated here, because the Perl program that generates the
-         * static global ones doesn't currently have access to
-         * NUM_ANYOF_CODE_POINTS */
-       PL_InBitmap = _new_invlist(2);
-       PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
-                                                    NUM_ANYOF_CODE_POINTS - 1);
-#ifdef DEBUGGING
-        dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
-        if (   ! dump_len_string
-            || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
-        {
-            PL_dump_re_max_len = 60;    /* A reasonable default */
-        }
-#endif
-    }
 
     pRExC_state->warn_text = NULL;
     pRExC_state->unlexed_names = NULL;
@@ -8677,8 +8716,8 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
        i = rx->sublen + rx->suboffset - rx->offs[0].end;
     }
     else
-    if ( 0 <= n && n <= (I32)rx->nparens &&
-        (s1 = rx->offs[n].start) != -1 &&
+    if (inRANGE(n, 0, (I32)rx->nparens) &&
+        (s1 = rx->offs[n].start) != -1  &&
         (t1 = rx->offs[n].end) != -1)
     {
         /* $&, ${^MATCH},  $1 ... */
@@ -8819,7 +8858,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
 
         i = t1 - s1;
         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
-                       i = el;
+            i = el;
     }
     return i;
 }
@@ -10297,6 +10336,28 @@ Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
 
 #endif
 
+PERL_STATIC_INLINE UV
+S_invlist_lowest(SV* const invlist)
+{
+    /* Returns the lowest code point that matches an inversion list.  This API
+     * has an ambiguity, as it returns 0 both when the lowest code point is
+     * actually 0 and when the list is empty.  If this distinction matters to
+     * you, check for emptiness before calling this function */
+
+    UV len = _invlist_len(invlist);
+    UV *array;
+
+    PERL_ARGS_ASSERT_INVLIST_LOWEST;
+
+    if (len == 0) {
+        return 0;
+    }
+
+    array = invlist_array(invlist);
+
+    return array[0];
+}
+
 STATIC SV *
 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
 {
@@ -10590,8 +10651,8 @@ S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
         }
         else {  /* Single char fold */
             unsigned int k;
-            unsigned int first_fold;
-            const unsigned int * remaining_folds;
+            U32 first_fold;
+            const U32 * remaining_folds;
             Size_t folds_count;
 
             /* It matches itself */
@@ -10693,7 +10754,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
     }
 
     while (RExC_parse < RExC_end) {
-        /* && strchr("iogcmsx", *RExC_parse) */
+        /* && memCHRs("iogcmsx", *RExC_parse) */
         /* (?g), (?gc) and (?o) are useless here
            and must be globally applied -- japhy */
         switch (*RExC_parse) {
@@ -11347,6 +11408,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                                 RExC_parse-seqstart, seqstart);
                NOT_REACHED; /*NOTREACHED*/
             case '<':           /* (?<...) */
+                /* If you want to support (?<*...), first reconcile with GH #17363 */
                if (*RExC_parse == '!')
                    paren = ',';
                else if (*RExC_parse != '=')
@@ -11920,6 +11982,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                     ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
                 }
                 /* FALLTHROUGH */
+            case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
+           /* FALLTHROUGH */
            default: /* e.g., (?i) */
                RExC_parse = (char *) seqstart + 1;
               parse_flags:
@@ -12830,9 +12894,9 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
         value = (U8 *) SvPV(value_sv, value_len);
 
         /* See if the result is one code point vs 0 or multiple */
-        if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv))
-                                               ? UTF8SKIP(value)
-                                               : 1))
+        if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
+                                  ? UTF8SKIP(value)
+                                  : 1)))
         {
             /* Here, exactly one code point.  If that isn't what is wanted,
              * fail */
@@ -13904,6 +13968,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                        || ! is_PATWS_safe((p), RExC_end, UTF));
 
                switch ((U8)*p) {
+                  const char* message;
+                  U32 packed_warn;
+                  U8 grok_c_char;
+
                case '^':
                case '$':
                case '.':
@@ -14019,67 +14087,70 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                        p++;
                        break;
                    case 'o':
-                       {
-                           UV result;
-                           const char* error_msg;
-
-                           bool valid = grok_bslash_o(&p,
-                                                       RExC_end,
-                                                      &result,
-                                                      &error_msg,
-                                                      TO_OUTPUT_WARNINGS(p),
-                                                       (bool) RExC_strict,
-                                                       TRUE, /* Output warnings
-                                                                for non-
-                                                                portables */
-                                                       UTF);
-                           if (! valid) {
-                               RExC_parse = p; /* going to die anyway; point
-                                                  to exact spot of failure */
-                               vFAIL(error_msg);
-                           }
-                            UPDATE_WARNINGS_LOC(p - 1);
-                            ender = result;
-                           break;
-                       }
+                        if (! grok_bslash_o(&p,
+                                            RExC_end,
+                                            &ender,
+                                            &message,
+                                            &packed_warn,
+                                            (bool) RExC_strict,
+                                            FALSE, /* No illegal cp's */
+                                            UTF))
+                        {
+                            RExC_parse = p; /* going to die anyway; point to
+                                               exact spot of failure */
+                            vFAIL(message);
+                        }
+
+                        if (message && TO_OUTPUT_WARNINGS(p)) {
+                            warn_non_literal_string(p, packed_warn, message);
+                        }
+                        break;
                    case 'x':
-                       {
-                            UV result = UV_MAX; /* initialize to erroneous
-                                                   value */
-                           const char* error_msg;
-
-                           bool valid = grok_bslash_x(&p,
-                                                       RExC_end,
-                                                      &result,
-                                                      &error_msg,
-                                                       TO_OUTPUT_WARNINGS(p),
-                                                       (bool) RExC_strict,
-                                                       TRUE, /* Silence warnings
-                                                                for non-
-                                                                portables */
-                                                       UTF);
-                           if (! valid) {
-                               RExC_parse = p; /* going to die anyway; point
-                                                  to exact spot of failure */
-                               vFAIL(error_msg);
-                           }
-                            UPDATE_WARNINGS_LOC(p - 1);
-                            ender = result;
+                        if (! grok_bslash_x(&p,
+                                            RExC_end,
+                                            &ender,
+                                            &message,
+                                            &packed_warn,
+                                            (bool) RExC_strict,
+                                            FALSE, /* No illegal cp's */
+                                            UTF))
+                        {
+                            RExC_parse = p;    /* going to die anyway; point
+                                                   to exact spot of failure */
+                            vFAIL(message);
+                        }
+
+                        if (message && TO_OUTPUT_WARNINGS(p)) {
+                            warn_non_literal_string(p, packed_warn, message);
+                        }
 
 #ifdef EBCDIC
-                            if (ender < 0x100) {
-                                if (RExC_recode_x_to_native) {
-                                    ender = LATIN1_TO_NATIVE(ender);
-                                }
-                           }
+                        if (ender < 0x100) {
+                            if (RExC_recode_x_to_native) {
+                                ender = LATIN1_TO_NATIVE(ender);
+                            }
+                        }
 #endif
-                           break;
-                       }
+                        break;
                    case 'c':
-                       p++;
-                       ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
-                        UPDATE_WARNINGS_LOC(p);
                         p++;
+                        if (! grok_bslash_c(*p, &grok_c_char,
+                                            &message, &packed_warn))
+                        {
+                            /* going to die anyway; point to exact spot of
+                             * failure */
+                            RExC_parse = p + ((UTF)
+                                              ? UTF8_SAFE_SKIP(p, RExC_end)
+                                              : 1);
+                            vFAIL(message);
+                        }
+
+                        ender = grok_c_char;
+                        p++;
+                        if (message && TO_OUTPUT_WARNINGS(p)) {
+                            warn_non_literal_string(p, packed_warn, message);
+                        }
+
                        break;
                     case '8': case '9': /* must be a backreference */
                         --p;
@@ -14114,17 +14185,19 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         /* FALLTHROUGH */
                     case '0':
                        {
-                           I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
+                           I32 flags = PERL_SCAN_SILENT_ILLDIGIT
+                                      | PERL_SCAN_NOTIFY_ILLDIGIT;
                            STRLEN numlen = 3;
                            ender = grok_oct(p, &numlen, &flags, NULL);
                            p += numlen;
-                            if (   isDIGIT(*p)  /* like \08, \178 */
-                                && ckWARN(WARN_REGEXP)
-                                && numlen < 3)
+                            if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
+                                && isDIGIT(*p)  /* like \08, \178 */
+                                && ckWARN(WARN_REGEXP))
                             {
                                reg_warn_non_literal_string(
-                                         p + 1,
-                                         form_short_octal_warning(p, numlen));
+                                     p + 1,
+                                     form_alien_digit_msg(8, numlen, p,
+                                                        RExC_end, UTF, FALSE));
                             }
                        }
                        break;
@@ -14201,6 +14274,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
                 if (ender > 255) {
                     REQUIRE_UTF8(flagp);
+                    if (   UNICODE_IS_PERL_EXTENDED(ender)
+                        && TO_OUTPUT_WARNINGS(p))
+                    {
+                        ckWARN2_non_literal_string(p,
+                                                   packWARN(WARN_PORTABLE),
+                                                   PL_extended_cp_format,
+                                                   ender);
+                    }
                 }
 
                 /* We need to check if the next non-ignored thing is a
@@ -14526,13 +14607,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
                 goto continue_parse;
             }
-            else if (! LOC) {  /* XXX shouldn't /l assume could be a UTF-8
-                                locale, and prepare for that? */
+            else if (FOLD) {
                 bool splittable = FALSE;
                 bool backed_up = FALSE;
-                char * e = s;
-
-                assert(FOLD);
+                char * e;
+                char * s_start;
 
                 /* Here is /i.  Running out of room creates a problem if we are
                  * folding, and the split happens in the middle of a
@@ -14569,6 +14648,132 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  *  oldp      points to the beginning byte in the input of
                  *            'ender'.
                  *
+                 * In the case of /il, we haven't folded anything that could be
+                 * affected by the locale.  That means only above-Latin1
+                 * characters that fold to other above-Latin1 characters get
+                 * folded at compile time.  To find a good place to split the
+                 * node, everything in it will have to be folded.  The boolean
+                 * 'maybe_exactfu' keeps track in /il if there are any
+                 * unfolded characters in the node. */
+                bool need_to_fold_loc = LOC && ! maybe_exactfu;
+
+                /* If we do need to fold the node, we need a place to store the
+                 * folded copy, and a way to map back to the unfolded original
+                 * */
+                char * locfold_buf = NULL;
+                Size_t * loc_correspondence = NULL;
+
+                if (! need_to_fold_loc) {   /* The normal case.  Just
+                                               initialize to the actual node */
+                    e = s;
+                    s_start = s0;
+                    s = old_old_s;  /* Point to the beginning of the final char
+                                       that fits in the node */
+                }
+                else {
+
+                    /* Here, we have filled a /il node, and there are unfolded
+                     * characters in it.  If the runtime locale turns out to be
+                     * UTF-8, there are possible multi-character folds, just
+                     * like when not under /l.  The node hence can't terminate
+                     * in the middle of such a fold.  To determine this, we
+                     * have to create a folded copy of this node.  That means
+                     * reparsing the node, folding everything assuming a UTF-8
+                     * locale.  (If at runtime it isn't such a locale, the
+                     * actions here wouldn't have been necessary, but we have
+                     * to assume the worst case.)  If we find we need to back
+                     * off the folded string, we do so, and then map that
+                     * position back to the original unfolded node, which then
+                     * gets output, truncated at that spot */
+
+                    char * redo_p = RExC_parse;
+                    char * redo_e;
+                    char * old_redo_e;
+
+                    /* Allow enough space assuming a single byte input folds to
+                     * a single byte output, plus assume that the two unparsed
+                     * characters (that we may need) fold to the largest number
+                     * of bytes possible, plus extra for one more worst case
+                     * scenario.  In the loop below, if we start eating into
+                     * that final spare space, we enlarge this initial space */
+                    Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
+
+                    Newxz(locfold_buf, size, char);
+                    Newxz(loc_correspondence, size, Size_t);
+
+                    /* Redo this node's parse, folding into 'locfold_buf' */
+                    redo_p = RExC_parse;
+                    old_redo_e = redo_e = locfold_buf;
+                    while (redo_p <= oldp) {
+
+                        old_redo_e = redo_e;
+                        loc_correspondence[redo_e - locfold_buf]
+                                                        = redo_p - RExC_parse;
+
+                        if (UTF) {
+                            Size_t added_len;
+
+                            (void) _to_utf8_fold_flags((U8 *) redo_p,
+                                                       (U8 *) RExC_end,
+                                                       (U8 *) redo_e,
+                                                       &added_len,
+                                                       FOLD_FLAGS_FULL);
+                            redo_e += added_len;
+                            redo_p += UTF8SKIP(redo_p);
+                        }
+                        else {
+
+                            /* Note that if this code is run on some ancient
+                             * Unicode versions, SHARP S doesn't fold to 'ss',
+                             * but rather than clutter the code with #ifdef's,
+                             * as is done above, we ignore that possibility.
+                             * This is ok because this code doesn't affect what
+                             * gets matched, but merely where the node gets
+                             * split */
+                            if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
+                                *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
+                            }
+                            else {
+                                *redo_e++ = 's';
+                                *redo_e++ = 's';
+                            }
+                            redo_p++;
+                        }
+
+
+                        /* If we're getting so close to the end that a
+                         * worst-case fold in the next character would cause us
+                         * to overflow, enlarge the buffers, assuming one
+                         * output byte per remaining input byte, plus room for
+                         * another worst-case fold */
+                        if (   redo_p <= oldp
+                            && redo_e > locfold_buf + size
+                                                    - (UTF8_MAXBYTES_CASE + 1))
+                        {
+                            Size_t new_size = size
+                                            + (oldp - redo_p)
+                                            + UTF8_MAXBYTES_CASE + 1;
+                            Ptrdiff_t e_offset = redo_e - locfold_buf;
+
+                            Renew(locfold_buf, new_size, char);
+                            Renew(loc_correspondence, new_size, Size_t);
+                            size = new_size;
+
+                            redo_e = locfold_buf + e_offset;
+                        }
+                    }
+
+                    /* Set so that things are in terms of the folded, temporary
+                     * string */
+                    s = old_redo_e;
+                    s_start = locfold_buf;
+                    e = redo_e;
+
+                }
+
+                /* Here, we have 's', 's_start' and 'e' set up to point to the
+                 * input that goes into the node, folded.
+                 *
                  * If the final character of the node and the fold of ender
                  * form the first two characters of a three character fold, we
                  * need to peek ahead at the next (unparsed) character in the
@@ -14608,11 +14813,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  * and try again.
                  *
                  * Otherwise, the node can be split at the current position.
-                 */
-                s = old_old_s;  /* Point to the beginning of the final char
-                                   that fits in the node */
-
-                /* The same logic is used for UTF-8 patterns and not */
+                 *
+                 * The same logic is used for UTF-8 patterns and not */
                 if (UTF) {
                     Size_t added_len;
 
@@ -14651,7 +14853,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          * drop down to try at that position */
                         if (isPUNCT(*p)) {
                             s = (char *) utf8_hop_back((U8 *) s, -1,
-                                       (U8 *) s0);
+                                       (U8 *) s_start);
                             backed_up = TRUE;
                         }
                         else {
@@ -14683,7 +14885,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      * either case would break apart a fold */
                     do {
                         char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
-                                                                    (U8 *) s0);
+                                                            (U8 *) s_start);
 
                         /* If is a multi-char fold, can't split here.  Backup
                          * one char and try again */
@@ -14697,11 +14899,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          * three character fold starting at the character
                          * before s, we can't split either before or after s.
                          * Backup two chars and try again */
-                        if (   LIKELY(s > s0)
+                        if (   LIKELY(s > s_start)
                             && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
                         {
                             s = prev_s;
-                            s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+                            s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
                             backed_up = TRUE;
                             continue;
                         }
@@ -14711,7 +14913,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         splittable = TRUE;
                         break;
 
-                    } while (s > s0); /* End of loops backing up through the node */
+                    } while (s > s_start); /* End of loops backing up through the node */
 
                     /* Here we either couldn't find a place to split the node,
                      * or else we broke out of the loop setting 'splittable' to
@@ -14760,7 +14962,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             continue;
                         }
 
-                        if (   LIKELY(s > s0)
+                        if (   LIKELY(s > s_start)
                             && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
                         {
                             s -= 2;
@@ -14771,7 +14973,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         splittable = TRUE;
                         break;
 
-                    } while (s > s0);
+                    } while (s > s_start);
 
                     if (splittable) {
                         s++;
@@ -14785,9 +14987,28 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                    /* If we did find a place to split, reparse the entire node
                     * stopping where we have calculated. */
                     if (splittable) {
-                        upper_fill = s - s0;
+
+                       /* If we created a temporary folded string under /l, we
+                        * have to map that back to the original */
+                        if (need_to_fold_loc) {
+                            upper_fill = loc_correspondence[s - s_start];
+                            Safefree(locfold_buf);
+                            Safefree(loc_correspondence);
+
+                            if (upper_fill == 0) {
+                                FAIL2("panic: loc_correspondence[%d] is 0",
+                                      (int) (s - s_start));
+                            }
+                        }
+                        else {
+                            upper_fill = s - s0;
+                        }
                         goto reparse;
                     }
+                    else if (need_to_fold_loc) {
+                        Safefree(locfold_buf);
+                        Safefree(loc_correspondence);
+                    }
 
                     /* Here the node consists entirely of non-final multi-char
                      * folds.  (Likely it is all 'f's or all 's's.)  There's no
@@ -14961,7 +15182,7 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
     assert(PL_regkind[OP(node)] == ANYOF);
 
     /* There is no bitmap for this node type */
-    if (inRANGE(OP(node), ANYOFH, ANYOFHr)) {
+    if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
         return;
     }
 
@@ -16679,8 +16900,8 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl
                        current Unicode version */
           {
             Size_t folds_count;
-            unsigned int first_fold;
-            const unsigned int * remaining_folds;
+            U32 first_fold;
+            const U32 * remaining_folds;
             UV folded_cp;
 
             if (isASCII(cp)) {
@@ -16736,6 +16957,7 @@ S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
 
     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
+        CLEAR_POSIX_WARNINGS();
         return;
     }
 
@@ -16754,6 +16976,22 @@ S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
     UPDATE_WARNINGS_LOC(RExC_parse);
 }
 
+PERL_STATIC_INLINE Size_t
+S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
+{
+    const U8 * const start = s1;
+    const U8 * const send = start + max;
+    /* Returns offset of the first byte where s1 and s2 differ, else 'max' */
+    PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
+    /* Only s1 is bounded by 'send'; s2 is advanced and read in step with it */
+    while (s1 < send && *s1  == *s2) {
+        s1++; s2++;
+    }
+    /* s1 is now at the first mismatch, or at 'send' if there was none */
+    return s1 - start;
+}
+
+
 STATIC AV *
 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
 {
@@ -17150,6 +17388,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              * is already in 'value'.  Otherwise, need to translate the escape
              * into what it signifies. */
             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
+                const char * message;
+                U32 packed_warn;
+                U8 grok_c_char;
 
            case 'w':   namedclass = ANYOF_WORDCHAR;    break;
            case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
@@ -17400,53 +17641,74 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
            case 'a':   value = '\a';                   break;
            case 'o':
                RExC_parse--;   /* function expects to be pointed at the 'o' */
-               {
-                   const char* error_msg;
-                   bool valid = grok_bslash_o(&RExC_parse,
-                                               RExC_end,
-                                              &value,
-                                              &error_msg,
-                                               TO_OUTPUT_WARNINGS(RExC_parse),
-                                               strict,
-                                               silence_non_portable,
-                                               UTF);
-                   if (! valid) {
-                       vFAIL(error_msg);
-                   }
-                    UPDATE_WARNINGS_LOC(RExC_parse - 1);
-               }
-                non_portable_endpoint++;
+                if (! grok_bslash_o(&RExC_parse,
+                                            RExC_end,
+                                            &value,
+                                            &message,
+                                            &packed_warn,
+                                            strict,
+                                            range, /* UV_MAX allowed for range
+                                                      upper limit */
+                                            UTF))
+                {
+                    vFAIL(message);
+                }
+                else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+                    warn_non_literal_string(RExC_parse, packed_warn, message);
+                }
+
+                if (value < 256) {
+                    non_portable_endpoint++;
+                }
                break;
            case 'x':
                RExC_parse--;   /* function expects to be pointed at the 'x' */
-               {
-                   const char* error_msg;
-                   bool valid = grok_bslash_x(&RExC_parse,
-                                               RExC_end,
-                                              &value,
-                                              &error_msg,
-                                              TO_OUTPUT_WARNINGS(RExC_parse),
-                                               strict,
-                                               silence_non_portable,
-                                               UTF);
-                    if (! valid) {
-                       vFAIL(error_msg);
-                   }
-                    UPDATE_WARNINGS_LOC(RExC_parse - 1);
-               }
-                non_portable_endpoint++;
+                if (!  grok_bslash_x(&RExC_parse,
+                                            RExC_end,
+                                            &value,
+                                            &message,
+                                            &packed_warn,
+                                            strict,
+                                            range, /* UV_MAX allowed for range
+                                                      upper limit */
+                                            UTF))
+                {
+                    vFAIL(message);
+                }
+                else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+                    warn_non_literal_string(RExC_parse, packed_warn, message);
+                }
+
+                if (value < 256) {
+                    non_portable_endpoint++;
+                }
                break;
            case 'c':
-               value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
-                UPDATE_WARNINGS_LOC(RExC_parse);
-               RExC_parse++;
+                if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
+                                                                &packed_warn))
+                {
+                    /* going to die anyway; point to exact spot of
+                        * failure */
+                    RExC_parse += (UTF)
+                                  ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+                                  : 1;
+                    vFAIL(message);
+                }
+
+                value = grok_c_char;
+                RExC_parse++;
+                if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+                    warn_non_literal_string(RExC_parse, packed_warn, message);
+                }
+
                 non_portable_endpoint++;
                break;
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7':
                {
                    /* Take 1-3 octal digits */
-                   I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
+                   I32 flags = PERL_SCAN_SILENT_ILLDIGIT
+                              | PERL_SCAN_NOTIFY_ILLDIGIT;
                     numlen = (strict) ? 4 : 3;
                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
                    RExC_parse += numlen;
@@ -17457,17 +17719,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                           : 1;
                             vFAIL("Need exactly 3 octal digits");
                         }
-                        else if (   numlen < 3 /* like \08, \178 */
+                        else if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
                                  && RExC_parse < RExC_end
                                  && isDIGIT(*RExC_parse)
                                  && ckWARN(WARN_REGEXP))
                         {
                             reg_warn_non_literal_string(
                                  RExC_parse + 1,
-                                 form_short_octal_warning(RExC_parse, numlen));
+                                 form_alien_digit_msg(8, numlen, RExC_parse,
+                                                        RExC_end, UTF, FALSE));
                         }
                     }
-                    non_portable_endpoint++;
+                    if (value < 256) {
+                        non_portable_endpoint++;
+                    }
                    break;
                }
            default:
@@ -17729,7 +17994,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
        /* non-Latin1 code point implies unicode semantics. */
        if (value > 255) {
+            if (value > MAX_LEGAL_CP && (   value != UV_MAX
+                                         || prevvalue > MAX_LEGAL_CP))
+            {
+                vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
+            }
             REQUIRE_UNI_RULES(flagp, 0);
+            if (  ! silence_non_portable
+                &&  UNICODE_IS_PERL_EXTENDED(value)
+                &&  TO_OUTPUT_WARNINGS(RExC_parse))
+            {
+                ckWARN2_non_literal_string(RExC_parse,
+                                           packWARN(WARN_PORTABLE),
+                                           PL_extended_cp_format,
+                                           value);
+            }
        }
 
         /* Ready to process either the single value, or the completed range.
@@ -18114,8 +18393,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     STRLEN foldlen;
                     unsigned int k;
                     Size_t folds_count;
-                    unsigned int first_fold;
-                    const unsigned int * remaining_folds;
+                    U32 first_fold;
+                    const U32 * remaining_folds;
 
                     if (j < 256) {
 
@@ -18427,14 +18706,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  |= ANYOFL_FOLD
                  |  ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
         }
-        else if (cp_list) { /* Look to see if a 0-255 code point is in list */
-            UV start, end;
-            invlist_iterinit(cp_list);
-            if (invlist_iternext(cp_list, &start, &end) && start < 256) {
-                anyof_flags |= ANYOFL_FOLD;
-                has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
-            }
-            invlist_iterfinish(cp_list);
+        else if (cp_list && invlist_lowest(cp_list) < 256) {
+            /* If nothing is below 256, has no locale dependency; otherwise it
+             * does */
+            anyof_flags |= ANYOFL_FOLD;
+            has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
         }
     }
     else if (   DEPENDS_SEMANTICS
@@ -18479,9 +18755,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
     if (optimizable) {
         PERL_UINT_FAST8_T i;
-        Size_t partial_cp_count = 0;
+        UV partial_cp_count = 0;
         UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
         UV   end[MAX_FOLD_FROMS+1] = { 0 };
+        bool single_range = FALSE;
 
         if (cp_list) { /* Count the code points in enough ranges that we would
                           see all the ones possible in any fold in this version
@@ -18495,6 +18772,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                 partial_cp_count += end[i] - start[i] + 1;
             }
 
+            if (i == 1) {
+                single_range = TRUE;
+            }
             invlist_iterfinish(cp_list);
         }
 
@@ -18661,7 +18941,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
                 /* Only try if there are no more code points in the class than
                  * in the max possible fold */
-            &&   partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1)
+            &&   inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
         {
             if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
             {
@@ -18819,8 +19099,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     U8 foldbuf[UTF8_MAXBYTES_CASE];
                     UV folded = _to_uni_fold_flags(start[0],
                                                         foldbuf, &foldlen, 0);
-                    unsigned int first_fold;
-                    const unsigned int * remaining_folds;
+                    U32 first_fold;
+                    const U32 * remaining_folds;
                     Size_t folds_to_this_cp_count = _inverse_folds(
                                                             folded,
                                                             &first_fold,
@@ -18939,7 +19219,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  * convert to UTF-8 if not already there */
                 if (value > 255) {
                     if (! UTF) {
-
                         SvREFCNT_dec(cp_list);;
                         REQUIRE_UTF8(flagp);
                     }
@@ -18960,17 +19239,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
                 len = (UTF) ? UVCHR_SKIP(value) : 1;
 
-                    ret = regnode_guts(pRExC_state, op, len, "exact");
-                    FILL_NODE(ret, op);
-                    RExC_emit += 1 + STR_SZ(len);
-                    setSTR_LEN(REGNODE_p(ret), len);
-                    if (len == 1) {
-                        *STRING(REGNODE_p(ret)) = (U8) value;
-                    }
-                    else {
-                        uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
-                    }
-                    goto not_anyof;
+                ret = regnode_guts(pRExC_state, op, len, "exact");
+                FILL_NODE(ret, op);
+                RExC_emit += 1 + STR_SZ(len);
+                setSTR_LEN(REGNODE_p(ret), len);
+                if (len == 1) {
+                    *STRINGs(REGNODE_p(ret)) = (U8) value;
+                }
+                else {
+                    uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
+                }
+                goto not_anyof;
             }
         }
 
@@ -19213,6 +19492,52 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             SvREFCNT_dec(intersection);
         }
 
+        /* If it is a single contiguous range, ANYOFR is an efficient regnode,
+         * both in size and speed.  Currently, a 20 bit range base (smallest
+         * code point in the range), and a 12 bit maximum delta are packed into
+         * a 32 bit word.  This allows for using it on all of the Unicode code
+         * points except for the highest plane, which is only for private use
+         * code points.  khw doubts that a bigger delta is likely in real world
+         * applications */
+        if (     single_range
+            && ! has_runtime_dependency
+            &&   anyof_flags == 0
+            &&   start[0] < (1 << ANYOFR_BASE_BITS)
+            &&   end[0] - start[0]
+                    < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
+                                   * CHARBITS - ANYOFR_BASE_BITS))))
+
+        {
+            U8 low_utf8[UTF8_MAXBYTES+1];
+            U8 high_utf8[UTF8_MAXBYTES+1];
+
+            ret = reganode(pRExC_state, ANYOFR,
+                        (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
+
+            /* Place the lowest UTF-8 start byte in the flags field, so as to
+             * allow efficient ruling out at run time of many possible inputs.
+             * */
+            (void) uvchr_to_utf8(low_utf8, start[0]);
+            (void) uvchr_to_utf8(high_utf8, end[0]);
+
+            /* If all code points share the same first byte, this can be an
+             * ANYOFRb.  Otherwise store the lowest UTF-8 start byte which can
+             * quickly rule out many inputs at run-time without having to
+             * compute the code point from UTF-8.  For EBCDIC, we use I8, as
+             * not doing that transformation would not rule out nearly so many
+             * things */
+            if (low_utf8[0] == high_utf8[0]) {
+                OP(REGNODE_p(ret)) = ANYOFRb;
+                ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
+            }
+            else {
+                ANYOF_FLAGS(REGNODE_p(ret))
+                                    = NATIVE_UTF8_TO_I8(low_utf8[0]);
+            }
+
+            goto not_anyof;
+        }
+
         /* If didn't find an optimization and there is no need for a bitmap,
          * optimize to indicate that */
         if (     start[0] >= NUM_ANYOF_CODE_POINTS
@@ -19223,14 +19548,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             U8 low_utf8[UTF8_MAXBYTES+1];
             UV highest_cp = invlist_highest(cp_list);
 
-            op = ANYOFH;
-
             /* Currently the maximum allowed code point by the system is
              * IV_MAX.  Higher ones are reserved for future internal use.  This
              * particular regnode can be used for higher ones, but we can't
              * calculate the code point of those.  IV_MAX suffices though, as
              * it will be a large first byte */
-            (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+            Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
+                           - low_utf8;
 
             /* We store the lowest possible first byte of the UTF-8
              * representation, using the flags field.  This allows for quick
@@ -19239,23 +19563,48 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              * transformation would not rule out nearly so many things */
             anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
 
+            op = ANYOFH;
+
             /* If the first UTF-8 start byte for the highest code point in the
              * range is suitably small, we may be able to get an upper bound as
              * well */
             if (highest_cp <= IV_MAX) {
                 U8 high_utf8[UTF8_MAXBYTES+1];
-
-                (void) uvchr_to_utf8(high_utf8, highest_cp);
+                Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
+                                - high_utf8;
 
                 /* If the lowest and highest are the same, we can get an exact
-                 * first byte instead of a just minimum.  We signal this with a
-                 * different regnode */
+                 * first byte instead of just a minimum or even a sequence of
+                 * exact leading bytes.  We signal these with different
+                 * regnodes */
                 if (low_utf8[0] == high_utf8[0]) {
+                    Size_t len = find_first_differing_byte_pos(low_utf8,
+                                                               high_utf8,
+                                                       MIN(low_len, high_len));
+
+                    if (len == 1) {
 
-                    /* No need to convert to I8 for EBCDIC as this is an exact
-                     * match */
-                    anyof_flags = low_utf8[0];
-                    op = ANYOFHb;
+                        /* No need to convert to I8 for EBCDIC as this is an
+                         * exact match */
+                        anyof_flags = low_utf8[0];
+                        op = ANYOFHb;
+                    }
+                    else {
+                        op = ANYOFHs;
+                        ret = regnode_guts(pRExC_state, op,
+                                           regarglen[op] + STR_SZ(len),
+                                           "anyofhs");
+                        FILL_NODE(ret, op);
+                        ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
+                                                                        = len;
+                        Copy(low_utf8,  /* Add the common bytes */
+                           ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
+                           len, U8);
+                        RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
+                        set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
+                                                  NULL, only_utf8_locale_list);
+                        goto not_anyof;
+                    }
                 }
                 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
                 {
@@ -19341,8 +19690,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
     set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
-                   ? listsv : NULL,
+                   ? listsv
+                   : NULL,
                   only_utf8_locale_list);
+    SvREFCNT_dec(cp_list);;
+    SvREFCNT_dec(only_utf8_locale_list);
     return ret;
 
   not_anyof:
@@ -19353,6 +19705,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
                                            RExC_parse - orig_parse);;
     SvREFCNT_dec(cp_list);;
+    SvREFCNT_dec(only_utf8_locale_list);
     return ret;
 }
 
@@ -19393,15 +19746,17 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
        SV *rv;
 
         if (cp_list) {
-            av_store(av, INVLIST_INDEX, cp_list);
+            av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
         }
 
         if (only_utf8_locale_list) {
-            av_store(av, ONLY_LOCALE_MATCHES_INDEX, only_utf8_locale_list);
+            av_store(av, ONLY_LOCALE_MATCHES_INDEX,
+                                     SvREFCNT_inc_NN(only_utf8_locale_list));
         }
 
         if (runtime_defns) {
-            av_store(av, DEFERRED_USER_DEFINED_INDEX, SvREFCNT_inc(runtime_defns));
+            av_store(av, DEFERRED_USER_DEFINED_INDEX,
+                         SvREFCNT_inc_NN(runtime_defns));
         }
 
        rv = newRV_noinc(MUTABLE_SV(av));
@@ -19511,10 +19866,10 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
                     STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
                     STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
 
-                    av_store(av, INVLIST_INDEX, invlist);
+                    ary[INVLIST_INDEX] = invlist;
                     av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
-                                 ? ONLY_LOCALE_MATCHES_INDEX:
-                                 INVLIST_INDEX);
+                                 ? ONLY_LOCALE_MATCHES_INDEX
+                                 : INVLIST_INDEX);
                     si = NULL;
                 }
            }
@@ -19547,15 +19902,11 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
                 UV prev_cp = 0;
                 U8 count = 0;
 
-                /* Ignore everything before the first new-line */
-                while (*si_string != '\n' && remaining > 0) {
-                    si_string++;
-                    remaining--;
-                }
-                assert(remaining > 0);
-
+                /* Ignore everything before and including the first new-line */
+                si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
+                assert (si_string != NULL);
                 si_string++;
-                remaining--;
+                remaining = SvPVX(si) + SvCUR(si) - si_string;
 
                 while (remaining > 0) {
 
@@ -19597,26 +19948,34 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
                         continue;
                     }
 
-                    /* Here, didn't find a legal hex number.  Just add it from
-                     * here to the next \n */
+                    /* Here, didn't find a legal hex number.  Just add the text
+                     * from here up to the next \n, omitting any trailing
+                     * markers. */
 
                     remaining -= len;
-                    while (*(si_string + len) != '\n' && remaining > 0) {
-                        remaining--;
-                        len++;
-                    }
-                    if (*(si_string + len) == '\n') {
-                        len++;
-                        remaining--;
-                    }
+                    len = strcspn(si_string,
+                                        DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
+                    remaining -= len;
                     if (matches_string) {
-                        sv_catpvn(matches_string, si_string, len - 1);
+                        sv_catpvn(matches_string, si_string, len);
                     }
                     else {
-                        matches_string = newSVpvn(si_string, len - 1);
+                        matches_string = newSVpvn(si_string, len);
                     }
-                    si_string += len;
                     sv_catpvs(matches_string, " ");
+
+                    si_string += len;
+                    if (   remaining
+                        && UCHARAT(si_string)
+                                            == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
+                    {
+                        si_string++;
+                        remaining--;
+                    }
+                    if (remaining && UCHARAT(si_string) == '\n') {
+                        si_string++;
+                        remaining--;
+                    }
                 } /* end of loop through the text */
 
                 assert(matches_string);
@@ -19810,7 +20169,7 @@ STATIC regnode_offset
 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
 {
     /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
-     * equivalents space.  It aligns and increments RExC_size and RExC_emit
+     * equivalents space.  It aligns and increments RExC_size
      *
      * It returns the regnode's offset into the regex engine program */
 
@@ -20051,6 +20410,7 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state,
         scan = REGNODE_OFFSET(temp);
     }
 
+    assert(val >= scan);
     if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
         assert((UV) (val - scan) <= U32_MAX);
         ARG_SET(REGNODE_p(scan), val - scan);
@@ -20514,6 +20874,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                                                 NULL,
                                                 NULL,
                                                 NULL,
+                                                0,
                                                 FALSE
                                                );
             sv_catpvs(sv, "]");
@@ -20596,10 +20957,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
     else if (k == LOGICAL)
         /* 2: embedded, otherwise 1 */
        Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
-    else if (k == ANYOF) {
-       const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
-                          ? 0
-                          : ANYOF_FLAGS(o);
+    else if (k == ANYOF || k == ANYOFR) {
+        U8 flags;
+        char * bitmap;
+        U32 arg;
         bool do_sep = FALSE;    /* Do we need to separate various components of
                                    the output? */
         /* Set if there is still an unresolved user-defined property */
@@ -20614,7 +20975,18 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         /* And things that aren't in the bitmap, but are small enough to be */
         SV* bitmap_range_not_in_bitmap = NULL;
 
-        const bool inverted = flags & ANYOF_INVERT;
+        bool inverted;
+
+        if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
+            flags = 0;
+            bitmap = NULL;
+            arg = 0;
+        }
+        else {
+            flags = ANYOF_FLAGS(o);
+            bitmap = ANYOF_BITMAP(o);
+            arg = ARG(o);
+        }
 
        if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
             if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
@@ -20625,17 +20997,27 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
             }
         }
 
+        inverted = flags & ANYOF_INVERT;
+
         /* If there is stuff outside the bitmap, get it */
-        if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
-            (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
+        if (arg != ANYOF_ONLY_HAS_BITMAP) {
+            if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
+                nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
+                                            ANYOFRbase(o),
+                                            ANYOFRbase(o) + ANYOFRdelta(o));
+            }
+            else {
+                (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
                                                 &unresolved,
                                                 &only_utf8_locale_invlist,
                                                 &nonbitmap_invlist);
+            }
+
             /* The non-bitmap data may contain stuff that could fit in the
              * bitmap.  This could come from a user-defined property being
              * finally resolved when this call was done; or much more likely
              * because there are matches that require UTF-8 to be valid, and so
-             * aren't in the bitmap.  This is teased apart later */
+             * aren't in the bitmap (or ANYOFR).  This is teased apart later */
             _invlist_intersection(nonbitmap_invlist,
                                   PL_InBitmap,
                                   &bitmap_range_not_in_bitmap);
@@ -20655,19 +21037,26 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         /* Ready to start outputting.  First, the initial left bracket */
        Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
 
-        if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+        /* ANYOFH by definition doesn't have anything that will fit inside the
+         * bitmap;  ANYOFR may or may not. */
+        if (  ! inRANGE(OP(o), ANYOFH, ANYOFHr)
+            && (   ! inRANGE(OP(o), ANYOFR, ANYOFRb)
+                ||   ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
+        {
             /* Then all the things that could fit in the bitmap */
             do_sep = put_charclass_bitmap_innards(sv,
-                                                  ANYOF_BITMAP(o),
+                                                  bitmap,
                                                   bitmap_range_not_in_bitmap,
                                                   only_utf8_locale_invlist,
                                                   o,
+                                                  flags,
 
                                                   /* Can't try inverting for a
                                                    * better display if there
                                                    * are things that haven't
                                                    * been resolved */
-                                                  unresolved != NULL);
+                                                  unresolved != NULL
+                                            || inRANGE(OP(o), ANYOFR, ANYOFRb));
             SvREFCNT_dec(bitmap_range_not_in_bitmap);
 
             /* If there are user-defined properties which haven't been defined
@@ -20753,15 +21142,18 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         /* And finally the matching, closing ']' */
        Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
 
-        if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+        if (OP(o) == ANYOFHs) {
+            Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
+        }
+        else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
             U8 lowest = (OP(o) != ANYOFHr)
                          ? FLAGS(o)
                          : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
-            U8 highest = (OP(o) == ANYOFHb)
-                         ? lowest
-                         : OP(o) == ANYOFH
+            U8 highest = (OP(o) == ANYOFHr)
+                         ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
+                         : (OP(o) == ANYOFH || OP(o) == ANYOFR)
                            ? 0xFF
-                           : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
+                           : lowest;
             Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
             if (lowest != highest) {
                 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
@@ -20779,7 +21171,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
             _invlist_invert(cp_list);
         }
 
-        put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
+        put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
        Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
 
         SvREFCNT_dec(cp_list);
@@ -21860,6 +22252,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
                                      SV *nonbitmap_invlist,
                                      SV *only_utf8_locale_invlist,
                                      const regnode * const node,
+                                     const U8 flags,
                                      const bool force_as_is_display)
 {
     /* Appends to 'sv' a displayable version of the innards of the bracketed
@@ -21876,6 +22269,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
      *  'node' is the regex pattern ANYOF node.  It is needed only when the
      *      above two parameters are not null, and is passed so that this
      *      routine can tease apart the various reasons for them.
+     *  'flags' is the flags field of 'node'
      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
      *      to invert things to see if that leads to a cleaner display.  If
      *      FALSE, this routine is free to use its judgment about doing this.
@@ -21914,8 +22308,6 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv,
                                literally */
     SV* inverted_display;   /* The output string when we invert the inputs */
 
-    U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
-
     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
                                                    to match? */
     /* We are biased in favor of displaying things without them being inverted,
@@ -22261,7 +22653,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
        else if ( op == PLUS || op == STAR) {
            DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
        }
-       else if (PL_regkind[(U8)op] == EXACT) {
+       else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
             /* Literal string, where present. */
            node += NODE_SZ_STR(node) - 1;
            node = NEXTOPER(node);
@@ -22291,6 +22683,17 @@ Perl_init_uniprops(pTHX)
 {
     dVAR;
 
+#ifdef DEBUGGING
+    char * dump_len_string;
+
+    dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
+    if (   ! dump_len_string
+        || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
+    {
+        PL_dump_re_max_len = 60;    /* A reasonable default */
+    }
+#endif
+
     PL_user_def_props = newHV();
 
 #ifdef USE_ITHREADS
@@ -22300,7 +22703,7 @@ Perl_init_uniprops(pTHX)
 
 #endif
 
-    /* Set up the inversion list global variables */
+    /* Set up the inversion list interpreter-level variables */
 
     PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
     PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
@@ -22342,6 +22745,7 @@ Perl_init_uniprops(pTHX)
     PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
     PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
 
+    PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
     PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
     PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
     PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
@@ -22807,7 +23211,7 @@ Perl_parse_uniprop_string(pTHX_
      * Other parameters will be set on return as described below */
 
     const char * const name,    /* The first non-blank in the \p{}, \P{} */
-    const Size_t name_len,      /* Its length in bytes, not including any
+    Size_t name_len,            /* Its length in bytes, not including any
                                    trailing space */
     const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
     const bool to_fold,         /* ? Is this under /i */
@@ -22819,7 +23223,7 @@ Perl_parse_uniprop_string(pTHX_
                                    user-defined property */
     SV * msg,                   /* Any error or warning msg(s) are appended to
                                    this */
-   const STRLEN level)          /* Recursion level of this call */
+    const STRLEN level)         /* Recursion level of this call */
 {
     dVAR;
     char* lookup_name;          /* normalized name for lookup in our tables */
@@ -22843,6 +23247,7 @@ Perl_parse_uniprop_string(pTHX_
                                    the normalized name in certain situations */
     Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
                                    part of a package name */
+    Size_t lun_non_pkg_begin = 0;   /* Similarly for 'lookup_name' */
     bool could_be_user_defined = TRUE;  /* ? Could this be a user-defined
                                              property rather than a Unicode
                                              one. */
@@ -22855,6 +23260,15 @@ Perl_parse_uniprop_string(pTHX_
                                    qualified name */
     bool invert_return = FALSE; /* ? Do we need to complement the result before
                                      returning it */
+    bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
+                                       explicit utf8:: package that we strip
+                                       off  */
+    /* The expansion of properties that could be either user-defined or
+     * official unicode ones is deferred until runtime, including a marker for
+     * those that might be in the latter category.  This boolean indicates if
+     * we've seen that marker.  If not, what we're parsing can't be such an
+     * official Unicode property whose expansion was deferred */
+    bool could_be_deferred_official = FALSE;
 
     PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
 
@@ -22913,6 +23327,19 @@ Perl_parse_uniprop_string(pTHX_
             break;
         }
 
+        /* If this looks like it is a marker we inserted at compile time,
+         * set a flag and otherwise ignore it.  If it isn't in the final
+         * position, keep it as it would have been user input. */
+        if (     UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
+            && ! deferrable
+            &&   could_be_user_defined
+            &&   i == name_len - 1)
+        {
+            name_len--;
+            could_be_deferred_official = TRUE;
+            continue;
+        }
+
         /* Otherwise, this character is part of the name. */
         lookup_name[j++] = cur;
 
@@ -22929,6 +23356,7 @@ Perl_parse_uniprop_string(pTHX_
             i++;
             non_pkg_begin = i + 1;
             lookup_name[j++] = ':';
+            lun_non_pkg_begin = j;
         }
         else { /* Only word chars (and '::') can be in a user-defined name */
             could_be_user_defined = FALSE;
@@ -22946,6 +23374,7 @@ Perl_parse_uniprop_string(pTHX_
         lookup_name +=  STRLENs("utf8::");
         j -=  STRLENs("utf8::");
         equals_pos -=  STRLENs("utf8::");
+        stripped_utf8_pkg = TRUE;
     }
 
     /* Here, we are either done with the whole property name, if it was simple;
@@ -22965,10 +23394,13 @@ Perl_parse_uniprop_string(pTHX_
         /* Most punctuation after the equals indicates a subpattern, like
          * \p{foo=/bar/} */
         if (   isPUNCT_A(name[i])
-            && name[i] != '-'
-            && name[i] != '+'
-            && name[i] != '_'
-            && name[i] != '{')
+            &&  name[i] != '-'
+            &&  name[i] != '+'
+            &&  name[i] != '_'
+            &&  name[i] != '{'
+                /* A backslash means the real delimiter is the next character,
+                 * but it must be punctuation */
+            && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
         {
             /* Find the property.  The table includes the equals sign, so we
              * use 'j' as-is */
@@ -22984,8 +23416,8 @@ Perl_parse_uniprop_string(pTHX_
                 const char * pos_in_brackets;
                 bool escaped = 0;
 
-                /* A backslash means the real delimitter is the next character.
-                 * */
+                /* Backslash => delimiter is the character following.  We
+                 * already checked that it is punctuation */
                 if (open == '\\') {
                     open = name[i++];
                     escaped = 1;
@@ -22996,12 +23428,16 @@ Perl_parse_uniprop_string(pTHX_
                  * set of closing is so that if the opening is something like
                  * ']', the closing will be that as well.  Something similar is
                  * done in toke.c */
-                pos_in_brackets = strchr("([<)]>)]>", open);
+                pos_in_brackets = memCHRs("([<)]>)]>", open);
                 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
 
                 if (    i >= name_len
                     ||  name[name_len-1] != close
-                    || (escaped && name[name_len-2] != '\\'))
+                    || (escaped && name[name_len-2] != '\\')
+                        /* Also make sure that there are enough characters.
+                         * e.g., '\\\' would show up incorrectly as legal even
+                         * though it is too short */
+                    || (SSize_t) (name_len - i - 1 - escaped) < 0)
                 {
                     sv_catpvs(msg, "Unicode property wildcard not terminated");
                     goto append_name_to_msg;
@@ -23342,7 +23778,32 @@ Perl_parse_uniprop_string(pTHX_
         /* Here, the name could be for a user defined property, which are
          * implemented as subs. */
         user_sub = get_cvn_flags(name, name_len, 0);
-        if (user_sub) {
+        if (! user_sub) {
+
+            /* Here, the property name could be a user-defined one, but there
+             * is no subroutine to handle it (as of now).   Defer handling it
+             * until runtime.  Otherwise, a block defined by Unicode in a later
+             * release would get the synonym InFoo added for it, and existing
+             * code that used that name would suddenly break if it referred to
+             * the property before the sub was declared.  See [perl #134146] */
+            if (deferrable) {
+                goto definition_deferred;
+            }
+
+            /* Here, we are at runtime, and didn't find the user property.  It
+             * could be an official property, but only if no package was
+             * specified, or just the utf8:: package. */
+            if (could_be_deferred_official) {
+                lookup_name += lun_non_pkg_begin;
+                j -= lun_non_pkg_begin;
+            }
+            else if (! stripped_utf8_pkg) {
+                goto unknown_user_defined;
+            }
+
+            /* Drop down to look up in the official properties */
+        }
+        else {
             const char insecure[] = "Insecure user-defined property";
 
             /* Here, there is a sub by the correct name.  Normally we call it
@@ -23689,9 +24150,7 @@ Perl_parse_uniprop_string(pTHX_
                  * property hasn't been encountered yet, but at runtime, it's
                  * an error to try to use an undefined one */
                 if (! deferrable) {
-                    if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
-                    sv_catpvs(msg, "Unknown user-defined property name");
-                    goto append_name_to_msg;
+                    goto unknown_user_defined;;
                 }
 
                 goto definition_deferred;
@@ -23952,6 +24411,10 @@ Perl_parse_uniprop_string(pTHX_
     }
     return prop_definition;
 
+  unknown_user_defined:
+    if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+    sv_catpvs(msg, "Unknown user-defined property name");
+    goto append_name_to_msg;
 
   failed:
     if (non_pkg_begin != 0) {
@@ -23978,18 +24441,34 @@ Perl_parse_uniprop_string(pTHX_
 
   definition_deferred:
 
-    /* Here it could yet to be defined, so defer evaluation of this
-     * until its needed at runtime.  We need the fully qualified property name
-     * to avoid ambiguity, and a trailing newline */
-    if (! fq_name) {
-        fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
-                                      non_pkg_begin != 0 /* If has "::" */
-                               );
-    }
-    sv_catpvs(fq_name, "\n");
+    {
+        bool is_qualified = non_pkg_begin != 0;  /* If has "::" */
 
-    *user_defined_ptr = TRUE;
-    return fq_name;
+        /* Here it could yet be defined, so defer evaluation of this until
+         * it's needed at runtime.  We need the fully qualified property name
+         * to avoid ambiguity */
+        if (! fq_name) {
+            fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
+                                                                is_qualified);
+        }
+
+        /* If it didn't come with a package, or the package is utf8::, this
+         * actually could be an official Unicode property whose inclusion we
+         * are deferring until runtime to make sure that it isn't overridden by
+         * a user-defined property of the same name (which we haven't
+         * encountered yet).  Add a marker to indicate this possibility, for
+         * use at such time when we first need the definition during pattern
+         * matching execution */
+        if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
+            sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
+        }
+
+        /* We also need a trailing newline */
+        sv_catpvs(fq_name, "\n");
+
+        *user_defined_ptr = TRUE;
+        return fq_name;
+    }
 }
 
 #endif