This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Improve handling of nested qr/(?[...])/
[perl5.git] / regcomp.c
index f0054e5..1d758a9 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -82,7 +82,6 @@ EXTERN_C const struct regexp_engine my_reg_engine;
 #  include "regcomp.h"
 #endif
 
 #  include "regcomp.h"
 #endif
 
-#include "dquote_inline.h"
 #include "invlist_inline.h"
 #include "unicode_constants.h"
 
 #include "invlist_inline.h"
 #include "unicode_constants.h"
 
@@ -143,6 +142,8 @@ struct RExC_state_t {
     U32                seen;
     SSize_t    size;                   /* Number of regnode equivalents in
                                            pattern */
     U32                seen;
     SSize_t    size;                   /* Number of regnode equivalents in
                                            pattern */
+    Size_t      sets_depth;              /* Counts recursion depth of already-
+                                           compiled regex set patterns */
 
     /* position beyond 'precomp' of the warning message furthest away from
      * 'precomp'.  During the parse, no warnings are raised for any problems
 
     /* position beyond 'precomp' of the warning message furthest away from
      * 'precomp'.  During the parse, no warnings are raised for any problems
@@ -267,6 +268,7 @@ struct RExC_state_t {
 #define RExC_paren_names       (pRExC_state->paren_names)
 #define RExC_recurse   (pRExC_state->recurse)
 #define RExC_recurse_count     (pRExC_state->recurse_count)
 #define RExC_paren_names       (pRExC_state->paren_names)
 #define RExC_recurse   (pRExC_state->recurse)
 #define RExC_recurse_count     (pRExC_state->recurse_count)
+#define RExC_sets_depth         (pRExC_state->sets_depth)
 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 #define RExC_study_chunk_recursed_bytes  \
                                    (pRExC_state->study_chunk_recursed_bytes)
 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 #define RExC_study_chunk_recursed_bytes  \
                                    (pRExC_state->study_chunk_recursed_bytes)
@@ -343,9 +345,14 @@ struct RExC_state_t {
 
 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 #define PBITVAL(paren) (1 << ((paren) & 7))
 
 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 #define PBITVAL(paren) (1 << ((paren) & 7))
-#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
-#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
-#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
+#define PAREN_OFFSET(depth) \
+    (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
+#define PAREN_TEST(depth, paren) \
+    (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
+#define PAREN_SET(depth, paren) \
+    (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
+#define PAREN_UNSET(depth, paren) \
+    (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
 
 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
                                      if (!UTF) {                           \
 
 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
                                      if (!UTF) {                           \
@@ -883,11 +890,27 @@ static const scan_data_t zero_scan_data = {
     } STMT_END
 
 /* m is not necessarily a "literal string", in this macro */
     } STMT_END
 
 /* m is not necessarily a "literal string", in this macro */
-#define reg_warn_non_literal_string(loc, m)                             \
-    _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
-                      Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
+#define warn_non_literal_string(loc, packed_warn, m)                    \
+    _WARN_HELPER(loc, packed_warn,                                      \
+                      Perl_warner(aTHX_ packed_warn,                    \
                                        "%s" REPORT_LOCATION,            \
                                   m, REPORT_LOCATION_ARGS(loc)))
                                        "%s" REPORT_LOCATION,            \
                                   m, REPORT_LOCATION_ARGS(loc)))
+#define reg_warn_non_literal_string(loc, m)                             \
+                warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
+
+#define ckWARN2_non_literal_string(loc, packwarn, m, a1)                    \
+    STMT_START {                                                            \
+                char * format;                                              \
+                Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
+                Newx(format, format_size, char);                            \
+                my_strlcpy(format, m, format_size);                         \
+                my_strlcat(format, REPORT_LOCATION, format_size);           \
+                SAVEFREEPV(format);                                         \
+                _WARN_HELPER(loc, packwarn,                                 \
+                      Perl_ck_warner(aTHX_ packwarn,                        \
+                                        format,                             \
+                                        a1, REPORT_LOCATION_ARGS(loc)));    \
+    } STMT_END
 
 #define        ckWARNreg(loc,m)                                                \
     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
 
 #define        ckWARNreg(loc,m)                                                \
     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
@@ -1379,29 +1402,6 @@ S_edit_distance(const UV* src,
 /* END of edit_distance() stuff
  * ========================================================= */
 
 /* END of edit_distance() stuff
  * ========================================================= */
 
-/* is c a control character for which we have a mnemonic? */
-#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
-
-STATIC const char *
-S_cntrl_to_mnemonic(const U8 c)
-{
-    /* Returns the mnemonic string that represents character 'c', if one
-     * exists; NULL otherwise.  The only ones that exist for the purposes of
-     * this routine are a few control characters */
-
-    switch (c) {
-        case '\a':       return "\\a";
-        case '\b':       return "\\b";
-        case ESC_NATIVE: return "\\e";
-        case '\f':       return "\\f";
-        case '\n':       return "\\n";
-        case '\r':       return "\\r";
-        case '\t':       return "\\t";
-    }
-
-    return NULL;
-}
-
 /* Mark that we cannot extend a found fixed substring at this point.
    Update the longest found anchored substring or the longest found
    floating substrings if needed. */
 /* Mark that we cannot extend a found fixed substring at this point.
    Update the longest found anchored substring or the longest found
    floating substrings if needed. */
@@ -3589,7 +3589,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                         str=STRING(convert);
                         setSTR_LEN(convert, 0);
                     }
                         str=STRING(convert);
                         setSTR_LEN(convert, 0);
                     }
-                    setSTR_LEN(convert, STR_LEN(convert) + len);
+                    assert( ( STR_LEN(convert) + len ) < 256 );
+                    setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
                     while (len--)
                         *str++ = *ch++;
                } else {
                     while (len--)
                         *str++ = *ch++;
                } else {
@@ -3603,7 +3604,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
            trie->prefixlen = (state-1);
             if (str) {
                 regnode *n = convert+NODE_SZ_STR(convert);
            trie->prefixlen = (state-1);
             if (str) {
                 regnode *n = convert+NODE_SZ_STR(convert);
-                NEXT_OFF(convert) = NODE_SZ_STR(convert);
+                assert( NODE_SZ_STR(convert) <= U16_MAX );
+                NEXT_OFF(convert) = (U16)(NODE_SZ_STR(convert));
                 trie->startstate = state;
                 trie->minlen -= (state - 1);
                 trie->maxlen -= (state - 1);
                 trie->startstate = state;
                 trie->minlen -= (state - 1);
                 trie->maxlen -= (state - 1);
@@ -4029,11 +4031,6 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour
  *      using /iaa matching will be doing so almost entirely with ASCII
  *      strings, so this should rarely be encountered in practice */
 
  *      using /iaa matching will be doing so almost entirely with ASCII
  *      strings, so this should rarely be encountered in practice */
 
-#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags)    \
-    if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT         \
-                                      && OP(scan) != LEXACT_REQ8)  \
-        join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
-
 STATIC U32
 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
                    UV *min_subtract, bool *unfolded_multi_char,
 STATIC U32
 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
                    UV *min_subtract, bool *unfolded_multi_char,
@@ -4197,7 +4194,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
             merged++;
 
             NEXT_OFF(scan) += NEXT_OFF(n);
             merged++;
 
             NEXT_OFF(scan) += NEXT_OFF(n);
-            setSTR_LEN(scan, STR_LEN(scan) + STR_LEN(n));
+            assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
+            setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
             next = n + NODE_SZ_STR(n);
             /* Now we can overwrite *n : */
             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
             next = n + NODE_SZ_STR(n);
             /* Now we can overwrite *n : */
             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
@@ -4422,23 +4420,6 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
            }
 #endif
        }
            }
 #endif
        }
-
-        if (     STR_LEN(scan) == 1
-            &&   isALPHA_A(* STRING(scan))
-            &&  (         OP(scan) == EXACTFAA
-                 || (     OP(scan) == EXACTFU
-                     && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
-        {
-            U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
-
-            /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
-             * with the mask set to the complement of the bit that differs
-             * between upper and lower case, and the lowest code point of the
-             * pair (which the '&' forces) */
-            OP(scan) = ANYOFM;
-            ARG_SET(scan, *STRING(scan) & mask);
-            FLAGS(scan) = mask;
-        }
     }
 
 #ifdef DEBUGGING
     }
 
 #ifdef DEBUGGING
@@ -4498,6 +4479,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
 {
     dVAR;
                        /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
 {
     dVAR;
+    SSize_t final_minlen;
     /* There must be at least this number of characters to match */
     SSize_t min = 0;
     I32 pars = 0, code;
     /* There must be at least this number of characters to match */
     SSize_t min = 0;
     I32 pars = 0, code;
@@ -4541,15 +4523,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             U32 j;
             for ( j = 0 ; j < recursed_depth ; j++ ) {
                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
             U32 j;
             for ( j = 0 ; j < recursed_depth ; j++ ) {
                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
-                    if (
-                        PAREN_TEST(RExC_study_chunk_recursed +
-                                   ( j * RExC_study_chunk_recursed_bytes), i )
-                        && (
-                            !j ||
-                            !PAREN_TEST(RExC_study_chunk_recursed +
-                                   (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
-                        )
-                    ) {
+                    if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
                         Perl_re_printf( aTHX_ " %d",(int)i);
                         break;
                     }
                         Perl_re_printf( aTHX_ " %d",(int)i);
                         break;
                     }
@@ -4577,7 +4551,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
          * parsing code, as each (?:..) is handled by a different invocation of
          * reg() -- Yves
          */
          * parsing code, as each (?:..) is handled by a different invocation of
          * reg() -- Yves
          */
-        JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
+        if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT
+                                          && OP(scan) != LEXACT_REQ8)
+            join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
+                    0, NULL, depth + 1);
 
         /* Follow the next-chain of the current node and optimize
            away all the NOTHINGs from it.  */
 
         /* Follow the next-chain of the current node and optimize
            away all the NOTHINGs from it.  */
@@ -5151,8 +5128,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
                 if (
                     !recursed_depth
 
                 if (
                     !recursed_depth
-                    ||
-                    !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
+                    || !PAREN_TEST(recursed_depth - 1, paren)
                 ) {
                     /* it is quite possible that there are more efficient ways
                      * to do this. We maintain a bitmap per level of recursion
                 ) {
                     /* it is quite possible that there are more efficient ways
                      * to do this. We maintain a bitmap per level of recursion
@@ -5167,13 +5143,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     if (!recursed_depth) {
                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
                     } else {
                     if (!recursed_depth) {
                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
                     } else {
-                        Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
-                             RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
+                        Copy(PAREN_OFFSET(recursed_depth - 1),
+                             PAREN_OFFSET(recursed_depth),
                              RExC_study_chunk_recursed_bytes, U8);
                     }
                     /* we havent recursed into this paren yet, so recurse into it */
                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
                              RExC_study_chunk_recursed_bytes, U8);
                     }
                     /* we havent recursed into this paren yet, so recurse into it */
                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
-                    PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
+                    PAREN_SET(recursed_depth, paren);
                     my_recursed_depth= recursed_depth + 1;
                 } else {
                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
                     my_recursed_depth= recursed_depth + 1;
                 } else {
                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
@@ -5238,17 +5214,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                  || OP(scan) == LEXACT_REQ8
                  || OP(scan) == EXACTL)
         {
                  || OP(scan) == LEXACT_REQ8
                  || OP(scan) == EXACTL)
         {
-           SSize_t l = STR_LEN(scan);
+           SSize_t bytelen = STR_LEN(scan), charlen;
            UV uc;
            UV uc;
-            assert(l);
+            assert(bytelen);
            if (UTF) {
                const U8 * const s = (U8*)STRING(scan);
            if (UTF) {
                const U8 * const s = (U8*)STRING(scan);
-               uc = utf8_to_uvchr_buf(s, s + l, NULL);
-               l = utf8_length(s, s + l);
+               uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
+               charlen = utf8_length(s, s + bytelen);
            } else {
                uc = *((U8*)STRING(scan));
            } else {
                uc = *((U8*)STRING(scan));
+                charlen = bytelen;
            }
            }
-           min += l;
+           min += charlen;
            if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
                /* The code below prefers earlier match for fixed
                   offset, later match for variable offset.  */
            if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
                /* The code below prefers earlier match for fixed
                   offset, later match for variable offset.  */
@@ -5257,7 +5234,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    data->last_start_max = is_inf
                        ? SSize_t_MAX : data->pos_min + data->pos_delta;
                }
                    data->last_start_max = is_inf
                        ? SSize_t_MAX : data->pos_min + data->pos_delta;
                }
-               sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
+               sv_catpvn(data->last_found, STRING(scan), bytelen);
                if (UTF)
                    SvUTF8_on(data->last_found);
                {
                if (UTF)
                    SvUTF8_on(data->last_found);
                {
@@ -5265,11 +5242,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
                        mg_find(sv, PERL_MAGIC_utf8) : NULL;
                    if (mg && mg->mg_len >= 0)
                    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
                        mg_find(sv, PERL_MAGIC_utf8) : NULL;
                    if (mg && mg->mg_len >= 0)
-                       mg->mg_len += utf8_length((U8*)STRING(scan),
-                                              (U8*)STRING(scan)+STR_LEN(scan));
+                       mg->mg_len += charlen;
                }
                }
-               data->last_end = data->pos_min + l;
-               data->pos_min += l; /* As in the first entry. */
+               data->last_end = data->pos_min + charlen;
+               data->pos_min += charlen; /* As in the first entry. */
                data->flags &= ~SF_BEFORE_EOL;
            }
 
                data->flags &= ~SF_BEFORE_EOL;
            }
 
@@ -5291,25 +5267,42 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
        }
         else if (PL_regkind[OP(scan)] == EXACT) {
             /* But OP != EXACT!, so is EXACTFish */
        }
         else if (PL_regkind[OP(scan)] == EXACT) {
             /* But OP != EXACT!, so is EXACTFish */
-           SSize_t l = STR_LEN(scan);
+           SSize_t bytelen = STR_LEN(scan), charlen;
             const U8 * s = (U8*)STRING(scan);
 
             const U8 * s = (U8*)STRING(scan);
 
+            /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
+             * with the mask set to the complement of the bit that differs
+             * between upper and lower case, and the lowest code point of the
+             * pair (which the '&' forces) */
+            if (     bytelen == 1
+                &&   isALPHA_A(*s)
+                &&  (         OP(scan) == EXACTFAA
+                     || (     OP(scan) == EXACTFU
+                         && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s))))
+            {
+                U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
+
+                OP(scan) = ANYOFM;
+                ARG_SET(scan, *s & mask);
+                FLAGS(scan) = mask;
+                /* we're not EXACTFish any more, so restudy */
+                continue;
+            }
+
            /* Search for fixed substrings supports EXACT only. */
            if (flags & SCF_DO_SUBSTR) {
                assert(data);
                 scan_commit(pRExC_state, data, minlenp, is_inf);
            }
            /* Search for fixed substrings supports EXACT only. */
            if (flags & SCF_DO_SUBSTR) {
                assert(data);
                 scan_commit(pRExC_state, data, minlenp, is_inf);
            }
-           if (UTF) {
-               l = utf8_length(s, s + l);
-           }
+            charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
            if (unfolded_multi_char) {
                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
            }
            if (unfolded_multi_char) {
                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
            }
-           min += l - min_subtract;
+           min += charlen - min_subtract;
             assert (min >= 0);
             delta += min_subtract;
            if (flags & SCF_DO_SUBSTR) {
             assert (min >= 0);
             delta += min_subtract;
            if (flags & SCF_DO_SUBSTR) {
-               data->pos_min += l - min_subtract;
+               data->pos_min += charlen - min_subtract;
                if (data->pos_min < 0) {
                     data->pos_min = 0;
                 }
                if (data->pos_min < 0) {
                     data->pos_min = 0;
                 }
@@ -5779,10 +5772,8 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                continue;
 
            default:
                continue;
 
            default:
-#ifdef DEBUGGING
                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
                                                                     OP(scan));
                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
                                                                     OP(scan));
-#endif
             case REF:
             case CLUMP:
                if (flags & SCF_DO_SUBSTR) {
             case REF:
             case CLUMP:
                if (flags & SCF_DO_SUBSTR) {
@@ -6433,6 +6424,11 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
            if (trie->jump) /* no more substrings -- for now /grr*/
                flags &= ~SCF_DO_SUBSTR;
        }
            if (trie->jump) /* no more substrings -- for now /grr*/
                flags &= ~SCF_DO_SUBSTR;
        }
+        else if (OP(scan) == REGEX_SET) {
+            Perl_croak(aTHX_ "panic: %s regnode should be resolved"
+                             " before optimization", reg_name[REGEX_SET]);
+        }
+
 #endif /* old or new */
 #endif /* TRIE_STUDY_OPT */
 
 #endif /* old or new */
 #endif /* TRIE_STUDY_OPT */
 
@@ -6484,18 +6480,15 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
 
     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
 
 
     DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
 
-    {
-        SSize_t final_minlen= min < stopmin ? min : stopmin;
-
-        if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
-            if (final_minlen > SSize_t_MAX - delta)
-                RExC_maxlen = SSize_t_MAX;
-            else if (RExC_maxlen < final_minlen + delta)
-                RExC_maxlen = final_minlen + delta;
-        }
-        return final_minlen;
+    final_minlen = min < stopmin
+            ? min : stopmin;
+    if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
+        if (final_minlen > SSize_t_MAX - delta)
+            RExC_maxlen = SSize_t_MAX;
+        else if (RExC_maxlen < final_minlen + delta)
+            RExC_maxlen = final_minlen + delta;
     }
     }
-    NOT_REACHED; /* NOTREACHED */
+    return final_minlen;
 }
 
 STATIC U32
 }
 
 STATIC U32
@@ -7685,6 +7678,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     RExC_study_chunk_recursed = NULL;
     RExC_study_chunk_recursed_bytes= 0;
     RExC_recurse_count = 0;
     RExC_study_chunk_recursed = NULL;
     RExC_study_chunk_recursed_bytes= 0;
     RExC_recurse_count = 0;
+    RExC_sets_depth = 0;
     pRExC_state->code_index = 0;
 
     /* Initialize the string in the compiled pattern.  This is so that there is
     pRExC_state->code_index = 0;
 
     /* Initialize the string in the compiled pattern.  This is so that there is
@@ -12185,7 +12179,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
             }
            break;
        }
             }
            break;
        }
-        DEBUG_PARSE_r(
+        DEBUG_PARSE_r({
             DEBUG_PARSE_MSG("lsbr");
             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
             DEBUG_PARSE_MSG("lsbr");
             regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
             regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
@@ -12196,7 +12190,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                           (IV)ender,
                           (IV)(ender - lastbr)
             );
                           (IV)ender,
                           (IV)(ender - lastbr)
             );
-        );
+        });
         if (! REGTAIL(pRExC_state, lastbr, ender)) {
             REQUIRE_BRANCHJ(flagp, 0);
         }
         if (! REGTAIL(pRExC_state, lastbr, ender)) {
             REQUIRE_BRANCHJ(flagp, 0);
         }
@@ -12237,7 +12231,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
                                ? regnext(ret_as_regnode)
                                : ret_as_regnode;
                 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
                                ? regnext(ret_as_regnode)
                                : ret_as_regnode;
-                DEBUG_PARSE_r(
+                DEBUG_PARSE_r({
                     DEBUG_PARSE_MSG("NADA");
                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
                                      NULL, pRExC_state);
                     DEBUG_PARSE_MSG("NADA");
                     regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
                                      NULL, pRExC_state);
@@ -12250,7 +12244,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                                   (IV)ender,
                                   (IV)(ender - ret)
                     );
                                   (IV)ender,
                                   (IV)(ender - ret)
                     );
-                );
+                });
                 OP(br)= NOTHING;
                 if (OP(REGNODE_p(ender)) == TAIL) {
                     NEXT_OFF(br)= 0;
                 OP(br)= NOTHING;
                 if (OP(REGNODE_p(ender)) == TAIL) {
                     NEXT_OFF(br)= 0;
@@ -12621,12 +12615,22 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     }
   nest_check:
     if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
     }
   nest_check:
     if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
-       ckWARN2reg(RExC_parse,
-                  "%" UTF8f " matches null string many times",
-                  UTF8fARG(UTF, (RExC_parse >= origparse
-                                 ? RExC_parse - origparse
-                                 : 0),
-                  origparse));
+        if (origparse[0] == '\\' && origparse[1] == 'K') {
+            vFAIL2utf8f(
+                       "%" UTF8f " is forbidden - matches null string many times",
+                       UTF8fARG(UTF, (RExC_parse >= origparse
+                                     ? RExC_parse - origparse
+                                     : 0),
+                       origparse));
+            /* NOT-REACHED */
+        } else {
+            ckWARN2reg(RExC_parse,
+                       "%" UTF8f " matches null string many times",
+                       UTF8fARG(UTF, (RExC_parse >= origparse
+                                     ? RExC_parse - origparse
+                                     : 0),
+                       origparse));
+        }
     }
 
     if (*RExC_parse == '?') {
     }
 
     if (*RExC_parse == '?') {
@@ -12959,48 +12963,30 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
          * thing. */
 
         do {    /* Loop until the ending brace */
          * thing. */
 
         do {    /* Loop until the ending brace */
-            UV cp = 0;
-            char * start_digit;     /* The first of the current code point */
-            if (! isXDIGIT(*RExC_parse)) {
+            I32 flags = PERL_SCAN_SILENT_OVERFLOW
+                      | PERL_SCAN_SILENT_ILLDIGIT
+                      | PERL_SCAN_NOTIFY_ILLDIGIT
+                      | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
+                      | PERL_SCAN_DISALLOW_PREFIX;
+            STRLEN len = endbrace - RExC_parse;
+            NV overflow_value;
+            char * start_digit = RExC_parse;
+            UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
+
+            if (len == 0) {
                 RExC_parse++;
                 RExC_parse++;
+              bad_NU:
                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
             }
 
                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
             }
 
-            start_digit = RExC_parse;
-            count++;
-
-            /* Loop through the hex digits of the current code point */
-            do {
-                /* Adding this digit will shift the result 4 bits.  If that
-                 * result would be above the legal max, it's overflow */
-                if (cp > MAX_LEGAL_CP >> 4) {
+            RExC_parse += len;
 
 
-                    /* Find the end of the code point */
-                    do {
-                        RExC_parse ++;
-                    } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
-
-                    /* Be sure to synchronize this message with the similar one
-                     * in utf8.c */
-                    vFAIL4("Use of code point 0x%.*s is not allowed; the"
-                        " permissible max is 0x%" UVxf,
-                        (int) (RExC_parse - start_digit), start_digit,
-                        MAX_LEGAL_CP);
-                }
-
-                /* Accumulate this (valid) digit into the running total */
-                cp  = (cp << 4) + READ_XDIGIT(RExC_parse);
-
-                /* READ_XDIGIT advanced the input pointer.  Ignore a single
-                 * underscore separator */
-                if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
-                    RExC_parse++;
-                }
-            } while (isXDIGIT(*RExC_parse));
+            if (cp > MAX_LEGAL_CP) {
+                vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
+            }
 
 
-            /* Here, have accumulated the next code point */
-            if (RExC_parse >= endbrace) {   /* If done ... */
-                if (count != 1) {
+            if (RExC_parse >= endbrace) { /* Got to the closing '}' */
+                if (count) {
                     goto do_concat;
                 }
 
                     goto do_concat;
                 }
 
@@ -13017,18 +13003,19 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
                 return TRUE;
             }
 
                 return TRUE;
             }
 
-            /* Here, the only legal thing would be a multiple character
-             * sequence (of the form "\N{U+c1.c2. ... }".   So the next
-             * character must be a dot (and the one after that can't be the
-             * endbrace, or we'd have something like \N{U+100.} ) */
+            /* Here, the parse stopped bfore the ending brace.  This is legal
+             * only if that character is a dot separating code points, like a
+             * multiple character sequence (of the form "\N{U+c1.c2. ... }".
+             * So the next character must be a dot (and the one after that
+             * can't be the endbrace, or we'd have something like \N{U+100.} )
+             * */
             if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
             if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
-                                ? UTF8SKIP(RExC_parse)
-                                : 1;
-                if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
-                    RExC_parse = endbrace;
-                }
-                vFAIL("Invalid hexadecimal number in \\N{U+...}");
+                              ? UTF8SKIP(RExC_parse)
+                              : 1;
+                RExC_parse = MIN(endbrace, RExC_parse);/* Guard against
+                                                          malformed utf8 */
+                goto bad_NU;
             }
 
             /* Here, looks like its really a multiple character sequence.  Fail
             }
 
             /* Here, looks like its really a multiple character sequence.  Fail
@@ -13046,7 +13033,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
              * but go through the motions of code point counting and error
              * checking, if the caller doesn't want a node returned. */
 
              * but go through the motions of code point counting and error
              * checking, if the caller doesn't want a node returned. */
 
-            if (node_p && count == 1) {
+            if (node_p && ! substitute_parse) {
                 substitute_parse = newSVpvs("?:");
             }
 
                 substitute_parse = newSVpvs("?:");
             }
 
@@ -13962,6 +13949,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                        || ! is_PATWS_safe((p), RExC_end, UTF));
 
                switch ((U8)*p) {
                        || ! is_PATWS_safe((p), RExC_end, UTF));
 
                switch ((U8)*p) {
+                  const char* message;
+                  U32 packed_warn;
+                  U8 grok_c_char;
+
                case '^':
                case '$':
                case '.':
                case '^':
                case '$':
                case '.':
@@ -14077,67 +14068,70 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                        p++;
                        break;
                    case 'o':
                        p++;
                        break;
                    case 'o':
-                       {
-                           UV result;
-                           const char* error_msg;
-
-                           bool valid = grok_bslash_o(&p,
-                                                       RExC_end,
-                                                      &result,
-                                                      &error_msg,
-                                                      TO_OUTPUT_WARNINGS(p),
-                                                       (bool) RExC_strict,
-                                                       TRUE, /* Output warnings
-                                                                for non-
-                                                                portables */
-                                                       UTF);
-                           if (! valid) {
-                               RExC_parse = p; /* going to die anyway; point
-                                                  to exact spot of failure */
-                               vFAIL(error_msg);
-                           }
-                            UPDATE_WARNINGS_LOC(p - 1);
-                            ender = result;
-                           break;
-                       }
+                        if (! grok_bslash_o(&p,
+                                            RExC_end,
+                                            &ender,
+                                            &message,
+                                            &packed_warn,
+                                            (bool) RExC_strict,
+                                            FALSE, /* No illegal cp's */
+                                            UTF))
+                        {
+                            RExC_parse = p; /* going to die anyway; point to
+                                               exact spot of failure */
+                            vFAIL(message);
+                        }
+
+                        if (message && TO_OUTPUT_WARNINGS(p)) {
+                            warn_non_literal_string(p, packed_warn, message);
+                        }
+                        break;
                    case 'x':
                    case 'x':
-                       {
-                            UV result = UV_MAX; /* initialize to erroneous
-                                                   value */
-                           const char* error_msg;
-
-                           bool valid = grok_bslash_x(&p,
-                                                       RExC_end,
-                                                      &result,
-                                                      &error_msg,
-                                                       TO_OUTPUT_WARNINGS(p),
-                                                       (bool) RExC_strict,
-                                                       TRUE, /* Silence warnings
-                                                                for non-
-                                                                portables */
-                                                       UTF);
-                           if (! valid) {
-                               RExC_parse = p; /* going to die anyway; point
-                                                  to exact spot of failure */
-                               vFAIL(error_msg);
-                           }
-                            UPDATE_WARNINGS_LOC(p - 1);
-                            ender = result;
+                        if (! grok_bslash_x(&p,
+                                            RExC_end,
+                                            &ender,
+                                            &message,
+                                            &packed_warn,
+                                            (bool) RExC_strict,
+                                            FALSE, /* No illegal cp's */
+                                            UTF))
+                        {
+                            RExC_parse = p;    /* going to die anyway; point
+                                                   to exact spot of failure */
+                            vFAIL(message);
+                        }
+
+                        if (message && TO_OUTPUT_WARNINGS(p)) {
+                            warn_non_literal_string(p, packed_warn, message);
+                        }
 
 #ifdef EBCDIC
 
 #ifdef EBCDIC
-                            if (ender < 0x100) {
-                                if (RExC_recode_x_to_native) {
-                                    ender = LATIN1_TO_NATIVE(ender);
-                                }
-                           }
+                        if (ender < 0x100) {
+                            if (RExC_recode_x_to_native) {
+                                ender = LATIN1_TO_NATIVE(ender);
+                            }
+                        }
 #endif
 #endif
-                           break;
-                       }
+                        break;
                    case 'c':
                    case 'c':
-                       p++;
-                       ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
-                        UPDATE_WARNINGS_LOC(p);
                         p++;
                         p++;
+                        if (! grok_bslash_c(*p, &grok_c_char,
+                                            &message, &packed_warn))
+                        {
+                            /* going to die anyway; point to exact spot of
+                             * failure */
+                            RExC_parse = p + ((UTF)
+                                              ? UTF8_SAFE_SKIP(p, RExC_end)
+                                              : 1);
+                            vFAIL(message);
+                        }
+
+                        ender = grok_c_char;
+                        p++;
+                        if (message && TO_OUTPUT_WARNINGS(p)) {
+                            warn_non_literal_string(p, packed_warn, message);
+                        }
+
                        break;
                     case '8': case '9': /* must be a backreference */
                         --p;
                        break;
                     case '8': case '9': /* must be a backreference */
                         --p;
@@ -14172,17 +14166,19 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         /* FALLTHROUGH */
                     case '0':
                        {
                         /* FALLTHROUGH */
                     case '0':
                        {
-                           I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
+                           I32 flags = PERL_SCAN_SILENT_ILLDIGIT
+                                      | PERL_SCAN_NOTIFY_ILLDIGIT;
                            STRLEN numlen = 3;
                            ender = grok_oct(p, &numlen, &flags, NULL);
                            p += numlen;
                            STRLEN numlen = 3;
                            ender = grok_oct(p, &numlen, &flags, NULL);
                            p += numlen;
-                            if (   isDIGIT(*p)  /* like \08, \178 */
-                                && ckWARN(WARN_REGEXP)
-                                && numlen < 3)
+                            if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
+                                && isDIGIT(*p)  /* like \08, \178 */
+                                && ckWARN(WARN_REGEXP))
                             {
                                reg_warn_non_literal_string(
                             {
                                reg_warn_non_literal_string(
-                                         p + 1,
-                                         form_short_octal_warning(p, numlen));
+                                     p + 1,
+                                     form_alien_digit_msg(8, numlen, p,
+                                                        RExC_end, UTF, FALSE));
                             }
                        }
                        break;
                             }
                        }
                        break;
@@ -14259,6 +14255,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
                 if (ender > 255) {
                     REQUIRE_UTF8(flagp);
 
                 if (ender > 255) {
                     REQUIRE_UTF8(flagp);
+                    if (   UNICODE_IS_PERL_EXTENDED(ender)
+                        && TO_OUTPUT_WARNINGS(p))
+                    {
+                        ckWARN2_non_literal_string(p,
+                                                   packWARN(WARN_PORTABLE),
+                                                   PL_extended_cp_format,
+                                                   ender);
+                    }
                 }
 
                 /* We need to check if the next non-ignored thing is a
                 }
 
                 /* We need to check if the next non-ignored thing is a
@@ -14587,8 +14591,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             else if (FOLD) {
                 bool splittable = FALSE;
                 bool backed_up = FALSE;
             else if (FOLD) {
                 bool splittable = FALSE;
                 bool backed_up = FALSE;
-                char * e;
-                char * s_start;
+                char * e;       /* should this be U8? */
+                char * s_start; /* should this be U8? */
 
                 /* Here is /i.  Running out of room creates a problem if we are
                  * folding, and the split happens in the middle of a
 
                 /* Here is /i.  Running out of room creates a problem if we are
                  * folding, and the split happens in the middle of a
@@ -14905,7 +14909,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     if (   ender != LATIN_SMALL_LETTER_SHARP_S
                         || ASCII_FOLD_RESTRICTED)
                     {
                     if (   ender != LATIN_SMALL_LETTER_SHARP_S
                         || ASCII_FOLD_RESTRICTED)
                     {
-                        *e++ = toLOWER_L1(ender);
+                        assert( toLOWER_L1(ender) < 256 );
+                        *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
                     }
                     else {
                         *e++ = 's';
                     }
                     else {
                         *e++ = 's';
@@ -14923,7 +14928,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             if (   UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
                                 || ASCII_FOLD_RESTRICTED)
                             {
                             if (   UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
                                 || ASCII_FOLD_RESTRICTED)
                             {
-                                *e++ = toLOWER_L1(ender);
+                                assert( toLOWER_L1(ender) < 256 );
+                                *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
                             }
                             else {
                                 *e++ = 's';
                             }
                             else {
                                 *e++ = 's';
@@ -16232,7 +16238,10 @@ redo_curchar:
                     && UCHARAT(RExC_parse + 1) == '?'
                     && UCHARAT(RExC_parse + 2) == '^')
                 {
                     && UCHARAT(RExC_parse + 1) == '?'
                     && UCHARAT(RExC_parse + 2) == '^')
                 {
-                    /* If is a '(?', could be an embedded '(?^flags:(?[...])'.
+                    const regnode_offset orig_emit = RExC_emit;
+                    SV * resultant_invlist;
+
+                    /* If is a '(?^', could be an embedded '(?^flags:(?[...])'.
                      * This happens when we have some thing like
                      *
                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
                      * This happens when we have some thing like
                      *
                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
@@ -16241,62 +16250,33 @@ redo_curchar:
                      *
                      * Here we would be handling the interpolated
                      * '$thai_or_lao'.  We handle this by a recursive call to
                      *
                      * Here we would be handling the interpolated
                      * '$thai_or_lao'.  We handle this by a recursive call to
-                     * ourselves which returns the inversion list the
-                     * interpolated expression evaluates to.  We use the flags
-                     * from the interpolated pattern. */
-                    U32 save_flags = RExC_flags;
-                    const char * save_parse;
-
-                    RExC_parse += 2;        /* Skip past the '(?' */
-                    save_parse = RExC_parse;
-
-                    /* Parse the flags for the '(?'.  We already know the first
-                     * flag to parse is a '^' */
-                    parse_lparen_question_flags(pRExC_state);
-
-                    if (   RExC_parse >= RExC_end - 4
-                        || UCHARAT(RExC_parse) != ':'
-                        || UCHARAT(++RExC_parse) != '('
-                        || UCHARAT(++RExC_parse) != '?'
-                        || UCHARAT(++RExC_parse) != '[')
-                    {
+                     * reg which returns the inversion list the
+                     * interpolated expression evaluates to.  Actually, the
+                     * return is a special regnode containing a pointer to that
+                     * inversion list.  If the return isn't that regnode alone,
+                     * we know that this wasn't such an interpolation, which is
+                     * an error: we need to get a single inversion list back
+                     * from the recursion */
 
 
-                        /* In combination with the above, this moves the
-                         * pointer to the point just after the first erroneous
-                         * character. */
-                        if (RExC_parse >= RExC_end - 4) {
-                            RExC_parse = RExC_end;
-                        }
-                        else if (RExC_parse != save_parse) {
-                            RExC_parse += (UTF)
-                                          ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
-                                          : 1;
-                        }
-                        vFAIL("Expecting '(?flags:(?[...'");
-                    }
-
-                    /* Recurse, with the meat of the embedded expression */
                     RExC_parse++;
                     RExC_parse++;
-                    if (! handle_regex_sets(pRExC_state, &current, flagp,
-                                                    depth+1, oregcomp_parse))
-                    {
-                        RETURN_FAIL_ON_RESTART(*flagp, flagp);
-                    }
+                    RExC_sets_depth++;
 
 
-                    /* Here, 'current' contains the embedded expression's
-                     * inversion list, and RExC_parse points to the trailing
-                     * ']'; the next character should be the ')' */
-                    RExC_parse++;
-                    if (UCHARAT(RExC_parse) != ')')
-                        vFAIL("Expecting close paren for nested extended charclass");
+                   node = reg(pRExC_state, 2, flagp, depth+1);
+                    RETURN_FAIL_ON_RESTART(*flagp, flagp);
 
 
-                    /* Then the ')' matching the original '(' handled by this
-                     * case: statement */
-                    RExC_parse++;
-                    if (UCHARAT(RExC_parse) != ')')
-                        vFAIL("Expecting close paren for wrapper for nested extended charclass");
+                    if (   OP(REGNODE_p(node)) != REGEX_SET
+                           /* If more than a single node returned, the nested
+                            * parens evaluated to more than just a (?[...]),
+                            * which isn't legal */
+                        || node != 1) {
+                        vFAIL("Expecting interpolated extended charclass");
+                    }
+                    resultant_invlist = (SV *) ARGp(REGNODE_p(node));
+                    current = invlist_clone(resultant_invlist, NULL);
+                    SvREFCNT_dec(resultant_invlist);
 
 
-                    RExC_flags = save_flags;
+                    RExC_sets_depth--;
+                    RExC_emit = orig_emit;
                     goto handle_operand;
                 }
 
                     goto handle_operand;
                 }
 
@@ -16684,6 +16664,13 @@ redo_curchar:
         return END;
     }
 
         return END;
     }
 
+    if (RExC_sets_depth) {  /* If within a recursive call, return in a special
+                               regnode */
+        RExC_parse++;
+        node = regpnode(pRExC_state, REGEX_SET, (void *) final);
+    }
+    else {
+
     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
      * expecting a string of ranges and individual code points */
     invlist_iterinit(final);
     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
      * expecting a string of ranges and individual code points */
     invlist_iterinit(final);
@@ -16767,6 +16754,7 @@ redo_curchar:
         ANYOF_FLAGS(REGNODE_p(node))
                 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
     }
         ANYOF_FLAGS(REGNODE_p(node))
                 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
     }
+    }
 
     nextchar(pRExC_state);
     Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
 
     nextchar(pRExC_state);
     Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
@@ -16934,6 +16922,7 @@ S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
 
     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
 
     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
+        CLEAR_POSIX_WARNINGS();
         return;
     }
 
         return;
     }
 
@@ -17364,6 +17353,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              * is already in 'value'.  Otherwise, need to translate the escape
              * into what it signifies. */
             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
              * is already in 'value'.  Otherwise, need to translate the escape
              * into what it signifies. */
             if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
+                const char * message;
+                U32 packed_warn;
+                U8 grok_c_char;
 
            case 'w':   namedclass = ANYOF_WORDCHAR;    break;
            case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
 
            case 'w':   namedclass = ANYOF_WORDCHAR;    break;
            case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
@@ -17614,53 +17606,74 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
            case 'a':   value = '\a';                   break;
            case 'o':
                RExC_parse--;   /* function expects to be pointed at the 'o' */
            case 'a':   value = '\a';                   break;
            case 'o':
                RExC_parse--;   /* function expects to be pointed at the 'o' */
-               {
-                   const char* error_msg;
-                   bool valid = grok_bslash_o(&RExC_parse,
-                                               RExC_end,
-                                              &value,
-                                              &error_msg,
-                                               TO_OUTPUT_WARNINGS(RExC_parse),
-                                               strict,
-                                               silence_non_portable,
-                                               UTF);
-                   if (! valid) {
-                       vFAIL(error_msg);
-                   }
-                    UPDATE_WARNINGS_LOC(RExC_parse - 1);
-               }
-                non_portable_endpoint++;
+                if (! grok_bslash_o(&RExC_parse,
+                                            RExC_end,
+                                            &value,
+                                            &message,
+                                            &packed_warn,
+                                            strict,
+                                            cBOOL(range), /* MAX_UV allowed for range
+                                                      upper limit */
+                                            UTF))
+                {
+                    vFAIL(message);
+                }
+                else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+                    warn_non_literal_string(RExC_parse, packed_warn, message);
+                }
+
+                if (value < 256) {
+                    non_portable_endpoint++;
+                }
                break;
            case 'x':
                RExC_parse--;   /* function expects to be pointed at the 'x' */
                break;
            case 'x':
                RExC_parse--;   /* function expects to be pointed at the 'x' */
-               {
-                   const char* error_msg;
-                   bool valid = grok_bslash_x(&RExC_parse,
-                                               RExC_end,
-                                              &value,
-                                              &error_msg,
-                                              TO_OUTPUT_WARNINGS(RExC_parse),
-                                               strict,
-                                               silence_non_portable,
-                                               UTF);
-                    if (! valid) {
-                       vFAIL(error_msg);
-                   }
-                    UPDATE_WARNINGS_LOC(RExC_parse - 1);
-               }
-                non_portable_endpoint++;
+                if (!  grok_bslash_x(&RExC_parse,
+                                            RExC_end,
+                                            &value,
+                                            &message,
+                                            &packed_warn,
+                                            strict,
+                                            cBOOL(range), /* MAX_UV allowed for range
+                                                      upper limit */
+                                            UTF))
+                {
+                    vFAIL(message);
+                }
+                else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+                    warn_non_literal_string(RExC_parse, packed_warn, message);
+                }
+
+                if (value < 256) {
+                    non_portable_endpoint++;
+                }
                break;
            case 'c':
                break;
            case 'c':
-               value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
-                UPDATE_WARNINGS_LOC(RExC_parse);
-               RExC_parse++;
+                if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
+                                                                &packed_warn))
+                {
+                    /* going to die anyway; point to exact spot of
+                        * failure */
+                    RExC_parse += (UTF)
+                                  ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+                                  : 1;
+                    vFAIL(message);
+                }
+
+                value = grok_c_char;
+                RExC_parse++;
+                if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+                    warn_non_literal_string(RExC_parse, packed_warn, message);
+                }
+
                 non_portable_endpoint++;
                break;
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7':
                {
                    /* Take 1-3 octal digits */
                 non_portable_endpoint++;
                break;
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7':
                {
                    /* Take 1-3 octal digits */
-                   I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
+                   I32 flags = PERL_SCAN_SILENT_ILLDIGIT
+                              | PERL_SCAN_NOTIFY_ILLDIGIT;
                     numlen = (strict) ? 4 : 3;
                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
                    RExC_parse += numlen;
                     numlen = (strict) ? 4 : 3;
                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
                    RExC_parse += numlen;
@@ -17671,17 +17684,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                           : 1;
                             vFAIL("Need exactly 3 octal digits");
                         }
                                           : 1;
                             vFAIL("Need exactly 3 octal digits");
                         }
-                        else if (   numlen < 3 /* like \08, \178 */
+                        else if (  (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
                                  && RExC_parse < RExC_end
                                  && isDIGIT(*RExC_parse)
                                  && ckWARN(WARN_REGEXP))
                         {
                             reg_warn_non_literal_string(
                                  RExC_parse + 1,
                                  && RExC_parse < RExC_end
                                  && isDIGIT(*RExC_parse)
                                  && ckWARN(WARN_REGEXP))
                         {
                             reg_warn_non_literal_string(
                                  RExC_parse + 1,
-                                 form_short_octal_warning(RExC_parse, numlen));
+                                 form_alien_digit_msg(8, numlen, RExC_parse,
+                                                        RExC_end, UTF, FALSE));
                         }
                     }
                         }
                     }
-                    non_portable_endpoint++;
+                    if (value < 256) {
+                        non_portable_endpoint++;
+                    }
                    break;
                }
            default:
                    break;
                }
            default:
@@ -17943,7 +17959,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
        /* non-Latin1 code point implies unicode semantics. */
        if (value > 255) {
 
        /* non-Latin1 code point implies unicode semantics. */
        if (value > 255) {
+            if (value > MAX_LEGAL_CP && (   value != UV_MAX
+                                         || prevvalue > MAX_LEGAL_CP))
+            {
+                vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
+            }
             REQUIRE_UNI_RULES(flagp, 0);
             REQUIRE_UNI_RULES(flagp, 0);
+            if (  ! silence_non_portable
+                &&  UNICODE_IS_PERL_EXTENDED(value)
+                &&  TO_OUTPUT_WARNINGS(RExC_parse))
+            {
+                ckWARN2_non_literal_string(RExC_parse,
+                                           packWARN(WARN_PORTABLE),
+                                           PL_extended_cp_format,
+                                           value);
+            }
        }
 
         /* Ready to process either the single value, or the completed range.
        }
 
         /* Ready to process either the single value, or the completed range.
@@ -18668,17 +18698,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
        invert = FALSE;
     }
 
        invert = FALSE;
     }
 
+    /* All possible optimizations below still have these characteristics.
+     * (Multi-char folds aren't SIMPLE, but they don't get this far in this
+     * routine) */
+    *flagp |= HASWIDTH|SIMPLE;
+
     if (ret_invlist) {
         *ret_invlist = cp_list;
 
         return RExC_emit;
     }
 
     if (ret_invlist) {
         *ret_invlist = cp_list;
 
         return RExC_emit;
     }
 
-    /* All possible optimizations below still have these characteristics.
-     * (Multi-char folds aren't SIMPLE, but they don't get this far in this
-     * routine) */
-    *flagp |= HASWIDTH|SIMPLE;
-
     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
         RExC_contains_locale = 1;
     }
     if (anyof_flags & ANYOF_LOCALE_FLAGS) {
         RExC_contains_locale = 1;
     }
@@ -19848,10 +19878,11 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
                     /* The data consists of just strings defining user-defined
                      * property names, but in prior incarnations, and perhaps
                      * somehow from pluggable regex engines, it could still
                     /* The data consists of just strings defining user-defined
                      * property names, but in prior incarnations, and perhaps
                      * somehow from pluggable regex engines, it could still
-                     * hold hex code point definitions.  Each component of a
-                     * range would be separated by a tab, and each range by a
-                     * new-line.  If these are found, instead add them to the
-                     * inversion list */
+                     * hold hex code point definitions, all of which should be
+                     * legal (or it wouldn't have gotten this far).  Each
+                     * component of a range would be separated by a tab, and
+                     * each range by a new-line.  If these are found, instead
+                     * add them to the inversion list */
                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
                                      |PERL_SCAN_SILENT_NON_PORTABLE;
                     STRLEN len = remaining;
                     I32 grok_flags =  PERL_SCAN_SILENT_ILLDIGIT
                                      |PERL_SCAN_SILENT_NON_PORTABLE;
                     STRLEN len = remaining;
@@ -20176,6 +20207,22 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
     return(ret);
 }
 
     return(ret);
 }
 
+/*
+- regpnode - emit a temporary node with a void* argument
+*/
+STATIC regnode_offset /* Location. */
+S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, void * arg)
+{
+    const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regvnode");
+    regnode_offset ptr = ret;
+
+    PERL_ARGS_ASSERT_REGPNODE;
+
+    FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
+    RExC_emit = ptr;
+    return(ret);
+}
+
 STATIC regnode_offset
 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
 {
 STATIC regnode_offset
 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
 {
@@ -23163,9 +23210,10 @@ Perl_parse_uniprop_string(pTHX_
     dVAR;
     char* lookup_name;          /* normalized name for lookup in our tables */
     unsigned lookup_len;        /* Its length */
     dVAR;
     char* lookup_name;          /* normalized name for lookup in our tables */
     unsigned lookup_len;        /* Its length */
-    bool stricter = FALSE;      /* Some properties have stricter name
-                                   normalization rules, which we decide upon
-                                   based on parsing */
+    enum { Not_Strict = 0,      /* Some properties have stricter name */
+           Strict,              /* normalization rules, which we decide */
+           As_Is                /* upon based on parsing */
+         } stricter = Not_Strict;
 
     /* nv= or numeric_value=, or possibly one of the cjk numeric properties
      * (though it requires extra effort to download them from Unicode and
 
     /* nv= or numeric_value=, or possibly one of the cjk numeric properties
      * (though it requires extra effort to download them from Unicode and
@@ -23316,7 +23364,7 @@ Perl_parse_uniprop_string(pTHX_
      * or are positioned just after the '=' if it is compound. */
 
     if (equals_pos >= 0) {
      * or are positioned just after the '=' if it is compound. */
 
     if (equals_pos >= 0) {
-        assert(! stricter); /* We shouldn't have set this yet */
+        assert(stricter == Not_Strict); /* We shouldn't have set this yet */
 
         /* Space immediately after the '=' is ignored */
         i++;
 
         /* Space immediately after the '=' is ignored */
         i++;
@@ -23495,6 +23543,93 @@ Perl_parse_uniprop_string(pTHX_
              * some constructs in their subpattern, like \A. */
         } /* End of is a wildcard subppattern */
 
              * some constructs in their subpattern, like \A. */
         } /* End of is a wildcard subppattern */
 
+        /* \p{name=...} is handled specially.  Instead of using the normal
+         * mechanism involving charclass_invlists.h, it uses _charnames.pm
+         * which has the necessary (huge) data accessible to it, and which
+         * doesn't get loaded unless necessary.  The legal syntax for names is
+         * somewhat different than other properties due both to the vagaries of
+         * a few outlier official names, and the fact that only a few ASCII
+         * characters are permitted in them */
+        if (   memEQs(lookup_name, j - 1, "name")
+            || memEQs(lookup_name, j - 1, "na"))
+        {
+            dSP;
+            HV * table;
+            SV * character;
+            const char * error_msg;
+            CV* lookup_loose;
+            SV * character_name;
+            STRLEN character_len;
+            UV cp;
+
+            stricter = As_Is;
+
+            /* Since the RHS (after skipping initial space) is passed unchanged
+             * to charnames, and there are different criteria for what are
+             * legal characters in the name, just parse it here.  A character
+             * name must begin with an ASCII alphabetic */
+            if (! isALPHA(name[i])) {
+                goto failed;
+            }
+            lookup_name[j++] = name[i];
+
+            for (++i; i < name_len; i++) {
+                /* Official names can only be in the ASCII range, and only
+                 * certain characters */
+                if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
+                    goto failed;
+                }
+                lookup_name[j++] = name[i];
+            }
+
+            /* Finished parsing, save the name into an SV */
+            character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
+
+            /* Make sure _charnames is loaded.  (The parameters give context
+             * for any errors generated */
+            table = load_charnames(character_name, name, name_len, &error_msg);
+            if (table == NULL) {
+                sv_catpv(msg, error_msg);
+                goto append_name_to_msg;
+            }
+
+            lookup_loose = get_cv("_charnames::_loose_regcomp_lookup", 0);
+            if (! lookup_loose) {
+                Perl_croak(aTHX_
+                       "panic: Can't find '_charnames::_loose_regcomp_lookup");
+            }
+
+            PUSHSTACKi(PERLSI_OVERLOAD);
+            ENTER ;
+            SAVETMPS;
+
+            PUSHMARK(SP) ;
+            XPUSHs(character_name);
+            PUTBACK;
+            call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
+
+            SPAGAIN ;
+
+            character = POPs;
+            SvREFCNT_inc_simple_void_NN(character);
+
+            PUTBACK ;
+            FREETMPS ;
+            LEAVE ;
+            POPSTACK;
+
+            if (! SvOK(character)) {
+                goto failed;
+            }
+
+            cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
+            if (character_len < SvCUR(character)) {
+                goto failed;
+            }
+
+            prop_definition = add_cp_to_invlist(NULL, cp);
+            return prop_definition;
+        }
 
         /* Certain properties whose values are numeric need special handling.
          * They may optionally be prefixed by 'is'.  Ignore that prefix for the
 
         /* Certain properties whose values are numeric need special handling.
          * They may optionally be prefixed by 'is'.  Ignore that prefix for the
@@ -23547,12 +23682,12 @@ Perl_parse_uniprop_string(pTHX_
              * But the numeric type properties can have the alphas [Ee] to
              * signify an exponent, and it is still a number with stricter
              * rules.  So look for an alpha that signifies not-strict */
              * But the numeric type properties can have the alphas [Ee] to
              * signify an exponent, and it is still a number with stricter
              * rules.  So look for an alpha that signifies not-strict */
-            stricter = TRUE;
+            stricter = Strict;
             for (k = i; k < name_len; k++) {
                 if (   isALPHA_A(name[k])
                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
                 {
             for (k = i; k < name_len; k++) {
                 if (   isALPHA_A(name[k])
                     && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
                 {
-                    stricter = FALSE;
+                    stricter = Not_Strict;
                     break;
                 }
             }
                     break;
                 }
             }
@@ -23590,7 +23725,7 @@ Perl_parse_uniprop_string(pTHX_
             && memNEs(lookup_name + 4, j - 4, "space")
             && memNEs(lookup_name + 4, j - 4, "word"))
         {
             && memNEs(lookup_name + 4, j - 4, "space")
             && memNEs(lookup_name + 4, j - 4, "word"))
         {
-            stricter = TRUE;
+            stricter = Strict;
 
             /* We set the inputs back to 0 and the code below will reparse,
              * using strict */
 
             /* We set the inputs back to 0 and the code below will reparse,
              * using strict */
@@ -23938,6 +24073,7 @@ Perl_parse_uniprop_string(pTHX_
              * but not yet used. */
             save_item(PL_subname);
 
              * but not yet used. */
             save_item(PL_subname);
 
+            /* G_SCALAR guarantees a single return value */
             (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
 
             SPAGAIN;
             (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
 
             SPAGAIN;
@@ -23965,7 +24101,7 @@ Perl_parse_uniprop_string(pTHX_
                 (void) POPs;
                 prop_definition = NULL;
             }
                 (void) POPs;
                 prop_definition = NULL;
             }
-            else {  /* G_SCALAR guarantees a single return value */
+            else {
                 SV * contents = POPs;
 
                 /* The contents is supposed to be the expansion of the property
                 SV * contents = POPs;
 
                 /* The contents is supposed to be the expansion of the property