This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: White-space only
[perl5.git] / regcomp.c
index 82be641..059745d 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -86,9 +86,8 @@ EXTERN_C const struct regexp_engine my_reg_engine;
 #  include "regcomp.h"
 #endif
 
-#include "dquote_static.c"
-#include "charclass_invlists.h"
-#include "inline_invlist.c"
+#include "dquote_inline.h"
+#include "invlist_inline.h"
 #include "unicode_constants.h"
 
 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
@@ -181,6 +180,9 @@ struct RExC_state_t {
     I32                contains_locale;
     I32                contains_i;
     I32                override_recoding;
+#ifdef EBCDIC
+    I32                recode_x_to_native;
+#endif
     I32                in_multi_char_class;
     struct reg_code_block *code_blocks;        /* positions of literal (?{})
                                            within pattern */
@@ -212,6 +214,7 @@ struct RExC_state_t {
 #define RExC_mysv2     (pRExC_state->mysv2)
 
 #endif
+    bool        seen_unfolded_sharp_s;
 };
 
 #define RExC_flags     (pRExC_state->flags)
@@ -224,6 +227,17 @@ struct RExC_state_t {
 #define RExC_end       (pRExC_state->end)
 #define RExC_parse     (pRExC_state->parse)
 #define RExC_whilem_seen       (pRExC_state->whilem_seen)
+
+/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
+ * EXACTF node, hence was parsed under /di rules.  If later in the parse,
+ * something forces the pattern into using /ui rules, the sharp s should be
+ * folded into the sequence 'ss', which takes up more space than previously
+ * calculated.  This means that the sizing pass needs to be restarted.  (The
+ * node also becomes an EXACTFU_SS.)  For all other characters, an EXACTF node
+ * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
+ * so there is no need to resize [perl #125990]. */
+#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
+
 #ifdef RE_TRACK_PATTERN_OFFSETS
 #define RExC_offsets   (pRExC_state->rxi->u.offsets) /* I am not like the
                                                          others */
@@ -256,6 +270,9 @@ struct RExC_state_t {
 #define RExC_contains_locale   (pRExC_state->contains_locale)
 #define RExC_contains_i (pRExC_state->contains_i)
 #define RExC_override_recoding (pRExC_state->override_recoding)
+#ifdef EBCDIC
+#   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
+#endif
 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 #define RExC_frame_head (pRExC_state->frame_head)
 #define RExC_frame_last (pRExC_state->frame_last)
@@ -294,7 +311,9 @@ struct RExC_state_t {
 #define        SPSTART         0x04    /* Starts with * or + */
 #define POSTPONED      0x08    /* (?1),(?&name), (??{...}) or similar */
 #define TRYAGAIN       0x10    /* Weeded out a declaration. */
-#define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
+#define RESTART_PASS1   0x20    /* Need to restart sizing pass */
+#define NEED_UTF8       0x40    /* In conjunction with RESTART_PASS1, need to
+                                   calcuate sizes as UTF-8 */
 
 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 
@@ -313,12 +332,30 @@ struct RExC_state_t {
 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 
-#define REQUIRE_UTF8   STMT_START {                                       \
+#define REQUIRE_UTF8(flagp) STMT_START {                                   \
                                      if (!UTF) {                           \
-                                         *flagp = RESTART_UTF8;            \
+                                         assert(PASS1);                    \
+                                         *flagp = RESTART_PASS1|NEED_UTF8; \
                                          return NULL;                      \
                                      }                                     \
-                        } STMT_END
+                             } STMT_END
+
+/* Change from /d into /u rules, and restart the parse if we've already seen
+ * something whose size would increase as a result, by setting *flagp and
+ * returning 'restart_retval'.  RExC_uni_semantics is a flag that indicates
+ * we've change to /u during the parse.  */
+#define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
+    STMT_START {                                                            \
+            if (DEPENDS_SEMANTICS) {                                        \
+                assert(PASS1);                                              \
+                set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
+                RExC_uni_semantics = 1;                                     \
+                if (RExC_seen_unfolded_sharp_s) {                           \
+                    *flagp |= RESTART_PASS1;                                \
+                    return restart_retval;                                  \
+                }                                                           \
+            }                                                               \
+    } STMT_END
 
 /* This converts the named class defined in regcomp.h to its equivalent class
  * number defined in handy.h. */
@@ -624,7 +661,7 @@ static const scan_data_t zero_scan_data =
 } STMT_END
 
 /* A specialized version of vFAIL2 that works with UTF8f */
-#define vFAIL2utf8f(m, a1) STMT_START { \
+#define vFAIL2utf8f(m, a1) STMT_START {            \
     const IV offset = RExC_parse - RExC_precomp;   \
     if (!SIZE_ONLY)                                \
         SAVEFREESV(RExC_rx_sv);                    \
@@ -632,6 +669,14 @@ static const scan_data_t zero_scan_data =
             REPORT_LOCATION_ARGS(offset));         \
 } STMT_END
 
+#define vFAIL3utf8f(m, a1, a2) STMT_START {             \
+    const IV offset = RExC_parse - RExC_precomp;        \
+    if (!SIZE_ONLY)                                     \
+        SAVEFREESV(RExC_rx_sv);                         \
+    S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
+            REPORT_LOCATION_ARGS(offset));              \
+} STMT_END
+
 /* These have asserts in them because of [perl #122671] Many warnings in
  * regcomp.c can occur twice.  If they get output in pass1 and later in that
  * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
@@ -803,9 +848,6 @@ static const scan_data_t zero_scan_data =
             if (RExC_seen & REG_GPOS_SEEN)                                  \
                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
                                                                             \
-            if (RExC_seen & REG_CANY_SEEN)                                  \
-                PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
-                                                                            \
             if (RExC_seen & REG_RECURSE_SEEN)                               \
                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
                                                                             \
@@ -1175,7 +1217,9 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
 
     /* If this can match all upper Latin1 code points, have to add them
      * as well */
-    if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
+    if (OP(node) == ANYOFD
+        && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+    {
         _invlist_union(invlist, PL_UpperLatin1, &invlist);
     }
 
@@ -1253,12 +1297,19 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
          * that should be; while the consequences for having /l bugs is
          * incorrect matches */
         if (ssc_is_anything((regnode_ssc *)and_with)) {
-            anded_flags |= ANYOF_WARN_SUPER;
+            anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
         }
     }
     else {
         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
-        anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
+        if (OP(and_with) == ANYOFD) {
+            anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
+        }
+        else {
+            anded_flags = ANYOF_FLAGS(and_with)
+            &( ANYOF_COMMON_FLAGS
+              |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER);
+        }
     }
 
     ANYOF_FLAGS(ssc) &= anded_flags;
@@ -1409,6 +1460,11 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
     else {
         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
+        if (OP(or_with) != ANYOFD) {
+            ored_flags
+            |= ANYOF_FLAGS(or_with)
+             & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+        }
     }
 
     ANYOF_FLAGS(ssc) |= ored_flags;
@@ -1607,7 +1663,9 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
     /* The code in this file assumes that all but these flags aren't relevant
      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
      * by the time we reach here */
-    assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
+    assert(! (ANYOF_FLAGS(ssc)
+        & ~( ANYOF_COMMON_FLAGS
+            |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)));
 
     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
 
@@ -1621,6 +1679,10 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
     }
 
+    if (RExC_contains_locale) {
+        OP(ssc) = ANYOFL;
+    }
+
     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
 }
 
@@ -1999,7 +2061,7 @@ is the recommended Unicode-aware way of saying
 #define TRIE_STORE_REVCHAR(val)                                            \
     STMT_START {                                                           \
        if (UTF) {                                                         \
-            SV *zlopp = newSV(7); /* XXX: optimize me */                   \
+            SV *zlopp = newSV(UTF8_MAXBYTES);                             \
            unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
            SvCUR_set(zlopp, kapow - flrbbbbb);                            \
@@ -3650,6 +3712,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
              * this function, we need to flag any occurrences of the sharp s.
              * This character forbids trie formation (because of added
              * complexity) */
+#if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
+   || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
+                                      || UNICODE_DOT_DOT_VERSION > 0)
            while (s < s_end) {
                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
                     OP(scan) = EXACTFA_NO_TRIE;
@@ -3657,7 +3722,6 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
                     break;
                 }
                 s++;
-                continue;
             }
         }
        else {
@@ -3703,6 +3767,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
                 *min_subtract += len - 1;
                 s += len;
            }
+#endif
        }
     }
 
@@ -5064,7 +5129,6 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
                                                                      OP(scan));
 #endif
-               case CANY:
                case SANY:
                    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
                        ssc_match_all_cp(data->start_class);
@@ -5093,6 +5157,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
                    }
                    break;
 
+                case ANYOFD:
                 case ANYOFL:
                 case ANYOF:
                    if (flags & SCF_DO_STCLASS_AND)
@@ -5638,7 +5703,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
         }
         return final_minlen;
     }
-    NOT_REACHED;
+    NOT_REACHED; /* NOTREACHED */
 }
 
 STATIC U32
@@ -6175,6 +6240,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
 
        ENTER;
        SAVETMPS;
+       save_re_context();
        PUSHSTACKi(PERLSI_REQUIRE);
         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
          * parsing qr''; normally only q'' does this. It also alters
@@ -6541,7 +6607,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
 
     /* ignore the utf8ness if the pattern is 0 length */
     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
+
     RExC_uni_semantics = 0;
+    RExC_seen_unfolded_sharp_s = 0;
     RExC_contains_locale = 0;
     RExC_contains_i = 0;
     RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
@@ -6562,8 +6630,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         });
 
   redo_first_pass:
-    /* we jump here if we upgrade the pattern to utf8 and have to
-     * recompile */
+    /* we jump here if we have to recompile, e.g., from upgrading the pattern
+     * to utf8 */
 
     if ((pm_flags & PMf_USE_RE_EVAL)
                /* this second condition covers the non-regex literal case,
@@ -6597,7 +6665,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     if (rx_flags & PMf_FOLD) {
         RExC_contains_i = 1;
     }
-    if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
+    if (   initial_charset == REGEX_DEPENDS_CHARSET
+        && (RExC_utf8 ||RExC_uni_semantics))
+    {
 
        /* Set to use unicode semantics if the pattern is in utf8 and has the
         * 'depends' charset specified, as it means unicode when utf8  */
@@ -6609,7 +6679,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     RExC_pm_flags = pm_flags;
 
     if (runtime_code) {
-       if (TAINTING_get && TAINT_get)
+        assert(TAINTING_get || !TAINT_get);
+       if (TAINT_get)
            Perl_croak(aTHX_ "Eval-group in insecure regular expression");
 
        if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
@@ -6630,6 +6701,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
     RExC_extralen = 0;
     RExC_override_recoding = 0;
+#ifdef EBCDIC
+    RExC_recode_x_to_native = 0;
+#endif
     RExC_in_multi_char_class = 0;
 
     /* First pass: determine size, legality. */
@@ -6680,9 +6754,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         at least some part of the pattern, and therefore must convert the whole
         thing.
         -- dmq */
-        if (flags & RESTART_UTF8) {
-            S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
+        if (flags & RESTART_PASS1) {
+            if (flags & NEED_UTF8) {
+                S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
                                     pRExC_state->num_code_blocks);
+            }
+            else {
+                DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+                "Need to redo pass 1\n"));
+            }
+
             goto redo_first_pass;
         }
         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
@@ -6749,7 +6830,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
        for (n = 0; n < pRExC_state->num_code_blocks; n++)
            if (pRExC_state->code_blocks[n].src_regex)
                SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
-       SAVEFREEPV(pRExC_state->code_blocks);
+       if(pRExC_state->code_blocks)
+           SAVEFREEPV(pRExC_state->code_blocks); /* often null */
     }
 
     {
@@ -6764,25 +6846,25 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                     || ! has_charset);
         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
                                                    == REG_RUN_ON_COMMENT_SEEN);
-       U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
+       U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
                            >> RXf_PMf_STD_PMMOD_SHIFT);
        const char *fptr = STD_PAT_MODS;        /*"msixn"*/
        char *p;
-        /* Allocate for the worst case, which is all the std flags are turned
-         * on.  If more precision is desired, we could do a population count of
-         * the flags set.  This could be done with a small lookup table, or by
-         * shifting, masking and adding, or even, when available, assembly
-         * language for a machine-language population count.
-         * We never output a minus, as all those are defaults, so are
+
+        /* We output all the necessary flags; we never output a minus, as all
+         * those are defaults, so are
          * covered by the caret */
        const STRLEN wraplen = plen + has_p + has_runon
             + has_default       /* If needs a caret */
+            + PL_bitcount[reganch] /* 1 char for each set standard flag */
 
                /* If needs a character set specifier */
            + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
-            + (sizeof(STD_PAT_MODS) - 1)
             + (sizeof("(?:)") - 1);
 
+        /* make sure PL_bitcount bounds not exceeded */
+        assert(sizeof(STD_PAT_MODS) <= 8);
+
         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
        r->xpv_len_u.xpvlenu_pv = p;
        if (RExC_utf8)
@@ -7279,8 +7361,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                                                 lookbehind */
     if (pRExC_state->num_code_blocks)
        r->extflags |= RXf_EVAL_SEEN;
-    if (RExC_seen & REG_CANY_SEEN)
-        r->intflags |= PREGf_CANY_SEEN;
     if (RExC_seen & REG_VERBARG_SEEN)
     {
        r->intflags |= PREGf_VERBARG_SEEN;
@@ -7311,7 +7391,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
          * flags appropriately - Yves */
         regnode *first = ri->program + 1;
         U8 fop = OP(first);
-        regnode *next = NEXTOPER(first);
+        regnode *next = regnext(first);
         U8 nop = OP(next);
 
         if (PL_regkind[fop] == NOTHING && nop == END)
@@ -7325,13 +7405,13 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
             r->extflags |= RXf_START_ONLY;
         else if (fop == PLUS
                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
-                 && OP(regnext(first)) == END)
+                 && nop == END)
             r->extflags |= RXf_WHITE;
         else if ( r->extflags & RXf_SPLIT
                   && (fop == EXACT || fop == EXACTL)
                   && STR_LEN(first) == 1
                   && *(STRING(first)) == ' '
-                  && OP(regnext(first)) == END )
+                  && nop == END )
             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
 
     }
@@ -7692,13 +7772,8 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
         sv_setpvn(sv, s, i);
         TAINT_set(oldtainted);
 #endif
-        if ( (rx->intflags & PREGf_CANY_SEEN)
-            ? (RXp_MATCH_UTF8(rx)
-                        && (!i || is_utf8_string((U8*)s, i)))
-            : (RXp_MATCH_UTF8(rx)) )
-        {
+        if (RXp_MATCH_UTF8(rx))
             SvUTF8_on(sv);
-        }
         else
             SvUTF8_off(sv);
         if (TAINTING_get) {
@@ -7892,7 +7967,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
                       (unsigned long) flags);
         }
-        NOT_REACHED; /* NOT REACHED */
+        NOT_REACHED; /* NOTREACHED */
     }
     return NULL;
 }
@@ -7983,7 +8058,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
  * Some of the methods should always be private to the implementation, and some
  * should eventually be made public */
 
-/* The header definitions are in F<inline_invlist.c> */
+/* The header definitions are in F<invlist_inline.h> */
 
 PERL_STATIC_INLINE UV*
 S__invlist_array_init(SV* const invlist, const bool will_have_0)
@@ -9000,7 +9075,13 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
     }
 
     /* Here, can't just append things, create and return a new inversion list
-     * which is the union of this range and the existing inversion list */
+     * which is the union of this range and the existing inversion list.  (If
+     * the new range is well-behaved wrt to the old one, we could just insert
+     * it, doing a Move() down on the tail of the old one (potentially growing
+     * it first).  But to determine that means we would have the extra
+     * (possibly throw-away) work of first finding where the new one goes and
+     * whether it disrupts (splits) an existing range, so it doesn't appear to
+     * me (khw) that it's worth it) */
     range_invlist = _new_invlist(2);
     _append_range_to_invlist(range_invlist, start, end);
 
@@ -9717,9 +9798,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
         ++RExC_parse;
     }
 
-    if (PASS2) {
-        STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
-    }
+    vFAIL("Sequence (?... not terminated");
 }
 
 /*
@@ -9739,10 +9818,10 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
 #endif
 
 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
-   flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
-   needs to be restarted.
-   Otherwise would only return NULL if regbranch() returns NULL, which
-   cannot happen.  */
+   flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
+   needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
+   upgraded to UTF-8.  Otherwise would only return NULL if regbranch() returns
+   NULL, which cannot happen.  */
 STATIC regnode *
 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
@@ -9788,9 +9867,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
            STRLEN verb_len = 0;
            char *start_arg = NULL;
            unsigned char op = 0;
-           int argok = 1;
-            int internal_argval = 0; /* internal_argval is only useful if
-                                        !argok */
+            int arg_required = 0;
+            int internal_argval = -1; /* if >-1 we are not allowed an argument*/
 
             if (has_intervening_patws) {
                 RExC_parse++;
@@ -9832,14 +9910,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             case 'F':  /* (*FAIL) */
                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
                    op = OPFAIL;
-                   argok = 0;
                }
                break;
             case ':':  /* (*:NAME) */
            case 'M':  /* (*MARK:NAME) */
                if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
                     op = MARKPOINT;
-                    argok = -1;
+                    arg_required = 1;
                 }
                 break;
             case 'P':  /* (*PRUNE) */
@@ -9864,36 +9941,30 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     "Unknown verb pattern '%"UTF8f"'",
                     UTF8fARG(UTF, verb_len, start_verb));
            }
-           if ( argok ) {
-                if ( start_arg && internal_argval ) {
-                   vFAIL3("Verb pattern '%.*s' may not have an argument",
-                       verb_len, start_verb);
-               } else if ( argok < 0 && !start_arg ) {
-                    vFAIL3("Verb pattern '%.*s' has a mandatory argument",
-                       verb_len, start_verb);
-               } else {
-                   ret = reganode(pRExC_state, op, internal_argval);
-                   if ( ! internal_argval && ! SIZE_ONLY ) {
-                        if (start_arg) {
-                            SV *sv = newSVpvn( start_arg,
-                                               RExC_parse - start_arg);
-                            ARG(ret) = add_data( pRExC_state,
-                                                 STR_WITH_LEN("S"));
-                            RExC_rxi->data->data[ARG(ret)]=(void*)sv;
-                            ret->flags = 0;
-                        } else {
-                            ret->flags = 1;
-                        }
-                    }
-               }
-               if (!internal_argval)
-                    RExC_seen |= REG_VERBARG_SEEN;
-           } else if ( start_arg ) {
-               vFAIL3("Verb pattern '%.*s' may not have an argument",
-                       verb_len, start_verb);
-           } else {
-               ret = reg_node(pRExC_state, op);
-           }
+            if ( arg_required && !start_arg ) {
+                vFAIL3("Verb pattern '%.*s' has a mandatory argument",
+                    verb_len, start_verb);
+            }
+            if (internal_argval == -1) {
+                ret = reganode(pRExC_state, op, 0);
+            } else {
+                ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
+            }
+            RExC_seen |= REG_VERBARG_SEEN;
+            if ( ! SIZE_ONLY ) {
+                if (start_arg) {
+                    SV *sv = newSVpvn( start_arg,
+                                       RExC_parse - start_arg);
+                    ARG(ret) = add_data( pRExC_state,
+                                         STR_WITH_LEN("S"));
+                    RExC_rxi->data->data[ARG(ret)]=(void*)sv;
+                    ret->flags = 1;
+                } else {
+                    ret->flags = 0;
+                }
+                if ( internal_argval != -1 )
+                    ARG2L_SET(ret, internal_argval);
+            }
            nextchar(pRExC_state);
            return ret;
         }
@@ -10053,10 +10124,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
            case '!':           /* (?!...) */
                RExC_seen_zerolen++;
                /* check if we're really just a "FAIL" assertion */
-               --RExC_parse;
-               nextchar(pRExC_state);
+                skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                        FALSE /* Don't force to /x */ );
                if (*RExC_parse == ')') {
-                   ret=reg_node(pRExC_state, OPFAIL);
+                    ret=reganode(pRExC_state, OPFAIL, 0);
                    nextchar(pRExC_state);
                    return ret;
                }
@@ -10096,14 +10167,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 if (RExC_parse == RExC_end || *RExC_parse != ')')
                     vFAIL("Sequence (?&... not terminated");
                 goto gen_recurse_regop;
-                /* NOT REACHED */
+                /* NOTREACHED */
             case '+':
                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
                     RExC_parse++;
                     vFAIL("Illegal pattern");
                 }
                 goto parse_recursion;
-                /* NOT REACHED*/
+                /* NOTREACHED*/
             case '-': /* (?-1) */
                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
                     RExC_parse--; /* rewind to let it be handled later */
@@ -10122,10 +10193,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         RExC_parse++;
                         is_neg = TRUE;
                     }
-                    unum = grok_atou(RExC_parse, &endptr);
-                    num = (unum > I32_MAX) ? I32_MAX : (I32)unum;
-                    if (endptr)
-                       RExC_parse = (char*)endptr;
+                    if (grok_atoUV(RExC_parse, &unum, &endptr)
+                        && unum <= I32_MAX
+                    ) {
+                        num = (I32)unum;
+                        RExC_parse = (char*)endptr;
+                    } else
+                        num = I32_MAX;
                     if (is_neg) {
                         /* Some limit for num? */
                         num = -num;
@@ -10176,7 +10250,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 nextchar(pRExC_state);
                 return ret;
 
-            /* NOT REACHED */
+            /* NOTREACHED */
 
            case '?':           /* (??...) */
                is_logical = 1;
@@ -10256,9 +10330,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                int is_define= 0;
                 const int DEFINE_len = sizeof("DEFINE") - 1;
                if (RExC_parse[0] == '?') {        /* (?(?...)) */
-                   if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
-                       || RExC_parse[1] == '<'
-                       || RExC_parse[1] == '{') { /* Lookahead or eval. */
+                    if (
+                        RExC_parse[1] == '=' ||
+                        RExC_parse[1] == '!' ||
+                        RExC_parse[1] == '<' ||
+                        RExC_parse[1] == '{'
+                    ) { /* Lookahead or eval. */
                        I32 flag;
                         regnode *tail;
 
@@ -10267,8 +10344,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                            ret->flags = 1;
 
                         tail = reg(pRExC_state, 1, &flag, depth+1);
-                        if (flag & RESTART_UTF8) {
-                            *flagp = RESTART_UTF8;
+                        if (flag & (RESTART_PASS1|NEED_UTF8)) {
+                            *flagp = flag & (RESTART_PASS1|NEED_UTF8);
                             return NULL;
                         }
                         REGTAIL(pRExC_state, ret, tail);
@@ -10309,9 +10386,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                    RExC_parse++;
                    parno = 0;
                    if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
-                       parno = grok_atou(RExC_parse, &endptr);
-                       if (endptr)
+                        UV uv;
+                        if (grok_atoUV(RExC_parse, &uv, &endptr)
+                            && uv <= I32_MAX
+                        ) {
+                            parno = (I32)uv;
                             RExC_parse = (char*)endptr;
+                        }
+                        /* else "Switch condition not recognized" below */
                    } else if (RExC_parse[0] == '&') {
                        SV *sv_dat;
                        RExC_parse++;
@@ -10327,27 +10409,30 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
                     /* (?(1)...) */
                    char c;
-                   char *tmp;
-                   parno = grok_atou(RExC_parse, &endptr);
-                    if (endptr)
-                       RExC_parse = (char*)endptr;
+                    UV uv;
+                    if (grok_atoUV(RExC_parse, &uv, &endptr)
+                        && uv <= I32_MAX
+                    ) {
+                        parno = (I32)uv;
+                        RExC_parse = (char*)endptr;
+                    }
+                    else {
+                        vFAIL("panic: grok_atoUV returned FALSE");
+                    }
                     ret = reganode(pRExC_state, GROUPP, parno);
 
                  insert_if_check_paren:
-                   if (*(tmp = nextchar(pRExC_state)) != ')') {
-                        /* nextchar also skips comments, so undo its work
-                         * and skip over the the next character.
-                         */
-                        RExC_parse = tmp;
+                   if (UCHARAT(RExC_parse) != ')') {
                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
                        vFAIL("Switch condition not recognized");
                    }
+                   nextchar(pRExC_state);
                  insert_if:
                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
                     br = regbranch(pRExC_state, &flags, 1,depth+1);
                    if (br == NULL) {
-                        if (flags & RESTART_UTF8) {
-                            *flagp = RESTART_UTF8;
+                        if (flags & (RESTART_PASS1|NEED_UTF8)) {
+                            *flagp = flags & (RESTART_PASS1|NEED_UTF8);
                             return NULL;
                         }
                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
@@ -10355,7 +10440,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     } else
                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
                                                           LONGJMP, 0));
-                   c = *nextchar(pRExC_state);
+                   c = UCHARAT(RExC_parse);
+                    nextchar(pRExC_state);
                    if (flags&HASWIDTH)
                        *flagp |= HASWIDTH;
                    if (c == '|') {
@@ -10366,8 +10452,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         lastbr = reganode(pRExC_state, IFTHEN, 0);
 
                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
-                            if (flags & RESTART_UTF8) {
-                                *flagp = RESTART_UTF8;
+                            if (flags & (RESTART_PASS1|NEED_UTF8)) {
+                                *flagp = flags & (RESTART_PASS1|NEED_UTF8);
                                 return NULL;
                             }
                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
@@ -10376,7 +10462,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         REGTAIL(pRExC_state, ret, lastbr);
                        if (flags&HASWIDTH)
                            *flagp |= HASWIDTH;
-                       c = *nextchar(pRExC_state);
+                        c = UCHARAT(RExC_parse);
+                        nextchar(pRExC_state);
                    }
                    else
                        lastbr = NULL;
@@ -10464,8 +10551,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
     /*     branch_len = (paren != 0); */
 
     if (br == NULL) {
-        if (flags & RESTART_UTF8) {
-            *flagp = RESTART_UTF8;
+        if (flags & (RESTART_PASS1|NEED_UTF8)) {
+            *flagp = flags & (RESTART_PASS1|NEED_UTF8);
             return NULL;
         }
         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
@@ -10511,8 +10598,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
         br = regbranch(pRExC_state, &flags, 0, depth+1);
 
        if (br == NULL) {
-            if (flags & RESTART_UTF8) {
-                *flagp = RESTART_UTF8;
+            if (flags & (RESTART_PASS1|NEED_UTF8)) {
+                *flagp = flags & (RESTART_PASS1|NEED_UTF8);
                 return NULL;
             }
             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
@@ -10643,12 +10730,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 
     /* Check for proper termination. */
     if (paren) {
-        /* restore original flags, but keep (?p) */
+        /* restore original flags, but keep (?p) and, if we've changed from /d
+         * rules to /u, keep the /u */
        RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
-       if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
+        if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
+            set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
+        }
+       if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
            RExC_parse = oregcomp_parse;
            vFAIL("Unmatched (");
        }
+       nextchar(pRExC_state);
     }
     else if (!paren && RExC_parse < RExC_end) {
        if (*RExC_parse == ')') {
@@ -10673,8 +10765,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
  *
  * Implements the concatenation operator.
  *
- * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
- * restarted.
+ * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
+ * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
  */
 STATIC regnode *
 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
@@ -10705,16 +10797,16 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
 
     *flagp = WORST;                    /* Tentatively. */
 
-    RExC_parse--;
-    nextchar(pRExC_state);
+    skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                            FALSE /* Don't force to /x */ );
     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
        flags &= ~TRYAGAIN;
         latest = regpiece(pRExC_state, &flags,depth+1);
        if (latest == NULL) {
            if (flags & TRYAGAIN)
                continue;
-            if (flags & RESTART_UTF8) {
-                *flagp = RESTART_UTF8;
+            if (flags & (RESTART_PASS1|NEED_UTF8)) {
+                *flagp = flags & (RESTART_PASS1|NEED_UTF8);
                 return NULL;
             }
             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
@@ -10756,8 +10848,8 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
  *
  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
  * TRYAGAIN.
- * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
- * restarted.
+ * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
+ * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
  */
 STATIC regnode *
 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
@@ -10773,6 +10865,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     char *parse_start;
 #endif
     const char *maxpos = NULL;
+    UV uv;
 
     /* Save the original in case we change the emitted regop to a FAIL. */
     regnode * const orig_emit = RExC_emit;
@@ -10785,8 +10878,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
     ret = regatom(pRExC_state, &flags,depth+1);
     if (ret == NULL) {
-       if (flags & (TRYAGAIN|RESTART_UTF8))
-           *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
+       if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8))
+           *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8);
         else
             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
        return(NULL);
@@ -10814,16 +10907,28 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
            if (!maxpos)
                maxpos = next;
            RExC_parse++;
-           min = grok_atou(RExC_parse, &endptr);
+            if (isDIGIT(*RExC_parse)) {
+                if (!grok_atoUV(RExC_parse, &uv, &endptr))
+                    vFAIL("Invalid quantifier in {,}");
+                if (uv >= REG_INFTY)
+                    vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
+                min = (I32)uv;
+            } else {
+                min = 0;
+            }
            if (*maxpos == ',')
                maxpos++;
            else
                maxpos = RExC_parse;
-           max = grok_atou(maxpos, &endptr);
-           if (!max && *maxpos != '0')
+            if (isDIGIT(*maxpos)) {
+                if (!grok_atoUV(maxpos, &uv, &endptr))
+                    vFAIL("Invalid quantifier in {,}");
+                if (uv >= REG_INFTY)
+                    vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
+                max = (I32)uv;
+            } else {
                max = REG_INFTY;                /* meaning "infinity" */
-           else if (max >= REG_INFTY)
-               vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
+            }
            RExC_parse = next;
            nextchar(pRExC_state);
             if (max < min) {    /* If can't match, warn and optimize to fail
@@ -10834,31 +10939,40 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      * enough space for all the things we are about to throw
                      * away, but we can shrink it by the ammount we are about
                      * to re-use here */
-                    RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
+                    RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
                 }
                 else {
                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
                     RExC_emit = orig_emit;
                 }
-                ret = reg_node(pRExC_state, OPFAIL);
+                ret = reganode(pRExC_state, OPFAIL, 0);
                 return ret;
             }
-            else if (min == max
-                     && RExC_parse < RExC_end
-                     && (*RExC_parse == '?' || *RExC_parse == '+'))
+            else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?')
             {
                 if (PASS2) {
                     ckWARN2reg(RExC_parse + 1,
                                "Useless use of greediness modifier '%c'",
                                *RExC_parse);
                 }
-                /* Absorb the modifier, so later code doesn't see nor use
-                    * it */
-                nextchar(pRExC_state);
             }
 
          do_curly:
            if ((flags&SIMPLE)) {
+                if (min == 0 && max == REG_INFTY) {
+                    reginsert(pRExC_state, STAR, ret, depth+1);
+                    ret->flags = 0;
+                    MARK_NAUGHTY(4);
+                    RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
+                    goto nest_check;
+                }
+                if (min == 1 && max == REG_INFTY) {
+                    reginsert(pRExC_state, PLUS, ret, depth+1);
+                    ret->flags = 0;
+                    MARK_NAUGHTY(3);
+                    RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
+                    goto nest_check;
+                }
                 MARK_NAUGHTY_EXP(2, 2);
                reginsert(pRExC_state, CURLY, ret, depth+1);
                 Set_Node_Offset(ret, parse_start+1); /* MJD */
@@ -10932,22 +11046,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
 
-    if (op == '*' && (flags&SIMPLE)) {
-       reginsert(pRExC_state, STAR, ret, depth+1);
-       ret->flags = 0;
-       MARK_NAUGHTY(4);
-        RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
-    }
-    else if (op == '*') {
+    if (op == '*') {
        min = 0;
        goto do_curly;
     }
-    else if (op == '+' && (flags&SIMPLE)) {
-       reginsert(pRExC_state, PLUS, ret, depth+1);
-       ret->flags = 0;
-       MARK_NAUGHTY(3);
-        RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
-    }
     else if (op == '+') {
        min = 1;
        goto do_curly;
@@ -10993,95 +11095,95 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     return(ret);
 }
 
-STATIC STRLEN
-S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
-                      UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
+STATIC bool
+S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
+                regnode ** node_p,
+                UV * code_point_p,
+                int * cp_count,
+                I32 * flagp,
+                const U32 depth
     )
 {
-
- /* This is expected to be called by a parser routine that has recognized '\N'
-   and needs to handle the rest. RExC_parse is expected to point at the first
-   char following the N at the time of the call.  On successful return,
-   RExC_parse has been updated to point to just after the sequence identified
-   by this routine, <*flagp> has been updated, and the non-NULL input pointers
-   have been set appropriately.
-
-   The typical case for this is \N{some character name}.  This is usually
-   called while parsing the input, filling in or ready to fill in an EXACTish
-   node, and the code point for the character should be returned, so that it
-   can be added to the node, and parsing continued with the next input
-   character.  But it may be that instead of a single character the \N{}
-   expands to more than one, a named sequence.  In this case any following
-   quantifier applies to the whole sequence, and it is easier, given the code
-   structure that calls this, to handle it from a different area of the code.
-   For this reason, the input parameters can be set so that it returns valid
-   only on one or the other of these cases.
-
-   Another possibility is for the input to be an empty \N{}, which for
-   backwards compatibility we accept, but generate a NOTHING node which should
-   later get optimized out.  This is handled from the area of code which can
-   handle a named sequence, so if called with the parameters for the other, it
-   fails.
-
-   Still another possibility is for the \N to mean [^\n], and not a single
-   character or explicit sequence at all.  This is determined by context.
-   Again, this is handled from the area of code which can handle a named
-   sequence, so if called with the parameters for the other, it also fails.
-
-   And the final possibility is for the \N to be called from within a bracketed
-   character class.  In this case the [^\n] meaning makes no sense, and so is
-   an error.  Other anomalous situations are left to the calling code to handle.
-
-   For non-single-quoted regexes, the tokenizer has attempted to decide which
-   of the above applies, and in the case of a named sequence, has converted it
-   into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
-   where c1... are the characters in the sequence.  For single-quoted regexes,
-   the tokenizer passes the \N sequence through unchanged; this code will not
-   attempt to determine this nor expand those, instead raising a syntax error.
-   The net effect is that if the beginning of the passed-in pattern isn't '{U+'
-   or there is no '}', it signals that this \N occurrence means to match a
-   non-newline. (This mostly was done because of [perl #56444].)
-
-   The API is somewhat convoluted due to historical and the above reasons.
-
-   The function raises an error (via vFAIL), and doesn't return for various
-   syntax errors.  For other failures, it returns (STRLEN) -1.  For successes,
-   it returns a count of how many characters were accounted for by it.  (This
-   can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
-   points in the sequence.  It sets <node_p>, <valuep>, and/or
-   <substitute_parse> on success.
-
-   If <valuep> is non-null, it means the caller can accept an input sequence
-   consisting of just a single code point; <*valuep> is set to the value of the
-   only or first code point in the input.
-
-   If <substitute_parse> is non-null, it means the caller can accept an input
-   sequence consisting of one or more code points; <*substitute_parse> is a
-   newly created mortal SV* in this case, containing \x{} escapes representing
-   those code points.
-
-   Both <valuep> and <substitute_parse> can be non-NULL.
-
-   If <node_p> is non-null, <substitute_parse> must be NULL.  This signifies
-   that the caller can accept any legal sequence other than a single code
-   point.  To wit, <*node_p> is set as follows:
-    1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
-    2) \N{}:              points to a new NOTHING node; return is 0
-    3) otherwise:         points to a new EXACT node containing the resolved
-                          string; return is the number of code points in the
-                          string.  This will never be 1.
-   Note that failure is returned for single code point sequences if <valuep> is
-   null and <node_p> is not.
- */
-
-    char * endbrace;    /* '}' following the name */
-    char* p;
+ /* This routine teases apart the various meanings of \N and returns
+  * accordingly.  The input parameters constrain which meaning(s) is/are valid
+  * in the current context.
+  *
+  * Exactly one of <node_p> and <code_point_p> must be non-NULL.
+  *
+  * If <code_point_p> is not NULL, the context is expecting the result to be a
+  * single code point.  If this \N instance turns out to a single code point,
+  * the function returns TRUE and sets *code_point_p to that code point.
+  *
+  * If <node_p> is not NULL, the context is expecting the result to be one of
+  * the things representable by a regnode.  If this \N instance turns out to be
+  * one such, the function generates the regnode, returns TRUE and sets *node_p
+  * to point to that regnode.
+  *
+  * If this instance of \N isn't legal in any context, this function will
+  * generate a fatal error and not return.
+  *
+  * On input, RExC_parse should point to the first char following the \N at the
+  * time of the call.  On successful return, RExC_parse will have been updated
+  * to point to just after the sequence identified by this routine.  Also
+  * *flagp has been updated as needed.
+  *
+  * When there is some problem with the current context and this \N instance,
+  * the function returns FALSE, without advancing RExC_parse, nor setting
+  * *node_p, nor *code_point_p, nor *flagp.
+  *
+  * If <cp_count> is not NULL, the caller wants to know the length (in code
+  * points) that this \N sequence matches.  This is set even if the function
+  * returns FALSE, as detailed below.
+  *
+  * There are 5 possibilities here, as detailed in the next 5 paragraphs.
+  *
+  * Probably the most common case is for the \N to specify a single code point.
+  * *cp_count will be set to 1, and *code_point_p will be set to that code
+  * point.
+  *
+  * Another possibility is for the input to be an empty \N{}, which for
+  * backwards compatibility we accept.  *cp_count will be set to 0. *node_p
+  * will be set to a generated NOTHING node.
+  *
+  * Still another possibility is for the \N to mean [^\n]. *cp_count will be
+  * set to 0. *node_p will be set to a generated REG_ANY node.
+  *
+  * The fourth possibility is that \N resolves to a sequence of more than one
+  * code points.  *cp_count will be set to the number of code points in the
+  * sequence. *node_p * will be set to a generated node returned by this
+  * function calling S_reg().
+  *
+  * The final possibility is that it is premature to be calling this function;
+  * that pass1 needs to be restarted.  This can happen when this changes from
+  * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
+  * latter occurs only when the fourth possibility would otherwise be in
+  * effect, and is because one of those code points requires the pattern to be
+  * recompiled as UTF-8.  The function returns FALSE, and sets the
+  * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate.  When this
+  * happens, the caller needs to desist from continuing parsing, and return
+  * this information to its caller.  This is not set for when there is only one
+  * code point, as this can be called as part of an ANYOF node, and they can
+  * store above-Latin1 code points without the pattern having to be in UTF-8.
+  *
+  * For non-single-quoted regexes, the tokenizer has resolved character and
+  * sequence names inside \N{...} into their Unicode values, normalizing the
+  * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
+  * hex-represented code points in the sequence.  This is done there because
+  * the names can vary based on what charnames pragma is in scope at the time,
+  * so we need a way to take a snapshot of what they resolve to at the time of
+  * the original parse. [perl #56444].
+  *
+  * That parsing is skipped for single-quoted regexes, so we may here get
+  * '\N{NAME}'.  This is a fatal error.  These names have to be resolved by the
+  * parser.  But if the single-quoted regex is something like '\N{U+41}', that
+  * is legal and handled here.  The code point is Unicode, and has to be
+  * translated into the native character set for non-ASCII platforms.
+  */
+
+    char * endbrace;    /* points to '}' following the name */
     char *endchar;     /* Points to '.' or '}' ending cur char in the input
                            stream */
-    bool has_multiple_chars; /* true if the input stream contains a sequence of
-                                more than one character */
-    bool in_char_class = substitute_parse != NULL;
-    STRLEN count = 0;   /* Number of characters in this sequence */
+    char* p = RExC_parse; /* Temporary */
 
     GET_RE_DEBUG_FLAGS_DECL;
 
@@ -11089,36 +11191,37 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
 
     GET_RE_DEBUG_FLAGS;
 
-    assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
-    assert(! (node_p && substitute_parse)); /* At most 1 should be set */
+    assert(cBOOL(node_p) ^ cBOOL(code_point_p));  /* Exactly one should be set */
+    assert(! (node_p && cp_count));               /* At most 1 should be set */
+
+    if (cp_count) {     /* Initialize return for the most common case */
+        *cp_count = 1;
+    }
 
     /* The [^\n] meaning of \N ignores spaces and comments under the /x
-     * modifier.  The other meaning does not, so use a temporary until we find
+     * modifier.  The other meanings do not, so use a temporary until we find
      * out which we are being called with */
-    p = (RExC_flags & RXf_PMf_EXTENDED)
-       ? regpatws(pRExC_state, RExC_parse,
-                                TRUE) /* means recognize comments */
-       : RExC_parse;
+    skip_to_be_ignored_text(pRExC_state, &p,
+                            FALSE /* Don't force to /x */ );
 
     /* Disambiguate between \N meaning a named character versus \N meaning
-     * [^\n].  The former is assumed when it can't be the latter. */
+     * [^\n].  The latter is assumed when the {...} following the \N is a legal
+     * quantifier, or there is no '{' at all */
     if (*p != '{' || regcurly(p)) {
        RExC_parse = p;
+        if (cp_count) {
+            *cp_count = -1;
+        }
+
        if (! node_p) {
-           /* no bare \N allowed in a charclass */
-            if (in_char_class) {
-                vFAIL("\\N in a character class must be a named character: \\N{...}");
-            }
-            return (STRLEN) -1;
+            return FALSE;
         }
-        RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
-                           current char */
-       nextchar(pRExC_state);
+
        *node_p = reg_node(pRExC_state, REG_ANY);
        *flagp |= HASWIDTH|SIMPLE;
        MARK_NAUGHTY(1);
         Set_Node_Length(*node_p, 1); /* MJD */
-       return 1;
+       return TRUE;
     }
 
     /* Here, we have decided it should be a named character or sequence */
@@ -11143,17 +11246,20 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
        vFAIL("\\N{NAME} must be resolved by the lexer");
     }
 
-    RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
+    REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
+                                        semantics */
 
     if (endbrace == RExC_parse) {   /* empty: \N{} */
-       if (node_p) {
-           *node_p = reg_node(pRExC_state,NOTHING);
-       }
-        else if (! in_char_class) {
-            return (STRLEN) -1;
+        if (cp_count) {
+            *cp_count = 0;
         }
         nextchar(pRExC_state);
-        return 0;
+       if (! node_p) {
+            return FALSE;
+        }
+
+        *node_p = reg_node(pRExC_state,NOTHING);
+        return TRUE;
     }
 
     RExC_parse += 2;   /* Skip past the 'U+' */
@@ -11162,116 +11268,128 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
 
     /* Code points are separated by dots.  If none, there is only one code
      * point, and is terminated by the brace */
-    has_multiple_chars = (endchar < endbrace);
 
-    /* We get the first code point if we want it, and either there is only one,
-     * or we can accept both cases of one and there is more than one */
-    if (valuep && (substitute_parse || ! has_multiple_chars)) {
-       STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
-       I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
+    if (endchar >= endbrace) {
+       STRLEN length_of_hex;
+       I32 grok_hex_flags;
+
+        /* Here, exactly one code point.  If that isn't what is wanted, fail */
+        if (! code_point_p) {
+            RExC_parse = p;
+            return FALSE;
+        }
+
+        /* Convert code point from hex */
+       length_of_hex = (STRLEN)(endchar - RExC_parse);
+       grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
                            | PERL_SCAN_DISALLOW_PREFIX
 
                              /* No errors in the first pass (See [perl
                               * #122671].)  We let the code below find the
                               * errors when there are multiple chars. */
-                           | ((SIZE_ONLY || has_multiple_chars)
+                           | ((SIZE_ONLY)
                               ? PERL_SCAN_SILENT_ILLDIGIT
                               : 0);
 
-       *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
+        /* This routine is the one place where both single- and double-quotish
+         * \N{U+xxxx} are evaluated.  The value is a Unicode code point which
+         * must be converted to native. */
+       *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
+                                         &length_of_hex,
+                                         &grok_hex_flags,
+                                         NULL));
 
        /* The tokenizer should have guaranteed validity, but it's possible to
          * bypass it by using single quoting, so check.  Don't do the check
          * here when there are multiple chars; we do it below anyway. */
-        if (! has_multiple_chars) {
-            if (length_of_hex == 0
-                || length_of_hex != (STRLEN)(endchar - RExC_parse) )
-            {
-                RExC_parse += length_of_hex;   /* Includes all the valid */
-                RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
-                                ? UTF8SKIP(RExC_parse)
-                                : 1;
-                /* Guard against malformed utf8 */
-                if (RExC_parse >= endchar) {
-                    RExC_parse = endchar;
-                }
-                vFAIL("Invalid hexadecimal number in \\N{U+...}");
+        if (length_of_hex == 0
+            || length_of_hex != (STRLEN)(endchar - RExC_parse) )
+        {
+            RExC_parse += length_of_hex;       /* Includes all the valid */
+            RExC_parse += (RExC_orig_utf8)     /* point to after 1st invalid */
+                            ? UTF8SKIP(RExC_parse)
+                            : 1;
+            /* Guard against malformed utf8 */
+            if (RExC_parse >= endchar) {
+                RExC_parse = endchar;
             }
-
-            RExC_parse = endbrace + 1;
-            return 1;
+            vFAIL("Invalid hexadecimal number in \\N{U+...}");
         }
-    }
 
-    /* Here, we should have already handled the case where a single character
-     * is expected and found.  So it is a failure if we aren't expecting
-     * multiple chars and got them; or didn't get them but wanted them.  We
-     * fail without advancing the parse, so that the caller can try again with
-     * different acceptance criteria */
-    if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
-        RExC_parse = p;
-        return (STRLEN) -1;
+        RExC_parse = endbrace + 1;
+        return TRUE;
     }
-
-    {
-       /* What is done here is to convert this to a sub-pattern of the form
-        * \x{char1}\x{char2}...
-         * and then either return it in <*substitute_parse> if non-null; or
-         * call reg recursively to parse it (enclosing in "(?: ... )" ).  That
-         * way, it retains its atomicness, while not having to worry about
-         * special handling that some code points may have.  toke.c has
-         * converted the original Unicode values to native, so that we can just
-         * pass on the hex values unchanged.  We do have to set a flag to keep
-         * recoding from happening in the recursion */
-
-       SV * dummy = NULL;
+    else {  /* Is a multiple character sequence */
+       SV * substitute_parse;
        STRLEN len;
        char *orig_end = RExC_end;
         I32 flags;
 
-        if (substitute_parse) {
-            *substitute_parse = newSVpvs("");
+        /* Count the code points, if desired, in the sequence */
+        if (cp_count) {
+            *cp_count = 0;
+            while (RExC_parse < endbrace) {
+                /* Point to the beginning of the next character in the sequence. */
+                RExC_parse = endchar + 1;
+                endchar = RExC_parse + strcspn(RExC_parse, ".}");
+                (*cp_count)++;
+            }
         }
-        else {
-            substitute_parse = &dummy;
-            *substitute_parse = newSVpvs("?:");
+
+        /* Fail if caller doesn't want to handle a multi-code-point sequence.
+         * But don't backup up the pointer if the caller want to know how many
+         * code points there are (they can then handle things) */
+        if (! node_p) {
+            if (! cp_count) {
+                RExC_parse = p;
+            }
+            return FALSE;
         }
-        *substitute_parse = sv_2mortal(*substitute_parse);
+
+       /* What is done here is to convert this to a sub-pattern of the form
+         * \x{char1}\x{char2}...  and then call reg recursively to parse it
+         * (enclosing in "(?: ... )" ).  That way, it retains its atomicness,
+         * while not having to worry about special handling that some code
+         * points may have. */
+
+       substitute_parse = newSVpvs("?:");
 
        while (RExC_parse < endbrace) {
 
            /* Convert to notation the rest of the code understands */
-           sv_catpv(*substitute_parse, "\\x{");
-           sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
-           sv_catpv(*substitute_parse, "}");
+           sv_catpv(substitute_parse, "\\x{");
+           sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
+           sv_catpv(substitute_parse, "}");
 
            /* Point to the beginning of the next character in the sequence. */
            RExC_parse = endchar + 1;
            endchar = RExC_parse + strcspn(RExC_parse, ".}");
 
-            count++;
        }
-        if (! in_char_class) {
-            sv_catpv(*substitute_parse, ")");
-        }
+        sv_catpv(substitute_parse, ")");
 
-       RExC_parse = SvPV(*substitute_parse, len);
+       RExC_parse = SvPV(substitute_parse, len);
 
        /* Don't allow empty number */
-       if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
+       if (len < (STRLEN) 8) {
             RExC_parse = endbrace;
            vFAIL("Invalid hexadecimal number in \\N{U+...}");
        }
        RExC_end = RExC_parse + len;
 
-       /* The values are Unicode, and therefore not subject to recoding */
+        /* The values are Unicode, and therefore not subject to recoding, but
+         * have to be converted to native on a non-Unicode (meaning non-ASCII)
+         * platform. */
        RExC_override_recoding = 1;
+#ifdef EBCDIC
+        RExC_recode_x_to_native = 1;
+#endif
 
         if (node_p) {
             if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
-                if (flags & RESTART_UTF8) {
-                    *flagp = RESTART_UTF8;
-                    return (STRLEN) -1;
+                if (flags & (RESTART_PASS1|NEED_UTF8)) {
+                    *flagp = flags & (RESTART_PASS1|NEED_UTF8);
+                    return FALSE;
                 }
                 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
                     (UV) flags);
@@ -11279,14 +11397,19 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
             *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
         }
 
+        /* Restore the saved values */
        RExC_parse = endbrace;
        RExC_end = orig_end;
        RExC_override_recoding = 0;
+#ifdef EBCDIC
+        RExC_recode_x_to_native = 0;
+#endif
 
+        SvREFCNT_dec_NN(substitute_parse);
         nextchar(pRExC_state);
-    }
 
-    return count;
+        return TRUE;
+    }
 }
 
 
@@ -11301,10 +11424,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
  */
 STATIC UV
-S_reg_recode(pTHX_ const char value, SV **encp)
+S_reg_recode(pTHX_ const U8 value, SV **encp)
 {
     STRLEN numlen = 1;
-    SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
+    SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP);
     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
     const STRLEN newlen = SvCUR(sv);
     UV uv = UNICODE_REPLACEMENT;
@@ -11452,8 +11575,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
             *character = (U8) code_point;
             len = 1;
         } /* Else is folded non-UTF8 */
+#if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
+   || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
+                                      || UNICODE_DOT_DOT_VERSION > 0)
         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
-
+#else
+        else if (1) {
+#endif
             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
              * comments at join_exact()); */
             *character = (U8) code_point;
@@ -11496,10 +11624,14 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
 
     /* A single character node is SIMPLE, except for the special-cased SHARP S
      * under /di. */
-    if ((len == 1 || (UTF && len == UNISKIP(code_point)))
-        && (code_point != LATIN_SMALL_LETTER_SHARP_S
-            || ! FOLD || ! DEPENDS_SEMANTICS))
-    {
+    if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
+#if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
+   || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
+                                      || UNICODE_DOT_DOT_VERSION > 0)
+        && ( code_point != LATIN_SMALL_LETTER_SHARP_S
+            || ! FOLD || ! DEPENDS_SEMANTICS)
+#endif
+    ) {
         *flagp |= SIMPLE;
     }
 
@@ -11517,10 +11649,10 @@ static I32
 S_backref_value(char *p)
 {
     const char* endptr;
-    UV val = grok_atou(p, &endptr);
-    if (endptr == p || endptr == NULL || val > I32_MAX)
-        return I32_MAX;
-    return (I32)val;
+    UV val;
+    if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
+        return (I32)val;
+    return I32_MAX;
 }
 
 
@@ -11585,8 +11717,8 @@ S_backref_value(char *p)
 
    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
    TRYAGAIN.
-   Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
-   restarted.
+   Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
+   restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
    Otherwise does not return NULL.
 */
 
@@ -11595,7 +11727,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 {
     regnode *ret = NULL;
     I32 flags = 0;
-    char *parse_start = RExC_parse;
+    char *parse_start;
     U8 op;
     int invert = 0;
     U8 arg;
@@ -11609,6 +11741,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     PERL_ARGS_ASSERT_REGATOM;
 
   tryagain:
+    parse_start = RExC_parse;
     switch ((U8)*RExC_parse) {
     case '^':
        RExC_seen_zerolen++;
@@ -11647,17 +11780,18 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                        TRUE, /* allow multi-char folds */
                        FALSE, /* don't silence non-portable warnings. */
                        (bool) RExC_strict,
+                       TRUE, /* Allow an optimized regnode result */
                        NULL);
-       if (*RExC_parse != ']') {
-           RExC_parse = oregcomp_parse;
-           vFAIL("Unmatched [");
-       }
         if (ret == NULL) {
-            if (*flagp & RESTART_UTF8)
+            if (*flagp & (RESTART_PASS1|NEED_UTF8))
                 return NULL;
             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
                   (UV) *flagp);
         }
+       if (*RExC_parse != ']') {
+           RExC_parse = oregcomp_parse;
+           vFAIL("Unmatched [");
+       }
        nextchar(pRExC_state);
         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
        break;
@@ -11674,8 +11808,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                    }
                    goto tryagain;
                }
-                if (flags & RESTART_UTF8) {
-                    *flagp = RESTART_UTF8;
+                if (flags & (RESTART_PASS1|NEED_UTF8)) {
+                    *flagp = flags & (RESTART_PASS1|NEED_UTF8);
                     return NULL;
                 }
                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
@@ -11749,13 +11883,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
            RExC_seen_zerolen++;                /* Do not optimize RE away */
            goto finish_meta_pat;
        case 'C':
-           ret = reg_node(pRExC_state, CANY);
-            RExC_seen |= REG_CANY_SEEN;
-           *flagp |= HASWIDTH|SIMPLE;
-            if (PASS2) {
-                ckWARNdep(RExC_parse+1, "\\C is deprecated");
-            }
-           goto finish_meta_pat;
+           vFAIL("\\C no longer supported");
        case 'X':
            ret = reg_node(pRExC_state, CLUMP);
            *flagp |= HASWIDTH;
@@ -11772,27 +11900,102 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             invert = 1;
             /* FALLTHROUGH */
        case 'b':
+          {
+           regex_charset charset = get_regex_charset(RExC_flags);
+
            RExC_seen_zerolen++;
             RExC_seen |= REG_LOOKBEHIND_SEEN;
-           op = BOUND + get_regex_charset(RExC_flags);
-            if (op > BOUNDA) {  /* /aa is same as /a */
-                op = BOUNDA;
-            }
-            else if (op == BOUNDL) {
-                RExC_contains_locale = 1;
-            }
+           op = BOUND + charset;
 
-            if (invert) {
-                op += NBOUND - BOUND;
+            if (op == BOUNDL) {
+                RExC_contains_locale = 1;
             }
 
            ret = reg_node(pRExC_state, op);
            *flagp |= SIMPLE;
-           if ((U8) *(RExC_parse + 1) == '{') {
-                /* diag_listed_as: Use "%s" instead of "%s" */
-               vFAIL3("Use \"\\%c\\{\" instead of \"\\%c{\"", *RExC_parse, *RExC_parse);
+           if (*(RExC_parse + 1) != '{') {
+                FLAGS(ret) = TRADITIONAL_BOUND;
+                if (PASS2 && op > BOUNDA) {  /* /aa is same as /a */
+                    OP(ret) = BOUNDA;
+                }
+            }
+            else {
+                STRLEN length;
+                char name = *RExC_parse;
+                char * endbrace;
+                RExC_parse += 2;
+                endbrace = strchr(RExC_parse, '}');
+
+                if (! endbrace) {
+                    vFAIL2("Missing right brace on \\%c{}", name);
+                }
+                /* XXX Need to decide whether to take spaces or not.  Should be
+                 * consistent with \p{}, but that currently is SPACE, which
+                 * means vertical too, which seems wrong
+                 * while (isBLANK(*RExC_parse)) {
+                    RExC_parse++;
+                }*/
+                if (endbrace == RExC_parse) {
+                    RExC_parse++;  /* After the '}' */
+                    vFAIL2("Empty \\%c{}", name);
+                }
+                length = endbrace - RExC_parse;
+                /*while (isBLANK(*(RExC_parse + length - 1))) {
+                    length--;
+                }*/
+                switch (*RExC_parse) {
+                    case 'g':
+                        if (length != 1
+                            && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
+                        {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = GCB_BOUND;
+                        break;
+                    case 's':
+                        if (length != 2 || *(RExC_parse + 1) != 'b') {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = SB_BOUND;
+                        break;
+                    case 'w':
+                        if (length != 2 || *(RExC_parse + 1) != 'b') {
+                            goto bad_bound_type;
+                        }
+                        FLAGS(ret) = WB_BOUND;
+                        break;
+                    default:
+                      bad_bound_type:
+                        RExC_parse = endbrace;
+                       vFAIL2utf8f(
+                            "'%"UTF8f"' is an unknown bound type",
+                           UTF8fARG(UTF, length, endbrace - length));
+                        NOT_REACHED; /*NOTREACHED*/
+                }
+                RExC_parse = endbrace;
+                REQUIRE_UNI_RULES(flagp, NULL);
+
+                if (PASS2 && op >= BOUNDA) {  /* /aa is same as /a */
+                    OP(ret) = BOUNDU;
+                    length += 4;
+
+                    /* Don't have to worry about UTF-8, in this message because
+                     * to get here the contents of the \b must be ASCII */
+                    ckWARN4reg(RExC_parse + 1,  /* Include the '}' in msg */
+                              "Using /u for '%.*s' instead of /%s",
+                              (unsigned) length,
+                              endbrace - length + 1,
+                              (charset == REGEX_ASCII_RESTRICTED_CHARSET)
+                              ? ASCII_RESTRICT_PAT_MODS
+                              : ASCII_MORE_RESTRICT_PAT_MODS);
+                }
            }
+
+            if (PASS2 && invert) {
+                OP(ret) += NBOUND - BOUND;
+            }
            goto finish_meta_pat;
+          }
 
        case 'D':
             invert = 1;
@@ -11866,55 +12069,63 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
            break;
        case 'p':
        case 'P':
-           {
-#ifdef DEBUGGING
-               char* parse_start = RExC_parse - 2;
-#endif
+            RExC_parse--;
+
+            ret = regclass(pRExC_state, flagp,depth+1,
+                           TRUE, /* means just parse this element */
+                           FALSE, /* don't allow multi-char folds */
+                           FALSE, /* don't silence non-portable warnings.  It
+                                     would be a bug if these returned
+                                     non-portables */
+                           (bool) RExC_strict,
+                           TRUE, /* Allow an optimized regnode result */
+                           NULL);
+            if (*flagp & RESTART_PASS1)
+                return NULL;
+            /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
+             * multi-char folds are allowed.  */
+            if (!ret)
+                FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
+                      (UV) *flagp);
 
-               RExC_parse--;
-
-                ret = regclass(pRExC_state, flagp,depth+1,
-                               TRUE, /* means just parse this element */
-                               FALSE, /* don't allow multi-char folds */
-                               FALSE, /* don't silence non-portable warnings.
-                                         It would be a bug if these returned
-                                         non-portables */
-                               (bool) RExC_strict,
-                               NULL);
-                /* regclass() can only return RESTART_UTF8 if multi-char folds
-                   are allowed.  */
-                if (!ret)
-                    FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
-                          (UV) *flagp);
-
-               RExC_parse--;
-
-               Set_Node_Offset(ret, parse_start + 2);
-                Set_Node_Cur_Length(ret, parse_start);
-               nextchar(pRExC_state);
-           }
+            RExC_parse--;
+
+            Set_Node_Offset(ret, parse_start);
+            Set_Node_Cur_Length(ret, parse_start - 2);
+            nextchar(pRExC_state);
            break;
         case 'N':
-            /* Handle \N and \N{NAME} with multiple code points here and not
-             * below because it can be multicharacter. join_exact() will join
-             * them up later on.  Also this makes sure that things like
-             * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
-             * The options to the grok function call causes it to fail if the
-             * sequence is just a single code point.  We then go treat it as
-             * just another character in the current EXACT node, and hence it
-             * gets uniform treatment with all the other characters.  The
-             * special treatment for quantifiers is not needed for such single
-             * character sequences */
+            /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
+             * \N{...} evaluates to a sequence of more than one code points).
+             * The function call below returns a regnode, which is our result.
+             * The parameters cause it to fail if the \N{} evaluates to a
+             * single code point; we handle those like any other literal.  The
+             * reason that the multicharacter case is handled here and not as
+             * part of the EXACtish code is because of quantifiers.  In
+             * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
+             * this way makes that Just Happen. dmq.
+             * join_exact() will join this up with adjacent EXACTish nodes
+             * later on, if appropriate. */
             ++RExC_parse;
-            if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
-                                             depth, FALSE))
-            {
-                if (*flagp & RESTART_UTF8)
-                    return NULL;
-                RExC_parse--;
-                goto defchar;
+            if (grok_bslash_N(pRExC_state,
+                              &ret,     /* Want a regnode returned */
+                              NULL,     /* Fail if evaluates to a single code
+                                           point */
+                              NULL,     /* Don't need a count of how many code
+                                           points */
+                              flagp,
+                              depth)
+            ) {
+                break;
             }
-            break;
+
+            if (*flagp & RESTART_PASS1)
+                return NULL;
+
+            /* Here, evaluates to a single code point.  Go get that */
+            RExC_parse = parse_start;
+            goto defchar;
+
        case 'k':    /* Handle \k<NAME> and \k'NAME' */
       parse_named_seq:
         {
@@ -12028,6 +12239,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          * octal character escape, e.g. \35 or \777.
                          * The above logic should make it obvious why using
                          * octal escapes in patterns is problematic. - Yves */
+                        RExC_parse = parse_start;
                         goto defchar;
                     }
                 }
@@ -12037,41 +12249,36 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  * as an octal escape. It may or may not be a valid backref
                  * escape. For instance \88888888 is unlikely to be a valid
                  * backref. */
-               {
-#ifdef RE_TRACK_PATTERN_OFFSETS
-                   char * const parse_start = RExC_parse - 1; /* MJD */
-#endif
-                   while (isDIGIT(*RExC_parse))
-                       RExC_parse++;
-                    if (hasbrace) {
-                        if (*RExC_parse != '}')
-                            vFAIL("Unterminated \\g{...} pattern");
-                        RExC_parse++;
-                    }
-                   if (!SIZE_ONLY) {
-                       if (num > (I32)RExC_rx->nparens)
-                           vFAIL("Reference to nonexistent group");
-                   }
-                   RExC_sawback = 1;
-                   ret = reganode(pRExC_state,
-                                  ((! FOLD)
-                                    ? REF
-                                    : (ASCII_FOLD_RESTRICTED)
-                                      ? REFFA
-                                       : (AT_LEAST_UNI_SEMANTICS)
-                                         ? REFFU
-                                         : (LOC)
-                                           ? REFFL
-                                           : REFF),
-                                   num);
-                   *flagp |= HASWIDTH;
+                while (isDIGIT(*RExC_parse))
+                    RExC_parse++;
+                if (hasbrace) {
+                    if (*RExC_parse != '}')
+                        vFAIL("Unterminated \\g{...} pattern");
+                    RExC_parse++;
+                }
+                if (!SIZE_ONLY) {
+                    if (num > (I32)RExC_rx->nparens)
+                        vFAIL("Reference to nonexistent group");
+                }
+                RExC_sawback = 1;
+                ret = reganode(pRExC_state,
+                               ((! FOLD)
+                                 ? REF
+                                 : (ASCII_FOLD_RESTRICTED)
+                                   ? REFFA
+                                   : (AT_LEAST_UNI_SEMANTICS)
+                                     ? REFFU
+                                     : (LOC)
+                                       ? REFFL
+                                       : REFF),
+                                num);
+                *flagp |= HASWIDTH;
 
-                    /* override incorrect value set in reganode MJD */
-                    Set_Node_Offset(ret, parse_start+1);
-                    Set_Node_Cur_Length(ret, parse_start);
-                   RExC_parse--;
-                   nextchar(pRExC_state);
-               }
+                /* override incorrect value set in reganode MJD */
+                Set_Node_Offset(ret, parse_start);
+                Set_Node_Cur_Length(ret, parse_start-1);
+                skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                        FALSE /* Don't force to /x */ );
            }
            break;
        case '\0':
@@ -12081,26 +12288,34 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
        default:
            /* Do not generate "unrecognized" warnings here, we fall
               back into the quick-grab loop below */
-           parse_start--;
+            RExC_parse = parse_start;
            goto defchar;
-       }
+       } /* end of switch on a \foo sequence */
        break;
 
     case '#':
-       if (RExC_flags & RXf_PMf_EXTENDED) {
+
+        /* '#' comments should have been spaced over before this function was
+         * called */
+        assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
+       /*
+        if (RExC_flags & RXf_PMf_EXTENDED) {
            RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
            if (RExC_parse < RExC_end)
                goto tryagain;
        }
+        */
+
        /* FALLTHROUGH */
 
     default:
+         defchar: {
 
-            parse_start = RExC_parse - 1;
-
-           RExC_parse++;
+            /* Here, we have determined that the next thing is probably a
+             * literal character.  RExC_parse points to the first byte of its
+             * definition.  (It still may be an escape sequence that evaluates
+             * to a single character) */
 
-         defchar: {
            STRLEN len = 0;
            UV ender = 0;
            char *p;
@@ -12118,7 +12333,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              * string's UTF8ness.  The reason to do this is that EXACTF is not
              * trie-able, EXACTFU is.
              *
-             * Similarly, we can convert EXACTFL nodes to EXACTFU if they
+             * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
              * contain only above-Latin1 characters (hence must be in UTF8),
              * which don't participate in folds with Latin1-range characters,
              * as the latter's folds aren't known until runtime.  (We don't
@@ -12141,8 +12356,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
          reparse:
 
-            /* We do the EXACTFish to EXACT node only if folding.  (And we
-             * don't need to figure this out until pass 2) */
+            /* We look for the EXACTFish to EXACT node optimizaton only if
+             * folding.  (And we don't need to figure this out until pass 2) */
             maybe_exact = FOLD && PASS2;
 
            /* XXX The node can hold up to 255 bytes, yet this only goes to
@@ -12163,15 +12378,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              * could back off to end with only a code point that isn't such a
              * non-final, but it is possible for there not to be any in the
              * entire node. */
-           for (p = RExC_parse - 1;
+
+            assert(   ! UTF     /* Is at the beginning of a character */
+                   || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
+                   || UTF8_IS_START(UCHARAT(RExC_parse)));
+
+           for (p = RExC_parse;
                 len < upper_parse && p < RExC_end;
                 len++)
            {
                oldp = p;
 
-               if (RExC_flags & RXf_PMf_EXTENDED)
-                    p = regpatws(pRExC_state, p,
-                                          TRUE); /* means recognize comments */
+                /* White space has already been ignored */
+                assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
+                       || ! is_PATWS_safe((p), RExC_end, UTF));
+
                switch ((U8)*p) {
                case '^':
                case '$':
@@ -12223,24 +12444,32 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                        p++;
                        break;
                    case 'N': /* Handle a single-code point named character. */
-                        /* The options cause it to fail if a multiple code
-                         * point sequence.  Handle those in the switch() above
-                         * */
                         RExC_parse = p + 1;
-                        if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
-                                                         &ender,
-                                                         flagp,
-                                                         depth,
-                                                         FALSE
-                        )) {
-                            if (*flagp & RESTART_UTF8)
-                                FAIL("panic: grok_bslash_N set RESTART_UTF8");
+                        if (! grok_bslash_N(pRExC_state,
+                                            NULL,   /* Fail if evaluates to
+                                                       anything other than a
+                                                       single code point */
+                                            &ender, /* The returned single code
+                                                       point */
+                                            NULL,   /* Don't need a count of
+                                                       how many code points */
+                                            flagp,
+                                            depth)
+                        ) {
+                            if (*flagp & NEED_UTF8)
+                                FAIL("panic: grok_bslash_N set NEED_UTF8");
+                            if (*flagp & RESTART_PASS1)
+                                return NULL;
+
+                            /* Here, it wasn't a single code point.  Go close
+                             * up this EXACTish node.  The switch() prior to
+                             * this switch handles the other cases */
                             RExC_parse = p = oldp;
                             goto loopdone;
                         }
                         p = RExC_parse;
                         if (ender > 0xff) {
-                            REQUIRE_UTF8;
+                            REQUIRE_UTF8(flagp);
                         }
                         break;
                    case 'r':
@@ -12287,7 +12516,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                goto recode_encoding;
                            }
                            if (ender > 0xff) {
-                               REQUIRE_UTF8;
+                               REQUIRE_UTF8(flagp);
                            }
                            break;
                        }
@@ -12313,11 +12542,19 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                            }
                             ender = result;
 
-                           if (IN_ENCODING && ender < 0x100) {
-                               goto recode_encoding;
+                            if (ender < 0x100) {
+#ifdef EBCDIC
+                                if (RExC_recode_x_to_native) {
+                                    ender = LATIN1_TO_NATIVE(ender);
+                                }
+                                else
+#endif
+                                if (IN_ENCODING) {
+                                    goto recode_encoding;
+                                }
                            }
-                           if (ender > 0xff) {
-                               REQUIRE_UTF8;
+                            else {
+                               REQUIRE_UTF8(flagp);
                            }
                            break;
                        }
@@ -12362,7 +12599,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                            STRLEN numlen = 3;
                            ender = grok_oct(p, &numlen, &flags, NULL);
                            if (ender > 0xff) {
-                               REQUIRE_UTF8;
+                               REQUIRE_UTF8(flagp);
                            }
                            p += numlen;
                             if (PASS2   /* like \08, \178 */
@@ -12381,10 +12618,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      recode_encoding:
                        if (! RExC_override_recoding) {
                            SV* enc = _get_encoding();
-                           ender = reg_recode((const char)(U8)ender, &enc);
+                           ender = reg_recode((U8)ender, &enc);
                            if (!enc && PASS2)
                                ckWARNreg(p, "Invalid escape in the specified encoding");
-                           REQUIRE_UTF8;
+                           REQUIRE_UTF8(flagp);
                        }
                        break;
                    case '\0':
@@ -12393,7 +12630,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                        /* FALLTHROUGH */
                    default:
                        if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
-                           /* Include any { following the alpha to emphasize
+                           /* Include any left brace following the alpha to emphasize
                             * that it could be part of an escape at some point
                             * in the future */
                            int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
@@ -12405,7 +12642,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                case '{':
                    /* Currently we don't warn when the lbrace is at the start
                     * of a construct.  This catches it in the middle of a
-                    * literal string, or when its the first thing after
+                    * literal string, or when it's the first thing after
                     * something like "\b" */
                    if (! SIZE_ONLY
                        && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
@@ -12415,7 +12652,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                    /*FALLTHROUGH*/
                default:    /* A literal character */
                  normal_default:
-                   if (UTF8_IS_START(*p) && UTF) {
+                   if (! UTF8_IS_INVARIANT(*p) && UTF) {
                        STRLEN numlen;
                        ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
                                               &numlen, UTF8_ALLOW_DEFAULT);
@@ -12427,12 +12664,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                } /* End of switch on the literal */
 
                /* Here, have looked at the literal character and <ender>
-                * contains its ordinal, <p> points to the character after it
-                */
-
-               if ( RExC_flags & RXf_PMf_EXTENDED)
-                    p = regpatws(pRExC_state, p,
-                                          TRUE); /* means recognize comments */
+                 * contains its ordinal, <p> points to the character after it.
+                 * We need to check if the next non-ignored thing is a
+                 * quantifier.  Move <p> to after anything that should be
+                 * ignored, which, as a side effect, positions <p> for the next
+                 * loop iteration */
+                skip_to_be_ignored_text(pRExC_state, &p,
+                                        FALSE /* Don't force to /x */ );
 
                 /* If the next thing is a quantifier, it applies to this
                  * character only, which means that this character has to be in
@@ -12441,12 +12679,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  * the node, close the node with just them, and set up to do
                  * this character again next time through, when it will be the
                  * only thing in its new node */
-                if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
+                if ((next_is_quantifier = (   LIKELY(p < RExC_end)
+                                           && UNLIKELY(ISMULT2(p))))
+                    && LIKELY(len))
                {
                     p = oldp;
                     goto loopdone;
                 }
 
+                /* Ready to add 'ender' to the node */
+
                 if (! FOLD) {  /* The simple case, just append the literal */
 
                     /* In the sizing pass, we need only the size of the
@@ -12454,7 +12696,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      * its representation until PASS2. */
                     if (SIZE_ONLY) {
                         if (UTF) {
-                            const STRLEN unilen = UNISKIP(ender);
+                            const STRLEN unilen = UVCHR_SKIP(ender);
                             s += unilen;
 
                             /* We have to subtract 1 just below (and again in
@@ -12504,12 +12746,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     goto not_fold_common;
                 }
                 else /* A regular FOLD code point */
-                    if (! ( UTF
-                        /* See comments for join_exact() as to why we fold this
-                         * non-UTF at compile time */
-                        || (node_type == EXACTFU
-                            && ender == LATIN_SMALL_LETTER_SHARP_S)))
-                {
+                    if (! (   UTF
+#if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
+   || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
+                                      || UNICODE_DOT_DOT_VERSION > 0)
+                            /* See comments for join_exact() as to why we fold
+                             * this non-UTF at compile time */
+                            || (   node_type == EXACTFU
+                                && ender == LATIN_SMALL_LETTER_SHARP_S)
+#endif
+                )) {
                     /* Here, are folding and are not UTF-8 encoded; therefore
                      * the character must be in the range 0-255, and is not /l
                      * (Not /l because we already handled these under /l in
@@ -12520,13 +12766,20 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         /* See if the character's fold differs between /d and
                          * /u.  This includes the multi-char fold SHARP S to
                          * 'ss' */
-                        if (maybe_exactfu
+                        if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+                            RExC_seen_unfolded_sharp_s = 1;
+                            maybe_exactfu = FALSE;
+                        }
+                        else if (maybe_exactfu
                             && (PL_fold[ender] != PL_fold_latin1[ender]
-                                || ender == LATIN_SMALL_LETTER_SHARP_S
-                                || (len > 0
-                                   && isALPHA_FOLD_EQ(ender, 's')
-                                   && isALPHA_FOLD_EQ(*(s-1), 's'))))
-                        {
+#if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
+   || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
+                                      || UNICODE_DOT_DOT_VERSION > 0)
+                                || (   len > 0
+                                    && isALPHA_FOLD_EQ(ender, 's')
+                                    && isALPHA_FOLD_EQ(*(s-1), 's'))
+#endif
+                        )) {
                             maybe_exactfu = FALSE;
                         }
                     }
@@ -12535,7 +12788,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      * we have an array that finds its fold quickly */
                     *(s++) = (char) ender;
                 }
-                else {  /* FOLD and UTF */
+                else {  /* FOLD, and UTF (or sharp s) */
                     /* Unlike the non-fold case, we do actually have to
                      * calculate the results here in pass 1.  This is for two
                      * reasons, the folded length may be longer than the
@@ -12665,7 +12918,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             }
                         }
                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-                            if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
+                            if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
                                                                   *s, *(s+1))))
                             {
                                 break;
@@ -12798,7 +13051,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
            RExC_parse = p - 1;
             Set_Node_Cur_Length(ret, parse_start);
-           nextchar(pRExC_state);
+           RExC_parse = p;
+            skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                    FALSE /* Don't force to /x */ );
            {
                /* len is STRLEN which is unsigned, need to copy to signed */
                IV iv = len;
@@ -12813,29 +13068,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     return(ret);
 }
 
-STATIC char *
-S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
-{
-    /* Returns the next non-pattern-white space, non-comment character (the
-     * latter only if 'recognize_comment is true) in the string p, which is
-     * ended by RExC_end.  See also reg_skipcomment */
-    const char *e = RExC_end;
-
-    PERL_ARGS_ASSERT_REGPATWS;
-
-    while (p < e) {
-        STRLEN len;
-       if ((len = is_PATWS_safe(p, e, UTF))) {
-           p += len;
-        }
-       else if (recognize_comment && *p == '#') {
-            p = reg_skipcomment(pRExC_state, p);
-       }
-       else
-           break;
-    }
-    return p;
-}
 
 STATIC void
 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
@@ -12977,7 +13209,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
                            break;
                        case 'e':
                            if (memEQ(posixcc, "spac", 4)) /* space */
-                               namedclass = ANYOF_PSXSPC;
+                               namedclass = ANYOF_SPACE;
                            break;
                        case 'h':
                            if (memEQ(posixcc, "grap", 4)) /* graph */
@@ -13111,6 +13343,34 @@ S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
             && first_char == *(p - 1));
 }
 
+STATIC unsigned  int
+S_regex_set_precedence(const U8 my_operator) {
+
+    /* Returns the precedence in the (?[...]) construct of the input operator,
+     * specified by its character representation.  The precedence follows
+     * general Perl rules, but it extends this so that ')' and ']' have (low)
+     * precedence even though they aren't really operators */
+
+    switch (my_operator) {
+        case '!':
+            return 5;
+        case '&':
+            return 4;
+        case '^':
+        case '|':
+        case '+':
+        case '-':
+            return 3;
+        case ')':
+            return 2;
+        case ']':
+            return 1;
+    }
+
+    NOT_REACHED; /* NOTREACHED */
+    return 0;   /* Silence compiler warning */
+}
+
 STATIC regnode *
 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                     I32 *flagp, U32 depth,
@@ -13118,24 +13378,38 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
 {
     /* Handle the (?[...]) construct to do set operations */
 
-    U8 curchar;
-    UV start, end;     /* End points of code point ranges */
-    SV* result_string;
-    char *save_end, *save_parse;
-    SV* final;
-    STRLEN len;
-    regnode* node;
-    AV* stack;
-    const bool save_fold = FOLD;
+    U8 curchar;                     /* Current character being parsed */
+    UV start, end;                 /* End points of code point ranges */
+    SV* final = NULL;               /* The end result inversion list */
+    SV* result_string;              /* 'final' stringified */
+    AV* stack;                      /* stack of operators and operands not yet
+                                       resolved */
+    AV* fence_stack = NULL;         /* A stack containing the positions in
+                                       'stack' of where the undealt-with left
+                                       parens would be if they were actually
+                                       put there */
+    IV fence = 0;                   /* Position of where most recent undealt-
+                                       with left paren in stack is; -1 if none.
+                                     */
+    STRLEN len;                     /* Temporary */
+    regnode* node;                  /* Temporary, and final regnode returned by
+                                       this function */
+    const bool save_fold = FOLD;    /* Temporary */
+    char *save_end, *save_parse;    /* Temporaries */
+    const bool in_locale = LOC;     /* we turn off /l during processing */
 
     GET_RE_DEBUG_FLAGS_DECL;
 
     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
 
-    if (LOC) {
-        vFAIL("(?[...]) not valid in locale");
+    if (in_locale) {
+        set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
     }
-    RExC_uni_semantics = 1;
+
+    REQUIRE_UNI_RULES(flagp, NULL);   /* The use of this operator implies /u.
+                                         This is required so that the compile
+                                         time values are valid in all runtime
+                                         cases */
 
     /* This will return only an ANYOF regnode, or (unlikely) something smaller
      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
@@ -13144,22 +13418,15 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
      * upon an unescaped ']' that isn't one ending a regclass.  To do both
      * these things, we need to realize that something preceded by a backslash
      * is escaped, so we have to keep track of backslashes */
-    if (PASS2) {
-        Perl_ck_warner_d(aTHX_
-            packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
-            "The regex_sets feature is experimental" REPORT_LOCATION,
-                UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
-                UTF8fARG(UTF,
-                         RExC_end - RExC_start - (RExC_parse - RExC_precomp),
-                         RExC_precomp + (RExC_parse - RExC_precomp)));
-    }
-    else {
+    if (SIZE_ONLY) {
         UV depth = 0; /* how many nested (?[...]) constructs */
 
         while (RExC_parse < RExC_end) {
             SV* current = NULL;
-            RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                          TRUE); /* means recognize comments */
+
+            skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                    TRUE /* Force /x */ );
+
             switch (*RExC_parse) {
                 case '?':
                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
@@ -13174,6 +13441,10 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                      * default: case next time and keep on incrementing until
                      * we find one of the invariants we do handle. */
                     RExC_parse++;
+                    if (*RExC_parse == 'c') {
+                            /* Skip the \cX notation for control characters */
+                            RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+                    }
                     break;
                 case '[':
                 {
@@ -13188,8 +13459,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                         RExC_parse++;
                     }
 
-                    /* regclass() can only return RESTART_UTF8 if multi-char
-                       folds are allowed.  */
+                    /* regclass() can only return RESTART_PASS1 and NEED_UTF8
+                     * if multi-char folds are allowed.  */
                     if (!regclass(pRExC_state, flagp,depth+1,
                                   is_posix_class, /* parse the whole char
                                                      class only if not a
@@ -13197,10 +13468,11 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                                   FALSE, /* don't allow multi-char folds */
                                   TRUE, /* silence non-portable warnings. */
                                   TRUE, /* strict */
+                                  FALSE, /* Require return to be an ANYOF */
                                   &current
                                  ))
-                        FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
-                              (UV) *flagp);
+                        FAIL2("panic: regclass returned NULL to handle_sets, "
+                              "flags=%#"UVxf"", (UV) *flagp);
 
                     /* function call leaves parse pointing to the ']', except
                      * if we faked it */
@@ -13223,80 +13495,140 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                         nextchar(pRExC_state);
                         Set_Node_Length(node,
                                 RExC_parse - oregcomp_parse + 1); /* MJD */
+                        if (in_locale) {
+                            set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
+                        }
+
                         return node;
                     }
                     goto no_close;
             }
-            RExC_parse++;
+
+            RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
         }
 
       no_close:
         FAIL("Syntax error in (?[...])");
     }
 
-    /* Pass 2 only after this.  Everything in this construct is a
-     * metacharacter.  Operands begin with either a '\' (for an escape
-     * sequence), or a '[' for a bracketed character class.  Any other
-     * character should be an operator, or parenthesis for grouping.  Both
-     * types of operands are handled by calling regclass() to parse them.  It
-     * is called with a parameter to indicate to return the computed inversion
-     * list.  The parsing here is implemented via a stack.  Each entry on the
-     * stack is a single character representing one of the operators, or the
-     * '('; or else a pointer to an operand inversion list. */
-
-#define IS_OPERAND(a)  (! SvIOK(a))
-
-    /* The stack starts empty.  It is a syntax error if the first thing parsed
-     * is a binary operator; everything else is pushed on the stack.  When an
-     * operand is parsed, the top of the stack is examined.  If it is a binary
-     * operator, the item before it should be an operand, and both are replaced
-     * by the result of doing that operation on the new operand and the one on
-     * the stack.   Thus a sequence of binary operands is reduced to a single
-     * one before the next one is parsed.
+    /* Pass 2 only after this. */
+    Perl_ck_warner_d(aTHX_
+        packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
+        "The regex_sets feature is experimental" REPORT_LOCATION,
+            UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
+            UTF8fARG(UTF,
+                     RExC_end - RExC_start - (RExC_parse - RExC_precomp),
+                     RExC_precomp + (RExC_parse - RExC_precomp)));
+
+    /* Everything in this construct is a metacharacter.  Operands begin with
+     * either a '\' (for an escape sequence), or a '[' for a bracketed
+     * character class.  Any other character should be an operator, or
+     * parenthesis for grouping.  Both types of operands are handled by calling
+     * regclass() to parse them.  It is called with a parameter to indicate to
+     * return the computed inversion list.  The parsing here is implemented via
+     * a stack.  Each entry on the stack is a single character representing one
+     * of the operators; or else a pointer to an operand inversion list. */
+
+#define IS_OPERATOR(a) SvIOK(a)
+#define IS_OPERAND(a)  (! IS_OPERATOR(a))
+
+    /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
+     * to luke-a-shave-itch (or -itz), but people who didn't want to bother
+     * with pronouncing it called it Reverse Polish instead, but now that YOU
+     * know how to pronounce it you can use the correct term, thus giving due
+     * credit to the person who invented it, and impressing your geek friends.
+     * Wikipedia says that the pronounciation of "Ł" has been changing so that
+     * it is now more like an English initial W (as in wonk) than an L.)
+     *
+     * This means that, for example, 'a | b & c' is stored on the stack as
+     *
+     * c  [4]
+     * b  [3]
+     * &  [2]
+     * a  [1]
+     * |  [0]
      *
-     * A unary operator may immediately follow a binary in the input, for
-     * example
+     * where the numbers in brackets give the stack [array] element number.
+     * In this implementation, parentheses are not stored on the stack.
+     * Instead a '(' creates a "fence" so that the part of the stack below the
+     * fence is invisible except to the corresponding ')' (this allows us to
+     * replace testing for parens, by using instead subtraction of the fence
+     * position).  As new operands are processed they are pushed onto the stack
+     * (except as noted in the next paragraph).  New operators of higher
+     * precedence than the current final one are inserted on the stack before
+     * the lhs operand (so that when the rhs is pushed next, everything will be
+     * in the correct positions shown above.  When an operator of equal or
+     * lower precedence is encountered in parsing, all the stacked operations
+     * of equal or higher precedence are evaluated, leaving the result as the
+     * top entry on the stack.  This makes higher precedence operations
+     * evaluate before lower precedence ones, and causes operations of equal
+     * precedence to left associate.
+     *
+     * The only unary operator '!' is immediately pushed onto the stack when
+     * encountered.  When an operand is encountered, if the top of the stack is
+     * a '!", the complement is immediately performed, and the '!' popped.  The
+     * resulting value is treated as a new operand, and the logic in the
+     * previous paragraph is executed.  Thus in the expression
      *      [a] + ! [b]
-     * When an operand is parsed and the top of the stack is a unary operator,
-     * the operation is performed, and then the stack is rechecked to see if
-     * this new operand is part of a binary operation; if so, it is handled as
-     * above.
+     * the stack looks like
+     *
+     * !
+     * a
+     * +
+     *
+     * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
+     * becomes
+     *
+     * !b
+     * a
+     * +
+     *
+     * A ')' is treated as an operator with lower precedence than all the
+     * aforementioned ones, which causes all operations on the stack above the
+     * corresponding '(' to be evaluated down to a single resultant operand.
+     * Then the fence for the '(' is removed, and the operand goes through the
+     * algorithm above, without the fence.
      *
-     * A '(' is simply pushed on the stack; it is valid only if the stack is
-     * empty, or the top element of the stack is an operator or another '('
-     * (for which the parenthesized expression will become an operand).  By the
-     * time the corresponding ')' is parsed everything in between should have
-     * been parsed and evaluated to a single operand (or else is a syntax
-     * error), and is handled as a regular operand */
+     * A separate stack is kept of the fence positions, so that the position of
+     * the latest so-far unbalanced '(' is at the top of it.
+     *
+     * The ']' ending the construct is treated as the lowest operator of all,
+     * so that everything gets evaluated down to a single operand, which is the
+     * result */
 
     sv_2mortal((SV *)(stack = newAV()));
+    sv_2mortal((SV *)(fence_stack = newAV()));
 
     while (RExC_parse < RExC_end) {
-        I32 top_index = av_tindex(stack);
-        SV** top_ptr;
-        SV* current = NULL;
+        I32 top_index;              /* Index of top-most element in 'stack' */
+        SV** top_ptr;               /* Pointer to top 'stack' element */
+        SV* current = NULL;         /* To contain the current inversion list
+                                       operand */
+        SV* only_to_avoid_leaks;
 
-        /* Skip white space */
-        RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                         TRUE /* means recognize comments */ );
+        skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                TRUE /* Force /x */ );
         if (RExC_parse >= RExC_end) {
             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
         }
-        if ((curchar = UCHARAT(RExC_parse)) == ']') {
-            break;
-        }
+
+        curchar = UCHARAT(RExC_parse);
+
+redo_curchar:
+
+        top_index = av_tindex(stack);
 
         switch (curchar) {
+            SV** stacked_ptr;       /* Ptr to something already on 'stack' */
+            char stacked_operator;  /* The topmost operator on the 'stack'. */
+            SV* lhs;                /* Operand to the left of the operator */
+            SV* rhs;                /* Operand to the right of the operator */
+            SV* fence_ptr;          /* Pointer to top element of the fence
+                                       stack */
+
+            case '(':
 
-            case '?':
-                if (av_tindex(stack) >= 0   /* This makes sure that we can
-                                               safely subtract 1 from
-                                               RExC_parse in the next clause.
-                                               If we have something on the
-                                               stack, we have parsed something
-                                             */
-                    && UCHARAT(RExC_parse - 1) == '('
-                    && RExC_parse < RExC_end)
+                if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
                 {
                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
                      * This happens when we have some thing like
@@ -13311,14 +13643,18 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                      * interpolated expression evaluates to.  We use the flags
                      * from the interpolated pattern. */
                     U32 save_flags = RExC_flags;
-                    const char * const save_parse = ++RExC_parse;
+                    const char * save_parse;
 
+                    RExC_parse += 2;        /* Skip past the '(?' */
+                    save_parse = RExC_parse;
+
+                    /* Parse any flags for the '(?' */
                     parse_lparen_question_flags(pRExC_state);
 
                     if (RExC_parse == save_parse  /* Makes sure there was at
-                                                     least one flag (or this
-                                                     embedding wasn't compiled)
-                                                   */
+                                                     least one flag (or else
+                                                     this embedding wasn't
+                                                     compiled) */
                         || RExC_parse >= RExC_end - 4
                         || UCHARAT(RExC_parse) != ':'
                         || UCHARAT(++RExC_parse) != '('
@@ -13338,38 +13674,67 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                         }
                         vFAIL("Expecting '(?flags:(?[...'");
                     }
+
+                    /* Recurse, with the meat of the embedded expression */
                     RExC_parse++;
                     (void) handle_regex_sets(pRExC_state, &current, flagp,
                                                     depth+1, oregcomp_parse);
 
                     /* Here, 'current' contains the embedded expression's
                      * inversion list, and RExC_parse points to the trailing
-                     * ']'; the next character should be the ')' which will be
-                     * paired with the '(' that has been put on the stack, so
-                     * the whole embedded expression reduces to '(operand)' */
+                     * ']'; the next character should be the ')' */
+                    RExC_parse++;
+                    assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
+
+                    /* Then the ')' matching the original '(' handled by this
+                     * case: statement */
                     RExC_parse++;
+                    assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
 
+                    RExC_parse++;
                     RExC_flags = save_flags;
                     goto handle_operand;
                 }
-                /* FALLTHROUGH */
 
-            default:
-                RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
-                vFAIL("Unexpected character");
+                /* A regular '('.  Look behind for illegal syntax */
+                if (top_index - fence >= 0) {
+                    /* If the top entry on the stack is an operator, it had
+                     * better be a '!', otherwise the entry below the top
+                     * operand should be an operator */
+                    if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
+                        || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
+                        || (   IS_OPERAND(*top_ptr)
+                            && (   top_index - fence < 1
+                                || ! (stacked_ptr = av_fetch(stack,
+                                                             top_index - 1,
+                                                             FALSE))
+                                || ! IS_OPERATOR(*stacked_ptr))))
+                    {
+                        RExC_parse++;
+                        vFAIL("Unexpected '(' with no preceding operator");
+                    }
+                }
+
+                /* Stack the position of this undealt-with left paren */
+                fence = top_index + 1;
+                av_push(fence_stack, newSViv(fence));
+                break;
 
             case '\\':
-                /* regclass() can only return RESTART_UTF8 if multi-char
-                   folds are allowed.  */
+                /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
+                 * multi-char folds are allowed.  */
                 if (!regclass(pRExC_state, flagp,depth+1,
                               TRUE, /* means parse just the next thing */
                               FALSE, /* don't allow multi-char folds */
                               FALSE, /* don't silence non-portable warnings.  */
                               TRUE,  /* strict */
-                              &current
-                             ))
-                    FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
-                          (UV) *flagp);
+                              FALSE, /* Require return to be an ANYOF */
+                              &current))
+                {
+                    FAIL2("panic: regclass returned NULL to handle_sets, "
+                          "flags=%#"UVxf"", (UV) *flagp);
+                }
+
                 /* regclass() will return with parsing just the \ sequence,
                  * leaving the parse pointer at the next thing to parse */
                 RExC_parse--;
@@ -13383,18 +13748,22 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                     RExC_parse++;
                 }
 
-                /* regclass() can only return RESTART_UTF8 if multi-char
-                   folds are allowed.  */
+                /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
+                 * multi-char folds are allowed.  */
                 if(!regclass(pRExC_state, flagp,depth+1,
                              is_posix_class, /* parse the whole char class
                                                 only if not a posix class */
                              FALSE, /* don't allow multi-char folds */
                              FALSE, /* don't silence non-portable warnings.  */
                              TRUE,   /* strict */
+                             FALSE, /* Require return to be an ANYOF */
                              &current
                             ))
-                    FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
-                          (UV) *flagp);
+                {
+                    FAIL2("panic: regclass returned NULL to handle_sets, "
+                          "flags=%#"UVxf"", (UV) *flagp);
+                }
+
                 /* function call leaves parse pointing to the ']', except if we
                  * faked it */
                 if (is_posix_class) {
@@ -13404,154 +13773,274 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                 goto handle_operand;
             }
 
+            case ']':
+                if (top_index >= 1) {
+                    goto join_operators;
+                }
+
+                /* Only a single operand on the stack: are done */
+                goto done;
+
+            case ')':
+                if (av_tindex(fence_stack) < 0) {
+                    RExC_parse++;
+                    vFAIL("Unexpected ')'");
+                }
+
+                 /* If at least two thing on the stack, treat this as an
+                  * operator */
+                if (top_index - fence >= 1) {
+                    goto join_operators;
+                }
+
+                /* Here only a single thing on the fenced stack, and there is a
+                 * fence.  Get rid of it */
+                fence_ptr = av_pop(fence_stack);
+                assert(fence_ptr);
+                fence = SvIV(fence_ptr) - 1;
+                SvREFCNT_dec_NN(fence_ptr);
+                fence_ptr = NULL;
+
+                if (fence < 0) {
+                    fence = 0;
+                }
+
+                /* Having gotten rid of the fence, we pop the operand at the
+                 * stack top and process it as a newly encountered operand */
+                current = av_pop(stack);
+                if (IS_OPERAND(current)) {
+                    goto handle_operand;
+                }
+
+                RExC_parse++;
+                goto bad_syntax;
+
             case '&':
             case '|':
             case '+':
             case '-':
             case '^':
-                if (top_index < 0
+
+                /* These binary operators should have a left operand already
+                 * parsed */
+                if (   top_index - fence < 0
+                    || top_index - fence == 1
                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
                     || ! IS_OPERAND(*top_ptr))
                 {
-                    RExC_parse++;
-                    vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
+                    goto unexpected_binary;
                 }
-                av_push(stack, newSVuv(curchar));
-                break;
 
-            case '!':
-                av_push(stack, newSVuv(curchar));
-                break;
+                /* If only the one operand is on the part of the stack visible
+                 * to us, we just place this operator in the proper position */
+                if (top_index - fence < 2) {
 
-            case '(':
-                if (top_index >= 0) {
-                    top_ptr = av_fetch(stack, top_index, FALSE);
-                    assert(top_ptr);
-                    if (IS_OPERAND(*top_ptr)) {
-                        RExC_parse++;
-                        vFAIL("Unexpected '(' with no preceding operator");
-                    }
+                    /* Place the operator before the operand */
+
+                    SV* lhs = av_pop(stack);
+                    av_push(stack, newSVuv(curchar));
+                    av_push(stack, lhs);
+                    break;
                 }
-                av_push(stack, newSVuv(curchar));
-                break;
 
-            case ')':
-            {
-                SV* lparen;
-                if (top_index < 1
-                    || ! (current = av_pop(stack))
-                    || ! IS_OPERAND(current)
-                    || ! (lparen = av_pop(stack))
-                    || IS_OPERAND(lparen)
-                    || SvUV(lparen) != '(')
+                /* But if there is something else on the stack, we need to
+                 * process it before this new operator if and only if the
+                 * stacked operation has equal or higher precedence than the
+                 * new one */
+
+             join_operators:
+
+                /* The operator on the stack is supposed to be below both its
+                 * operands */
+                if (   ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
+                    || IS_OPERAND(*stacked_ptr))
                 {
-                    SvREFCNT_dec(current);
+                    /* But if not, it's legal and indicates we are completely
+                     * done if and only if we're currently processing a ']',
+                     * which should be the final thing in the expression */
+                    if (curchar == ']') {
+                        goto done;
+                    }
+
+                  unexpected_binary:
                     RExC_parse++;
-                    vFAIL("Unexpected ')'");
+                    vFAIL2("Unexpected binary operator '%c' with no "
+                           "preceding operand", curchar);
                 }
-                top_index -= 2;
-                SvREFCNT_dec_NN(lparen);
+                stacked_operator = (char) SvUV(*stacked_ptr);
 
-                /* FALLTHROUGH */
-            }
+                if (regex_set_precedence(curchar)
+                    > regex_set_precedence(stacked_operator))
+                {
+                    /* Here, the new operator has higher precedence than the
+                     * stacked one.  This means we need to add the new one to
+                     * the stack to await its rhs operand (and maybe more
+                     * stuff).  We put it before the lhs operand, leaving
+                     * untouched the stacked operator and everything below it
+                     * */
+                    lhs = av_pop(stack);
+                    assert(IS_OPERAND(lhs));
 
-              handle_operand:
+                    av_push(stack, newSVuv(curchar));
+                    av_push(stack, lhs);
+                    break;
+                }
 
-                /* Here, we have an operand to process, in 'current' */
+                /* Here, the new operator has equal or lower precedence than
+                 * what's already there.  This means the operation already
+                 * there should be performed now, before the new one. */
 
-                if (top_index < 0) {    /* Just push if stack is empty */
-                    av_push(stack, current);
+                rhs = av_pop(stack);
+                if (! IS_OPERAND(rhs)) {
+
+                    /* This can happen when a ! is not followed by an operand,
+                     * like in /(?[\t &!])/ */
+                    goto bad_syntax;
                 }
-                else {
-                    SV* top = av_pop(stack);
-                    SV *prev = NULL;
-                    char current_operator;
-
-                    if (IS_OPERAND(top)) {
-                        SvREFCNT_dec_NN(top);
-                        SvREFCNT_dec_NN(current);
-                        vFAIL("Operand with no preceding operator");
+
+                lhs = av_pop(stack);
+
+                if (! IS_OPERAND(lhs)) {
+
+                    /* This can happen when there is an empty (), like in
+                     * /(?[[0]+()+])/ */
+                    goto bad_syntax;
+                }
+
+                switch (stacked_operator) {
+                    case '&':
+                        _invlist_intersection(lhs, rhs, &rhs);
+                        break;
+
+                    case '|':
+                    case '+':
+                        _invlist_union(lhs, rhs, &rhs);
+                        break;
+
+                    case '-':
+                        _invlist_subtract(lhs, rhs, &rhs);
+                        break;
+
+                    case '^':   /* The union minus the intersection */
+                    {
+                        SV* i = NULL;
+                        SV* u = NULL;
+                        SV* element;
+
+                        _invlist_union(lhs, rhs, &u);
+                        _invlist_intersection(lhs, rhs, &i);
+                        /* _invlist_subtract will overwrite rhs
+                            without freeing what it already contains */
+                        element = rhs;
+                        _invlist_subtract(u, i, &rhs);
+                        SvREFCNT_dec_NN(i);
+                        SvREFCNT_dec_NN(u);
+                        SvREFCNT_dec_NN(element);
+                        break;
                     }
-                    current_operator = (char) SvUV(top);
-                    switch (current_operator) {
-                        case '(':   /* Push the '(' back on followed by the new
-                                       operand */
-                            av_push(stack, top);
-                            av_push(stack, current);
-                            SvREFCNT_inc(top);  /* Counters the '_dec' done
-                                                   just after the 'break', so
-                                                   it doesn't get wrongly freed
-                                                 */
-                            break;
+                }
+                SvREFCNT_dec(lhs);
+
+                /* Here, the higher precedence operation has been done, and the
+                 * result is in 'rhs'.  We overwrite the stacked operator with
+                 * the result.  Then we redo this code to either push the new
+                 * operator onto the stack or perform any higher precedence
+                 * stacked operation */
+                only_to_avoid_leaks = av_pop(stack);
+                SvREFCNT_dec(only_to_avoid_leaks);
+                av_push(stack, rhs);
+                goto redo_curchar;
+
+            case '!':   /* Highest priority, right associative */
+
+                /* If what's already at the top of the stack is another '!",
+                 * they just cancel each other out */
+                if (   (top_ptr = av_fetch(stack, top_index, FALSE))
+                    && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
+                {
+                    only_to_avoid_leaks = av_pop(stack);
+                    SvREFCNT_dec(only_to_avoid_leaks);
+                }
+                else { /* Otherwise, since it's right associative, just push
+                          onto the stack */
+                    av_push(stack, newSVuv(curchar));
+                }
+                break;
 
-                        case '!':
-                            _invlist_invert(current);
-
-                            /* Unlike binary operators, the top of the stack,
-                             * now that this unary one has been popped off, may
-                             * legally be an operator, and we now have operand
-                             * for it. */
-                            top_index--;
-                            SvREFCNT_dec_NN(top);
-                            goto handle_operand;
-
-                        case '&':
-                            prev = av_pop(stack);
-                            _invlist_intersection(prev,
-                                                   current,
-                                                   &current);
-                            av_push(stack, current);
-                            break;
+            default:
+                RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+                vFAIL("Unexpected character");
 
-                        case '|':
-                        case '+':
-                            prev = av_pop(stack);
-                            _invlist_union(prev, current, &current);
-                            av_push(stack, current);
-                            break;
+          handle_operand:
+
+            /* Here 'current' is the operand.  If something is already on the
+             * stack, we have to check if it is a !. */
+            top_index = av_tindex(stack);   /* Code above may have altered the
+                                             * stack in the time since we
+                                             * earlier set 'top_index'. */
+            if (top_index - fence >= 0) {
+                /* If the top entry on the stack is an operator, it had better
+                 * be a '!', otherwise the entry below the top operand should
+                 * be an operator */
+                top_ptr = av_fetch(stack, top_index, FALSE);
+                assert(top_ptr);
+                if (IS_OPERATOR(*top_ptr)) {
+
+                    /* The only permissible operator at the top of the stack is
+                     * '!', which is applied immediately to this operand. */
+                    curchar = (char) SvUV(*top_ptr);
+                    if (curchar != '!') {
+                        SvREFCNT_dec(current);
+                        vFAIL2("Unexpected binary operator '%c' with no "
+                                "preceding operand", curchar);
+                    }
 
-                        case '-':
-                            prev = av_pop(stack);;
-                            _invlist_subtract(prev, current, &current);
-                            av_push(stack, current);
-                            break;
+                    _invlist_invert(current);
 
-                        case '^':   /* The union minus the intersection */
-                        {
-                            SV* i = NULL;
-                            SV* u = NULL;
-                            SV* element;
-
-                            prev = av_pop(stack);
-                            _invlist_union(prev, current, &u);
-                            _invlist_intersection(prev, current, &i);
-                            /* _invlist_subtract will overwrite current
-                                without freeing what it already contains */
-                            element = current;
-                            _invlist_subtract(u, i, &current);
-                            av_push(stack, current);
-                            SvREFCNT_dec_NN(i);
-                            SvREFCNT_dec_NN(u);
-                            SvREFCNT_dec_NN(element);
-                            break;
-                        }
+                    only_to_avoid_leaks = av_pop(stack);
+                    SvREFCNT_dec(only_to_avoid_leaks);
+                    top_index = av_tindex(stack);
 
-                        default:
-                            Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
+                    /* And we redo with the inverted operand.  This allows
+                     * handling multiple ! in a row */
+                    goto handle_operand;
+                }
+                          /* Single operand is ok only for the non-binary ')'
+                           * operator */
+                else if ((top_index - fence == 0 && curchar != ')')
+                         || (top_index - fence > 0
+                             && (! (stacked_ptr = av_fetch(stack,
+                                                           top_index - 1,
+                                                           FALSE))
+                                 || IS_OPERAND(*stacked_ptr))))
+                {
+                    SvREFCNT_dec(current);
+                    vFAIL("Operand with no preceding operator");
                 }
-                SvREFCNT_dec_NN(top);
-                SvREFCNT_dec(prev);
             }
-        }
+
+            /* Here there was nothing on the stack or the top element was
+             * another operand.  Just add this new one */
+            av_push(stack, current);
+
+        } /* End of switch on next parse token */
 
         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+    } /* End of loop parsing through the construct */
+
+  done:
+    if (av_tindex(fence_stack) >= 0) {
+        vFAIL("Unmatched (");
     }
 
     if (av_tindex(stack) < 0   /* Was empty */
         || ((final = av_pop(stack)) == NULL)
         || ! IS_OPERAND(final)
+        || SvTYPE(final) != SVt_INVLIST
         || av_tindex(stack) >= 0)  /* More left on stack */
     {
+      bad_syntax:
+        SvREFCNT_dec(final);
         vFAIL("Incomplete expression within '(?[ ])'");
     }
 
@@ -13576,6 +14065,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
         }
     }
 
+    /* About to generate an ANYOF (or similar) node from the inversion list we
+     * have calculated */
     save_parse = RExC_parse;
     RExC_parse = SvPV(result_string, len);
     save_end = RExC_end;
@@ -13585,8 +14076,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
      * already has all folding taken into consideration, and we don't want
      * regclass() to add to that */
     RExC_flags &= ~RXf_PMf_FOLD;
-    /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
-     */
+    /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char
+     * folds are allowed.  */
     node = regclass(pRExC_state, flagp,depth+1,
                     FALSE, /* means parse the whole char class */
                     FALSE, /* don't allow multi-char folds */
@@ -13594,14 +14085,42 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                              well have generated non-portable code points, but
                              they're valid on this machine */
                     FALSE, /* similarly, no need for strict */
+                    FALSE, /* Require return to be an ANYOF */
                     NULL
                 );
     if (!node)
         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
                     PTR2UV(flagp));
+
+    /* Fix up the node type if we are in locale.  (We have pretended we are
+     * under /u for the purposes of regclass(), as this construct will only
+     * work under UTF-8 locales.  But now we change the opcode to be ANYOFL (so
+     * as to cause any warnings about bad locales to be output in regexec.c),
+     * and add the flag that indicates to check if not in a UTF-8 locale.  The
+     * reason we above forbid optimization into something other than an ANYOF
+     * node is simply to minimize the number of code changes in regexec.c.
+     * Otherwise we would have to create new EXACTish node types and deal with
+     * them.  This decision could be revisited should this construct become
+     * popular.
+     *
+     * (One might think we could look at the resulting ANYOF node and suppress
+     * the flag if everything is above 255, as those would be UTF-8 only,
+     * but this isn't true, as the components that led to that result could
+     * have been locale-affected, and just happen to cancel each other out
+     * under UTF-8 locales.) */
+    if (in_locale) {
+        set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
+
+        assert(OP(node) == ANYOF);
+
+        OP(node) = ANYOFL;
+        ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8;
+    }
+
     if (save_fold) {
         RExC_flags |= RXf_PMf_FOLD;
     }
+
     RExC_parse = save_parse + 1;
     RExC_end = save_end;
     SvREFCNT_dec_NN(final);
@@ -13611,6 +14130,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
     return node;
 }
+#undef IS_OPERATOR
 #undef IS_OPERAND
 
 STATIC void
@@ -13657,9 +14177,30 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl
           *invlist = add_cp_to_invlist(*invlist,
                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
             break;
+
+#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
+
         case LATIN_SMALL_LETTER_SHARP_S:
           *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
             break;
+
+#endif
+
+#if    UNICODE_MAJOR_VERSION < 3                                        \
+   || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
+
+        /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
+         * U+0131.  */
+        case 'i':
+        case 'I':
+          *invlist =
+             add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+#   if UNICODE_DOT_DOT_VERSION == 1
+          *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
+#   endif
+            break;
+#endif
+
         default:
             /* Use deprecated warning to increase the chances of this being
              * output */
@@ -13723,6 +14264,23 @@ S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN c
 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
                                         (SvCUR(listsv) != initial_listsv_len)
 
+/* There is a restricted set of white space characters that are legal when
+ * ignoring white space in a bracketed character class.  This generates the
+ * code to skip them.
+ *
+ * There is a line below that uses the same white space criteria but is outside
+ * this macro.  Both here and there must use the same definition */
+#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
+    STMT_START {                                                        \
+        if (do_skip) {                                                  \
+            while (   p < RExC_end                                      \
+                   && isBLANK_A(UCHARAT(p)))                            \
+            {                                                           \
+                p++;                                                    \
+            }                                                           \
+        }                                                               \
+    } STMT_END
+
 STATIC regnode *
 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  const bool stop_at_1,  /* Just parse the next thing, don't
@@ -13732,6 +14290,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                                        about too large
                                                        characters */
                  const bool strict,
+                 bool optimizable,                  /* ? Allow a non-ANYOF return
+                                                       node */
                  SV** ret_invlist  /* Return an inversion list, not a node */
           )
 {
@@ -13756,8 +14316,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
      * determinable at compile time
      *
-     * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
-     * to be restarted.  This can only happen if ret_invlist is non-NULL.
+     * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs
+     * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded
+     * to UTF-8.  This can only happen if ret_invlist is non-NULL.
      */
 
     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
@@ -13852,11 +14413,19 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
     DEBUG_PARSE("clas");
 
+#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
+    || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
+                                   && UNICODE_DOT_DOT_VERSION == 0)
+    allow_multi_folds = FALSE;
+#endif
+
     /* Assume we are going to generate an ANYOF node. */
     ret = reganode(pRExC_state,
                    (LOC)
                     ? ANYOFL
-                    : ANYOF,
+                    : (DEPENDS_SEMANTICS)
+                      ? ANYOFD
+                      : ANYOF,
                    0);
 
     if (SIZE_ONLY) {
@@ -13872,20 +14441,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
     }
 
-    if (skip_white) {
-        RExC_parse = regpatws(pRExC_state, RExC_parse,
-                              FALSE /* means don't recognize comments */ );
-    }
+    SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
 
     if (UCHARAT(RExC_parse) == '^') {  /* Complement of range. */
        RExC_parse++;
         invert = TRUE;
         allow_multi_folds = FALSE;
         MARK_NAUGHTY(1);
-        if (skip_white) {
-            RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                  FALSE /* means don't recognize comments */ );
-        }
+        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
     }
 
     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
@@ -13922,10 +14485,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             break;
         }
 
-        if (skip_white) {
-            RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                  FALSE /* means don't recognize comments */ );
-        }
+        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
 
         if  (UCHARAT(RExC_parse) == ']') {
             break;
@@ -13978,7 +14538,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              * skipped, it means that that white space is wanted literally, and
              * is already in 'value'.  Otherwise, need to translate the escape
              * into what it signifies. */
-            if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
+            if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
 
            case 'w':   namedclass = ANYOF_WORDCHAR;    break;
            case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
@@ -13992,14 +14552,26 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
            case 'H':   namedclass = ANYOF_NHORIZWS;    break;
             case 'N':  /* Handle \N{NAME} in class */
                 {
-                    SV *as_text;
-                    STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
-                                                    flagp, depth, &as_text);
-                    if (*flagp & RESTART_UTF8)
-                        FAIL("panic: grok_bslash_N set RESTART_UTF8");
-                    if (cp_count != 1) {    /* The typical case drops through */
-                        assert(cp_count != (STRLEN) -1);
-                        if (cp_count == 0) {
+                    const char * const backslash_N_beg = RExC_parse - 2;
+                    int cp_count;
+
+                    if (! grok_bslash_N(pRExC_state,
+                                        NULL,      /* No regnode */
+                                        &value,    /* Yes single value */
+                                        &cp_count, /* Multiple code pt count */
+                                        flagp,
+                                        depth)
+                    ) {
+
+                        if (*flagp & NEED_UTF8)
+                            FAIL("panic: grok_bslash_N set NEED_UTF8");
+                        if (*flagp & RESTART_PASS1)
+                            return NULL;
+
+                        if (cp_count < 0) {
+                            vFAIL("\\N in a character class must be a named character: \\N{...}");
+                        }
+                        else if (cp_count == 0) {
                             if (strict) {
                                 RExC_parse++;   /* Position after the "}" */
                                 vFAIL("Zero length \\N{}");
@@ -14019,16 +14591,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                     else if (PASS2) {
                                         ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
                                     }
+                                    break; /* <value> contains the first code
+                                              point. Drop out of the switch to
+                                              process it */
                                 }
                                 else {
+                                    SV * multi_char_N = newSVpvn(backslash_N_beg,
+                                                 RExC_parse - backslash_N_beg);
                                     multi_char_matches
                                         = add_multi_match(multi_char_matches,
-                                                          as_text,
+                                                          multi_char_N,
                                                           cp_count);
                                 }
-                                break; /* <value> contains the first code
-                                          point. Drop out of the switch to
-                                          process it */
                             }
                         } /* End of cp_count != 1 */
 
@@ -14061,38 +14635,53 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                    vFAIL2("Empty \\%c{}", (U8)value);
                if (*RExC_parse == '{') {
                    const U8 c = (U8)value;
-                   e = strchr(RExC_parse++, '}');
-                    if (!e)
+                   e = strchr(RExC_parse, '}');
+                    if (!e) {
+                        RExC_parse++;
                         vFAIL2("Missing right brace on \\%c{}", c);
-                   while (isSPACE(*RExC_parse))
-                       RExC_parse++;
+                    }
+
+                    RExC_parse++;
+                    while (isSPACE(*RExC_parse)) {
+                         RExC_parse++;
+                   }
+
+                   if (UCHARAT(RExC_parse) == '^') {
+
+                        /* toggle.  (The rhs xor gets the single bit that
+                         * differs between P and p; the other xor inverts just
+                         * that bit) */
+                        value ^= 'P' ^ 'p';
+
+                        RExC_parse++;
+                        while (isSPACE(*RExC_parse)) {
+                            RExC_parse++;
+                        }
+                    }
+
                     if (e == RExC_parse)
                         vFAIL2("Empty \\%c{}", c);
+
                    n = e - RExC_parse;
                    while (isSPACE(*(RExC_parse + n - 1)))
                        n--;
-               }
-               else {
+               }   /* The \p isn't immediately followed by a '{' */
+               else if (! isALPHA(*RExC_parse)) {
+                    RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+                    vFAIL2("Character following \\%c must be '{' or a "
+                           "single-character Unicode property name",
+                           (U8) value);
+                }
+                else {
                    e = RExC_parse;
                    n = 1;
                }
                if (!SIZE_ONLY) {
                     SV* invlist;
                     char* name;
+                    char* base_name;    /* name after any packages are stripped */
+                    const char * const colon_colon = "::";
 
-                   if (UCHARAT(RExC_parse) == '^') {
-                        RExC_parse++;
-                        n--;
-                         /* toggle.  (The rhs xor gets the single bit that
-                          * differs between P and p; the other xor inverts just
-                          * that bit) */
-                         value ^= 'P' ^ 'p';
-
-                        while (isSPACE(*RExC_parse)) {
-                             RExC_parse++;
-                             n--;
-                        }
-                   }
                     /* Try to get the definition of the property into
                      * <invlist>.  If /i is in effect, the effective property
                      * will have its name be <__NAME_i>.  The design is
@@ -14108,7 +14697,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
                     /* Look up the property name, and get its swash and
                      * inversion list, if the property is found  */
-                    if (swash) {
+                    if (swash) {    /* Return any left-overs */
                         SvREFCNT_dec_NN(swash);
                     }
                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
@@ -14121,26 +14710,57 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                         HV* curpkg = (IN_PERL_COMPILETIME)
                                       ? PL_curstash
                                       : CopSTASH(PL_curcop);
-                        if (swash) {
+                        UV final_n = n;
+                        bool has_pkg;
+
+                        if (swash) {    /* Got a swash but no inversion list.
+                                           Something is likely wrong that will
+                                           be sorted-out later */
                             SvREFCNT_dec_NN(swash);
                             swash = NULL;
                         }
 
-                        /* Here didn't find it.  It could be a user-defined
-                         * property that will be available at run-time.  If we
-                         * accept only compile-time properties, is an error;
-                         * otherwise add it to the list for run-time look up */
-                        if (ret_invlist) {
+                        /* Here didn't find it.  It could be a an error (like a
+                         * typo) in specifying a Unicode property, or it could
+                         * be a user-defined property that will be available at
+                         * run-time.  The names of these must begin with 'In'
+                         * or 'Is' (after any packages are stripped off).  So
+                         * if not one of those, or if we accept only
+                         * compile-time properties, is an error; otherwise add
+                         * it to the list for run-time look up. */
+                        if ((base_name = rninstr(name, name + n,
+                                                 colon_colon, colon_colon + 2)))
+                        { /* Has ::.  We know this must be a user-defined
+                             property */
+                            base_name += 2;
+                            final_n -= base_name - name;
+                            has_pkg = TRUE;
+                        }
+                        else {
+                            base_name = name;
+                            has_pkg = FALSE;
+                        }
+
+                        if (   final_n < 3
+                            || base_name[0] != 'I'
+                            || (base_name[1] != 's' && base_name[1] != 'n')
+                            || ret_invlist)
+                        {
+                            const char * const msg
+                                = (has_pkg)
+                                  ? "Illegal user-defined property name"
+                                  : "Can't find Unicode property definition";
                             RExC_parse = e + 1;
-                            vFAIL2utf8f(
-                                "Property '%"UTF8f"' is unknown",
-                                UTF8fARG(UTF, n, name));
+
+                            /* diag_listed_as: Can't find Unicode property definition "%s" */
+                            vFAIL3utf8f("%s \"%"UTF8f"\"",
+                                msg, UTF8fARG(UTF, n, name));
                         }
 
                         /* If the property name doesn't already have a package
                          * name, add the current one to it so that it can be
                          * referred to outside it. [perl #121777] */
-                        if (curpkg && ! instr(name, "::")) {
+                        if (! has_pkg && curpkg) {
                             char* pkgname = HvNAME(curpkg);
                             if (strNE(pkgname, "main")) {
                                 char* full_name = Perl_form(aTHX_
@@ -14156,11 +14776,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                         (value == 'p' ? '+' : '!'),
                                         UTF8fARG(UTF, n, name));
                         has_user_defined_property = TRUE;
+                        optimizable = FALSE;    /* Will have to leave this an
+                                                   ANYOF node */
 
                         /* We don't know yet, so have to assume that the
-                         * property could match something in the Latin1 range,
-                         * hence something that isn't utf8.  Note that this
-                         * would cause things in <depends_list> to match
+                         * property could match something in the upper Latin1
+                         * range, hence something that isn't utf8.  Note that
+                         * this would cause things in <depends_list> to match
                          * inappropriately, except that any \p{}, including
                          * this one forces Unicode semantics, which means there
                          * is no <depends_list> */
@@ -14213,7 +14835,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                                 named */
 
                /* \p means they want Unicode semantics */
-               RExC_uni_semantics = 1;
+               REQUIRE_UNI_RULES(flagp, NULL);
                }
                break;
            case 'n':   value = '\n';                   break;
@@ -14301,7 +14923,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              recode_encoding:
                if (! RExC_override_recoding) {
                    SV* enc = _get_encoding();
-                   value = reg_recode((const char)(U8)value, &enc);
+                   value = reg_recode((U8)value, &enc);
                    if (!enc) {
                         if (strict) {
                             vFAIL("Invalid escape in the specified encoding");
@@ -14390,6 +15012,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     }
                     ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
                     ANYOF_POSIXL_ZERO(ret);
+
+                    /* We can't change this into some other type of node
+                     * (unless this is the only element, in which case there
+                     * are nodes that mean exactly this) as has runtime
+                     * dependencies */
+                    optimizable = FALSE;
                 }
 
                 /* Coverity thinks it is possible for this to be negative; both
@@ -14505,10 +15133,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
            }
        } /* end of namedclass \blah */
 
-        if (skip_white) {
-            RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                FALSE /* means don't recognize comments */ );
-        }
+        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
 
         /* If 'range' is set, 'value' is the ending of a range--check its
          * validity.  (If value isn't a single code point in the case of a
@@ -14540,7 +15165,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                 vFAIL2utf8f(
                     "Invalid [] range \"%"UTF8f"\"",
                     UTF8fARG(UTF, w, rangebegin));
-                NOT_REACHED; /* NOT REACHED */
+                NOT_REACHED; /* NOTREACHED */
            }
        }
        else {
@@ -14549,12 +15174,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                 && *RExC_parse == '-')
             {
                 char* next_char_ptr = RExC_parse + 1;
-                if (skip_white) {   /* Get the next real char after the '-' */
-                    next_char_ptr = regpatws(pRExC_state,
-                                             RExC_parse + 1,
-                                             FALSE); /* means don't recognize
-                                                        comments */
-                }
+
+                /* Get the next real char after the '-' */
+                SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
 
                 /* If the '-' is at the end of the class (just before the ']',
                  * it is a literal minus; otherwise it is a range */
@@ -14599,7 +15221,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
        /* non-Latin1 code point implies unicode semantics.  Must be set in
         * pass1 so is there for the whole of pass 2 */
        if (value > 255) {
-           RExC_uni_semantics = 1;
+            REQUIRE_UNI_RULES(flagp, NULL);
        }
 
         /* Ready to process either the single value, or the completed range.
@@ -14714,9 +15336,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                          * same element, neither should be a digit. */
                         if (index_start == index_final) {
                             assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
-                            || invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
-                            - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
-                            == 10);
+                            || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
+                               - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
+                               == 10)
+                               /* But actually Unicode did have one group of 11
+                                * 'digits' in 5.2, so in case we are operating
+                                * on that version, let that pass */
+                            || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
+                               - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
+                                == 11
+                               && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
+                                == 0x19D0)
+                            );
                         }
                         else if ((index_start >= 0
                                   && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
@@ -14872,7 +15503,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
        ret = reg(pRExC_state, 1, &reg_flags, depth+1);
 
-       *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
+       *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
 
        RExC_parse = save_parse;
        RExC_end = save_end;
@@ -14892,8 +15523,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      * 2) if the character class contains only a single element (including a
      *    single range), we see if there is an equivalent node for it.
      * Other checks are possible */
-    if (! ret_invlist   /* Can't optimize if returning the constructed
-                           inversion list */
+    if (   optimizable
+        && ! ret_invlist   /* Can't optimize if returning the constructed
+                              inversion list */
         && (UNLIKELY(posixl_matches_all) || element_count == 1))
     {
         U8 op = END;
@@ -15006,10 +15638,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     op = POSIXA;
                 }
             }
-            else if (AT_LEAST_ASCII_RESTRICTED || ! FOLD) {
+            else if (! FOLD || ASCII_FOLD_RESTRICTED) {
                 /* We can optimize A-Z or a-z, but not if they could match
-                 * something like the KELVIN SIGN under /i (/a means they
-                 * can't) */
+                 * something like the KELVIN SIGN under /i. */
                 if (prevvalue == 'A') {
                     if (value == 'Z'
 #ifdef EBCDIC
@@ -15105,7 +15736,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             /* Our calculated list will be for Unicode rules.  For locale
              * matching, we have to keep a separate list that is consulted at
              * runtime only when the locale indicates Unicode rules.  For
-             * non-locale, we just use to the general list */
+             * non-locale, we just use the general list */
             if (LOC) {
                 use_list = &only_utf8_locale_list;
             }
@@ -15255,7 +15886,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             if (DEPENDS_SEMANTICS) {
                 /* Under /d, everything in the upper half of the Latin1 range
                  * matches these complements */
-                ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
+                ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
             }
             else if (AT_LEAST_ASCII_RESTRICTED) {
                 /* Under /a and /aa, everything above ASCII matches these
@@ -15342,7 +15973,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         }
 
         if (warn_super) {
-            ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
+            ANYOF_FLAGS(ret)
+             |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+
+            /* Because an ANYOF node is the only one that warns, this node
+             * can't be optimized into something else */
+            optimizable = FALSE;
         }
     }
 
@@ -15364,8 +16000,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         if (only_utf8_locale_list) {
             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
         }
-        else if (cp_list) { /* Look to see if there a 0-255 code point is in
-                               the list */
+        else if (cp_list) { /* Look to see if a 0-255 code point is in list */
             UV start, end;
             invlist_iterinit(cp_list);
             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
@@ -15398,6 +16033,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     }
 
     if (ret_invlist) {
+        assert(cp_list);
+
         *ret_invlist = cp_list;
         SvREFCNT_dec(swash);
 
@@ -15422,21 +16059,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      * adjacent such nodes.  And if the class is equivalent to things like /./,
      * expensive run-time swashes can be avoided.  Now that we have more
      * complete information, we can find things necessarily missed by the
-     * earlier code.  I (khw) am not sure how much to look for here.  It would
-     * be easy, but perhaps too slow, to check any candidates against all the
-     * node types they could possibly match using _invlistEQ(). */
-
-    if (cp_list
-        && ! invert
-        && ! depends_list
-        && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
-        && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
-
-           /* We don't optimize if we are supposed to make sure all non-Unicode
-            * code points raise a warning, as only ANYOF nodes have this check.
-            * */
-        && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
-    {
+     * earlier code.  I (khw) did some benchmarks and found essentially no
+     * speed difference between using a POSIXA node versus an ANYOF node, so
+     * there is no reason to optimize, for example [A-Za-z0-9_] into
+     * [[:word:]]/a (although if we did it in the sizing pass it would save
+     * space).  _invlistEQ() could be used if one ever wanted to do something
+     * like this at this point in the code */
+
+    if (optimizable && cp_list && ! invert && ! depends_list) {
         UV start, end;
         U8 op = END;  /* The optimzation node-type */
         const char * cur_parse= RExC_parse;
@@ -15445,9 +16075,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         if (! invlist_iternext(cp_list, &start, &end)) {
 
             /* Here, the list is empty.  This happens, for example, when a
-             * Unicode property is the only thing in the character class, and
-             * it doesn't match anything.  (perluniprops.pod notes such
-             * properties) */
+             * Unicode property that doesn't match anything is the only element
+             * in the character class (perluniprops.pod notes such properties).
+             * */
             op = OPFAIL;
             *flagp |= HASWIDTH|SIMPLE;
         }
@@ -15503,7 +16133,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     }
                 }
             }
-        }
+        }   /* End of first range contains just a single code point */
         else if (start == 0) {
             if (end == UV_MAX) {
                 op = SANY;
@@ -15525,7 +16155,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             RExC_parse = (char *)orig_parse;
             RExC_emit = (regnode *)orig_emit;
 
-            ret = reg_node(pRExC_state, op);
+            if (regarglen[op]) {
+                ret = reganode(pRExC_state, op, 0);
+            } else {
+                ret = reg_node(pRExC_state, op);
+            }
 
             RExC_parse = (char *)cur_parse;
 
@@ -15834,50 +16468,86 @@ S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
     return p;
 }
 
-/* nextchar()
-
-   Advances the parse position, and optionally absorbs
-   "whitespace" from the inputstream.
-
-   Without /x "whitespace" means (?#...) style comments only,
-   with /x this means (?#...) and # comments and whitespace proper.
+STATIC void
+S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
+                                char ** p,
+                                const bool force_to_xmod
+                         )
+{
+    /* If the text at the current parse position '*p' is a '(?#...)' comment,
+     * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
+     * is /x whitespace, advance '*p' so that on exit it points to the first
+     * byte past all such white space and comments */
 
-   Returns the RExC_parse point from BEFORE the scan occurs.
+    const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
 
-   This is the /x friendly way of saying RExC_parse++.
-*/
+    PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
 
-STATIC char*
-S_nextchar(pTHX_ RExC_state_t *pRExC_state)
-{
-    char* const retval = RExC_parse++;
-
-    PERL_ARGS_ASSERT_NEXTCHAR;
+    assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
 
     for (;;) {
-       if (RExC_end - RExC_parse >= 3
-           && *RExC_parse == '('
-           && RExC_parse[1] == '?'
-           && RExC_parse[2] == '#')
+       if (RExC_end - (*p) >= 3
+           && *(*p)     == '('
+           && *(*p + 1) == '?'
+           && *(*p + 2) == '#')
        {
-           while (*RExC_parse != ')') {
-               if (RExC_parse == RExC_end)
+           while (*(*p) != ')') {
+               if ((*p) == RExC_end)
                    FAIL("Sequence (?#... not terminated");
-               RExC_parse++;
+               (*p)++;
            }
-           RExC_parse++;
+           (*p)++;
            continue;
        }
-       if (RExC_flags & RXf_PMf_EXTENDED) {
-            char * p = regpatws(pRExC_state, RExC_parse,
-                                          TRUE); /* means recognize comments */
-            if (p != RExC_parse) {
-                RExC_parse = p;
+
+       if (use_xmod) {
+            const char * save_p = *p;
+            while ((*p) < RExC_end) {
+                STRLEN len;
+                if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
+                    (*p) += len;
+                }
+                else if (*(*p) == '#') {
+                    (*p) = reg_skipcomment(pRExC_state, (*p));
+                }
+                else {
+                    break;
+                }
+            }
+            if (*p != save_p) {
                 continue;
             }
        }
-        return retval;
+
+        break;
     }
+
+    return;
+}
+
+/* nextchar()
+
+   Advances the parse position by one byte, unless that byte is the beginning
+   of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
+   those two cases, the parse position is advanced beyond all such comments and
+   white space.
+
+   This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
+*/
+
+STATIC void
+S_nextchar(pTHX_ RExC_state_t *pRExC_state)
+{
+    PERL_ARGS_ASSERT_NEXTCHAR;
+
+    assert(   ! UTF
+           || UTF8_IS_INVARIANT(*RExC_parse)
+           || UTF8_IS_START(*RExC_parse));
+
+    RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+
+    skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                            FALSE /* Don't assume /x */ );
 }
 
 STATIC regnode *
@@ -16366,7 +17036,7 @@ Perl_regdump(pTHX_ const regexp *r)
            PerlIO_printf(Perl_debug_log, "(SBOL)");
         if (r->intflags & PREGf_ANCH_GPOS)
            PerlIO_printf(Perl_debug_log, "(GPOS)");
-       PerlIO_putc(Perl_debug_log, ' ');
+       (void)PerlIO_putc(Perl_debug_log, ' ');
     }
     if (r->intflags & PREGf_GPOS_SEEN)
        PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
@@ -16405,8 +17075,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
-    || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
-    || _CC_VERTSPACE != 16
+    || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
   #error Need to adjust order of anyofs[]
 #endif
         "\\w",
@@ -16435,8 +17104,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         "[:^blank:]",
         "[:xdigit:]",
         "[:^xdigit:]",
-        "[:space:]",
-        "[:^space:]",
         "[:cntrl:]",
         "[:^cntrl:]",
         "[:ascii:]",
@@ -16508,9 +17175,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         }
 
     } else if (k == CURLY) {
+        U32 lo = ARG1(o), hi = ARG2(o);
        if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
            Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
-       Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
+        Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
+        if (hi == REG_INFTY)
+            sv_catpvs(sv, "INFTY");
+        else
+            Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
+        sv_catpvs(sv, "}");
     }
     else if (k == WHILEM && o->flags)                  /* Ordinal/of */
        Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
@@ -16518,7 +17191,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
              || k == GROUPP || OP(o)==ACCEPT)
     {
         AV *name_list= NULL;
-       Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
+        U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
+        Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno);        /* Parenth number */
        if ( RXp_PAREN_NAMES(prog) ) {
             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
         } else if ( pRExC_state ) {
@@ -16526,12 +17200,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         }
         if (name_list) {
             if ( k != REF || (OP(o) < NREF)) {
-                SV **name= av_fetch(name_list, ARG(o), 0 );
+                SV **name= av_fetch(name_list, parno, 0 );
                if (name)
                    Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
             }
             else {
-                SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
+                SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
                 I32 *nums=(I32*)SvPVX(sv_dat);
                 SV **name= av_fetch(name_list, nums[0], 0 );
                 I32 n;
@@ -16574,11 +17248,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
         }
     }
-    else if (k == VERB) {
-        if (!o->flags)
-            Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
-                          SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
-    } else if (k == LOGICAL)
+    else if (k == LOGICAL)
         /* 2: embedded, otherwise 1 */
        Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
     else if (k == ANYOF) {
@@ -16587,8 +17257,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         SV* bitmap_invlist;  /* Will hold what the bit map contains */
 
 
-       if (OP(o) == ANYOFL)
-           sv_catpvs(sv, "{loc}");
+       if (OP(o) == ANYOFL) {
+            if (flags & ANYOF_LOC_REQ_UTF8) {
+                sv_catpvs(sv, "{utf8-loc}");
+            }
+            else {
+                sv_catpvs(sv, "{loc}");
+            }
+        }
        if (flags & ANYOF_LOC_FOLD)
            sv_catpvs(sv, "{i}");
        Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
@@ -16624,7 +17300,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                     sv_catpvs(sv, "^");
             }
 
-            if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
+            if (OP(o) == ANYOFD
+                && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+            {
                 sv_catpvs(sv, "{non-utf8-latin1-all}");
             }
 
@@ -16735,10 +17413,26 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
         }
     }
+    else if (k == BOUND || k == NBOUND) {
+        /* Must be synced with order of 'bound_type' in regcomp.h */
+        const char * const bounds[] = {
+            "",      /* Traditional */
+            "{gcb}",
+            "{sb}",
+            "{wb}"
+        };
+        sv_catpv(sv, bounds[FLAGS(o)]);
+    }
     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
        Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
     else if (OP(o) == SBOL)
         Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
+
+    /* add on the verb argument if there is one */
+    if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
+        Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
+                       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
+    }
 #else
     PERL_UNUSED_CONTEXT;
     PERL_UNUSED_ARG(sv);
@@ -17313,6 +18007,47 @@ S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
 }
 
+/* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
+
+#ifndef PERL_IN_XSUB_RE
+void
+Perl_save_re_context(pTHX)
+{
+    I32 nparens = -1;
+    I32 i;
+
+    /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
+
+    if (PL_curpm) {
+       const REGEXP * const rx = PM_GETRE(PL_curpm);
+       if (rx)
+            nparens = RX_NPARENS(rx);
+    }
+
+    /* RT #124109. This is a complete hack; in the SWASHNEW case we know
+     * that PL_curpm will be null, but that utf8.pm and the modules it
+     * loads will only use $1..$3.
+     * The t/porting/re_context.t test file checks this assumption.
+     */
+    if (nparens == -1)
+        nparens = 3;
+
+    for (i = 1; i <= nparens; i++) {
+        char digits[TYPE_CHARS(long)];
+        const STRLEN len = my_snprintf(digits, sizeof(digits),
+                                       "%lu", (long)i);
+        GV *const *const gvp
+            = (GV**)hv_fetch(PL_defstash, digits, len, 0);
+
+        if (gvp) {
+            GV * const gv = *gvp;
+            if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
+                save_scalar(gv);
+        }
+    }
+}
+#endif
+
 #ifdef DEBUGGING
 
 STATIC void
@@ -17489,9 +18224,13 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
         this_end = (end < NUM_ANYOF_CODE_POINTS)
                     ? end
                     : NUM_ANYOF_CODE_POINTS - 1;
+#if NUM_ANYOF_CODE_POINTS > 256
         format = (this_end < 256)
                  ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
                  : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
+#else
+        format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}";
+#endif
         GCC_DIAG_IGNORE(-Wformat-nonliteral);
         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
         GCC_DIAG_RESTORE;
@@ -17777,11 +18516,5 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
 #endif /* DEBUGGING */
 
 /*
- * Local variables:
- * c-indentation-style: bsd
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- *
  * ex: set ts=8 sts=4 sw=4 et:
  */