Do not define invlistEQ in the re extension.

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index fcaf153..8474e82 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -214,6 +214,7 @@ struct RExC_state_t {
  #define RExC_mysv2     (pRExC_state->mysv2)
  
  #endif
+    bool        seen_unfolded_sharp_s;
  };
  
  #define RExC_flags     (pRExC_state->flags)
@@ -226,6 +227,17 @@ struct RExC_state_t {
  #define RExC_end       (pRExC_state->end)
  #define RExC_parse     (pRExC_state->parse)
  #define RExC_whilem_seen       (pRExC_state->whilem_seen)
+
+/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
+ * EXACTF node, hence was parsed under /di rules.  If later in the parse,
+ * something forces the pattern into using /ui rules, the sharp s should be
+ * folded into the sequence 'ss', which takes up more space than previously
+ * calculated.  This means that the sizing pass needs to be restarted.  (The
+ * node also becomes an EXACTFU_SS.)  For all other characters, an EXACTF node
+ * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
+ * so there is no need to resize [perl #125990]. */
+#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
+
  #ifdef RE_TRACK_PATTERN_OFFSETS
  #define RExC_offsets   (pRExC_state->rxi->u.offsets) /* I am not like the
                                                           others */
@@ -299,7 +311,9 @@ struct RExC_state_t {
  #define        SPSTART         0x04    /* Starts with * or + */
  #define POSTPONED      0x08    /* (?1),(?&name), (??{...}) or similar */
  #define TRYAGAIN       0x10    /* Weeded out a declaration. */
-#define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
+#define RESTART_PASS1   0x20    /* Need to restart sizing pass */
+#define NEED_UTF8       0x40    /* In conjunction with RESTART_PASS1, need to
+                                   calcuate sizes as UTF-8 */
  
  #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
  
@@ -318,12 +332,30 @@ struct RExC_state_t {
  #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
  #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
  
-#define REQUIRE_UTF8   STMT_START {                                       \
+#define REQUIRE_UTF8(flagp) STMT_START {                                   \
                                       if (!UTF) {                           \
-                                         *flagp = RESTART_UTF8;            \
+                                         assert(PASS1);                    \
+                                         *flagp = RESTART_PASS1|NEED_UTF8; \
                                           return NULL;                      \
                                       }                                     \
-                        } STMT_END
+                             } STMT_END
+
+/* Change from /d into /u rules, and restart the parse if we've already seen
+ * something whose size would increase as a result, by setting *flagp and
+ * returning 'restart_retval'.  RExC_uni_semantics is a flag that indicates
+ * we've change to /u during the parse.  */
+#define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
+    STMT_START {                                                            \
+            if (DEPENDS_SEMANTICS) {                                        \
+                assert(PASS1);                                              \
+                set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
+                RExC_uni_semantics = 1;                                     \
+                if (RExC_seen_unfolded_sharp_s) {                           \
+                    *flagp |= RESTART_PASS1;                                \
+                    return restart_retval;                                  \
+                }                                                           \
+            }                                                               \
+    } STMT_END
  
  /* This converts the named class defined in regcomp.h to its equivalent class
   * number defined in handy.h. */
@@ -629,7 +661,7 @@ static const scan_data_t zero_scan_data =
  } STMT_END
  
  /* A specialized version of vFAIL2 that works with UTF8f */
-#define vFAIL2utf8f(m, a1) STMT_START { \
+#define vFAIL2utf8f(m, a1) STMT_START {            \
      const IV offset = RExC_parse - RExC_precomp;   \
      if (!SIZE_ONLY)                                \
          SAVEFREESV(RExC_rx_sv);                    \
@@ -637,6 +669,14 @@ static const scan_data_t zero_scan_data =
              REPORT_LOCATION_ARGS(offset));         \
  } STMT_END
  
+#define vFAIL3utf8f(m, a1, a2) STMT_START {             \
+    const IV offset = RExC_parse - RExC_precomp;        \
+    if (!SIZE_ONLY)                                     \
+        SAVEFREESV(RExC_rx_sv);                         \
+    S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
+            REPORT_LOCATION_ARGS(offset));              \
+} STMT_END
+
  /* These have asserts in them because of [perl #122671] Many warnings in
   * regcomp.c can occur twice.  If they get output in pass1 and later in that
   * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
@@ -1177,7 +1217,9 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
  
      /* If this can match all upper Latin1 code points, have to add them
       * as well */
-    if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
+    if (OP(node) == ANYOFD
+        && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+    {
          _invlist_union(invlist, PL_UpperLatin1, &invlist);
      }
  
@@ -1255,12 +1297,20 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
           * that should be; while the consequences for having /l bugs is
           * incorrect matches */
          if (ssc_is_anything((regnode_ssc *)and_with)) {
-            anded_flags |= ANYOF_WARN_SUPER;
+            anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
          }
      }
      else {
          anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
-        anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
+        if (OP(and_with) == ANYOFD) {
+            anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
+        }
+        else {
+            anded_flags = ANYOF_FLAGS(and_with)
+            &( ANYOF_COMMON_FLAGS
+              |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
+              |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
+        }
      }
  
      ANYOF_FLAGS(ssc) &= anded_flags;
@@ -1411,6 +1461,12 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
      else {
          ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
          ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
+        if (OP(or_with) != ANYOFD) {
+            ored_flags
+            |= ANYOF_FLAGS(or_with)
+             & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
+                |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
+        }
      }
  
      ANYOF_FLAGS(ssc) |= ored_flags;
@@ -1609,7 +1665,10 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
      /* The code in this file assumes that all but these flags aren't relevant
       * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
       * by the time we reach here */
-    assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
+    assert(! (ANYOF_FLAGS(ssc)
+        & ~( ANYOF_COMMON_FLAGS
+            |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
+            |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
  
      populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
  
@@ -1623,6 +1682,10 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
          ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
      }
  
+    if (RExC_contains_locale) {
+        OP(ssc) = ANYOFL;
+    }
+
      assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
  }
  
@@ -2001,7 +2064,7 @@ is the recommended Unicode-aware way of saying
  #define TRIE_STORE_REVCHAR(val)                                            \
      STMT_START {                                                           \
         if (UTF) {                                                         \
-            SV *zlopp = newSV(7); /* XXX: optimize me */                   \
+            SV *zlopp = newSV(UTF8_MAXBYTES);                             \
             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
              unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
@@ -3304,6 +3367,14 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour
   * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
   * these get optimized out
   *
+ * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
+ * as possible, even if that means splitting an existing node so that its first
+ * part is moved to the preceeding node.  This would maximise the efficiency of
+ * memEQ during matching.  Elsewhere in this file, khw proposes splitting
+ * EXACTFish nodes into portions that don't change under folding vs those that
+ * do.  Those portions that don't change may be the only things in the pattern that
+ * could be used to find fixed and floating strings.
+ *
   * If a node is to match under /i (folded), the number of characters it matches
   * can be different than its character length if it contains a multi-character
   * fold.  *min_subtract is set to the total delta number of characters of the
@@ -3652,6 +3723,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
               * this function, we need to flag any occurrences of the sharp s.
               * This character forbids trie formation (because of added
               * complexity) */
+#if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
+   || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
+                                      || UNICODE_DOT_DOT_VERSION > 0)
             while (s < s_end) {
                  if (*s == LATIN_SMALL_LETTER_SHARP_S) {
                      OP(scan) = EXACTFA_NO_TRIE;
@@ -3659,7 +3733,6 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
                      break;
                  }
                  s++;
-                continue;
              }
          }
         else {
@@ -3705,6 +3778,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
                  *min_subtract += len - 1;
                  s += len;
             }
+#endif
         }
      }
  
@@ -5094,6 +5168,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
                     }
                     break;
  
+                case ANYOFD:
                  case ANYOFL:
                  case ANYOF:
                     if (flags & SCF_DO_STCLASS_AND)
@@ -6543,7 +6618,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
  
      /* ignore the utf8ness if the pattern is 0 length */
      RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
+
      RExC_uni_semantics = 0;
+    RExC_seen_unfolded_sharp_s = 0;
      RExC_contains_locale = 0;
      RExC_contains_i = 0;
      RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
@@ -6564,8 +6641,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
          });
  
    redo_first_pass:
-    /* we jump here if we upgrade the pattern to utf8 and have to
-     * recompile */
+    /* we jump here if we have to recompile, e.g., from upgrading the pattern
+     * to utf8 */
  
      if ((pm_flags & PMf_USE_RE_EVAL)
                 /* this second condition covers the non-regex literal case,
@@ -6599,7 +6676,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      if (rx_flags & PMf_FOLD) {
          RExC_contains_i = 1;
      }
-    if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
+    if (   initial_charset == REGEX_DEPENDS_CHARSET
+        && (RExC_utf8 ||RExC_uni_semantics))
+    {
  
         /* Set to use unicode semantics if the pattern is in utf8 and has the
          * 'depends' charset specified, as it means unicode when utf8  */
@@ -6611,7 +6690,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      RExC_pm_flags = pm_flags;
  
      if (runtime_code) {
-       if (TAINTING_get && TAINT_get)
+        assert(TAINTING_get || !TAINT_get);
+       if (TAINT_get)
             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
  
         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
@@ -6685,9 +6765,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
          at least some part of the pattern, and therefore must convert the whole
          thing.
          -- dmq */
-        if (flags & RESTART_UTF8) {
-            S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
+        if (flags & RESTART_PASS1) {
+            if (flags & NEED_UTF8) {
+                S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
                                      pRExC_state->num_code_blocks);
+            }
+            else {
+                DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+                "Need to redo pass 1\n"));
+            }
+
              goto redo_first_pass;
          }
          Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
@@ -6754,7 +6841,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         for (n = 0; n < pRExC_state->num_code_blocks; n++)
             if (pRExC_state->code_blocks[n].src_regex)
                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
-       SAVEFREEPV(pRExC_state->code_blocks);
+       if(pRExC_state->code_blocks)
+           SAVEFREEPV(pRExC_state->code_blocks); /* often null */
      }
  
      {
@@ -6769,25 +6857,25 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                      || ! has_charset);
          bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
                                                     == REG_RUN_ON_COMMENT_SEEN);
-       U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
+       U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
                             >> RXf_PMf_STD_PMMOD_SHIFT);
         const char *fptr = STD_PAT_MODS;        /*"msixn"*/
         char *p;
-        /* Allocate for the worst case, which is all the std flags are turned
-         * on.  If more precision is desired, we could do a population count of
-         * the flags set.  This could be done with a small lookup table, or by
-         * shifting, masking and adding, or even, when available, assembly
-         * language for a machine-language population count.
-         * We never output a minus, as all those are defaults, so are
+
+        /* We output all the necessary flags; we never output a minus, as all
+         * those are defaults, so are
           * covered by the caret */
         const STRLEN wraplen = plen + has_p + has_runon
              + has_default       /* If needs a caret */
+            + PL_bitcount[reganch] /* 1 char for each set standard flag */
  
                 /* If needs a character set specifier */
             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
-            + (sizeof(STD_PAT_MODS) - 1)
              + (sizeof("(?:)") - 1);
  
+        /* make sure PL_bitcount bounds not exceeded */
+        assert(sizeof(STD_PAT_MODS) <= 8);
+
          Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
         r->xpv_len_u.xpvlenu_pv = p;
         if (RExC_utf8)
@@ -9301,9 +9389,9 @@ Perl__load_PL_utf8_foldclosures (pTHX)
  }
  #endif
  
-#ifdef PERL_ARGS_ASSERT__INVLISTEQ
+#if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
  bool
-S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
+Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
  {
      /* Return a boolean as to if the two passed in inversion lists are
       * identical.  The final argument, if TRUE, says to take the complement of
@@ -9721,9 +9809,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
          ++RExC_parse;
      }
  
-    if (PASS2) {
-        STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
-    }
+    vFAIL("Sequence (?... not terminated");
  }
  
  /*
@@ -9743,10 +9829,10 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
  #endif
  
  /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
-   flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
-   needs to be restarted.
-   Otherwise would only return NULL if regbranch() returns NULL, which
-   cannot happen.  */
+   flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
+   needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
+   upgraded to UTF-8.  Otherwise would only return NULL if regbranch() returns
+   NULL, which cannot happen.  */
  STATIC regnode *
  S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
      /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
@@ -9792,9 +9878,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             STRLEN verb_len = 0;
             char *start_arg = NULL;
             unsigned char op = 0;
-           int argok = 1;
-            int internal_argval = 0; /* internal_argval is only useful if
-                                        !argok */
+            int arg_required = 0;
+            int internal_argval = -1; /* if >-1 we are not allowed an argument*/
  
              if (has_intervening_patws) {
                  RExC_parse++;
@@ -9836,14 +9921,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
              case 'F':  /* (*FAIL) */
                  if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
                     op = OPFAIL;
-                   argok = 0;
                 }
                 break;
              case ':':  /* (*:NAME) */
             case 'M':  /* (*MARK:NAME) */
                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
                      op = MARKPOINT;
-                    argok = -1;
+                    arg_required = 1;
                  }
                  break;
              case 'P':  /* (*PRUNE) */
@@ -9868,36 +9952,30 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                      "Unknown verb pattern '%"UTF8f"'",
                      UTF8fARG(UTF, verb_len, start_verb));
             }
-           if ( argok ) {
-                if ( start_arg && internal_argval ) {
-                   vFAIL3("Verb pattern '%.*s' may not have an argument",
-                       verb_len, start_verb);
-               } else if ( argok < 0 && !start_arg ) {
-                    vFAIL3("Verb pattern '%.*s' has a mandatory argument",
-                       verb_len, start_verb);
-               } else {
-                   ret = reganode(pRExC_state, op, internal_argval);
-                   if ( ! internal_argval && ! SIZE_ONLY ) {
-                        if (start_arg) {
-                            SV *sv = newSVpvn( start_arg,
-                                               RExC_parse - start_arg);
-                            ARG(ret) = add_data( pRExC_state,
-                                                 STR_WITH_LEN("S"));
-                            RExC_rxi->data->data[ARG(ret)]=(void*)sv;
-                            ret->flags = 0;
-                        } else {
-                            ret->flags = 1;
-                        }
-                    }
-               }
-               if (!internal_argval)
-                    RExC_seen |= REG_VERBARG_SEEN;
-           } else if ( start_arg ) {
-               vFAIL3("Verb pattern '%.*s' may not have an argument",
-                       verb_len, start_verb);
-           } else {
-               ret = reg_node(pRExC_state, op);
-           }
+            if ( arg_required && !start_arg ) {
+                vFAIL3("Verb pattern '%.*s' has a mandatory argument",
+                    verb_len, start_verb);
+            }
+            if (internal_argval == -1) {
+                ret = reganode(pRExC_state, op, 0);
+            } else {
+                ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
+            }
+            RExC_seen |= REG_VERBARG_SEEN;
+            if ( ! SIZE_ONLY ) {
+                if (start_arg) {
+                    SV *sv = newSVpvn( start_arg,
+                                       RExC_parse - start_arg);
+                    ARG(ret) = add_data( pRExC_state,
+                                         STR_WITH_LEN("S"));
+                    RExC_rxi->data->data[ARG(ret)]=(void*)sv;
+                    ret->flags = 1;
+                } else {
+                    ret->flags = 0;
+                }
+                if ( internal_argval != -1 )
+                    ARG2L_SET(ret, internal_argval);
+            }
             nextchar(pRExC_state);
             return ret;
          }
@@ -9912,7 +9990,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
  
             RExC_parse++;
             paren = *RExC_parse++;
-           ret = NULL;                 /* For look-ahead/behind. */
+           ret = NULL;                 /* For lookahead/behind. */
             switch (paren) {
  
             case 'P':   /* (?P...) variants for those used to PCRE/Python */
@@ -10057,10 +10135,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             case '!':           /* (?!...) */
                 RExC_seen_zerolen++;
                 /* check if we're really just a "FAIL" assertion */
-               --RExC_parse;
-               nextchar(pRExC_state);
+                skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                        FALSE /* Don't force to /x */ );
                 if (*RExC_parse == ')') {
-                   ret=reg_node(pRExC_state, OPFAIL);
+                    ret=reganode(pRExC_state, OPFAIL, 0);
                     nextchar(pRExC_state);
                     return ret;
                 }
@@ -10263,9 +10341,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 int is_define= 0;
                  const int DEFINE_len = sizeof("DEFINE") - 1;
                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
-                   if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
-                       || RExC_parse[1] == '<'
-                       || RExC_parse[1] == '{') { /* Lookahead or eval. */
+                    if (
+                        RExC_parse[1] == '=' ||
+                        RExC_parse[1] == '!' ||
+                        RExC_parse[1] == '<' ||
+                        RExC_parse[1] == '{'
+                    ) { /* Lookahead or eval. */
                         I32 flag;
                          regnode *tail;
  
@@ -10274,8 +10355,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                             ret->flags = 1;
  
                          tail = reg(pRExC_state, 1, &flag, depth+1);
-                        if (flag & RESTART_UTF8) {
-                            *flagp = RESTART_UTF8;
+                        if (flag & (RESTART_PASS1|NEED_UTF8)) {
+                            *flagp = flag & (RESTART_PASS1|NEED_UTF8);
                              return NULL;
                          }
                          REGTAIL(pRExC_state, ret, tail);
@@ -10339,7 +10420,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
                      /* (?(1)...) */
                     char c;
-                   char *tmp;
                      UV uv;
                      if (grok_atoUV(RExC_parse, &uv, &endptr)
                          && uv <= I32_MAX
@@ -10347,24 +10427,23 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                          parno = (I32)uv;
                          RExC_parse = (char*)endptr;
                      }
-                    /* XXX else what? */
+                    else {
+                        vFAIL("panic: grok_atoUV returned FALSE");
+                    }
                      ret = reganode(pRExC_state, GROUPP, parno);
  
                   insert_if_check_paren:
-                   if (*(tmp = nextchar(pRExC_state)) != ')') {
-                        /* nextchar also skips comments, so undo its work
-                         * and skip over the the next character.
-                         */
-                        RExC_parse = tmp;
+                   if (UCHARAT(RExC_parse) != ')') {
                          RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
                         vFAIL("Switch condition not recognized");
                     }
+                   nextchar(pRExC_state);
                   insert_if:
                      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
                      br = regbranch(pRExC_state, &flags, 1,depth+1);
                     if (br == NULL) {
-                        if (flags & RESTART_UTF8) {
-                            *flagp = RESTART_UTF8;
+                        if (flags & (RESTART_PASS1|NEED_UTF8)) {
+                            *flagp = flags & (RESTART_PASS1|NEED_UTF8);
                              return NULL;
                          }
                          FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
@@ -10372,7 +10451,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                      } else
                          REGTAIL(pRExC_state, br, reganode(pRExC_state,
                                                            LONGJMP, 0));
-                   c = *nextchar(pRExC_state);
+                   c = UCHARAT(RExC_parse);
+                    nextchar(pRExC_state);
                     if (flags&HASWIDTH)
                         *flagp |= HASWIDTH;
                     if (c == '|') {
@@ -10383,8 +10463,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                          lastbr = reganode(pRExC_state, IFTHEN, 0);
  
                          if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
-                            if (flags & RESTART_UTF8) {
-                                *flagp = RESTART_UTF8;
+                            if (flags & (RESTART_PASS1|NEED_UTF8)) {
+                                *flagp = flags & (RESTART_PASS1|NEED_UTF8);
                                  return NULL;
                              }
                              FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
@@ -10393,7 +10473,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                          REGTAIL(pRExC_state, ret, lastbr);
                         if (flags&HASWIDTH)
                             *flagp |= HASWIDTH;
-                       c = *nextchar(pRExC_state);
+                        c = UCHARAT(RExC_parse);
+                        nextchar(pRExC_state);
                     }
                     else
                         lastbr = NULL;
@@ -10481,8 +10562,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
      /*     branch_len = (paren != 0); */
  
      if (br == NULL) {
-        if (flags & RESTART_UTF8) {
-            *flagp = RESTART_UTF8;
+        if (flags & (RESTART_PASS1|NEED_UTF8)) {
+            *flagp = flags & (RESTART_PASS1|NEED_UTF8);
              return NULL;
          }
          FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
@@ -10528,8 +10609,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
          br = regbranch(pRExC_state, &flags, 0, depth+1);
  
         if (br == NULL) {
-            if (flags & RESTART_UTF8) {
-                *flagp = RESTART_UTF8;
+            if (flags & (RESTART_PASS1|NEED_UTF8)) {
+                *flagp = flags & (RESTART_PASS1|NEED_UTF8);
                  return NULL;
              }
              FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
@@ -10660,12 +10741,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
  
      /* Check for proper termination. */
      if (paren) {
-        /* restore original flags, but keep (?p) */
+        /* restore original flags, but keep (?p) and, if we've changed from /d
+         * rules to /u, keep the /u */
         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
-       if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
+        if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
+            set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
+        }
+       if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
             RExC_parse = oregcomp_parse;
             vFAIL("Unmatched (");
         }
+       nextchar(pRExC_state);
      }
      else if (!paren && RExC_parse < RExC_end) {
         if (*RExC_parse == ')') {
@@ -10690,8 +10776,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
   *
   * Implements the concatenation operator.
   *
- * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
- * restarted.
+ * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
+ * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
   */
  STATIC regnode *
  S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
@@ -10722,16 +10808,16 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
  
      *flagp = WORST;                    /* Tentatively. */
  
-    RExC_parse--;
-    nextchar(pRExC_state);
+    skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                            FALSE /* Don't force to /x */ );
      while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
         flags &= ~TRYAGAIN;
          latest = regpiece(pRExC_state, &flags,depth+1);
         if (latest == NULL) {
             if (flags & TRYAGAIN)
                 continue;
-            if (flags & RESTART_UTF8) {
-                *flagp = RESTART_UTF8;
+            if (flags & (RESTART_PASS1|NEED_UTF8)) {
+                *flagp = flags & (RESTART_PASS1|NEED_UTF8);
                  return NULL;
              }
              FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
@@ -10773,8 +10859,8 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
   *
   * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
   * TRYAGAIN.
- * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
- * restarted.
+ * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
+ * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
   */
  STATIC regnode *
  S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
@@ -10803,8 +10889,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
      ret = regatom(pRExC_state, &flags,depth+1);
      if (ret == NULL) {
-       if (flags & (TRYAGAIN|RESTART_UTF8))
-           *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
+       if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8))
+           *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8);
          else
              FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
         return(NULL);
@@ -10864,31 +10950,40 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                       * enough space for all the things we are about to throw
                       * away, but we can shrink it by the ammount we are about
                       * to re-use here */
-                    RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
+                    RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
                  }
                  else {
                      ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
                      RExC_emit = orig_emit;
                  }
-                ret = reg_node(pRExC_state, OPFAIL);
+                ret = reganode(pRExC_state, OPFAIL, 0);
                  return ret;
              }
-            else if (min == max
-                     && RExC_parse < RExC_end
-                     && (*RExC_parse == '?' || *RExC_parse == '+'))
+            else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?')
              {
                  if (PASS2) {
                      ckWARN2reg(RExC_parse + 1,
                                 "Useless use of greediness modifier '%c'",
                                 *RExC_parse);
                  }
-                /* Absorb the modifier, so later code doesn't see nor use
-                    * it */
-                nextchar(pRExC_state);
              }
  
           do_curly:
             if ((flags&SIMPLE)) {
+                if (min == 0 && max == REG_INFTY) {
+                    reginsert(pRExC_state, STAR, ret, depth+1);
+                    ret->flags = 0;
+                    MARK_NAUGHTY(4);
+                    RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
+                    goto nest_check;
+                }
+                if (min == 1 && max == REG_INFTY) {
+                    reginsert(pRExC_state, PLUS, ret, depth+1);
+                    ret->flags = 0;
+                    MARK_NAUGHTY(3);
+                    RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
+                    goto nest_check;
+                }
                  MARK_NAUGHTY_EXP(2, 2);
                 reginsert(pRExC_state, CURLY, ret, depth+1);
                  Set_Node_Offset(ret, parse_start+1); /* MJD */
@@ -10962,22 +11057,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
      *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
  
-    if (op == '*' && (flags&SIMPLE)) {
-       reginsert(pRExC_state, STAR, ret, depth+1);
-       ret->flags = 0;
-       MARK_NAUGHTY(4);
-        RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
-    }
-    else if (op == '*') {
+    if (op == '*') {
         min = 0;
         goto do_curly;
      }
-    else if (op == '+' && (flags&SIMPLE)) {
-       reginsert(pRExC_state, PLUS, ret, depth+1);
-       ret->flags = 0;
-       MARK_NAUGHTY(3);
-        RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
-    }
      else if (op == '+') {
         min = 1;
         goto do_curly;
@@ -11081,14 +11164,17 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
    * sequence. *node_p * will be set to a generated node returned by this
    * function calling S_reg().
    *
-  * The final possibility, which happens only when the fourth one would
-  * otherwise be in effect, is that one of those code points requires the
-  * pattern to be recompiled as UTF-8.  The function returns FALSE, and sets
-  * the RESTART_UTF8 flag in *flagp.  When this happens, the caller needs to
-  * desist from continuing parsing, and return this information to its caller.
-  * This is not set for when there is only one code point, as this can be
-  * called as part of an ANYOF node, and they can store above-Latin1 code
-  * points without the pattern having to be in UTF-8.
+  * The final possibility is that it is premature to be calling this function;
+  * that pass1 needs to be restarted.  This can happen when this changes from
+  * /d to /u rules, or when the pattern needs to be upgraded to UTF-8.  The
+  * latter occurs only when the fourth possibility would otherwise be in
+  * effect, and is because one of those code points requires the pattern to be
+  * recompiled as UTF-8.  The function returns FALSE, and sets the
+  * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate.  When this
+  * happens, the caller needs to desist from continuing parsing, and return
+  * this information to its caller.  This is not set for when there is only one
+  * code point, as this can be called as part of an ANYOF node, and they can
+  * store above-Latin1 code points without the pattern having to be in UTF-8.
    *
    * For non-single-quoted regexes, the tokenizer has resolved character and
    * sequence names inside \N{...} into their Unicode values, normalizing the
@@ -11103,14 +11189,12 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
    * parser.  But if the single-quoted regex is something like '\N{U+41}', that
    * is legal and handled here.  The code point is Unicode, and has to be
    * translated into the native character set for non-ASCII platforms.
-  * the tokenizer passes the \N sequence through unchanged; this code will not
-  * attempt to determine this nor expand those, instead raising a syntax error.
    */
  
      char * endbrace;    /* points to '}' following the name */
      char *endchar;     /* Points to '.' or '}' ending cur char in the input
                             stream */
-    char* p;            /* Temporary */
+    char* p = RExC_parse; /* Temporary */
  
      GET_RE_DEBUG_FLAGS_DECL;
  
@@ -11128,14 +11212,12 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
      /* The [^\n] meaning of \N ignores spaces and comments under the /x
       * modifier.  The other meanings do not, so use a temporary until we find
       * out which we are being called with */
-    p = (RExC_flags & RXf_PMf_EXTENDED)
-       ? regpatws(pRExC_state, RExC_parse,
-                                TRUE) /* means recognize comments */
-       : RExC_parse;
+    skip_to_be_ignored_text(pRExC_state, &p,
+                            FALSE /* Don't force to /x */ );
  
      /* Disambiguate between \N meaning a named character versus \N meaning
       * [^\n].  The latter is assumed when the {...} following the \N is a legal
-     * quantifier, or there is no a '{' at all */
+     * quantifier, or there is no '{' at all */
      if (*p != '{' || regcurly(p)) {
         RExC_parse = p;
          if (cp_count) {
@@ -11145,9 +11227,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
         if (! node_p) {
              return FALSE;
          }
-        RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
-                           current char */
-       nextchar(pRExC_state);
+
         *node_p = reg_node(pRExC_state, REG_ANY);
         *flagp |= HASWIDTH|SIMPLE;
         MARK_NAUGHTY(1);
@@ -11177,7 +11257,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
         vFAIL("\\N{NAME} must be resolved by the lexer");
      }
  
-    RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
+    REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
+                                        semantics */
  
      if (endbrace == RExC_parse) {   /* empty: \N{} */
          if (cp_count) {
@@ -11317,8 +11398,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
  
          if (node_p) {
              if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
-                if (flags & RESTART_UTF8) {
-                    *flagp = RESTART_UTF8;
+                if (flags & (RESTART_PASS1|NEED_UTF8)) {
+                    *flagp = flags & (RESTART_PASS1|NEED_UTF8);
                      return FALSE;
                  }
                  FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
@@ -11354,10 +11435,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
   * it returns U+FFFD (Replacement character) and sets *encp to NULL.
   */
  STATIC UV
-S_reg_recode(pTHX_ const char value, SV **encp)
+S_reg_recode(pTHX_ const U8 value, SV **encp)
  {
      STRLEN numlen = 1;
-    SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
+    SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP);
      const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
      const STRLEN newlen = SvCUR(sv);
      UV uv = UNICODE_REPLACEMENT;
@@ -11505,8 +11586,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
              *character = (U8) code_point;
              len = 1;
          } /* Else is folded non-UTF8 */
+#if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
+   || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
+                                      || UNICODE_DOT_DOT_VERSION > 0)
          else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
-
+#else
+        else if (1) {
+#endif
              /* We don't fold any non-UTF8 except possibly the Sharp s  (see
               * comments at join_exact()); */
              *character = (U8) code_point;
@@ -11549,10 +11635,14 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
  
      /* A single character node is SIMPLE, except for the special-cased SHARP S
       * under /di. */
-    if ((len == 1 || (UTF && len == UNISKIP(code_point)))
-        && (code_point != LATIN_SMALL_LETTER_SHARP_S
-            || ! FOLD || ! DEPENDS_SEMANTICS))
-    {
+    if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
+#if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
+   || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
+                                      || UNICODE_DOT_DOT_VERSION > 0)
+        && ( code_point != LATIN_SMALL_LETTER_SHARP_S
+            || ! FOLD || ! DEPENDS_SEMANTICS)
+#endif
+    ) {
          *flagp |= SIMPLE;
      }
  
@@ -11638,8 +11728,8 @@ S_backref_value(char *p)
  
     Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
     TRYAGAIN.
-   Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
-   restarted.
+   Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
+   restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
     Otherwise does not return NULL.
  */
  
@@ -11648,7 +11738,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  {
      regnode *ret = NULL;
      I32 flags = 0;
-    char *parse_start = RExC_parse;
+    char *parse_start;
      U8 op;
      int invert = 0;
      U8 arg;
@@ -11662,6 +11752,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
      PERL_ARGS_ASSERT_REGATOM;
  
    tryagain:
+    parse_start = RExC_parse;
      switch ((U8)*RExC_parse) {
      case '^':
         RExC_seen_zerolen++;
@@ -11700,17 +11791,18 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         TRUE, /* allow multi-char folds */
                         FALSE, /* don't silence non-portable warnings. */
                         (bool) RExC_strict,
+                       TRUE, /* Allow an optimized regnode result */
                         NULL);
-       if (*RExC_parse != ']') {
-           RExC_parse = oregcomp_parse;
-           vFAIL("Unmatched [");
-       }
          if (ret == NULL) {
-            if (*flagp & RESTART_UTF8)
+            if (*flagp & (RESTART_PASS1|NEED_UTF8))
                  return NULL;
              FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
                    (UV) *flagp);
          }
+       if (*RExC_parse != ']') {
+           RExC_parse = oregcomp_parse;
+           vFAIL("Unmatched [");
+       }
         nextchar(pRExC_state);
          Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
         break;
@@ -11727,8 +11819,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     }
                     goto tryagain;
                 }
-                if (flags & RESTART_UTF8) {
-                    *flagp = RESTART_UTF8;
+                if (flags & (RESTART_PASS1|NEED_UTF8)) {
+                    *flagp = flags & (RESTART_PASS1|NEED_UTF8);
                      return NULL;
                  }
                  FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
@@ -11892,7 +11984,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          NOT_REACHED; /*NOTREACHED*/
                  }
                  RExC_parse = endbrace;
-                RExC_uni_semantics = 1;
+                REQUIRE_UNI_RULES(flagp, NULL);
  
                  if (PASS2 && op >= BOUNDA) {  /* /aa is same as /a */
                      OP(ret) = BOUNDU;
@@ -11988,33 +12080,30 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             break;
         case 'p':
         case 'P':
-           {
-#ifdef DEBUGGING
-               char* parse_start = RExC_parse - 2;
-#endif
+            RExC_parse--;
  
-               RExC_parse--;
-
-                ret = regclass(pRExC_state, flagp,depth+1,
-                               TRUE, /* means just parse this element */
-                               FALSE, /* don't allow multi-char folds */
-                               FALSE, /* don't silence non-portable warnings.
-                                         It would be a bug if these returned
-                                         non-portables */
-                               (bool) RExC_strict,
-                               NULL);
-                /* regclass() can only return RESTART_UTF8 if multi-char folds
-                   are allowed.  */
-                if (!ret)
-                    FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
-                          (UV) *flagp);
-
-               RExC_parse--;
-
-               Set_Node_Offset(ret, parse_start + 2);
-                Set_Node_Cur_Length(ret, parse_start);
-               nextchar(pRExC_state);
-           }
+            ret = regclass(pRExC_state, flagp,depth+1,
+                           TRUE, /* means just parse this element */
+                           FALSE, /* don't allow multi-char folds */
+                           FALSE, /* don't silence non-portable warnings.  It
+                                     would be a bug if these returned
+                                     non-portables */
+                           (bool) RExC_strict,
+                           TRUE, /* Allow an optimized regnode result */
+                           NULL);
+            if (*flagp & RESTART_PASS1)
+                return NULL;
+            /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
+             * multi-char folds are allowed.  */
+            if (!ret)
+                FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
+                      (UV) *flagp);
+
+            RExC_parse--;
+
+            Set_Node_Offset(ret, parse_start);
+            Set_Node_Cur_Length(ret, parse_start - 2);
+            nextchar(pRExC_state);
             break;
          case 'N':
              /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
@@ -12041,9 +12130,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  break;
              }
  
-            if (*flagp & RESTART_UTF8)
+            if (*flagp & RESTART_PASS1)
                  return NULL;
-            RExC_parse--;
+
+            /* Here, evaluates to a single code point.  Go get that */
+            RExC_parse = parse_start;
              goto defchar;
  
         case 'k':    /* Handle \k<NAME> and \k'NAME' */
@@ -12159,6 +12250,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                           * octal character escape, e.g. \35 or \777.
                           * The above logic should make it obvious why using
                           * octal escapes in patterns is problematic. - Yves */
+                        RExC_parse = parse_start;
                          goto defchar;
                      }
                  }
@@ -12168,41 +12260,36 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                   * as an octal escape. It may or may not be a valid backref
                   * escape. For instance \88888888 is unlikely to be a valid
                   * backref. */
-               {
-#ifdef RE_TRACK_PATTERN_OFFSETS
-                   char * const parse_start = RExC_parse - 1; /* MJD */
-#endif
-                   while (isDIGIT(*RExC_parse))
-                       RExC_parse++;
-                    if (hasbrace) {
-                        if (*RExC_parse != '}')
-                            vFAIL("Unterminated \\g{...} pattern");
-                        RExC_parse++;
-                    }
-                   if (!SIZE_ONLY) {
-                       if (num > (I32)RExC_rx->nparens)
-                           vFAIL("Reference to nonexistent group");
-                   }
-                   RExC_sawback = 1;
-                   ret = reganode(pRExC_state,
-                                  ((! FOLD)
-                                    ? REF
-                                    : (ASCII_FOLD_RESTRICTED)
-                                      ? REFFA
-                                       : (AT_LEAST_UNI_SEMANTICS)
-                                         ? REFFU
-                                         : (LOC)
-                                           ? REFFL
-                                           : REFF),
-                                   num);
-                   *flagp |= HASWIDTH;
+                while (isDIGIT(*RExC_parse))
+                    RExC_parse++;
+                if (hasbrace) {
+                    if (*RExC_parse != '}')
+                        vFAIL("Unterminated \\g{...} pattern");
+                    RExC_parse++;
+                }
+                if (!SIZE_ONLY) {
+                    if (num > (I32)RExC_rx->nparens)
+                        vFAIL("Reference to nonexistent group");
+                }
+                RExC_sawback = 1;
+                ret = reganode(pRExC_state,
+                               ((! FOLD)
+                                 ? REF
+                                 : (ASCII_FOLD_RESTRICTED)
+                                   ? REFFA
+                                   : (AT_LEAST_UNI_SEMANTICS)
+                                     ? REFFU
+                                     : (LOC)
+                                       ? REFFL
+                                       : REFF),
+                                num);
+                *flagp |= HASWIDTH;
  
-                    /* override incorrect value set in reganode MJD */
-                    Set_Node_Offset(ret, parse_start+1);
-                    Set_Node_Cur_Length(ret, parse_start);
-                   RExC_parse--;
-                   nextchar(pRExC_state);
-               }
+                /* override incorrect value set in reganode MJD */
+                Set_Node_Offset(ret, parse_start);
+                Set_Node_Cur_Length(ret, parse_start-1);
+                skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                        FALSE /* Don't force to /x */ );
             }
             break;
         case '\0':
@@ -12212,26 +12299,34 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
         default:
             /* Do not generate "unrecognized" warnings here, we fall
                back into the quick-grab loop below */
-           parse_start--;
+            RExC_parse = parse_start;
             goto defchar;
-       }
+       } /* end of switch on a \foo sequence */
         break;
  
      case '#':
-       if (RExC_flags & RXf_PMf_EXTENDED) {
+
+        /* '#' comments should have been spaced over before this function was
+         * called */
+        assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
+       /*
+        if (RExC_flags & RXf_PMf_EXTENDED) {
             RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
             if (RExC_parse < RExC_end)
                 goto tryagain;
         }
+        */
+
         /* FALLTHROUGH */
  
      default:
+         defchar: {
  
-            parse_start = RExC_parse - 1;
-
-           RExC_parse++;
+            /* Here, we have determined that the next thing is probably a
+             * literal character.  RExC_parse points to the first byte of its
+             * definition.  (It still may be an escape sequence that evaluates
+             * to a single character) */
  
-         defchar: {
             STRLEN len = 0;
             UV ender = 0;
             char *p;
@@ -12249,7 +12344,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
               * string's UTF8ness.  The reason to do this is that EXACTF is not
               * trie-able, EXACTFU is.
               *
-             * Similarly, we can convert EXACTFL nodes to EXACTFU if they
+             * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
               * contain only above-Latin1 characters (hence must be in UTF8),
               * which don't participate in folds with Latin1-range characters,
               * as the latter's folds aren't known until runtime.  (We don't
@@ -12272,8 +12367,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
           reparse:
  
-            /* We do the EXACTFish to EXACT node only if folding.  (And we
-             * don't need to figure this out until pass 2) */
+            /* We look for the EXACTFish to EXACT node optimizaton only if
+             * folding.  (And we don't need to figure this out until pass 2).
+             * XXX It might actually make sense to split the node into portions
+             * that are exact and ones that aren't, so that we could later use
+             * the exact ones to find the longest fixed and floating strings.
+             * One would want to join them back into a larger node.  One could
+             * use a pseudo regnode like 'EXACT_ORIG_FOLD' */
              maybe_exact = FOLD && PASS2;
  
             /* XXX The node can hold up to 255 bytes, yet this only goes to
@@ -12294,15 +12394,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
               * could back off to end with only a code point that isn't such a
               * non-final, but it is possible for there not to be any in the
               * entire node. */
-           for (p = RExC_parse - 1;
+
+            assert(   ! UTF     /* Is at the beginning of a character */
+                   || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
+                   || UTF8_IS_START(UCHARAT(RExC_parse)));
+
+           for (p = RExC_parse;
                  len < upper_parse && p < RExC_end;
                  len++)
             {
                 oldp = p;
  
-               if (RExC_flags & RXf_PMf_EXTENDED)
-                    p = regpatws(pRExC_state, p,
-                                          TRUE); /* means recognize comments */
+                /* White space has already been ignored */
+                assert(   (RExC_flags & RXf_PMf_EXTENDED) == 0
+                       || ! is_PATWS_safe((p), RExC_end, UTF));
+
                 switch ((U8)*p) {
                 case '^':
                 case '$':
@@ -12366,8 +12472,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                              flagp,
                                              depth)
                          ) {
-                            if (*flagp & RESTART_UTF8)
-                                FAIL("panic: grok_bslash_N set RESTART_UTF8");
+                            if (*flagp & NEED_UTF8)
+                                FAIL("panic: grok_bslash_N set NEED_UTF8");
+                            if (*flagp & RESTART_PASS1)
+                                return NULL;
  
                              /* Here, it wasn't a single code point.  Go close
                               * up this EXACTish node.  The switch() prior to
@@ -12377,7 +12485,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          }
                          p = RExC_parse;
                          if (ender > 0xff) {
-                            REQUIRE_UTF8;
+                            REQUIRE_UTF8(flagp);
                          }
                          break;
                     case 'r':
@@ -12424,7 +12532,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                 goto recode_encoding;
                             }
                             if (ender > 0xff) {
-                               REQUIRE_UTF8;
+                               REQUIRE_UTF8(flagp);
                             }
                             break;
                         }
@@ -12462,7 +12570,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                                  }
                             }
                              else {
-                               REQUIRE_UTF8;
+                               REQUIRE_UTF8(flagp);
                             }
                             break;
                         }
@@ -12507,7 +12615,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             STRLEN numlen = 3;
                             ender = grok_oct(p, &numlen, &flags, NULL);
                             if (ender > 0xff) {
-                               REQUIRE_UTF8;
+                               REQUIRE_UTF8(flagp);
                             }
                             p += numlen;
                              if (PASS2   /* like \08, \178 */
@@ -12526,10 +12634,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                       recode_encoding:
                         if (! RExC_override_recoding) {
                             SV* enc = _get_encoding();
-                           ender = reg_recode((const char)(U8)ender, &enc);
+                           ender = reg_recode((U8)ender, &enc);
                             if (!enc && PASS2)
                                 ckWARNreg(p, "Invalid escape in the specified encoding");
-                           REQUIRE_UTF8;
+                           REQUIRE_UTF8(flagp);
                         }
                         break;
                     case '\0':
@@ -12538,7 +12646,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         /* FALLTHROUGH */
                     default:
                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
-                           /* Include any { following the alpha to emphasize
+                           /* Include any left brace following the alpha to emphasize
                              * that it could be part of an escape at some point
                              * in the future */
                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
@@ -12550,7 +12658,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 case '{':
                     /* Currently we don't warn when the lbrace is at the start
                      * of a construct.  This catches it in the middle of a
-                    * literal string, or when its the first thing after
+                    * literal string, or when it's the first thing after
                      * something like "\b" */
                     if (! SIZE_ONLY
                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
@@ -12560,7 +12668,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     /*FALLTHROUGH*/
                 default:    /* A literal character */
                   normal_default:
-                   if (UTF8_IS_START(*p) && UTF) {
+                   if (! UTF8_IS_INVARIANT(*p) && UTF) {
                         STRLEN numlen;
                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
                                                &numlen, UTF8_ALLOW_DEFAULT);
@@ -12572,12 +12680,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 } /* End of switch on the literal */
  
                 /* Here, have looked at the literal character and <ender>
-                * contains its ordinal, <p> points to the character after it
-                */
-
-               if ( RExC_flags & RXf_PMf_EXTENDED)
-                    p = regpatws(pRExC_state, p,
-                                          TRUE); /* means recognize comments */
+                 * contains its ordinal, <p> points to the character after it.
+                 * We need to check if the next non-ignored thing is a
+                 * quantifier.  Move <p> to after anything that should be
+                 * ignored, which, as a side effect, positions <p> for the next
+                 * loop iteration */
+                skip_to_be_ignored_text(pRExC_state, &p,
+                                        FALSE /* Don't force to /x */ );
  
                  /* If the next thing is a quantifier, it applies to this
                   * character only, which means that this character has to be in
@@ -12586,12 +12695,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                   * the node, close the node with just them, and set up to do
                   * this character again next time through, when it will be the
                   * only thing in its new node */
-                if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
+                if ((next_is_quantifier = (   LIKELY(p < RExC_end)
+                                           && UNLIKELY(ISMULT2(p))))
+                    && LIKELY(len))
                 {
                      p = oldp;
                      goto loopdone;
                  }
  
+                /* Ready to add 'ender' to the node */
+
                  if (! FOLD) {  /* The simple case, just append the literal */
  
                      /* In the sizing pass, we need only the size of the
@@ -12599,7 +12712,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                       * its representation until PASS2. */
                      if (SIZE_ONLY) {
                          if (UTF) {
-                            const STRLEN unilen = UNISKIP(ender);
+                            const STRLEN unilen = UVCHR_SKIP(ender);
                              s += unilen;
  
                              /* We have to subtract 1 just below (and again in
@@ -12649,12 +12762,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      goto not_fold_common;
                  }
                  else /* A regular FOLD code point */
-                    if (! ( UTF
-                        /* See comments for join_exact() as to why we fold this
-                         * non-UTF at compile time */
-                        || (node_type == EXACTFU
-                            && ender == LATIN_SMALL_LETTER_SHARP_S)))
-                {
+                    if (! (   UTF
+#if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
+   || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
+                                      || UNICODE_DOT_DOT_VERSION > 0)
+                            /* See comments for join_exact() as to why we fold
+                             * this non-UTF at compile time */
+                            || (   node_type == EXACTFU
+                                && ender == LATIN_SMALL_LETTER_SHARP_S)
+#endif
+                )) {
                      /* Here, are folding and are not UTF-8 encoded; therefore
                       * the character must be in the range 0-255, and is not /l
                       * (Not /l because we already handled these under /l in
@@ -12665,13 +12782,20 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          /* See if the character's fold differs between /d and
                           * /u.  This includes the multi-char fold SHARP S to
                           * 'ss' */
-                        if (maybe_exactfu
+                        if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+                            RExC_seen_unfolded_sharp_s = 1;
+                            maybe_exactfu = FALSE;
+                        }
+                        else if (maybe_exactfu
                              && (PL_fold[ender] != PL_fold_latin1[ender]
-                                || ender == LATIN_SMALL_LETTER_SHARP_S
-                                || (len > 0
-                                   && isALPHA_FOLD_EQ(ender, 's')
-                                   && isALPHA_FOLD_EQ(*(s-1), 's'))))
-                        {
+#if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
+   || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
+                                      || UNICODE_DOT_DOT_VERSION > 0)
+                                || (   len > 0
+                                    && isALPHA_FOLD_EQ(ender, 's')
+                                    && isALPHA_FOLD_EQ(*(s-1), 's'))
+#endif
+                        )) {
                              maybe_exactfu = FALSE;
                          }
                      }
@@ -12680,7 +12804,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                       * we have an array that finds its fold quickly */
                      *(s++) = (char) ender;
                  }
-                else {  /* FOLD and UTF */
+                else {  /* FOLD, and UTF (or sharp s) */
                      /* Unlike the non-fold case, we do actually have to
                       * calculate the results here in pass 1.  This is for two
                       * reasons, the folded length may be longer than the
@@ -12810,7 +12934,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              }
                          }
                          else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-                            if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
+                            if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
                                                                    *s, *(s+1))))
                              {
                                  break;
@@ -12943,7 +13067,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
             RExC_parse = p - 1;
              Set_Node_Cur_Length(ret, parse_start);
-           nextchar(pRExC_state);
+           RExC_parse = p;
+            skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                    FALSE /* Don't force to /x */ );
             {
                 /* len is STRLEN which is unsigned, need to copy to signed */
                 IV iv = len;
@@ -12958,29 +13084,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
      return(ret);
  }
  
-STATIC char *
-S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
-{
-    /* Returns the next non-pattern-white space, non-comment character (the
-     * latter only if 'recognize_comment is true) in the string p, which is
-     * ended by RExC_end.  See also reg_skipcomment */
-    const char *e = RExC_end;
-
-    PERL_ARGS_ASSERT_REGPATWS;
-
-    while (p < e) {
-        STRLEN len;
-       if ((len = is_PATWS_safe(p, e, UTF))) {
-           p += len;
-        }
-       else if (recognize_comment && *p == '#') {
-            p = reg_skipcomment(pRExC_state, p);
-       }
-       else
-           break;
-    }
-    return p;
-}
  
  STATIC void
  S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
@@ -13009,9 +13112,6 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
              if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
                  ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
              }
-            else if (end >= NUM_ANYOF_CODE_POINTS) {
-                ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
-            }
  
             /* Quit if are above what we should change */
             if (start >= NUM_ANYOF_CODE_POINTS) {
@@ -13309,17 +13409,20 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                                         this function */
      const bool save_fold = FOLD;    /* Temporary */
      char *save_end, *save_parse;    /* Temporaries */
+    const bool in_locale = LOC;     /* we turn off /l during processing */
  
      GET_RE_DEBUG_FLAGS_DECL;
  
      PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
  
-    if (LOC) {  /* XXX could make valid in UTF-8 locales */
-        vFAIL("(?[...]) not valid in locale");
+    if (in_locale) {
+        set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
      }
-    RExC_uni_semantics = 1;     /* The use of this operator implies /u.  This
-                                   is required so that the compile time values
-                                   are valid in all runtime cases */
+
+    REQUIRE_UNI_RULES(flagp, NULL);   /* The use of this operator implies /u.
+                                         This is required so that the compile
+                                         time values are valid in all runtime
+                                         cases */
  
      /* This will return only an ANYOF regnode, or (unlikely) something smaller
       * (such as EXACT).  Thus we can skip most everything if just sizing.  We
@@ -13333,8 +13436,10 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
  
          while (RExC_parse < RExC_end) {
              SV* current = NULL;
-            RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                          TRUE); /* means recognize comments */
+
+            skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                    TRUE /* Force /x */ );
+
              switch (*RExC_parse) {
                  case '?':
                      if (RExC_parse[1] == '[') depth++, RExC_parse++;
@@ -13349,6 +13454,10 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                       * default: case next time and keep on incrementing until
                       * we find one of the invariants we do handle. */
                      RExC_parse++;
+                    if (*RExC_parse == 'c') {
+                            /* Skip the \cX notation for control characters */
+                            RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+                    }
                      break;
                  case '[':
                  {
@@ -13363,8 +13472,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                          RExC_parse++;
                      }
  
-                    /* regclass() can only return RESTART_UTF8 if multi-char
-                       folds are allowed.  */
+                    /* regclass() can only return RESTART_PASS1 and NEED_UTF8
+                     * if multi-char folds are allowed.  */
                      if (!regclass(pRExC_state, flagp,depth+1,
                                    is_posix_class, /* parse the whole char
                                                       class only if not a
@@ -13372,6 +13481,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                                    FALSE, /* don't allow multi-char folds */
                                    TRUE, /* silence non-portable warnings. */
                                    TRUE, /* strict */
+                                  FALSE, /* Require return to be an ANYOF */
                                    &current
                                   ))
                          FAIL2("panic: regclass returned NULL to handle_sets, "
@@ -13398,11 +13508,16 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                          nextchar(pRExC_state);
                          Set_Node_Length(node,
                                  RExC_parse - oregcomp_parse + 1); /* MJD */
+                        if (in_locale) {
+                            set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
+                        }
+
                          return node;
                      }
                      goto no_close;
              }
-            RExC_parse++;
+
+            RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
          }
  
        no_close:
@@ -13427,12 +13542,13 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
       * a stack.  Each entry on the stack is a single character representing one
       * of the operators; or else a pointer to an operand inversion list. */
  
-#define IS_OPERAND(a)  (! SvIOK(a))
+#define IS_OPERATOR(a) SvIOK(a)
+#define IS_OPERAND(a)  (! IS_OPERATOR(a))
  
      /* The stack is kept in Łukasiewicz order.  (That's pronounced similar
       * to luke-a-shave-itch (or -itz), but people who didn't want to bother
-     * with prounouncing it called it Reverse Polish instead, but now that YOU
-     * know how to prounounce it you can use the correct term, thus giving due
+     * with pronouncing it called it Reverse Polish instead, but now that YOU
+     * know how to pronounce it you can use the correct term, thus giving due
       * credit to the person who invented it, and impressing your geek friends.
       * Wikipedia says that the pronounciation of "Ł" has been changing so that
       * it is now more like an English initial W (as in wonk) than an L.)
@@ -13503,9 +13619,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                                         operand */
          SV* only_to_avoid_leaks;
  
-        /* Skip white space */
-        RExC_parse = regpatws(pRExC_state, RExC_parse,
-                TRUE /* means recognize comments */ );
+        skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                                TRUE /* Force /x */ );
          if (RExC_parse >= RExC_end) {
              Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
          }
@@ -13599,13 +13714,14 @@ redo_curchar:
                      /* If the top entry on the stack is an operator, it had
                       * better be a '!', otherwise the entry below the top
                       * operand should be an operator */
-                    if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
-                        || (! IS_OPERAND(*top_ptr) && SvUV(*top_ptr) != '!')
-                        || top_index - fence < 1
-                        || ! (stacked_ptr = av_fetch(stack,
-                                                     top_index - 1,
-                                                     FALSE))
-                        || IS_OPERAND(*stacked_ptr))
+                    if (   ! (top_ptr = av_fetch(stack, top_index, FALSE))
+                        || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
+                        || (   IS_OPERAND(*top_ptr)
+                            && (   top_index - fence < 1
+                                || ! (stacked_ptr = av_fetch(stack,
+                                                             top_index - 1,
+                                                             FALSE))
+                                || ! IS_OPERATOR(*stacked_ptr))))
                      {
                          RExC_parse++;
                          vFAIL("Unexpected '(' with no preceding operator");
@@ -13618,13 +13734,14 @@ redo_curchar:
                  break;
  
              case '\\':
-                /* regclass() can only return RESTART_UTF8 if multi-char
-                   folds are allowed.  */
+                /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
+                 * multi-char folds are allowed.  */
                  if (!regclass(pRExC_state, flagp,depth+1,
                                TRUE, /* means parse just the next thing */
                                FALSE, /* don't allow multi-char folds */
                                FALSE, /* don't silence non-portable warnings.  */
                                TRUE,  /* strict */
+                              FALSE, /* Require return to be an ANYOF */
                                &current))
                  {
                      FAIL2("panic: regclass returned NULL to handle_sets, "
@@ -13644,14 +13761,15 @@ redo_curchar:
                      RExC_parse++;
                  }
  
-                /* regclass() can only return RESTART_UTF8 if multi-char
-                   folds are allowed.  */
+                /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
+                 * multi-char folds are allowed.  */
                  if(!regclass(pRExC_state, flagp,depth+1,
                               is_posix_class, /* parse the whole char class
                                                  only if not a posix class */
                               FALSE, /* don't allow multi-char folds */
                               FALSE, /* don't silence non-portable warnings.  */
                               TRUE,   /* strict */
+                             FALSE, /* Require return to be an ANYOF */
                               &current
                              ))
                  {
@@ -13703,8 +13821,12 @@ redo_curchar:
                  /* Having gotten rid of the fence, we pop the operand at the
                   * stack top and process it as a newly encountered operand */
                  current = av_pop(stack);
-                assert(IS_OPERAND(current));
-                goto handle_operand;
+                if (IS_OPERAND(current)) {
+                    goto handle_operand;
+                }
+
+                RExC_parse++;
+                goto bad_syntax;
  
              case '&':
              case '|':
@@ -13780,11 +13902,23 @@ redo_curchar:
                  /* Here, the new operator has equal or lower precedence than
                   * what's already there.  This means the operation already
                   * there should be performed now, before the new one. */
+
                  rhs = av_pop(stack);
+                if (! IS_OPERAND(rhs)) {
+
+                    /* This can happen when a ! is not followed by an operand,
+                     * like in /(?[\t &!])/ */
+                    goto bad_syntax;
+                }
+
                  lhs = av_pop(stack);
  
-                assert(IS_OPERAND(rhs));
-                assert(IS_OPERAND(lhs));
+                if (! IS_OPERAND(lhs)) {
+
+                    /* This can happen when there is an empty (), like in
+                     * /(?[[0]+()+])/ */
+                    goto bad_syntax;
+                }
  
                  switch (stacked_operator) {
                      case '&':
@@ -13830,9 +13964,20 @@ redo_curchar:
                  av_push(stack, rhs);
                  goto redo_curchar;
  
-            case '!':   /* Highest priority, right associative, so just push
-                           onto stack */
-                av_push(stack, newSVuv(curchar));
+            case '!':   /* Highest priority, right associative */
+
+                /* If what's already at the top of the stack is another '!",
+                 * they just cancel each other out */
+                if (   (top_ptr = av_fetch(stack, top_index, FALSE))
+                    && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
+                {
+                    only_to_avoid_leaks = av_pop(stack);
+                    SvREFCNT_dec(only_to_avoid_leaks);
+                }
+                else { /* Otherwise, since it's right associative, just push
+                          onto the stack */
+                    av_push(stack, newSVuv(curchar));
+                }
                  break;
  
              default:
@@ -13852,7 +13997,7 @@ redo_curchar:
                   * be an operator */
                  top_ptr = av_fetch(stack, top_index, FALSE);
                  assert(top_ptr);
-                if (! IS_OPERAND(*top_ptr)) {
+                if (IS_OPERATOR(*top_ptr)) {
  
                      /* The only permissible operator at the top of the stack is
                       * '!', which is applied immediately to this operand. */
@@ -13904,8 +14049,10 @@ redo_curchar:
      if (av_tindex(stack) < 0   /* Was empty */
          || ((final = av_pop(stack)) == NULL)
          || ! IS_OPERAND(final)
+        || SvTYPE(final) != SVt_INVLIST
          || av_tindex(stack) >= 0)  /* More left on stack */
      {
+      bad_syntax:
          SvREFCNT_dec(final);
          vFAIL("Incomplete expression within '(?[ ])'");
      }
@@ -13942,8 +14089,8 @@ redo_curchar:
       * already has all folding taken into consideration, and we don't want
       * regclass() to add to that */
      RExC_flags &= ~RXf_PMf_FOLD;
-    /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
-     */
+    /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char
+     * folds are allowed.  */
      node = regclass(pRExC_state, flagp,depth+1,
                      FALSE, /* means parse the whole char class */
                      FALSE, /* don't allow multi-char folds */
@@ -13951,14 +14098,42 @@ redo_curchar:
                               well have generated non-portable code points, but
                               they're valid on this machine */
                      FALSE, /* similarly, no need for strict */
+                    FALSE, /* Require return to be an ANYOF */
                      NULL
                  );
      if (!node)
          FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
                      PTR2UV(flagp));
+
+    /* Fix up the node type if we are in locale.  (We have pretended we are
+     * under /u for the purposes of regclass(), as this construct will only
+     * work under UTF-8 locales.  But now we change the opcode to be ANYOFL (so
+     * as to cause any warnings about bad locales to be output in regexec.c),
+     * and add the flag that indicates to check if not in a UTF-8 locale.  The
+     * reason we above forbid optimization into something other than an ANYOF
+     * node is simply to minimize the number of code changes in regexec.c.
+     * Otherwise we would have to create new EXACTish node types and deal with
+     * them.  This decision could be revisited should this construct become
+     * popular.
+     *
+     * (One might think we could look at the resulting ANYOF node and suppress
+     * the flag if everything is above 255, as those would be UTF-8 only,
+     * but this isn't true, as the components that led to that result could
+     * have been locale-affected, and just happen to cancel each other out
+     * under UTF-8 locales.) */
+    if (in_locale) {
+        set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
+
+        assert(OP(node) == ANYOF);
+
+        OP(node) = ANYOFL;
+        ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8;
+    }
+
      if (save_fold) {
          RExC_flags |= RXf_PMf_FOLD;
      }
+
      RExC_parse = save_parse + 1;
      RExC_end = save_end;
      SvREFCNT_dec_NN(final);
@@ -13968,6 +14143,7 @@ redo_curchar:
      Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
      return node;
  }
+#undef IS_OPERATOR
  #undef IS_OPERAND
  
  STATIC void
@@ -14014,9 +14190,30 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl
            *invlist = add_cp_to_invlist(*invlist,
                                          LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
              break;
+
+#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
+
          case LATIN_SMALL_LETTER_SHARP_S:
            *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
              break;
+
+#endif
+
+#if    UNICODE_MAJOR_VERSION < 3                                        \
+   || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
+
+        /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
+         * U+0131.  */
+        case 'i':
+        case 'I':
+          *invlist =
+             add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+#   if UNICODE_DOT_DOT_VERSION == 1
+          *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
+#   endif
+            break;
+#endif
+
          default:
              /* Use deprecated warning to increase the chances of this being
               * output */
@@ -14080,6 +14277,23 @@ S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN c
  #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
                                          (SvCUR(listsv) != initial_listsv_len)
  
+/* There is a restricted set of white space characters that are legal when
+ * ignoring white space in a bracketed character class.  This generates the
+ * code to skip them.
+ *
+ * There is a line below that uses the same white space criteria but is outside
+ * this macro.  Both here and there must use the same definition */
+#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p)                          \
+    STMT_START {                                                        \
+        if (do_skip) {                                                  \
+            while (   p < RExC_end                                      \
+                   && isBLANK_A(UCHARAT(p)))                            \
+            {                                                           \
+                p++;                                                    \
+            }                                                           \
+        }                                                               \
+    } STMT_END
+
  STATIC regnode *
  S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                   const bool stop_at_1,  /* Just parse the next thing, don't
@@ -14089,6 +14303,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                                         about too large
                                                         characters */
                   const bool strict,
+                 bool optimizable,                  /* ? Allow a non-ANYOF return
+                                                       node */
                   SV** ret_invlist  /* Return an inversion list, not a node */
            )
  {
@@ -14113,8 +14329,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
       * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
       * determinable at compile time
       *
-     * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
-     * to be restarted.  This can only happen if ret_invlist is non-NULL.
+     * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs
+     * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded
+     * to UTF-8.  This can only happen if ret_invlist is non-NULL.
       */
  
      UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
@@ -14165,8 +14382,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      bool has_user_defined_property = FALSE;
  
      /* inversion list of code points this node matches only when the target
-     * string is in UTF-8.  (Because is under /d) */
-    SV* depends_list = NULL;
+     * string is in UTF-8.  These are all non-ASCII, < 256.  (Because is under
+     * /d) */
+    SV* has_upper_latin1_only_utf8_matches = NULL;
  
      /* Inversion list of code points this node matches regardless of things
       * like locale, folding, utf8ness of the target string */
@@ -14209,6 +14427,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
      DEBUG_PARSE("clas");
  
+#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
+    || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
+                                   && UNICODE_DOT_DOT_VERSION == 0)
+    allow_multi_folds = FALSE;
+#endif
+
      /* Assume we are going to generate an ANYOF node. */
      ret = reganode(pRExC_state,
                     (LOC)
@@ -14229,20 +14453,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
          SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
      }
  
-    if (skip_white) {
-        RExC_parse = regpatws(pRExC_state, RExC_parse,
-                              FALSE /* means don't recognize comments */ );
-    }
+    SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
  
      if (UCHARAT(RExC_parse) == '^') {  /* Complement of range. */
         RExC_parse++;
          invert = TRUE;
          allow_multi_folds = FALSE;
          MARK_NAUGHTY(1);
-        if (skip_white) {
-            RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                  FALSE /* means don't recognize comments */ );
-        }
+        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
      }
  
      /* Check that they didn't say [:posix:] instead of [[:posix:]] */
@@ -14279,10 +14497,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              break;
          }
  
-        if (skip_white) {
-            RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                  FALSE /* means don't recognize comments */ );
-        }
+        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
  
          if  (UCHARAT(RExC_parse) == ']') {
              break;
@@ -14335,7 +14550,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
               * skipped, it means that that white space is wanted literally, and
               * is already in 'value'.  Otherwise, need to translate the escape
               * into what it signifies. */
-            if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
+            if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
  
             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
@@ -14360,8 +14575,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                          depth)
                      ) {
  
-                        if (*flagp & RESTART_UTF8)
-                            FAIL("panic: grok_bslash_N set RESTART_UTF8");
+                        if (*flagp & NEED_UTF8)
+                            FAIL("panic: grok_bslash_N set NEED_UTF8");
+                        if (*flagp & RESTART_PASS1)
+                            return NULL;
  
                          if (cp_count < 0) {
                              vFAIL("\\N in a character class must be a named character: \\N{...}");
@@ -14430,38 +14647,53 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     vFAIL2("Empty \\%c{}", (U8)value);
                 if (*RExC_parse == '{') {
                     const U8 c = (U8)value;
-                   e = strchr(RExC_parse++, '}');
-                    if (!e)
+                   e = strchr(RExC_parse, '}');
+                    if (!e) {
+                        RExC_parse++;
                          vFAIL2("Missing right brace on \\%c{}", c);
-                   while (isSPACE(*RExC_parse))
-                       RExC_parse++;
+                    }
+
+                    RExC_parse++;
+                    while (isSPACE(*RExC_parse)) {
+                         RExC_parse++;
+                   }
+
+                   if (UCHARAT(RExC_parse) == '^') {
+
+                        /* toggle.  (The rhs xor gets the single bit that
+                         * differs between P and p; the other xor inverts just
+                         * that bit) */
+                        value ^= 'P' ^ 'p';
+
+                        RExC_parse++;
+                        while (isSPACE(*RExC_parse)) {
+                            RExC_parse++;
+                        }
+                    }
+
                      if (e == RExC_parse)
                          vFAIL2("Empty \\%c{}", c);
+
                     n = e - RExC_parse;
                     while (isSPACE(*(RExC_parse + n - 1)))
                         n--;
-               }
-               else {
+               }   /* The \p isn't immediately followed by a '{' */
+               else if (! isALPHA(*RExC_parse)) {
+                    RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+                    vFAIL2("Character following \\%c must be '{' or a "
+                           "single-character Unicode property name",
+                           (U8) value);
+                }
+                else {
                     e = RExC_parse;
                     n = 1;
                 }
                 if (!SIZE_ONLY) {
                      SV* invlist;
                      char* name;
+                    char* base_name;    /* name after any packages are stripped */
+                    const char * const colon_colon = "::";
  
-                   if (UCHARAT(RExC_parse) == '^') {
-                        RExC_parse++;
-                        n--;
-                         /* toggle.  (The rhs xor gets the single bit that
-                          * differs between P and p; the other xor inverts just
-                          * that bit) */
-                         value ^= 'P' ^ 'p';
-
-                        while (isSPACE(*RExC_parse)) {
-                             RExC_parse++;
-                             n--;
-                        }
-                   }
                      /* Try to get the definition of the property into
                       * <invlist>.  If /i is in effect, the effective property
                       * will have its name be <__NAME_i>.  The design is
@@ -14477,7 +14709,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
                      /* Look up the property name, and get its swash and
                       * inversion list, if the property is found  */
-                    if (swash) {
+                    if (swash) {    /* Return any left-overs */
                          SvREFCNT_dec_NN(swash);
                      }
                      swash = _core_swash_init("utf8", name, &PL_sv_undef,
@@ -14490,26 +14722,57 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                          HV* curpkg = (IN_PERL_COMPILETIME)
                                        ? PL_curstash
                                        : CopSTASH(PL_curcop);
-                        if (swash) {
+                        UV final_n = n;
+                        bool has_pkg;
+
+                        if (swash) {    /* Got a swash but no inversion list.
+                                           Something is likely wrong that will
+                                           be sorted-out later */
                              SvREFCNT_dec_NN(swash);
                              swash = NULL;
                          }
  
-                        /* Here didn't find it.  It could be a user-defined
-                         * property that will be available at run-time.  If we
-                         * accept only compile-time properties, is an error;
-                         * otherwise add it to the list for run-time look up */
-                        if (ret_invlist) {
+                        /* Here didn't find it.  It could be a an error (like a
+                         * typo) in specifying a Unicode property, or it could
+                         * be a user-defined property that will be available at
+                         * run-time.  The names of these must begin with 'In'
+                         * or 'Is' (after any packages are stripped off).  So
+                         * if not one of those, or if we accept only
+                         * compile-time properties, is an error; otherwise add
+                         * it to the list for run-time look up. */
+                        if ((base_name = rninstr(name, name + n,
+                                                 colon_colon, colon_colon + 2)))
+                        { /* Has ::.  We know this must be a user-defined
+                             property */
+                            base_name += 2;
+                            final_n -= base_name - name;
+                            has_pkg = TRUE;
+                        }
+                        else {
+                            base_name = name;
+                            has_pkg = FALSE;
+                        }
+
+                        if (   final_n < 3
+                            || base_name[0] != 'I'
+                            || (base_name[1] != 's' && base_name[1] != 'n')
+                            || ret_invlist)
+                        {
+                            const char * const msg
+                                = (has_pkg)
+                                  ? "Illegal user-defined property name"
+                                  : "Can't find Unicode property definition";
                              RExC_parse = e + 1;
-                            vFAIL2utf8f(
-                                "Property '%"UTF8f"' is unknown",
-                                UTF8fARG(UTF, n, name));
+
+                            /* diag_listed_as: Can't find Unicode property definition "%s" */
+                            vFAIL3utf8f("%s \"%"UTF8f"\"",
+                                msg, UTF8fARG(UTF, n, name));
                          }
  
                          /* If the property name doesn't already have a package
                           * name, add the current one to it so that it can be
                           * referred to outside it. [perl #121777] */
-                        if (curpkg && ! instr(name, "::")) {
+                        if (! has_pkg && curpkg) {
                              char* pkgname = HvNAME(curpkg);
                              if (strNE(pkgname, "main")) {
                                  char* full_name = Perl_form(aTHX_
@@ -14525,16 +14788,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                          (value == 'p' ? '+' : '!'),
                                          UTF8fARG(UTF, n, name));
                          has_user_defined_property = TRUE;
+                        optimizable = FALSE;    /* Will have to leave this an
+                                                   ANYOF node */
  
-                        /* We don't know yet, so have to assume that the
-                         * property could match something in the Latin1 range,
-                         * hence something that isn't utf8.  Note that this
-                         * would cause things in <depends_list> to match
-                         * inappropriately, except that any \p{}, including
-                         * this one forces Unicode semantics, which means there
-                         * is no <depends_list> */
-                        ANYOF_FLAGS(ret)
-                                      |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
+                        /* We don't know yet what this matches, so have to flag
+                         * it */
+                        ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
                      }
                      else {
  
@@ -14582,7 +14841,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                                  named */
  
                 /* \p means they want Unicode semantics */
-               RExC_uni_semantics = 1;
+               REQUIRE_UNI_RULES(flagp, NULL);
                 }
                 break;
             case 'n':   value = '\n';                   break;
@@ -14670,7 +14929,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
               recode_encoding:
                 if (! RExC_override_recoding) {
                     SV* enc = _get_encoding();
-                   value = reg_recode((const char)(U8)value, &enc);
+                   value = reg_recode((U8)value, &enc);
                     if (!enc) {
                          if (strict) {
                              vFAIL("Invalid escape in the specified encoding");
@@ -14759,6 +15018,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      }
                      ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
                      ANYOF_POSIXL_ZERO(ret);
+
+                    /* We can't change this into some other type of node
+                     * (unless this is the only element, in which case there
+                     * are nodes that mean exactly this) as has runtime
+                     * dependencies */
+                    optimizable = FALSE;
                  }
  
                  /* Coverity thinks it is possible for this to be negative; both
@@ -14874,10 +15139,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             }
         } /* end of namedclass \blah */
  
-        if (skip_white) {
-            RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                FALSE /* means don't recognize comments */ );
-        }
+        SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
  
          /* If 'range' is set, 'value' is the ending of a range--check its
           * validity.  (If value isn't a single code point in the case of a
@@ -14918,12 +15180,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  && *RExC_parse == '-')
              {
                  char* next_char_ptr = RExC_parse + 1;
-                if (skip_white) {   /* Get the next real char after the '-' */
-                    next_char_ptr = regpatws(pRExC_state,
-                                             RExC_parse + 1,
-                                             FALSE); /* means don't recognize
-                                                        comments */
-                }
+
+                /* Get the next real char after the '-' */
+                SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
  
                  /* If the '-' is at the end of the class (just before the ']',
                   * it is a literal minus; otherwise it is a range */
@@ -14968,7 +15227,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         /* non-Latin1 code point implies unicode semantics.  Must be set in
          * pass1 so is there for the whole of pass 2 */
         if (value > 255) {
-           RExC_uni_semantics = 1;
+            REQUIRE_UNI_RULES(flagp, NULL);
         }
  
          /* Ready to process either the single value, or the completed range.
@@ -15250,7 +15509,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
  
-       *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
+       *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
  
         RExC_parse = save_parse;
         RExC_end = save_end;
@@ -15270,8 +15529,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
       * 2) if the character class contains only a single element (including a
       *    single range), we see if there is an equivalent node for it.
       * Other checks are possible */
-    if (! ret_invlist   /* Can't optimize if returning the constructed
-                           inversion list */
+    if (   optimizable
+        && ! ret_invlist   /* Can't optimize if returning the constructed
+                              inversion list */
          && (UNLIKELY(posixl_matches_all) || element_count == 1))
      {
          U8 op = END;
@@ -15482,7 +15742,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              /* Our calculated list will be for Unicode rules.  For locale
               * matching, we have to keep a separate list that is consulted at
               * runtime only when the locale indicates Unicode rules.  For
-             * non-locale, we just use to the general list */
+             * non-locale, we just use the general list */
              if (LOC) {
                  use_list = &only_utf8_locale_list;
              }
@@ -15531,9 +15791,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                                              PL_fold_latin1[j]);
                              }
                              else {
-                                depends_list =
-                                 add_cp_to_invlist(depends_list,
-                                                   PL_fold_latin1[j]);
+                                has_upper_latin1_only_utf8_matches
+                                    = add_cp_to_invlist(
+                                            has_upper_latin1_only_utf8_matches,
+                                            PL_fold_latin1[j]);
                              }
                          }
  
@@ -15597,8 +15858,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                              else {
                                  /* Similarly folds involving non-ascii Latin1
                                  * characters under /d are added to their list */
-                                depends_list = add_cp_to_invlist(depends_list,
-                                                                 c);
+                                has_upper_latin1_only_utf8_matches
+                                        = add_cp_to_invlist(
+                                           has_upper_latin1_only_utf8_matches,
+                                           c);
                              }
                          }
                      }
@@ -15632,7 +15895,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              if (DEPENDS_SEMANTICS) {
                  /* Under /d, everything in the upper half of the Latin1 range
                   * matches these complements */
-                ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
+                ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
              }
              else if (AT_LEAST_ASCII_RESTRICTED) {
                  /* Under /a and /aa, everything above ASCII matches these
@@ -15674,13 +15937,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  cp_list = posixes;
              }
  
-            if (depends_list) {
-                _invlist_union(depends_list, nonascii_but_latin1_properties,
-                               &depends_list);
+            if (has_upper_latin1_only_utf8_matches) {
+                _invlist_union(has_upper_latin1_only_utf8_matches,
+                               nonascii_but_latin1_properties,
+                               &has_upper_latin1_only_utf8_matches);
                  SvREFCNT_dec_NN(nonascii_but_latin1_properties);
              }
              else {
-                depends_list = nonascii_but_latin1_properties;
+                has_upper_latin1_only_utf8_matches
+                                            = nonascii_but_latin1_properties;
              }
          }
      }
@@ -15694,8 +15959,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
       * class that isn't a Unicode property, and which matches above Unicode, \W
       * or [\x{110000}] for example.
       * (Note that in this case, unlike the Posix one above, there is no
-     * <depends_list>, because having a Unicode property forces Unicode
-     * semantics */
+     * <has_upper_latin1_only_utf8_matches>, because having a Unicode property
+     * forces Unicode semantics */
      if (properties) {
          if (cp_list) {
  
@@ -15719,7 +15984,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
          }
  
          if (warn_super) {
-            ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
+            ANYOF_FLAGS(ret)
+             |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+
+            /* Because an ANYOF node is the only one that warns, this node
+             * can't be optimized into something else */
+            optimizable = FALSE;
          }
      }
  
@@ -15739,10 +16009,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
       * locales, or the class matches at least one 0-255 range code point */
      if (LOC && FOLD) {
          if (only_utf8_locale_list) {
-            ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
+            ANYOF_FLAGS(ret) |=  ANYOF_LOC_FOLD
+                                |ANYOF_ONLY_UTF8_LOC_FOLD_MATCHES;
          }
-        else if (cp_list) { /* Look to see if there a 0-255 code point is in
-                               the list */
+        else if (cp_list) { /* Look to see if a 0-255 code point is in list */
              UV start, end;
              invlist_iterinit(cp_list);
              if (invlist_iternext(cp_list, &start, &end) && start < 256) {
@@ -15752,14 +16022,83 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
          }
      }
  
+#define MATCHES_ALL_NON_UTF8_NON_ASCII(ret)                                 \
+    (   DEPENDS_SEMANTICS                                                   \
+     && ANYOF_FLAGS(ret)                                                    \
+        & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+
+    /* See if we can simplify things under /d */
+    if (   has_upper_latin1_only_utf8_matches
+        || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
+    {
+        if (has_upper_latin1_only_utf8_matches) {
+            if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
+
+                /* Here, we have two, almost opposite, constraints in effect
+                 * for upper latin1 characters.  The macro means they all match
+                 * when the target string ISN'T in UTF-8.
+                 * 'has_upper_latin1_only_utf8_matches' contains the chars that
+                 * match only if the target string IS UTF-8.  Therefore the
+                 * ones in 'has_upper_latin1_only_utf8_matches' match
+                 * regardless of UTF-8, so can be added to the regular list,
+                 * and 'has_upper_latin1_only_utf8_matches' cleared */
+                _invlist_union(cp_list,
+                               has_upper_latin1_only_utf8_matches,
+                               &cp_list);
+                SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+                has_upper_latin1_only_utf8_matches = NULL;
+            }
+            else if (cp_list) {
+
+                /* Here, 'cp_list' gives chars that always match, and
+                 * 'has_upper_latin1_only_utf8_matches' gives chars that were
+                 * specified to match only if the target string is in UTF-8.
+                 * It may be that these overlap, so we can subtract the
+                 * unconditionally matching from the conditional ones, to make
+                 * the conditional list as small as possible, perhaps even
+                 * clearing it, in which case more optimizations are possible
+                 * later */
+                _invlist_subtract(has_upper_latin1_only_utf8_matches,
+                                  cp_list,
+                                  &has_upper_latin1_only_utf8_matches);
+                if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
+                    SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+                    has_upper_latin1_only_utf8_matches = NULL;
+                }
+            }
+        }
+
+        /* Similarly, if the unconditional matches include every upper latin1
+         * character, we can clear that flag to permit later optimizations */
+        if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
+            SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
+            _invlist_subtract(only_non_utf8_list, cp_list, &only_non_utf8_list);
+            if (_invlist_len(only_non_utf8_list) == 0) {
+                ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+            }
+            SvREFCNT_dec_NN(only_non_utf8_list);
+            only_non_utf8_list = NULL;;
+        }
+
+        /* If we haven't gotten rid of all conditional matching, we change the
+         * regnode type to indicate that */
+        if (   has_upper_latin1_only_utf8_matches
+            || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
+        {
+            OP(ret) = ANYOFD;
+            optimizable = FALSE;
+        }
+    }
+#undef MATCHES_ALL_NON_UTF8_NON_ASCII
+
      /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
       * at compile time.  Besides not inverting folded locale now, we can't
       * invert if there are things such as \w, which aren't known until runtime
       * */
      if (cp_list
          && invert
+        && OP(ret) != ANYOFD
          && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
-       && ! depends_list
         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
      {
          _invlist_invert(cp_list);
@@ -15801,32 +16140,28 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
       * adjacent such nodes.  And if the class is equivalent to things like /./,
       * expensive run-time swashes can be avoided.  Now that we have more
       * complete information, we can find things necessarily missed by the
-     * earlier code.  I (khw) am not sure how much to look for here.  It would
-     * be easy, but perhaps too slow, to check any candidates against all the
-     * node types they could possibly match using _invlistEQ(). */
-
-    if (cp_list
-        && ! invert
-        && ! depends_list
-        && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
-        && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
-
-           /* We don't optimize if we are supposed to make sure all non-Unicode
-            * code points raise a warning, as only ANYOF nodes have this check.
-            * */
-        && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
-    {
+     * earlier code.  Another possible "optimization" that isn't done is that
+     * something like [Ee] could be changed into an EXACTFU.  khw tried this
+     * and found that the ANYOF is faster, including for code points not in the
+     * bitmap.  This still might make sense to do, provided it got joined with
+     * an adjacent node(s) to create a longer EXACTFU one.  This could be
+     * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
+     * routine would know is joinable.  If that didn't happen, the node type
+     * could then be made a straight ANYOF */
+
+    if (optimizable && cp_list && ! invert) {
          UV start, end;
          U8 op = END;  /* The optimzation node-type */
+        int posix_class = -1;   /* Illegal value */
          const char * cur_parse= RExC_parse;
  
          invlist_iterinit(cp_list);
          if (! invlist_iternext(cp_list, &start, &end)) {
  
              /* Here, the list is empty.  This happens, for example, when a
-             * Unicode property is the only thing in the character class, and
-             * it doesn't match anything.  (perluniprops.pod notes such
-             * properties) */
+             * Unicode property that doesn't match anything is the only element
+             * in the character class (perluniprops.pod notes such properties).
+             * */
              op = OPFAIL;
              *flagp |= HASWIDTH|SIMPLE;
          }
@@ -15882,7 +16217,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      }
                  }
              }
-        }
+        }   /* End of first range contains just a single code point */
          else if (start == 0) {
              if (end == UV_MAX) {
                  op = SANY;
@@ -15900,11 +16235,58 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
          }
          invlist_iterfinish(cp_list);
  
+        if (op == END) {
+            const UV cp_list_len = _invlist_len(cp_list);
+            const UV* cp_list_array = invlist_array(cp_list);
+
+            /* Here, didn't find an optimization.  See if this matches any of
+             * the POSIX classes.  These run slightly faster for above-Unicode
+             * code points, so don't bother with POSIXA ones nor the 2 that
+             * have no above-Unicode matches.  We can avoid these checks unless
+             * the ANYOF matches at least as high as the lowest POSIX one
+             * (which was manually found to be \v.  The actual code point may
+             * increase in later Unicode releases, if a higher code point is
+             * assigned to be \v, but this code will never break.  It would
+             * just mean we could execute the checks for posix optimizations
+             * unnecessarily) */
+
+            if (cp_list_array[cp_list_len-1] > 0x2029) {
+                for (posix_class = 0;
+                     posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
+                     posix_class++)
+                {
+                    int try_inverted;
+                    if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
+                        continue;
+                    }
+                    for (try_inverted = 0; try_inverted < 2; try_inverted++) {
+
+                        /* Check if matches normal or inverted */
+                        if (_invlistEQ(cp_list,
+                                       PL_XPosix_ptrs[posix_class],
+                                       try_inverted))
+                        {
+                            op = (try_inverted)
+                                 ? NPOSIXU
+                                 : POSIXU;
+                            *flagp |= HASWIDTH|SIMPLE;
+                            goto found_posix;
+                        }
+                    }
+                }
+              found_posix: ;
+            }
+        }
+
          if (op != END) {
              RExC_parse = (char *)orig_parse;
              RExC_emit = (regnode *)orig_emit;
  
-            ret = reg_node(pRExC_state, op);
+            if (regarglen[op]) {
+                ret = reganode(pRExC_state, op, 0);
+            } else {
+                ret = reg_node(pRExC_state, op);
+            }
  
              RExC_parse = (char *)cur_parse;
  
@@ -15913,6 +16295,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                             TRUE /* downgradable to EXACT */
                                            );
              }
+            else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
+                FLAGS(ret) = posix_class;
+            }
  
              SvREFCNT_dec_NN(cp_list);
              return ret;
@@ -15933,16 +16318,19 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
      /* Here, the bitmap has been populated with all the Latin1 code points that
       * always match.  Can now add to the overall list those that match only
-     * when the target string is UTF-8 (<depends_list>). */
-    if (depends_list) {
+     * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>).
+     * */
+    if (has_upper_latin1_only_utf8_matches) {
         if (cp_list) {
-           _invlist_union(cp_list, depends_list, &cp_list);
-           SvREFCNT_dec_NN(depends_list);
+           _invlist_union(cp_list,
+                           has_upper_latin1_only_utf8_matches,
+                           &cp_list);
+           SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
         }
         else {
-           cp_list = depends_list;
+           cp_list = has_upper_latin1_only_utf8_matches;
         }
-        ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
+        ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
      }
  
      /* If there is a swash and more than one element, we can't use the swash in
@@ -16010,18 +16398,13 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
  
      if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
          assert(! (ANYOF_FLAGS(node)
-                  & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
-                     |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
+                & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
         ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
      }
      else {
         AV * const av = newAV();
         SV *rv;
  
-        assert(ANYOF_FLAGS(node)
-               & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
-                  |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
-
         av_store(av, 0, (runtime_defns)
                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
         if (swash) {
@@ -16085,10 +16468,6 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
  
      PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
  
-    assert(ANYOF_FLAGS(node)
-        & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
-           |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
-
      if (data && data->count) {
         const U32 n = ARG(node);
  
@@ -16100,9 +16479,6 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
  
             si = *ary;  /* ary[0] = the string to initialize the swash with */
  
-           /* Elements 3 and 4 are either both present or both absent. [3] is
-            * any inversion list generated at compile time; [4] indicates if
-            * that inversion list has any user-defined properties in it. */
              if (av_tindex(av) >= 2) {
                  if (only_utf8_locale_ptr
                      && ary[2]
@@ -16115,6 +16491,10 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
                      *only_utf8_locale_ptr = NULL;
                  }
  
+                /* Elements 3 and 4 are either both present or both absent. [3]
+                 * is any inversion list generated at compile time; [4]
+                 * indicates if that inversion list has any user-defined
+                 * properties in it. */
                  if (av_tindex(av) >= 3) {
                      invlist = ary[3];
                      if (SvUV(ary[4])) {
@@ -16213,50 +16593,86 @@ S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
      return p;
  }
  
-/* nextchar()
-
-   Advances the parse position, and optionally absorbs
-   "whitespace" from the inputstream.
+STATIC void
+S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
+                                char ** p,
+                                const bool force_to_xmod
+                         )
+{
+    /* If the text at the current parse position '*p' is a '(?#...)' comment,
+     * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
+     * is /x whitespace, advance '*p' so that on exit it points to the first
+     * byte past all such white space and comments */
  
-   Without /x "whitespace" means (?#...) style comments only,
-   with /x this means (?#...) and # comments and whitespace proper.
+    const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
  
-   Returns the RExC_parse point from BEFORE the scan occurs.
+    PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
  
-   This is the /x friendly way of saying RExC_parse++.
-*/
-
-STATIC char*
-S_nextchar(pTHX_ RExC_state_t *pRExC_state)
-{
-    char* const retval = RExC_parse++;
-
-    PERL_ARGS_ASSERT_NEXTCHAR;
+    assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
  
      for (;;) {
-       if (RExC_end - RExC_parse >= 3
-           && *RExC_parse == '('
-           && RExC_parse[1] == '?'
-           && RExC_parse[2] == '#')
+       if (RExC_end - (*p) >= 3
+           && *(*p)     == '('
+           && *(*p + 1) == '?'
+           && *(*p + 2) == '#')
         {
-           while (*RExC_parse != ')') {
-               if (RExC_parse == RExC_end)
+           while (*(*p) != ')') {
+               if ((*p) == RExC_end)
                     FAIL("Sequence (?#... not terminated");
-               RExC_parse++;
+               (*p)++;
             }
-           RExC_parse++;
+           (*p)++;
             continue;
         }
-       if (RExC_flags & RXf_PMf_EXTENDED) {
-            char * p = regpatws(pRExC_state, RExC_parse,
-                                          TRUE); /* means recognize comments */
-            if (p != RExC_parse) {
-                RExC_parse = p;
+
+       if (use_xmod) {
+            const char * save_p = *p;
+            while ((*p) < RExC_end) {
+                STRLEN len;
+                if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
+                    (*p) += len;
+                }
+                else if (*(*p) == '#') {
+                    (*p) = reg_skipcomment(pRExC_state, (*p));
+                }
+                else {
+                    break;
+                }
+            }
+            if (*p != save_p) {
                  continue;
              }
         }
-        return retval;
+
+        break;
      }
+
+    return;
+}
+
+/* nextchar()
+
+   Advances the parse position by one byte, unless that byte is the beginning
+   of a '(?#...)' style comment, or is /x whitespace and /x is in effect.  In
+   those two cases, the parse position is advanced beyond all such comments and
+   white space.
+
+   This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
+*/
+
+STATIC void
+S_nextchar(pTHX_ RExC_state_t *pRExC_state)
+{
+    PERL_ARGS_ASSERT_NEXTCHAR;
+
+    assert(   ! UTF
+           || UTF8_IS_INVARIANT(*RExC_parse)
+           || UTF8_IS_START(*RExC_parse));
+
+    RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+
+    skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+                            FALSE /* Don't assume /x */ );
  }
  
  STATIC regnode *
@@ -16884,9 +17300,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          }
  
      } else if (k == CURLY) {
+        U32 lo = ARG1(o), hi = ARG2(o);
         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
-       Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
+        Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
+        if (hi == REG_INFTY)
+            sv_catpvs(sv, "INFTY");
+        else
+            Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
+        sv_catpvs(sv, "}");
      }
      else if (k == WHILEM && o->flags)                  /* Ordinal/of */
         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
@@ -16894,7 +17316,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
               || k == GROUPP || OP(o)==ACCEPT)
      {
          AV *name_list= NULL;
-       Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
+        U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
+        Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno);        /* Parenth number */
         if ( RXp_PAREN_NAMES(prog) ) {
              name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
          } else if ( pRExC_state ) {
@@ -16902,12 +17325,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          }
          if (name_list) {
              if ( k != REF || (OP(o) < NREF)) {
-                SV **name= av_fetch(name_list, ARG(o), 0 );
+                SV **name= av_fetch(name_list, parno, 0 );
                 if (name)
                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
              }
              else {
-                SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
+                SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
                  I32 *nums=(I32*)SvPVX(sv_dat);
                  SV **name= av_fetch(name_list, nums[0], 0 );
                  I32 n;
@@ -16950,21 +17373,23 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                  Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
          }
      }
-    else if (k == VERB) {
-        if (!o->flags)
-            Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
-                          SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
-    } else if (k == LOGICAL)
+    else if (k == LOGICAL)
          /* 2: embedded, otherwise 1 */
         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
      else if (k == ANYOF) {
         const U8 flags = ANYOF_FLAGS(o);
         int do_sep = 0;
-        SV* bitmap_invlist;  /* Will hold what the bit map contains */
+        SV* bitmap_invlist = NULL;  /* Will hold what the bit map contains */
  
  
-       if (OP(o) == ANYOFL)
-           sv_catpvs(sv, "{loc}");
+       if (OP(o) == ANYOFL) {
+            if (flags & ANYOF_LOC_REQ_UTF8) {
+                sv_catpvs(sv, "{utf8-loc}");
+            }
+            else {
+                sv_catpvs(sv, "{loc}");
+            }
+        }
         if (flags & ANYOF_LOC_FOLD)
             sv_catpvs(sv, "{i}");
         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
@@ -16988,10 +17413,11 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
              }
          }
  
-       if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
-                      |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
-                      |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
-                      |ANYOF_LOC_FOLD)))
+        if (    ARG(o) != ANYOF_ONLY_HAS_BITMAP
+           || (flags
+                & ( ANYOF_MATCHES_ALL_ABOVE_BITMAP
+                   |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP
+                   |ANYOF_LOC_FOLD)))
          {
              if (do_sep) {
                  Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
@@ -17000,7 +17426,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                      sv_catpvs(sv, "^");
              }
  
-            if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
+            if (OP(o) == ANYOFD
+                && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+            {
                  sv_catpvs(sv, "{non-utf8-latin1-all}");
              }
  
@@ -17028,11 +17456,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                      if (*s == '\n') {
                          const char * const t = ++s;
  
-                        if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
-                            sv_catpvs(sv, "{outside bitmap}");
-                        }
-                        else {
-                            sv_catpvs(sv, "{utf8}");
+                        if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) {
+                            if (OP(o) == ANYOFD) {
+                                sv_catpvs(sv, "{utf8}");
+                            }
+                            else {
+                                sv_catpvs(sv, "{outside bitmap}");
+                            }
                          }
  
                          if (byte_output) {
@@ -17125,6 +17555,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
      else if (OP(o) == SBOL)
          Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
+
+    /* add on the verb argument if there is one */
+    if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
+        Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
+                       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
+    }
  #else
      PERL_UNUSED_CONTEXT;
      PERL_UNUSED_ARG(sv);
@@ -17916,9 +18352,13 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
          this_end = (end < NUM_ANYOF_CODE_POINTS)
                      ? end
                      : NUM_ANYOF_CODE_POINTS - 1;
+#if NUM_ANYOF_CODE_POINTS > 256
          format = (this_end < 256)
                   ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
                   : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
+#else
+        format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}";
+#endif
          GCC_DIAG_IGNORE(-Wformat-nonliteral);
          Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
          GCC_DIAG_RESTORE;
@@ -17937,23 +18377,21 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
      int i;
      UV start, end;
      unsigned int punct_count = 0;
-    SV* invlist = NULL;
-    SV** invlist_ptr;   /* Temporary, in case bitmap_invlist is NULL */
+    SV* invlist;
      bool allow_literals = TRUE;
+    bool inverted_for_output = FALSE;
  
      PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
  
-    invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
-
      /* Worst case is exactly every-other code point is in the list */
-    *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
+    invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
  
      /* Convert the bit map to an inversion list, keeping track of how many
       * ASCII puncts are set, including an extra amount for the backslashed
       * ones.  */
      for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
          if (BITMAP_TEST(bitmap, i)) {
-            *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
+            invlist = add_cp_to_invlist(invlist, i);
              if (isPUNCT_A(i)) {
                  punct_count++;
                  if isBACKSLASHED_PUNCT(i) {
@@ -17964,8 +18402,8 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
      }
  
      /* Nothing to output */
-    if (_invlist_len(*invlist_ptr) == 0) {
-        SvREFCNT_dec(invlist);
+    if (_invlist_len(invlist) == 0) {
+        SvREFCNT_dec_NN(invlist);
          return FALSE;
      }
  
@@ -17973,8 +18411,8 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
       * literals, but if a range (nearly) spans all of them, it's best to output
       * it as a single range.  This code will use a single range if all but 2
       * printables are in it */
-    invlist_iterinit(*invlist_ptr);
-    while (invlist_iternext(*invlist_ptr, &start, &end)) {
+    invlist_iterinit(invlist);
+    while (invlist_iternext(invlist, &start, &end)) {
  
          /* If range starts beyond final printable, it doesn't have any in it */
          if (start > MAX_PRINT_A) {
@@ -17997,7 +18435,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
              break;
          }
      }
-    invlist_iterfinish(*invlist_ptr);
+    invlist_iterfinish(invlist);
  
      /* The legibility of the output depends mostly on how many punctuation
       * characters are output.  There are 32 possible ASCII ones, and some have
@@ -18012,19 +18450,35 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
  
          /* Add everything remaining to the list, so when we invert it just
           * below, it will be excluded */
-        _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
-        _invlist_invert(*invlist_ptr);
+        _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
+        _invlist_invert(invlist);
+        inverted_for_output = TRUE;
      }
  
      /* Here we have figured things out.  Output each range */
-    invlist_iterinit(*invlist_ptr);
-    while (invlist_iternext(*invlist_ptr, &start, &end)) {
+    invlist_iterinit(invlist);
+    while (invlist_iternext(invlist, &start, &end)) {
          if (start >= NUM_ANYOF_CODE_POINTS) {
              break;
          }
          put_range(sv, start, end, allow_literals);
      }
-    invlist_iterfinish(*invlist_ptr);
+    invlist_iterfinish(invlist);
+
+    if (bitmap_invlist) {
+
+        /* Here, wants the inversion list returned.  If we inverted it, we have
+         * to restore it to the original */
+        if (inverted_for_output) {
+            _invlist_invert(invlist);
+            _invlist_intersection(invlist, PL_InBitmap, &invlist);
+        }
+
+        *bitmap_invlist = invlist;
+    }
+    else {
+        SvREFCNT_dec_NN(invlist);
+    }
  
      return TRUE;
  }