Simplify double-nextstate optimisation

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index f200353..ebda789 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -102,6 +102,9 @@ EXTERN_C const struct regexp_engine my_reg_engine;
  #define        STATIC  static
  #endif
  
  #define        STATIC  static
  #endif
  
+#ifndef MIN
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
  
  struct RExC_state_t {
      U32                flags;                  /* RXf_* are we folding, multilining? */
  
  struct RExC_state_t {
      U32                flags;                  /* RXf_* are we folding, multilining? */
@@ -171,9 +174,11 @@ struct RExC_state_t {
      const char  *lastparse;
      I32         lastnum;
      AV          *paren_name_list;       /* idx -> name */
      const char  *lastparse;
      I32         lastnum;
      AV          *paren_name_list;       /* idx -> name */
+    U32         study_chunk_recursed_count;
  #define RExC_lastparse (pRExC_state->lastparse)
  #define RExC_lastnum   (pRExC_state->lastnum)
  #define RExC_paren_name_list    (pRExC_state->paren_name_list)
  #define RExC_lastparse (pRExC_state->lastparse)
  #define RExC_lastnum   (pRExC_state->lastnum)
  #define RExC_paren_name_list    (pRExC_state->paren_name_list)
+#define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
  #endif
  };
  
  #endif
  };
  
@@ -495,7 +500,8 @@ static const scan_data_t zero_scan_data =
   * Simple_vFAIL -- like FAIL, but marks the current location in the scan
   */
  #define        Simple_vFAIL(m) STMT_START {                                    \
   * Simple_vFAIL -- like FAIL, but marks the current location in the scan
   */
  #define        Simple_vFAIL(m) STMT_START {                                    \
-    const IV offset = RExC_parse - RExC_precomp;                       \
+    const IV offset =                                                   \
+        (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
      Perl_croak(aTHX_ "%s" REPORT_LOCATION,                             \
             m, REPORT_LOCATION_ARGS(offset));   \
  } STMT_END
      Perl_croak(aTHX_ "%s" REPORT_LOCATION,                             \
             m, REPORT_LOCATION_ARGS(offset));   \
  } STMT_END
@@ -570,80 +576,85 @@ static const scan_data_t zero_scan_data =
              REPORT_LOCATION_ARGS(offset));         \
  } STMT_END
  
              REPORT_LOCATION_ARGS(offset));         \
  } STMT_END
  
+/* These have asserts in them because of [perl #122671] Many warnings in
+ * regcomp.c can occur twice.  If they get output in pass1 and later in that
+ * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
+ * would get output again.  So they should be output in pass2, and these
+ * asserts make sure new warnings follow that paradigm. */
  
  /* m is not necessarily a "literal string", in this macro */
  #define reg_warn_non_literal_string(loc, m) STMT_START {                \
      const IV offset = loc - RExC_precomp;                               \
  
  /* m is not necessarily a "literal string", in this macro */
  #define reg_warn_non_literal_string(loc, m) STMT_START {                \
      const IV offset = loc - RExC_precomp;                               \
-    Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
+    __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
              m, REPORT_LOCATION_ARGS(offset));       \
  } STMT_END
  
  #define        ckWARNreg(loc,m) STMT_START {                                   \
      const IV offset = loc - RExC_precomp;                              \
              m, REPORT_LOCATION_ARGS(offset));       \
  } STMT_END
  
  #define        ckWARNreg(loc,m) STMT_START {                                   \
      const IV offset = loc - RExC_precomp;                              \
-    Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
+    __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,    \
             REPORT_LOCATION_ARGS(offset));              \
  } STMT_END
  
  #define        vWARN_dep(loc, m) STMT_START {                                  \
      const IV offset = loc - RExC_precomp;                              \
             REPORT_LOCATION_ARGS(offset));              \
  } STMT_END
  
  #define        vWARN_dep(loc, m) STMT_START {                                  \
      const IV offset = loc - RExC_precomp;                              \
-    Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,    \
+    __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,   \
             REPORT_LOCATION_ARGS(offset));              \
  } STMT_END
  
  #define        ckWARNdep(loc,m) STMT_START {                                   \
      const IV offset = loc - RExC_precomp;                              \
             REPORT_LOCATION_ARGS(offset));              \
  } STMT_END
  
  #define        ckWARNdep(loc,m) STMT_START {                                   \
      const IV offset = loc - RExC_precomp;                              \
-    Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                  \
+    __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                 \
             m REPORT_LOCATION,                                          \
             REPORT_LOCATION_ARGS(offset));              \
  } STMT_END
  
  #define        ckWARNregdep(loc,m) STMT_START {                                \
      const IV offset = loc - RExC_precomp;                              \
             m REPORT_LOCATION,                                          \
             REPORT_LOCATION_ARGS(offset));              \
  } STMT_END
  
  #define        ckWARNregdep(loc,m) STMT_START {                                \
      const IV offset = loc - RExC_precomp;                              \
-    Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),    \
+    __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),   \
             m REPORT_LOCATION,                                          \
             REPORT_LOCATION_ARGS(offset));              \
  } STMT_END
  
  #define        ckWARN2reg_d(loc,m, a1) STMT_START {                            \
      const IV offset = loc - RExC_precomp;                              \
             m REPORT_LOCATION,                                          \
             REPORT_LOCATION_ARGS(offset));              \
  } STMT_END
  
  #define        ckWARN2reg_d(loc,m, a1) STMT_START {                            \
      const IV offset = loc - RExC_precomp;                              \
-    Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                      \
+    __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                     \
             m REPORT_LOCATION,                                          \
             a1, REPORT_LOCATION_ARGS(offset));  \
  } STMT_END
  
  #define        ckWARN2reg(loc, m, a1) STMT_START {                             \
      const IV offset = loc - RExC_precomp;                              \
             m REPORT_LOCATION,                                          \
             a1, REPORT_LOCATION_ARGS(offset));  \
  } STMT_END
  
  #define        ckWARN2reg(loc, m, a1) STMT_START {                             \
      const IV offset = loc - RExC_precomp;                              \
-    Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
+    __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,    \
             a1, REPORT_LOCATION_ARGS(offset));  \
  } STMT_END
  
  #define        vWARN3(loc, m, a1, a2) STMT_START {                             \
      const IV offset = loc - RExC_precomp;                              \
             a1, REPORT_LOCATION_ARGS(offset));  \
  } STMT_END
  
  #define        vWARN3(loc, m, a1, a2) STMT_START {                             \
      const IV offset = loc - RExC_precomp;                              \
-    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
+    __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,               \
             a1, a2, REPORT_LOCATION_ARGS(offset));      \
  } STMT_END
  
  #define        ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
      const IV offset = loc - RExC_precomp;                              \
             a1, a2, REPORT_LOCATION_ARGS(offset));      \
  } STMT_END
  
  #define        ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
      const IV offset = loc - RExC_precomp;                              \
-    Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
+    __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,    \
             a1, a2, REPORT_LOCATION_ARGS(offset));      \
  } STMT_END
  
  #define        vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
      const IV offset = loc - RExC_precomp;                              \
             a1, a2, REPORT_LOCATION_ARGS(offset));      \
  } STMT_END
  
  #define        vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
      const IV offset = loc - RExC_precomp;                              \
-    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
+    __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,               \
             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
  } STMT_END
  
  #define        ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
      const IV offset = loc - RExC_precomp;                              \
             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
  } STMT_END
  
  #define        ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
      const IV offset = loc - RExC_precomp;                              \
-    Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
+    __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,    \
             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
  } STMT_END
  
  #define        vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
      const IV offset = loc - RExC_precomp;                              \
             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
  } STMT_END
  
  #define        vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
      const IV offset = loc - RExC_precomp;                              \
-    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                \
+    __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,               \
             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
  } STMT_END
  
             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
  } STMT_END
  
@@ -800,6 +811,33 @@ DEBUG_OPTIMISE_MORE_r(if(data){                                      \
      PerlIO_printf(Perl_debug_log,"\n");                              \
  });
  
      PerlIO_printf(Perl_debug_log,"\n");                              \
  });
  
+#ifdef DEBUGGING
+
+/* is c a control character for which we have a mnemonic? */
+#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
+
+STATIC const char *
+S_cntrl_to_mnemonic(const U8 c)
+{
+    /* Returns the mnemonic string that represents character 'c', if one
+     * exists; NULL otherwise.  The only ones that exist for the purposes of
+     * this routine are a few control characters */
+
+    switch (c) {
+        case '\a':       return "\\a";
+        case '\b':       return "\\b";
+        case ESC_NATIVE: return "\\e";
+        case '\f':       return "\\f";
+        case '\n':       return "\\n";
+        case '\r':       return "\\r";
+        case '\t':       return "\\t";
+    }
+
+    return NULL;
+}
+
+#endif
+
  /* Mark that we cannot extend a found fixed substring at this point.
     Update the longest found anchored substring and the longest found
     floating substrings if needed. */
  /* Mark that we cannot extend a found fixed substring at this point.
     Update the longest found anchored substring and the longest found
     floating substrings if needed. */
@@ -873,7 +911,7 @@ S_ssc_anything(pTHX_ regnode_ssc *ssc)
  
      ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
      _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
  
      ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
      _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
-    ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
+    ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
  }
  
  STATIC int
  }
  
  STATIC int
@@ -891,7 +929,7 @@ S_ssc_is_anything(const regnode_ssc *ssc)
  
      assert(is_ANYOF_SYNTHETIC(ssc));
  
  
      assert(is_ANYOF_SYNTHETIC(ssc));
  
-    if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
+    if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
          return FALSE;
      }
  
          return FALSE;
      }
  
@@ -930,7 +968,7 @@ S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
  
      Zero(ssc, 1, regnode_ssc);
      set_ANYOF_SYNTHETIC(ssc);
  
      Zero(ssc, 1, regnode_ssc);
      set_ANYOF_SYNTHETIC(ssc);
-    ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
+    ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
      ssc_anything(ssc);
  
      /* If any portion of the regex is to operate under locale rules,
      ssc_anything(ssc);
  
      /* If any portion of the regex is to operate under locale rules,
@@ -1000,7 +1038,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
      PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
  
      /* Look at the data structure created by S_set_ANYOF_arg() */
      PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
  
      /* Look at the data structure created by S_set_ANYOF_arg() */
-    if (n != ANYOF_NONBITMAP_EMPTY) {
+    if (n != ANYOF_ONLY_HAS_BITMAP) {
          SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
          AV * const av = MUTABLE_AV(SvRV(rv));
          SV **const ary = AvARRAY(av);
          SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
          AV * const av = MUTABLE_AV(SvRV(rv));
          SV **const ary = AvARRAY(av);
@@ -1056,13 +1094,13 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
  
      /* If this can match all upper Latin1 code points, have to add them
       * as well */
  
      /* If this can match all upper Latin1 code points, have to add them
       * as well */
-    if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
+    if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
          _invlist_union(invlist, PL_UpperLatin1, &invlist);
      }
  
      /* Similarly for these */
          _invlist_union(invlist, PL_UpperLatin1, &invlist);
      }
  
      /* Similarly for these */
-    if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
-        invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
+    if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
+        _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
      }
  
      if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
      }
  
      if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
@@ -1095,8 +1133,8 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
  #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
  
  /* 'AND' a given class with another one.  Can create false positives.  'ssc'
  #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
  
  /* 'AND' a given class with another one.  Can create false positives.  'ssc'
- * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
- * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
+ * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
+ * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
  
  STATIC void
  S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
  
  STATIC void
  S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
@@ -1187,7 +1225,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
  
          /* If either P1 or P2 is empty, the intersection will be also; can skip
           * the loop */
  
          /* If either P1 or P2 is empty, the intersection will be also; can skip
           * the loop */
-        if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
+        if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
              ANYOF_POSIXL_ZERO(ssc);
          }
          else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
              ANYOF_POSIXL_ZERO(ssc);
          }
          else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
@@ -1246,16 +1284,16 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
              else {
                  ssc->invlist = anded_cp_list;
                  ANYOF_POSIXL_ZERO(ssc);
              else {
                  ssc->invlist = anded_cp_list;
                  ANYOF_POSIXL_ZERO(ssc);
-                if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
+                if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
                      ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
                  }
              }
          }
          else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
                      ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
                  }
              }
          }
          else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
-                 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
+                 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
          {
              /* One or the other of P1, P2 is non-empty. */
          {
              /* One or the other of P1, P2 is non-empty. */
-            if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
+            if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
                  ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
              }
              ssc_union(ssc, anded_cp_list, FALSE);
                  ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
              }
              ssc_union(ssc, anded_cp_list, FALSE);
@@ -1317,7 +1355,7 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
      {
          /* We ignore P2, leaving P1 going forward */
      }   /* else  Not inverted */
      {
          /* We ignore P2, leaving P1 going forward */
      }   /* else  Not inverted */
-    else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
+    else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
          ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
          if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
              unsigned int i;
          ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
          if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
              unsigned int i;
@@ -1406,6 +1444,71 @@ S_ssc_clear_locale(regnode_ssc *ssc)
      ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
  }
  
      ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
  }
  
+#define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
+
+STATIC bool
+S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
+{
+    /* The synthetic start class is used to hopefully quickly winnow down
+     * places where a pattern could start a match in the target string.  If it
+     * doesn't really narrow things down that much, there isn't much point to
+     * having the overhead of using it.  This function uses some very crude
+     * heuristics to decide if to use the ssc or not.
+     *
+     * It returns TRUE if 'ssc' rules out more than half what it considers to
+     * be the "likely" possible matches, but of course it doesn't know what the
+     * actual things being matched are going to be; these are only guesses
+     *
+     * For /l matches, it assumes that the only likely matches are going to be
+     *      in the 0-255 range, uniformly distributed, so half of that is 127
+     * For /a and /d matches, it assumes that the likely matches will be just
+     *      the ASCII range, so half of that is 63
+     * For /u and there isn't anything matching above the Latin1 range, it
+     *      assumes that that is the only range likely to be matched, and uses
+     *      half that as the cut-off: 127.  If anything matches above Latin1,
+     *      it assumes that all of Unicode could match (uniformly), except for
+     *      non-Unicode code points and things in the General Category "Other"
+     *      (unassigned, private use, surrogates, controls and formats).  This
+     *      is a much large number. */
+
+    const U32 max_match = (LOC)
+                          ? 127
+                          : (! UNI_SEMANTICS)
+                            ? 63
+                            : (invlist_highest(ssc->invlist) < 256)
+                              ? 127
+                              : ((NON_OTHER_COUNT + 1) / 2) - 1;
+    U32 count = 0;      /* Running total of number of code points matched by
+                           'ssc' */
+    UV start, end;      /* Start and end points of current range in inversion
+                           list */
+
+    PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
+
+    invlist_iterinit(ssc->invlist);
+    while (invlist_iternext(ssc->invlist, &start, &end)) {
+
+        /* /u is the only thing that we expect to match above 255; so if not /u
+         * and even if there are matches above 255, ignore them.  This catches
+         * things like \d under /d which does match the digits above 255, but
+         * since the pattern is /d, it is not likely to be expecting them */
+        if (! UNI_SEMANTICS) {
+            if (start > 255) {
+                break;
+            }
+            end = MIN(end, 255);
+        }
+        count += end - start + 1;
+        if (count > max_match) {
+            invlist_iterfinish(ssc->invlist);
+            return FALSE;
+        }
+    }
+
+    return TRUE;
+}
+
+
  STATIC void
  S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
  {
  STATIC void
  S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
  {
@@ -1421,8 +1524,8 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
      assert(is_ANYOF_SYNTHETIC(ssc));
  
      /* The code in this file assumes that all but these flags aren't relevant
      assert(is_ANYOF_SYNTHETIC(ssc));
  
      /* The code in this file assumes that all but these flags aren't relevant
-     * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
-     * time we reach here */
+     * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
+     * by the time we reach here */
      assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
  
      populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
      assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
  
      populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
@@ -1434,7 +1537,7 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
      ssc->invlist = NULL;
  
      if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
      ssc->invlist = NULL;
  
      if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
-        ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
+        ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
      }
  
      assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
      }
  
      assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
@@ -3604,6 +3707,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
  
  
    fake_study_recurse:
  
  
    fake_study_recurse:
+    DEBUG_r(
+        RExC_study_chunk_recursed_count++;
+    );
      while ( scan && OP(scan) != END && scan < last ){
          UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
                                     node length to get a real minimum (because
      while ( scan && OP(scan) != END && scan < last ){
          UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
                                     node length to get a real minimum (because
@@ -3613,8 +3719,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
          DEBUG_OPTIMISE_MORE_r(
          {
              PerlIO_printf(Perl_debug_log,
          DEBUG_OPTIMISE_MORE_r(
          {
              PerlIO_printf(Perl_debug_log,
-                "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
+                "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu ",
                  ((int) depth*2), "", (long)stopparen,
                  ((int) depth*2), "", (long)stopparen,
+                (unsigned long)RExC_study_chunk_recursed_count,
                  (unsigned long)depth, (unsigned long)recursed_depth);
              if (recursed_depth) {
                  U32 i;
                  (unsigned long)depth, (unsigned long)recursed_depth);
              if (recursed_depth) {
                  U32 i;
@@ -4126,7 +4233,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             regnode *end;
              U32 my_recursed_depth= recursed_depth;
  
             regnode *end;
              U32 my_recursed_depth= recursed_depth;
  
-           if (OP(scan) != SUSPEND) {
+            if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
                  /* set the pointer */
                 if (OP(scan) == GOSUB) {
                     paren = ARG(scan);
                  /* set the pointer */
                 if (OP(scan) == GOSUB) {
                     paren = ARG(scan);
@@ -4138,10 +4245,43 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                      start = RExC_rxi->program + 1;
                      end   = RExC_opend;
                  }
                      start = RExC_rxi->program + 1;
                      end   = RExC_opend;
                  }
-                if (!recursed_depth
+                /* this code is intended to handle expanding regex "subs" so
+                 * we can apply various optimizations. For instance with
+                 * /(?(DEFINE)(?<foo>foo)(?<bar>bar))(?&foo)(?&bar)/ we
+                 * want to recognize that the mandatory substr is going to be
+                 * "foobar".
+                 * However if we are not in SCF_DO_SUBSTR mode then there is
+                 * no point in doing this, and it can cause a serious slowdown.
+                 * See RT #122283.
+                 * Note also that this was a workaround for the core problem
+                 * which was that during compilation logic the excessive
+                 * recursion resulted in slowly consuming all the memory on
+                 * the box. Exactly what causes this is unclear. It does not
+                 * appear to be directly related to allocating the "visited"
+                 * bitmaps that is RExC_study_chunk_recursed.
+                 *
+                 * In reality study_chunk() does far far too much, and probably
+                 * this an other issues would go away if we split it into
+                 * multiple components.
+                 *
+                 * - Yves
+                 * */
+                if (flags & SCF_DO_SUBSTR) {
+                if (
+                    !recursed_depth
                      ||
                      !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
                  ) {
                      ||
                      !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
                  ) {
+                    /* it is quite possible that there are more efficient ways
+                     * to do this. We maintain a bitmap per level of recursion
+                     * of which patterns we have entered so we can detect if a
+                     * pattern creates a possible infinite loop. When we
+                     * recurse down a level we copy the previous levels bitmap
+                     * down. When we are at recursion level 0 we zero the top
+                     * level bitmap. It would be nice to implement a different
+                     * more efficient way of doing this. In particular the top
+                     * level bitmap may be unnecessary.
+                     */
                      if (!recursed_depth) {
                          Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
                      } else {
                      if (!recursed_depth) {
                          Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
                      } else {
@@ -4167,6 +4307,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                          ssc_anything(data->start_class);
                      flags &= ~SCF_DO_STCLASS;
                 }
                          ssc_anything(data->start_class);
                      flags &= ~SCF_DO_STCLASS;
                 }
+                }
              } else {
                 Newx(newframe,1,scan_frame);
                 paren = stopparen;
              } else {
                 Newx(newframe,1,scan_frame);
                 paren = stopparen;
@@ -4235,7 +4376,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
               * can't match null string */
             if (flags & SCF_DO_STCLASS_AND) {
                  ssc_cp_and(data->start_class, uc);
               * can't match null string */
             if (flags & SCF_DO_STCLASS_AND) {
                  ssc_cp_and(data->start_class, uc);
-                ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
+                ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
                  ssc_clear_locale(data->start_class);
             }
             else if (flags & SCF_DO_STCLASS_OR) {
                  ssc_clear_locale(data->start_class);
             }
             else if (flags & SCF_DO_STCLASS_OR) {
@@ -4243,7 +4384,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
  
                  /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
  
                  /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
-                ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
+                ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
             }
             flags &= ~SCF_DO_STCLASS;
         }
             }
             flags &= ~SCF_DO_STCLASS;
         }
@@ -4418,7 +4559,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                  }
              }
             if (flags & SCF_DO_STCLASS_AND) {
                  }
              }
             if (flags & SCF_DO_STCLASS_AND) {
-                ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
+                ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
                  ANYOF_POSIXL_ZERO(data->start_class);
                  ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
             }
                  ANYOF_POSIXL_ZERO(data->start_class);
                  ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
             }
@@ -4427,7 +4568,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
  
                  /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
  
                  /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
-                ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
+                ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
             }
             flags &= ~SCF_DO_STCLASS;
              SvREFCNT_dec(EXACTF_invlist);
             }
             flags &= ~SCF_DO_STCLASS;
              SvREFCNT_dec(EXACTF_invlist);
@@ -4546,7 +4687,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         flags &= ~SCF_DO_STCLASS_AND;
                         StructCopy(&this_class, data->start_class, regnode_ssc);
                         flags |= SCF_DO_STCLASS_OR;
                         flags &= ~SCF_DO_STCLASS_AND;
                         StructCopy(&this_class, data->start_class, regnode_ssc);
                         flags |= SCF_DO_STCLASS_OR;
-                        ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
+                        ANYOF_FLAGS(data->start_class)
+                                                |= SSC_MATCHES_EMPTY_STRING;
                     }
                 } else {                /* Non-zero len */
                     if (flags & SCF_DO_STCLASS_OR) {
                     }
                 } else {                /* Non-zero len */
                     if (flags & SCF_DO_STCLASS_OR) {
@@ -4842,7 +4984,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
                      ssc_intersection(data->start_class,
                                      PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
                      ssc_clear_locale(data->start_class);
                      ssc_intersection(data->start_class,
                                      PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
                      ssc_clear_locale(data->start_class);
-                    ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
+                    ANYOF_FLAGS(data->start_class)
+                                                &= ~SSC_MATCHES_EMPTY_STRING;
                  }
                  else if (flags & SCF_DO_STCLASS_OR) {
                      ssc_union(data->start_class,
                  }
                  else if (flags & SCF_DO_STCLASS_OR) {
                      ssc_union(data->start_class,
@@ -4852,7 +4995,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
  
                      /* See commit msg for
                       * 749e076fceedeb708a624933726e7989f2302f6a */
  
                      /* See commit msg for
                       * 749e076fceedeb708a624933726e7989f2302f6a */
-                    ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
+                    ANYOF_FLAGS(data->start_class)
+                                                &= ~SSC_MATCHES_EMPTY_STRING;
                  }
                 flags &= ~SCF_DO_STCLASS;
              }
                  }
                 flags &= ~SCF_DO_STCLASS;
              }
@@ -4879,7 +5023,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
                  U8 namedclass;
  
                  /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
                  U8 namedclass;
  
                  /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
-                ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
+                ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
  
                 /* Some of the logic below assumes that switching
                    locale on will only add false positives. */
  
                 /* Some of the logic below assumes that switching
                    locale on will only add false positives. */
@@ -5120,7 +5264,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
                           * assertions are zero-length, so can match an EMPTY
                           * string */
                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
                           * assertions are zero-length, so can match an EMPTY
                           * string */
                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
-                        ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
+                        ANYOF_FLAGS(data->start_class)
+                                                   |= SSC_MATCHES_EMPTY_STRING;
                     }
                  }
             }
                     }
                  }
             }
@@ -5192,7 +5337,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
  
                  if (f & SCF_DO_STCLASS_AND) {
                      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
  
                  if (f & SCF_DO_STCLASS_AND) {
                      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
-                    ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
+                    ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
                  }
                  if (data) {
                      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
                  }
                  if (data) {
                      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
@@ -5634,9 +5779,9 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
  {
      U8 *const src = (U8*)*pat_p;
                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
  {
      U8 *const src = (U8*)*pat_p;
-    U8 *dst;
+    U8 *dst, *d;
      int n=0;
      int n=0;
-    STRLEN s = 0, d = 0;
+    STRLEN s = 0;
      bool do_end = 0;
      GET_RE_DEBUG_FLAGS_DECL;
  
      bool do_end = 0;
      GET_RE_DEBUG_FLAGS_DECL;
  
@@ -5644,32 +5789,27 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
          "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
  
      Newx(dst, *plen_p * 2 + 1, U8);
          "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
  
      Newx(dst, *plen_p * 2 + 1, U8);
+    d = dst;
  
      while (s < *plen_p) {
  
      while (s < *plen_p) {
-        if (NATIVE_BYTE_IS_INVARIANT(src[s]))
-            dst[d]   = src[s];
-        else {
-            dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
-            dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
-        }
+        append_utf8_from_native_byte(src[s], &d);
          if (n < num_code_blocks) {
              if (!do_end && pRExC_state->code_blocks[n].start == s) {
          if (n < num_code_blocks) {
              if (!do_end && pRExC_state->code_blocks[n].start == s) {
-                pRExC_state->code_blocks[n].start = d;
-                assert(dst[d] == '(');
+                pRExC_state->code_blocks[n].start = d - dst - 1;
+                assert(*(d - 1) == '(');
                  do_end = 1;
              }
              else if (do_end && pRExC_state->code_blocks[n].end == s) {
                  do_end = 1;
              }
              else if (do_end && pRExC_state->code_blocks[n].end == s) {
-                pRExC_state->code_blocks[n].end = d;
-                assert(dst[d] == ')');
+                pRExC_state->code_blocks[n].end = d - dst - 1;
+                assert(*(d - 1) == ')');
                  do_end = 0;
                  n++;
              }
          }
          s++;
                  do_end = 0;
                  n++;
              }
          }
          s++;
-        d++;
      }
      }
-    dst[d] = '\0';
-    *plen_p = d;
+    *d = '\0';
+    *plen_p = d - dst;
      *pat_p = (char*) dst;
      SAVEFREEPV(*pat_p);
      RExC_orig_utf8 = RExC_utf8 = 1;
      *pat_p = (char*) dst;
      SAVEFREEPV(*pat_p);
      RExC_orig_utf8 = RExC_utf8 = 1;
@@ -6023,7 +6163,6 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
  
         ENTER;
         SAVETMPS;
  
         ENTER;
         SAVETMPS;
-       save_re_context();
         PUSHSTACKi(PERLSI_REQUIRE);
          /* G_RE_REPARSING causes the toker to collapse \\ into \ when
           * parsing qr''; normally only q'' does this. It also alters
         PUSHSTACKi(PERLSI_REQUIRE);
          /* G_RE_REPARSING causes the toker to collapse \\ into \ when
           * parsing qr''; normally only q'' does this. It also alters
@@ -6272,6 +6411,13 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
          PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
          PL_HasMultiCharFold =
                         _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
          PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
          PL_HasMultiCharFold =
                         _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
+
+        /* This is calculated here, because the Perl program that generates the
+         * static global ones doesn't currently have access to
+         * NUM_ANYOF_CODE_POINTS */
+       PL_InBitmap = _new_invlist(2);
+       PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
+                                                    NUM_ANYOF_CODE_POINTS - 1);
      }
  #endif
  
      }
  #endif
  
@@ -6723,6 +6869,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
  
  reStudy:
      r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
  
  reStudy:
      r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
+    DEBUG_r(
+        RExC_study_chunk_recursed_count= 0;
+    );
      Zero(r->substrs, 1, struct reg_substr_data);
      if (RExC_study_chunk_recursed)
          Zero(RExC_study_chunk_recursed,
      Zero(r->substrs, 1, struct reg_substr_data);
      if (RExC_study_chunk_recursed)
          Zero(RExC_study_chunk_recursed,
@@ -6834,9 +6983,7 @@ reStudy:
         else if (PL_regkind[OP(first)] == BOL) {
              r->intflags |= (OP(first) == MBOL
                             ? PREGf_ANCH_MBOL
         else if (PL_regkind[OP(first)] == BOL) {
              r->intflags |= (OP(first) == MBOL
                             ? PREGf_ANCH_MBOL
-                          : (OP(first) == SBOL
-                              ? PREGf_ANCH_SBOL
-                              : PREGf_ANCH_BOL));
+                           : PREGf_ANCH_SBOL);
             first = NEXTOPER(first);
             goto again;
         }
             first = NEXTOPER(first);
             goto again;
         }
@@ -6989,8 +7136,8 @@ reStudy:
  
         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
             && stclass_flag
  
         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
             && stclass_flag
-            && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
-           && !ssc_is_anything(data.start_class))
+            && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
+           && is_ssc_worth_it(pRExC_state, data.start_class))
         {
             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
  
         {
             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
  
@@ -7069,8 +7216,8 @@ reStudy:
         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
                 = r->float_substr = r->float_utf8 = NULL;
  
         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
                 = r->float_substr = r->float_utf8 = NULL;
  
-        if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
-            && ! ssc_is_anything(data.start_class))
+        if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
+           && is_ssc_worth_it(pRExC_state, data.start_class))
          {
             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
  
          {
             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
  
@@ -7102,8 +7249,8 @@ reStudy:
      /* Guard against an embedded (?=) or (?<=) with a longer minlen than
         the "real" pattern. */
      DEBUG_OPTIMISE_r({
      /* Guard against an embedded (?=) or (?<=) with a longer minlen than
         the "real" pattern. */
      DEBUG_OPTIMISE_r({
-        PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
-                      (IV)minlen, (IV)r->minlen, RExC_maxlen);
+        PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
+                      (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
      });
      r->minlenret = minlen;
      if (r->minlen < minlen)
      });
      r->minlenret = minlen;
      if (r->minlen < minlen)
@@ -7153,7 +7300,12 @@ reStudy:
  
          if (PL_regkind[fop] == NOTHING && nop == END)
              r->extflags |= RXf_NULL;
  
          if (PL_regkind[fop] == NOTHING && nop == END)
              r->extflags |= RXf_NULL;
-        else if (PL_regkind[fop] == BOL && nop == END)
+        else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
+            /* when fop is SBOL first->flags will be true only when it was
+             * produced by parsing /\A/, and not when parsing /^/. This is
+             * very important for the split code as there we want to
+             * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
+             * See rt #122761 for more details. -- Yves */
              r->extflags |= RXf_START_ONLY;
          else if (fop == PLUS
                   && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
              r->extflags |= RXf_START_ONLY;
          else if (fop == PLUS
                   && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
@@ -7189,7 +7341,10 @@ reStudy:
      }
      Newxz(r->offs, RExC_npar, regexp_paren_pair);
      /* assume we don't need to swap parens around before we match */
      }
      Newxz(r->offs, RExC_npar, regexp_paren_pair);
      /* assume we don't need to swap parens around before we match */
-
+    DEBUG_TEST_r({
+        PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
+            (unsigned long)RExC_study_chunk_recursed_count);
+    });
      DEBUG_DUMP_r({
          DEBUG_RExC_seen();
          PerlIO_printf(Perl_debug_log,"Final program:\n");
      DEBUG_DUMP_r({
          DEBUG_RExC_seen();
          PerlIO_printf(Perl_debug_log,"Final program:\n");
@@ -9237,6 +9392,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
      regex_charset cs;
      bool has_use_defaults = FALSE;
      const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
      regex_charset cs;
      bool has_use_defaults = FALSE;
      const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
+    int x_mod_count = 0;
  
      PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
  
  
      PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
  
@@ -9264,7 +9420,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
          switch (*RExC_parse) {
  
              /* Code for the imsx flags */
          switch (*RExC_parse) {
  
              /* Code for the imsx flags */
-            CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
+            CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
  
              case LOCALE_PAT_MOD:
                  if (has_charset_modifier) {
  
              case LOCALE_PAT_MOD:
                  if (has_charset_modifier) {
@@ -9342,7 +9498,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                  /*NOTREACHED*/
              case ONCE_PAT_MOD: /* 'o' */
              case GLOBAL_PAT_MOD: /* 'g' */
                  /*NOTREACHED*/
              case ONCE_PAT_MOD: /* 'o' */
              case GLOBAL_PAT_MOD: /* 'g' */
-                if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
+                if (PASS2 && ckWARN(WARN_REGEXP)) {
                      const I32 wflagbit = *RExC_parse == 'o'
                                           ? WASTED_O
                                           : WASTED_G;
                      const I32 wflagbit = *RExC_parse == 'o'
                                           ? WASTED_O
                                           : WASTED_G;
@@ -9362,7 +9518,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                  break;
  
              case CONTINUE_PAT_MOD: /* 'c' */
                  break;
  
              case CONTINUE_PAT_MOD: /* 'c' */
-                if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
+                if (PASS2 && ckWARN(WARN_REGEXP)) {
                      if (! (wastedflags & WASTED_C) ) {
                          wastedflags |= WASTED_GC;
                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
                      if (! (wastedflags & WASTED_C) ) {
                          wastedflags |= WASTED_GC;
                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
@@ -9377,7 +9533,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                  break;
              case KEEPCOPY_PAT_MOD: /* 'p' */
                  if (flagsp == &negflags) {
                  break;
              case KEEPCOPY_PAT_MOD: /* 'p' */
                  if (flagsp == &negflags) {
-                    if (SIZE_ONLY)
+                    if (PASS2)
                          ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
                  } else {
                      *flagsp |= RXf_PMf_KEEPCOPY;
                          ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
                  } else {
                      *flagsp |= RXf_PMf_KEEPCOPY;
@@ -9401,6 +9557,9 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                  if (RExC_flags & RXf_PMf_FOLD) {
                      RExC_contains_i = 1;
                  }
                  if (RExC_flags & RXf_PMf_FOLD) {
                      RExC_contains_i = 1;
                  }
+                if (PASS2) {
+                    STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
+                }
                  return;
                  /*NOTREACHED*/
              default:
                  return;
                  /*NOTREACHED*/
              default:
@@ -9414,6 +9573,10 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
  
          ++RExC_parse;
      }
  
          ++RExC_parse;
      }
+
+    if (PASS2) {
+        STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
+    }
  }
  
  /*
  }
  
  /*
@@ -9844,21 +10007,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                      num = RExC_npar + num - 1;
                  }
  
                      num = RExC_npar + num - 1;
                  }
  
-                ret = reganode(pRExC_state, GOSUB, num);
+                ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
                  if (!SIZE_ONLY) {
                     if (num > (I32)RExC_rx->nparens) {
                         RExC_parse++;
                         vFAIL("Reference to nonexistent group");
                     }
                  if (!SIZE_ONLY) {
                     if (num > (I32)RExC_rx->nparens) {
                         RExC_parse++;
                         vFAIL("Reference to nonexistent group");
                     }
-                   ARG2L_SET( ret, RExC_recurse_count++);
-                    RExC_emit++;
+                   RExC_recurse_count++;
                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
                         "Recurse #%"UVuf" to %"IVdf"\n",
                                (UV)ARG(ret), (IV)ARG2L(ret)));
                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
                         "Recurse #%"UVuf" to %"IVdf"\n",
                                (UV)ARG(ret), (IV)ARG2L(ret)));
-               } else {
-                   RExC_size++;
-               }
-                    RExC_seen |= REG_RECURSE_SEEN;
+                }
+                RExC_seen |= REG_RECURSE_SEEN;
                  Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
                 Set_Node_Offset(ret, parse_start); /* MJD */
  
                  Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
                 Set_Node_Offset(ret, parse_start); /* MJD */
  
@@ -9921,17 +10081,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 if (is_logical) {
                      regnode *eval;
                     ret = reg_node(pRExC_state, LOGICAL);
                 if (is_logical) {
                      regnode *eval;
                     ret = reg_node(pRExC_state, LOGICAL);
-                    eval = reganode(pRExC_state, EVAL, n);
+
+                    eval = reg2Lanode(pRExC_state, EVAL,
+                                       n,
+
+                                       /* for later propagation into (??{})
+                                        * return value */
+                                       RExC_flags & RXf_PMf_COMPILETIME
+                                      );
                     if (!SIZE_ONLY) {
                         ret->flags = 2;
                     if (!SIZE_ONLY) {
                         ret->flags = 2;
-                        /* for later propagation into (??{}) return value */
-                        eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
                      }
                      REGTAIL(pRExC_state, ret, eval);
                      /* deal with the length of this later - MJD */
                     return ret;
                 }
                      }
                      REGTAIL(pRExC_state, ret, eval);
                      /* deal with the length of this later - MJD */
                     return ret;
                 }
-               ret = reganode(pRExC_state, EVAL, n);
+               ret = reg2Lanode(pRExC_state, EVAL, n, 0);
                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
                 Set_Node_Offset(ret, parse_start);
                 return ret;
                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
                 Set_Node_Offset(ret, parse_start);
                 return ret;
@@ -9939,6 +10104,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             case '(':           /* (?(?{...})...) and (?(?=...)...) */
             {
                 int is_define= 0;
             case '(':           /* (?(?{...})...) and (?(?=...)...) */
             {
                 int is_define= 0;
+                const int DEFINE_len = sizeof("DEFINE") - 1;
                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
                         || RExC_parse[1] == '<'
                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
                         || RExC_parse[1] == '<'
@@ -9981,15 +10147,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                      ret = reganode(pRExC_state,NGROUPP,num);
                      goto insert_if_check_paren;
                 }
                      ret = reganode(pRExC_state,NGROUPP,num);
                      goto insert_if_check_paren;
                 }
-               else if (RExC_parse[0] == 'D' &&
-                        RExC_parse[1] == 'E' &&
-                        RExC_parse[2] == 'F' &&
-                        RExC_parse[3] == 'I' &&
-                        RExC_parse[4] == 'N' &&
-                        RExC_parse[5] == 'E')
-               {
+               else if (strnEQ(RExC_parse, "DEFINE",
+                                       MIN(DEFINE_len, RExC_end - RExC_parse)))
+                {
                     ret = reganode(pRExC_state,DEFINEP,0);
                     ret = reganode(pRExC_state,DEFINEP,0);
-                   RExC_parse +=6 ;
+                   RExC_parse += DEFINE_len;
                     is_define = 1;
                     goto insert_if_check_paren;
                 }
                     is_define = 1;
                     goto insert_if_check_paren;
                 }
@@ -10068,8 +10230,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     }
                     else
                         lastbr = NULL;
                     }
                     else
                         lastbr = NULL;
-                   if (c != ')')
-                       vFAIL("Switch (?(condition)... contains too many branches");
+                    if (c != ')') {
+                        if (RExC_parse>RExC_end)
+                            vFAIL("Switch (?(condition)... not terminated");
+                        else
+                            vFAIL("Switch (?(condition)... contains too many branches");
+                    }
                     ender = reg_node(pRExC_state, TAIL);
                      REGTAIL(pRExC_state, br, ender);
                     if (lastbr) {
                     ender = reg_node(pRExC_state, TAIL);
                      REGTAIL(pRExC_state, br, ender);
                     if (lastbr) {
@@ -10509,7 +10675,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              if (max < min) {    /* If can't match, warn and optimize to fail
                                     unconditionally */
                  if (SIZE_ONLY) {
              if (max < min) {    /* If can't match, warn and optimize to fail
                                     unconditionally */
                  if (SIZE_ONLY) {
-                    ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
  
                      /* We can't back off the size because we have to reserve
                       * enough space for all the things we are about to throw
  
                      /* We can't back off the size because we have to reserve
                       * enough space for all the things we are about to throw
@@ -10518,6 +10683,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
                  }
                  else {
                      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
                  }
                  else {
+                    ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
                      RExC_emit = orig_emit;
                  }
                  ret = reg_node(pRExC_state, OPFAIL);
                      RExC_emit = orig_emit;
                  }
                  ret = reg_node(pRExC_state, OPFAIL);
@@ -10527,7 +10693,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                       && RExC_parse < RExC_end
                       && (*RExC_parse == '?' || *RExC_parse == '+'))
              {
                       && RExC_parse < RExC_end
                       && (*RExC_parse == '?' || *RExC_parse == '+'))
              {
-                if (SIZE_ONLY) {
+                if (PASS2) {
                      ckWARN2reg(RExC_parse + 1,
                                 "Useless use of greediness modifier '%c'",
                                 *RExC_parse);
                      ckWARN2reg(RExC_parse + 1,
                                 "Useless use of greediness modifier '%c'",
                                 *RExC_parse);
@@ -10673,10 +10839,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
      return(ret);
  }
  
      return(ret);
  }
  
-STATIC bool
+STATIC STRLEN
  S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
  S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
-                      UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
-                      const bool strict   /* Apply stricter parsing rules? */
+                      UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
      )
  {
  
      )
  {
  
@@ -10684,46 +10849,75 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
     and needs to handle the rest. RExC_parse is expected to point at the first
     char following the N at the time of the call.  On successful return,
     RExC_parse has been updated to point to just after the sequence identified
     and needs to handle the rest. RExC_parse is expected to point at the first
     char following the N at the time of the call.  On successful return,
     RExC_parse has been updated to point to just after the sequence identified
-   by this routine, and <*flagp> has been updated.
-
-   The \N may be inside (indicated by the boolean <in_char_class>) or outside a
-   character class.
-
-   \N may begin either a named sequence, or if outside a character class, mean
-   to match a non-newline.  For non single-quoted regexes, the tokenizer has
-   attempted to decide which, and in the case of a named sequence, converted it
+   by this routine, <*flagp> has been updated, and the non-NULL input pointers
+   have been set appropriately.
+
+   The typical case for this is \N{some character name}.  This is usually
+   called while parsing the input, filling in or ready to fill in an EXACTish
+   node, and the code point for the character should be returned, so that it
+   can be added to the node, and parsing continued with the next input
+   character.  But it may be that instead of a single character the \N{}
+   expands to more than one, a named sequence.  In this case any following
+   quantifier applies to the whole sequence, and it is easier, given the code
+   structure that calls this, to handle it from a different area of the code.
+   For this reason, the input parameters can be set so that it returns valid
+   only on one or the other of these cases.
+
+   Another possibility is for the input to be an empty \N{}, which for
+   backwards compatibility we accept, but generate a NOTHING node which should
+   later get optimized out.  This is handled from the area of code which can
+   handle a named sequence, so if called with the parameters for the other, it
+   fails.
+
+   Still another possibility is for the \N to mean [^\n], and not a single
+   character or explicit sequence at all.  This is determined by context.
+   Again, this is handled from the area of code which can handle a named
+   sequence, so if called with the parameters for the other, it also fails.
+
+   And the final possibility is for the \N to be called from within a bracketed
+   character class.  In this case the [^\n] meaning makes no sense, and so is
+   an error.  Other anomalous situations are left to the calling code to handle.
+
+   For non-single-quoted regexes, the tokenizer has attempted to decide which
+   of the above applies, and in the case of a named sequence, has converted it
     into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
     where c1... are the characters in the sequence.  For single-quoted regexes,
     the tokenizer passes the \N sequence through unchanged; this code will not
     attempt to determine this nor expand those, instead raising a syntax error.
     The net effect is that if the beginning of the passed-in pattern isn't '{U+'
     or there is no '}', it signals that this \N occurrence means to match a
     into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
     where c1... are the characters in the sequence.  For single-quoted regexes,
     the tokenizer passes the \N sequence through unchanged; this code will not
     attempt to determine this nor expand those, instead raising a syntax error.
     The net effect is that if the beginning of the passed-in pattern isn't '{U+'
     or there is no '}', it signals that this \N occurrence means to match a
-   non-newline.
+   non-newline. (This mostly was done because of [perl #56444].)
  
  
-   Only the \N{U+...} form should occur in a character class, for the same
-   reason that '.' inside a character class means to just match a period: it
-   just doesn't make sense.
+   The API is somewhat convoluted due to historical and the above reasons.
  
     The function raises an error (via vFAIL), and doesn't return for various
  
     The function raises an error (via vFAIL), and doesn't return for various
-   syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
-   success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
-   RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
-   only possible if node_p is non-NULL.
-
+   syntax errors.  For other failures, it returns (STRLEN) -1.  For successes,
+   it returns a count of how many characters were accounted for by it.  (This
+   can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
+   points in the sequence.  It sets <node_p>, <valuep>, and/or
+   <substitute_parse> on success.
  
     If <valuep> is non-null, it means the caller can accept an input sequence
  
     If <valuep> is non-null, it means the caller can accept an input sequence
-   consisting of a just a single code point; <*valuep> is set to that value
-   if the input is such.
-
-   If <node_p> is non-null it signifies that the caller can accept any other
-   legal sequence (i.e., one that isn't just a single code point).  <*node_p>
-   is set as follows:
-    1) \N means not-a-NL: points to a newly created REG_ANY node;
-    2) \N{}:              points to a new NOTHING node;
+   consisting of a just a single code point; <*valuep> is set to the value
+   of the only or first code point in the input.
+
+   If <substitute_parse> is non-null, it means the caller can accept an input
+   sequence consisting of one or more code points; <*substitute_parse> is a
+   newly created mortal SV* in this case, containing \x{} escapes representing
+   those code points.
+
+   Both <valuep> and <substitute_parse> can be non-NULL.
+
+   If <node_p> is non-null, <substitute_parse> must be NULL.  This signifies
+   that the caller can accept any legal sequence other than a single code
+   point.  To wit, <*node_p> is set as follows:
+    1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
+    2) \N{}:              points to a new NOTHING node; return is 0
      3) otherwise:         points to a new EXACT node containing the resolved
      3) otherwise:         points to a new EXACT node containing the resolved
-                          string.
-   Note that FALSE is returned for single code point sequences if <valuep> is
-   null.
+                          string; return is the number of code points in the
+                          string.  This will never be 1.
+   Note that failure is returned for single code point sequences if <valuep> is
+   null and <node_p> is not.
   */
  
      char * endbrace;    /* '}' following the name */
   */
  
      char * endbrace;    /* '}' following the name */
@@ -10732,6 +10926,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
                             stream */
      bool has_multiple_chars; /* true if the input stream contains a sequence of
                                  more than one character */
                             stream */
      bool has_multiple_chars; /* true if the input stream contains a sequence of
                                  more than one character */
+    bool in_char_class = substitute_parse != NULL;
+    STRLEN count = 0;   /* Number of characters in this sequence */
  
      GET_RE_DEBUG_FLAGS_DECL;
  
  
      GET_RE_DEBUG_FLAGS_DECL;
  
@@ -10740,6 +10936,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
      GET_RE_DEBUG_FLAGS;
  
      assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
      GET_RE_DEBUG_FLAGS;
  
      assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
+    assert(! (node_p && substitute_parse)); /* At most 1 should be set */
  
      /* The [^\n] meaning of \N ignores spaces and comments under the /x
       * modifier.  The other meaning does not, so use a temporary until we find
  
      /* The [^\n] meaning of \N ignores spaces and comments under the /x
       * modifier.  The other meaning does not, so use a temporary until we find
@@ -10758,7 +10955,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
              if (in_char_class) {
                  vFAIL("\\N in a character class must be a named character: \\N{...}");
              }
              if (in_char_class) {
                  vFAIL("\\N in a character class must be a named character: \\N{...}");
              }
-            return FALSE;
+            return (STRLEN) -1;
          }
          RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
                             current char */
          }
          RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
                             current char */
@@ -10767,7 +10964,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
         *flagp |= HASWIDTH|SIMPLE;
         RExC_naughty++;
          Set_Node_Length(*node_p, 1); /* MJD */
         *flagp |= HASWIDTH|SIMPLE;
         RExC_naughty++;
          Set_Node_Length(*node_p, 1); /* MJD */
-       return TRUE;
+       return 1;
      }
  
      /* Here, we have decided it should be a named character or sequence */
      }
  
      /* Here, we have decided it should be a named character or sequence */
@@ -10794,28 +10991,14 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
      }
  
      if (endbrace == RExC_parse) {   /* empty: \N{} */
      }
  
      if (endbrace == RExC_parse) {   /* empty: \N{} */
-        bool ret = TRUE;
         if (node_p) {
             *node_p = reg_node(pRExC_state,NOTHING);
         }
         if (node_p) {
             *node_p = reg_node(pRExC_state,NOTHING);
         }
-        else if (in_char_class) {
-            if (SIZE_ONLY && in_char_class) {
-                if (strict) {
-                    RExC_parse++;   /* Position after the "}" */
-                    vFAIL("Zero length \\N{}");
-                }
-                else {
-                    ckWARNreg(RExC_parse,
-                              "Ignoring zero length \\N{} in character class");
-                }
-            }
-            ret = FALSE;
-       }
-        else {
-            return FALSE;
+        else if (! in_char_class) {
+            return (STRLEN) -1;
          }
          nextchar(pRExC_state);
          }
          nextchar(pRExC_state);
-        return ret;
+        return 0;
      }
  
      RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
      }
  
      RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
@@ -10827,90 +11010,103 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
       * point, and is terminated by the brace */
      has_multiple_chars = (endchar < endbrace);
  
       * point, and is terminated by the brace */
      has_multiple_chars = (endchar < endbrace);
  
-    if (valuep && (! has_multiple_chars || in_char_class)) {
-       /* We only pay attention to the first char of
-        multichar strings being returned in char classes. I kinda wonder
-       if this makes sense as it does change the behaviour
-       from earlier versions, OTOH that behaviour was broken
-       as well. XXX Solution is to recharacterize as
-       [rest-of-class]|multi1|multi2... */
-
+    /* We get the first code point if we want it, and either there is only one,
+     * or we can accept both cases of one and more than one */
+    if (valuep && (substitute_parse || ! has_multiple_chars)) {
         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
-           | PERL_SCAN_DISALLOW_PREFIX
-           | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
+                           | PERL_SCAN_DISALLOW_PREFIX
+
+                             /* No errors in the first pass (See [perl
+                              * #122671].)  We let the code below find the
+                              * errors when there are multiple chars. */
+                           | ((SIZE_ONLY || has_multiple_chars)
+                              ? PERL_SCAN_SILENT_ILLDIGIT
+                              : 0);
  
         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
  
         /* The tokenizer should have guaranteed validity, but it's possible to
  
         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
  
         /* The tokenizer should have guaranteed validity, but it's possible to
-        * bypass it by using single quoting, so check */
-       if (length_of_hex == 0
-           || length_of_hex != (STRLEN)(endchar - RExC_parse) )
-       {
-           RExC_parse += length_of_hex;        /* Includes all the valid */
-           RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
-                           ? UTF8SKIP(RExC_parse)
-                           : 1;
-           /* Guard against malformed utf8 */
-           if (RExC_parse >= endchar) {
-                RExC_parse = endchar;
+         * bypass it by using single quoting, so check.  Don't do the check
+         * here when there are multiple chars; we do it below anyway. */
+        if (! has_multiple_chars) {
+            if (length_of_hex == 0
+                || length_of_hex != (STRLEN)(endchar - RExC_parse) )
+            {
+                RExC_parse += length_of_hex;   /* Includes all the valid */
+                RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
+                                ? UTF8SKIP(RExC_parse)
+                                : 1;
+                /* Guard against malformed utf8 */
+                if (RExC_parse >= endchar) {
+                    RExC_parse = endchar;
+                }
+                vFAIL("Invalid hexadecimal number in \\N{U+...}");
              }
              }
-           vFAIL("Invalid hexadecimal number in \\N{U+...}");
-       }
  
  
-        if (in_char_class && has_multiple_chars) {
-            if (strict) {
-                RExC_parse = endbrace;
-                vFAIL("\\N{} in character class restricted to one character");
-            }
-            else {
-                ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
-            }
+            RExC_parse = endbrace + 1;
+            return 1;
          }
          }
-
-        RExC_parse = endbrace + 1;
      }
      }
-    else if (! node_p || ! has_multiple_chars) {
  
  
-        /* Here, the input is legal, but not according to the caller's
-         * options.  We fail without advancing the parse, so that the
-         * caller can try again */
+    /* Here, we should have already handled the case where a single character
+     * is expected and found.  So it is a failure if we aren't expecting
+     * multiple chars and got them; or didn't get them but wanted them.  We
+     * fail without advancing the parse, so that the caller can try again with
+     * different acceptance criteria */
+    if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
          RExC_parse = p;
          RExC_parse = p;
-        return FALSE;
+        return (STRLEN) -1;
      }
      }
-    else {
+
+    {
  
         /* What is done here is to convert this to a sub-pattern of the form
  
         /* What is done here is to convert this to a sub-pattern of the form
-        * (?:\x{char1}\x{char2}...)
-        * and then call reg recursively.  That way, it retains its atomicness,
-        * while not having to worry about special handling that some code
-        * points may have.  toke.c has converted the original Unicode values
-        * to native, so that we can just pass on the hex values unchanged.  We
-        * do have to set a flag to keep recoding from happening in the
-        * recursion */
-
-       SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
+        * \x{char1}\x{char2}...
+         * and then either return it in <*substitute_parse> if non-null; or
+         * call reg recursively to parse it (enclosing in "(?: ... )" ).  That
+         * way, it retains its atomicness, while not having to worry about
+         * special handling that some code points may have.  toke.c has
+         * converted the original Unicode values to native, so that we can just
+         * pass on the hex values unchanged.  We do have to set a flag to keep
+         * recoding from happening in the recursion */
+
+       SV * dummy = NULL;
         STRLEN len;
         char *orig_end = RExC_end;
          I32 flags;
  
         STRLEN len;
         char *orig_end = RExC_end;
          I32 flags;
  
+        if (substitute_parse) {
+            *substitute_parse = newSVpvs("");
+        }
+        else {
+            substitute_parse = &dummy;
+            *substitute_parse = newSVpvs("?:");
+        }
+        *substitute_parse = sv_2mortal(*substitute_parse);
+
         while (RExC_parse < endbrace) {
  
             /* Convert to notation the rest of the code understands */
         while (RExC_parse < endbrace) {
  
             /* Convert to notation the rest of the code understands */
-           sv_catpv(substitute_parse, "\\x{");
-           sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
-           sv_catpv(substitute_parse, "}");
+           sv_catpv(*substitute_parse, "\\x{");
+           sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
+           sv_catpv(*substitute_parse, "}");
  
             /* Point to the beginning of the next character in the sequence. */
             RExC_parse = endchar + 1;
             endchar = RExC_parse + strcspn(RExC_parse, ".}");
  
             /* Point to the beginning of the next character in the sequence. */
             RExC_parse = endchar + 1;
             endchar = RExC_parse + strcspn(RExC_parse, ".}");
+
+            count++;
         }
         }
-       sv_catpv(substitute_parse, ")");
+        if (! in_char_class) {
+            sv_catpv(*substitute_parse, ")");
+        }
  
  
-       RExC_parse = SvPV(substitute_parse, len);
+       RExC_parse = SvPV(*substitute_parse, len);
  
         /* Don't allow empty number */
  
         /* Don't allow empty number */
-       if (len < 8) {
+       if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
+            RExC_parse = endbrace;
             vFAIL("Invalid hexadecimal number in \\N{U+...}");
         }
         RExC_end = RExC_parse + len;
             vFAIL("Invalid hexadecimal number in \\N{U+...}");
         }
         RExC_end = RExC_parse + len;
@@ -10918,15 +11114,17 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
         /* The values are Unicode, and therefore not subject to recoding */
         RExC_override_recoding = 1;
  
         /* The values are Unicode, and therefore not subject to recoding */
         RExC_override_recoding = 1;
  
-       if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
-            if (flags & RESTART_UTF8) {
-                *flagp = RESTART_UTF8;
-                return FALSE;
+        if (node_p) {
+            if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
+                if (flags & RESTART_UTF8) {
+                    *flagp = RESTART_UTF8;
+                    return (STRLEN) -1;
+                }
+                FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
+                    (UV) flags);
              }
              }
-            FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
-                  (UV) flags);
+            *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
          }
          }
-       *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
  
         RExC_parse = endbrace;
         RExC_end = orig_end;
  
         RExC_parse = endbrace;
         RExC_end = orig_end;
@@ -10935,7 +11133,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
          nextchar(pRExC_state);
      }
  
          nextchar(pRExC_state);
      }
  
-    return TRUE;
+    return count;
  }
  
  
  }
  
  
@@ -11041,15 +11239,21 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
                  if (LOC || ! FOLD) {    /* /l defers folding until runtime */
                      *character = (U8) code_point;
                  }
                  if (LOC || ! FOLD) {    /* /l defers folding until runtime */
                      *character = (U8) code_point;
                  }
-                else { /* Here is /i and not /l (toFOLD() is defined on just
+                else { /* Here is /i and not /l. (toFOLD() is defined on just
                            ASCII, which isn't the same thing as INVARIANT on
                            EBCDIC, but it works there, as the extra invariants
                            fold to themselves) */
                      *character = toFOLD((U8) code_point);
                            ASCII, which isn't the same thing as INVARIANT on
                            EBCDIC, but it works there, as the extra invariants
                            fold to themselves) */
                      *character = toFOLD((U8) code_point);
-                    if (downgradable
-                        && *character == code_point
-                        && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
-                    {
+
+                    /* We can downgrade to an EXACT node if this character
+                     * isn't a folding one.  Note that this assumes that
+                     * nothing above Latin1 folds to some other invariant than
+                     * one of these alphabetics; otherwise we would also have
+                     * to check:
+                     *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
+                     *      || ASCII_FOLD_RESTRICTED))
+                     */
+                    if (downgradable && PL_fold[code_point] == code_point) {
                          OP(node) = EXACT;
                      }
                  }
                          OP(node) = EXACT;
                      }
                  }
@@ -11066,7 +11270,10 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
                                                        ? FOLD_FLAGS_NOMIX_ASCII
                                                        : 0));
                  if (downgradable
                                                        ? FOLD_FLAGS_NOMIX_ASCII
                                                        : 0));
                  if (downgradable
-                    && folded == code_point
+                    && folded == code_point /* This quickly rules out many
+                                               cases, avoiding the
+                                               _invlist_contains_cp() overhead
+                                               for those.  */
                      && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
                  {
                      OP(node) = EXACT;
                      && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
                  {
                      OP(node) = EXACT;
@@ -11251,10 +11458,8 @@ tryagain:
         nextchar(pRExC_state);
         if (RExC_flags & RXf_PMf_MULTILINE)
             ret = reg_node(pRExC_state, MBOL);
         nextchar(pRExC_state);
         if (RExC_flags & RXf_PMf_MULTILINE)
             ret = reg_node(pRExC_state, MBOL);
-       else if (RExC_flags & RXf_PMf_SINGLELINE)
-           ret = reg_node(pRExC_state, SBOL);
         else
         else
-           ret = reg_node(pRExC_state, BOL);
+           ret = reg_node(pRExC_state, SBOL);
          Set_Node_Length(ret, 1); /* MJD */
         break;
      case '$':
          Set_Node_Length(ret, 1); /* MJD */
         break;
      case '$':
@@ -11263,10 +11468,8 @@ tryagain:
             RExC_seen_zerolen++;
         if (RExC_flags & RXf_PMf_MULTILINE)
             ret = reg_node(pRExC_state, MEOL);
             RExC_seen_zerolen++;
         if (RExC_flags & RXf_PMf_MULTILINE)
             ret = reg_node(pRExC_state, MEOL);
-       else if (RExC_flags & RXf_PMf_SINGLELINE)
-           ret = reg_node(pRExC_state, SEOL);
         else
         else
-           ret = reg_node(pRExC_state, EOL);
+           ret = reg_node(pRExC_state, SEOL);
          Set_Node_Length(ret, 1); /* MJD */
         break;
      case '.':
          Set_Node_Length(ret, 1); /* MJD */
         break;
      case '.':
@@ -11355,6 +11558,11 @@ tryagain:
         case 'A':
             RExC_seen_zerolen++;
             ret = reg_node(pRExC_state, SBOL);
         case 'A':
             RExC_seen_zerolen++;
             ret = reg_node(pRExC_state, SBOL);
+            /* SBOL is shared with /^/ so we set the flags so we can tell
+             * /\A/ from /^/ in split. We check ret because first pass we
+             * have no regop struct to set the flags on. */
+            if (PASS2)
+                ret->flags = 1;
             *flagp |= SIMPLE;
             goto finish_meta_pat;
         case 'G':
             *flagp |= SIMPLE;
             goto finish_meta_pat;
         case 'G':
@@ -11386,7 +11594,7 @@ tryagain:
             ret = reg_node(pRExC_state, CANY);
              RExC_seen |= REG_CANY_SEEN;
             *flagp |= HASWIDTH|SIMPLE;
             ret = reg_node(pRExC_state, CANY);
              RExC_seen |= REG_CANY_SEEN;
             *flagp |= HASWIDTH|SIMPLE;
-            if (SIZE_ONLY) {
+            if (PASS2) {
                  ckWARNdep(RExC_parse+1, "\\C is deprecated");
              }
             goto finish_meta_pat;
                  ckWARNdep(RExC_parse+1, "\\C is deprecated");
              }
             goto finish_meta_pat;
@@ -11415,7 +11623,7 @@ tryagain:
             ret = reg_node(pRExC_state, op);
             FLAGS(ret) = get_regex_charset(RExC_flags);
             *flagp |= SIMPLE;
             ret = reg_node(pRExC_state, op);
             FLAGS(ret) = get_regex_charset(RExC_flags);
             *flagp |= SIMPLE;
-           if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
+           if ((U8) *(RExC_parse + 1) == '{') {
                  /* diag_listed_as: Use "%s" instead of "%s" */
                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
             }
                  /* diag_listed_as: Use "%s" instead of "%s" */
                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
             }
@@ -11433,7 +11641,7 @@ tryagain:
             ret = reg_node(pRExC_state, op);
             FLAGS(ret) = get_regex_charset(RExC_flags);
             *flagp |= SIMPLE;
             ret = reg_node(pRExC_state, op);
             FLAGS(ret) = get_regex_charset(RExC_flags);
             *flagp |= SIMPLE;
-           if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
+           if ((U8) *(RExC_parse + 1) == '{') {
                  /* diag_listed_as: Use "%s" instead of "%s" */
                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
             }
                  /* diag_listed_as: Use "%s" instead of "%s" */
                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
             }
@@ -11542,8 +11750,9 @@ tryagain:
               * special treatment for quantifiers is not needed for such single
               * character sequences */
              ++RExC_parse;
               * special treatment for quantifiers is not needed for such single
               * character sequences */
              ++RExC_parse;
-            if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
-                                FALSE /* not strict */ )) {
+            if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
+                                             depth, FALSE))
+            {
                  if (*flagp & RESTART_UTF8)
                      return NULL;
                  RExC_parse--;
                  if (*flagp & RESTART_UTF8)
                      return NULL;
                  RExC_parse--;
@@ -11844,10 +12053,12 @@ tryagain:
                           * point sequence.  Handle those in the switch() above
                           * */
                          RExC_parse = p + 1;
                           * point sequence.  Handle those in the switch() above
                           * */
                          RExC_parse = p + 1;
-                        if (! grok_bslash_N(pRExC_state, NULL, &ender,
-                                            flagp, depth, FALSE,
-                                            FALSE /* not strict */ ))
-                        {
+                        if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
+                                                         &ender,
+                                                         flagp,
+                                                         depth,
+                                                         FALSE
+                        )) {
                              if (*flagp & RESTART_UTF8)
                                  FAIL("panic: grok_bslash_N set RESTART_UTF8");
                              RExC_parse = p = oldp;
                              if (*flagp & RESTART_UTF8)
                                  FAIL("panic: grok_bslash_N set RESTART_UTF8");
                              RExC_parse = p = oldp;
@@ -11886,7 +12097,7 @@ tryagain:
                             bool valid = grok_bslash_o(&p,
                                                        &result,
                                                        &error_msg,
                             bool valid = grok_bslash_o(&p,
                                                        &result,
                                                        &error_msg,
-                                                      TRUE, /* out warnings */
+                                                      PASS2, /* out warnings */
                                                         FALSE, /* not strict */
                                                         TRUE, /* Output warnings
                                                                  for non-
                                                         FALSE, /* not strict */
                                                         TRUE, /* Output warnings
                                                                  for non-
@@ -11915,7 +12126,7 @@ tryagain:
                             bool valid = grok_bslash_x(&p,
                                                        &result,
                                                        &error_msg,
                             bool valid = grok_bslash_x(&p,
                                                        &result,
                                                        &error_msg,
-                                                      TRUE, /* out warnings */
+                                                      PASS2, /* out warnings */
                                                         FALSE, /* not strict */
                                                         TRUE, /* Output warnings
                                                                  for non-
                                                         FALSE, /* not strict */
                                                         TRUE, /* Output warnings
                                                                  for non-
@@ -11938,7 +12149,7 @@ tryagain:
                         }
                     case 'c':
                         p++;
                         }
                     case 'c':
                         p++;
-                       ender = grok_bslash_c(*p++, SIZE_ONLY);
+                       ender = grok_bslash_c(*p++, PASS2);
                         break;
                      case '8': case '9': /* must be a backreference */
                          --p;
                         break;
                      case '8': case '9': /* must be a backreference */
                          --p;
@@ -11977,7 +12188,7 @@ tryagain:
                                 REQUIRE_UTF8;
                             }
                             p += numlen;
                                 REQUIRE_UTF8;
                             }
                             p += numlen;
-                            if (SIZE_ONLY   /* like \08, \178 */
+                            if (PASS2   /* like \08, \178 */
                                  && numlen < 3
                                  && p < RExC_end
                                  && isDIGIT(*p) && ckWARN(WARN_REGEXP))
                                  && numlen < 3
                                  && p < RExC_end
                                  && isDIGIT(*p) && ckWARN(WARN_REGEXP))
@@ -11994,7 +12205,7 @@ tryagain:
                         if (! RExC_override_recoding) {
                             SV* enc = PL_encoding;
                             ender = reg_recode((const char)(U8)ender, &enc);
                         if (! RExC_override_recoding) {
                             SV* enc = PL_encoding;
                             ender = reg_recode((const char)(U8)ender, &enc);
-                           if (!enc && SIZE_ONLY)
+                           if (!enc && PASS2)
                                 ckWARNreg(p, "Invalid escape in the specified encoding");
                             REQUIRE_UTF8;
                         }
                                 ckWARNreg(p, "Invalid escape in the specified encoding");
                             REQUIRE_UTF8;
                         }
@@ -12445,11 +12656,11 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
             UV high;
             int i;
  
             UV high;
             int i;
  
-            if (end == UV_MAX && start <= 256) {
-                ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
+            if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
+                ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
              }
              }
-            else if (end >= 256) {
-                ANYOF_FLAGS(node) |= ANYOF_UTF8;
+            else if (end >= NUM_ANYOF_CODE_POINTS) {
+                ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
              }
  
             /* Quit if are above what we should change */
              }
  
             /* Quit if are above what we should change */
@@ -12472,13 +12683,13 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
         invlist_iterfinish(*invlist_ptr);
  
          /* Done with loop; remove any code points that are in the bitmap from
         invlist_iterfinish(*invlist_ptr);
  
          /* Done with loop; remove any code points that are in the bitmap from
-         * *invlist_ptr; similarly for code points above latin1 if we have a
-         * flag to match all of them anyways */
+         * *invlist_ptr; similarly for code points above the bitmap if we have
+         * a flag to match all of them anyways */
         if (change_invlist) {
         if (change_invlist) {
-           _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
+           _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
         }
         }
-        if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
-           _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
+        if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
+           _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
         }
  
         /* If have completely emptied it, remove it completely */
         }
  
         /* If have completely emptied it, remove it completely */
@@ -12728,9 +12939,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
       * upon an unescaped ']' that isn't one ending a regclass.  To do both
       * these things, we need to realize that something preceded by a backslash
       * is escaped, so we have to keep track of backslashes */
       * upon an unescaped ']' that isn't one ending a regclass.  To do both
       * these things, we need to realize that something preceded by a backslash
       * is escaped, so we have to keep track of backslashes */
-    if (SIZE_ONLY) {
-        UV depth = 0; /* how many nested (?[...]) constructs */
-
+    if (PASS2) {
          Perl_ck_warner_d(aTHX_
              packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
              "The regex_sets feature is experimental" REPORT_LOCATION,
          Perl_ck_warner_d(aTHX_
              packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
              "The regex_sets feature is experimental" REPORT_LOCATION,
@@ -12738,6 +12947,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
                  UTF8fARG(UTF,
                           RExC_end - RExC_start - (RExC_parse - RExC_precomp),
                           RExC_precomp + (RExC_parse - RExC_precomp)));
                  UTF8fARG(UTF,
                           RExC_end - RExC_start - (RExC_parse - RExC_precomp),
                           RExC_precomp + (RExC_parse - RExC_precomp)));
+    }
+    else {
+        UV depth = 0; /* how many nested (?[...]) constructs */
  
          while (RExC_parse < RExC_end) {
              SV* current = NULL;
  
          while (RExC_parse < RExC_end) {
              SV* current = NULL;
@@ -13238,11 +13450,60 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl
          default:
              /* Use deprecated warning to increase the chances of this being
               * output */
          default:
              /* Use deprecated warning to increase the chances of this being
               * output */
-            ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
+            if (PASS2) {
+                ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
+            }
              break;
      }
  }
  
              break;
      }
  }
  
+STATIC AV *
+S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
+{
+    /* This adds the string scalar <multi_string> to the array
+     * <multi_char_matches>.  <multi_string> is known to have exactly
+     * <cp_count> code points in it.  This is used when constructing a
+     * bracketed character class and we find something that needs to match more
+     * than a single character.
+     *
+     * <multi_char_matches> is actually an array of arrays.  Each top-level
+     * element is an array that contains all the strings known so far that are
+     * the same length.  And that length (in number of code points) is the same
+     * as the index of the top-level array.  Hence, the [2] element is an
+     * array, each element thereof is a string containing TWO code points;
+     * while element [3] is for strings of THREE characters, and so on.  Since
+     * this is for multi-char strings there can never be a [0] nor [1] element.
+     *
+     * When we rewrite the character class below, we will do so such that the
+     * longest strings are written first, so that it prefers the longest
+     * matching strings first.  This is done even if it turns out that any
+     * quantifier is non-greedy, out of this programmer's (khw) laziness.  Tom
+     * Christiansen has agreed that this is ok.  This makes the test for the
+     * ligature 'ffi' come before the test for 'ff', for example */
+
+    AV* this_array;
+    AV** this_array_ptr;
+
+    PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
+
+    if (! multi_char_matches) {
+        multi_char_matches = newAV();
+    }
+
+    if (av_exists(multi_char_matches, cp_count)) {
+        this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
+        this_array = *this_array_ptr;
+    }
+    else {
+        this_array = newAV();
+        av_store(multi_char_matches, cp_count,
+                 (SV*) this_array);
+    }
+    av_push(this_array, multi_string);
+
+    return multi_char_matches;
+}
+
  /* The names of properties whose definitions are not known at compile time are
   * stored in this SV, after a constant heading.  So if the length has been
   * changed since initialization, then there is a run-time definition. */
  /* The names of properties whose definitions are not known at compile time are
   * stored in this SV, after a constant heading.  So if the length has been
   * changed since initialization, then there is a run-time definition. */
@@ -13425,7 +13686,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      if (UCHARAT(RExC_parse) == ']')
         goto charclassloop;
  
      if (UCHARAT(RExC_parse) == ']')
         goto charclassloop;
  
-parseit:
      while (1) {
          if  (RExC_parse >= stop_ptr) {
              break;
      while (1) {
          if  (RExC_parse >= stop_ptr) {
              break;
@@ -13472,7 +13732,7 @@ parseit:
          }
          else {
              /* Is a backslash; get the code point of the char after it */
          }
          else {
              /* Is a backslash; get the code point of the char after it */
-           if (UTF && ! UTF8_IS_INVARIANT(RExC_parse)) {
+           if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
                 value = utf8n_to_uvchr((U8*)RExC_parse,
                                    RExC_end - RExC_parse,
                                    &numlen, UTF8_ALLOW_DEFAULT);
                 value = utf8n_to_uvchr((U8*)RExC_parse,
                                    RExC_end - RExC_parse,
                                    &numlen, UTF8_ALLOW_DEFAULT);
@@ -13505,19 +13765,58 @@ parseit:
             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
              case 'N':  /* Handle \N{NAME} in class */
                  {
             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
              case 'N':  /* Handle \N{NAME} in class */
                  {
-                    /* We only pay attention to the first char of
-                    multichar strings being returned. I kinda wonder
-                    if this makes sense as it does change the behaviour
-                    from earlier versions, OTOH that behaviour was broken
-                    as well. */
-                    if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
-                                      TRUE, /* => charclass */
-                                      strict))
-                    {
-                        if (*flagp & RESTART_UTF8)
-                            FAIL("panic: grok_bslash_N set RESTART_UTF8");
-                        goto parseit;
+                    SV *as_text;
+                    STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
+                                                    flagp, depth, &as_text);
+                    if (*flagp & RESTART_UTF8)
+                        FAIL("panic: grok_bslash_N set RESTART_UTF8");
+                    if (cp_count != 1) {    /* The typical case drops through */
+                        assert(cp_count != (STRLEN) -1);
+                        if (cp_count == 0) {
+                            if (strict) {
+                                RExC_parse++;   /* Position after the "}" */
+                                vFAIL("Zero length \\N{}");
+                            }
+                            else if (PASS2) {
+                                ckWARNreg(RExC_parse,
+                                        "Ignoring zero length \\N{} in character class");
+                            }
+                        }
+                        else { /* cp_count > 1 */
+                            if (! RExC_in_multi_char_class) {
+                                if (invert || range || *RExC_parse == '-') {
+                                    if (strict) {
+                                        RExC_parse--;
+                                        vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
+                                    }
+                                    else if (PASS2) {
+                                        ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
+                                    }
+                                }
+                                else {
+                                    multi_char_matches
+                                        = add_multi_match(multi_char_matches,
+                                                          as_text,
+                                                          cp_count);
+                                }
+                                break; /* <value> contains the first code
+                                          point. Drop out of the switch to
+                                          process it */
+                            }
+                        } /* End of cp_count != 1 */
+
+                        /* This element should not be processed further in this
+                         * class */
+                        element_count--;
+                        value = save_value;
+                        prevvalue = save_prevvalue;
+                        continue;   /* Back to top of loop to get next char */
                      }
                      }
+                    /* Here, is a single code point, and <value> contains it */
+#ifdef EBCDIC
+                    /* We consider named characters to be literal characters */
+                    literal_endpoint++;
+#endif
                  }
                  break;
             case 'p':
                  }
                  break;
             case 'p':
@@ -13640,7 +13939,8 @@ parseit:
                           * inappropriately, except that any \p{}, including
                           * this one forces Unicode semantics, which means there
                           * is no <depends_list> */
                           * inappropriately, except that any \p{}, including
                           * this one forces Unicode semantics, which means there
                           * is no <depends_list> */
-                        ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
+                        ANYOF_FLAGS(ret)
+                                      |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
                      }
                      else {
  
                      }
                      else {
  
@@ -13705,8 +14005,8 @@ parseit:
                     bool valid = grok_bslash_o(&RExC_parse,
                                                &value,
                                                &error_msg,
                     bool valid = grok_bslash_o(&RExC_parse,
                                                &value,
                                                &error_msg,
-                                               SIZE_ONLY,   /* warnings in pass
-                                                               1 only */
+                                               PASS2,   /* warnings only in
+                                                           pass 2 */
                                                 strict,
                                                 silence_non_portable,
                                                 UTF);
                                                 strict,
                                                 silence_non_portable,
                                                 UTF);
@@ -13725,7 +14025,7 @@ parseit:
                     bool valid = grok_bslash_x(&RExC_parse,
                                                &value,
                                                &error_msg,
                     bool valid = grok_bslash_x(&RExC_parse,
                                                &value,
                                                &error_msg,
-                                              TRUE, /* Output warnings */
+                                              PASS2, /* Output warnings */
                                                 strict,
                                                 silence_non_portable,
                                                 UTF);
                                                 strict,
                                                 silence_non_portable,
                                                 UTF);
@@ -13737,7 +14037,7 @@ parseit:
                     goto recode_encoding;
                 break;
             case 'c':
                     goto recode_encoding;
                 break;
             case 'c':
-               value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
+               value = grok_bslash_c(*RExC_parse++, PASS2);
                 break;
             case '0': case '1': case '2': case '3': case '4':
             case '5': case '6': case '7':
                 break;
             case '0': case '1': case '2': case '3': case '4':
             case '5': case '6': case '7':
@@ -13777,7 +14077,7 @@ parseit:
                          if (strict) {
                              vFAIL("Invalid escape in the specified encoding");
                          }
                          if (strict) {
                              vFAIL("Invalid escape in the specified encoding");
                          }
-                        else if (SIZE_ONLY) {
+                        else if (PASS2) {
                              ckWARNreg(RExC_parse,
                                   "Invalid escape in the specified encoding");
                          }
                              ckWARNreg(RExC_parse,
                                   "Invalid escape in the specified encoding");
                          }
@@ -13859,18 +14159,18 @@ parseit:
                      else {
                          RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
                      }
                      else {
                          RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
                      }
-                    ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
+                    ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
                      ANYOF_POSIXL_ZERO(ret);
                  }
  
                  /* Coverity thinks it is possible for this to be negative; both
                   * jhi and khw think it's not, but be safer */
                      ANYOF_POSIXL_ZERO(ret);
                  }
  
                  /* Coverity thinks it is possible for this to be negative; both
                   * jhi and khw think it's not, but be safer */
-                assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
+                assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
                         || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
  
                  /* See if it already matches the complement of this POSIX
                   * class */
                         || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
  
                  /* See if it already matches the complement of this POSIX
                   * class */
-                if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
+                if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
                      && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
                                                              ? -1
                                                              : 1)))
                      && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
                                                              ? -1
                                                              : 1)))
@@ -13955,22 +14255,23 @@ parseit:
                                                       namedclass % 2 != 0,
                                                       posixes_ptr);
                  }
                                                       namedclass % 2 != 0,
                                                       posixes_ptr);
                  }
-                continue;   /* Go get next character */
             }
         } /* end of namedclass \blah */
  
             }
         } /* end of namedclass \blah */
  
-        /* Here, we have a single value.  If 'range' is set, it is the ending
-         * of a range--check its validity.  Later, we will handle each
-         * individual code point in the range.  If 'range' isn't set, this
-         * could be the beginning of a range, so check for that by looking
-         * ahead to see if the next real character to be processed is the range
-         * indicator--the minus sign */
-
          if (skip_white) {
              RExC_parse = regpatws(pRExC_state, RExC_parse,
                                  FALSE /* means don't recognize comments */ );
          }
  
          if (skip_white) {
              RExC_parse = regpatws(pRExC_state, RExC_parse,
                                  FALSE /* means don't recognize comments */ );
          }
  
+        /* If 'range' is set, 'value' is the ending of a range--check its
+         * validity.  (If value isn't a single code point in the case of a
+         * range, we should have figured that out above in the code that
+         * catches false ranges).  Later, we will handle each individual code
+         * point in the range.  If 'range' isn't set, this could be the
+         * beginning of a range, so check for that by looking ahead to see if
+         * the next real character to be processed is the range indicator--the
+         * minus sign */
+
         if (range) {
             if (prevvalue > value) /* b-a */ {
                 const int w = RExC_parse - rangebegin;
         if (range) {
             if (prevvalue > value) /* b-a */ {
                 const int w = RExC_parse - rangebegin;
@@ -14000,15 +14301,15 @@ parseit:
  
                      /* a bad range like \w-, [:word:]- ? */
                      if (namedclass > OOB_NAMEDCLASS) {
  
                      /* a bad range like \w-, [:word:]- ? */
                      if (namedclass > OOB_NAMEDCLASS) {
-                        if (strict || ckWARN(WARN_REGEXP)) {
-                            const int w =
-                                RExC_parse >= rangebegin ?
-                                RExC_parse - rangebegin : 0;
+                        if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
+                            const int w = RExC_parse >= rangebegin
+                                          ?  RExC_parse - rangebegin
+                                          : 0;
                              if (strict) {
                                  vFAIL4("False [] range \"%*.*s\"",
                                      w, w, rangebegin);
                              }
                              if (strict) {
                                  vFAIL4("False [] range \"%*.*s\"",
                                      w, w, rangebegin);
                              }
-                            else {
+                            else if (PASS2) {
                                  vWARN4(RExC_parse,
                                      "False [] range \"%*.*s\"",
                                      w, w, rangebegin);
                                  vWARN4(RExC_parse,
                                      "False [] range \"%*.*s\"",
                                      w, w, rangebegin);
@@ -14025,8 +14326,12 @@ parseit:
             }
         }
  
             }
         }
  
-        /* Here, <prevvalue> is the beginning of the range, if any; or <value>
-         * if not */
+        if (namedclass > OOB_NAMEDCLASS) {
+            continue;
+        }
+
+        /* Here, we have a single value, and <prevvalue> is the beginning of
+         * the range, if any; or <value> if not */
  
         /* non-Latin1 code point implies unicode semantics.  Must be set in
          * pass1 so is there for the whole of pass 2 */
  
         /* non-Latin1 code point implies unicode semantics.  Must be set in
          * pass1 so is there for the whole of pass 2 */
@@ -14074,44 +14379,17 @@ parseit:
                       * again.  Otherwise add this character to the list of
                       * multi-char folds. */
                      if (! RExC_in_multi_char_class) {
                       * again.  Otherwise add this character to the list of
                       * multi-char folds. */
                      if (! RExC_in_multi_char_class) {
-                        AV** this_array_ptr;
-                        AV* this_array;
                          STRLEN cp_count = utf8_length(foldbuf,
                                                        foldbuf + foldlen);
                          SV* multi_fold = sv_2mortal(newSVpvs(""));
  
                          Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
  
                          STRLEN cp_count = utf8_length(foldbuf,
                                                        foldbuf + foldlen);
                          SV* multi_fold = sv_2mortal(newSVpvs(""));
  
                          Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
  
+                        multi_char_matches
+                                        = add_multi_match(multi_char_matches,
+                                                          multi_fold,
+                                                          cp_count);
  
  
-                        if (! multi_char_matches) {
-                            multi_char_matches = newAV();
-                        }
-
-                        /* <multi_char_matches> is actually an array of arrays.
-                         * There will be one or two top-level elements: [2],
-                         * and/or [3].  The [2] element is an array, each
-                         * element thereof is a character which folds to TWO
-                         * characters; [3] is for folds to THREE characters.
-                         * (Unicode guarantees a maximum of 3 characters in any
-                         * fold.)  When we rewrite the character class below,
-                         * we will do so such that the longest folds are
-                         * written first, so that it prefers the longest
-                         * matching strings first.  This is done even if it
-                         * turns out that any quantifier is non-greedy, out of
-                         * programmer laziness.  Tom Christiansen has agreed
-                         * that this is ok.  This makes the test for the
-                         * ligature 'ffi' come before the test for 'ff' */
-                        if (av_exists(multi_char_matches, cp_count)) {
-                            this_array_ptr = (AV**) av_fetch(multi_char_matches,
-                                                             cp_count, FALSE);
-                            this_array = *this_array_ptr;
-                        }
-                        else {
-                            this_array = newAV();
-                            av_store(multi_char_matches, cp_count,
-                                     (SV*) this_array);
-                        }
-                        av_push(this_array, multi_fold);
                      }
  
                      /* This element should not be processed further in this
                      }
  
                      /* This element should not be processed further in this
@@ -14142,19 +14420,20 @@ parseit:
               * included.  literal_endpoint==2 means both ends of the range used
               * a literal character, not \x{foo} */
             if (literal_endpoint == 2
               * included.  literal_endpoint==2 means both ends of the range used
               * a literal character, not \x{foo} */
             if (literal_endpoint == 2
-                && ((prevvalue >= 'a' && value <= 'z')
-                    || (prevvalue >= 'A' && value <= 'Z')))
+                && ((isLOWER_A(prevvalue) && isLOWER_A(value))
+                    || (isUPPER_A(prevvalue) && isUPPER_A(value))))
              {
                  _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
                                        &this_range);
  
              {
                  _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
                                        &this_range);
  
-                /* Since this above only contains ascii, the intersection of it
-                 * with anything will still yield only ascii */
+                /* Since 'this_range' now only contains ascii, the intersection
+                 * of it with anything will still yield only ascii */
                  _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
                                        &this_range);
              }
              _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
              literal_endpoint = 0;
                  _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
                                        &this_range);
              }
              _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
              literal_endpoint = 0;
+            SvREFCNT_dec_NN(this_range);
  #endif
          }
  
  #endif
          }
  
@@ -14225,6 +14504,7 @@ parseit:
         RExC_parse = SvPV(substitute_parse, len);
         RExC_end = RExC_parse + len;
          RExC_in_multi_char_class = 1;
         RExC_parse = SvPV(substitute_parse, len);
         RExC_end = RExC_parse + len;
          RExC_in_multi_char_class = 1;
+       RExC_override_recoding = 1;
          RExC_emit = (regnode *)orig_emit;
  
         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
          RExC_emit = (regnode *)orig_emit;
  
         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
@@ -14234,6 +14514,7 @@ parseit:
         RExC_parse = save_parse;
         RExC_end = save_end;
         RExC_in_multi_char_class = 0;
         RExC_parse = save_parse;
         RExC_end = save_end;
         RExC_in_multi_char_class = 0;
+       RExC_override_recoding = 0;
          SvREFCNT_dec_NN(multi_char_matches);
          return ret;
      }
          SvREFCNT_dec_NN(multi_char_matches);
          return ret;
      }
@@ -14592,7 +14873,7 @@ parseit:
              if (DEPENDS_SEMANTICS) {
                  /* Under /d, everything in the upper half of the Latin1 range
                   * matches these complements */
              if (DEPENDS_SEMANTICS) {
                  /* Under /d, everything in the upper half of the Latin1 range
                   * matches these complements */
-                ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
+                ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
              }
              else if (AT_LEAST_ASCII_RESTRICTED) {
                  /* Under /a and /aa, everything above ASCII matches these
              }
              else if (AT_LEAST_ASCII_RESTRICTED) {
                  /* Under /a and /aa, everything above ASCII matches these
@@ -14898,7 +15179,7 @@ parseit:
         else {
             cp_list = depends_list;
         }
         else {
             cp_list = depends_list;
         }
-        ANYOF_FLAGS(ret) |= ANYOF_UTF8;
+        ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
      }
  
      /* If there is a swash and more than one element, we can't use the swash in
      }
  
      /* If there is a swash and more than one element, we can't use the swash in
@@ -14940,7 +15221,7 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
  {
      /* Sets the arg field of an ANYOF-type node 'node', using information about
       * the node passed-in.  If there is nothing outside the node's bitmap, the
  {
      /* Sets the arg field of an ANYOF-type node 'node', using information about
       * the node passed-in.  If there is nothing outside the node's bitmap, the
-     * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
+     * arg is set to ANYOF_ONLY_HAS_BITMAP.  Otherwise, it sets the argument to
       * the count returned by add_data(), having allocated and stored an array,
       * av, that that count references, as follows:
       *  av[0] stores the character class description in its textual form.
       * the count returned by add_data(), having allocated and stored an array,
       * av, that that count references, as follows:
       *  av[0] stores the character class description in its textual form.
@@ -14966,15 +15247,17 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
  
      if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
          assert(! (ANYOF_FLAGS(node)
  
      if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
          assert(! (ANYOF_FLAGS(node)
-                    & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
-       ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
+                  & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
+                     |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
+       ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
      }
      else {
         AV * const av = newAV();
         SV *rv;
  
          assert(ANYOF_FLAGS(node)
      }
      else {
         AV * const av = newAV();
         SV *rv;
  
          assert(ANYOF_FLAGS(node)
-                    & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
+               & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
+                  |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
  
         av_store(av, 0, (runtime_defns)
                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
  
         av_store(av, 0, (runtime_defns)
                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
@@ -15040,7 +15323,8 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
      PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
  
      assert(ANYOF_FLAGS(node)
      PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
  
      assert(ANYOF_FLAGS(node)
-                        & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
+        & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
+           |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
  
      if (data && data->count) {
         const U32 n = ARG(node);
  
      if (data && data->count) {
         const U32 n = ARG(node);
@@ -15212,21 +15496,23 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
      }
  }
  
      }
  }
  
-/*
-- reg_node - emit a node
-*/
-STATIC regnode *                       /* Location. */
-S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
+STATIC regnode *
+S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
  {
  {
-    regnode *ptr;
+    /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
+     * space.  In pass1, it aligns and increments RExC_size; in pass2,
+     * RExC_emit */
+
      regnode * const ret = RExC_emit;
      GET_RE_DEBUG_FLAGS_DECL;
  
      regnode * const ret = RExC_emit;
      GET_RE_DEBUG_FLAGS_DECL;
  
-    PERL_ARGS_ASSERT_REG_NODE;
+    PERL_ARGS_ASSERT_REGNODE_GUTS;
+
+    assert(extra_size >= regarglen[op]);
  
      if (SIZE_ONLY) {
         SIZE_ALIGN(RExC_size);
  
      if (SIZE_ONLY) {
         SIZE_ALIGN(RExC_size);
-       RExC_size += 1;
+       RExC_size += 1 + extra_size;
         return(ret);
      }
      if (RExC_emit >= RExC_emit_bound)
         return(ret);
      }
      if (RExC_emit >= RExC_emit_bound)
@@ -15234,13 +15520,13 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
  
      NODE_ALIGN_FILL(ret);
                    op, (void*)RExC_emit, (void*)RExC_emit_bound);
  
      NODE_ALIGN_FILL(ret);
-    ptr = ret;
-    FILL_ADVANCE_NODE(ptr, op);
-#ifdef RE_TRACK_PATTERN_OFFSETS
+#ifndef RE_TRACK_PATTERN_OFFSETS
+    PERL_UNUSED_ARG(name);
+#else
      if (RExC_offsets) {         /* MJD */
         MJD_OFFSET_DEBUG(
                ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
      if (RExC_offsets) {         /* MJD */
         MJD_OFFSET_DEBUG(
                ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
-              "reg_node", __LINE__,
+              name, __LINE__,
                PL_reg_name[op],
                (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
                 ? "Overwriting end of array!\n" : "OK",
                PL_reg_name[op],
                (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
                 ? "Overwriting end of array!\n" : "OK",
@@ -15250,7 +15536,26 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
      }
  #endif
         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
      }
  #endif
-    RExC_emit = ptr;
+    return(ret);
+}
+
+/*
+- reg_node - emit a node
+*/
+STATIC regnode *                       /* Location. */
+S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
+{
+    regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
+
+    PERL_ARGS_ASSERT_REG_NODE;
+
+    assert(regarglen[op] == 0);
+
+    if (PASS2) {
+        regnode *ptr = ret;
+        FILL_ADVANCE_NODE(ptr, op);
+        RExC_emit = ptr;
+    }
      return(ret);
  }
  
      return(ret);
  }
  
@@ -15260,54 +15565,36 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
  STATIC regnode *                       /* Location. */
  S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
  {
  STATIC regnode *                       /* Location. */
  S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
  {
-    regnode *ptr;
-    regnode * const ret = RExC_emit;
-    GET_RE_DEBUG_FLAGS_DECL;
+    regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
  
      PERL_ARGS_ASSERT_REGANODE;
  
  
      PERL_ARGS_ASSERT_REGANODE;
  
-    if (SIZE_ONLY) {
-       SIZE_ALIGN(RExC_size);
-       RExC_size += 2;
-       /*
-          We can't do this:
+    assert(regarglen[op] == 1);
  
  
-          assert(2==regarglen[op]+1);
+    if (PASS2) {
+        regnode *ptr = ret;
+        FILL_ADVANCE_NODE_ARG(ptr, op, arg);
+        RExC_emit = ptr;
+    }
+    return(ret);
+}
  
  
-          Anything larger than this has to allocate the extra amount.
-          If we changed this to be:
+STATIC regnode *
+S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
+{
+    /* emit a node with U32 and I32 arguments */
  
  
-          RExC_size += (1 + regarglen[op]);
+    regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
  
  
-          then it wouldn't matter. Its not clear what side effect
-          might come from that so its not done so far.
-          -- dmq
-       */
-       return(ret);
-    }
-    if (RExC_emit >= RExC_emit_bound)
-        Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
-                  op, (void*)RExC_emit, (void*)RExC_emit_bound);
+    PERL_ARGS_ASSERT_REG2LANODE;
  
  
-    NODE_ALIGN_FILL(ret);
-    ptr = ret;
-    FILL_ADVANCE_NODE_ARG(ptr, op, arg);
-#ifdef RE_TRACK_PATTERN_OFFSETS
-    if (RExC_offsets) {         /* MJD */
-       MJD_OFFSET_DEBUG(
-              ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
-              "reganode",
-             __LINE__,
-             PL_reg_name[op],
-              (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
-              "Overwriting end of array!\n" : "OK",
-              (UV)(RExC_emit - RExC_emit_start),
-              (UV)(RExC_parse - RExC_start),
-              (UV)RExC_offsets[0]));
-       Set_Cur_Node_Offset;
+    assert(regarglen[op] == 2);
+
+    if (PASS2) {
+        regnode *ptr = ret;
+        FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
+        RExC_emit = ptr;
      }
      }
-#endif
-    RExC_emit = ptr;
      return(ret);
  }
  
      return(ret);
  }
  
@@ -15701,8 +15988,6 @@ Perl_regdump(pTHX_ const regexp *r)
      }
      if (r->intflags & PREGf_ANCH) {
         PerlIO_printf(Perl_debug_log, "anchored");
      }
      if (r->intflags & PREGf_ANCH) {
         PerlIO_printf(Perl_debug_log, "anchored");
-        if (r->intflags & PREGf_ANCH_BOL)
-           PerlIO_printf(Perl_debug_log, "(BOL)");
          if (r->intflags & PREGf_ANCH_MBOL)
             PerlIO_printf(Perl_debug_log, "(MBOL)");
          if (r->intflags & PREGf_ANCH_SBOL)
          if (r->intflags & PREGf_ANCH_MBOL)
             PerlIO_printf(Perl_debug_log, "(MBOL)");
          if (r->intflags & PREGf_ANCH_SBOL)
@@ -15938,9 +16223,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
              }
          }
  
              }
          }
  
-       if ((flags & (ANYOF_ABOVE_LATIN1_ALL
-                      |ANYOF_UTF8
-                      |ANYOF_NONBITMAP_NON_UTF8
+       if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
+                      |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
+                      |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
                        |ANYOF_LOC_FOLD)))
          {
              if (do_sep) {
                        |ANYOF_LOC_FOLD)))
          {
              if (do_sep) {
@@ -15950,14 +16235,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                      sv_catpvs(sv, "^");
              }
  
                      sv_catpvs(sv, "^");
              }
  
-            if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
+            if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
                  sv_catpvs(sv, "{non-utf8-latin1-all}");
              }
  
              /* output information about the unicode matching */
                  sv_catpvs(sv, "{non-utf8-latin1-all}");
              }
  
              /* output information about the unicode matching */
-            if (flags & ANYOF_ABOVE_LATIN1_ALL)
-                sv_catpvs(sv, "{unicode_all}");
-            else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
+            if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
+                sv_catpvs(sv, "{above_bitmap_all}");
+            else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
                  SV *lv; /* Set if there is something outside the bit map. */
                  bool byte_output = FALSE;   /* If something in the bitmap has
                                                 been output */
                  SV *lv; /* Set if there is something outside the bit map. */
                  bool byte_output = FALSE;   /* If something in the bitmap has
                                                 been output */
@@ -15979,7 +16264,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                      if (*s == '\n') {
                          const char * const t = ++s;
  
                      if (*s == '\n') {
                          const char * const t = ++s;
  
-                        if (flags & ANYOF_NONBITMAP_NON_UTF8) {
+                        if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
                              sv_catpvs(sv, "{outside bitmap}");
                          }
                          else {
                              sv_catpvs(sv, "{outside bitmap}");
                          }
                          else {
@@ -16031,7 +16316,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
                      invlist_iterinit(only_utf8_locale);
                      while (invlist_iternext(only_utf8_locale,
                                              &start, &end)) {
                      invlist_iterinit(only_utf8_locale);
                      while (invlist_iternext(only_utf8_locale,
                                              &start, &end)) {
-                        put_range(sv, start, end);
+                        put_range(sv, start, end, FALSE);
                          max_entries --;
                          if (max_entries < 0) {
                              sv_catpvs(sv, "...");
                          max_entries --;
                          if (max_entries < 0) {
                              sv_catpvs(sv, "...");
@@ -16064,6 +16349,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
      }
      else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
      }
      else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
+    else if (OP(o) == SBOL)
+        Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
  #else
      PERL_UNUSED_CONTEXT;
      PERL_UNUSED_ARG(sv);
  #else
      PERL_UNUSED_CONTEXT;
      PERL_UNUSED_ARG(sv);
@@ -16636,174 +16923,190 @@ S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
      Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
  }
  
      Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
  }
  
-/* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
-
-#ifndef PERL_IN_XSUB_RE
-void
-Perl_save_re_context(pTHX)
-{
-    /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
-    if (PL_curpm) {
-       const REGEXP * const rx = PM_GETRE(PL_curpm);
-       if (rx) {
-           U32 i;
-           for (i = 1; i <= RX_NPARENS(rx); i++) {
-               char digits[TYPE_CHARS(long)];
-               const STRLEN len = my_snprintf(digits, sizeof(digits),
-                                               "%lu", (long)i);
-               GV *const *const gvp
-                   = (GV**)hv_fetch(PL_defstash, digits, len, 0);
-
-               if (gvp) {
-                   GV * const gv = *gvp;
-                   if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
-                       save_scalar(gv);
-               }
-           }
-       }
-    }
-}
-#endif
-
  #ifdef DEBUGGING
  #ifdef DEBUGGING
-
  /* Certain characters are output as a sequence with the first being a
   * backslash. */
  #define isBACKSLASHED_PUNCT(c)                                              \
                      ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
  
  STATIC void
  /* Certain characters are output as a sequence with the first being a
   * backslash. */
  #define isBACKSLASHED_PUNCT(c)                                              \
                      ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
  
  STATIC void
-S_put_byte(pTHX_ SV *sv, int c)
+S_put_code_point(pTHX_ SV *sv, UV c)
  {
  {
-    PERL_ARGS_ASSERT_PUT_BYTE;
-
-    if (!isPRINT(c)) {
-        switch (c) {
-            case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
-            case '\b': Perl_sv_catpvf(aTHX_ sv, "\\b"); break;
-            case ESC_NATIVE: Perl_sv_catpvf(aTHX_ sv, "\\e"); break;
-            case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
-            case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
-            case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
-            case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
-            default: Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); break;
-        }
+    PERL_ARGS_ASSERT_PUT_CODE_POINT;
+
+    if (c > 255) {
+        Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
      }
      }
-    else {
-       const char string = c;
+    else if (isPRINT(c)) {
+       const char string = (char) c;
         if (isBACKSLASHED_PUNCT(c))
             sv_catpvs(sv, "\\");
         sv_catpvn(sv, &string, 1);
      }
         if (isBACKSLASHED_PUNCT(c))
             sv_catpvs(sv, "\\");
         sv_catpvn(sv, &string, 1);
      }
+    else {
+        const char * const mnemonic = cntrl_to_mnemonic((char) c);
+        if (mnemonic) {
+            Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
+        }
+        else {
+            Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
+        }
+    }
  }
  
  }
  
+#define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
+
  STATIC void
  STATIC void
-S_put_range(pTHX_ SV *sv, UV start, const UV end)
+S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
  {
  {
-
      /* Appends to 'sv' a displayable version of the range of code points from
       * 'start' to 'end'.  It assumes that only ASCII printables are displayable
      /* Appends to 'sv' a displayable version of the range of code points from
       * 'start' to 'end'.  It assumes that only ASCII printables are displayable
-     * as-is (though some of these will be escaped by put_byte()). */
+     * as-is (though some of these will be escaped by put_code_point()). */
  
  
-    const int min_range_count = 3;
+    const unsigned int min_range_count = 3;
  
      assert(start <= end);
  
      PERL_ARGS_ASSERT_PUT_RANGE;
  
      while (start <= end) {
  
      assert(start <= end);
  
      PERL_ARGS_ASSERT_PUT_RANGE;
  
      while (start <= end) {
+        UV this_end;
+        const char * format;
+
          if (end - start < min_range_count) {
  
              /* Individual chars in short ranges */
              for (; start <= end; start++) {
          if (end - start < min_range_count) {
  
              /* Individual chars in short ranges */
              for (; start <= end; start++) {
-                put_byte(sv, start);
+                put_code_point(sv, start);
              }
              break;
          }
  
              }
              break;
          }
  
-        /* For small ranges that include printable ASCII characters, it's more
-         * legible to print those characters rather than hex values.  For
-         * larger ranges that include more than printables, it's probably
-         * clearer to just give the start and end points of the range in hex,
-         * and that's all we can do if there aren't any printables within the
-         * range
-         *
-         * On ASCII platforms the range of printables is contiguous.  If the
-         * entire range is printable, we print each character as such.  If the
-         * range is partially printable and partially not, it's less likely
-         * that the individual printables are meaningful, especially if all or
-         * almost all of them are in the range.  But we err on the side of the
-         * individual printables being meaningful by using the hex only if the
-         * range contains all but 2 of the printables.
-         *
-         * On EBCDIC platforms, the printables are scattered around so that the
-         * maximum range length containing only them is about 10.  Anything
-         * longer we treat as hex; otherwise we examine the range character by
-         * character to see */
-#ifdef EBCDIC
-        if (start < 256 && (((end < 255) ? end : 255) - start <= 10))
-#else
-        if ((isPRINT_A(start) && isPRINT_A(end))
-            || (end >= 0x7F && (isPRINT_A(start) && start > 0x21))
-            || ((end < 0x7D && isPRINT_A(end)) && start < 0x20))
-#endif
-        {
-            /* If the range beginning isn't an ASCII printable, we find the
-             * last such in the range, then split the output, so all the
-             * non-printables are in one subrange; then process the remaining
-             * portion as usual.  If the entire range isn't printables, we
-             * don't split, but drop down to print as hex */
+        /* If permitted by the input options, and there is a possibility that
+         * this range contains a printable literal, look to see if there is
+         * one.  */
+        if (allow_literals && start <= MAX_PRINT_A) {
+
+            /* If the range begin isn't an ASCII printable, effectively split
+             * the range into two parts:
+             *  1) the portion before the first such printable,
+             *  2) the rest
+             * and output them separately. */
              if (! isPRINT_A(start)) {
                  UV temp_end = start + 1;
              if (! isPRINT_A(start)) {
                  UV temp_end = start + 1;
-                while (temp_end <= end && ! isPRINT_A(temp_end)) {
+
+                /* There is no point looking beyond the final possible
+                 * printable, in MAX_PRINT_A */
+                UV max = MIN(end, MAX_PRINT_A);
+
+                while (temp_end <= max && ! isPRINT_A(temp_end)) {
                      temp_end++;
                  }
                      temp_end++;
                  }
-                if (temp_end <= end) {
-                    put_range(sv, start, temp_end - 1);
-                    start = temp_end;
-                    continue;
+
+                /* Here, temp_end points to one beyond the first printable if
+                 * found, or to one beyond 'max' if not.  If none found, make
+                 * sure that we use the entire range */
+                if (temp_end > MAX_PRINT_A) {
+                    temp_end = end + 1;
                  }
                  }
-            }
  
  
-            /* If the range beginning is a digit, output a subrange of just the
-             * digits, then process the remaining portion as usual */
-            if (isDIGIT_A(start)) {
-                put_byte(sv, start);
-                sv_catpvs(sv, "-");
-                while (start <= end && isDIGIT_A(start)) start++;
-                put_byte(sv, start - 1);
+                /* Output the first part of the split range, the part that
+                 * doesn't have printables, with no looking for literals
+                 * (otherwise we would infinitely recurse) */
+                put_range(sv, start, temp_end - 1, FALSE);
+
+                /* The 2nd part of the range (if any) starts here. */
+                start = temp_end;
+
+                /* We continue instead of dropping down because even if the 2nd
+                 * part is non-empty, it could be so short that we want to
+                 * output it specially, as tested for at the top of this loop.
+                 * */
                  continue;
              }
  
                  continue;
              }
  
-            /* Similarly for alphabetics.  Because in both ASCII and EBCDIC,
-             * the code points for upper and lower A-Z and a-z aren't
-             * intermixed, the resulting subrange will consist solely of either
-             * upper- or lower- alphabetics */
-            if (isALPHA_A(start)) {
-                put_byte(sv, start);
-                sv_catpvs(sv, "-");
-                while (start <= end && isALPHA_A(start)) start++;
-                put_byte(sv, start - 1);
+            /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
+             * output a sub-range of just the digits or letters, then process
+             * the remaining portion as usual. */
+            if (isALPHANUMERIC_A(start)) {
+                UV mask = (isDIGIT_A(start))
+                           ? _CC_DIGIT
+                             : isUPPER_A(start)
+                               ? _CC_UPPER
+                               : _CC_LOWER;
+                UV temp_end = start + 1;
+
+                /* Find the end of the sub-range that includes just the
+                 * characters in the same class as the first character in it */
+                while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
+                    temp_end++;
+                }
+                temp_end--;
+
+                /* For short ranges, don't duplicate the code above to output
+                 * them; just call recursively */
+                if (temp_end - start < min_range_count) {
+                    put_range(sv, start, temp_end, FALSE);
+                }
+                else {  /* Output as a range */
+                    put_code_point(sv, start);
+                    sv_catpvs(sv, "-");
+                    put_code_point(sv, temp_end);
+                }
+                start = temp_end + 1;
                  continue;
              }
  
              /* We output any other printables as individual characters */
              if (isPUNCT_A(start) || isSPACE_A(start)) {
                  continue;
              }
  
              /* We output any other printables as individual characters */
              if (isPUNCT_A(start) || isSPACE_A(start)) {
-                while (start <= end && (isPUNCT_A(start) || isSPACE_A(start))) {
-                    put_byte(sv, start);
+                while (start <= end && (isPUNCT_A(start)
+                                        || isSPACE_A(start)))
+                {
+                    put_code_point(sv, start);
                      start++;
                  }
                  continue;
              }
                      start++;
                  }
                  continue;
              }
+        } /* End of looking for literals */
+
+        /* Here is not to output as a literal.  Some control characters have
+         * mnemonic names.  Split off any of those at the beginning and end of
+         * the range to print mnemonically.  It isn't possible for many of
+         * these to be in a row, so this won't overwhelm with output */
+        while (isMNEMONIC_CNTRL(start) && start <= end) {
+            put_code_point(sv, start);
+            start++;
+        }
+        if (start < end && isMNEMONIC_CNTRL(end)) {
+
+            /* Here, the final character in the range has a mnemonic name.
+             * Work backwards from the end to find the final non-mnemonic */
+            UV temp_end = end - 1;
+            while (isMNEMONIC_CNTRL(temp_end)) {
+                temp_end--;
+            }
+
+            /* And separately output the range that doesn't have mnemonics */
+            put_range(sv, start, temp_end, FALSE);
+
+            /* Then output the mnemonic trailing controls */
+            start = temp_end + 1;
+            while (start <= end) {
+                put_code_point(sv, start);
+                start++;
+            }
+            break;
          }
  
          }
  
-        /* Here is a control or non-ascii.  Output the range or subrange as
-         * hex. */
-        Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
-                       start,
-                       (end < NUM_ANYOF_CODE_POINTS)
-                       ? end
-                       : NUM_ANYOF_CODE_POINTS - 1);
+        /* As a final resort, output the range or subrange as hex. */
+
+        this_end = (end < NUM_ANYOF_CODE_POINTS)
+                    ? end
+                    : NUM_ANYOF_CODE_POINTS - 1;
+        format = (this_end < 256)
+                 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
+                 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
+        Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
          break;
      }
  }
          break;
      }
  }
@@ -16817,42 +17120,98 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
       * inversion list of what is in the bit map */
  
      int i;
       * inversion list of what is in the bit map */
  
      int i;
-    bool has_output_anything = FALSE;
+    UV start, end;
+    unsigned int punct_count = 0;
+    SV* invlist = NULL;
+    SV** invlist_ptr;   /* Temporary, in case bitmap_invlist is NULL */
+    bool allow_literals = TRUE;
  
      PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
  
  
      PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
  
-    if (bitmap_invlist) {
-        /* Worst case is exactly every-other code point is in the list */
-        *bitmap_invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
-    }
-    for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
-        if (BITMAP_TEST((U8 *) bitmap,i)) {
-            int j;
+    invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
  
  
-            if (bitmap_invlist) {
-                *bitmap_invlist = add_cp_to_invlist(*bitmap_invlist, i);
-            }
+    /* Worst case is exactly every-other code point is in the list */
+    *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
  
  
-            /* The character at index i should be output.  Find the next
-             * character that should NOT be output */
-            for (j = i + 1; j < NUM_ANYOF_CODE_POINTS; j++) {
-                if (! BITMAP_TEST((U8 *) bitmap, j)) {
-                    break;
-                }
-                if (bitmap_invlist) {
-                    *bitmap_invlist = add_cp_to_invlist(*bitmap_invlist, j);
+    /* Convert the bit map to an inversion list, keeping track of how many
+     * ASCII puncts are set, including an extra amount for the backslashed
+     * ones.  */
+    for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+        if (BITMAP_TEST(bitmap, i)) {
+            *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
+            if (isPUNCT_A(i)) {
+                punct_count++;
+                if isBACKSLASHED_PUNCT(i) {
+                    punct_count++;
                  }
              }
                  }
              }
+        }
+    }
+
+    /* Nothing to output */
+    if (_invlist_len(*invlist_ptr) == 0) {
+        SvREFCNT_dec(invlist);
+        return FALSE;
+    }
+
+    /* Generally, it is more readable if printable characters are output as
+     * literals, but if a range (nearly) spans all of them, it's best to output
+     * it as a single range.  This code will use a single range if all but 2
+     * printables are in it */
+    invlist_iterinit(*invlist_ptr);
+    while (invlist_iternext(*invlist_ptr, &start, &end)) {
+
+        /* If range starts beyond final printable, it doesn't have any in it */
+        if (start > MAX_PRINT_A) {
+            break;
+        }
  
  
-            /* Everything between them is a single range that should be output
-             * */
-            put_range(sv, i, j - 1);
-            has_output_anything = TRUE;
-            i = j;
+        /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
+         * all but two, the range must start and end no later than 2 from
+         * either end */
+        if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
+            if (end > MAX_PRINT_A) {
+                end = MAX_PRINT_A;
+            }
+            if (start < ' ') {
+                start = ' ';
+            }
+            if (end - start >= MAX_PRINT_A - ' ' - 2) {
+                allow_literals = FALSE;
+            }
+            break;
          }
      }
          }
      }
+    invlist_iterfinish(*invlist_ptr);
+
+    /* The legibility of the output depends mostly on how many punctuation
+     * characters are output.  There are 32 possible ASCII ones, and some have
+     * an additional backslash, bringing it to currently 36, so if any more
+     * than 18 are to be output, we can instead output it as its complement,
+     * yielding fewer puncts, and making it more legible.  But give some weight
+     * to the fact that outputting it as a complement is less legible than a
+     * straight output, so don't complement unless we are somewhat over the 18
+     * mark */
+    if (allow_literals && punct_count > 22) {
+        sv_catpvs(sv, "^");
+
+        /* Add everything remaining to the list, so when we invert it just
+         * below, it will be excluded */
+        _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
+        _invlist_invert(*invlist_ptr);
+    }
+
+    /* Here we have figured things out.  Output each range */
+    invlist_iterinit(*invlist_ptr);
+    while (invlist_iternext(*invlist_ptr, &start, &end)) {
+        if (start >= NUM_ANYOF_CODE_POINTS) {
+            break;
+        }
+        put_range(sv, start, end, allow_literals);
+    }
+    invlist_iterfinish(*invlist_ptr);
  
  
-    return has_output_anything;
+    return TRUE;
  }
  
  #define CLEAR_OPTSTART \
  }
  
  #define CLEAR_OPTSTART \
@@ -17003,7 +17362,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
         }
         else if (PL_regkind[(U8)op] == ANYOF) {
             /* arglen 1 + class block */
         }
         else if (PL_regkind[(U8)op] == ANYOF) {
             /* arglen 1 + class block */
-           node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
+           node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
                            ? ANYOF_POSIXL_SKIP
                            : ANYOF_SKIP);
             node = NEXTOPER(node);
                            ? ANYOF_POSIXL_SKIP
                            : ANYOF_SKIP);
             node = NEXTOPER(node);