embed.fnc: Add caution about R flag

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index a753d8c..6aea5c4 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -131,6 +131,8 @@ struct RExC_state_t {
      char       *parse;                 /* Input-scan pointer. */
      char        *copy_start;            /* start of copy of input within
                                             constructed parse string */
+    char        *save_copy_start;       /* Provides one level of saving
+                                           and restoring 'copy_start' */
      char        *copy_start_in_input;   /* Position in input string
                                             corresponding to copy_start */
      SSize_t    whilem_seen;            /* number of WHILEM in this expr */
@@ -163,6 +165,7 @@ struct RExC_state_t {
      I32                seen_zerolen;
      regnode_offset *open_parens;       /* offsets to open parens */
      regnode_offset *close_parens;      /* offsets to close parens */
+    I32      parens_buf_size;           /* #slots malloced open/close_parens */
      regnode     *end_op;                /* END node in program */
      I32                utf8;           /* whether the pattern is utf8 or not */
      I32                orig_utf8;      /* whether the pattern was originally in utf8 */
@@ -179,11 +182,10 @@ struct RExC_state_t {
                                             through */
      U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
      I32                in_lookbehind;
+    I32                in_lookahead;
      I32                contains_locale;
      I32                override_recoding;
-#ifdef EBCDIC
-    I32                recode_x_to_native;
-#endif
+    I32         recode_x_to_native;
      I32                in_multi_char_class;
      struct reg_code_blocks *code_blocks;/* positions of literal (?{})
                                             within pattern */
@@ -228,6 +230,7 @@ struct RExC_state_t {
  #define RExC_precomp   (pRExC_state->precomp)
  #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
  #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
+#define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start)
  #define RExC_precomp_end (pRExC_state->precomp_end)
  #define RExC_rx_sv     (pRExC_state->rx_sv)
  #define RExC_rx                (pRExC_state->rx)
@@ -240,7 +243,6 @@ struct RExC_state_t {
  #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
                                                     under /d from /u ? */
  
-
  #ifdef RE_TRACK_PATTERN_OFFSETS
  #  define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
                                                           others */
@@ -253,6 +255,7 @@ struct RExC_state_t {
  #define RExC_maxlen        (pRExC_state->maxlen)
  #define RExC_npar      (pRExC_state->npar)
  #define RExC_total_parens      (pRExC_state->total_par)
+#define RExC_parens_buf_size   (pRExC_state->parens_buf_size)
  #define RExC_nestroot   (pRExC_state->nestroot)
  #define RExC_seen_zerolen      (pRExC_state->seen_zerolen)
  #define RExC_utf8      (pRExC_state->utf8)
@@ -268,10 +271,17 @@ struct RExC_state_t {
  #define RExC_study_chunk_recursed_bytes  \
                                     (pRExC_state->study_chunk_recursed_bytes)
  #define RExC_in_lookbehind     (pRExC_state->in_lookbehind)
+#define RExC_in_lookahead      (pRExC_state->in_lookahead)
  #define RExC_contains_locale   (pRExC_state->contains_locale)
+#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
+
  #ifdef EBCDIC   /* the setter below performs an assignment only in this arm */
-#   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
+#  define SET_recode_x_to_native(x)                                         \
+                    STMT_START { RExC_recode_x_to_native = (x); } STMT_END
+#else   /* not EBCDIC: the setter expands to NOOP and (x) is not used */
+#  define SET_recode_x_to_native(x) NOOP
  #endif /* EBCDIC */
+
  #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
  #define RExC_frame_head (pRExC_state->frame_head)
  #define RExC_frame_last (pRExC_state->frame_last)
@@ -364,7 +374,6 @@ struct RExC_state_t {
              }                                                               \
      } STMT_END
  
-#define BRANCH_MAX_OFFSET   U16_MAX
  #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
      STMT_START {                                                            \
                  RExC_use_BRANCHJ = 1;                                       \
@@ -705,7 +714,7 @@ static const scan_data_t zero_scan_data = {
  
  /* Used to point after bad bytes for an error message, but avoid skipping
   * past a nul byte.  Evaluates to 0 when *s is a nul; otherwise to 1 when not
   * UTF, or to UTF8_SAFE_SKIP(s, e) under UTF.  's' can be expanded more than
   * once, so pass an expression without side effects. */
-#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
+#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
  
  /* Set up to clean up after our imminent demise */
  #define PREPARE_TO_DIE                                                      \
@@ -744,6 +753,10 @@ static const scan_data_t zero_scan_data = {
      Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",      \
             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
  
+#define        FAIL3(msg,arg1,arg2) _FAIL(                         \
+    Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",      \
+     arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses)) /* arg1 and arg2 are consumed by the format directives inside msg; 'len' and 'ellipses' are not macro parameters (presumably supplied by _FAIL -- confirm) */
+
  /*
   * Simple_vFAIL -- like FAIL, but marks the current location in the scan
   */
@@ -820,8 +833,13 @@ static const scan_data_t zero_scan_data = {
  } STMT_END
  
  /* Setting this to NULL is a signal to not output warnings.  In the
   * statement-block form, TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE first copies
   * the current RExC_copy_start_in_constructed into
   * RExC_save_copy_start_in_constructed, and RESTORE_WARNINGS copies it back.
   * There is a single save slot, so a second TURN_OFF before the matching
   * RESTORE would overwrite the saved value. */
-#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE RExC_copy_start_in_constructed = NULL
-#define RESTORE_WARNINGS RExC_copy_start_in_constructed = RExC_precomp
+#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
+    STMT_START {                                                            \
+      RExC_save_copy_start_in_constructed  = RExC_copy_start_in_constructed;\
+      RExC_copy_start_in_constructed = NULL;                                \
+    } STMT_END
+#define RESTORE_WARNINGS                                                    \
+    RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
  
  /* Since a warning can be generated multiple times as the input is reparsed, we
   * output it the first time we come to that point in the parse, but suppress it
@@ -1573,6 +1591,9 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
      unsigned int i;
      const U32 n = ARG(node);
      bool new_node_has_latin1 = FALSE;
+    const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
+                      ? 0
+                      : ANYOF_FLAGS(node);
  
      PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
  
@@ -1597,7 +1618,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
          }
  
          /* Get the code points valid only under UTF-8 locales */
-        if (   (ANYOF_FLAGS(node) & ANYOFL_FOLD)
+        if (   (flags & ANYOFL_FOLD)
              &&  av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
          {
              only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
@@ -1618,14 +1639,14 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
       * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
       * have to do this here before we add the unconditionally matched code
       * points */
-    if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
+    if (flags & ANYOF_INVERT) {
          _invlist_intersection_complement_2nd(invlist,
                                               PL_UpperLatin1,
                                               &invlist);
      }
  
      /* Add in the points from the bit map */
-    if (OP(node) != ANYOFH) {
+    if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
          for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
              if (ANYOF_BITMAP_TEST(node, i)) {
                  unsigned int start = i++;
@@ -1645,21 +1666,21 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
       * as well.  But don't add them if inverting, as when that gets done below,
       * it would exclude all these characters, including the ones it shouldn't
       * that were added just above */
-    if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
-        && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+    if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
+        && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
      {
          _invlist_union(invlist, PL_UpperLatin1, &invlist);
      }
  
      /* Similarly for these */
-    if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
+    if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
          _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
      }
  
-    if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
+    if (flags & ANYOF_INVERT) {
          _invlist_invert(invlist);
      }
-    else if (ANYOF_FLAGS(node) & ANYOFL_FOLD) {
+    else if (flags & ANYOFL_FOLD) {
          if (new_node_has_latin1) {
  
              /* Under /li, any 0-255 could fold to any other 0-255, depending on
@@ -1687,7 +1708,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
      if (only_utf8_locale_invlist) {
          _invlist_union_maybe_complement_2nd(invlist,
                                              only_utf8_locale_invlist,
-                                            ANYOF_FLAGS(node) & ANYOF_INVERT,
+                                            flags & ANYOF_INVERT,
                                              &invlist);
      }
  
@@ -1712,6 +1733,9 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
       * another SSC or a regular ANYOF class.  Can create false positives. */
  
      SV* anded_cp_list;
+    U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
+                          ? 0
+                          : ANYOF_FLAGS(and_with);
      U8  anded_flags;
  
      PERL_ARGS_ASSERT_SSC_AND;
@@ -1722,7 +1746,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
       * the code point inversion list and just the relevant flags */
      if (is_ANYOF_SYNTHETIC(and_with)) {
          anded_cp_list = ((regnode_ssc *)and_with)->invlist;
-        anded_flags = ANYOF_FLAGS(and_with);
+        anded_flags = and_with_flags;
  
          /* XXX This is a kludge around what appears to be deficiencies in the
           * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
@@ -1746,14 +1770,14 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
      else {
          anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
          if (OP(and_with) == ANYOFD) {
-            anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
+            anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
          }
          else {
-            anded_flags = ANYOF_FLAGS(and_with)
+            anded_flags = and_with_flags
              &( ANYOF_COMMON_FLAGS
                |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
                |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
-            if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
+            if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
                  anded_flags &=
                      ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
              }
@@ -1793,7 +1817,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
       *                         <=  (C1 & ~C2) | (P1 & ~P2)
       * */
  
-    if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
+    if ((and_with_flags & ANYOF_INVERT)
          && ! is_ANYOF_SYNTHETIC(and_with))
      {
          unsigned int i;
@@ -1805,7 +1829,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
  
          /* If either P1 or P2 is empty, the intersection will be also; can skip
           * the loop */
-        if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
+        if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
              ANYOF_POSIXL_ZERO(ssc);
          }
          else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
@@ -1865,16 +1889,16 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
              else {
                  ssc->invlist = anded_cp_list;
                  ANYOF_POSIXL_ZERO(ssc);
-                if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
+                if (and_with_flags & ANYOF_MATCHES_POSIXL) {
                      ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
                  }
              }
          }
          else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
-                 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
+                 || (and_with_flags & ANYOF_MATCHES_POSIXL))
          {
              /* One or the other of P1, P2 is non-empty. */
-            if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
+            if (and_with_flags & ANYOF_MATCHES_POSIXL) {
                  ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
              }
              ssc_union(ssc, anded_cp_list, FALSE);
@@ -1895,6 +1919,9 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
  
      SV* ored_cp_list;
      U8 ored_flags;
+    U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
+                         ? 0
+                         : ANYOF_FLAGS(or_with);
  
      PERL_ARGS_ASSERT_SSC_OR;
  
@@ -1904,17 +1931,17 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
       * the code point inversion list and just the relevant flags */
      if (is_ANYOF_SYNTHETIC(or_with)) {
          ored_cp_list = ((regnode_ssc*) or_with)->invlist;
-        ored_flags = ANYOF_FLAGS(or_with);
+        ored_flags = or_with_flags;
      }
      else {
          ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
-        ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
+        ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
          if (OP(or_with) != ANYOFD) {
              ored_flags
-            |= ANYOF_FLAGS(or_with)
+            |= or_with_flags
               & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
                  |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
-            if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
+            if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
                  ored_flags |=
                      ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
              }
@@ -1941,12 +1968,12 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
       * (which results in actually simpler code than the non-inverted case)
       * */
  
-    if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
+    if ((or_with_flags & ANYOF_INVERT)
          && ! is_ANYOF_SYNTHETIC(or_with))
      {
          /* We ignore P2, leaving P1 going forward */
      }   /* else  Not inverted */
-    else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
+    else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
          ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
          if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
              unsigned int i;
@@ -2508,7 +2535,8 @@ is the recommended Unicode-aware way of saying
         if (UTF) {                                                         \
              SV *zlopp = newSV(UTF8_MAXBYTES);                             \
             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
-            unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
+            unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val);     \
+            *kapow = '\0';                                                 \
             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
             SvPOK_on(zlopp);                                               \
             SvUTF8_on(zlopp);                                              \
@@ -2701,7 +2729,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
          trie_words = newAV();
      });
  
-    re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
+    re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
      assert(re_trie_maxbuff);
      if (!SvIOK(re_trie_maxbuff)) {
          sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
@@ -5618,9 +5646,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         STRLEN l;
                         const char * const s = SvPV_const(data->last_found, l);
                         SSize_t old = b - data->last_start_min;
+                        assert(old >= 0);
  
                         if (UTF)
-                           old = utf8_hop((U8*)s, old) - (U8*)s;
+                           old = utf8_hop_forward((U8*)s, old,
+                                               (U8 *) SvEND(data->last_found))
+                                - (U8*)s;
                         l -= old;
                         /* Get the added string: */
                         last_str = newSVpvn_utf8(s  + old, l, UTF);
@@ -5829,6 +5860,8 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                  case ANYOFL:
                  case ANYOFPOSIXL:
                  case ANYOFH:
+                case ANYOFHb:
+                case ANYOFHr:
                  case ANYOF:
                     if (flags & SCF_DO_STCLASS_AND)
                         ssc_and(pRExC_state, data->start_class,
@@ -5985,14 +6018,27 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                                        last, &data_fake, stopparen,
                                        recursed_depth, NULL, f, depth+1);
                  if (scan->flags) {
-                    if (deltanext) {
-                       FAIL("Variable length lookbehind not implemented");
-                    }
-                    else if (minnext > (I32)U8_MAX) {
+                    if (   deltanext < 0
+                        || deltanext > (I32) U8_MAX
+                        || minnext > (I32)U8_MAX
+                        || minnext + deltanext > (I32)U8_MAX)
+                    {
                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
                                (UV)U8_MAX);
                      }
-                    scan->flags = (U8)minnext;
+
+                    /* The 'next_off' field has been repurposed to count the
+                     * additional starting positions to try beyond the initial
+                     * one.  (This leaves it at 0 for non-variable length
+                     * matches to avoid breakage for those not using this
+                     * extension) */
+                    if (deltanext) {
+                        scan->next_off = deltanext;
+                        ckWARNexperimental(RExC_parse,
+                            WARN_EXPERIMENTAL__VLB,
+                            "Variable length lookbehind is experimental");
+                    }
+                    scan->flags = (U8)minnext + deltanext;
                  }
                  if (data) {
                      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
@@ -6077,14 +6123,21 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                                          stopparen, recursed_depth, NULL,
                                          f, depth+1);
                  if (scan->flags) {
-                    if (deltanext) {
-                       FAIL("Variable length lookbehind not implemented");
-                    }
-                    else if (*minnextp > (I32)U8_MAX) {
+                    assert(0);  /* This code has never been tested since this
+                                   is normally not compiled */
+                    if (   deltanext < 0
+                        || deltanext > (I32) U8_MAX
+                        || *minnextp > (I32)U8_MAX
+                        || *minnextp + deltanext > (I32)U8_MAX)
+                    {
                         FAIL2("Lookbehind longer than %" UVuf " not implemented",
                                (UV)U8_MAX);
                      }
-                    scan->flags = (U8)*minnextp;
+
+                    if (deltanext) {
+                        scan->next_off = deltanext;
+                    }
+                    scan->flags = (U8)*minnextp + deltanext;
                  }
  
                  *minnextp += min;
@@ -7215,7 +7268,7 @@ S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
          const char* name;
  
          name = get_regex_charset_name(RExC_rx->extflags, &len);
-        if strEQ(name, DEPENDS_PAT_MODS) {  /* /d under UTF-8 => /u */
+        if (strEQ(name, DEPENDS_PAT_MODS)) {  /* /d under UTF-8 => /u */
              assert(RExC_utf8);
              name = UNICODE_PAT_MODS;
              len = sizeof(UNICODE_PAT_MODS) - 1;
@@ -7578,10 +7631,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      RExC_seen = 0;
      RExC_maxlen = 0;
      RExC_in_lookbehind = 0;
+    RExC_in_lookahead = 0;
      RExC_seen_zerolen = *exp == '^' ? -1 : 0;
-#ifdef EBCDIC
      RExC_recode_x_to_native = 0;
-#endif
      RExC_in_multi_char_class = 0;
  
      RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
@@ -7667,6 +7719,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
  
      RExC_naughty = 0;
      RExC_npar = 1;
+    RExC_parens_buf_size = 0;
      RExC_emit_start = RExC_rxi->program;
      pRExC_state->code_index = 0;
  
@@ -10907,7 +10960,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
                  return;
              default:
                fail_modifiers:
-                RExC_parse += SKIP_IF_CHAR(RExC_parse);
+                RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
                  vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
                        UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
@@ -10964,14 +11017,14 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
      RExC_sawback = 1;
      ret = reganode(pRExC_state,
                     ((! FOLD)
-                     ? NREF
+                     ? REFN
                       : (ASCII_FOLD_RESTRICTED)
-                       ? NREFFA
+                       ? REFFAN
                         : (AT_LEAST_UNI_SEMANTICS)
-                         ? NREFFU
+                         ? REFFUN
                           : (LOC)
-                           ? NREFFL
-                           : NREFF),
+                           ? REFFLN
+                           : REFFN),
                      num);
      *flagp |= HASWIDTH;
  
@@ -11010,6 +11063,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
      I32 freeze_paren = 0;
      I32 after_freeze = 0;
      I32 num; /* numeric backreferences */
+    SV * max_open;  /* Max number of unclosed parens */
  
      char * parse_start = RExC_parse; /* MJD */
      char * const oregcomp_parse = RExC_parse;
@@ -11019,8 +11073,26 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
      PERL_ARGS_ASSERT_REG;
      DEBUG_PARSE("reg ");
  
+
+    max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
+    assert(max_open);
+    if (!SvIOK(max_open)) {
+        sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
+    }
+    if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
+                                              open paren */
+        vFAIL("Too many nested open parens");
+    }
+
      *flagp = 0;                                /* Tentatively. */
  
+    if (RExC_in_lookbehind) {
+       RExC_in_lookbehind++;
+    }
+    if (RExC_in_lookahead) {
+        RExC_in_lookahead++;
+    }
+
      /* Having this true makes it feasible to have a lot fewer tests for the
       * parse pointer being in scope.  For example, we can write
       *      while(isFOO(*RExC_parse)) RExC_parse++;
@@ -11307,7 +11379,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
  
             } /* End of switch */
             if ( ! op ) {
-               RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+               RExC_parse += UTF
+                              ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+                              : 1;
                  if (has_upper || verb_len == 0) {
                      vFAIL2utf8f(
                      "Unknown verb pattern '%" UTF8f "'",
@@ -11387,7 +11461,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                      return handle_named_backref(pRExC_state, flagp,
                                                  parse_start, ')');
                  }
-                RExC_parse += SKIP_IF_CHAR(RExC_parse);
+                RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
                  /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
                 vFAIL3("Sequence (%.*s...) not recognized",
                                  RExC_parse-seqstart, seqstart);
@@ -11483,10 +11557,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                  if (RExC_parse >= RExC_end) {
                      vFAIL("Sequence (?... not terminated");
                  }
-
-                /* FALLTHROUGH */
+                RExC_seen_zerolen++;
+                break;
             case '=':           /* (?=...) */
                 RExC_seen_zerolen++;
+                RExC_in_lookahead++;
                  break;
             case '!':           /* (?!...) */
                 RExC_seen_zerolen++;
@@ -11545,14 +11620,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                  goto gen_recurse_regop;
                  /* NOTREACHED */
              case '+':
-                if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
+                if (! inRANGE(RExC_parse[0], '1', '9')) {
                      RExC_parse++;
                      vFAIL("Illegal pattern");
                  }
                  goto parse_recursion;
                  /* NOTREACHED*/
              case '-': /* (?-1) */
-                if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
+                if (! inRANGE(RExC_parse[0], '1', '9')) {
                      RExC_parse--; /* rewind to let it be handled later */
                      goto parse_flags;
                  }
@@ -11662,7 +11737,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
             case '?':           /* (??...) */
                 is_logical = 1;
                 if (*RExC_parse != '{') {
-                    RExC_parse += SKIP_IF_CHAR(RExC_parse);
+                    RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
                      /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
                      vFAIL2utf8f(
                          "Sequence (%" UTF8f "...) not recognized",
@@ -11798,7 +11873,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                          RExC_rxi->data->data[num]=(void*)sv_dat;
                          SvREFCNT_inc_simple_void_NN(sv_dat);
                      }
-                    ret = reganode(pRExC_state, NGROUPP, num);
+                    ret = reganode(pRExC_state, GROUPPN, num);
                      goto insert_if_check_paren;
                 }
                 else if (memBEGINs(RExC_parse,
@@ -11821,7 +11896,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                          parno = 1;
                          RExC_parse++;
                      }
-                    else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
+                    else if (inRANGE(RExC_parse[0], '1', '9')) {
                          UV uv;
                          endptr = RExC_end;
                          if (grok_atoUV(RExC_parse, &uv, &endptr)
@@ -11842,7 +11917,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                     ret = reganode(pRExC_state, INSUBP, parno);
                     goto insert_if_check_paren;
                 }
-               else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
+                else if (inRANGE(RExC_parse[0], '1', '9')) {
                      /* (?(1)...) */
                     char c;
                      UV uv;
@@ -11860,7 +11935,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
  
                   insert_if_check_paren:
                     if (UCHARAT(RExC_parse) != ')') {
-                        RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+                        RExC_parse += UTF
+                                      ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+                                      : 1;
                         vFAIL("Switch condition not recognized");
                     }
                     nextchar(pRExC_state);
@@ -11922,7 +11999,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
  #endif
                     return ret;
                 }
-                RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+                RExC_parse += UTF
+                              ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+                              : 1;
                  vFAIL("Unknown switch condition (?(...))");
             }
             case '[':           /* (?[ ... ]) */
@@ -11932,6 +12011,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                 RExC_parse--; /* for vFAIL to print correctly */
                  vFAIL("Sequence (? incomplete");
                  break;
+
+            case ')':
+                if (RExC_strict) {  /* [perl #132851] */
+                    ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
+                }
+                /* FALLTHROUGH */
             default: /* e.g., (?i) */
                 RExC_parse = (char *) seqstart + 1;
                parse_flags:
@@ -11964,31 +12049,44 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
              if (! ALL_PARENS_COUNTED) {
                  /* If we are in our first pass through (and maybe only pass),
                   * we  need to allocate memory for the capturing parentheses
-                 * data structures.  Since we start at npar=1, when it reaches
-                 * 2, for the first time it has something to put in it.  Above
-                 * 2 means we extend what we already have */
-                if (RExC_npar == 2) {
+                 * data structures.
+                 */
+
+                if (!RExC_parens_buf_size) {
+                    /* first guess at number of parens we might encounter */
+                    RExC_parens_buf_size = 10;
+
                      /* setup RExC_open_parens, which holds the address of each
                       * OPEN tag, and to make things simpler for the 0 index the
                       * start of the program - this is used later for offsets */
-                    Newxz(RExC_open_parens, RExC_npar, regnode_offset);
+                    Newxz(RExC_open_parens, RExC_parens_buf_size,
+                            regnode_offset);
                      RExC_open_parens[0] = 1;    /* +1 for REG_MAGIC */
  
                      /* setup RExC_close_parens, which holds the address of each
                       * CLOSE tag, and to make things simpler for the 0 index
                       * the end of the program - this is used later for offsets
                       * */
-                    Newxz(RExC_close_parens, RExC_npar, regnode_offset);
+                    Newxz(RExC_close_parens, RExC_parens_buf_size,
+                            regnode_offset);
                      /* we dont know where end op starts yet, so we dont need to
                       * set RExC_close_parens[0] like we do RExC_open_parens[0]
                       * above */
                  }
-                else {
-                    Renew(RExC_open_parens, RExC_npar, regnode_offset);
-                    Zero(RExC_open_parens + RExC_npar - 1, 1, regnode_offset);
+                else if (RExC_npar > RExC_parens_buf_size) {
+                    I32 old_size = RExC_parens_buf_size;
+
+                    RExC_parens_buf_size *= 2;
  
-                    Renew(RExC_close_parens, RExC_npar, regnode_offset);
-                    Zero(RExC_close_parens + RExC_npar - 1, 1, regnode_offset);
+                    Renew(RExC_open_parens, RExC_parens_buf_size,
+                            regnode_offset);
+                    Zero(RExC_open_parens + old_size,
+                            RExC_parens_buf_size - old_size, regnode_offset);
+
+                    Renew(RExC_close_parens, RExC_parens_buf_size,
+                            regnode_offset);
+                    Zero(RExC_close_parens + old_size,
+                            RExC_parens_buf_size - old_size, regnode_offset);
                  }
              }
  
@@ -12070,7 +12168,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
              RETURN_FAIL_ON_RESTART(flags, flagp);
              FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
          }
-        REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
+        if (!  REGTAIL(pRExC_state, lastbr, br)) {  /* BRANCH -> BRANCH. */
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
         lastbr = br;
         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
      }
@@ -12141,7 +12241,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                            (IV)(ender - lastbr)
              );
          );
-        REGTAIL(pRExC_state, lastbr, ender);
+        if (! REGTAIL(pRExC_state, lastbr, ender)) {
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
  
         if (have_branch) {
              char is_nothing= 1;
@@ -12152,9 +12254,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
             for (br = REGNODE_p(ret); br; br = regnext(br)) {
                 const U8 op = PL_regkind[OP(br)];
                 if (op == BRANCH) {
-                    REGTAIL_STUDY(pRExC_state,
-                                  REGNODE_OFFSET(NEXTOPER(br)),
-                                  ender);
+                    if (! REGTAIL_STUDY(pRExC_state,
+                                        REGNODE_OFFSET(NEXTOPER(br)),
+                                        ender))
+                    {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                      if ( OP(NEXTOPER(br)) != NOTHING
                           || regnext(NEXTOPER(br)) != REGNODE_p(ender))
                          is_nothing= 0;
@@ -12221,7 +12326,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
              Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
             Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
             FLAGS(REGNODE_p(ret)) = flag;
-            REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
+            if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
+            {
+                REQUIRE_BRANCHJ(flagp, 0);
+            }
         }
      }
  
@@ -12252,6 +12360,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
      if (RExC_in_lookbehind) {
         RExC_in_lookbehind--;
      }
+    if (RExC_in_lookahead) {
+        RExC_in_lookahead--;
+    }
      if (after_freeze > RExC_npar)
          RExC_npar = after_freeze;
      return(ret);
@@ -12315,14 +12426,12 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
             /* FIXME adding one for every branch after the first is probably
              * excessive now we have TRIE support. (hv) */
             MARK_NAUGHTY(1);
-            if (     chain > (SSize_t) BRANCH_MAX_OFFSET
-                && ! RExC_use_BRANCHJ)
-            {
+            if (! REGTAIL(pRExC_state, chain, latest)) {
                  /* XXX We could just redo this branch, but figuring out what
-                 * bookkeeping needs to be reset is a pain */
+                 * bookkeeping needs to be reset is a pain, and it's likely
+                 * that other branches that goto END will also be too large */
                  REQUIRE_BRANCHJ(flagp, 0);
              }
-            REGTAIL(pRExC_state, chain, latest);
         }
         chain = latest;
         c++;
@@ -12810,9 +12919,9 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
          value = (U8 *) SvPV(value_sv, value_len);
  
          /* See if the result is one code point vs 0 or multiple */
-        if (value_len > 0 && value_len <= ((SvUTF8(value_sv))
-                                           ? UTF8SKIP(value)
-                                           : 1))
+        if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv))
+                                               ? UTF8SKIP(value)
+                                               : 1))
          {
              /* Here, exactly one code point.  If that isn't what is wanted,
               * fail */
@@ -12863,11 +12972,9 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
          sv_catsv(substitute_parse, value_sv);
          sv_catpv(substitute_parse, ")");
  
-#ifdef EBCDIC
          /* The value should already be native, so no need to convert on EBCDIC
           * platforms.*/
          assert(! RExC_recode_x_to_native);
-#endif
  
      }
      else {   /* \N{U+...} */
@@ -13000,12 +13107,9 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
  
          sv_catpvs(substitute_parse, ")");
  
-#ifdef EBCDIC
          /* The values are Unicode, and therefore have to be converted to native
           * on a non-Unicode (meaning non-ASCII) platform. */
-        RExC_recode_x_to_native = 1;
-#endif
-
+        SET_recode_x_to_native(1);
      }
  
      /* Here, we have the string the name evaluates to, ready to be parsed,
@@ -13030,9 +13134,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
      RExC_start = save_start;
      RExC_parse = endbrace;
      RExC_end = orig_end;
-#ifdef EBCDIC
-    RExC_recode_x_to_native = 0;
-#endif
+    SET_recode_x_to_native(0);
  
      SvREFCNT_dec_NN(substitute_parse);
  
@@ -13339,15 +13441,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             *flagp |= SIMPLE;
             goto finish_meta_pat;
         case 'K':
-           RExC_seen_zerolen++;
-           ret = reg_node(pRExC_state, KEEPS);
-           *flagp |= SIMPLE;
-           /* XXX:dmq : disabling in-place substitution seems to
-            * be necessary here to avoid cases of memory corruption, as
-            * with: C<$_="x" x 80; s/x\K/y/> -- rgs
-            */
-            RExC_seen |= REG_LOOKBEHIND_SEEN;
-           goto finish_meta_pat;
+            if (!RExC_in_lookbehind && !RExC_in_lookahead) {
+                RExC_seen_zerolen++;
+                ret = reg_node(pRExC_state, KEEPS);
+                *flagp |= SIMPLE;
+                /* XXX:dmq : disabling in-place substitution seems to
+                 * be necessary here to avoid cases of memory corruption, as
+                 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
+                 */
+                RExC_seen |= REG_LOOKBEHIND_SEEN;
+                goto finish_meta_pat;
+            }
+            else {
+                ++RExC_parse; /* advance past the 'K' */
+                vFAIL("\\K not permitted in lookahead/lookbehind");
+            }
         case 'Z':
             ret = reg_node(pRExC_state, SEOL);
             *flagp |= SIMPLE;
@@ -13711,7 +13819,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          && num >= RExC_npar
                          /* cannot be an octal escape if it starts with 8 */
                          && *RExC_parse != '8'
-                        /* cannot be an octal escape it it starts with 9 */
+                        /* cannot be an octal escape if it starts with 9 */
                          && *RExC_parse != '9'
                      ) {
                          /* Probably not meant to be a backref, instead likely
@@ -14072,13 +14180,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              UPDATE_WARNINGS_LOC(p - 1);
                              ender = result;
  
-                            if (ender < 0x100) {
  #ifdef EBCDIC
+                            if (ender < 0x100) {
                                  if (RExC_recode_x_to_native) {
                                      ender = LATIN1_TO_NATIVE(ender);
                                  }
-#endif
                             }
+#endif
                             break;
                         }
                     case 'c':
@@ -14393,18 +14501,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              has_micro_sign = TRUE;
                          }
  
-                        *(s++) = (char) (DEPENDS_SEMANTICS)
-                                        ? toFOLD(ender)
-
-                                          /* Under /u, the fold of any
-                                           * character in the 0-255 range
-                                           * happens to be its lowercase
-                                           * equivalent, except for LATIN SMALL
-                                           * LETTER SHARP S, which was handled
-                                           * above, and the MICRO SIGN, whose
-                                           * fold requires UTF-8 to represent.
-                                           * */
-                                        : toLOWER_L1(ender);
+                        *(s++) = (DEPENDS_SEMANTICS)
+                                 ? (char) toFOLD(ender)
+
+                                   /* Under /u, the fold of any character in
+                                    * the 0-255 range happens to be its
+                                    * lowercase equivalent, except for LATIN
+                                    * SMALL LETTER SHARP S, which was handled
+                                    * above, and the MICRO SIGN, whose fold
+                                    * requires UTF-8 to represent.  */
+                                 : (char) toLOWER_L1(ender);
                      }
                 } /* End of adding current character to the node */
  
@@ -14467,7 +14573,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  else {
  
                      /* Point to the first byte of the final character */
-                    s = (char *) utf8_hop((U8 *) s, -1);
+                    s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
  
                      while (s >= s0) {   /* Search backwards until find
                                             a non-problematic char */
@@ -14577,8 +14683,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
            loopdone:   /* Jumped to when encounters something that shouldn't be
                           in the node */
  
-            /* Free up any over-allocated space */
-            change_engine_size(pRExC_state, - (initial_size - STR_SZ(len)));
+            /* Free up any over-allocated space; cast is to silence bogus
+             * warning in MS VC */
+            change_engine_size(pRExC_state,
+                                - (Ptrdiff_t) (initial_size - STR_SZ(len)));
  
              /* I (khw) don't know if you can get here with zero length, but the
               * old code handled this situation by creating a zero-length EXACT
@@ -14654,7 +14762,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  RExC_emit += STR_SZ(len);
  
                  /* If the node isn't a single character, it can't be SIMPLE */
-                if (len > ((UTF) ? UVCHR_SKIP(ender) : 1)) {
+                if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) {
                      maybe_SIMPLE = 0;
                  }
  
@@ -14706,7 +14814,7 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
      assert(PL_regkind[OP(node)] == ANYOF);
  
      /* There is no bitmap for this node type */
-    if (OP(node) == ANYOFH) {
+    if (inRANGE(OP(node), ANYOFH, ANYOFHr)) {
          return;
      }
  
@@ -15815,7 +15923,9 @@ redo_curchar:
                              RExC_parse = RExC_end;
                          }
                          else if (RExC_parse != save_parse) {
-                            RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+                            RExC_parse += (UTF)
+                                          ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+                                          : 1;
                          }
                          vFAIL("Expecting '(?flags:(?[...'");
                      }
@@ -15877,8 +15987,7 @@ redo_curchar:
                                FALSE, /* Require return to be an ANYOF */
                                &current))
                  {
-                    FAIL2("panic: regclass returned failure to handle_sets, "
-                          "flags=%#" UVxf, (UV) *flagp);
+                    goto regclass_failed;
                  }
  
                  /* regclass() will return with parsing just the \ sequence,
@@ -15914,8 +16023,7 @@ redo_curchar:
                                  FALSE, /* Require return to be an ANYOF */
                                  &current))
                  {
-                    FAIL2("panic: regclass returned failure to handle_sets, "
-                          "flags=%#" UVxf, (UV) *flagp);
+                    goto regclass_failed;
                  }
  
                  if (! current) {
@@ -16276,8 +16384,7 @@ redo_curchar:
      }
  
      if (!node)
-        FAIL2("panic: regclass returned failure to handle_sets, flags=%#" UVxf,
-                    PTR2UV(flagp));
+        goto regclass_failed;
  
      /* Fix up the node type if we are in locale.  (We have pretended we are
       * under /u for the purposes of regclass(), as this construct will only
@@ -16308,6 +16415,10 @@ redo_curchar:
      nextchar(pRExC_state);
      Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
      return node;
+
+  regclass_failed:
+    FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
+                                                                (UV) *flagp);
  }
  
  #ifdef ENABLE_REGEX_SETS_DEBUGGING
@@ -16558,7 +16669,7 @@ STATIC regnode_offset
  S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                   const bool stop_at_1,  /* Just parse the next thing, don't
                                             look for a full character class */
-                 bool allow_multi_folds,
+                 bool allow_mutiple_chars,
                   const bool silence_non_portable,   /* Don't output warnings
                                                         about too large
                                                         characters */
@@ -16713,7 +16824,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */      \
      || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0          \
                                     && UNICODE_DOT_DOT_VERSION == 0)
-    allow_multi_folds = FALSE;
+    allow_mutiple_chars = FALSE;
  #endif
  
      /* We include the /i status at the beginning of this so that we can
@@ -16729,7 +16840,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      if (UCHARAT(RExC_parse) == '^') {  /* Complement the class */
         RExC_parse++;
          invert = TRUE;
-        allow_multi_folds = FALSE;
+        allow_mutiple_chars = FALSE;
          MARK_NAUGHTY(1);
          SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
      }
@@ -16921,10 +17032,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                          else { /* cp_count > 1 */
                              assert(cp_count > 1);
                              if (! RExC_in_multi_char_class) {
-                                if (invert || range || *RExC_parse == '-') {
+                                if ( ! allow_mutiple_chars
+                                    || invert
+                                    || range
+                                    || *RExC_parse == '-')
+                                {
                                      if (strict) {
                                          RExC_parse--;
-                                        vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
+                                        vFAIL("\\N{} here is restricted to one character");
                                      }
                                      ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
                                      break; /* <value> contains the first code
@@ -17002,7 +17117,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
                 }   /* The \p isn't immediately followed by a '{' */
                 else if (! isALPHA(*RExC_parse)) {
-                    RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+                    RExC_parse += (UTF)
+                                  ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+                                  : 1;
                      vFAIL2("Character following \\%c must be '{' or a "
                             "single-character Unicode property name",
                             (U8) value);
@@ -17024,6 +17141,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      SV * prop_definition = parse_uniprop_string(
                                              name, n, UTF, FOLD,
                                              FALSE, /* This is compile-time */
+
+                                            /* We can't defer this defn when
+                                             * the full result is required in
+                                             * this call */
+                                            ! cBOOL(ret_invlist),
+
                                              &user_defined,
                                              msg,
                                              0 /* Base level */
@@ -17171,7 +17294,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     RExC_parse += numlen;
                      if (numlen != 3) {
                          if (strict) {
-                            RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+                            RExC_parse += (UTF)
+                                          ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+                                          : 1;
                              vFAIL("Need exactly 3 octal digits");
                          }
                          else if (   numlen < 3 /* like \08, \178 */
@@ -17243,40 +17368,60 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              ) {
                  SV* scratch_list = NULL;
  
-                /* What the Posix classes (like \w, [:space:]) match in locale
-                 * isn't knowable under locale until actual match time.  A
+                /* What the Posix classes (like \w, [:space:]) match isn't
+                 * generally knowable under locale until actual match time.  A
                   * special node is used for these which has extra space for a
                   * bitmap, with a bit reserved for each named class that is to
-                 * be matched against.  This isn't needed for \p{} and
+                 * be matched against.  (This isn't needed for \p{} and
                   * pseudo-classes, as they are not affected by locale, and
-                 * hence are dealt with separately */
-                POSIXL_SET(posixl, namedclass);
-                has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
-                anyof_flags |= ANYOF_MATCHES_POSIXL;
-
-                /* The above-Latin1 characters are not subject to locale rules.
-                 * Just add them to the unconditionally-matched list */
-
-                /* Get the list of the above-Latin1 code points this matches */
-                _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
-                                        PL_XPosix_ptrs[classnum],
-
-                                        /* Odd numbers are complements, like
-                                        * NDIGIT, NASCII, ... */
-                                        namedclass % 2 != 0,
-                                        &scratch_list);
-                /* Checking if 'cp_list' is NULL first saves an extra clone.
-                 * Its reference count will be decremented at the next union,
-                 * etc, or if this is the only instance, at the end of the
-                 * routine */
-                if (! cp_list) {
-                    cp_list = scratch_list;
-                }
-                else {
-                    _invlist_union(cp_list, scratch_list, &cp_list);
-                    SvREFCNT_dec_NN(scratch_list);
+                 * hence are dealt with separately.)  However, if a named class
+                 * and its complement are both present, then it matches
+                 * everything, and there is no runtime dependency.  Odd numbers
+                 * are the complements of the next lower number, so xor works.
+                 * (Note that something like [\w\D] should match everything,
+                 * because \d should be a proper subset of \w.  But rather than
+                 * trust that the locale is well behaved, we leave this to
+                 * runtime to sort out) */
+                if (POSIXL_TEST(posixl, namedclass ^ 1)) {
+                    cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
+                    POSIXL_ZERO(posixl);
+                    has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
+                    anyof_flags &= ~ANYOF_MATCHES_POSIXL;
+                    continue;   /* We could ignore the rest of the class, but
+                                   best to parse it for any errors */
+                }
+                else { /* Here, isn't the complement of any already parsed
+                          class */
+                    POSIXL_SET(posixl, namedclass);
+                    has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
+                    anyof_flags |= ANYOF_MATCHES_POSIXL;
+
+                    /* The above-Latin1 characters are not subject to locale
+                     * rules.  Just add them to the unconditionally-matched
+                     * list */
+
+                    /* Get the list of the above-Latin1 code points this
+                     * matches */
+                    _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
+                                            PL_XPosix_ptrs[classnum],
+
+                                            /* Odd numbers are complements,
+                                             * like NDIGIT, NASCII, ... */
+                                            namedclass % 2 != 0,
+                                            &scratch_list);
+                    /* Checking if 'cp_list' is NULL first saves an extra
+                     * clone.  Its reference count will be decremented at the
+                     * next union, etc, or if this is the only instance, at the
+                     * end of the routine */
+                    if (! cp_list) {
+                        cp_list = scratch_list;
+                    }
+                    else {
+                        _invlist_union(cp_list, scratch_list, &cp_list);
+                        SvREFCNT_dec_NN(scratch_list);
+                    }
+                    continue;   /* Go get next character */
                  }
-                continue;   /* Go get next character */
              }
              else {
  
@@ -17438,7 +17583,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
           *  "ss"  =~ /^[^\xDF]+$/i => N
           *
           * See [perl #89750] */
-        if (FOLD && allow_multi_folds && value == prevvalue) {
+        if (FOLD && allow_mutiple_chars && value == prevvalue) {
              if (    value == LATIN_SMALL_LETTER_SHARP_S
                  || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
                                                          value)))
@@ -17612,7 +17757,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                             literal
                          );
                  }
-                else if isMNEMONIC_CNTRL(value) {
+                else if (isMNEMONIC_CNTRL(value)) {
                      vWARN4(RExC_parse,
                             "\"%.*s\" is more clearly written simply as \"%s\"",
                             (int) (RExC_parse - rangebegin),
@@ -18657,7 +18802,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      RExC_emit += 1 + STR_SZ(len);
                      STR_LEN(REGNODE_p(ret)) = len;
                      if (len == 1) {
-                        *STRING(REGNODE_p(ret)) = value;
+                        *STRING(REGNODE_p(ret)) = (U8) value;
                      }
                      else {
                          uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
@@ -18743,7 +18888,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
                      full_cp_count += this_end - this_start + 1;
                  }
-                invlist_iterfinish(cp_list);
  
                  /* At the end of the loop, we count how many bits differ from
                   * the bits in lowest code point, call the count 'd'.  If the
@@ -18772,8 +18916,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      ret = reganode(pRExC_state, op, lowest_cp);
                      FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
                  }
+
+              done_anyofm:
+                invlist_iterfinish(cp_list);
              }
-          done_anyofm:
  
              if (inverted) {
                  _invlist_invert(cp_list);
@@ -18898,26 +19044,92 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              SvREFCNT_dec(intersection);
          }
  
-        /* If didn't find an optimization and there is no need for a
-        * bitmap, optimize to indicate that */
+        /* If didn't find an optimization and there is no need for a bitmap,
+         * optimize to indicate that */
          if (     start[0] >= NUM_ANYOF_CODE_POINTS
              && ! LOC
-            && ! upper_latin1_only_utf8_matches)
+            && ! upper_latin1_only_utf8_matches
+            &&   anyof_flags == 0)
          {
+            U8 low_utf8[UTF8_MAXBYTES+1];
+            UV highest_cp = invlist_highest(cp_list);
+
              op = ANYOFH;
+
+            /* Currently the maximum allowed code point by the system is
+             * IV_MAX.  Higher ones are reserved for future internal use.  This
+             * particular regnode can be used for higher ones, but we can't
+             * calculate the code point of those.  IV_MAX suffices though, as
+             * it will be a large first byte */
+            (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+
+            /* We store the lowest possible first byte of the UTF-8
+             * representation, using the flags field.  This allows for quick
+             * ruling out of some inputs without having to convert from UTF-8
+             * to code point.  For EBCDIC, this has to be I8. */
+            anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
+
+            /* If the first UTF-8 start byte for the highest code point in the
+             * range is suitably small, we may be able to get an upper bound as
+             * well */
+            if (highest_cp <= IV_MAX) {
+                U8 high_utf8[UTF8_MAXBYTES+1];
+
+                (void) uvchr_to_utf8(high_utf8, highest_cp);
+
+                /* If the lowest and highest are the same, we can get an exact
+                 * first byte instead of just a minimum.  We signal this with a
+                 * different regnode */
+                if (low_utf8[0] == high_utf8[0]) {
+
+                    /* No need to convert to I8 for EBCDIC as this is an exact
+                     * match */
+                    anyof_flags = low_utf8[0];
+                    op = ANYOFHb;
+                }
+                else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
+                {
+
+                    /* Here, the high byte is not the same as the low, but is
+                     * small enough that it's reasonable to have a loose upper
+                     * bound, which is packed in with the strict lower bound.
+                     * See comments at the definition of MAX_ANYOF_HRx_BYTE.
+                     * On EBCDIC platforms, I8 is used.  On ASCII platforms I8
+                     * is the same thing as UTF-8 */
+
+                    U8 bits = 0;
+                    U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
+                    U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
+                                  - anyof_flags;
+
+                    if (range_diff <= max_range_diff / 8) {
+                        bits = 3;
+                    }
+                    else if (range_diff <= max_range_diff / 4) {
+                        bits = 2;
+                    }
+                    else if (range_diff <= max_range_diff / 2) {
+                        bits = 1;
+                    }
+                    anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
+                    op = ANYOFHr;
+                }
+            }
+
+            goto done_finding_op;
          }
      }   /* End of seeing if can optimize it into a different node */
  
    is_anyof: /* It's going to be an ANYOF node. */
-    if (op != ANYOFH) {
-        op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
-             ? ANYOFD
-             : ((posixl)
-                ? ANYOFPOSIXL
-                : ((LOC)
-                   ? ANYOFL
-                   : ANYOF));
-    }
+    op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
+         ? ANYOFD
+         : ((posixl)
+            ? ANYOFPOSIXL
+            : ((LOC)
+               ? ANYOFL
+               : ANYOF));
+
+  done_finding_op:
  
      ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
      FILL_NODE(ret, op);        /* We set the argument later */
@@ -19104,6 +19316,7 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
                                                             stored here for just
                                                             this occasion */
                              TRUE,           /* run time */
+                            FALSE,          /* This call must find the defn */
                              si,             /* The property definition  */
                              &user_defined,
                              msg,
@@ -19380,7 +19593,9 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
                 || UTF8_IS_INVARIANT(*RExC_parse)
                 || UTF8_IS_START(*RExC_parse));
  
-        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+        RExC_parse += (UTF)
+                      ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+                      : 1;
  
          skip_to_be_ignored_text(pRExC_state, &RExC_parse,
                                  FALSE /* Don't force /x */ );
@@ -19550,7 +19765,11 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
      src = REGNODE_p(RExC_emit);
      RExC_emit += size;
      dst = REGNODE_p(RExC_emit);
-    if (RExC_open_parens) {
+
+    /* If we are in a "count the parentheses" pass, the numbers are unreliable,
+     * and [perl #133871] shows this can lead to problems, so skip this
+     * realignment of parens until a later pass when they are reliable */
+    if (! IN_PARENS_PASS && RExC_open_parens) {
          int paren;
          /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
          /* remember that RExC_npar is rex->nparens + 1,
@@ -19623,10 +19842,13 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
  }
  
  /*
-- regtail - set the next-pointer at the end of a node chain of p to val.
+- regtail - set the next-pointer at the end of a node chain of p to val.  If
+            that value won't fit in the space available, instead returns FALSE.
+            (Except asserts if we can't fit in the largest space the regex
+            engine is designed for.)
  - SEE ALSO: regtail_study
  */
-STATIC void
+STATIC bool
  S_regtail(pTHX_ RExC_state_t * pRExC_state,
                  const regnode_offset p,
                  const regnode_offset val,
@@ -19659,11 +19881,21 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state,
      }
  
      if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
+        assert((UV) (val - scan) <= U32_MAX);
          ARG_SET(REGNODE_p(scan), val - scan);
      }
      else {
+        if (val - scan > U16_MAX) {
+            /* Since not all callers check the return value, populate this with
+             * something that won't loop and will likely lead to a crash if
+             * execution continues */
+            NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
+            return FALSE;
+        }
          NEXT_OFF(REGNODE_p(scan)) = val - scan;
      }
+
+    return TRUE;
  }
  
  #ifdef DEBUGGING
@@ -19680,10 +19912,14 @@ that it is purely analytical.
  Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
  to control which is which.
  
+This used to return a value that was ignored.  It was a problem that it is
+#ifdef'd to be another function that didn't return a value.  khw has changed it
+so both currently return a pass/fail return.
+
  */
  /* TODO: All four parms should be const */
  
-STATIC U8
+STATIC bool
  S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
                        const regnode_offset val, U32 depth)
  {
@@ -19707,7 +19943,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
             bool unfolded_multi_char;   /* Unexamined in this routine */
              if (join_exact(pRExC_state, scan, &min,
                             &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
-                return EXACT;
+                return TRUE; /* Was return EXACT */
         }
  #endif
          if ( exact ) {
@@ -19757,13 +19993,18 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
          );
      });
      if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
+        assert((UV) (val - scan) <= U32_MAX);
         ARG_SET(REGNODE_p(scan), val - scan);
      }
      else {
+        if (val - scan > U16_MAX) {
+            NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
+            return FALSE;
+        }
         NEXT_OFF(REGNODE_p(scan)) = val - scan;
      }
  
-    return exact;
+    return TRUE; /* Was 'return exact' */
  }
  #endif
  
@@ -20034,11 +20275,16 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
  
      SvPVCLEAR(sv);
  
-    if (OP(o) > REGNODE_MAX)           /* regnode.type is unsigned */
-       /* It would be nice to FAIL() here, but this may be called from
-          regexec.c, and it would be hard to supply pRExC_state. */
-       Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
-                                              (int)OP(o), (int)REGNODE_MAX);
+    if (OP(o) > REGNODE_MAX) {          /* regnode.type is unsigned */
+        if (pRExC_state) {  /* This gives more info, if we have it */
+            FAIL3("panic: corrupted regexp opcode %d > %d",
+                  (int)OP(o), (int)REGNODE_MAX);
+        }
+        else {
+            Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
+                             (int)OP(o), (int)REGNODE_MAX);
+        }
+    }
      sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
  
      k = PL_regkind[OP(o)];
@@ -20121,7 +20367,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
              name_list= RExC_paren_name_list;
          }
          if (name_list) {
-            if ( k != REF || (OP(o) < NREF)) {
+            if ( k != REF || (OP(o) < REFN)) {
                  SV **name= av_fetch(name_list, parno, 0 );
                 if (name)
                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
@@ -20175,7 +20421,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          /* 2: embedded, otherwise 1 */
         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
      else if (k == ANYOF) {
-       const U8 flags = ANYOF_FLAGS(o);
+       const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
+                          ? 0
+                          : ANYOF_FLAGS(o);
          bool do_sep = FALSE;    /* Do we need to separate various components of
                                     the output? */
          /* Set if there is still an unresolved user-defined property */
@@ -20231,7 +20479,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          /* Ready to start outputting.  First, the initial left bracket */
         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
  
-        if (OP(o) != ANYOFH) {
+        if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
              /* Then all the things that could fit in the bitmap */
              do_sep = put_charclass_bitmap_innards(sv,
                                                    ANYOF_BITMAP(o),
@@ -20329,6 +20577,22 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          /* And finally the matching, closing ']' */
         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
  
+        if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+            U8 lowest = (OP(o) != ANYOFHr)
+                         ? FLAGS(o)
+                         : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
+            U8 highest = (OP(o) == ANYOFHb)
+                         ? lowest
+                         : OP(o) == ANYOFH
+                           ? 0xFF
+                           : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
+            Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
+            if (lowest != highest) {
+                Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+            }
+            Perl_sv_catpvf(aTHX_ sv, ")");
+        }
+
          SvREFCNT_dec(unresolved);
      }
      else if (k == ANYOFM) {
@@ -20371,8 +20635,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
          sv_catpv(sv, bounds[FLAGS(o)]);
      }
-    else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
-       Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
+    else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
+       Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
+        if (o->next_off) {
+            Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
+        }
+       Perl_sv_catpvf(aTHX_ sv, "]");
+    }
      else if (OP(o) == SBOL)
          Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
  
@@ -20516,7 +20785,23 @@ Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
      if (!dsv)
         dsv = (REGEXP*) newSV_type(SVt_REGEXP);
      else {
+        assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
+
+        /* our only valid caller, sv_setsv_flags(), should have done
+         * a SV_CHECK_THINKFIRST_COW_DROP() by now */
+        assert(!SvOOK(dsv));
+        assert(!SvIsCOW(dsv));
+        assert(!SvROK(dsv));
+
+        if (SvPVX_const(dsv)) {
+            if (SvLEN(dsv))
+                Safefree(SvPVX(dsv));
+            SvPVX(dsv) = NULL;
+        }
+        SvLEN_set(dsv, 0);
+        SvCUR_set(dsv, 0);
         SvOK_off((SV *)dsv);
+
         if (islv) {
             /* For PVLVs, the head (sv_any) points to an XPVLV, while
               * the LV's xpvlenu_rx will point to a regexp body, which
@@ -20807,6 +21092,11 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
                2: something we no longer hold a reference on
                so we need to copy it locally.  */
      RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
+    /* set malloced length to a non-zero value so it will be freed
+     * (otherwise in combination with SVf_FAKE it looks like an alien
+     * buffer). It doesn't have to be the actual malloced size, since it
+     * should never be grown */
+    SvLEN_set(dstr, SvCUR(sstr)+1);
      ret->mother_re   = NULL;
  }
  #endif /* PERL_IN_XSUB_RE */
@@ -21967,6 +22257,8 @@ Perl_handle_user_defined_property(pTHX_
      const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
      const bool to_fold,         /* ? Is this under /i */
      const bool runtime,         /* ? Are we in compile- or run-time */
+    const bool deferrable,      /* Is it ok for this property's full definition
+                                   to be deferred until later? */
      SV* contents,               /* The property's definition */
      bool *user_defined_ptr,     /* This will be set TRUE as we wouldn't be
                                     getting called unless this is thought to be
@@ -22044,7 +22336,7 @@ Perl_handle_user_defined_property(pTHX_
                  Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
                                       UTF8fARG(is_contents_utf8, s - s0, s0));
                  sv_catpvs(msg, "\"");
-                goto return_msg;
+                goto return_failure;
              }
  
              /* Accumulate this digit into the value */
@@ -22079,7 +22371,7 @@ Perl_handle_user_defined_property(pTHX_
                      Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
                                        UTF8fARG(is_contents_utf8, s - s0, s0));
                      sv_catpvs(msg, "\"");
-                    goto return_msg;
+                    goto return_failure;
                  }
  
                  max = (max << 4) + READ_XDIGIT(s);
@@ -22107,7 +22399,7 @@ Perl_handle_user_defined_property(pTHX_
              Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
                                  UTF8fARG(is_contents_utf8, s - s0, s0));
              sv_catpvs(msg, "\"");
-            goto return_msg;
+            goto return_failure;
          }
  
  #if 0   /* See explanation at definition above of get_extended_utf8_msg() */
@@ -22154,6 +22446,7 @@ Perl_handle_user_defined_property(pTHX_
  
          this_definition = parse_uniprop_string(s0, s - s0,
                                                 is_utf8, to_fold, runtime,
+                                               deferrable,
                                                 user_defined_ptr, msg,
                                                 (name_len == 0)
                                                  ? level /* Don't increase level
@@ -22161,8 +22454,8 @@ Perl_handle_user_defined_property(pTHX_
                                                  : level + 1
                                                );
          if (this_definition == NULL) {
-            goto return_msg;    /* 'msg' should have had the reason appended to
-                                   it by the above call */
+            goto return_failure;    /* 'msg' should have had the reason
+                                       appended to it by the above call */
          }
  
          if (! is_invlist(this_definition)) {    /* Unknown at this time */
@@ -22219,6 +22512,10 @@ Perl_handle_user_defined_property(pTHX_
      }
  
      /* Otherwise, add some explanatory text, but we will return success */
+    goto return_msg;
+
+  return_failure:
+    running_definition = NULL;
  
    return_msg:
  
@@ -22277,6 +22574,38 @@ S_delete_recursion_entry(pTHX_ void *key)
      RESTORE_CONTEXT;
  }
  
+STATIC SV *
+S_get_fq_name(pTHX_
+              const char * const name,    /* The first non-blank in the \p{}, \P{} */
+              const Size_t name_len,      /* Its length in bytes, not including any trailing space */
+              const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
+              const bool has_colon_colon  /* ? Does 'name' already include a package ("::") */
+             )
+{
+    /* Returns a mortal SV containing the fully qualified version of the input
+     * name; the current package and "::" are prepended if it lacks one */
+
+    SV * fq_name;
+
+    fq_name = newSVpvs_flags("", SVs_TEMP); /* Start empty; built up below */
+
+    /* Use the current package if it wasn't included in our input */
+    if (! has_colon_colon) {
+        const HV * pkg = (IN_PERL_COMPILETIME)
+                         ? PL_curstash
+                         : CopSTASH(PL_curcop);
+        const char* pkgname = HvNAME(pkg); /* NOTE(review): presumed non-NULL; strlen() below needs that */
+
+        Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
+                      UTF8fARG(is_utf8, strlen(pkgname), pkgname));
+        sv_catpvs(fq_name, "::");
+    }
+
+    Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
+                         UTF8fARG(is_utf8, name_len, name)); /* Append 'name' as given */
+    return fq_name;
+}
+
  SV *
  Perl_parse_uniprop_string(pTHX_
  
@@ -22305,6 +22634,8 @@ Perl_parse_uniprop_string(pTHX_
      const bool is_utf8,         /* ? Is 'name' encoded in UTF-8 */
      const bool to_fold,         /* ? Is this under /i */
      const bool runtime,         /* TRUE if this is being called at run time */
+    const bool deferrable,      /* TRUE if it's ok for the definition to not be
+                                   known at this call */
      bool *user_defined_ptr,     /* Upon return from this function it will be
                                     set to TRUE if any component is a
                                     user-defined property */
@@ -22329,8 +22660,7 @@ Perl_parse_uniprop_string(pTHX_
      int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
      int table_index = 0;    /* The entry number for this property in the table
                                 of all Unicode property names */
-    bool starts_with_In_or_Is = FALSE;  /* ? Does the name start with 'In' or
-                                             'Is' */
+    bool starts_with_Is = FALSE;  /* ? Does the name start with 'Is' */
      Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
                                     the normalized name in certain situations */
      Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
@@ -22343,6 +22673,8 @@ Perl_parse_uniprop_string(pTHX_
                                       it is the definition.  Otherwise it is a
                                       string containing the fully qualified sub
                                       name of 'name' */
+    SV * fq_name = NULL;        /* For user-defined properties, the fully
+                                   qualified name */
      bool invert_return = FALSE; /* ? Do we need to complement the result before
                                       returning it */
  
@@ -22489,7 +22821,8 @@ Perl_parse_uniprop_string(pTHX_
                  pos_in_brackets = strchr("([<)]>)]>", open);
                  close = (pos_in_brackets) ? pos_in_brackets[3] : open;
  
-                if (   name[name_len-1] != close
+                if (    i >= name_len
+                    ||  name[name_len-1] != close
                      || (escaped && name[name_len-2] != '\\'))
                  {
                      sv_catpvs(msg, "Unicode property wildcard not terminated");
@@ -22545,6 +22878,7 @@ Perl_parse_uniprop_string(pTHX_
                                                             is_utf8,
                                                             to_fold,
                                                             runtime,
+                                                           deferrable,
                                                             user_defined_ptr,
                                                             msg,
                                                             level + 1);
@@ -22616,7 +22950,7 @@ Perl_parse_uniprop_string(pTHX_
          /* Certain properties whose values are numeric need special handling.
           * They may optionally be prefixed by 'is'.  Ignore that prefix for the
           * purposes of checking if this is one of those properties */
-        if (memBEGINPs(lookup_name, name_len, "is")) {
+        if (memBEGINPs(lookup_name, j, "is")) {
              lookup_offset = 2;
          }
  
@@ -22782,7 +23116,9 @@ Perl_parse_uniprop_string(pTHX_
              }
  
              /* Store the first real character in the denominator */
-            lookup_name[j++] = name[i];
+            if (i < name_len) {
+                lookup_name[j++] = name[i];
+            }
          }
      }
  
@@ -22800,11 +23136,15 @@ Perl_parse_uniprop_string(pTHX_
  
      /* If the original input began with 'In' or 'Is', it could be a subroutine
       * call to a user-defined property instead of a Unicode property name. */
-    if (    non_pkg_begin + name_len > 2
+    if (    name_len - non_pkg_begin > 2
          &&  name[non_pkg_begin+0] == 'I'
          && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
      {
-        starts_with_In_or_Is = TRUE;
+        /* Names that start with In have different characteristics than those
+         * that start with Is */
+        if (name[non_pkg_begin+1] == 's') {
+            starts_with_Is = TRUE;
+        }
      }
      else {
          could_be_user_defined = FALSE;
@@ -22813,20 +23153,28 @@ Perl_parse_uniprop_string(pTHX_
      if (could_be_user_defined) {
          CV* user_sub;
  
+        /* If the user defined property returns the empty string, it could
+         * easily be because the pattern is being compiled before the data it
+         * actually needs to compile is available.  This could be argued to be
+         * a bug in the perl code, but this is a change of behavior for Perl,
+         * so we handle it.  This means that intentionally returning nothing
+         * will not be resolved until runtime */
+        bool empty_return = FALSE;
+
          /* Here, the name could be for a user defined property, which are
           * implemented as subs. */
          user_sub = get_cvn_flags(name, name_len, 0);
          if (user_sub) {
+            const char insecure[] = "Insecure user-defined property";
  
              /* Here, there is a sub by the correct name.  Normally we call it
               * to get the property definition */
              dSP;
              SV * user_sub_sv = MUTABLE_SV(user_sub);
              SV * error;     /* Any error returned by calling 'user_sub' */
-            SV * fq_name;   /* Fully qualified property name */
+            SV * key;       /* The key into the hash of user defined sub names
+                             */
              SV * placeholder;
-            char to_fold_string[] = "0:";   /* The 0 gets overwritten with the
-                                               actual value */
              SV ** saved_user_prop_ptr;      /* Hash entry for this property */
  
              /* How many times to retry when another thread is in the middle of
@@ -22838,11 +23186,11 @@ Perl_parse_uniprop_string(pTHX_
              /* If we get here, we know this property is user-defined */
              *user_defined_ptr = TRUE;
  
-            /* We refuse to call a tainted subroutine; returning an error
-             * instead */
+            /* We refuse to call a potentially tainted subroutine; returning an
+             * error instead */
              if (TAINT_get) {
                  if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
-                sv_catpvs(msg, "Insecure user-defined property");
+                sv_catpvn(msg, insecure, sizeof(insecure) - 1);
                  goto append_name_to_msg;
              }
  
@@ -22856,14 +23204,13 @@ Perl_parse_uniprop_string(pTHX_
               * should the need arise, passing the /i status as a parameter.
               *
               * We start by constructing the hash key name, consisting of the
-             * fully qualified subroutine name */
-            fq_name = sv_2mortal(newSV(10));    /* 10 is just a guess */
-            (void) cv_name(user_sub, fq_name, 0);
-
-            /* But precede the sub name in the key with the /i status, so that
-             * there is a key for /i and a different key for non-/i */
-            to_fold_string[0] = to_fold + '0';
-            sv_insert(fq_name, 0, 0, to_fold_string, 2);
+             * fully qualified subroutine name, preceded by the /i status, so
+             * that there is a key for /i and a different key for non-/i */
+            key = newSVpvn(((to_fold) ? "1" : "0"), 1);
+            fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
+                                          non_pkg_begin != 0);
+            sv_catsv(key, fq_name);
+            sv_2mortal(key);
  
              /* We only call the sub once throughout the life of the program
               * (with the /i, non-/i exception noted above).  That means the
@@ -22913,7 +23260,7 @@ Perl_parse_uniprop_string(pTHX_
              /* If we have an entry for this key, the subroutine has already
               * been called once with this /i status. */
              saved_user_prop_ptr = hv_fetch(PL_user_def_props,
-                                           SvPVX(fq_name), SvCUR(fq_name), 0);
+                                                   SvPVX(key), SvCUR(key), 0);
              if (saved_user_prop_ptr) {
  
                  /* If the saved result is an inversion list, it is the valid
@@ -22981,13 +23328,14 @@ Perl_parse_uniprop_string(pTHX_
               * for this property in the hash.  So we have the go ahead to
               * expand the definition ourselves. */
  
+            PUSHSTACKi(PERLSI_MAGIC);
              ENTER;
  
              /* Create a temporary placeholder in the hash to detect recursion
               * */
              SWITCH_TO_GLOBAL_CONTEXT;
              placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
-            (void) hv_store_ent(PL_user_def_props, fq_name, placeholder, 0);
+            (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
              RESTORE_CONTEXT;
  
              /* Now that we have a placeholder, we can let other threads
@@ -22995,7 +23343,7 @@ Perl_parse_uniprop_string(pTHX_
              USER_PROP_MUTEX_UNLOCK;
  
              /* Make sure the placeholder always gets destroyed */
-            SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(fq_name));
+            SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
  
              PUSHMARK(SP);
              SAVETMPS;
@@ -23006,16 +23354,33 @@ Perl_parse_uniprop_string(pTHX_
              XPUSHs(boolSV(to_fold));
              PUTBACK;
  
+            /* The following block was taken from swash_init().  Presumably
+             * it applies here as well, though we no longer use a swash --
+             * khw */
+            SAVEHINTS();
+            save_re_context();
+            /* We might get here via a subroutine signature which uses a utf8
+             * parameter name, at which point PL_subname will have been set
+             * but not yet used. */
+            save_item(PL_subname);
+
              (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
  
              SPAGAIN;
  
              error = ERRSV;
-            if (SvTRUE(error)) {
+            if (TAINT_get || SvTRUE(error)) {
                  if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
-                sv_catpvs(msg, "Error \"");
-                sv_catsv(msg, error);
-                sv_catpvs(msg, "\"");
+                if (SvTRUE(error)) {
+                    sv_catpvs(msg, "Error \"");
+                    sv_catsv(msg, error);
+                    sv_catpvs(msg, "\"");
+                }
+                if (TAINT_get) {
+                    if (SvTRUE(error)) sv_catpvs(msg, "; ");
+                    sv_catpvn(msg, insecure, sizeof(insecure) - 1);
+                }
+
                  if (name_len > 0) {
                      sv_catpvs(msg, " in expansion of ");
                      Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
@@ -23027,32 +23392,47 @@ Perl_parse_uniprop_string(pTHX_
                  prop_definition = NULL;
              }
              else {  /* G_SCALAR guarantees a single return value */
+                SV * contents = POPs;
  
                  /* The contents is supposed to be the expansion of the property
-                 * definition.  Call a function to check for valid syntax and
-                 * handle it */
-                prop_definition = handle_user_defined_property(name, name_len,
+                 * definition.  If the definition is deferrable, and we got an
+                 * empty string back, set a flag to later defer it (after clean
+                 * up below). */
+                if (      deferrable
+                    && (! SvPOK(contents) || SvCUR(contents) == 0))
+                {
+                        empty_return = TRUE;
+                }
+                else { /* Otherwise, call a function to check for valid syntax,
+                          and handle it */
+
+                    prop_definition = handle_user_defined_property(
+                                                    name, name_len,
                                                      is_utf8, to_fold, runtime,
-                                                    POPs, user_defined_ptr,
+                                                    deferrable,
+                                                    contents, user_defined_ptr,
                                                      msg,
                                                      level);
+                }
              }
  
-            /* Here, we have the results of the expansion.  Replace the
-             * placeholder with them.  We need exclusive access to the hash,
-             * and we can't let anyone else in, between when we delete the
-             * placeholder and add the permanent entry */
+            /* Here, we have the results of the expansion.  Delete the
+             * placeholder, and if the definition is now known, replace it with
+             * that definition.  We need exclusive access to the hash, and we
+             * can't let anyone else in, between when we delete the placeholder
+             * and add the permanent entry */
              USER_PROP_MUTEX_LOCK;
  
-            S_delete_recursion_entry(aTHX_ SvPVX(fq_name));
-
-            if (! prop_definition || is_invlist(prop_definition)) {
+            S_delete_recursion_entry(aTHX_ SvPVX(key));
  
+            if (    ! empty_return
+                && (! prop_definition || is_invlist(prop_definition)))
+            {
                  /* If we got success we use the inversion list defining the
                   * property; otherwise use the error message */
                  SWITCH_TO_GLOBAL_CONTEXT;
                  (void) hv_store_ent(PL_user_def_props,
-                                    fq_name,
+                                    key,
                                      ((prop_definition)
                                       ? newSVsv(prop_definition)
                                       : newSVsv(msg)),
@@ -23066,6 +23446,11 @@ Perl_parse_uniprop_string(pTHX_
  
              FREETMPS;
              LEAVE;
+            POPSTACK;
+
+            if (empty_return) {
+                goto definition_deferred;
+            }
  
              if (prop_definition) {
  
@@ -23098,8 +23483,11 @@ Perl_parse_uniprop_string(pTHX_
      /* If it didn't find the property ... */
      if (table_index == 0) {
  
-        /* Try again stripping off any initial 'In' or 'Is' */
-        if (starts_with_In_or_Is) {
+        /* Try again stripping off any initial 'Is'.  This is because we
+         * promise that an initial Is is optional.  The same isn't true of
+         * names that start with 'In'.  Those can match only blocks, and the
+         * lookup table already has those accounted for. */
+        if (starts_with_Is) {
              lookup_name += 2;
              lookup_len -= 2;
              equals_pos -= 2;
@@ -23122,7 +23510,7 @@ Perl_parse_uniprop_string(pTHX_
                   * compile time, it might just be that the subroutine for that
                   * property hasn't been encountered yet, but at runtime, it's
                   * an error to try to use an undefined one */
-                if (runtime) {
+                if (! deferrable) {
                      if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
                      sv_catpvs(msg, "Unknown user-defined property name");
                      goto append_name_to_msg;
@@ -23142,10 +23530,12 @@ Perl_parse_uniprop_string(pTHX_
                   * NV. */
  
                  NV value;
+                SSize_t value_len = lookup_len - equals_pos;
  
                  /* Get the value */
-                if (my_atof3(lookup_name + equals_pos, &value,
-                             lookup_len - equals_pos)
+                if (   value_len <= 0
+                    || my_atof3(lookup_name + equals_pos, &value,
+                                value_len)
                            != lookup_name + lookup_len)
                  {
                      goto failed;
@@ -23355,6 +23745,7 @@ Perl_parse_uniprop_string(pTHX_
                                                            0, /* Not UTF-8 */
                                                            0, /* Not folded */
                                                            runtime,
+                                                          deferrable,
                                                            pu_definition,
                                                            &dummy,
                                                            msg,
@@ -23410,28 +23801,17 @@ Perl_parse_uniprop_string(pTHX_
    definition_deferred:
  
      /* Here it could yet to be defined, so defer evaluation of this
-     * until its needed at runtime. */
-    prop_definition = newSVpvs_flags("", SVs_TEMP);
-
-    /* To avoid any ambiguity, the package is always specified.
-     * Use the current one if it wasn't included in our input */
-    if (non_pkg_begin == 0) {
-        const HV * pkg = (IN_PERL_COMPILETIME)
-                         ? PL_curstash
-                         : CopSTASH(PL_curcop);
-        const char* pkgname = HvNAME(pkg);
-
-        Perl_sv_catpvf(aTHX_ prop_definition, "%" UTF8f,
-                      UTF8fARG(is_utf8, strlen(pkgname), pkgname));
-        sv_catpvs(prop_definition, "::");
+     * until it's needed at runtime.  We need the fully qualified property name
+     * to avoid ambiguity, and a trailing newline */
+    if (! fq_name) {
+        fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
+                                      non_pkg_begin != 0 /* If has "::" */
+                               );
      }
-
-    Perl_sv_catpvf(aTHX_ prop_definition, "%" UTF8f,
-                         UTF8fARG(is_utf8, name_len, name));
-    sv_catpvs(prop_definition, "\n");
+    sv_catpvs(fq_name, "\n");
  
      *user_defined_ptr = TRUE;
-    return prop_definition;
+    return fq_name;
  }
  
  #endif