Mailing list archaeology, restoring old content

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 3ad09c5..f007d90 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -131,6 +131,8 @@ struct RExC_state_t {
      char       *parse;                 /* Input-scan pointer. */
      char        *copy_start;            /* start of copy of input within
                                             constructed parse string */
+    char        *save_copy_start;       /* Provides one level of saving
+                                           and restoring 'copy_start' */
      char        *copy_start_in_input;   /* Position in input string
                                             corresponding to copy_start */
      SSize_t    whilem_seen;            /* number of WHILEM in this expr */
@@ -180,11 +182,10 @@ struct RExC_state_t {
                                             through */
      U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
      I32                in_lookbehind;
+    I32                in_lookahead;
      I32                contains_locale;
      I32                override_recoding;
-#ifdef EBCDIC
-    I32                recode_x_to_native;
-#endif
+    I32         recode_x_to_native;
      I32                in_multi_char_class;
      struct reg_code_blocks *code_blocks;/* positions of literal (?{})
                                             within pattern */
@@ -229,6 +230,7 @@ struct RExC_state_t {
  #define RExC_precomp   (pRExC_state->precomp)
  #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
  #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
+#define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start)
  #define RExC_precomp_end (pRExC_state->precomp_end)
  #define RExC_rx_sv     (pRExC_state->rx_sv)
  #define RExC_rx                (pRExC_state->rx)
@@ -241,7 +243,6 @@ struct RExC_state_t {
  #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
                                                     under /d from /u ? */
  
-
  #ifdef RE_TRACK_PATTERN_OFFSETS
  #  define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
                                                           others */
@@ -270,10 +271,17 @@ struct RExC_state_t {
  #define RExC_study_chunk_recursed_bytes  \
                                     (pRExC_state->study_chunk_recursed_bytes)
  #define RExC_in_lookbehind     (pRExC_state->in_lookbehind)
+#define RExC_in_lookahead      (pRExC_state->in_lookahead)
  #define RExC_contains_locale   (pRExC_state->contains_locale)
+#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
+
  #ifdef EBCDIC
-#   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
+#  define SET_recode_x_to_native(x)                                         \
+                    STMT_START { RExC_recode_x_to_native = (x); } STMT_END
+#else
+#  define SET_recode_x_to_native(x) NOOP
  #endif
+
  #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
  #define RExC_frame_head (pRExC_state->frame_head)
  #define RExC_frame_last (pRExC_state->frame_last)
@@ -369,12 +377,8 @@ struct RExC_state_t {
  #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
      STMT_START {                                                            \
                  RExC_use_BRANCHJ = 1;                                       \
-                if (LIKELY(! IN_PARENS_PASS)) {                             \
-                    /* No need to restart the parse immediately if we're    \
-                     * going to reparse anyway to count parens */           \
-                    *flagp |= RESTART_PARSE;                                \
-                    return restart_retval;                                  \
-                }                                                           \
+                *flagp |= RESTART_PARSE;                                    \
+                return restart_retval;                                      \
      } STMT_END
  
  /* Until we have completed the parse, we leave RExC_total_parens at 0 or
@@ -745,6 +749,10 @@ static const scan_data_t zero_scan_data = {
      Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",      \
             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
  
+#define        FAIL3(msg,arg1,arg2) _FAIL(                         \
+    Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",      \
+     arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
+
  /*
   * Simple_vFAIL -- like FAIL, but marks the current location in the scan
   */
@@ -821,8 +829,13 @@ static const scan_data_t zero_scan_data = {
  } STMT_END
  
  /* Setting this to NULL is a signal to not output warnings */
-#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE RExC_copy_start_in_constructed = NULL
-#define RESTORE_WARNINGS RExC_copy_start_in_constructed = RExC_precomp
+#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
+    STMT_START {                                                            \
+      RExC_save_copy_start_in_constructed  = RExC_copy_start_in_constructed;\
+      RExC_copy_start_in_constructed = NULL;                                \
+    } STMT_END
+#define RESTORE_WARNINGS                                                    \
+    RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
  
  /* Since a warning can be generated multiple times as the input is reparsed, we
   * output it the first time we come to that point in the parse, but suppress it
@@ -1574,7 +1587,9 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
      unsigned int i;
      const U32 n = ARG(node);
      bool new_node_has_latin1 = FALSE;
-    const U8 flags = OP(node) == ANYOFH ? 0 : ANYOF_FLAGS(node);
+    const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
+                      ? 0
+                      : ANYOF_FLAGS(node);
  
      PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
  
@@ -1627,7 +1642,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
      }
  
      /* Add in the points from the bit map */
-    if (OP(node) != ANYOFH) {
+    if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
          for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
              if (ANYOF_BITMAP_TEST(node, i)) {
                  unsigned int start = i++;
@@ -1714,7 +1729,9 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
       * another SSC or a regular ANYOF class.  Can create false positives. */
  
      SV* anded_cp_list;
-    U8  and_with_flags = (OP(and_with) == ANYOFH) ? 0 : ANYOF_FLAGS(and_with);
+    U8  and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
+                          ? 0
+                          : ANYOF_FLAGS(and_with);
      U8  anded_flags;
  
      PERL_ARGS_ASSERT_SSC_AND;
@@ -1898,7 +1915,9 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
  
      SV* ored_cp_list;
      U8 ored_flags;
-    U8  or_with_flags = (OP(or_with) == ANYOFH) ? 0 : ANYOF_FLAGS(or_with);
+    U8  or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
+                         ? 0
+                         : ANYOF_FLAGS(or_with);
  
      PERL_ARGS_ASSERT_SSC_OR;
  
@@ -2512,7 +2531,8 @@ is the recommended Unicode-aware way of saying
         if (UTF) {                                                         \
              SV *zlopp = newSV(UTF8_MAXBYTES);                             \
             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
-            unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
+            unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val);     \
+            *kapow = '\0';                                                 \
             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
             SvPOK_on(zlopp);                                               \
             SvUTF8_on(zlopp);                                              \
@@ -3531,9 +3551,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                      if ( state==1 ) {
                          OP( convert ) = nodetype;
                          str=STRING(convert);
-                        STR_LEN(convert)=0;
+                        setSTR_LEN(convert, 0);
                      }
-                    STR_LEN(convert) += len;
+                    setSTR_LEN(convert, STR_LEN(convert) + len);
                      while (len--)
                          *str++ = *ch++;
                 } else {
@@ -3973,8 +3993,9 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour
   *      using /iaa matching will be doing so almost entirely with ASCII
   *      strings, so this should rarely be encountered in practice */
  
-#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
-    if (PL_regkind[OP(scan)] == EXACT) \
+#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags)    \
+    if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT         \
+                                      && OP(scan) != LEXACT_ONLY8)  \
          join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
  
  STATIC U32
@@ -4140,7 +4161,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
              merged++;
  
              NEXT_OFF(scan) += NEXT_OFF(n);
-            STR_LEN(scan) += STR_LEN(n);
+            setSTR_LEN(scan, STR_LEN(scan) + STR_LEN(n));
              next = n + NODE_SZ_STR(n);
              /* Now we can overwrite *n : */
              Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
@@ -5177,7 +5198,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             }
         }
         else if (   OP(scan) == EXACT
+                 || OP(scan) == LEXACT
                   || OP(scan) == EXACT_ONLY8
+                 || OP(scan) == LEXACT_ONLY8
                   || OP(scan) == EXACTL)
          {
             SSize_t l = STR_LEN(scan);
@@ -5299,7 +5322,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
                     next = NEXTOPER(scan);
                     if (   OP(next) == EXACT
+                        || OP(next) == LEXACT
                          || OP(next) == EXACT_ONLY8
+                        || OP(next) == LEXACT_ONLY8
                          || OP(next) == EXACTL
                          || (flags & SCF_DO_STCLASS))
                      {
@@ -5836,6 +5861,8 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
                  case ANYOFL:
                  case ANYOFPOSIXL:
                  case ANYOFH:
+                case ANYOFHb:
+                case ANYOFHr:
                  case ANYOF:
                     if (flags & SCF_DO_STCLASS_AND)
                         ssc_and(pRExC_state, data->start_class,
@@ -7242,7 +7269,7 @@ S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
          const char* name;
  
          name = get_regex_charset_name(RExC_rx->extflags, &len);
-        if strEQ(name, DEPENDS_PAT_MODS) {  /* /d under UTF-8 => /u */
+        if (strEQ(name, DEPENDS_PAT_MODS)) {  /* /d under UTF-8 => /u */
              assert(RExC_utf8);
              name = UNICODE_PAT_MODS;
              len = sizeof(UNICODE_PAT_MODS) - 1;
@@ -7562,6 +7589,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
          && memEQ(RX_PRECOMP(old_re), exp, plen)
         && !runtime_code /* with runtime code, always recompile */ )
      {
+        DEBUG_COMPILE_r({
+            SV *dsv= sv_newmortal();
+            RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
+            Perl_re_printf( aTHX_  "%sSkipping recompilation of unchanged REx%s %s\n",
+                          PL_colors[4], PL_colors[5], s);
+        });
          return old_re;
      }
  
@@ -7605,10 +7638,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      RExC_seen = 0;
      RExC_maxlen = 0;
      RExC_in_lookbehind = 0;
+    RExC_in_lookahead = 0;
      RExC_seen_zerolen = *exp == '^' ? -1 : 0;
-#ifdef EBCDIC
      RExC_recode_x_to_native = 0;
-#endif
      RExC_in_multi_char_class = 0;
  
      RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
@@ -7819,6 +7851,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
      SetProgLen(RExC_rxi,RExC_size);
  #endif
  
+    DEBUG_DUMP_PRE_OPTIMIZE_r({
+        SV * const sv = sv_newmortal();
+        RXi_GET_DECL(RExC_rx, ri);
+        DEBUG_RExC_seen();
+        Perl_re_printf( aTHX_ "Program before optimization:\n");
+
+        (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
+                        sv, 0, 0);
+    });
+
      DEBUG_OPTIMISE_r(
          Perl_re_printf( aTHX_  "Starting post parse optimization\n");
      );
@@ -7941,7 +7983,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
          /* Ignore EXACT as we deal with it later. */
         if (PL_regkind[OP(first)] == EXACT) {
             if (   OP(first) == EXACT
+               || OP(first) == LEXACT
                  || OP(first) == EXACT_ONLY8
+                || OP(first) == LEXACT_ONLY8
                  || OP(first) == EXACTL)
              {
                 NOOP;   /* Empty, get anchored substr later. */
@@ -8287,7 +8331,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                   && nop == END)
              RExC_rx->extflags |= RXf_WHITE;
          else if ( RExC_rx->extflags & RXf_SPLIT
-                  && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
+                  && (   fop == EXACT || fop == LEXACT
+                      || fop == EXACT_ONLY8 || fop == LEXACT_ONLY8
+                      || fop == EXACTL)
                    && STR_LEN(first) == 1
                    && *(STRING(first)) == ' '
                    && nop == END )
@@ -10992,14 +11038,14 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
      RExC_sawback = 1;
      ret = reganode(pRExC_state,
                     ((! FOLD)
-                     ? NREF
+                     ? REFN
                       : (ASCII_FOLD_RESTRICTED)
-                       ? NREFFA
+                       ? REFFAN
                         : (AT_LEAST_UNI_SEMANTICS)
-                         ? NREFFU
+                         ? REFFUN
                           : (LOC)
-                           ? NREFFL
-                           : NREFF),
+                           ? REFFLN
+                           : REFFN),
                      num);
      *flagp |= HASWIDTH;
  
@@ -11061,6 +11107,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
  
      *flagp = 0;                                /* Tentatively. */
  
+    if (RExC_in_lookbehind) {
+       RExC_in_lookbehind++;
+    }
+    if (RExC_in_lookahead) {
+        RExC_in_lookahead++;
+    }
+
      /* Having this true makes it feasible to have a lot fewer tests for the
       * parse pointer being in scope.  For example, we can write
       *      while(isFOO(*RExC_parse)) RExC_parse++;
@@ -11302,10 +11355,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                          return 0;
                      }
  
-                    REGTAIL(pRExC_state, ret, atomic);
+                    if (! REGTAIL(pRExC_state, ret, atomic)) {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
  
-                    REGTAIL(pRExC_state, atomic,
-                           reg_node(pRExC_state, SRCLOSE));
+                    if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
+                                                                SRCLOSE)))
+                    {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
  
                      RExC_in_script_run = 0;
                      return ret;
@@ -11525,10 +11583,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                  if (RExC_parse >= RExC_end) {
                      vFAIL("Sequence (?... not terminated");
                  }
-
-                /* FALLTHROUGH */
+                RExC_seen_zerolen++;
+                break;
             case '=':           /* (?=...) */
                 RExC_seen_zerolen++;
+                RExC_in_lookahead++;
                  break;
             case '!':           /* (?!...) */
                 RExC_seen_zerolen++;
@@ -11764,7 +11823,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                                         RExC_flags & RXf_PMf_COMPILETIME
                                        );
                      FLAGS(REGNODE_p(ret)) = 2;
-                    REGTAIL(pRExC_state, ret, eval);
+                    if (! REGTAIL(pRExC_state, ret, eval)) {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                      /* deal with the length of this later - MJD */
                     return ret;
                 }
@@ -11817,7 +11878,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
  
                      tail = reg(pRExC_state, 1, &flag, depth+1);
                      RETURN_FAIL_ON_RESTART(flag, flagp);
-                    REGTAIL(pRExC_state, ret, tail);
+                    if (! REGTAIL(pRExC_state, ret, tail)) {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                      goto insert_if;
                  }
                 else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
@@ -11840,7 +11903,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                          RExC_rxi->data->data[num]=(void*)sv_dat;
                          SvREFCNT_inc_simple_void_NN(sv_dat);
                      }
-                    ret = reganode(pRExC_state, NGROUPP, num);
+                    ret = reganode(pRExC_state, GROUPPN, num);
                      goto insert_if_check_paren;
                 }
                 else if (memBEGINs(RExC_parse,
@@ -11909,15 +11972,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                     }
                     nextchar(pRExC_state);
                   insert_if:
-                    REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
+                    if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
+                                                             IFTHEN, 0)))
+                    {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                      br = regbranch(pRExC_state, &flags, 1, depth+1);
                     if (br == 0) {
                          RETURN_FAIL_ON_RESTART(flags,flagp);
                          FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
                                (UV) flags);
                      } else
-                        REGTAIL(pRExC_state, br, reganode(pRExC_state,
-                                                          LONGJMP, 0));
+                    if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
+                                                             LONGJMP, 0)))
+                    {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                     c = UCHARAT(RExC_parse);
                      nextchar(pRExC_state);
                     if (flags&HASWIDTH)
@@ -11934,7 +12004,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                              FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
                                    (UV) flags);
                          }
-                        REGTAIL(pRExC_state, ret, lastbr);
+                        if (! REGTAIL(pRExC_state, ret, lastbr)) {
+                            REQUIRE_BRANCHJ(flagp, 0);
+                        }
                         if (flags&HASWIDTH)
                             *flagp |= HASWIDTH;
                          c = UCHARAT(RExC_parse);
@@ -11949,16 +12021,26 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                              vFAIL("Switch (?(condition)... contains too many branches");
                      }
                     ender = reg_node(pRExC_state, TAIL);
-                    REGTAIL(pRExC_state, br, ender);
+                    if (! REGTAIL(pRExC_state, br, ender)) {
+                        REQUIRE_BRANCHJ(flagp, 0);
+                    }
                     if (lastbr) {
-                        REGTAIL(pRExC_state, lastbr, ender);
-                        REGTAIL(pRExC_state, REGNODE_OFFSET(
-                                                NEXTOPER(
-                                                NEXTOPER(REGNODE_p(lastbr)))),
-                                             ender);
+                        if (! REGTAIL(pRExC_state, lastbr, ender)) {
+                            REQUIRE_BRANCHJ(flagp, 0);
+                        }
+                        if (! REGTAIL(pRExC_state,
+                                      REGNODE_OFFSET(
+                                                 NEXTOPER(
+                                                 NEXTOPER(REGNODE_p(lastbr)))),
+                                      ender))
+                        {
+                            REQUIRE_BRANCHJ(flagp, 0);
+                        }
                     }
                     else
-                        REGTAIL(pRExC_state, ret, ender);
+                        if (! REGTAIL(pRExC_state, ret, ender)) {
+                            REQUIRE_BRANCHJ(flagp, 0);
+                        }
  #if 0  /* Removing this doesn't cause failures in the test suite -- khw */
                      RExC_size++; /* XXX WHY do we need this?!!
                                      For large programs it seems to be required
@@ -12108,7 +12190,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
         *flagp |= flags&SIMPLE;
      }
      if (is_open) {                             /* Starts with OPEN. */
-        REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
+        if (! REGTAIL(pRExC_state, ret, br)) {  /* OPEN -> first. */
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
      }
      else if (paren != '?')             /* Not Conditional */
         ret = br;
@@ -12116,12 +12200,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
      lastbr = br;
      while (*RExC_parse == '|') {
         if (RExC_use_BRANCHJ) {
+            bool shut_gcc_up;
+
             ender = reganode(pRExC_state, LONGJMP, 0);
  
              /* Append to the previous. */
-            REGTAIL(pRExC_state,
-                    REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
-                    ender);
+            shut_gcc_up = REGTAIL(pRExC_state,
+                         REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
+                         ender);
+            PERL_UNUSED_VAR(shut_gcc_up);
         }
         nextchar(pRExC_state);
         if (freeze_paren) {
@@ -12232,9 +12319,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                          is_nothing= 0;
                 }
                 else if (op == BRANCHJ) {
-                    REGTAIL_STUDY(pRExC_state,
-                                  REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
-                                  ender);
+                    bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
+                                        REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
+                                        ender);
+                    PERL_UNUSED_VAR(shut_gcc_up);
                      /* for now we always disable this optimisation * /
                      if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
                           || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
@@ -12327,6 +12415,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
      if (RExC_in_lookbehind) {
         RExC_in_lookbehind--;
      }
+    if (RExC_in_lookahead) {
+        RExC_in_lookahead--;
+    }
      if (after_freeze > RExC_npar)
          RExC_npar = after_freeze;
      return(ret);
@@ -12546,7 +12637,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 const regnode_offset w = reg_node(pRExC_state, WHILEM);
  
                 FLAGS(REGNODE_p(w)) = 0;
-                REGTAIL(pRExC_state, ret, w);
+                if (!  REGTAIL(pRExC_state, ret, w)) {
+                    REQUIRE_BRANCHJ(flagp, 0);
+                }
                 if (RExC_use_BRANCHJ) {
                     reginsert(pRExC_state, LONGJMP, ret, depth+1);
                     reginsert(pRExC_state, NOTHING, ret, depth+1);
@@ -12561,7 +12654,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 if (RExC_use_BRANCHJ)
                      NEXT_OFF(REGNODE_p(ret)) = 3;   /* Go over NOTHING to
                                                         LONGJMP. */
-                REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
+                if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
+                                                          NOTHING)))
+                {
+                    REQUIRE_BRANCHJ(flagp, 0);
+                }
                  RExC_whilem_seen++;
                  MARK_NAUGHTY_EXP(1, 4);     /* compound interest */
             }
@@ -12633,16 +12730,22 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
      if (*RExC_parse == '?') {
         nextchar(pRExC_state);
         reginsert(pRExC_state, MINMOD, ret, depth+1);
-        REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
+        if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
      }
      else if (*RExC_parse == '+') {
          regnode_offset ender;
          nextchar(pRExC_state);
          ender = reg_node(pRExC_state, SUCCEED);
-        REGTAIL(pRExC_state, ret, ender);
+        if (! REGTAIL(pRExC_state, ret, ender)) {
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
          reginsert(pRExC_state, SUSPEND, ret, depth+1);
          ender = reg_node(pRExC_state, TAIL);
-        REGTAIL(pRExC_state, ret, ender);
+        if (! REGTAIL(pRExC_state, ret, ender)) {
+            REQUIRE_BRANCHJ(flagp, 0);
+        }
      }
  
      if (ISMULT2(RExC_parse)) {
@@ -12936,11 +13039,9 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
          sv_catsv(substitute_parse, value_sv);
          sv_catpv(substitute_parse, ")");
  
-#ifdef EBCDIC
          /* The value should already be native, so no need to convert on EBCDIC
           * platforms.*/
          assert(! RExC_recode_x_to_native);
-#endif
  
      }
      else {   /* \N{U+...} */
@@ -13073,12 +13174,9 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
  
          sv_catpvs(substitute_parse, ")");
  
-#ifdef EBCDIC
          /* The values are Unicode, and therefore have to be converted to native
           * on a non-Unicode (meaning non-ASCII) platform. */
-        RExC_recode_x_to_native = 1;
-#endif
-
+        SET_recode_x_to_native(1);
      }
  
      /* Here, we have the string the name evaluates to, ready to be parsed,
@@ -13103,9 +13201,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
      RExC_start = save_start;
      RExC_parse = endbrace;
      RExC_end = orig_end;
-#ifdef EBCDIC
-    RExC_recode_x_to_native = 0;
-#endif
+    SET_recode_x_to_native(0);
  
      SvREFCNT_dec_NN(substitute_parse);
  
@@ -13283,7 +13379,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
      char *parse_start;
      U8 op;
      int invert = 0;
-    U8 arg;
  
      GET_RE_DEBUG_FLAGS_DECL;
  
@@ -13412,15 +13507,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             *flagp |= SIMPLE;
             goto finish_meta_pat;
         case 'K':
-           RExC_seen_zerolen++;
-           ret = reg_node(pRExC_state, KEEPS);
-           *flagp |= SIMPLE;
-           /* XXX:dmq : disabling in-place substitution seems to
-            * be necessary here to avoid cases of memory corruption, as
-            * with: C<$_="x" x 80; s/x\K/y/> -- rgs
-            */
-            RExC_seen |= REG_LOOKBEHIND_SEEN;
-           goto finish_meta_pat;
+            if (!RExC_in_lookbehind && !RExC_in_lookahead) {
+                RExC_seen_zerolen++;
+                ret = reg_node(pRExC_state, KEEPS);
+                *flagp |= SIMPLE;
+                /* XXX:dmq : disabling in-place substitution seems to
+                 * be necessary here to avoid cases of memory corruption, as
+                 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
+                 */
+                RExC_seen |= REG_LOOKBEHIND_SEEN;
+                goto finish_meta_pat;
+            }
+            else {
+                ++RExC_parse; /* advance past the 'K' */
+                vFAIL("\\K not permitted in lookahead/lookbehind");
+            }
         case 'Z':
             ret = reg_node(pRExC_state, SEOL);
             *flagp |= SIMPLE;
@@ -13438,13 +13539,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             *flagp |= HASWIDTH;
             goto finish_meta_pat;
  
-       case 'W':
-            invert = 1;
-            /* FALLTHROUGH */
-       case 'w':
-            arg = ANYOF_WORDCHAR;
-            goto join_posix;
-
         case 'B':
              invert = 1;
              /* FALLTHROUGH */
@@ -13563,85 +13657,26 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             goto finish_meta_pat;
            }
  
-       case 'D':
-            invert = 1;
-            /* FALLTHROUGH */
-       case 'd':
-            arg = ANYOF_DIGIT;
-            if (! DEPENDS_SEMANTICS) {
-                goto join_posix;
-            }
-
-            /* \d doesn't have any matches in the upper Latin1 range, hence /d
-             * is equivalent to /u.  Changing to /u saves some branches at
-             * runtime */
-            op = POSIXU;
-            goto join_posix_op_known;
-
         case 'R':
             ret = reg_node(pRExC_state, LNBREAK);
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
  
-       case 'H':
-            invert = 1;
-            /* FALLTHROUGH */
+       case 'd':
+       case 'D':
         case 'h':
-           arg = ANYOF_BLANK;
-            op = POSIXU;
-            goto join_posix_op_known;
-
-       case 'V':
-            invert = 1;
-            /* FALLTHROUGH */
-       case 'v':
-           arg = ANYOF_VERTWS;
-            op = POSIXU;
-            goto join_posix_op_known;
-
-       case 'S':
-            invert = 1;
-            /* FALLTHROUGH */
-       case 's':
-            arg = ANYOF_SPACE;
-
-          join_posix:
-
-           op = POSIXD + get_regex_charset(RExC_flags);
-            if (op > POSIXA) {  /* /aa is same as /a */
-                op = POSIXA;
-            }
-            else if (op == POSIXL) {
-                RExC_contains_locale = 1;
-            }
-            else if (op == POSIXD) {
-                RExC_seen_d_op = TRUE;
-            }
-
-          join_posix_op_known:
-
-            if (invert) {
-                op += NPOSIXD - POSIXD;
-            }
-
-           ret = reg_node(pRExC_state, op);
-            FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
-
-           *flagp |= HASWIDTH|SIMPLE;
-            /* FALLTHROUGH */
-
-          finish_meta_pat:
-            if (   UCHARAT(RExC_parse + 1) == '{'
-                && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
-            {
-                RExC_parse += 2;
-                vFAIL("Unescaped left brace in regex is illegal here");
-            }
-           nextchar(pRExC_state);
-            Set_Node_Length(REGNODE_p(ret), 2); /* MJD */
-           break;
+       case 'H':
         case 'p':
         case 'P':
+       case 's':
+       case 'S':
+       case 'v':
+       case 'V':
+       case 'w':
+       case 'W':
+            /* These all have the same meaning inside [brackets], and it knows
+             * how to do the best optimizations for them.  So, pretend we found
+             * these within brackets, and let it do the work */
              RExC_parse--;
  
              ret = regclass(pRExC_state, flagp, depth+1,
@@ -13660,10 +13695,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
                        (UV) *flagp);
  
-            RExC_parse--;
+            RExC_parse--;   /* regclass() leaves this one too far ahead */
  
+          finish_meta_pat:
+                   /* The escapes above that don't take a parameter can't be
+                    * followed by a '{'.  But 'pX', 'p{foo}' and
+                    * correspondingly 'P' can be */
+            if (   RExC_parse - parse_start == 1
+                && UCHARAT(RExC_parse + 1) == '{'
+                && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
+            {
+                RExC_parse += 2;
+                vFAIL("Unescaped left brace in regex is illegal here");
+            }
              Set_Node_Offset(REGNODE_p(ret), parse_start);
-            Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2);
+            Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
              nextchar(pRExC_state);
             break;
          case 'N':
@@ -13784,7 +13830,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          && num >= RExC_npar
                          /* cannot be an octal escape if it starts with 8 */
                          && *RExC_parse != '8'
-                        /* cannot be an octal escape it it starts with 9 */
+                        /* cannot be an octal escape if it starts with 9 */
                          && *RExC_parse != '9'
                      ) {
                          /* Probably not meant to be a backref, instead likely
@@ -13885,13 +13931,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             UV ender = 0;
             char *p;
             char *s;
-
-/* This allows us to fill a node with just enough spare so that if the final
- * character folds, its expansion is guaranteed to fit */
-#define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
-
             char *s0;
-           U8 upper_parse = MAX_NODE_STRING_SIZE;
+            U32 max_string_len = 255;
+
+            /* We may have to reparse the node, artificially stopping filling
+             * it early, based on info gleaned in the first parse.  This
+             * variable gives where we stop.  Make it above the normal stopping
+             * place first time through. */
+           U32 upper_fill = max_string_len + 1;
  
              /* We start out as an EXACT node, even if under /i, until we find a
               * character which is in a fold.  The algorithm now segregates into
@@ -13907,7 +13954,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              /* Assume the node will be fully used; the excess is given back at
               * the end.  We can't make any other length assumptions, as a byte
               * input sequence could shrink down. */
-            Ptrdiff_t initial_size = STR_SZ(256);
+            Ptrdiff_t current_string_nodes = STR_SZ(max_string_len);
  
              bool next_is_quantifier;
              char * oldp = NULL;
@@ -13938,10 +13985,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              /* So is the MICRO SIGN */
              bool has_micro_sign = FALSE;
  
+            /* Set when we fill up the current node and there is still more
+             * text to process */
+            bool overflowed;
+
              /* Allocate an EXACT node.  The node_type may change below to
               * another EXACTish node, but since the size of the node doesn't
               * change, it works */
-            ret = regnode_guts(pRExC_state, node_type, initial_size, "exact");
+            ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
+                                                                    "exact");
              FILL_NODE(ret, node_type);
              RExC_emit++;
  
@@ -13951,6 +14003,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
           reparse:
  
+            p = RExC_parse;
+            len = 0;
+            s = s0;
+
+          continue_parse:
+
              /* This breaks under rare circumstances.  If folding, we do not
               * want to split a node at a character that is a non-final in a
               * multi-char fold, as an input string could just happen to want to
@@ -13965,12 +14023,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
                     || UTF8_IS_START(UCHARAT(RExC_parse)));
  
+            overflowed = FALSE;
+
              /* Here, we have a literal character.  Find the maximal string of
               * them in the input that we can fit into a single EXACTish node.
               * We quit at the first non-literal or when the node gets full, or
               * under /i the categorization of folding/non-folding character
               * changes */
-           for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
+            while (p < RExC_end && len < upper_fill) {
  
                  /* In most cases each iteration adds one byte to the output.
                   * The exceptions override this */
@@ -14145,13 +14205,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              UPDATE_WARNINGS_LOC(p - 1);
                              ender = result;
  
-                            if (ender < 0x100) {
  #ifdef EBCDIC
+                            if (ender < 0x100) {
                                  if (RExC_recode_x_to_native) {
                                      ender = LATIN1_TO_NATIVE(ender);
                                  }
-#endif
                             }
+#endif
                             break;
                         }
                     case 'c':
@@ -14308,20 +14368,29 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  /* Ready to add 'ender' to the node */
  
                  if (! FOLD) {  /* The simple case, just append the literal */
+                  not_fold_common:
  
-                      not_fold_common:
-                        if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
-                            *(s++) = (char) ender;
-                        }
-                        else {
-                            U8 * new_s = uvchr_to_utf8((U8*)s, ender);
-                            added_len = (char *) new_s - s;
-                            s = (char *) new_s;
+                    /* Don't output if it would overflow */
+                    if (UNLIKELY(len > max_string_len - ((UTF)
+                                                         ? UVCHR_SKIP(ender)
+                                                         : 1)))
+                    {
+                        overflowed = TRUE;
+                        break;
+                    }
  
-                            if (ender > 255)  {
-                                requires_utf8_target = TRUE;
-                            }
+                    if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
+                        *(s++) = (char) ender;
+                    }
+                    else {
+                        U8 * new_s = uvchr_to_utf8((U8*)s, ender);
+                        added_len = (char *) new_s - s;
+                        s = (char *) new_s;
+
+                        if (ender > 255)  {
+                            requires_utf8_target = TRUE;
                          }
+                    }
                  }
                  else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
  
@@ -14387,20 +14456,33 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
                      if (UTF) {  /* Use the folded value */
                          if (UVCHR_IS_INVARIANT(ender)) {
+                            if (UNLIKELY(len + 1 > max_string_len)) {
+                                overflowed = TRUE;
+                                break;
+                            }
+
                              *(s)++ = (U8) toFOLD(ender);
                          }
                          else {
-                            ender = _to_uni_fold_flags(
+                            U8 temp[UTF8_MAXBYTES_CASE+1];
+
+                            UV folded = _to_uni_fold_flags(
                                      ender,
-                                    (U8 *) s,
+                                    temp,
                                      &added_len,
                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
                                                      ? FOLD_FLAGS_NOMIX_ASCII
                                                      : 0));
+                            if (UNLIKELY(len + added_len > max_string_len)) {
+                                overflowed = TRUE;
+                                break;
+                            }
+
+                            Copy(temp, s, added_len, char);
                              s += added_len;
  
-                            if (   ender > 255
-                                && LIKELY(ender != GREEK_SMALL_LETTER_MU))
+                            if (   folded > 255
+                                && LIKELY(folded != GREEK_SMALL_LETTER_MU))
                              {
                                  /* U+B5 folds to the MU, so its possible for a
                                   * non-UTF-8 target to match it */
@@ -14452,9 +14534,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
                                  maybe_SIMPLE = 0;
                                  if (node_type == EXACTFU) {
+
+                                    if (UNLIKELY(len + 2 > max_string_len)) {
+                                        overflowed = TRUE;
+                                        break;
+                                    }
+
                                      *(s++) = 's';
  
-                                    /* Let the code below add in the extra 's' */
+                                    /* Let the code below add in the extra 's'
+                                     * */
                                      ender = 's';
                                      added_len = 2;
                                  }
@@ -14466,6 +14555,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              has_micro_sign = TRUE;
                          }
  
+                        if (UNLIKELY(len + 1 > max_string_len)) {
+                            overflowed = TRUE;
+                            break;
+                        }
+
                          *(s++) = (DEPENDS_SEMANTICS)
                                   ? (char) toFOLD(ender)
  
@@ -14490,168 +14584,280 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
             } /* End of loop through literal characters */
  
-            /* Here we have either exhausted the input or ran out of room in
-             * the node.  (If we encountered a character that can't be in the
-             * node, transfer is made directly to <loopdone>, and so we
-             * wouldn't have fallen off the end of the loop.)  In the latter
-             * case, we artificially have to split the node into two, because
-             * we just don't have enough space to hold everything.  This
-             * creates a problem if the final character participates in a
-             * multi-character fold in the non-final position, as a match that
-             * should have occurred won't, due to the way nodes are matched,
-             * and our artificial boundary.  So back off until we find a non-
-             * problematic character -- one that isn't at the beginning or
-             * middle of such a fold.  (Either it doesn't participate in any
-             * folds, or appears only in the final position of all the folds it
-             * does participate in.)  A better solution with far fewer false
-             * positives, and that would fill the nodes more completely, would
-             * be to actually have available all the multi-character folds to
-             * test against, and to back-off only far enough to be sure that
-             * this node isn't ending with a partial one.  <upper_parse> is set
-             * further below (if we need to reparse the node) to include just
-             * up through that final non-problematic character that this code
-             * identifies, so when it is set to less than the full node, we can
-             * skip the rest of this */
-            if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
-                PERL_UINT_FAST8_T backup_count = 0;
-
-                const STRLEN full_len = len;
-
-               assert(len >= MAX_NODE_STRING_SIZE);
-
-                /* Here, <s> points to just beyond where we have output the
-                 * final character of the node.  Look backwards through the
-                 * string until find a non- problematic character */
-
-               if (! UTF) {
-
-                    /* This has no multi-char folds to non-UTF characters */
-                    if (ASCII_FOLD_RESTRICTED) {
-                        goto loopdone;
-                    }
+            /* Here we have either exhausted the input or run out of room in
+             * the node.  If the former, we are done.  (If we encountered a
+             * character that can't be in the node, transfer is made directly
+             * to <loopdone>, and so we wouldn't have fallen off the end of the
+             * loop.)  */
+            if (LIKELY(! overflowed)) {
+                goto loopdone;
+            }
+
+            /* Here we have run out of room.  We can grow plain EXACT and
+             * LEXACT nodes.  If the pattern is gigantic enough, though,
+             * eventually we'll have to artificially chunk the pattern into
+             * multiple nodes. */
+            if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
+                Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
+                Size_t overhead_expansion = 0;
+                char temp[256];
+                Size_t max_nodes_for_string;
+                Size_t achievable;
+                SSize_t delta;
+
+                /* Here we couldn't fit the final character in the current
+                 * node, so it will have to be reparsed, no matter what else we
+                 * do */
+                p = oldp;
+
+
+                /* If would have overflowed a regular EXACT node, switch
+                 * instead to an LEXACT.  The code below is structured so that
+                 * the actual growing code is common to changing from an EXACT
+                 * or just increasing the LEXACT size.  This means that we have
+                 * to save the string in the EXACT case before growing, and
+                 * then copy it afterwards to its new location */
+                if (node_type == EXACT) {
+                    overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
+                    RExC_emit += overhead_expansion;
+                    Copy(s0, temp, len, char);
+                }
+
+                /* Ready to grow.  If it was a plain EXACT, the string was
+                 * saved, and the first few bytes of it overwritten by adding
+                 * an argument field.  We assume, as we do elsewhere in this
+                 * file, that one byte of remaining input will translate into
+                 * one byte of output, and if that's too small, we grow again,
+                 * if too large the excess memory is freed at the end */
+
+                max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
+                achievable = MIN(max_nodes_for_string,
+                                 current_string_nodes + STR_SZ(RExC_end - p));
+                delta = achievable - current_string_nodes;
+
+                /* If there is just no more room, go finish up this chunk of
+                 * the pattern. */
+                if (delta <= 0) {
+                    goto loopdone;
+                }
  
-                    while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
-                        backup_count++;
-                    }
-                    len = s - s0 + 1;
-               }
-                else {
+                change_engine_size(pRExC_state, delta + overhead_expansion);
+                current_string_nodes += delta;
+                max_string_len
+                           = sizeof(struct regnode) * current_string_nodes;
+                upper_fill = max_string_len + 1;
  
-                    /* Point to the first byte of the final character */
-                    s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+                /* If the length was small, we know this was originally an
+                 * EXACT node now converted to LEXACT, and the string has to be
+                 * restored.  Otherwise the string was untouched.  260 is just
+                 * a number safely above 255 so don't have to worry about
+                 * getting it precise */
+                if (len < 260) {
+                    node_type = LEXACT;
+                    FILL_NODE(ret, node_type);
+                    s0 = STRING(REGNODE_p(ret));
+                    Copy(temp, s0, len, char);
+                    s = s0 + len;
+                }
  
-                    while (s >= s0) {   /* Search backwards until find
-                                           a non-problematic char */
-                        if (UTF8_IS_INVARIANT(*s)) {
+                goto continue_parse;
+            }
+            else {
  
-                            /* There are no ascii characters that participate
-                             * in multi-char folds under /aa.  In EBCDIC, the
-                             * non-ascii invariants are all control characters,
-                             * so don't ever participate in any folds. */
-                            if (ASCII_FOLD_RESTRICTED
-                                || ! IS_NON_FINAL_FOLD(*s))
-                            {
-                                break;
-                            }
-                        }
-                        else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-                            if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
-                                                                  *s, *(s+1))))
-                            {
-                                break;
-                            }
+                /* Here is /i.  Running out of room creates a problem if we are
+                 * folding, and the split happens in the middle of a
+                 * multi-character fold, as a match that should have occurred,
+                 * won't, due to the way nodes are matched, and our artificial
+                 * boundary.  So back off until we aren't splitting such a
+                 * fold.  If there is no such place to back off to, we end up
+                 * taking the entire node as-is.  This can happen if the node
+                 * consists entirely of 'f' or entirely of 's' characters (or
+                 * things that fold to them) as 'ff' and 'ss' are
+                 * multi-character folds.
+                 *
+                 * At this point:
+                 *  oldp        points to the beginning in the input of the
+                 *              final character in the node.
+                 *  p           points to the beginning in the input of the
+                 *              next character in the input, the one that won't
+                 *              fit in the node.
+                 *
+                 * We aren't in the middle of a multi-char fold unless the
+                 * final character in the node can appear in a non-final
+                 * position in such a fold.  Very few characters actually
+                 * participate in multi-character folds, and fewer still can be
+                 * in the non-final position.  But it's complicated to know
+                 * here if that final character is folded or not, so skip this
+                 * check */
+
+                           /* Make sure enough space for final char of node,
+                            * first char of following node, and the fold of the
+                            * following char (so we don't have to worry about
+                            * that fold running off the end) */
+                U8 foldbuf[UTF8_MAXBYTES_CASE * 5 + 1];
+                STRLEN fold_len;
+                UV folded;
+
+                assert(FOLD);
+
+                /* The Unicode standard says that multi character folds consist
+                 * of either two or three characters.  So we create a buffer
+                 * containing a window of three.  The first is the final
+                 * character in the node (folded), and then the two that begin
+                 * the following node.   But if the first character of the
+                 * following node can't be in a non-final fold position, there
+                 * is no need to look at its successor character.  The macros
+                 * used below to check for multi character folds require folded
+                 * inputs, so we have to fold these.  (The fold of p was likely
+                 * calculated in the loop above, but it hasn't been saved, and
+                 * khw thinks it would be too entangled to change to do so) */
+
+                if (UTF || LIKELY(UCHARAT(p) != MICRO_SIGN)) {
+                    folded = _to_uni_fold_flags(ender,
+                                                foldbuf,
+                                                &fold_len,
+                                                FOLD_FLAGS_FULL);
+                }
+                else {
+                    foldbuf[0] = folded = MICRO_SIGN;
+                    fold_len = 1;
+                }
+
+                /* Here, foldbuf contains the fold of the first character in
+                 * the next node.  We may also need the next one (if there is
+                 * one) to get our third, but if the first character folded to
+                 * more than one, those extra one(s) will serve as the third.
+                 * Also, we don't need a third unless the previous one can
+                 * appear in a non-final position in a fold */
+                if (  ((RExC_end - p) > ((UTF) ? UVCHR_SKIP(ender) : 1))
+                    && (fold_len == 1 || (   UTF
+                                          && UVCHR_SKIP(folded) == fold_len))
+                    &&  UNLIKELY(_invlist_contains_cp(PL_NonFinalFold, folded)))
+                {
+                    if (UTF) {
+                        STRLEN next_fold_len;
+
+                        toFOLD_utf8_safe((U8*) p + UTF8SKIP(p),
+                                         (U8*) RExC_end, foldbuf + fold_len,
+                                         &next_fold_len);
+                        fold_len += next_fold_len;
+                    }
+                    else {
+                        if (UNLIKELY(p[1] == LATIN_SMALL_LETTER_SHARP_S)) {
+                            foldbuf[fold_len] = 's';
                          }
-                        else if (! _invlist_contains_cp(
-                                        PL_NonFinalFold,
-                                        valid_utf8_to_uvchr((U8 *) s, NULL)))
-                        {
-                            break;
+                        else {
+                            foldbuf[fold_len] = toLOWER_L1(p[1]);
                          }
+                        fold_len++;
+                    }
+                }
  
-                        /* Here, the current character is problematic in that
-                         * it does occur in the non-final position of some
-                         * fold, so try the character before it, but have to
-                         * special case the very first byte in the string, so
-                         * we don't read outside the string */
-                        s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
-                        backup_count++;
-                    } /* End of loop backwards through the string */
-
-                    /* If there were only problematic characters in the string,
-                     * <s> will point to before s0, in which case the length
-                     * should be 0, otherwise include the length of the
-                     * non-problematic character just found */
-                    len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
-               }
+                /* Here foldbuf contains the fold of p, and if appropriate
+                 * that of the character following p in the input. */
  
-                /* Here, have found the final character, if any, that is
-                 * non-problematic as far as ending the node without splitting
-                 * it across a potential multi-char fold.  <len> contains the
-                 * number of bytes in the node up-to and including that
-                 * character, or is 0 if there is no such character, meaning
-                 * the whole node contains only problematic characters.  In
-                 * this case, give up and just take the node as-is.  We can't
-                 * do any better */
-                if (len == 0) {
-                    len = full_len;
+                /* Search backwards until find a place that doesn't split a
+                 * multi-char fold */
+                while (1) {
+                    STRLEN s_len;
+                    char s_fold_buf[UTF8_MAXBYTES_CASE];
+                    char * s_fold = s_fold_buf;
  
-                } else {
+                    if (s <= s0) {
  
-                    /* Here, the node does contain some characters that aren't
-                     * problematic.  If we didn't have to backup any, then the
-                     * final character in the node is non-problematic, and we
-                     * can take the node as-is */
-                    if (backup_count == 0) {
-                        goto loopdone;
+                        /* There's no safe place in the node to split.  Quit so
+                         * will take the whole node */
+                        break;
                      }
-                    else if (backup_count == 1) {
  
-                        /* If the final character is problematic, but the
-                         * penultimate is not, back-off that last character to
-                         * later start a new node with it */
-                        p = oldp;
-                        goto loopdone;
+                    /* Backup 1 character.  The first time through this moves s
+                     * to point to the final character in the node */
+                    if (UTF) {
+                        s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+                    }
+                    else {
+                        s--;
+                    }
+
+                    /* 's' may or may not be folded; so make sure it is, and
+                     * use just the final character in its fold (should there
+                     * be more than one) */
+                    if (UTF) {
+                        toFOLD_utf8_safe((U8*) s,
+                                         (U8*) s + UTF8SKIP(s),
+                                         (U8 *) s_fold_buf, &s_len);
+                        while (s_fold + UTF8SKIP(s_fold) < s_fold_buf + s_len)
+                        {
+                            s_fold += UTF8SKIP(s_fold);
+                        }
+                        s_len = UTF8SKIP(s_fold);
+                    }
+                    else {
+                        if (UNLIKELY(UCHARAT(s) == LATIN_SMALL_LETTER_SHARP_S))
+                        {
+                            s_fold_buf[0] = 's';
+                        }
+                        else {  /* This works for all other non-UTF-8 folds
+                                 */
+                            s_fold_buf[0] = toLOWER_L1(UCHARAT(s));
+                        }
+                        s_len = 1;
                      }
  
-                    /* Here, the final non-problematic character is earlier
-                     * in the input than the penultimate character.  What we do
-                     * is reparse from the beginning, going up only as far as
-                     * this final ok one, thus guaranteeing that the node ends
-                     * in an acceptable character.  The reason we reparse is
-                     * that we know how far in the character is, but we don't
-                     * know how to correlate its position with the input parse.
-                     * An alternate implementation would be to build that
-                     * correlation as we go along during the original parse,
-                     * but that would entail extra work for every node, whereas
-                     * this code gets executed only when the string is too
-                     * large for the node, and the final two characters are
-                     * problematic, an infrequent occurrence.  Yet another
-                     * possible strategy would be to save the tail of the
-                     * string, and the next time regatom is called, initialize
-                     * with that.  The problem with this is that unless you
-                     * back off one more character, you won't be guaranteed
-                     * regatom will get called again, unless regbranch,
-                     * regpiece ... are also changed.  If you do back off that
-                     * extra character, so that there is input guaranteed to
-                     * force calling regatom, you can't handle the case where
-                     * just the first character in the node is acceptable.  I
-                     * (khw) decided to try this method which doesn't have that
-                     * pitfall; if performance issues are found, we can do a
-                     * combination of the current approach plus that one */
-                    upper_parse = len;
-                    len = 0;
-                    s = s0;
-                    goto reparse;
+                    /* Unshift this character to the beginning of the buffer,
+                     * No longer needed trailing characters are overwritten.
+                     * */
+                    Move(foldbuf, foldbuf + s_len, sizeof(foldbuf) - s_len, U8);
+                    Copy(s_fold, foldbuf, s_len, U8);
+
+                    /* If this isn't a multi-character fold, we have found a
+                     * splittable place.  If this is the final character in the
+                     * node, that means the node is valid as-is, and can quit.
+                     * Otherwise, we note how much we can fill the node before
+                     * coming to a non-splittable position, and go parse it
+                     * again, stopping there. This is done because we know
+                     * where in the output to stop, but we don't have a map to
+                     * where that is in the input.  One could be created, but
+                     * it seems like overkill for such a rare event as we are
+                     * dealing with here */
+                    if (UTF) {
+                        if (! is_MULTI_CHAR_FOLD_utf8_safe(foldbuf,
+                                                foldbuf + UTF8_MAXBYTES_CASE))
+                        {
+                            upper_fill = s + UTF8SKIP(s) - s0;
+                            if (LIKELY(upper_fill == 255)) {
+                                break;
+                            }
+                            goto reparse;
+                        }
+                    }
+                    else if (! is_MULTI_CHAR_FOLD_latin1_safe(foldbuf,
+                                                foldbuf + UTF8_MAXBYTES_CASE))
+                    {
+                        upper_fill = s + 1 - s0;
+                        if (LIKELY(upper_fill == 255)) {
+                            break;
+                        }
+                        goto reparse;
+                    }
                  }
+
+                /* Here the node consists entirely of non-final multi-char
+                 * folds.  (Likely it is all 'f's or all 's's.)  There's no
+                 * decent place to split it, so give up and just take the whole
+                 * thing */
+
             }   /* End of verifying node ends with an appropriate char */
  
+            p = oldp;
+
            loopdone:   /* Jumped to when encounters something that shouldn't be
                           in the node */
  
              /* Free up any over-allocated space; cast is to silence bogus
               * warning in MS VC */
              change_engine_size(pRExC_state,
-                                - (Ptrdiff_t) (initial_size - STR_SZ(len)));
+                        - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
  
              /* I (khw) don't know if you can get here with zero length, but the
               * old code handled this situation by creating a zero-length EXACT
@@ -14670,7 +14876,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      else if (requires_utf8_target) {
                          node_type = EXACT_ONLY8;
                      }
-                } else if (FOLD) {
+                }
+                else if (node_type == LEXACT) {
+                    if (requires_utf8_target) {
+                        node_type = LEXACT_ONLY8;
+                    }
+                }
+                else if (FOLD) {
                      if (    UNLIKELY(has_micro_sign || has_ss)
                          && (node_type == EXACTFU || (   node_type == EXACTF
                                                       && maybe_exactfu)))
@@ -14690,6 +14902,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      }
                      else if (node_type == EXACTF) {  /* Means is /di */
  
+                        /* This intermediate variable is needed solely because
+                         * the asserts in the macro where used exceed Win32's
+                         * literal string capacity */
+                        char first_char = * STRING(REGNODE_p(ret));
+
                          /* If 'maybe_exactfu' is clear, then we need to stay
                           * /di.  If it is set, it means there are no code
                           * points that match differently depending on UTF8ness
@@ -14698,7 +14915,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          if (! maybe_exactfu) {
                              RExC_seen_d_op = TRUE;
                          }
-                        else if (   isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's')
+                        else if (   isALPHA_FOLD_EQ(first_char, 's')
                                   || isALPHA_FOLD_EQ(ender, 's'))
                          {
                              /* But, if the node begins or ends in an 's' we
@@ -14723,11 +14940,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  }
  
                  OP(REGNODE_p(ret)) = node_type;
-                STR_LEN(REGNODE_p(ret)) = len;
+                setSTR_LEN(REGNODE_p(ret), len);
                  RExC_emit += STR_SZ(len);
  
                  /* If the node isn't a single character, it can't be SIMPLE */
-                if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) {
+                if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
                      maybe_SIMPLE = 0;
                  }
  
@@ -14779,7 +14996,7 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
      assert(PL_regkind[OP(node)] == ANYOF);
  
      /* There is no bitmap for this node type */
-    if (OP(node) == ANYOFH) {
+    if (inRANGE(OP(node), ANYOFH, ANYOFHr)) {
          return;
      }
  
@@ -17004,7 +17221,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                  {
                                      if (strict) {
                                          RExC_parse--;
-                                        vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
+                                        vFAIL("\\N{} here is restricted to one character");
                                      }
                                      ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
                                      break; /* <value> contains the first code
@@ -17722,7 +17939,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                             literal
                          );
                  }
-                else if isMNEMONIC_CNTRL(value) {
+                else if (isMNEMONIC_CNTRL(value)) {
                      vWARN4(RExC_parse,
                             "\"%.*s\" is more clearly written simply as \"%s\"",
                             (int) (RExC_parse - rangebegin),
@@ -18765,7 +18982,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      ret = regnode_guts(pRExC_state, op, len, "exact");
                      FILL_NODE(ret, op);
                      RExC_emit += 1 + STR_SZ(len);
-                    STR_LEN(REGNODE_p(ret)) = len;
+                    setSTR_LEN(REGNODE_p(ret), len);
                      if (len == 1) {
                          *STRING(REGNODE_p(ret)) = (U8) value;
                      }
@@ -18853,7 +19070,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
                      full_cp_count += this_end - this_start + 1;
                  }
-                invlist_iterfinish(cp_list);
  
                  /* At the end of the loop, we count how many bits differ from
                   * the bits in lowest code point, call the count 'd'.  If the
@@ -18882,8 +19098,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      ret = reganode(pRExC_state, op, lowest_cp);
                      FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
                  }
+
+              done_anyofm:
+                invlist_iterfinish(cp_list);
              }
-          done_anyofm:
  
              if (inverted) {
                  _invlist_invert(cp_list);
@@ -19008,52 +19226,92 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              SvREFCNT_dec(intersection);
          }
  
-        /* If didn't find an optimization and there is no need for a
-        * bitmap, optimize to indicate that */
+        /* If didn't find an optimization and there is no need for a bitmap,
+         * optimize to indicate that */
          if (     start[0] >= NUM_ANYOF_CODE_POINTS
              && ! LOC
              && ! upper_latin1_only_utf8_matches
              &&   anyof_flags == 0)
          {
+            U8 low_utf8[UTF8_MAXBYTES+1];
              UV highest_cp = invlist_highest(cp_list);
  
-            /* If the lowest and highest code point in the class have the same
-             * UTF-8 first byte, then all do, and we can store that byte for
-             * regexec.c to use so that it can more quickly scan the target
-             * string for potential matches for this class.  We co-opt the the
-             * flags field for this.  Zero means, they don't have the same
-             * first byte.  We do accept here very large code points (for
-             * future use), but don't bother with this optimization for them,
-             * as it would cause other complications */
-            if (highest_cp > IV_MAX) {
-                anyof_flags = 0;
-            }
-            else {
-                U8 low_utf8[UTF8_MAXBYTES+1];
+            op = ANYOFH;
+
+            /* Currently the maximum allowed code point by the system is
+             * IV_MAX.  Higher ones are reserved for future internal use.  This
+             * particular regnode can be used for higher ones, but we can't
+             * calculate the code point of those.  IV_MAX suffices though, as
+             * it will be a large first byte */
+            (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+
+            /* We store the lowest possible first byte of the UTF-8
+             * representation, using the flags field.  This allows for quick
+             * ruling out of some inputs without having to convert from UTF-8
+             * to code point.  For EBCDIC, this has to be I8. */
+            anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
+
+            /* If the first UTF-8 start byte for the highest code point in the
+             * range is suitably small, we may be able to get an upper bound as
+             * well */
+            if (highest_cp <= IV_MAX) {
                  U8 high_utf8[UTF8_MAXBYTES+1];
  
-                (void) uvchr_to_utf8(low_utf8, start[0]);
-                (void) uvchr_to_utf8(high_utf8, invlist_highest(cp_list));
+                (void) uvchr_to_utf8(high_utf8, highest_cp);
  
-                anyof_flags = (low_utf8[0] == high_utf8[0])
-                            ? low_utf8[0]
-                            : 0;
+                /* If the lowest and highest are the same, we can get an exact
+                 * first byte instead of just a minimum.  We signal this with a
+                 * different regnode */
+                if (low_utf8[0] == high_utf8[0]) {
+
+                    /* No need to convert to I8 for EBCDIC as this is an exact
+                     * match */
+                    anyof_flags = low_utf8[0];
+                    op = ANYOFHb;
+                }
+                else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
+                {
+
+                    /* Here, the high byte is not the same as the low, but is
+                     * small enough that it's reasonable to have a loose upper
+                     * bound, which is packed in with the strict lower bound.
+                     * See comments at the definition of MAX_ANYOF_HRx_BYTE.
+                     * On EBCDIC platforms, I8 is used.  On ASCII platforms I8
+                     * is the same thing as UTF-8 */
+
+                    U8 bits = 0;
+                    U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
+                    U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
+                                  - anyof_flags;
+
+                    if (range_diff <= max_range_diff / 8) {
+                        bits = 3;
+                    }
+                    else if (range_diff <= max_range_diff / 4) {
+                        bits = 2;
+                    }
+                    else if (range_diff <= max_range_diff / 2) {
+                        bits = 1;
+                    }
+                    anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
+                    op = ANYOFHr;
+                }
              }
  
-            op = ANYOFH;
+            goto done_finding_op;
          }
      }   /* End of seeing if can optimize it into a different node */
  
    is_anyof: /* It's going to be an ANYOF node. */
-    if (op != ANYOFH) {
-        op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
-             ? ANYOFD
-             : ((posixl)
-                ? ANYOFPOSIXL
-                : ((LOC)
-                   ? ANYOFL
-                   : ANYOF));
-    }
+    op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
+         ? ANYOFD
+         : ((posixl)
+            ? ANYOFPOSIXL
+            : ((LOC)
+               ? ANYOFL
+               : ANYOF));
+
+  done_finding_op:
  
      ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
      FILL_NODE(ret, op);        /* We set the argument later */
@@ -19529,8 +19787,9 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
  STATIC void
  S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
  {
-    /* 'size' is the delta to add or subtract from the current memory allocated
-     * to the regex engine being constructed */
+    /* 'size' is the delta number of smallest regnode equivalents to add or
+     * subtract from the current memory allocated to the regex engine being
+     * constructed. */
  
      PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
  
@@ -19562,8 +19821,8 @@ S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
  STATIC regnode_offset
  S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
  {
-    /* Allocate a regnode for 'op', with 'extra_size' extra space.  It aligns
-     * and increments RExC_size and RExC_emit
+    /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
+     * equivalents space.  It aligns and increments RExC_size and RExC_emit
       *
       * It returns the regnode's offset into the regex engine program */
  
@@ -19810,8 +20069,8 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state,
      }
      else {
          if (val - scan > U16_MAX) {
-            /* Since not all callers check the return value, populate this with
-             * something that won't loop and will likely lead to a crash if
+            /* Populate this with something that won't loop and will likely
+             * lead to a crash if the caller ignores the failure return, and
               * execution continues */
              NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
              return FALSE;
@@ -19872,7 +20131,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
  #endif
          if ( exact ) {
              switch (OP(REGNODE_p(scan))) {
+                case LEXACT:
                  case EXACT:
+                case LEXACT_ONLY8:
                  case EXACT_ONLY8:
                  case EXACTL:
                  case EXACTF:
@@ -19922,6 +20183,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
      }
      else {
          if (val - scan > U16_MAX) {
+            /* Populate this with something that won't loop and will likely
+             * lead to a crash if the caller ignores the failure return, and
+             * execution continues */
              NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
              return FALSE;
          }
@@ -20199,11 +20463,16 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
  
      SvPVCLEAR(sv);
  
-    if (OP(o) > REGNODE_MAX)           /* regnode.type is unsigned */
-       /* It would be nice to FAIL() here, but this may be called from
-          regexec.c, and it would be hard to supply pRExC_state. */
-       Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
-                                              (int)OP(o), (int)REGNODE_MAX);
+    if (OP(o) > REGNODE_MAX) {          /* regnode.type is unsigned */
+        if (pRExC_state) {  /* This gives more info, if we have it */
+            FAIL3("panic: corrupted regexp opcode %d > %d",
+                  (int)OP(o), (int)REGNODE_MAX);
+        }
+        else {
+            Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
+                             (int)OP(o), (int)REGNODE_MAX);
+        }
+    }
      sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
  
      k = PL_regkind[OP(o)];
@@ -20286,7 +20555,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
              name_list= RExC_paren_name_list;
          }
          if (name_list) {
-            if ( k != REF || (OP(o) < NREF)) {
+            if ( k != REF || (OP(o) < REFN)) {
                  SV **name= av_fetch(name_list, parno, 0 );
                 if (name)
                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
@@ -20340,7 +20609,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          /* 2: embedded, otherwise 1 */
         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
      else if (k == ANYOF) {
-       const U8 flags = (OP(o) == ANYOFH) ? 0 : ANYOF_FLAGS(o);
+       const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
+                          ? 0
+                          : ANYOF_FLAGS(o);
          bool do_sep = FALSE;    /* Do we need to separate various components of
                                     the output? */
          /* Set if there is still an unresolved user-defined property */
@@ -20396,7 +20667,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          /* Ready to start outputting.  First, the initial left bracket */
         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
  
-        if (OP(o) != ANYOFH) {
+        if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
              /* Then all the things that could fit in the bitmap */
              do_sep = put_charclass_bitmap_innards(sv,
                                                    ANYOF_BITMAP(o),
@@ -20494,11 +20765,22 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
          /* And finally the matching, closing ']' */
         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
  
-        if (OP(o) == ANYOFH && FLAGS(o) != 0) {
-            Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x)", FLAGS(o));
+        if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+            U8 lowest = (OP(o) != ANYOFHr)
+                         ? FLAGS(o)
+                         : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
+            U8 highest = (OP(o) == ANYOFHb)
+                         ? lowest
+                         : OP(o) == ANYOFH
+                           ? 0xFF
+                           : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
+            Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
+            if (lowest != highest) {
+                Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+            }
+            Perl_sv_catpvf(aTHX_ sv, ")");
          }
  
-
          SvREFCNT_dec(unresolved);
      }
      else if (k == ANYOFM) {
@@ -21427,9 +21709,14 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
  
          /* As a final resort, output the range or subrange as hex. */
  
-        this_end = (end < NUM_ANYOF_CODE_POINTS)
-                    ? end
-                    : NUM_ANYOF_CODE_POINTS - 1;
+        if (start >= NUM_ANYOF_CODE_POINTS) {
+            this_end = end;
+        }
+        else {
+            this_end = (end < NUM_ANYOF_CODE_POINTS)
+                        ? end
+                        : NUM_ANYOF_CODE_POINTS - 1;
+        }
  #if NUM_ANYOF_CODE_POINTS > 256
          format = (this_end < 256)
                   ? "\\x%02" UVXf "-\\x%02" UVXf
@@ -22566,8 +22853,7 @@ Perl_parse_uniprop_string(pTHX_
      int slash_pos  = -1;    /* Where the '/' is found, or negative if none */
      int table_index = 0;    /* The entry number for this property in the table
                                 of all Unicode property names */
-    bool starts_with_In_or_Is = FALSE;  /* ? Does the name start with 'In' or
-                                             'Is' */
+    bool starts_with_Is = FALSE;  /* ? Does the name start with 'Is' */
      Size_t lookup_offset = 0;   /* Used to ignore the first few characters of
                                     the normalized name in certain situations */
      Size_t non_pkg_begin = 0;   /* Offset of first byte in 'name' that isn't
@@ -22857,7 +23143,7 @@ Perl_parse_uniprop_string(pTHX_
          /* Certain properties whose values are numeric need special handling.
           * They may optionally be prefixed by 'is'.  Ignore that prefix for the
           * purposes of checking if this is one of those properties */
-        if (memBEGINPs(lookup_name, name_len, "is")) {
+        if (memBEGINPs(lookup_name, j, "is")) {
              lookup_offset = 2;
          }
  
@@ -23023,7 +23309,9 @@ Perl_parse_uniprop_string(pTHX_
              }
  
              /* Store the first real character in the denominator */
-            lookup_name[j++] = name[i];
+            if (i < name_len) {
+                lookup_name[j++] = name[i];
+            }
          }
      }
  
@@ -23041,11 +23329,15 @@ Perl_parse_uniprop_string(pTHX_
  
      /* If the original input began with 'In' or 'Is', it could be a subroutine
       * call to a user-defined property instead of a Unicode property name. */
-    if (    non_pkg_begin + name_len > 2
+    if (    name_len - non_pkg_begin > 2
          &&  name[non_pkg_begin+0] == 'I'
          && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
      {
-        starts_with_In_or_Is = TRUE;
+        /* Names that start with In have different characteristics than those
+         * that start with Is */
+        if (name[non_pkg_begin+1] == 's') {
+            starts_with_Is = TRUE;
+        }
      }
      else {
          could_be_user_defined = FALSE;
@@ -23384,8 +23676,11 @@ Perl_parse_uniprop_string(pTHX_
      /* If it didn't find the property ... */
      if (table_index == 0) {
  
-        /* Try again stripping off any initial 'In' or 'Is' */
-        if (starts_with_In_or_Is) {
+        /* Try again stripping off any initial 'Is'.  This is because we
+         * promise that an initial Is is optional.  The same isn't true of
+         * names that start with 'In'.  Those can match only blocks, and the
+         * lookup table already has those accounted for. */
+        if (starts_with_Is) {
              lookup_name += 2;
              lookup_len -= 2;
              equals_pos -= 2;