S_regatom: reinitialize flags if reparsing

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 9f1a837..6ce4996 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -850,7 +850,8 @@ static const scan_data_t zero_scan_data = {
  #define UPDATE_WARNINGS_LOC(loc)                                        \
      STMT_START {                                                        \
          if (TO_OUTPUT_WARNINGS(loc)) {                                  \
-            RExC_latest_warn_offset = (xI(loc)) - RExC_precomp;         \
+            RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))         \
+                                                       - RExC_precomp;  \
          }                                                               \
      } STMT_END
  
@@ -2701,7 +2702,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
  #endif
  
      switch (flags) {
-        case EXACT: case EXACT_ONLY8: case EXACTL: break;
+        case EXACT: case EXACT_REQ8: case EXACTL: break;
         case EXACTFAA:
          case EXACTFUP:
         case EXACTFU:
@@ -2716,7 +2717,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
      trie->wordcount = word_count;
      RExC_rxi->data->data[ data_slot ] = (void*)trie;
      trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
-    if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
+    if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
      trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
                         trie->wordcount+1, sizeof(reg_trie_wordinfo));
@@ -2792,8 +2793,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
  
          if (    noper < tail
              && (    OP(noper) == flags
-                || (flags == EXACT && OP(noper) == EXACT_ONLY8)
-                || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
+                || (flags == EXACT && OP(noper) == EXACT_REQ8)
+                || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
                                           || OP(noper) == EXACTFUP))))
          {
              uc= (U8*)STRING(noper);
@@ -3010,8 +3011,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
  
              if (    noper < tail
                  && (    OP(noper) == flags
-                    || (flags == EXACT && OP(noper) == EXACT_ONLY8)
-                    || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
+                    || (flags == EXACT && OP(noper) == EXACT_REQ8)
+                    || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
                                               || OP(noper) == EXACTFUP))))
              {
                  const U8 *uc= (U8*)STRING(noper);
@@ -3235,8 +3236,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
  
              if (    noper < tail
                  && (    OP(noper) == flags
-                    || (flags == EXACT && OP(noper) == EXACT_ONLY8)
-                    || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
+                    || (flags == EXACT && OP(noper) == EXACT_REQ8)
+                    || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
                                               || OP(noper) == EXACTFUP))))
              {
                  const U8 *uc= (U8*)STRING(noper);
@@ -3993,8 +3994,9 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour
   *      using /iaa matching will be doing so almost entirely with ASCII
   *      strings, so this should rarely be encountered in practice */
  
-#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
-    if (PL_regkind[OP(scan)] == EXACT) \
+#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags)    \
+    if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT         \
+                                      && OP(scan) != LEXACT_REQ8)  \
          join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
  
  STATIC U32
@@ -4059,16 +4061,16 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
  
              /* Joining something that requires UTF-8 with something that
               * doesn't, means the result requires UTF-8. */
-            if (OP(scan) == EXACT && (OP(n) == EXACT_ONLY8)) {
-                OP(scan) = EXACT_ONLY8;
+            if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
+                OP(scan) = EXACT_REQ8;
              }
-            else if (OP(scan) == EXACT_ONLY8 && (OP(n) == EXACT)) {
+            else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
                  ;   /* join is compatible, no need to change OP */
              }
-            else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_ONLY8)) {
-                OP(scan) = EXACTFU_ONLY8;
+            else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
+                OP(scan) = EXACTFU_REQ8;
              }
-            else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) {
+            else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
                  ;   /* join is compatible, no need to change OP */
              }
              else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
@@ -4091,7 +4093,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
                    * node.  And if the only adjacent node is EXACTF, they get
                    * absorbed into that, under the theory that a longer node is
                    * better than two shorter ones, even if one is EXACTFU.  Note
-                  * that EXACTFU_ONLY8 is generated only for UTF-8 patterns,
+                  * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
                    * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
  
                  if (STRING(n)[STR_LEN(n)-1] == 's') {
@@ -4199,7 +4201,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
       * this final joining, sequences could have been split over boundaries, and
       * hence missed).  The sequences only happen in folding, hence for any
       * non-EXACT EXACTish node */
-    if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
+    if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
          U8* s0 = (U8*) STRING(scan);
          U8* s = s0;
          U8* s_end = s0 + STR_LEN(scan);
@@ -4542,31 +4544,31 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
           */
          JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
  
-       /* Follow the next-chain of the current node and optimize
-          away all the NOTHINGs from it.  */
-       if (OP(scan) != CURLYX) {
-           const int max = (reg_off_by_arg[OP(scan)]
-                      ? I32_MAX
-                      /* I32 may be smaller than U16 on CRAYs! */
-                      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
-           int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
-           int noff;
-           regnode *n = scan;
-
-           /* Skip NOTHING and LONGJMP. */
-           while ((n = regnext(n))
-                  && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
-                      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
-                  && off + noff < max)
-               off += noff;
-           if (reg_off_by_arg[OP(scan)])
-               ARG(scan) = off;
-           else
-               NEXT_OFF(scan) = off;
-       }
+        /* Follow the next-chain of the current node and optimize
+           away all the NOTHINGs from it.  */
+        if (OP(scan) != CURLYX) {
+            const int max = (reg_off_by_arg[OP(scan)]
+                            ? I32_MAX
+                              /* I32 may be smaller than U16 on CRAYs! */
+                            : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
+            int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
+            int noff;
+            regnode *n = scan;
+
+            /* Skip NOTHING and LONGJMP. */
+            while (   (n = regnext(n))
+                   && (   (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
+                       || ((OP(n) == LONGJMP) && (noff = ARG(n))))
+                   && off + noff < max)
+                off += noff;
+            if (reg_off_by_arg[OP(scan)])
+                ARG(scan) = off;
+            else
+                NEXT_OFF(scan) = off;
+        }
  
-       /* The principal pseudo-switch.  Cannot be a switch, since we
-          look into several different things.  */
+        /* The principal pseudo-switch.  Cannot be a switch, since we look into
+         * several different things.  */
          if ( OP(scan) == DEFINEP ) {
              SSize_t minlen = 0;
              SSize_t deltanext = 0;
@@ -4858,9 +4860,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                  ----------------+-----------
                                  NOTHING         | NOTHING
                                  EXACT           | EXACT
-                                EXACT_ONLY8     | EXACT
+                                EXACT_REQ8     | EXACT
                                  EXACTFU         | EXACTFU
-                                EXACTFU_ONLY8   | EXACTFU
+                                EXACTFU_REQ8   | EXACTFU
                                  EXACTFUP        | EXACTFU
                                  EXACTFAA        | EXACTFAA
                                  EXACTL          | EXACTL
@@ -4870,10 +4872,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                          */
  #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
                         ? NOTHING                                            \
-                       : ( EXACT == (X) || EXACT_ONLY8 == (X) )             \
+                       : ( EXACT == (X) || EXACT_REQ8 == (X) )             \
                           ? EXACT                                            \
                           : (     EXACTFU == (X)                             \
-                              || EXACTFU_ONLY8 == (X)                       \
+                              || EXACTFU_REQ8 == (X)                       \
                                || EXACTFUP == (X) )                          \
                             ? EXACTFU                                        \
                             : ( EXACTFAA == (X) )                            \
@@ -5197,7 +5199,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             }
         }
         else if (   OP(scan) == EXACT
-                 || OP(scan) == EXACT_ONLY8
+                 || OP(scan) == LEXACT
+                 || OP(scan) == EXACT_REQ8
+                 || OP(scan) == LEXACT_REQ8
                   || OP(scan) == EXACTL)
          {
             SSize_t l = STR_LEN(scan);
@@ -5282,7 +5286,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             }
  
              if (flags & SCF_DO_STCLASS) {
-                SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
+                SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
  
                  assert(EXACTF_invlist);
                  if (flags & SCF_DO_STCLASS_AND) {
@@ -5319,7 +5323,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
                     next = NEXTOPER(scan);
                     if (   OP(next) == EXACT
-                        || OP(next) == EXACT_ONLY8
+                        || OP(next) == LEXACT
+                        || OP(next) == EXACT_REQ8
+                        || OP(next) == LEXACT_REQ8
                          || OP(next) == EXACTL
                          || (flags & SCF_DO_STCLASS))
                      {
@@ -6171,7 +6177,6 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
             }
  #endif
         }
-
         else if (OP(scan) == OPEN) {
             if (stopparen != (I32)ARG(scan))
                 pars++;
@@ -7978,7 +7983,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
          /* Ignore EXACT as we deal with it later. */
         if (PL_regkind[OP(first)] == EXACT) {
             if (   OP(first) == EXACT
-                || OP(first) == EXACT_ONLY8
+               || OP(first) == LEXACT
+                || OP(first) == EXACT_REQ8
+                || OP(first) == LEXACT_REQ8
                  || OP(first) == EXACTL)
              {
                 NOOP;   /* Empty, get anchored substr later. */
@@ -8324,7 +8331,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                   && nop == END)
              RExC_rx->extflags |= RXf_WHITE;
          else if ( RExC_rx->extflags & RXf_SPLIT
-                  && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
+                  && (   fop == EXACT || fop == LEXACT
+                      || fop == EXACT_REQ8 || fop == LEXACT_REQ8
+                      || fop == EXACTL)
                    && STR_LEN(first) == 1
                    && *(STRING(first)) == ' '
                    && nop == END )
@@ -9022,23 +9031,6 @@ S__invlist_array_init(SV* const invlist, const bool will_have_0)
      return zero_addr + *offset;
  }
  
-PERL_STATIC_INLINE void
-S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
-{
-    /* Sets the current number of elements stored in the inversion list.
-     * Updates SvCUR correspondingly */
-    PERL_UNUSED_CONTEXT;
-    PERL_ARGS_ASSERT_INVLIST_SET_LEN;
-
-    assert(is_invlist(invlist));
-
-    SvCUR_set(invlist,
-              (len == 0)
-               ? 0
-               : TO_INTERNAL_SIZE(len + offset));
-    assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
-}
-
  STATIC void
  S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
  {
@@ -9189,6 +9181,7 @@ S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
      invlist_iterfinish(invlist);
  
      *get_invlist_previous_index_addr(invlist) = 0;
+    SvPOK_on(invlist);  /* This allows B to extract the PV */
  }
  
  SV*
@@ -9263,25 +9256,12 @@ Perl__new_invlist_C_array(pTHX_ const UV* const list)
      invlist_iterfinish(invlist);
  
      SvREADONLY_on(invlist);
+    SvPOK_on(invlist);
  
      return invlist;
  }
  
  STATIC void
-S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
-{
-    /* Grow the maximum size of an inversion list */
-
-    PERL_ARGS_ASSERT_INVLIST_EXTEND;
-
-    assert(is_invlist(invlist));
-
-    /* Add one to account for the zero element at the beginning which may not
-     * be counted by the calling parameters */
-    SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
-}
-
-STATIC void
  S__append_range_to_invlist(pTHX_ SV* const invlist,
                                   const UV start, const UV end)
  {
@@ -10267,11 +10247,6 @@ Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
  
  #endif
  
-PERL_STATIC_INLINE SV*
-S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
-    return _add_range_to_invlist(invlist, cp, cp);
-}
-
  #ifndef PERL_IN_XSUB_RE
  void
  Perl__invlist_invert(pTHX_ SV* const invlist)
@@ -10322,108 +10297,6 @@ Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
  
  #endif
  
-PERL_STATIC_INLINE STRLEN*
-S_get_invlist_iter_addr(SV* invlist)
-{
-    /* Return the address of the UV that contains the current iteration
-     * position */
-
-    PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
-
-    assert(is_invlist(invlist));
-
-    return &(((XINVLIST*) SvANY(invlist))->iterator);
-}
-
-PERL_STATIC_INLINE void
-S_invlist_iterinit(SV* invlist)        /* Initialize iterator for invlist */
-{
-    PERL_ARGS_ASSERT_INVLIST_ITERINIT;
-
-    *get_invlist_iter_addr(invlist) = 0;
-}
-
-PERL_STATIC_INLINE void
-S_invlist_iterfinish(SV* invlist)
-{
-    /* Terminate iterator for invlist.  This is to catch development errors.
-     * Any iteration that is interrupted before completed should call this
-     * function.  Functions that add code points anywhere else but to the end
-     * of an inversion list assert that they are not in the middle of an
-     * iteration.  If they were, the addition would make the iteration
-     * problematical: if the iteration hadn't reached the place where things
-     * were being added, it would be ok */
-
-    PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
-
-    *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
-}
-
-STATIC bool
-S_invlist_iternext(SV* invlist, UV* start, UV* end)
-{
-    /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
-     * This call sets in <*start> and <*end>, the next range in <invlist>.
-     * Returns <TRUE> if successful and the next call will return the next
-     * range; <FALSE> if was already at the end of the list.  If the latter,
-     * <*start> and <*end> are unchanged, and the next call to this function
-     * will start over at the beginning of the list */
-
-    STRLEN* pos = get_invlist_iter_addr(invlist);
-    UV len = _invlist_len(invlist);
-    UV *array;
-
-    PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
-
-    if (*pos >= len) {
-       *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
-       return FALSE;
-    }
-
-    array = invlist_array(invlist);
-
-    *start = array[(*pos)++];
-
-    if (*pos >= len) {
-       *end = UV_MAX;
-    }
-    else {
-       *end = array[(*pos)++] - 1;
-    }
-
-    return TRUE;
-}
-
-PERL_STATIC_INLINE UV
-S_invlist_highest(SV* const invlist)
-{
-    /* Returns the highest code point that matches an inversion list.  This API
-     * has an ambiguity, as it returns 0 under either the highest is actually
-     * 0, or if the list is empty.  If this distinction matters to you, check
-     * for emptiness before calling this function */
-
-    UV len = _invlist_len(invlist);
-    UV *array;
-
-    PERL_ARGS_ASSERT_INVLIST_HIGHEST;
-
-    if (len == 0) {
-       return 0;
-    }
-
-    array = invlist_array(invlist);
-
-    /* The last element in the array in the inversion list always starts a
-     * range that goes to infinity.  That range may be for code points that are
-     * matched in the inversion list, or it may be for ones that aren't
-     * matched.  In the latter case, the highest code point in the set is one
-     * less than the beginning of this range; otherwise it is the final element
-     * of this range: infinity */
-    return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
-           ? UV_MAX
-           : array[len - 1] - 1;
-}
-
  STATIC SV *
  S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
  {
@@ -10602,7 +10475,7 @@ Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
   * call SvREFCNT_dec() when done with it.
   */
  STATIC SV*
-S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
+S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
  {
      dVAR;
      const U8 * s = (U8*)STRING(node);
@@ -10611,7 +10484,7 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
      /* Start out big enough for 2 separate code points */
      SV* invlist = _new_invlist(4);
  
-    PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
+    PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
  
      if (! UTF) {
          uc = *s;
@@ -11085,7 +10958,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
      PERL_ARGS_ASSERT_REG;
      DEBUG_PARSE("reg ");
  
-
      max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
      assert(max_open);
      if (!SvIOK(max_open)) {
@@ -11318,12 +11190,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                          goto parse_rest;
                      }
  
-                    /* By doing this here, we avoid extra warnings for nested
-                     * script runs */
-                    ckWARNexperimental(RExC_parse,
-                        WARN_EXPERIMENTAL__SCRIPT_RUN,
-                        "The script_run feature is experimental");
-
                      if (paren == 's') {
                          /* Here, we're starting a new regular script run */
                          ret = reg_node(pRExC_state, SROPEN);
@@ -11368,9 +11234,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                  /*FALLTHROUGH*/
  
              alpha_assertions:
-                ckWARNexperimental(RExC_parse,
-                        WARN_EXPERIMENTAL__ALPHA_ASSERTIONS,
-                        "The alpha_assertions feature is experimental");
  
                  RExC_seen_zerolen++;
  
@@ -12073,16 +11936,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
                  goto parse_rest;
              } /* end switch */
         }
-       else {
-            if (*RExC_parse == '{') {
-                ckWARNregdep(RExC_parse + 1,
-                            "Unescaped left brace in regex is "
-                            "deprecated here (and will be fatal "
-                            "in Perl 5.32), passed through");
-            }
-            /* Not bothering to indent here, as the above 'else' is temporary
-             * */
-        if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
+        else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) {   /* (...) */
           capturing_parens:
             parno = RExC_npar;
             RExC_npar++;
@@ -12149,7 +12003,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
              /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
              paren = ':';
             ret = 0;
-       }
          }
      }
      else                        /* ! paren */
@@ -13922,13 +13775,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             UV ender = 0;
             char *p;
             char *s;
-
-/* This allows us to fill a node with just enough spare so that if the final
- * character folds, its expansion is guaranteed to fit */
-#define MAX_FOLDED_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
-
             char *s0;
-           U8 upper_fill = MAX_FOLDED_NODE_STRING_SIZE;
+            U32 max_string_len = 255;
+
+            /* We may have to reparse the node, artificially stopping filling
+             * it early, based on info gleaned in the first parse.  This
+             * variable gives where we stop.  Make it above the normal stopping
+             * place first time through; otherwise it would stop too early */
+            U32 upper_fill = max_string_len + 1;
  
              /* We start out as an EXACT node, even if under /i, until we find a
               * character which is in a fold.  The algorithm now segregates into
@@ -13942,12 +13796,20 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              U8 node_type = EXACT;
  
              /* Assume the node will be fully used; the excess is given back at
-             * the end.  We can't make any other length assumptions, as a byte
-             * input sequence could shrink down. */
-            Ptrdiff_t current_string_nodes = STR_SZ(256);
+             * the end.  Under /i, leave enough extra room so that we won't
+             * overflow the buffer when we fold a character which would end up
+             * overflowing the node.   We can't make any other length
+             * assumptions, as a byte input sequence could shrink down. */
+            Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
+                                                 + ((! FOLD)
+                                                    ? 0
+                                                    : 1 * ((UTF)
+                                                           ? UTF8_MAXBYTES_CASE
+                        /* Max non-UTF-8 expansion is 2 */ : 2)));
  
              bool next_is_quantifier;
              char * oldp = NULL;
+            char * old_oldp = NULL;
  
              /* We can convert EXACTF nodes to EXACTFU if they contain only
               * characters that match identically regardless of the target
@@ -13975,6 +13837,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              /* So is the MICRO SIGN */
              bool has_micro_sign = FALSE;
  
+            /* Set when we fill up the current node and there is still more
+             * text to process */
+            bool overflowed;
+
              /* Allocate an EXACT node.  The node_type may change below to
               * another EXACTish node, but since the size of the node doesn't
               * change, it works */
@@ -13989,6 +13855,19 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
           reparse:
  
+            p = RExC_parse;
+            len = 0;
+            s = s0;
+            node_type = EXACT;
+            oldp = NULL;
+            maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
+            maybe_SIMPLE = SIMPLE;
+            requires_utf8_target = FALSE;
+            has_ss = FALSE;
+            has_micro_sign = FALSE;
+
+          continue_parse:
+
              /* This breaks under rare circumstances.  If folding, we do not
               * want to split a node at a character that is a non-final in a
               * multi-char fold, as an input string could just happen to want to
@@ -14003,17 +13882,20 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
                     || UTF8_IS_START(UCHARAT(RExC_parse)));
  
+            overflowed = FALSE;
+
              /* Here, we have a literal character.  Find the maximal string of
               * them in the input that we can fit into a single EXACTish node.
               * We quit at the first non-literal or when the node gets full, or
               * under /i the categorization of folding/non-folding character
               * changes */
-            for (p = RExC_parse; len < upper_fill && p < RExC_end; ) {
+            while (p < RExC_end && len < upper_fill) {
  
                  /* In most cases each iteration adds one byte to the output.
                   * The exceptions override this */
                  Size_t added_len = 1;
  
+                old_oldp = oldp;
                 oldp = p;
  
                  /* White space has already been ignored */
@@ -14346,8 +14228,17 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  /* Ready to add 'ender' to the node */
  
                  if (! FOLD) {  /* The simple case, just append the literal */
-
                    not_fold_common:
+
+                    /* Don't output if it would overflow */
+                    if (UNLIKELY(len > max_string_len - ((UTF)
+                                                      ? UVCHR_SKIP(ender)
+                                                      : 1)))
+                    {
+                        overflowed = TRUE;
+                        break;
+                    }
+
                      if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
                          *(s++) = (char) ender;
                      }
@@ -14423,22 +14314,35 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          goto loopdone;
                      }
  
-                    if (UTF) {  /* Use the folded value */
+                    if (UTF) {  /* Alway use the folded value for UTF-8
+                                   patterns */
                          if (UVCHR_IS_INVARIANT(ender)) {
+                            if (UNLIKELY(len + 1 > max_string_len)) {
+                                overflowed = TRUE;
+                                break;
+                            }
+
                              *(s)++ = (U8) toFOLD(ender);
                          }
                          else {
-                            ender = _to_uni_fold_flags(
+                            UV folded = _to_uni_fold_flags(
                                      ender,
-                                    (U8 *) s,
+                                    (U8 *) s,  /* We have allocated extra space
+                                                  in 's' so can't run off the
+                                                  end */
                                      &added_len,
                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
                                                      ? FOLD_FLAGS_NOMIX_ASCII
                                                      : 0));
+                            if (UNLIKELY(len + added_len > max_string_len)) {
+                                overflowed = TRUE;
+                                break;
+                            }
+
                              s += added_len;
  
-                            if (   ender > 255
-                                && LIKELY(ender != GREEK_SMALL_LETTER_MU))
+                            if (   folded > 255
+                                && LIKELY(folded != GREEK_SMALL_LETTER_MU))
                              {
                                  /* U+B5 folds to the MU, so its possible for a
                                   * non-UTF-8 target to match it */
@@ -14446,64 +14350,77 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              }
                          }
                      }
-                    else {
-
-                        /* Here is non-UTF8.  First, see if the character's
-                         * fold differs between /d and /u. */
-                        if (PL_fold[ender] != PL_fold_latin1[ender]) {
-                            maybe_exactfu = FALSE;
+                    else { /* Here is non-UTF8. */
+
+                        /* The fold will be one or (rarely) two characters.
+                         * Check that there's room for at least a single one
+                         * before setting any flags, etc.  Because otherwise an
+                         * overflowing character could cause a flag to be set
+                         * even though it doesn't end up in this node.  (For
+                         * the two character fold, we check again, before
+                         * setting any flags) */
+                        if (UNLIKELY(len + 1 > max_string_len)) {
+                            overflowed = TRUE;
+                            break;
                          }
  
  #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
     || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
                                        || UNICODE_DOT_DOT_VERSION > 0)
  
-                        /* On non-ancient Unicode versions, this includes the
-                         * multi-char fold SHARP S to 'ss' */
-
-                        if (   UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)
-                                 || (   isALPHA_FOLD_EQ(ender, 's')
-                                     && len > 0
-                                     && isALPHA_FOLD_EQ(*(s-1), 's')))
-                        {
-                            /* Here, we have one of the following:
-                             *  a)  a SHARP S.  This folds to 'ss' only under
-                             *      /u rules.  If we are in that situation,
-                             *      fold the SHARP S to 'ss'.  See the comments
-                             *      for join_exact() as to why we fold this
-                             *      non-UTF at compile time, and no others.
-                             *  b)  'ss'.  When under /u, there's nothing
-                             *      special needed to be done here.  The
-                             *      previous iteration handled the first 's',
-                             *      and this iteration will handle the second.
-                             *      If, on the otherhand it's not /u, we have
-                             *      to exclude the possibility of moving to /u,
-                             *      so that we won't generate an unwanted
-                             *      match, unless, at runtime, the target
-                             *      string is in UTF-8.
-                             * */
+                        /* On non-ancient Unicodes, check for the only possible
+                         * multi-char fold  */
+                        if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
  
+                            /* This potential multi-char fold means the node
+                             * can't be simple (because it could match more
+                             * than a single char).  And in some cases it will
+                             * match 'ss', so set that flag */
+                            maybe_SIMPLE = 0;
                              has_ss = TRUE;
-                            maybe_exactfu = FALSE;  /* Can't generate an
-                                                       EXACTFU node (unless we
-                                                       already are in one) */
-                            if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
-                                maybe_SIMPLE = 0;
-                                if (node_type == EXACTFU) {
-                                    *(s++) = 's';
-
-                                    /* Let the code below add in the extra 's'
-                                     * */
-                                    ender = 's';
-                                    added_len = 2;
+
+                            /* It can't change to be an EXACTFU (unless already
+                             * is one).  We fold it iff under /u rules. */
+                            if (node_type != EXACTFU) {
+                                maybe_exactfu = FALSE;
+                            }
+                            else {
+                                if (UNLIKELY(len + 2 > max_string_len)) {
+                                    overflowed = TRUE;
+                                    break;
                                  }
+
+                                *(s++) = 's';
+                                *(s++) = 's';
+                                added_len = 2;
+
+                                goto done_with_this_char;
                              }
                          }
+                        else if (   UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
+                                 && LIKELY(len > 0)
+                                 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
+                        {
+                            /* Also, the sequence 'ss' is special when not
+                             * under /u.  If the target string is UTF-8, it
+                             * should match SHARP S; otherwise it won't.  So,
+                             * here we have to exclude the possibility of this
+                             * node moving to /u.*/
+                            has_ss = TRUE;
+                            maybe_exactfu = FALSE;
+                        }
  #endif
+                        /* Here, the fold will be a single character */
  
-                        else if (UNLIKELY(ender == MICRO_SIGN)) {
+                        if (UNLIKELY(ender == MICRO_SIGN)) {
                              has_micro_sign = TRUE;
                          }
+                        else if (PL_fold[ender] != PL_fold_latin1[ender]) {
+
+                            /* If the character's fold differs between /d and
+                             * /u, this can't change to be an EXACTFU node */
+                            maybe_exactfu = FALSE;
+                        }
  
                          *(s++) = (DEPENDS_SEMANTICS)
                                   ? (char) toFOLD(ender)
@@ -14518,6 +14435,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      }
                 } /* End of adding current character to the node */
  
+              done_with_this_char:
+
                  len += added_len;
  
                 if (next_is_quantifier) {
@@ -14530,160 +14449,279 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             } /* End of loop through literal characters */
  
              /* Here we have either exhausted the input or run out of room in
-             * the node.  (If we encountered a character that can't be in the
-             * node, transfer is made directly to <loopdone>, and so we
-             * wouldn't have fallen off the end of the loop.)  In the latter
-             * case, we artificially have to split the node into two, because
-             * we just don't have enough space to hold everything.  This
-             * creates a problem if the final character participates in a
-             * multi-character fold in the non-final position, as a match that
-             * should have occurred won't, due to the way nodes are matched,
-             * and our artificial boundary.  So back off until we find a non-
-             * problematic character -- one that isn't at the beginning or
-             * middle of such a fold.  (Either it doesn't participate in any
-             * folds, or appears only in the final position of all the folds it
-             * does participate in.)  A better solution with far fewer false
-             * positives, and that would fill the nodes more completely, would
-             * be to actually have available all the multi-character folds to
-             * test against, and to back-off only far enough to be sure that
-             * this node isn't ending with a partial one.  <upper_fill> is set
-             * further below (if we need to reparse the node) to include just
-             * up through that final non-problematic character that this code
-             * identifies, so when it is set to less than the full node, we can
-             * skip the rest of this */
-            if (FOLD && p < RExC_end && upper_fill == MAX_FOLDED_NODE_STRING_SIZE) {
-                PERL_UINT_FAST8_T backup_count = 0;
-
-                const STRLEN full_len = len;
-
-               assert(len >= MAX_FOLDED_NODE_STRING_SIZE);
-
-                /* Here, <s> points to just beyond where we have output the
-                 * final character of the node.  Look backwards through the
-                 * string until find a non- problematic character */
-
-               if (! UTF) {
-
-                    /* This has no multi-char folds to non-UTF characters */
-                    if (ASCII_FOLD_RESTRICTED) {
-                        goto loopdone;
-                    }
+             * the node.  If the former, we are done.  (If we encountered a
+             * character that can't be in the node, transfer is made directly
+             * to <loopdone>, and so we wouldn't have fallen off the end of the
+             * loop.)  */
+            if (LIKELY(! overflowed)) {
+                goto loopdone;
+            }
+
+            /* Here we have run out of room.  We can grow plain EXACT and
+             * LEXACT nodes.  If the pattern is gigantic enough, though,
+             * eventually we'll have to artificially chunk the pattern into
+             * multiple nodes. */
+            if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
+                Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
+                Size_t overhead_expansion = 0;
+                char temp[256];
+                Size_t max_nodes_for_string;
+                Size_t achievable;
+                SSize_t delta;
+
+                /* Here we couldn't fit the final character in the current
+                 * node, so it will have to be reparsed, no matter what else we
+                 * do */
+                p = oldp;
+
+                /* If would have overflowed a regular EXACT node, switch
+                 * instead to an LEXACT.  The code below is structured so that
+                 * the actual growing code is common to changing from an EXACT
+                 * or just increasing the LEXACT size.  This means that we have
+                 * to save the string in the EXACT case before growing, and
+                 * then copy it afterwards to its new location */
+                if (node_type == EXACT) {
+                    overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
+                    RExC_emit += overhead_expansion;
+                    Copy(s0, temp, len, char);
+                }
+
+                /* Ready to grow.  If it was a plain EXACT, the string was
+                 * saved, and the first few bytes of it overwritten by adding
+                 * an argument field.  We assume, as we do elsewhere in this
+                 * file, that one byte of remaining input will translate into
+                 * one byte of output, and if that's too small, we grow again,
+                 * if too large the excess memory is freed at the end */
+
+                max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
+                achievable = MIN(max_nodes_for_string,
+                                 current_string_nodes + STR_SZ(RExC_end - p));
+                delta = achievable - current_string_nodes;
+
+                /* If there is just no more room, go finish up this chunk of
+                 * the pattern. */
+                if (delta <= 0) {
+                    goto loopdone;
+                }
  
-                    while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
-                        backup_count++;
-                    }
-                    len = s - s0 + 1;
-               }
+                change_engine_size(pRExC_state, delta + overhead_expansion);
+                current_string_nodes += delta;
+                max_string_len
+                           = sizeof(struct regnode) * current_string_nodes;
+                upper_fill = max_string_len + 1;
+
+                /* If the length was small, we know this was originally an
+                 * EXACT node now converted to LEXACT, and the string has to be
+                 * restored.  Otherwise the string was untouched.  260 is just
+                 * a number safely above 255 so don't have to worry about
+                 * getting it precise */
+                if (len < 260) {
+                    node_type = LEXACT;
+                    FILL_NODE(ret, node_type);
+                    s0 = STRING(REGNODE_p(ret));
+                    Copy(temp, s0, len, char);
+                    s = s0 + len;
+                }
+
+                goto continue_parse;
+            }
+            else if (! LOC) {  /* XXX shouldn't /l assume could be a UTF-8
+                                locale, and prepare for that? */
+
+                /* Here is /i.  Running out of room creates a problem if we are
+                 * folding, and the split happens in the middle of a
+                 * multi-character fold, as a match that should have occurred,
+                 * won't, due to the way nodes are matched, and our artificial
+                 * boundary.  So back off until we aren't splitting such a
+                 * fold.  If there is no such place to back off to, we end up
+                 * taking the entire node as-is.  This can happen if the node
+                 * consists entirely of 'f' or entirely of 's' characters (or
+                 * things that fold to them) as 'ff' and 'ss' are
+                 * multi-character folds.
+                 *
+                 * At this point:
+                 *  old_oldp  points to the beginning in the input of the
+                 *              penultimate character in the node.
+                 *  oldp      points to the beginning in the input of the
+                 *              final character in the node.
+                 *  p         points to the beginning in the input of the
+                 *              next character in the input, the one that won't
+                 *              fit in the node.
+                 *
+                 * We aren't in the middle of a multi-char fold unless the
+                 * final character in the node can appear in a non-final
+                 * position in such a fold.  Very few characters actually
+                 * participate in multi-character folds, and fewer still can be
+                 * in the non-final position.  But it's complicated to know
+                 * here if that final character is folded or not, so skip this
+                 * check */
+
+                           /* Make sure enough space for final char of node,
+                            * first char of following node, and the fold of the
+                            * following char (so we don't have to worry about
+                            * that fold running off the end */
+                U8 foldbuf[UTF8_MAXBYTES_CASE * 5 + 1];
+                STRLEN fold_len;
+                UV folded;
+                char * const sav_oldp = oldp;
+
+                assert(FOLD);
+
+                /* The Unicode standard says that multi character folds consist
+                 * of either two or three characters.  So we create a buffer
+                 * containing a window of three.  The first is the final
+                 * character in the node (folded), and then the two that begin
+                 * the following node.   But if the first character of the
+                 * following node can't be in a non-final fold position, there
+                 * is no need to look at its successor character.  The macros
+                 * used below to check for multi character folds require folded
+                 * inputs, so we have to fold these.  (The fold of p was likely
+                 * calculated in the loop above, but it hasn't beeen saved, and
+                 * khw thinks it would be too entangled to change to do so) */
+
+                if (UTF || LIKELY(UCHARAT(p) != MICRO_SIGN)) {
+                    folded = _to_uni_fold_flags(ender,
+                                                foldbuf,
+                                                &fold_len,
+                                                FOLD_FLAGS_FULL);
+                }
                  else {
+                    foldbuf[0] = folded = MICRO_SIGN;
+                    fold_len = 1;
+                }
+
+                /* Here, foldbuf contains the fold of the first character in
+                 * the next node.  We may also need the next one (if there is
+                 * one) to get our third, but if the first character folded to
+                 * more than one, those extra one(s) will serve as the third.
+                 * Also, we don't need a third unless the previous one can
+                 * appear in a non-final position in a fold */
+                if (  ((RExC_end - p) > ((UTF) ? UVCHR_SKIP(ender) : 1))
+                    && (fold_len == 1 || (   UTF
+                                          && UVCHR_SKIP(folded) == fold_len))
+                    &&  UNLIKELY(_invlist_contains_cp(PL_NonFinalFold, folded)))
+                {
+                    if (UTF) {
+                        STRLEN next_fold_len;
  
-                    /* Point to the first byte of the final character */
-                    s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+                        toFOLD_utf8_safe((U8*) p + UTF8SKIP(p),
+                                         (U8*) RExC_end, foldbuf + fold_len,
+                                         &next_fold_len);
+                        fold_len += next_fold_len;
+                    }
+                    else {
+                        if (UNLIKELY(p[1] == LATIN_SMALL_LETTER_SHARP_S)) {
+                            foldbuf[fold_len] = 's';
+                        }
+                        else {
+                            foldbuf[fold_len] = toLOWER_L1(p[1]);
+                        }
+                        fold_len++;
+                    }
+                }
  
-                    while (s >= s0) {   /* Search backwards until find
-                                           a non-problematic char */
-                        if (UTF8_IS_INVARIANT(*s)) {
+                /* Here foldbuf contains the the fold of p, and if appropriate
+                 * that of the character following p in the input. */
  
-                            /* There are no ascii characters that participate
-                             * in multi-char folds under /aa.  In EBCDIC, the
-                             * non-ascii invariants are all control characters,
-                             * so don't ever participate in any folds. */
-                            if (ASCII_FOLD_RESTRICTED
-                                || ! IS_NON_FINAL_FOLD(*s))
-                            {
-                                break;
-                            }
+                /* Search backwards until find a place that doesn't split a
+                 * multi-char fold */
+                while (1) {
+                    STRLEN s_len;
+                    char s_fold_buf[UTF8_MAXBYTES_CASE];
+                    char * s_fold = s_fold_buf;
+
+                    if (s <= s0) {
+
+                        /* There's no safe place in the node to split.  Quit so
+                         * will take the whole node */
+                        oldp = sav_oldp;
+                        break;
+                    }
+
+                    /* Backup 1 character.  The first time through this moves s
+                     * to point to the final character in the node */
+                    if (UTF) {
+                        s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+                    }
+                    else {
+                        s--;
+                    }
+
+                    /* 's' may or may not be folded; so make sure it is, and
+                     * use just the final character in its fold (should there
+                     * be more than one */
+                    if (UTF) {
+                        toFOLD_utf8_safe((U8*) s,
+                                         (U8*) s + UTF8SKIP(s),
+                                         (U8 *) s_fold_buf, &s_len);
+                        while (s_fold + UTF8SKIP(s_fold) < s_fold_buf + s_len)
+                        {
+                            s_fold += UTF8SKIP(s_fold);
                          }
-                        else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-                            if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
-                                                                  *s, *(s+1))))
-                            {
+                        s_len = UTF8SKIP(s_fold);
+                    }
+                    else {
+                        if (UNLIKELY(UCHARAT(s) == LATIN_SMALL_LETTER_SHARP_S))
+                        {
+                            s_fold_buf[0] = 's';
+                        }
+                        else {  /* This works for all other non-UTF-8 folds
+                                 */
+                            s_fold_buf[0] = toLOWER_L1(UCHARAT(s));
+                        }
+                        s_len = 1;
+                    }
+
+                    /* Unshift this character to the beginning of the buffer,
+                     * No longer needed trailing characters are overwritten.
+                     * */
+                    Move(foldbuf, foldbuf + s_len, sizeof(foldbuf) - s_len, U8);
+                    Copy(s_fold, foldbuf, s_len, U8);
+
+                    /* If this isn't a multi-character fold, we have found a
+                     * splittable place.  If this is the final character in the
+                     * node, that means the node is valid as-is, and can quit.
+                     * Otherwise, we note how much we can fill the node before
+                     * coming to a non-splittable position, and go parse it
+                     * again, stopping there. This is done because we know
+                     * where in the output to stop, but we don't have a map to
+                     * where that is in the input.  One could be created, but
+                     * it seems like overkill for such a rare event as we are
+                     * dealing with here */
+                    if (UTF) {
+                        if (! is_MULTI_CHAR_FOLD_utf8_safe(foldbuf,
+                                                foldbuf + UTF8_MAXBYTES_CASE))
+                        {
+                            upper_fill = s + UTF8SKIP(s) - s0;
+                            if (LIKELY(oldp)) {
                                  break;
                              }
+                            goto reparse;
                          }
-                        else if (! _invlist_contains_cp(
-                                        PL_NonFinalFold,
-                                        valid_utf8_to_uvchr((U8 *) s, NULL)))
-                        {
+                    }
+                    else if (! is_MULTI_CHAR_FOLD_latin1_safe(foldbuf,
+                                                foldbuf + UTF8_MAXBYTES_CASE))
+                    {
+                        upper_fill = s + 1 - s0;
+                        if (LIKELY(oldp)) {
                              break;
                          }
-
-                        /* Here, the current character is problematic in that
-                         * it does occur in the non-final position of some
-                         * fold, so try the character before it, but have to
-                         * special case the very first byte in the string, so
-                         * we don't read outside the string */
-                        s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
-                        backup_count++;
-                    } /* End of loop backwards through the string */
-
-                    /* If there were only problematic characters in the string,
-                     * <s> will point to before s0, in which case the length
-                     * should be 0, otherwise include the length of the
-                     * non-problematic character just found */
-                    len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
-               }
-
-                /* Here, have found the final character, if any, that is
-                 * non-problematic as far as ending the node without splitting
-                 * it across a potential multi-char fold.  <len> contains the
-                 * number of bytes in the node up-to and including that
-                 * character, or is 0 if there is no such character, meaning
-                 * the whole node contains only problematic characters.  In
-                 * this case, give up and just take the node as-is.  We can't
-                 * do any better */
-                if (len == 0) {
-                    len = full_len;
-
-                } else {
-
-                    /* Here, the node does contain some characters that aren't
-                     * problematic.  If we didn't have to backup any, then the
-                     * final character in the node is non-problematic, and we
-                     * can take the node as-is */
-                    if (backup_count == 0) {
-                        goto loopdone;
+                        goto reparse;
                      }
-                    else if (backup_count == 1) {
  
-                        /* If the final character is problematic, but the
-                         * penultimate is not, back-off that last character to
-                         * later start a new node with it */
-                        p = oldp;
-                        goto loopdone;
-                    }
+                    oldp = old_oldp;
+                    old_oldp = NULL;
+
+                } /* End of loop backing up through the node */
+                    /* Here the node consists entirely of non-final multi-char
+                     * folds.  (Likely it is all 'f's or all 's's.)  There's no
+                     * decent place to split it, so give up and just take the
+                     * whole thing */
  
-                    /* Here, the final non-problematic character is earlier
-                     * in the input than the penultimate character.  What we do
-                     * is reparse from the beginning, going up only as far as
-                     * this final ok one, thus guaranteeing that the node ends
-                     * in an acceptable character.  The reason we reparse is
-                     * that we know how far in the character is, but we don't
-                     * know how to correlate its position with the input parse.
-                     * An alternate implementation would be to build that
-                     * correlation as we go along during the original parse,
-                     * but that would entail extra work for every node, whereas
-                     * this code gets executed only when the string is too
-                     * large for the node, and the final two characters are
-                     * problematic, an infrequent occurrence.  Yet another
-                     * possible strategy would be to save the tail of the
-                     * string, and the next time regatom is called, initialize
-                     * with that.  The problem with this is that unless you
-                     * back off one more character, you won't be guaranteed
-                     * regatom will get called again, unless regbranch,
-                     * regpiece ... are also changed.  If you do back off that
-                     * extra character, so that there is input guaranteed to
-                     * force calling regatom, you can't handle the case where
-                     * just the first character in the node is acceptable.  I
-                     * (khw) decided to try this method which doesn't have that
-                     * pitfall; if performance issues are found, we can do a
-                     * combination of the current approach plus that one */
-                    upper_fill = len;
-                    len = 0;
-                    s = s0;
-                    goto reparse;
-                }
             }   /* End of verifying node ends with an appropriate char */
  
+                p = oldp;
+
            loopdone:   /* Jumped to when encounters something that shouldn't be
                           in the node */
  
@@ -14701,15 +14739,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              else {
  
                  /* If the node type is EXACT here, check to see if it
-                 * should be EXACTL, or EXACT_ONLY8. */
+                 * should be EXACTL, or EXACT_REQ8. */
                  if (node_type == EXACT) {
                      if (LOC) {
                          node_type = EXACTL;
                      }
                      else if (requires_utf8_target) {
-                        node_type = EXACT_ONLY8;
+                        node_type = EXACT_REQ8;
                      }
-                } else if (FOLD) {
+                }
+                else if (node_type == LEXACT) {
+                    if (requires_utf8_target) {
+                        node_type = LEXACT_REQ8;
+                    }
+                }
+                else if (FOLD) {
                      if (    UNLIKELY(has_micro_sign || has_ss)
                          && (node_type == EXACTFU || (   node_type == EXACTF
                                                       && maybe_exactfu)))
@@ -14726,9 +14770,29 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          if (maybe_exactfu) {
                              node_type = EXACTFLU8;
                          }
+                        else if (UNLIKELY(
+                             _invlist_contains_cp(PL_HasMultiCharFold, ender)))
+                        {
+                            /* A character that folds to more than one will
+                             * match multiple characters, so can't be SIMPLE.
+                             * We don't have to worry about this with EXACTFLU8
+                             * nodes just above, as they have already been
+                             * folded (since the fold doesn't vary at run
+                             * time).  Here, if the final character in the node
+                             * folds to multiple, it can't be simple.  (This
+                             * only has an effect if the node has only a single
+                             * character, hence the final one, as elsewhere we
+                             * turn off simple for nodes whose length > 1 */
+                            maybe_SIMPLE = 0;
+                        }
                      }
                      else if (node_type == EXACTF) {  /* Means is /di */
  
+                        /* This intermediate variable is needed solely because
+                         * the asserts in the macro where used exceed Win32's
+                         * literal string capacity */
+                        char first_char = * STRING(REGNODE_p(ret));
+
                          /* If 'maybe_exactfu' is clear, then we need to stay
                           * /di.  If it is set, it means there are no code
                           * points that match differently depending on UTF8ness
@@ -14737,7 +14801,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          if (! maybe_exactfu) {
                              RExC_seen_d_op = TRUE;
                          }
-                        else if (   isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's')
+                        else if (   isALPHA_FOLD_EQ(first_char, 's')
                                   || isALPHA_FOLD_EQ(ender, 's'))
                          {
                              /* But, if the node begins or ends in an 's' we
@@ -14757,7 +14821,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      }
  
                      if (requires_utf8_target && node_type == EXACTFU) {
-                        node_type = EXACTFU_ONLY8;
+                        node_type = EXACTFU_REQ8;
                      }
                  }
  
@@ -14766,7 +14830,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  RExC_emit += STR_SZ(len);
  
                  /* If the node isn't a single character, it can't be SIMPLE */
-                if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) {
+                if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
                      maybe_SIMPLE = 0;
                  }
  
@@ -15936,8 +16000,11 @@ redo_curchar:
  
                      /* Recurse, with the meat of the embedded expression */
                      RExC_parse++;
-                    (void) handle_regex_sets(pRExC_state, &current, flagp,
-                                                    depth+1, oregcomp_parse);
+                    if (! handle_regex_sets(pRExC_state, &current, flagp,
+                                                    depth+1, oregcomp_parse))
+                    {
+                        RETURN_FAIL_ON_RESTART(*flagp, flagp);
+                    }
  
                      /* Here, 'current' contains the embedded expression's
                       * inversion list, and RExC_parse points to the trailing
@@ -15991,6 +16058,7 @@ redo_curchar:
                                FALSE, /* Require return to be an ANYOF */
                                &current))
                  {
+                    RETURN_FAIL_ON_RESTART(*flagp, flagp);
                      goto regclass_failed;
                  }
  
@@ -16027,6 +16095,7 @@ redo_curchar:
                                  FALSE, /* Require return to be an ANYOF */
                                  &current))
                  {
+                    RETURN_FAIL_ON_RESTART(*flagp, flagp);
                      goto regclass_failed;
                  }
  
@@ -16387,8 +16456,10 @@ redo_curchar:
          RExC_flags |= RXf_PMf_FOLD;
      }
  
-    if (!node)
+    if (!node) {
+        RETURN_FAIL_ON_RESTART(*flagp, flagp);
          goto regclass_failed;
+    }
  
      /* Fix up the node type if we are in locale.  (We have pretended we are
       * under /u for the purposes of regclass(), as this construct will only
@@ -18125,6 +18196,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
                  /* Likewise for 'posixes' */
                  _invlist_union(posixes, cp_list, &cp_list);
+                SvREFCNT_dec(posixes);
  
                  /* Likewise for anything else in the range that matched only
                   * under UTF-8 */
@@ -18375,6 +18447,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      goto not_anyof;
                  }
              }
+
              /* For well-behaved locales, some classes are subsets of others,
               * so complementing the subset and including the non-complemented
               * superset should match everything, like [\D[:alnum:]], and
@@ -18479,7 +18552,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
          /* Next see if can optimize classes that contain just a few code points
           * into an EXACTish node.  The reason to do this is to let the
-         * optimizer join this node with adjacent EXACTish ones.
+         * optimizer join this node with adjacent EXACTish ones, and ANYOF
+         * nodes require conversion to code point from UTF-8.
           *
           * An EXACTFish node can be generated even if not under /i, and vice
           * versa.  But care must be taken.  An EXACTFish node has to be such
@@ -18527,7 +18601,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      op = (FOLD) ? EXACTFL : EXACTL;
                  }
                  else if (! FOLD) { /* Not /l and not /i */
-                    op = (start[0] < 256) ? EXACT : EXACT_ONLY8;
+                    op = (start[0] < 256) ? EXACT : EXACT_REQ8;
                  }
                  else if (start[0] < 256) { /* /i, not /l, and the code point is
                                                small */
@@ -18557,8 +18631,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                             applies to it */
                      op = _invlist_contains_cp(PL_InMultiCharFold,
                                                start[0])
-                         ? EXACTFU_ONLY8
-                         : EXACT_ONLY8;
+                         ? EXACTFU_REQ8
+                         : EXACT_REQ8;
                  }
  
                  value = start[0];
@@ -18722,7 +18796,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                       ? EXACTFLU8
                                       : (ASCII_FOLD_RESTRICTED)
                                         ? EXACTFAA
-                                       : EXACTFU_ONLY8;
+                                       : EXACTFU_REQ8;
                                  value = folded;
                              }
                          }   /* Below, the lowest code point < 256 */
@@ -18855,7 +18929,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
  
              if (invlist_highest(cp_list) <= max_permissible) {
                  UV this_start, this_end;
-                UV lowest_cp = UV_MAX;  /* inited to suppress compiler warn */
+                UV lowest_cp = UV_MAX;  /* init'ed to suppress compiler warn */
                  U8 bits_differing = 0;
                  Size_t full_cp_count = 0;
                  bool first_time = TRUE;
@@ -18932,6 +19006,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              if (op != END) {
                  goto not_anyof;
              }
+
+            /* XXX We could create an ANYOFR_LOW node here if we saved above if
+             * all were invariants, it wasn't inverted, and there is a single
+             * range.  This would be faster than some of the posix nodes we
+             * create below like /\d/a, but would be twice the size.  Without
+             * having actually measured the gain, khw doesn't think the
+             * tradeoff is really worth it */
          }
  
          if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
@@ -19070,7 +19151,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
              /* We store the lowest possible first byte of the UTF-8
               * representation, using the flags field.  This allows for quick
               * ruling out of some inputs without having to convert from UTF-8
-             * to code point.  For EBCDIC, this has to be I8. */
+             * to code point.  For EBCDIC, we use I8, as not doing that
+             * transformation would not rule out nearly so many things */
              anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
  
              /* If the first UTF-8 start byte for the highest code point in the
@@ -19953,15 +20035,17 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
  #endif
          if ( exact ) {
              switch (OP(REGNODE_p(scan))) {
+                case LEXACT:
                  case EXACT:
-                case EXACT_ONLY8:
+                case LEXACT_REQ8:
+                case EXACT_REQ8:
                  case EXACTL:
                  case EXACTF:
                  case EXACTFU_S_EDGE:
                  case EXACTFAA_NO_TRIE:
                  case EXACTFAA:
                  case EXACTFU:
-                case EXACTFU_ONLY8:
+                case EXACTFU_REQ8:
                  case EXACTFLU8:
                  case EXACTFUP:
                  case EXACTFL:
@@ -21532,7 +21616,7 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
          if (start >= NUM_ANYOF_CODE_POINTS) {
              this_end = end;
          }
-        else {
+        else {  /* Have to split range at the bitmap boundary */
              this_end = (end < NUM_ANYOF_CODE_POINTS)
                          ? end
                          : NUM_ANYOF_CODE_POINTS - 1;
@@ -22926,7 +23010,7 @@ Perl_parse_uniprop_string(pTHX_
                                  Titlecase Mapping  (both full and simple)
                                  Uppercase Mapping  (both full and simple)
               * Move the part that looks at the property values into a perl
-             * script, like utf8_heavy.pl is done.  This makes things somewhat
+             * script, like utf8_heavy.pl was done.  This makes things somewhat
               * easier, but most importantly, it avoids always adding all these
               * strings to the memory usage when the feature is little-used.
               *