perlguts: Update info about e.g. UVuf

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index b389f9e..4948a03 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -850,7 +850,8 @@ static const scan_data_t zero_scan_data = {
  #define UPDATE_WARNINGS_LOC(loc)                                        \
      STMT_START {                                                        \
          if (TO_OUTPUT_WARNINGS(loc)) {                                  \
-            RExC_latest_warn_offset = (xI(loc)) - RExC_precomp;         \
+            RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))         \
+                                                       - RExC_precomp;  \
          }                                                               \
      } STMT_END
  
@@ -3551,9 +3552,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                      if ( state==1 ) {
                          OP( convert ) = nodetype;
                          str=STRING(convert);
-                        STR_LEN(convert)=0;
+                        setSTR_LEN(convert, 0);
                      }
-                    STR_LEN(convert) += len;
+                    setSTR_LEN(convert, STR_LEN(convert) + len);
                      while (len--)
                          *str++ = *ch++;
                 } else {
@@ -3993,8 +3994,9 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour
   *      using /iaa matching will be doing so almost entirely with ASCII
   *      strings, so this should rarely be encountered in practice */
  
-#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
-    if (PL_regkind[OP(scan)] == EXACT) \
+#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags)    \
+    if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT         \
+                                      && OP(scan) != LEXACT_ONLY8)  \
          join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
  
  STATIC U32
@@ -4160,7 +4162,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
              merged++;
  
              NEXT_OFF(scan) += NEXT_OFF(n);
-            STR_LEN(scan) += STR_LEN(n);
+            setSTR_LEN(scan, STR_LEN(scan) + STR_LEN(n));
              next = n + NODE_SZ_STR(n);
              /* Now we can overwrite *n : */
              Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
@@ -5197,7 +5199,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             }
         }
         else if (   OP(scan) == EXACT
+                 || OP(scan) == LEXACT
                   || OP(scan) == EXACT_ONLY8
+                 || OP(scan) == LEXACT_ONLY8
                   || OP(scan) == EXACTL)
          {
             SSize_t l = STR_LEN(scan);
@@ -5319,7 +5323,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
                     next = NEXTOPER(scan);
                     if (   OP(next) == EXACT
+                        || OP(next) == LEXACT
                          || OP(next) == EXACT_ONLY8
+                        || OP(next) == LEXACT_ONLY8
                          || OP(next) == EXACTL
                          || (flags & SCF_DO_STCLASS))
                      {
@@ -7584,6 +7590,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
          && memEQ(RX_PRECOMP(old_re), exp, plen)
         && !runtime_code /* with runtime code, always recompile */ )
      {
+        DEBUG_COMPILE_r({
+            SV *dsv= sv_newmortal();
+            RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
+            Perl_re_printf( aTHX_  "%sSkipping recompilation of unchanged REx%s %s\n",
+                          PL_colors[4], PL_colors[5], s);
+        });
          return old_re;
      }
  
@@ -7972,7 +7984,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
          /* Ignore EXACT as we deal with it later. */
         if (PL_regkind[OP(first)] == EXACT) {
             if (   OP(first) == EXACT
+               || OP(first) == LEXACT
                  || OP(first) == EXACT_ONLY8
+                || OP(first) == LEXACT_ONLY8
                  || OP(first) == EXACTL)
              {
                 NOOP;   /* Empty, get anchored substr later. */
@@ -8318,7 +8332,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                   && nop == END)
              RExC_rx->extflags |= RXf_WHITE;
          else if ( RExC_rx->extflags & RXf_SPLIT
-                  && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
+                  && (   fop == EXACT || fop == LEXACT
+                      || fop == EXACT_ONLY8 || fop == LEXACT_ONLY8
+                      || fop == EXACTL)
                    && STR_LEN(first) == 1
                    && *(STRING(first)) == ' '
                    && nop == END )
@@ -13916,13 +13932,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             UV ender = 0;
             char *p;
             char *s;
-
-/* This allows us to fill a node with just enough spare so that if the final
- * character folds, its expansion is guaranteed to fit */
-#define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
-
             char *s0;
-           U8 upper_parse = MAX_NODE_STRING_SIZE;
+            U32 max_string_len = 255;
+
+            /* We may have to reparse the node, artificially stopping filling
+             * it early, based on info gleaned in the first parse.  This
+             * variable gives where we stop.  Make it above the normal stopping
+             * place first time through. */
+           U32 upper_fill = max_string_len + 1;
  
              /* We start out as an EXACT node, even if under /i, until we find a
               * character which is in a fold.  The algorithm now segregates into
@@ -13938,7 +13955,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              /* Assume the node will be fully used; the excess is given back at
               * the end.  We can't make any other length assumptions, as a byte
               * input sequence could shrink down. */
-            Ptrdiff_t initial_size = STR_SZ(256);
+            Ptrdiff_t current_string_nodes = STR_SZ(max_string_len);
  
              bool next_is_quantifier;
              char * oldp = NULL;
@@ -13969,10 +13986,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
              /* So is the MICRO SIGN */
              bool has_micro_sign = FALSE;
  
+            /* Set when we fill up the current node and there is still more
+             * text to process */
+            bool overflowed;
+
              /* Allocate an EXACT node.  The node_type may change below to
               * another EXACTish node, but since the size of the node doesn't
               * change, it works */
-            ret = regnode_guts(pRExC_state, node_type, initial_size, "exact");
+            ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
+                                                                    "exact");
              FILL_NODE(ret, node_type);
              RExC_emit++;
  
@@ -13982,6 +14004,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
           reparse:
  
+            p = RExC_parse;
+            len = 0;
+            s = s0;
+
+          continue_parse:
+
              /* This breaks under rare circumstances.  If folding, we do not
               * want to split a node at a character that is a non-final in a
               * multi-char fold, as an input string could just happen to want to
@@ -13996,12 +14024,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
                     || UTF8_IS_START(UCHARAT(RExC_parse)));
  
+            overflowed = FALSE;
+
              /* Here, we have a literal character.  Find the maximal string of
               * them in the input that we can fit into a single EXACTish node.
               * We quit at the first non-literal or when the node gets full, or
               * under /i the categorization of folding/non-folding character
               * changes */
-           for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
+            while (p < RExC_end && len < upper_fill) {
  
                  /* In most cases each iteration adds one byte to the output.
                   * The exceptions override this */
@@ -14339,20 +14369,29 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  /* Ready to add 'ender' to the node */
  
                  if (! FOLD) {  /* The simple case, just append the literal */
+                  not_fold_common:
  
-                      not_fold_common:
-                        if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
-                            *(s++) = (char) ender;
-                        }
-                        else {
-                            U8 * new_s = uvchr_to_utf8((U8*)s, ender);
-                            added_len = (char *) new_s - s;
-                            s = (char *) new_s;
+                    /* Don't output if it would overflow */
+                    if (UNLIKELY(len > max_string_len - ((UTF)
+                                                         ? UVCHR_SKIP(ender)
+                                                         : 1)))
+                    {
+                        overflowed = TRUE;
+                        break;
+                    }
  
-                            if (ender > 255)  {
-                                requires_utf8_target = TRUE;
-                            }
+                    if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
+                        *(s++) = (char) ender;
+                    }
+                    else {
+                        U8 * new_s = uvchr_to_utf8((U8*)s, ender);
+                        added_len = (char *) new_s - s;
+                        s = (char *) new_s;
+
+                        if (ender > 255)  {
+                            requires_utf8_target = TRUE;
                          }
+                    }
                  }
                  else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
  
@@ -14418,20 +14457,33 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
                      if (UTF) {  /* Use the folded value */
                          if (UVCHR_IS_INVARIANT(ender)) {
+                            if (UNLIKELY(len + 1 > max_string_len)) {
+                                overflowed = TRUE;
+                                break;
+                            }
+
                              *(s)++ = (U8) toFOLD(ender);
                          }
                          else {
-                            ender = _to_uni_fold_flags(
+                            U8 temp[UTF8_MAXBYTES_CASE+1];
+
+                            UV folded = _to_uni_fold_flags(
                                      ender,
-                                    (U8 *) s,
+                                    temp,
                                      &added_len,
                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
                                                      ? FOLD_FLAGS_NOMIX_ASCII
                                                      : 0));
+                            if (UNLIKELY(len + added_len > max_string_len)) {
+                                overflowed = TRUE;
+                                break;
+                            }
+
+                            Copy(temp, s, added_len, char);
                              s += added_len;
  
-                            if (   ender > 255
-                                && LIKELY(ender != GREEK_SMALL_LETTER_MU))
+                            if (   folded > 255
+                                && LIKELY(folded != GREEK_SMALL_LETTER_MU))
                              {
                                  /* U+B5 folds to the MU, so its possible for a
                                   * non-UTF-8 target to match it */
@@ -14483,9 +14535,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
                                  maybe_SIMPLE = 0;
                                  if (node_type == EXACTFU) {
+
+                                    if (UNLIKELY(len + 2 > max_string_len)) {
+                                        overflowed = TRUE;
+                                        break;
+                                    }
+
                                      *(s++) = 's';
  
-                                    /* Let the code below add in the extra 's' */
+                                    /* Let the code below add in the extra 's'
+                                     * */
                                      ender = 's';
                                      added_len = 2;
                                  }
@@ -14497,6 +14556,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                              has_micro_sign = TRUE;
                          }
  
+                        if (UNLIKELY(len + 1 > max_string_len)) {
+                            overflowed = TRUE;
+                            break;
+                        }
+
                          *(s++) = (DEPENDS_SEMANTICS)
                                   ? (char) toFOLD(ender)
  
@@ -14521,168 +14585,280 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  
             } /* End of loop through literal characters */
  
-            /* Here we have either exhausted the input or ran out of room in
-             * the node.  (If we encountered a character that can't be in the
-             * node, transfer is made directly to <loopdone>, and so we
-             * wouldn't have fallen off the end of the loop.)  In the latter
-             * case, we artificially have to split the node into two, because
-             * we just don't have enough space to hold everything.  This
-             * creates a problem if the final character participates in a
-             * multi-character fold in the non-final position, as a match that
-             * should have occurred won't, due to the way nodes are matched,
-             * and our artificial boundary.  So back off until we find a non-
-             * problematic character -- one that isn't at the beginning or
-             * middle of such a fold.  (Either it doesn't participate in any
-             * folds, or appears only in the final position of all the folds it
-             * does participate in.)  A better solution with far fewer false
-             * positives, and that would fill the nodes more completely, would
-             * be to actually have available all the multi-character folds to
-             * test against, and to back-off only far enough to be sure that
-             * this node isn't ending with a partial one.  <upper_parse> is set
-             * further below (if we need to reparse the node) to include just
-             * up through that final non-problematic character that this code
-             * identifies, so when it is set to less than the full node, we can
-             * skip the rest of this */
-            if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
-                PERL_UINT_FAST8_T backup_count = 0;
-
-                const STRLEN full_len = len;
-
-               assert(len >= MAX_NODE_STRING_SIZE);
-
-                /* Here, <s> points to just beyond where we have output the
-                 * final character of the node.  Look backwards through the
-                 * string until find a non- problematic character */
-
-               if (! UTF) {
-
-                    /* This has no multi-char folds to non-UTF characters */
-                    if (ASCII_FOLD_RESTRICTED) {
-                        goto loopdone;
-                    }
-
-                    while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
-                        backup_count++;
-                    }
-                    len = s - s0 + 1;
-               }
-                else {
+            /* Here we have either exhausted the input or run out of room in
+             * the node.  If the former, we are done.  (If we encountered a
+             * character that can't be in the node, transfer is made directly
+             * to <loopdone>, and so we wouldn't have fallen off the end of the
+             * loop.)  */
+            if (LIKELY(! overflowed)) {
+                goto loopdone;
+            }
+
+            /* Here we have run out of room.  We can grow plain EXACT and
+             * LEXACT nodes.  If the pattern is gigantic enough, though,
+             * eventually we'll have to artificially chunk the pattern into
+             * multiple nodes. */
+            if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
+                Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
+                Size_t overhead_expansion = 0;
+                char temp[256];
+                Size_t max_nodes_for_string;
+                Size_t achievable;
+                SSize_t delta;
+
+                /* Here we couldn't fit the final character in the current
+                 * node, so it will have to be reparsed, no matter what else we
+                 * do */
+                p = oldp;
+
+
+                /* If would have overflowed a regular EXACT node, switch
+                 * instead to an LEXACT.  The code below is structured so that
+                 * the actual growing code is common to changing from an EXACT
+                 * or just increasing the LEXACT size.  This means that we have
+                 * to save the string in the EXACT case before growing, and
+                 * then copy it afterwards to its new location */
+                if (node_type == EXACT) {
+                    overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
+                    RExC_emit += overhead_expansion;
+                    Copy(s0, temp, len, char);
+                }
+
+                /* Ready to grow.  If it was a plain EXACT, the string was
+                 * saved, and the first few bytes of it overwritten by adding
+                 * an argument field.  We assume, as we do elsewhere in this
+                 * file, that one byte of remaining input will translate into
+                 * one byte of output, and if that's too small, we grow again,
+                 * if too large the excess memory is freed at the end */
+
+                max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
+                achievable = MIN(max_nodes_for_string,
+                                 current_string_nodes + STR_SZ(RExC_end - p));
+                delta = achievable - current_string_nodes;
+
+                /* If there is just no more room, go finish up this chunk of
+                 * the pattern. */
+                if (delta <= 0) {
+                    goto loopdone;
+                }
  
-                    /* Point to the first byte of the final character */
-                    s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+                change_engine_size(pRExC_state, delta + overhead_expansion);
+                current_string_nodes += delta;
+                max_string_len
+                           = sizeof(struct regnode) * current_string_nodes;
+                upper_fill = max_string_len + 1;
+
+                /* If the length was small, we know this was originally an
+                 * EXACT node now converted to LEXACT, and the string has to be
+                 * restored.  Otherwise the string was untouched.  260 is just
+                 * a number safely above 255 so don't have to worry about
+                 * getting it precise */
+                if (len < 260) {
+                    node_type = LEXACT;
+                    FILL_NODE(ret, node_type);
+                    s0 = STRING(REGNODE_p(ret));
+                    Copy(temp, s0, len, char);
+                    s = s0 + len;
+                }
+
+                goto continue_parse;
+            }
+            else if (! LOC) {
+
+                /* Here is /i.  Running out of room creates a problem if we are
+                 * folding, and the split happens in the middle of a
+                 * multi-character fold, as a match that should have occurred,
+                 * won't, due to the way nodes are matched, and our artificial
+                 * boundary.  So back off until we aren't splitting such a
+                 * fold.  If there is no such place to back off to, we end up
+                 * taking the entire node as-is.  This can happen if the node
+                 * consists entirely of 'f' or entirely of 's' characters (or
+                 * things that fold to them) as 'ff' and 'ss' are
+                 * multi-character folds.
+                 *
+                 * At this point:
+                 *  oldp        points to the beginning in the input of the
+                 *              final character in the node.
+                 *  p           points to the beginning in the input of the
+                 *              next character in the input, the one that won't
+                 *              fit in the node.
+                 *
+                 * We aren't in the middle of a multi-char fold unless the
+                 * final character in the node can appear in a non-final
+                 * position in such a fold.  Very few characters actually
+                 * participate in multi-character folds, and fewer still can be
+                 * in the non-final position.  But it's complicated to know
+                 * here if that final character is folded or not, so skip this
+                 * check */
+
+                           /* Make sure enough space for final char of node,
+                            * first char of following node, and the fold of the
+                            * following char (so we don't have to worry about
+                            * that fold running off the end) */
+                U8 foldbuf[UTF8_MAXBYTES_CASE * 5 + 1];
+                STRLEN fold_len;
+                UV folded;
  
-                    while (s >= s0) {   /* Search backwards until find
-                                           a non-problematic char */
-                        if (UTF8_IS_INVARIANT(*s)) {
+                assert(FOLD);
+
+                /* The Unicode standard says that multi character folds consist
+                 * of either two or three characters.  So we create a buffer
+                 * containing a window of three.  The first is the final
+                 * character in the node (folded), and then the two that begin
+                 * the following node.   But if the first character of the
+                 * following node can't be in a non-final fold position, there
+                 * is no need to look at its successor character.  The macros
+                 * used below to check for multi character folds require folded
+                 * inputs, so we have to fold these.  (The fold of p was likely
+                 * calculated in the loop above, but it hasn't been saved, and
+                 * khw thinks it would be too entangled to change to do so) */
+
+                if (UTF || LIKELY(UCHARAT(p) != MICRO_SIGN)) {
+                    folded = _to_uni_fold_flags(ender,
+                                                foldbuf,
+                                                &fold_len,
+                                                FOLD_FLAGS_FULL);
+                }
+                else {
+                    foldbuf[0] = folded = MICRO_SIGN;
+                    fold_len = 1;
+                }
+
+                /* Here, foldbuf contains the fold of the first character in
+                 * the next node.  We may also need the next one (if there is
+                 * one) to get our third, but if the first character folded to
+                 * more than one, those extra one(s) will serve as the third.
+                 * Also, we don't need a third unless the previous one can
+                 * appear in a non-final position in a fold */
+                if (  ((RExC_end - p) > ((UTF) ? UVCHR_SKIP(ender) : 1))
+                    && (fold_len == 1 || (   UTF
+                                          && UVCHR_SKIP(folded) == fold_len))
+                    &&  UNLIKELY(_invlist_contains_cp(PL_NonFinalFold, folded)))
+                {
+                    if (UTF) {
+                        STRLEN next_fold_len;
  
-                            /* There are no ascii characters that participate
-                             * in multi-char folds under /aa.  In EBCDIC, the
-                             * non-ascii invariants are all control characters,
-                             * so don't ever participate in any folds. */
-                            if (ASCII_FOLD_RESTRICTED
-                                || ! IS_NON_FINAL_FOLD(*s))
-                            {
-                                break;
-                            }
-                        }
-                        else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-                            if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
-                                                                  *s, *(s+1))))
-                            {
-                                break;
-                            }
+                        toFOLD_utf8_safe((U8*) p + UTF8SKIP(p),
+                                         (U8*) RExC_end, foldbuf + fold_len,
+                                         &next_fold_len);
+                        fold_len += next_fold_len;
+                    }
+                    else {
+                        if (UNLIKELY(p[1] == LATIN_SMALL_LETTER_SHARP_S)) {
+                            foldbuf[fold_len] = 's';
                          }
-                        else if (! _invlist_contains_cp(
-                                        PL_NonFinalFold,
-                                        valid_utf8_to_uvchr((U8 *) s, NULL)))
-                        {
-                            break;
+                        else {
+                            foldbuf[fold_len] = toLOWER_L1(p[1]);
                          }
+                        fold_len++;
+                    }
+                }
  
-                        /* Here, the current character is problematic in that
-                         * it does occur in the non-final position of some
-                         * fold, so try the character before it, but have to
-                         * special case the very first byte in the string, so
-                         * we don't read outside the string */
-                        s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
-                        backup_count++;
-                    } /* End of loop backwards through the string */
-
-                    /* If there were only problematic characters in the string,
-                     * <s> will point to before s0, in which case the length
-                     * should be 0, otherwise include the length of the
-                     * non-problematic character just found */
-                    len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
-               }
+                /* Here foldbuf contains the fold of p, and if appropriate
+                 * that of the character following p in the input. */
  
-                /* Here, have found the final character, if any, that is
-                 * non-problematic as far as ending the node without splitting
-                 * it across a potential multi-char fold.  <len> contains the
-                 * number of bytes in the node up-to and including that
-                 * character, or is 0 if there is no such character, meaning
-                 * the whole node contains only problematic characters.  In
-                 * this case, give up and just take the node as-is.  We can't
-                 * do any better */
-                if (len == 0) {
-                    len = full_len;
+                /* Search backwards until find a place that doesn't split a
+                 * multi-char fold */
+                while (1) {
+                    STRLEN s_len;
+                    char s_fold_buf[UTF8_MAXBYTES_CASE];
+                    char * s_fold = s_fold_buf;
  
-                } else {
+                    if (s <= s0) {
  
-                    /* Here, the node does contain some characters that aren't
-                     * problematic.  If we didn't have to backup any, then the
-                     * final character in the node is non-problematic, and we
-                     * can take the node as-is */
-                    if (backup_count == 0) {
-                        goto loopdone;
+                        /* There's no safe place in the node to split.  Quit so
+                         * will take the whole node */
+                        break;
                      }
-                    else if (backup_count == 1) {
  
-                        /* If the final character is problematic, but the
-                         * penultimate is not, back-off that last character to
-                         * later start a new node with it */
-                        p = oldp;
-                        goto loopdone;
+                    /* Backup 1 character.  The first time through this moves s
+                     * to point to the final character in the node */
+                    if (UTF) {
+                        s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+                    }
+                    else {
+                        s--;
                      }
  
-                    /* Here, the final non-problematic character is earlier
-                     * in the input than the penultimate character.  What we do
-                     * is reparse from the beginning, going up only as far as
-                     * this final ok one, thus guaranteeing that the node ends
-                     * in an acceptable character.  The reason we reparse is
-                     * that we know how far in the character is, but we don't
-                     * know how to correlate its position with the input parse.
-                     * An alternate implementation would be to build that
-                     * correlation as we go along during the original parse,
-                     * but that would entail extra work for every node, whereas
-                     * this code gets executed only when the string is too
-                     * large for the node, and the final two characters are
-                     * problematic, an infrequent occurrence.  Yet another
-                     * possible strategy would be to save the tail of the
-                     * string, and the next time regatom is called, initialize
-                     * with that.  The problem with this is that unless you
-                     * back off one more character, you won't be guaranteed
-                     * regatom will get called again, unless regbranch,
-                     * regpiece ... are also changed.  If you do back off that
-                     * extra character, so that there is input guaranteed to
-                     * force calling regatom, you can't handle the case where
-                     * just the first character in the node is acceptable.  I
-                     * (khw) decided to try this method which doesn't have that
-                     * pitfall; if performance issues are found, we can do a
-                     * combination of the current approach plus that one */
-                    upper_parse = len;
-                    len = 0;
-                    s = s0;
-                    goto reparse;
+                    /* 's' may or may not be folded; so make sure it is, and
+                     * use just the final character in its fold (should there
+                     * be more than one) */
+                    if (UTF) {
+                        toFOLD_utf8_safe((U8*) s,
+                                         (U8*) s + UTF8SKIP(s),
+                                         (U8 *) s_fold_buf, &s_len);
+                        while (s_fold + UTF8SKIP(s_fold) < s_fold_buf + s_len)
+                        {
+                            s_fold += UTF8SKIP(s_fold);
+                        }
+                        s_len = UTF8SKIP(s_fold);
+                    }
+                    else {
+                        if (UNLIKELY(UCHARAT(s) == LATIN_SMALL_LETTER_SHARP_S))
+                        {
+                            s_fold_buf[0] = 's';
+                        }
+                        else {  /* This works for all other non-UTF-8 folds
+                                 */
+                            s_fold_buf[0] = toLOWER_L1(UCHARAT(s));
+                        }
+                        s_len = 1;
+                    }
+
+                    /* Unshift this character to the beginning of the buffer.
+                     * Trailing characters that are no longer needed are
+                     * overwritten. */
+                    Move(foldbuf, foldbuf + s_len, sizeof(foldbuf) - s_len, U8);
+                    Copy(s_fold, foldbuf, s_len, U8);
+
+                    /* If this isn't a multi-character fold, we have found a
+                     * splittable place.  If this is the final character in the
+                     * node, that means the node is valid as-is, and can quit.
+                     * Otherwise, we note how much we can fill the node before
+                     * coming to a non-splittable position, and go parse it
+                     * again, stopping there. This is done because we know
+                     * where in the output to stop, but we don't have a map to
+                     * where that is in the input.  One could be created, but
+                     * it seems like overkill for such a rare event as we are
+                     * dealing with here */
+                    if (UTF) {
+                        if (! is_MULTI_CHAR_FOLD_utf8_safe(foldbuf,
+                                                foldbuf + UTF8_MAXBYTES_CASE))
+                        {
+                            upper_fill = s + UTF8SKIP(s) - s0;
+                            if (LIKELY(upper_fill == 255)) {
+                                break;
+                            }
+                            goto reparse;
+                        }
+                    }
+                    else if (! is_MULTI_CHAR_FOLD_latin1_safe(foldbuf,
+                                                foldbuf + UTF8_MAXBYTES_CASE))
+                    {
+                        upper_fill = s + 1 - s0;
+                        if (LIKELY(upper_fill == 255)) {
+                            break;
+                        }
+                        goto reparse;
+                    }
                  }
+
+                /* Here the node consists entirely of non-final multi-char
+                 * folds.  (Likely it is all 'f's or all 's's.)  There's no
+                 * decent place to split it, so give up and just take the whole
+                 * thing */
+
             }   /* End of verifying node ends with an appropriate char */
  
+            p = oldp;
+
            loopdone:   /* Jumped to when encounters something that shouldn't be
                           in the node */
  
              /* Free up any over-allocated space; cast is to silence bogus
               * warning in MS VC */
              change_engine_size(pRExC_state,
-                                - (Ptrdiff_t) (initial_size - STR_SZ(len)));
+                        - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
  
              /* I (khw) don't know if you can get here with zero length, but the
               * old code handled this situation by creating a zero-length EXACT
@@ -14701,7 +14877,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      else if (requires_utf8_target) {
                          node_type = EXACT_ONLY8;
                      }
-                } else if (FOLD) {
+                }
+                else if (node_type == LEXACT) {
+                    if (requires_utf8_target) {
+                        node_type = LEXACT_ONLY8;
+                    }
+                }
+                else if (FOLD) {
                      if (    UNLIKELY(has_micro_sign || has_ss)
                          && (node_type == EXACTFU || (   node_type == EXACTF
                                                       && maybe_exactfu)))
@@ -14721,6 +14903,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                      }
                      else if (node_type == EXACTF) {  /* Means is /di */
  
+                        /* This intermediate variable is needed solely because
+                         * the asserts in the macro in which it is used exceed
+                         * Win32's literal string capacity */
+                        char first_char = * STRING(REGNODE_p(ret));
+
                          /* If 'maybe_exactfu' is clear, then we need to stay
                           * /di.  If it is set, it means there are no code
                           * points that match differently depending on UTF8ness
@@ -14729,7 +14916,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                          if (! maybe_exactfu) {
                              RExC_seen_d_op = TRUE;
                          }
-                        else if (   isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's')
+                        else if (   isALPHA_FOLD_EQ(first_char, 's')
                                   || isALPHA_FOLD_EQ(ender, 's'))
                          {
                              /* But, if the node begins or ends in an 's' we
@@ -14754,11 +14941,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                  }
  
                  OP(REGNODE_p(ret)) = node_type;
-                STR_LEN(REGNODE_p(ret)) = len;
+                setSTR_LEN(REGNODE_p(ret), len);
                  RExC_emit += STR_SZ(len);
  
                  /* If the node isn't a single character, it can't be SIMPLE */
-                if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) {
+                if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
                      maybe_SIMPLE = 0;
                  }
  
@@ -15928,8 +16115,11 @@ redo_curchar:
  
                      /* Recurse, with the meat of the embedded expression */
                      RExC_parse++;
-                    (void) handle_regex_sets(pRExC_state, &current, flagp,
-                                                    depth+1, oregcomp_parse);
+                    if (! handle_regex_sets(pRExC_state, &current, flagp,
+                                                    depth+1, oregcomp_parse))
+                    {
+                        RETURN_FAIL_ON_RESTART(*flagp, flagp);
+                    }
  
                      /* Here, 'current' contains the embedded expression's
                       * inversion list, and RExC_parse points to the trailing
@@ -15983,6 +16173,7 @@ redo_curchar:
                                FALSE, /* Require return to be an ANYOF */
                                &current))
                  {
+                    RETURN_FAIL_ON_RESTART(*flagp, flagp);
                      goto regclass_failed;
                  }
  
@@ -16019,6 +16210,7 @@ redo_curchar:
                                  FALSE, /* Require return to be an ANYOF */
                                  &current))
                  {
+                    RETURN_FAIL_ON_RESTART(*flagp, flagp);
                      goto regclass_failed;
                  }
  
@@ -16379,8 +16571,10 @@ redo_curchar:
          RExC_flags |= RXf_PMf_FOLD;
      }
  
-    if (!node)
+    if (!node) {
+        RETURN_FAIL_ON_RESTART(*flagp, flagp);
          goto regclass_failed;
+    }
  
      /* Fix up the node type if we are in locale.  (We have pretended we are
       * under /u for the purposes of regclass(), as this construct will only
@@ -18796,7 +18990,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                      ret = regnode_guts(pRExC_state, op, len, "exact");
                      FILL_NODE(ret, op);
                      RExC_emit += 1 + STR_SZ(len);
-                    STR_LEN(REGNODE_p(ret)) = len;
+                    setSTR_LEN(REGNODE_p(ret), len);
                      if (len == 1) {
                          *STRING(REGNODE_p(ret)) = (U8) value;
                      }
@@ -19601,8 +19795,9 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
  STATIC void
  S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
  {
-    /* 'size' is the delta to add or subtract from the current memory allocated
-     * to the regex engine being constructed */
+    /* 'size' is the delta number of smallest regnode equivalents to add or
+     * subtract from the current memory allocated to the regex engine being
+     * constructed. */
  
      PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
  
@@ -19634,8 +19829,8 @@ S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
  STATIC regnode_offset
  S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
  {
-    /* Allocate a regnode for 'op', with 'extra_size' extra space.  It aligns
-     * and increments RExC_size and RExC_emit
+    /* Allocate a regnode for 'op', with extra space of 'extra_size' (smallest)
+     * regnode equivalents.  It aligns and increments RExC_size and RExC_emit
       *
       * It returns the regnode's offset into the regex engine program */
  
@@ -19944,7 +20139,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
  #endif
          if ( exact ) {
              switch (OP(REGNODE_p(scan))) {
+                case LEXACT:
                  case EXACT:
+                case LEXACT_ONLY8:
                  case EXACT_ONLY8:
                  case EXACTL:
                  case EXACTF: