This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlguts: Update info about e.g. UVuf
[perl5.git] / regcomp.c
index b389f9e..4948a03 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -850,7 +850,8 @@ static const scan_data_t zero_scan_data = {
 #define UPDATE_WARNINGS_LOC(loc)                                        \
     STMT_START {                                                        \
         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
-            RExC_latest_warn_offset = (xI(loc)) - RExC_precomp;         \
+            RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))         \
+                                                       - RExC_precomp;  \
         }                                                               \
     } STMT_END
 
@@ -3551,9 +3552,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
                     if ( state==1 ) {
                         OP( convert ) = nodetype;
                         str=STRING(convert);
-                        STR_LEN(convert)=0;
+                        setSTR_LEN(convert, 0);
                     }
-                    STR_LEN(convert) += len;
+                    setSTR_LEN(convert, STR_LEN(convert) + len);
                     while (len--)
                         *str++ = *ch++;
                } else {
@@ -3993,8 +3994,9 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour
  *      using /iaa matching will be doing so almost entirely with ASCII
  *      strings, so this should rarely be encountered in practice */
 
-#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
-    if (PL_regkind[OP(scan)] == EXACT) \
+#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags)    \
+    if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT         \
+                                      && OP(scan) != LEXACT_ONLY8)  \
         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
 
 STATIC U32
@@ -4160,7 +4162,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
             merged++;
 
             NEXT_OFF(scan) += NEXT_OFF(n);
-            STR_LEN(scan) += STR_LEN(n);
+            setSTR_LEN(scan, STR_LEN(scan) + STR_LEN(n));
             next = n + NODE_SZ_STR(n);
             /* Now we can overwrite *n : */
             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
@@ -5197,7 +5199,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            }
        }
        else if (   OP(scan) == EXACT
+                 || OP(scan) == LEXACT
                  || OP(scan) == EXACT_ONLY8
+                 || OP(scan) == LEXACT_ONLY8
                  || OP(scan) == EXACTL)
         {
            SSize_t l = STR_LEN(scan);
@@ -5319,7 +5323,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
                    next = NEXTOPER(scan);
                    if (   OP(next) == EXACT
+                        || OP(next) == LEXACT
                         || OP(next) == EXACT_ONLY8
+                        || OP(next) == LEXACT_ONLY8
                         || OP(next) == EXACTL
                         || (flags & SCF_DO_STCLASS))
                     {
@@ -7584,6 +7590,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         && memEQ(RX_PRECOMP(old_re), exp, plen)
        && !runtime_code /* with runtime code, always recompile */ )
     {
+        DEBUG_COMPILE_r({
+            SV *dsv= sv_newmortal();
+            RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
+            Perl_re_printf( aTHX_  "%sSkipping recompilation of unchanged REx%s %s\n",
+                          PL_colors[4], PL_colors[5], s);
+        });
         return old_re;
     }
 
@@ -7972,7 +7984,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         /* Ignore EXACT as we deal with it later. */
        if (PL_regkind[OP(first)] == EXACT) {
            if (   OP(first) == EXACT
+               || OP(first) == LEXACT
                 || OP(first) == EXACT_ONLY8
+                || OP(first) == LEXACT_ONLY8
                 || OP(first) == EXACTL)
             {
                NOOP;   /* Empty, get anchored substr later. */
@@ -8318,7 +8332,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
                  && nop == END)
             RExC_rx->extflags |= RXf_WHITE;
         else if ( RExC_rx->extflags & RXf_SPLIT
-                  && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
+                  && (   fop == EXACT || fop == LEXACT
+                      || fop == EXACT_ONLY8 || fop == LEXACT_ONLY8
+                      || fop == EXACTL)
                   && STR_LEN(first) == 1
                   && *(STRING(first)) == ' '
                   && nop == END )
@@ -13916,13 +13932,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
            UV ender = 0;
            char *p;
            char *s;
-
-/* This allows us to fill a node with just enough spare so that if the final
- * character folds, its expansion is guaranteed to fit */
-#define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
-
            char *s0;
-           U8 upper_parse = MAX_NODE_STRING_SIZE;
+            U32 max_string_len = 255;
+
+            /* We may have to reparse the node, artificially stopping filling
+             * it early, based on info gleaned in the first parse.  This
+             * variable gives where we stop.  Make it above the normal stopping
+             * place first time through. */
+           U32 upper_fill = max_string_len + 1;
 
             /* We start out as an EXACT node, even if under /i, until we find a
              * character which is in a fold.  The algorithm now segregates into
@@ -13938,7 +13955,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             /* Assume the node will be fully used; the excess is given back at
              * the end.  We can't make any other length assumptions, as a byte
              * input sequence could shrink down. */
-            Ptrdiff_t initial_size = STR_SZ(256);
+            Ptrdiff_t current_string_nodes = STR_SZ(max_string_len);
 
             bool next_is_quantifier;
             char * oldp = NULL;
@@ -13969,10 +13986,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             /* So is the MICRO SIGN */
             bool has_micro_sign = FALSE;
 
+            /* Set when we fill up the current node and there is still more
+             * text to process */
+            bool overflowed;
+
             /* Allocate an EXACT node.  The node_type may change below to
              * another EXACTish node, but since the size of the node doesn't
              * change, it works */
-            ret = regnode_guts(pRExC_state, node_type, initial_size, "exact");
+            ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
+                                                                    "exact");
             FILL_NODE(ret, node_type);
             RExC_emit++;
 
@@ -13982,6 +14004,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
          reparse:
 
+            p = RExC_parse;
+            len = 0;
+            s = s0;
+
+          continue_parse:
+
             /* This breaks under rare circumstances.  If folding, we do not
              * want to split a node at a character that is a non-final in a
              * multi-char fold, as an input string could just happen to want to
@@ -13996,12 +14024,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
                    || UTF8_IS_START(UCHARAT(RExC_parse)));
 
+            overflowed = FALSE;
+
             /* Here, we have a literal character.  Find the maximal string of
              * them in the input that we can fit into a single EXACTish node.
              * We quit at the first non-literal or when the node gets full, or
              * under /i the categorization of folding/non-folding character
              * changes */
-           for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
+            while (p < RExC_end && len < upper_fill) {
 
                 /* In most cases each iteration adds one byte to the output.
                  * The exceptions override this */
@@ -14339,20 +14369,29 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 /* Ready to add 'ender' to the node */
 
                 if (! FOLD) {  /* The simple case, just append the literal */
+                  not_fold_common:
 
-                      not_fold_common:
-                        if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
-                            *(s++) = (char) ender;
-                        }
-                        else {
-                            U8 * new_s = uvchr_to_utf8((U8*)s, ender);
-                            added_len = (char *) new_s - s;
-                            s = (char *) new_s;
+                    /* Don't output if it would overflow */
+                    if (UNLIKELY(len > max_string_len - ((UTF)
+                                                         ? UVCHR_SKIP(ender)
+                                                         : 1)))
+                    {
+                        overflowed = TRUE;
+                        break;
+                    }
 
-                            if (ender > 255)  {
-                                requires_utf8_target = TRUE;
-                            }
+                    if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
+                        *(s++) = (char) ender;
+                    }
+                    else {
+                        U8 * new_s = uvchr_to_utf8((U8*)s, ender);
+                        added_len = (char *) new_s - s;
+                        s = (char *) new_s;
+
+                        if (ender > 255)  {
+                            requires_utf8_target = TRUE;
                         }
+                    }
                 }
                 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
 
@@ -14418,20 +14457,33 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
                     if (UTF) {  /* Use the folded value */
                         if (UVCHR_IS_INVARIANT(ender)) {
+                            if (UNLIKELY(len + 1 > max_string_len)) {
+                                overflowed = TRUE;
+                                break;
+                            }
+
                             *(s)++ = (U8) toFOLD(ender);
                         }
                         else {
-                            ender = _to_uni_fold_flags(
+                            U8 temp[UTF8_MAXBYTES_CASE+1];
+
+                            UV folded = _to_uni_fold_flags(
                                     ender,
-                                    (U8 *) s,
+                                    temp,
                                     &added_len,
                                     FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
                                                     ? FOLD_FLAGS_NOMIX_ASCII
                                                     : 0));
+                            if (UNLIKELY(len + added_len > max_string_len)) {
+                                overflowed = TRUE;
+                                break;
+                            }
+
+                            Copy(temp, s, added_len, char);
                             s += added_len;
 
-                            if (   ender > 255
-                                && LIKELY(ender != GREEK_SMALL_LETTER_MU))
+                            if (   folded > 255
+                                && LIKELY(folded != GREEK_SMALL_LETTER_MU))
                             {
                                 /* U+B5 folds to the MU, so its possible for a
                                  * non-UTF-8 target to match it */
@@ -14483,9 +14535,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
                                 maybe_SIMPLE = 0;
                                 if (node_type == EXACTFU) {
+
+                                    if (UNLIKELY(len + 2 > max_string_len)) {
+                                        overflowed = TRUE;
+                                        break;
+                                    }
+
                                     *(s++) = 's';
 
-                                    /* Let the code below add in the extra 's' */
+                                    /* Let the code below add in the extra 's'
+                                     * */
                                     ender = 's';
                                     added_len = 2;
                                 }
@@ -14497,6 +14556,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             has_micro_sign = TRUE;
                         }
 
+                        if (UNLIKELY(len + 1 > max_string_len)) {
+                            overflowed = TRUE;
+                            break;
+                        }
+
                         *(s++) = (DEPENDS_SEMANTICS)
                                  ? (char) toFOLD(ender)
 
@@ -14521,168 +14585,280 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
            } /* End of loop through literal characters */
 
-            /* Here we have either exhausted the input or ran out of room in
-             * the node.  (If we encountered a character that can't be in the
-             * node, transfer is made directly to <loopdone>, and so we
-             * wouldn't have fallen off the end of the loop.)  In the latter
-             * case, we artificially have to split the node into two, because
-             * we just don't have enough space to hold everything.  This
-             * creates a problem if the final character participates in a
-             * multi-character fold in the non-final position, as a match that
-             * should have occurred won't, due to the way nodes are matched,
-             * and our artificial boundary.  So back off until we find a non-
-             * problematic character -- one that isn't at the beginning or
-             * middle of such a fold.  (Either it doesn't participate in any
-             * folds, or appears only in the final position of all the folds it
-             * does participate in.)  A better solution with far fewer false
-             * positives, and that would fill the nodes more completely, would
-             * be to actually have available all the multi-character folds to
-             * test against, and to back-off only far enough to be sure that
-             * this node isn't ending with a partial one.  <upper_parse> is set
-             * further below (if we need to reparse the node) to include just
-             * up through that final non-problematic character that this code
-             * identifies, so when it is set to less than the full node, we can
-             * skip the rest of this */
-            if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
-                PERL_UINT_FAST8_T backup_count = 0;
-
-                const STRLEN full_len = len;
-
-               assert(len >= MAX_NODE_STRING_SIZE);
-
-                /* Here, <s> points to just beyond where we have output the
-                 * final character of the node.  Look backwards through the
-                 * string until find a non- problematic character */
-
-               if (! UTF) {
-
-                    /* This has no multi-char folds to non-UTF characters */
-                    if (ASCII_FOLD_RESTRICTED) {
-                        goto loopdone;
-                    }
-
-                    while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
-                        backup_count++;
-                    }
-                    len = s - s0 + 1;
-               }
-                else {
+            /* Here we have either exhausted the input or run out of room in
+             * the node.  If the former, we are done.  (If we encountered a
+             * character that can't be in the node, transfer is made directly
+             * to <loopdone>, and so we wouldn't have fallen off the end of the
+             * loop.)  */
+            if (LIKELY(! overflowed)) {
+                goto loopdone;
+            }
+
+            /* Here we have run out of room.  We can grow plain EXACT and
+             * LEXACT nodes.  If the pattern is gigantic enough, though,
+             * eventually we'll have to artificially chunk the pattern into
+             * multiple nodes. */
+            if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
+                Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
+                Size_t overhead_expansion = 0;
+                char temp[256];
+                Size_t max_nodes_for_string;
+                Size_t achievable;
+                SSize_t delta;
+
+                /* Here we couldn't fit the final character in the current
+                 * node, so it will have to be reparsed, no matter what else we
+                 * do */
+                p = oldp;
+
+
+                /* If would have overflowed a regular EXACT node, switch
+                 * instead to an LEXACT.  The code below is structured so that
+                 * the actual growing code is common to changing from an EXACT
+                 * or just increasing the LEXACT size.  This means that we have
+                 * to save the string in the EXACT case before growing, and
+                 * then copy it afterwards to its new location */
+                if (node_type == EXACT) {
+                    overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
+                    RExC_emit += overhead_expansion;
+                    Copy(s0, temp, len, char);
+                }
+
+                /* Ready to grow.  If it was a plain EXACT, the string was
+                 * saved, and the first few bytes of it overwritten by adding
+                 * an argument field.  We assume, as we do elsewhere in this
+                 * file, that one byte of remaining input will translate into
+                 * one byte of output, and if that's too small, we grow again,
+                 * if too large the excess memory is freed at the end */
+
+                max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
+                achievable = MIN(max_nodes_for_string,
+                                 current_string_nodes + STR_SZ(RExC_end - p));
+                delta = achievable - current_string_nodes;
+
+                /* If there is just no more room, go finish up this chunk of
+                 * the pattern. */
+                if (delta <= 0) {
+                    goto loopdone;
+                }
 
-                    /* Point to the first byte of the final character */
-                    s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+                change_engine_size(pRExC_state, delta + overhead_expansion);
+                current_string_nodes += delta;
+                max_string_len
+                           = sizeof(struct regnode) * current_string_nodes;
+                upper_fill = max_string_len + 1;
+
+                /* If the length was small, we know this was originally an
+                 * EXACT node now converted to LEXACT, and the string has to be
+                 * restored.  Otherwise the string was untouched.  260 is just
+                 * a number safely above 255 so don't have to worry about
+                 * getting it precise */
+                if (len < 260) {
+                    node_type = LEXACT;
+                    FILL_NODE(ret, node_type);
+                    s0 = STRING(REGNODE_p(ret));
+                    Copy(temp, s0, len, char);
+                    s = s0 + len;
+                }
+
+                goto continue_parse;
+            }
+            else if (! LOC) {
+
+                /* Here is /i.  Running out of room creates a problem if we are
+                 * folding, and the split happens in the middle of a
+                 * multi-character fold, as a match that should have occurred,
+                 * won't, due to the way nodes are matched, and our artificial
+                 * boundary.  So back off until we aren't splitting such a
+                 * fold.  If there is no such place to back off to, we end up
+                 * taking the entire node as-is.  This can happen if the node
+                 * consists entirely of 'f' or entirely of 's' characters (or
+                 * things that fold to them) as 'ff' and 'ss' are
+                 * multi-character folds.
+                 *
+                 * At this point:
+                 *  oldp        points to the beginning in the input of the
+                 *              final character in the node.
+                 *  p           points to the beginning in the input of the
+                 *              next character in the input, the one that won't
+                 *              fit in the node.
+                 *
+                 * We aren't in the middle of a multi-char fold unless the
+                 * final character in the node can appear in a non-final
+                 * position in such a fold.  Very few characters actually
+                 * participate in multi-character folds, and fewer still can be
+                 * in the non-final position.  But it's complicated to know
+                 * here if that final character is folded or not, so skip this
+                 * check */
+
+                           /* Make sure enough space for final char of node,
+                            * first char of following node, and the fold of the
+                            * following char (so we don't have to worry about
+                            * that fold running off the end */
+                U8 foldbuf[UTF8_MAXBYTES_CASE * 5 + 1];
+                STRLEN fold_len;
+                UV folded;
 
-                    while (s >= s0) {   /* Search backwards until find
-                                           a non-problematic char */
-                        if (UTF8_IS_INVARIANT(*s)) {
+                assert(FOLD);
+
+                /* The Unicode standard says that multi character folds consist
+                 * of either two or three characters.  So we create a buffer
+                 * containing a window of three.  The first is the final
+                 * character in the node (folded), and then the two that begin
+                 * the following node.   But if the first character of the
+                 * following node can't be in a non-final fold position, there
+                 * is no need to look at its successor character.  The macros
+                 * used below to check for multi character folds require folded
+                 * inputs, so we have to fold these.  (The fold of p was likely
+                 * calculated in the loop above, but it hasn't beeen saved, and
+                 * khw thinks it would be too entangled to change to do so) */
+
+                if (UTF || LIKELY(UCHARAT(p) != MICRO_SIGN)) {
+                    folded = _to_uni_fold_flags(ender,
+                                                foldbuf,
+                                                &fold_len,
+                                                FOLD_FLAGS_FULL);
+                }
+                else {
+                    foldbuf[0] = folded = MICRO_SIGN;
+                    fold_len = 1;
+                }
+
+                /* Here, foldbuf contains the fold of the first character in
+                 * the next node.  We may also need the next one (if there is
+                 * one) to get our third, but if the first character folded to
+                 * more than one, those extra one(s) will serve as the third.
+                 * Also, we don't need a third unless the previous one can
+                 * appear in a non-final position in a fold */
+                if (  ((RExC_end - p) > ((UTF) ? UVCHR_SKIP(ender) : 1))
+                    && (fold_len == 1 || (   UTF
+                                          && UVCHR_SKIP(folded) == fold_len))
+                    &&  UNLIKELY(_invlist_contains_cp(PL_NonFinalFold, folded)))
+                {
+                    if (UTF) {
+                        STRLEN next_fold_len;
 
-                            /* There are no ascii characters that participate
-                             * in multi-char folds under /aa.  In EBCDIC, the
-                             * non-ascii invariants are all control characters,
-                             * so don't ever participate in any folds. */
-                            if (ASCII_FOLD_RESTRICTED
-                                || ! IS_NON_FINAL_FOLD(*s))
-                            {
-                                break;
-                            }
-                        }
-                        else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-                            if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
-                                                                  *s, *(s+1))))
-                            {
-                                break;
-                            }
+                        toFOLD_utf8_safe((U8*) p + UTF8SKIP(p),
+                                         (U8*) RExC_end, foldbuf + fold_len,
+                                         &next_fold_len);
+                        fold_len += next_fold_len;
+                    }
+                    else {
+                        if (UNLIKELY(p[1] == LATIN_SMALL_LETTER_SHARP_S)) {
+                            foldbuf[fold_len] = 's';
                         }
-                        else if (! _invlist_contains_cp(
-                                        PL_NonFinalFold,
-                                        valid_utf8_to_uvchr((U8 *) s, NULL)))
-                        {
-                            break;
+                        else {
+                            foldbuf[fold_len] = toLOWER_L1(p[1]);
                         }
+                        fold_len++;
+                    }
+                }
 
-                        /* Here, the current character is problematic in that
-                         * it does occur in the non-final position of some
-                         * fold, so try the character before it, but have to
-                         * special case the very first byte in the string, so
-                         * we don't read outside the string */
-                        s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
-                        backup_count++;
-                    } /* End of loop backwards through the string */
-
-                    /* If there were only problematic characters in the string,
-                     * <s> will point to before s0, in which case the length
-                     * should be 0, otherwise include the length of the
-                     * non-problematic character just found */
-                    len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
-               }
+                /* Here foldbuf contains the the fold of p, and if appropriate
+                 * that of the character following p in the input. */
 
-                /* Here, have found the final character, if any, that is
-                 * non-problematic as far as ending the node without splitting
-                 * it across a potential multi-char fold.  <len> contains the
-                 * number of bytes in the node up-to and including that
-                 * character, or is 0 if there is no such character, meaning
-                 * the whole node contains only problematic characters.  In
-                 * this case, give up and just take the node as-is.  We can't
-                 * do any better */
-                if (len == 0) {
-                    len = full_len;
+                /* Search backwards until find a place that doesn't split a
+                 * multi-char fold */
+                while (1) {
+                    STRLEN s_len;
+                    char s_fold_buf[UTF8_MAXBYTES_CASE];
+                    char * s_fold = s_fold_buf;
 
-                } else {
+                    if (s <= s0) {
 
-                    /* Here, the node does contain some characters that aren't
-                     * problematic.  If we didn't have to backup any, then the
-                     * final character in the node is non-problematic, and we
-                     * can take the node as-is */
-                    if (backup_count == 0) {
-                        goto loopdone;
+                        /* There's no safe place in the node to split.  Quit so
+                         * will take the whole node */
+                        break;
                     }
-                    else if (backup_count == 1) {
 
-                        /* If the final character is problematic, but the
-                         * penultimate is not, back-off that last character to
-                         * later start a new node with it */
-                        p = oldp;
-                        goto loopdone;
+                    /* Backup 1 character.  The first time through this moves s
+                     * to point to the final character in the node */
+                    if (UTF) {
+                        s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+                    }
+                    else {
+                        s--;
                     }
 
-                    /* Here, the final non-problematic character is earlier
-                     * in the input than the penultimate character.  What we do
-                     * is reparse from the beginning, going up only as far as
-                     * this final ok one, thus guaranteeing that the node ends
-                     * in an acceptable character.  The reason we reparse is
-                     * that we know how far in the character is, but we don't
-                     * know how to correlate its position with the input parse.
-                     * An alternate implementation would be to build that
-                     * correlation as we go along during the original parse,
-                     * but that would entail extra work for every node, whereas
-                     * this code gets executed only when the string is too
-                     * large for the node, and the final two characters are
-                     * problematic, an infrequent occurrence.  Yet another
-                     * possible strategy would be to save the tail of the
-                     * string, and the next time regatom is called, initialize
-                     * with that.  The problem with this is that unless you
-                     * back off one more character, you won't be guaranteed
-                     * regatom will get called again, unless regbranch,
-                     * regpiece ... are also changed.  If you do back off that
-                     * extra character, so that there is input guaranteed to
-                     * force calling regatom, you can't handle the case where
-                     * just the first character in the node is acceptable.  I
-                     * (khw) decided to try this method which doesn't have that
-                     * pitfall; if performance issues are found, we can do a
-                     * combination of the current approach plus that one */
-                    upper_parse = len;
-                    len = 0;
-                    s = s0;
-                    goto reparse;
+                    /* 's' may or may not be folded; so make sure it is, and
+                     * use just the final character in its fold (should there
+                     * be more than one */
+                    if (UTF) {
+                        toFOLD_utf8_safe((U8*) s,
+                                         (U8*) s + UTF8SKIP(s),
+                                         (U8 *) s_fold_buf, &s_len);
+                        while (s_fold + UTF8SKIP(s_fold) < s_fold_buf + s_len)
+                        {
+                            s_fold += UTF8SKIP(s_fold);
+                        }
+                        s_len = UTF8SKIP(s_fold);
+                    }
+                    else {
+                        if (UNLIKELY(UCHARAT(s) == LATIN_SMALL_LETTER_SHARP_S))
+                        {
+                            s_fold_buf[0] = 's';
+                        }
+                        else {  /* This works for all other non-UTF-8 folds
+                                 */
+                            s_fold_buf[0] = toLOWER_L1(UCHARAT(s));
+                        }
+                        s_len = 1;
+                    }
+
+                    /* Unshift this character to the beginning of the buffer,
+                     * No longer needed trailing characters are overwritten.
+                     * */
+                    Move(foldbuf, foldbuf + s_len, sizeof(foldbuf) - s_len, U8);
+                    Copy(s_fold, foldbuf, s_len, U8);
+
+                    /* If this isn't a multi-character fold, we have found a
+                     * splittable place.  If this is the final character in the
+                     * node, that means the node is valid as-is, and can quit.
+                     * Otherwise, we note how much we can fill the node before
+                     * coming to a non-splittable position, and go parse it
+                     * again, stopping there. This is done because we know
+                     * where in the output to stop, but we don't have a map to
+                     * where that is in the input.  One could be created, but
+                     * it seems like overkill for such a rare event as we are
+                     * dealing with here */
+                    if (UTF) {
+                        if (! is_MULTI_CHAR_FOLD_utf8_safe(foldbuf,
+                                                foldbuf + UTF8_MAXBYTES_CASE))
+                        {
+                            upper_fill = s + UTF8SKIP(s) - s0;
+                            if (LIKELY(upper_fill == 255)) {
+                                break;
+                            }
+                            goto reparse;
+                        }
+                    }
+                    else if (! is_MULTI_CHAR_FOLD_latin1_safe(foldbuf,
+                                                foldbuf + UTF8_MAXBYTES_CASE))
+                    {
+                        upper_fill = s + 1 - s0;
+                        if (LIKELY(upper_fill == 255)) {
+                            break;
+                        }
+                        goto reparse;
+                    }
                 }
+
+                /* Here the node consists entirely of non-final multi-char
+                 * folds.  (Likely it is all 'f's or all 's's.)  There's no
+                 * decent place to split it, so give up and just take the whole
+                 * thing */
+
            }   /* End of verifying node ends with an appropriate char */
 
+            p = oldp;
+
           loopdone:   /* Jumped to when encounters something that shouldn't be
                          in the node */
 
             /* Free up any over-allocated space; cast is to silence bogus
              * warning in MS VC */
             change_engine_size(pRExC_state,
-                                - (Ptrdiff_t) (initial_size - STR_SZ(len)));
+                        - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
 
             /* I (khw) don't know if you can get here with zero length, but the
              * old code handled this situation by creating a zero-length EXACT
@@ -14701,7 +14877,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     else if (requires_utf8_target) {
                         node_type = EXACT_ONLY8;
                     }
-                } else if (FOLD) {
+                }
+                else if (node_type == LEXACT) {
+                    if (requires_utf8_target) {
+                        node_type = LEXACT_ONLY8;
+                    }
+                }
+                else if (FOLD) {
                     if (    UNLIKELY(has_micro_sign || has_ss)
                         && (node_type == EXACTFU || (   node_type == EXACTF
                                                      && maybe_exactfu)))
@@ -14721,6 +14903,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                     }
                     else if (node_type == EXACTF) {  /* Means is /di */
 
+                        /* This intermediate variable is needed solely because
+                         * the asserts in the macro where used exceed Win32's
+                         * literal string capacity */
+                        char first_char = * STRING(REGNODE_p(ret));
+
                         /* If 'maybe_exactfu' is clear, then we need to stay
                          * /di.  If it is set, it means there are no code
                          * points that match differently depending on UTF8ness
@@ -14729,7 +14916,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                         if (! maybe_exactfu) {
                             RExC_seen_d_op = TRUE;
                         }
-                        else if (   isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's')
+                        else if (   isALPHA_FOLD_EQ(first_char, 's')
                                  || isALPHA_FOLD_EQ(ender, 's'))
                         {
                             /* But, if the node begins or ends in an 's' we
@@ -14754,11 +14941,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 }
 
                 OP(REGNODE_p(ret)) = node_type;
-                STR_LEN(REGNODE_p(ret)) = len;
+                setSTR_LEN(REGNODE_p(ret), len);
                 RExC_emit += STR_SZ(len);
 
                 /* If the node isn't a single character, it can't be SIMPLE */
-                if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) {
+                if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
                     maybe_SIMPLE = 0;
                 }
 
@@ -15928,8 +16115,11 @@ redo_curchar:
 
                     /* Recurse, with the meat of the embedded expression */
                     RExC_parse++;
-                    (void) handle_regex_sets(pRExC_state, &current, flagp,
-                                                    depth+1, oregcomp_parse);
+                    if (! handle_regex_sets(pRExC_state, &current, flagp,
+                                                    depth+1, oregcomp_parse))
+                    {
+                        RETURN_FAIL_ON_RESTART(*flagp, flagp);
+                    }
 
                     /* Here, 'current' contains the embedded expression's
                      * inversion list, and RExC_parse points to the trailing
@@ -15983,6 +16173,7 @@ redo_curchar:
                               FALSE, /* Require return to be an ANYOF */
                               &current))
                 {
+                    RETURN_FAIL_ON_RESTART(*flagp, flagp);
                     goto regclass_failed;
                 }
 
@@ -16019,6 +16210,7 @@ redo_curchar:
                                 FALSE, /* Require return to be an ANYOF */
                                 &current))
                 {
+                    RETURN_FAIL_ON_RESTART(*flagp, flagp);
                     goto regclass_failed;
                 }
 
@@ -16379,8 +16571,10 @@ redo_curchar:
         RExC_flags |= RXf_PMf_FOLD;
     }
 
-    if (!node)
+    if (!node) {
+        RETURN_FAIL_ON_RESTART(*flagp, flagp);
         goto regclass_failed;
+    }
 
     /* Fix up the node type if we are in locale.  (We have pretended we are
      * under /u for the purposes of regclass(), as this construct will only
@@ -18796,7 +18990,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     ret = regnode_guts(pRExC_state, op, len, "exact");
                     FILL_NODE(ret, op);
                     RExC_emit += 1 + STR_SZ(len);
-                    STR_LEN(REGNODE_p(ret)) = len;
+                    setSTR_LEN(REGNODE_p(ret), len);
                     if (len == 1) {
                         *STRING(REGNODE_p(ret)) = (U8) value;
                     }
@@ -19601,8 +19795,9 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
 STATIC void
 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
 {
-    /* 'size' is the delta to add or subtract from the current memory allocated
-     * to the regex engine being constructed */
+    /* 'size' is the delta number of smallest regnode equivalents to add or
+     * subtract from the current memory allocated to the regex engine being
+     * constructed. */
 
     PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
 
@@ -19634,8 +19829,8 @@ S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
 STATIC regnode_offset
 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
 {
-    /* Allocate a regnode for 'op', with 'extra_size' extra space.  It aligns
-     * and increments RExC_size and RExC_emit
+    /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
+     * equivalents space.  It aligns and increments RExC_size and RExC_emit
      *
      * It returns the regnode's offset into the regex engine program */
 
@@ -19944,7 +20139,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
 #endif
         if ( exact ) {
             switch (OP(REGNODE_p(scan))) {
+                case LEXACT:
                 case EXACT:
+                case LEXACT_ONLY8:
                 case EXACT_ONLY8:
                 case EXACTL:
                 case EXACTF: