X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/7f18ad16e2d0e9f1ba9b08fd0b11d9f69c5edcf7..3ed356df9354193bbcc5202f066f3c07ae84b443:/regexec.c?ds=sidebyside

diff --git a/regexec.c b/regexec.c
index 6b6dce1..4029f1e 100644
--- a/regexec.c
+++ b/regexec.c
@@ -95,14 +95,16 @@ const char* const non_utf8_target_but_utf8_required
 
 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
 
+#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
+
 #ifndef STATIC
 #define	STATIC	static
 #endif
 
-/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
+/* Valid for non-utf8 strings: avoids the reginclass
  * call if there are no complications: i.e., if everything matchable is
  * straight forward in the bitmap */
-#define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
+#define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 					      : ANYOF_BITMAP_TEST(p,*(c)))
 
 /*
@@ -658,8 +660,12 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 	      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 	      goto fail;
 	  }
-	  if (prog->check_offset_min == prog->check_offset_max &&
-	      !(prog->extflags & RXf_CANY_SEEN)) {
+	  if (prog->check_offset_min == prog->check_offset_max
+              && !(prog->extflags & RXf_CANY_SEEN)
+              && ! multiline)   /* /m can cause \n's to match that aren't
+                                   accounted for in the string max length.
+                                   See [perl #115242] */
+          {
 	    /* Substring at constant offset from beg-of-str... */
 	    I32 slen;
 
@@ -1450,12 +1456,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         
 	/* We know what class it must start with. */
 	switch (OP(c)) {
-	case ANYOFV:
 	case ANYOF:
-	    if (utf8_target || OP(c) == ANYOFV) {
-		STRLEN inclasslen = strend - s;
+	    if (utf8_target) {
 		REXEC_FBC_UTF8_CLASS_SCAN(
-                          reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
+                          reginclass(prog, c, (U8*)s, utf8_target));
 	    }
 	    else {
 		REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
@@ -2935,6 +2939,8 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
 
 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
+#define CHRTEST_NOT_A_CP_1 -999
+#define CHRTEST_NOT_A_CP_2 -998
 
 #define SLAB_FIRST(s) (&(s)->states[0])
 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
@@ -3271,7 +3277,260 @@ S_clear_backtrack_stack(pTHX_ void *p)
 	Safefree(osl);
     }
 }
+static bool
+S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
+{
+    /* This function determines if there are one or two characters that match
+     * the first character of the passed-in EXACTish node <text_node>, and if
+     * so, returns them in the passed-in pointers.
+     *
+     * If it determines that no possible character in the target string can
+     * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
+     * the first character in <text_node> requires UTF-8 to represent, and the
+     * target string isn't in UTF-8.)
+     *
+     * If there are more than two characters that could match the beginning of
+     * <text_node>, or if more context is required to determine a match or not,
+     * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
+     *
+     * The motiviation behind this function is to allow the caller to set up
+     * tight loops for matching.  If <text_node> is of type EXACT, there is
+     * only one possible character that can match its first character, and so
+     * the situation is quite simple.  But things get much more complicated if
+     * folding is involved.  It may be that the first character of an EXACTFish
+     * node doesn't participate in any possible fold, e.g., punctuation, so it
+     * can be matched only by itself.  The vast majority of characters that are
+     * in folds match just two things, their lower and upper-case equivalents.
+     * But not all are like that; some have multiple possible matches, or match
+     * sequences of more than one character.  This function sorts all that out.
+     *
+     * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
+     * loop of trying to match A*, we know we can't exit where the thing
+     * following it isn't a B.  And something can't be a B unless it is the
+     * beginning of B.  By putting a quick test for that beginning in a tight
+     * loop, we can rule out things that can't possibly be B without having to
+     * break out of the loop, thus avoiding work.  Similarly, if A is a single
+     * character, we can make a tight loop matching A*, using the outputs of
+     * this function.
+     *
+     * If the target string to match isn't in UTF-8, and there aren't
+     * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
+     * the one or two possible octets (which are characters in this situation)
+     * that can match.  In all cases, if there is only one character that can
+     * match, *<c1p> and *<c2p> will be identical.
+     *
+     * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
+     * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
+     * can match the beginning of <text_node>.  They should be declared with at
+     * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
+     * undefined what these contain.)  If one or both of the buffers are
+     * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
+     * corresponding invariant.  If variant, the corresponding *<c1p> and/or
+     * *<c2p> will be set to a negative number(s) that shouldn't match any code
+     * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
+     * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
+
+    const bool utf8_target = PL_reg_match_utf8;
+
+    UV c1 = CHRTEST_NOT_A_CP_1;
+    UV c2 = CHRTEST_NOT_A_CP_2;
+    bool use_chrtest_void = FALSE;
+
+    /* Used when we have both utf8 input and utf8 output, to avoid converting
+     * to/from code points */
+    bool utf8_has_been_setup = FALSE;
+
+    dVAR;
+
+    U8 *pat = (U8*)STRING(text_node);
+
+    if (OP(text_node) == EXACT) {
+
+        /* In an exact node, only one thing can be matched, that first
+         * character.  If both the pat and the target are UTF-8, we can just
+         * copy the input to the output, avoiding finding the code point of
+         * that character */
+        if (! UTF_PATTERN) {
+            c2 = c1 = *pat;
+        }
+        else if (utf8_target) {
+            Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
+            Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
+            utf8_has_been_setup = TRUE;
+        }
+        else {
+            c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
+        }
+    }
+    else /* an EXACTFish node */
+         if ((UTF_PATTERN
+                    && is_MULTI_CHAR_FOLD_utf8_safe(pat,
+                                                    pat + STR_LEN(text_node)))
+             || (! UTF_PATTERN
+                    && is_MULTI_CHAR_FOLD_latin1_safe(pat,
+                                                    pat + STR_LEN(text_node))))
+    {
+        /* Multi-character folds require more context to sort out.  Also
+         * PL_utf8_foldclosures used below doesn't handle them, so have to be
+         * handled outside this routine */
+        use_chrtest_void = TRUE;
+    }
+    else { /* an EXACTFish node which doesn't begin with a multi-char fold */
+        c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
+        if (c1 > 256) {
+            /* Load the folds hash, if not already done */
+            SV** listp;
+            if (! PL_utf8_foldclosures) {
+                if (! PL_utf8_tofold) {
+                    U8 dummy[UTF8_MAXBYTES+1];
+
+                    /* Force loading this by folding an above-Latin1 char */
+                    to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
+                    assert(PL_utf8_tofold); /* Verify that worked */
+                }
+                PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
+            }
+
+            /* The fold closures data structure is a hash with the keys being
+             * the UTF-8 of every character that is folded to, like 'k', and
+             * the values each an array of all code points that fold to its
+             * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
+             * not included */
+            if ((! (listp = hv_fetch(PL_utf8_foldclosures,
+                                     (char *) pat,
+                                     UTF8SKIP(pat),
+                                     FALSE))))
+            {
+                /* Not found in the hash, therefore there are no folds
+                 * containing it, so there is only a single character that
+                 * could match */
+                c2 = c1;
+            }
+            else {  /* Does participate in folds */
+                AV* list = (AV*) *listp;
+                if (av_len(list) != 1) {
+
+                    /* If there aren't exactly two folds to this, it is outside
+                     * the scope of this function */
+                    use_chrtest_void = TRUE;
+                }
+                else {  /* There are two.  Get them */
+                    SV** c_p = av_fetch(list, 0, FALSE);
+                    if (c_p == NULL) {
+                        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+                    }
+                    c1 = SvUV(*c_p);
+
+                    c_p = av_fetch(list, 1, FALSE);
+                    if (c_p == NULL) {
+                        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+                    }
+                    c2 = SvUV(*c_p);
+
+                    /* Folds that cross the 255/256 boundary are forbidden if
+                     * EXACTFL, or EXACTFA and one is ASCIII.  Since the
+                     * pattern character is above 256, and its only other match
+                     * is below 256, the only legal match will be to itself.
+                     * We have thrown away the original, so have to compute
+                     * which is the one above 255 */
+                    if ((c1 < 256) != (c2 < 256)) {
+                        if (OP(text_node) == EXACTFL
+                            || (OP(text_node) == EXACTFA
+                                && (isASCII(c1) || isASCII(c2))))
+                        {
+                            if (c1 < 256) {
+                                c1 = c2;
+                            }
+                            else {
+                                c2 = c1;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else /* Here, c1 is < 255 */
+             if (utf8_target
+                 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
+                 && OP(text_node) != EXACTFL
+                 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
+        {
+            /* Here, there could be something above Latin1 in the target which
+             * folds to this character in the pattern.  All such cases except
+             * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
+             * involved in their folds, so are outside the scope of this
+             * function */
+            if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
+                c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
+            }
+            else {
+                use_chrtest_void = TRUE;
+            }
+        }
+        else { /* Here nothing above Latin1 can fold to the pattern character */
+            switch (OP(text_node)) {
+
+                case EXACTFL:   /* /l rules */
+                    c2 = PL_fold_locale[c1];
+                    break;
+
+                case EXACTF:
+                    if (! utf8_target) {    /* /d rules */
+                        c2 = PL_fold[c1];
+                        break;
+                    }
+                    /* FALLTHROUGH */
+                    /* /u rules for all these.  This happens to work for
+                     * EXACTFA as nothing in Latin1 folds to ASCII */
+                case EXACTFA:
+                case EXACTFU_TRICKYFOLD:
+                case EXACTFU_SS:
+                case EXACTFU:
+                    c2 = PL_fold_latin1[c1];
+                    break;
+
+		default:
+                    Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
+                    assert(0); /* NOTREACHED */
+            }
+        }
+    }
+
+    /* Here have figured things out.  Set up the returns */
+    if (use_chrtest_void) {
+        *c2p = *c1p = CHRTEST_VOID;
+    }
+    else if (utf8_target) {
+        if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
+            uvchr_to_utf8(c1_utf8, c1);
+            uvchr_to_utf8(c2_utf8, c2);
+        }
+
+        /* Invariants are stored in both the utf8 and byte outputs; Use
+         * negative numbers otherwise for the byte ones.  Make sure that the
+         * byte ones are the same iff the utf8 ones are the same */
+        *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
+        *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
+                ? *c2_utf8
+                : (c1 == c2)
+                  ? CHRTEST_NOT_A_CP_1
+                  : CHRTEST_NOT_A_CP_2;
+    }
+    else if (c1 > 255) {
+       if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
+                           can represent */
+           return FALSE;
+       }
+
+       *c1p = *c2p = c2;    /* c2 is the only representable value */
+    }
+    else {  /* c1 is representable; see about c2 */
+       *c1p = c1;
+       *c2p = (c2 < 256) ? c2 : c1;
+    }
 
+    return TRUE;
+}
 
 /* returns -1 on failure, $+[0] on success */
 STATIC I32
@@ -3405,6 +3664,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
       reenter_switch:
 
         SET_nextchr;
+        assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
 
 	switch (state_num) {
 	case BOL: /*  /^../  */
@@ -3438,12 +3698,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    st->u.keeper.val = rex->offs[0].start;
 	    rex->offs[0].start = locinput - PL_bostr;
 	    PUSH_STATE_GOTO(KEEPS_next, next, locinput);
-	    /*NOT-REACHED*/
+	    assert(0); /*NOTREACHED*/
 	case KEEPS_next_fail:
 	    /* rollback the start point change */
 	    rex->offs[0].start = st->u.keeper.val;
 	    sayNO_SILENT;
-	    /*NOT-REACHED*/
+	    assert(0); /*NOTREACHED*/
 
 	case EOL: /* /..$/  */
 		goto seol;
@@ -3952,8 +4212,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 	    }
 
 	    /* Neither the target nor the pattern are utf8 */
-	    if (UCHARAT(s) != nextchr &&
-		UCHARAT(s) != fold_array[nextchr])
+	    if (UCHARAT(s) != nextchr
+                && !NEXTCHR_IS_EOS
+		&& UCHARAT(s) != fold_array[nextchr])
 	    {
 		sayNO;
 	    }
@@ -4049,15 +4310,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		    sayNO;
 	    break;
 
-	case ANYOFV: /*  /[abx{df}]/i  */
 	case ANYOF:  /*  /[abc]/       */
             if (NEXTCHR_IS_EOS)
                 sayNO;
-	    if (utf8_target || state_num == ANYOFV) {
-	        STRLEN inclasslen = PL_regeol - locinput;
-	        if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
+	    if (utf8_target) {
+	        if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
 		    sayNO;
-		locinput += inclasslen;
+		locinput += UTF8SKIP(locinput);
 		break;
 	    }
 	    else {
@@ -4396,7 +4655,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 		/* This call case insensitively compares the entire buffer
 		    * at s, with the current input starting at locinput, but
 		    * not going off the end given by PL_regeol, and returns in
-		    * limit upon success, how much of the current input was
+		    * <limit> upon success, how much of the current input was
 		    * matched */
 		if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
 				    locinput, &limit, 0, utf8_target, utf8_fold_flags))
@@ -5400,19 +5659,12 @@ NULL
 	            	if this changes back then the macro for 
 	            	IS_TEXT and friends need to change.
 	             */
-		    if (PL_regkind[OP(text_node)] == EXACT)
-		    {
-		        
-			ST.c1 = (U8)*STRING(text_node);
-			switch (OP(text_node)) {
-			    case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
-			    case EXACTFA:
-			    case EXACTFU_SS:
-			    case EXACTFU_TRICKYFOLD:
-			    case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
-			    case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
-			    default: ST.c2 = ST.c1;
-			}
+		    if (PL_regkind[OP(text_node)] == EXACT) {
+                        if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
+                           text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
+                        {
+                            sayNO;
+                        }
 		    }
 		}
 	    }
@@ -5423,21 +5675,36 @@ NULL
 		    (int)(REPORT_CODE_OFF+(depth*2)),
 		    "", (IV)ST.count)
 		);
-	    if (       !NEXTCHR_IS_EOS
-                    && ST.c1 != CHRTEST_VOID
-		    && nextchr != ST.c1
-		    && nextchr != ST.c2)
-	    {
-		/* simulate B failing */
-		DEBUG_OPTIMISE_r(
-		    PerlIO_printf(Perl_debug_log,
-		        "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
-		        (int)(REPORT_CODE_OFF+(depth*2)),"",
-		        (IV)ST.c1,(IV)ST.c2
-		));
-		state_num = CURLYM_B_fail;
-		goto reenter_switch;
-	    }
+	    if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
+                if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
+                    if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
+                        && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
+                    {
+                        /* simulate B failing */
+                        DEBUG_OPTIMISE_r(
+                            PerlIO_printf(Perl_debug_log,
+                                "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
+                                (int)(REPORT_CODE_OFF+(depth*2)),"",
+                                valid_utf8_to_uvchr((U8 *) locinput, NULL),
+                                valid_utf8_to_uvchr(ST.c1_utf8, NULL),
+                                valid_utf8_to_uvchr(ST.c2_utf8, NULL))
+                        );
+                        state_num = CURLYM_B_fail;
+                        goto reenter_switch;
+                    }
+                }
+                else if (nextchr != ST.c1 && nextchr != ST.c2) {
+                    /* simulate B failing */
+                    DEBUG_OPTIMISE_r(
+                        PerlIO_printf(Perl_debug_log,
+                            "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
+                            (int)(REPORT_CODE_OFF+(depth*2)),"",
+                            (int) nextchr, ST.c1, ST.c2)
+                    );
+                    state_num = CURLYM_B_fail;
+                    goto reenter_switch;
+                }
+            }
 
 	    if (ST.me->flags) {
 		/* emulate CLOSE: mark current A as captured */
@@ -5500,23 +5767,23 @@ NULL
 	} \
     }
 
-	case STAR:		/*  /A*B/ where A is width 1 */
+        case STAR:		/*  /A*B/ where A is width 1 char */
 	    ST.paren = 0;
 	    ST.min = 0;
 	    ST.max = REG_INFTY;
 	    scan = NEXTOPER(scan);
 	    goto repeat;
 
-	case PLUS:		/*  /A+B/ where A is width 1 */
+        case PLUS:		/*  /A+B/ where A is width 1 char */
 	    ST.paren = 0;
 	    ST.min = 1;
 	    ST.max = REG_INFTY;
 	    scan = NEXTOPER(scan);
 	    goto repeat;
 
-	case CURLYN:		/*  /(A){m,n}B/ where A is width 1 */
-	    ST.paren = scan->flags;	/* Which paren to set */
-	    ST.lastparen      = rex->lastparen;
+	case CURLYN:		/*  /(A){m,n}B/ where A is width 1 char */
+            ST.paren = scan->flags;	/* Which paren to set */
+            ST.lastparen      = rex->lastparen;
 	    ST.lastcloseparen = rex->lastcloseparen;
 	    if (ST.paren > PL_regsize)
 		PL_regsize = ST.paren;
@@ -5530,7 +5797,7 @@ NULL
             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
 	    goto repeat;
 
-	case CURLY:		/*  /A{m,n}B/ where A is width 1 */
+	case CURLY:		/*  /A{m,n}B/ where A is width 1 char */
 	    ST.paren = 0;
 	    ST.min = ARG1(scan);  /* min to match */
 	    ST.max = ARG2(scan);  /* max to match */
@@ -5545,10 +5812,11 @@ NULL
 	    * of the quantifier and the EXACT-like node.  -- japhy
 	    */
 
-	    if (ST.min > ST.max) /* XXX make this a compile-time check? */
-		sayNO;
-	    if (HAS_TEXT(next) || JUMPABLE(next)) {
-		U8 *s;
+	    assert(ST.min <= ST.max);
+            if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
+                ST.c1 = ST.c2 = CHRTEST_VOID;
+            }
+            else {
 		regnode *text_node = next;
 
 		if (! HAS_TEXT(text_node)) 
@@ -5559,10 +5827,8 @@ NULL
 		else {
 		    if ( PL_regkind[OP(text_node)] != EXACT ) {
 			ST.c1 = ST.c2 = CHRTEST_VOID;
-			goto assume_ok_easy;
 		    }
-		    else
-			s = (U8*)STRING(text_node);
+		    else {
                     
                     /*  Currently we only get here when 
                         
@@ -5570,37 +5836,14 @@ NULL
                     
                         if this changes back then the macro for IS_TEXT and 
                         friends need to change. */
-		    if (!UTF_PATTERN) {
-			ST.c1 = *s;
-			switch (OP(text_node)) {
-			    case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
-			    case EXACTFA:
-			    case EXACTFU_SS:
-			    case EXACTFU_TRICKYFOLD:
-			    case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
-			    case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
-			    default: ST.c2 = ST.c1; break;
-			}
-		    }
-		    else { /* UTF_PATTERN */
-			if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
-			     STRLEN ulen;
-			     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
-
-			     to_utf8_fold((U8*)s, tmpbuf, &ulen);
-			     ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
-						    uniflags);
-			}
-			else {
-			    ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
-						     uniflags);
-			}
-		    }
+                        if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
+                           text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
+                        {
+                            sayNO;
+                        }
+                    }
 		}
 	    }
-	    else
-		ST.c1 = ST.c2 = CHRTEST_VOID;
-	assume_ok_easy:
 
 	    ST.A = scan;
 	    ST.B = next;
@@ -5686,31 +5929,26 @@ NULL
 		if (utf8_target) {
 		    n = (ST.oldloc == locinput) ? 0 : 1;
 		    if (ST.c1 == ST.c2) {
-			STRLEN len;
 			/* set n to utf8_distance(oldloc, locinput) */
-			while (locinput <= ST.maxpos &&
-			       utf8n_to_uvchr((U8*)locinput,
-					      UTF8_MAXBYTES, &len,
-					      uniflags) != (UV)ST.c1) {
-			    locinput += len;
+			while (locinput <= ST.maxpos
+                              && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
+                        {
+			    locinput += UTF8SKIP(locinput);
 			    n++;
 			}
 		    }
 		    else {
 			/* set n to utf8_distance(oldloc, locinput) */
-			while (locinput <= ST.maxpos) {
-			    STRLEN len;
-			    const UV c = utf8n_to_uvchr((U8*)locinput,
-						  UTF8_MAXBYTES, &len,
-						  uniflags);
-			    if (c == (UV)ST.c1 || c == (UV)ST.c2)
-				break;
-			    locinput += len;
+			while (locinput <= ST.maxpos
+                              && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
+                              && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
+                        {
+			    locinput += UTF8SKIP(locinput);
 			    n++;
 			}
 		    }
 		}
-		else {
+		else {  /* Not utf8_target */
 		    if (ST.c1 == ST.c2) {
 			while (locinput <= ST.maxpos &&
 			       UCHARAT(locinput) != ST.c1)
@@ -5775,6 +6013,7 @@ NULL
 		    PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
 		}
 	    }
+            sayNO;
 	    assert(0); /* NOTREACHED */
 
 
@@ -5785,16 +6024,25 @@ NULL
                 goto fake_end;
             }
 	    {
-		UV c = 0;
-		if (ST.c1 != CHRTEST_VOID && locinput < PL_regeol)
-		    c = utf8_target ? utf8n_to_uvchr((U8*)locinput,
-					   UTF8_MAXBYTES, 0, uniflags)
-				: (UV) UCHARAT(locinput);
+		bool could_match = locinput < PL_regeol;
+
 		/* If it could work, try it. */
-		if (ST.c1 == CHRTEST_VOID
-                    || (locinput < PL_regeol &&
-                        (c == (UV)ST.c1 || c == (UV)ST.c2)))
-                {
+                if (ST.c1 != CHRTEST_VOID && could_match) {
+                    if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
+                    {
+                        could_match = memEQ(locinput,
+                                            ST.c1_utf8,
+                                            UTF8SKIP(locinput))
+                                    || memEQ(locinput,
+                                             ST.c2_utf8,
+                                             UTF8SKIP(locinput));
+                    }
+                    else {
+                        could_match = UCHARAT(locinput) == ST.c1
+                                      || UCHARAT(locinput) == ST.c2;
+                    }
+                }
+                if (ST.c1 == CHRTEST_VOID || could_match) {
 		    CURLY_SETPAREN(ST.paren, ST.count);
 		    PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
 		    assert(0); /* NOTREACHED */
@@ -6085,6 +6333,7 @@ NULL
         /* this is a point to jump to in order to increment
          * locinput by one character */
         increment_locinput:
+            assert(!NEXTCHR_IS_EOS);
             if (utf8_target) {
                 locinput += PL_utf8skip[nextchr];
                 /* locinput is allowed to go 1 char off the end, but not 2+ */
@@ -6282,21 +6531,24 @@ no_silent:
 /*
  - regrepeat - repeatedly match something simple, report how many
  *
+ * What 'simple' means is a node which can be the operand of a quantifier like
+ * '+', or {1,3}
+ *
  * startposp - pointer a pointer to the start position.  This is updated
  *             to point to the byte following the highest successful
  *             match.
  * p         - the regnode to be repeatedly matched against.
- * max       - maximum number of characters to match.
+ * max       - maximum number of things to match.
  * depth     - (for debugging) backtracking depth.
  */
 STATIC I32
 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
 {
     dVAR;
-    char *scan;
+    char *scan;     /* Pointer to current position in target string */
     I32 c;
-    char *loceol = PL_regeol;
-    I32 hardcount = 0;
+    char *loceol = PL_regeol;   /* local version */
+    I32 hardcount = 0;  /* How many matches so far */
     bool utf8_target = PL_reg_match_utf8;
     UV utf8_flags;
 #ifndef DEBUGGING
@@ -6308,12 +6560,35 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     scan = *startposp;
     if (max == REG_INFTY)
 	max = I32_MAX;
-    else if (max < loceol - scan)
+    else if (! utf8_target && scan + max < loceol)
 	loceol = scan + max;
+
+    /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
+     * to the maximum of how far we should go in it (leaving it set to the real
+     * end, if the maximum permissible would take us beyond that).  This allows
+     * us to make the loop exit condition that we haven't gone past <loceol> to
+     * also mean that we haven't exceeded the max permissible count, saving a
+     * test each time through the loop.  But it assumes that the OP matches a
+     * single byte, which is true for most of the OPs below when applied to a
+     * non-UTF-8 target.  Those relatively few OPs that don't have this
+     * characteristic will have to compensate.
+     *
+     * There is no adjustment for UTF-8 targets, as the number of bytes per
+     * character varies.  OPs will have to test both that the count is less
+     * than the max permissible (using <hardcount> to keep track), and that we
+     * are still within the bounds of the string (using <loceol>.  A few OPs
+     * match a single byte no matter what the encoding.  They can omit the max
+     * test if, for the UTF-8 case, they do the adjustment that was skipped
+     * above.
+     *
+     * Thus, the code above sets things up for the common case; and exceptional
+     * cases need extra work; the common case is to make sure <scan> doesn't
+     * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
+     * count doesn't exceed the maximum permissible */
+
     switch (OP(p)) {
     case REG_ANY:
 	if (utf8_target) {
-	    loceol = PL_regeol;
 	    while (scan < loceol && hardcount < max && *scan != '\n') {
 		scan += UTF8SKIP(scan);
 		hardcount++;
@@ -6325,7 +6600,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	break;
     case SANY:
         if (utf8_target) {
-	    loceol = PL_regeol;
 	    while (scan < loceol && hardcount < max) {
 	        scan += UTF8SKIP(scan);
 		hardcount++;
@@ -6334,35 +6608,69 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	else
 	    scan = loceol;
 	break;
-    case CANY:
-	scan = loceol;
+    case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
+        if (utf8_target && scan + max < loceol) {
+
+            /* <loceol> hadn't been adjusted in the UTF-8 case */
+            scan +=  max;
+        }
+        else {
+            scan = loceol;
+        }
 	break;
     case EXACT:
-	/* To get here, EXACTish nodes must have *byte* length == 1.  That
-	 * means they match only characters in the string that can be expressed
-	 * as a single byte.  For non-utf8 strings, that means a simple match.
-	 * For utf8 strings, the character matched must be an invariant, or
-	 * downgradable to a single byte.  The pattern's utf8ness is
-	 * irrelevant, as since it's a single byte, it either isn't utf8, or if
-	 * it is, it's an invariant */
+        assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
 
 	c = (U8)*STRING(p);
-	assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
 
-	if (! utf8_target || UNI_IS_INVARIANT(c)) {
+        /* Can use a simple loop if the pattern char to match on is invariant
+         * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
+         * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
+         * true iff it doesn't matter if the argument is in UTF-8 or not */
+        if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
+            if (utf8_target && scan + max < loceol) {
+                /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
+                 * since here, to match at all, 1 char == 1 byte */
+                loceol = scan + max;
+            }
 	    while (scan < loceol && UCHARAT(scan) == c) {
 		scan++;
 	    }
 	}
-	else {
+	else if (UTF_PATTERN) {
+            if (utf8_target) {
+                STRLEN scan_char_len;
+
+                /* When both target and pattern are UTF-8, we have to do
+                 * string EQ */
+                while (hardcount < max
+                       && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
+                       && scan_char_len <= STR_LEN(p)
+                       && memEQ(scan, STRING(p), scan_char_len))
+                {
+                    scan += scan_char_len;
+                    hardcount++;
+                }
+            }
+            else if (! UTF8_IS_ABOVE_LATIN1(c)) {
 
-	    /* Here, the string is utf8, and the pattern char is different
-	     * in utf8 than not, so can't compare them directly.  Outside the
-	     * loop, find the two utf8 bytes that represent c, and then
-	     * look for those in sequence in the utf8 string */
+                /* Target isn't utf8; convert the character in the UTF-8
+                 * pattern to non-UTF8, and do a simple loop */
+                c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
+                while (scan < loceol && UCHARAT(scan) == c) {
+                    scan++;
+                }
+            } /* else pattern char is above Latin1, can't possibly match the
+                 non-UTF-8 target */
+        }
+        else {
+
+            /* Here, the string must be utf8; pattern isn't, and <c> is
+             * different in utf8 than not, so can't compare them directly.
+             * Outside the loop, find the two utf8 bytes that represent c, and
+             * then look for those in sequence in the utf8 string */
 	    U8 high = UTF8_TWO_BYTE_HI(c);
 	    U8 low = UTF8_TWO_BYTE_LO(c);
-	    loceol = PL_regeol;
 
 	    while (hardcount < max
 		    && scan + 1 < loceol
@@ -6374,6 +6682,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	    }
 	}
 	break;
+
     case EXACTFA:
 	utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
 	goto do_exactf;
@@ -6392,65 +6701,69 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case EXACTFU:
 	utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
 
-	/* The comments for the EXACT case above apply as well to these fold
-	 * ones */
-
-    do_exactf:
-	c = (U8)*STRING(p);
-	assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
-
-	if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
-	    char *tmpeol = loceol;
-	    while (hardcount < max
-		    && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
-				   STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
-	    {
-		scan = tmpeol;
-		tmpeol = loceol;
-		hardcount++;
-	    }
-
-	    /* XXX Note that the above handles properly the German sharp s in
-	     * the pattern matching ss in the string.  But it doesn't handle
-	     * properly cases where the string contains say 'LIGATURE ff' and
-	     * the pattern is 'f+'.  This would require, say, a new function or
-	     * revised interface to foldEQ_utf8(), in which the maximum number
-	     * of characters to match could be passed and it would return how
-	     * many actually did.  This is just one of many cases where
-	     * multi-char folds don't work properly, and so the fix is being
-	     * deferred */
-	}
-	else {
-	    U8 folded;
-
-	    /* Here, the string isn't utf8 and c is a single byte; and either
-	     * the pattern isn't utf8 or c is an invariant, so its utf8ness
-	     * doesn't affect c.  Can just do simple comparisons for exact or
-	     * fold matching. */
-	    switch (OP(p)) {
-		case EXACTF: folded = PL_fold[c]; break;
-		case EXACTFA:
-		case EXACTFU_TRICKYFOLD:
-		case EXACTFU: folded = PL_fold_latin1[c]; break;
-		case EXACTFL: folded = PL_fold_locale[c]; break;
-		default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
-	    }
-	    while (scan < loceol &&
-		   (UCHARAT(scan) == c || UCHARAT(scan) == folded))
-	    {
-		scan++;
-	    }
+    do_exactf: {
+        int c1, c2;
+        U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
+
+        assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
+
+        if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
+            if (c1 == CHRTEST_VOID) {
+                /* Use full Unicode fold matching */
+                char *tmpeol = PL_regeol;
+                STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
+                while (hardcount < max
+                        && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
+                                             STRING(p), NULL, pat_len,
+                                             cBOOL(UTF_PATTERN), utf8_flags))
+                {
+                    scan = tmpeol;
+                    tmpeol = PL_regeol;
+                    hardcount++;
+                }
+            }
+            else if (utf8_target) {
+                if (c1 == c2) {
+                    while (scan < loceol
+                           && hardcount < max
+                           && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
+                    {
+                        scan += UTF8SKIP(scan);
+                        hardcount++;
+                    }
+                }
+                else {
+                    while (scan < loceol
+                           && hardcount < max
+                           && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
+                               || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
+                    {
+                        scan += UTF8SKIP(scan);
+                        hardcount++;
+                    }
+                }
+            }
+            else if (c1 == c2) {
+                while (scan < loceol && UCHARAT(scan) == c1) {
+                    scan++;
+                }
+            }
+            else {
+                while (scan < loceol &&
+                    (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
+                {
+                    scan++;
+                }
+            }
 	}
 	break;
-    case ANYOFV:
+    }
     case ANYOF:
-	if (utf8_target || OP(p) == ANYOFV) {
+	if (utf8_target) {
 	    STRLEN inclasslen;
-	    loceol = PL_regeol;
-	    inclasslen = loceol - scan;
 	    while (hardcount < max
-		   && ((inclasslen = loceol - scan) > 0)
-		   && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
+                   && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
+		   && reginclass(prog, p, (U8*)scan, utf8_target))
 	    {
 		scan += inclasslen;
 		hardcount++;
@@ -6463,7 +6776,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case ALNUMU:
 	if (utf8_target) {
     utf8_wordchar:
-	    loceol = PL_regeol;
 	    LOAD_UTF8_CHARCLASS_ALNUM();
 	    while (hardcount < max && scan < loceol &&
                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
@@ -6485,6 +6797,12 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	}
 	break;
     case ALNUMA:
+        if (utf8_target && scan + max < loceol) {
+
+            /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
+             * since here, to match, 1 char == 1 byte */
+            loceol = scan + max;
+        }
 	while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
 	    scan++;
 	}
@@ -6492,7 +6810,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case ALNUML:
 	PL_reg_flags |= RF_tainted;
 	if (utf8_target) {
-	    loceol = PL_regeol;
 	    while (hardcount < max && scan < loceol &&
 		   isALNUM_LC_utf8((U8*)scan)) {
 		scan += UTF8SKIP(scan);
@@ -6508,7 +6825,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 
     utf8_Nwordchar:
 
-	    loceol = PL_regeol;
 	    LOAD_UTF8_CHARCLASS_ALNUM();
 	    while (hardcount < max && scan < loceol &&
                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
@@ -6531,14 +6847,23 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	break;
 
     case POSIXA:
-       while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
+        if (utf8_target && scan + max < loceol) {
+
+            /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
+             * since here, to match, 1 char == 1 byte */
+            loceol = scan + max;
+        }
+        while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
 	    scan++;
 	}
 	break;
     case NPOSIXA:
 	if (utf8_target) {
-	    while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
+	    while (scan < loceol && hardcount < max
+                   && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
+            {
 		scan += UTF8SKIP(scan);
+                hardcount++;
 	    }
 	}
 	else {
@@ -6549,8 +6874,11 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	break;
     case NALNUMA:
 	if (utf8_target) {
-	    while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
+	    while (scan < loceol && hardcount < max
+                   && ! isWORDCHAR_A((U8) *scan))
+            {
 		scan += UTF8SKIP(scan);
+                hardcount++;
 	    }
 	}
 	else {
@@ -6562,7 +6890,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case NALNUML:
 	PL_reg_flags |= RF_tainted;
 	if (utf8_target) {
-	    loceol = PL_regeol;
 	    while (hardcount < max && scan < loceol &&
 		   !isALNUM_LC_utf8((U8*)scan)) {
 		scan += UTF8SKIP(scan);
@@ -6578,7 +6905,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 
     utf8_space:
 
-	    loceol = PL_regeol;
 	    LOAD_UTF8_CHARCLASS_SPACE();
 	    while (hardcount < max && scan < loceol &&
 		   (*scan == ' ' ||
@@ -6604,6 +6930,12 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	}
 	break;
     case SPACEA:
+        if (utf8_target && scan + max < loceol) {
+
+            /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
+             * since here, to match, 1 char == 1 byte */
+            loceol = scan + max;
+        }
 	while (scan < loceol && isSPACE_A((U8) *scan)) {
 	    scan++;
 	}
@@ -6611,7 +6943,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case SPACEL:
 	PL_reg_flags |= RF_tainted;
 	if (utf8_target) {
-	    loceol = PL_regeol;
 	    while (hardcount < max && scan < loceol &&
 		   isSPACE_LC_utf8((U8*)scan)) {
 		scan += UTF8SKIP(scan);
@@ -6627,7 +6958,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 
     utf8_Nspace:
 
-	    loceol = PL_regeol;
 	    LOAD_UTF8_CHARCLASS_SPACE();
 	    while (hardcount < max && scan < loceol &&
 		   ! (*scan == ' ' ||
@@ -6654,8 +6984,11 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	break;
     case NSPACEA:
 	if (utf8_target) {
-	    while (scan < loceol && ! isSPACE_A((U8) *scan)) {
+	    while (hardcount < max && scan < loceol
+	           && ! isSPACE_A((U8) *scan))
+            {
 		scan += UTF8SKIP(scan);
+		hardcount++;
 	    }
 	}
 	else {
@@ -6667,7 +7000,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case NSPACEL:
 	PL_reg_flags |= RF_tainted;
 	if (utf8_target) {
-	    loceol = PL_regeol;
 	    while (hardcount < max && scan < loceol &&
 		   !isSPACE_LC_utf8((U8*)scan)) {
 		scan += UTF8SKIP(scan);
@@ -6680,7 +7012,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	break;
     case DIGIT:
 	if (utf8_target) {
-	    loceol = PL_regeol;
 	    LOAD_UTF8_CHARCLASS_DIGIT();
 	    while (hardcount < max && scan < loceol &&
 		   swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
@@ -6693,6 +7024,12 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	}
 	break;
     case DIGITA:
+        if (utf8_target && scan + max < loceol) {
+
+            /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
+             * since here, to match, 1 char == 1 byte */
+            loceol = scan + max;
+        }
 	while (scan < loceol && isDIGIT_A((U8) *scan)) {
 	    scan++;
 	}
@@ -6700,7 +7037,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case DIGITL:
 	PL_reg_flags |= RF_tainted;
 	if (utf8_target) {
-	    loceol = PL_regeol;
 	    while (hardcount < max && scan < loceol &&
 		   isDIGIT_LC_utf8((U8*)scan)) {
 		scan += UTF8SKIP(scan);
@@ -6713,7 +7049,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	break;
     case NDIGIT:
 	if (utf8_target) {
-	    loceol = PL_regeol;
 	    LOAD_UTF8_CHARCLASS_DIGIT();
 	    while (hardcount < max && scan < loceol &&
 		   !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
@@ -6727,8 +7062,10 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	break;
     case NDIGITA:
 	if (utf8_target) {
-	    while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
+	    while (hardcount < max && scan < loceol
+	           && ! isDIGIT_A((U8) *scan)) {
 		scan += UTF8SKIP(scan);
+                hardcount++;
 	    }
 	}
 	else {
@@ -6740,7 +7077,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case NDIGITL:
 	PL_reg_flags |= RF_tainted;
 	if (utf8_target) {
-	    loceol = PL_regeol;
 	    while (hardcount < max && scan < loceol &&
 		   !isDIGIT_LC_utf8((U8*)scan)) {
 		scan += UTF8SKIP(scan);
@@ -6753,27 +7089,24 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	break;
     case LNBREAK:
         if (utf8_target) {
-	    loceol = PL_regeol;
 	    while (hardcount < max && scan < loceol &&
                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
 		scan += c;
 		hardcount++;
 	    }
 	} else {
-	    /*
-	      LNBREAK can match two latin chars, which is ok,
-	      because we have a null terminated string, but we
-	      have to use hardcount in this situation
-	    */
+            /* LNBREAK can match one or two latin chars, which is ok, but we
+             * have to use hardcount in this situation, and throw away the
+             * adjustment to <loceol> done before the switch statement */
+            loceol = PL_regeol;
 	    while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
 		scan+=c;
 		hardcount++;
 	    }
-	}	
+	}
 	break;
     case HORIZWS:
         if (utf8_target) {
-	    loceol = PL_regeol;
 	    while (hardcount < max && scan < loceol &&
                     (c=is_HORIZWS_utf8_safe(scan, loceol)))
             {
@@ -6787,7 +7120,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	break;
     case NHORIZWS:
         if (utf8_target) {
-	    loceol = PL_regeol;
 	    while (hardcount < max && scan < loceol &&
                         !is_HORIZWS_utf8_safe(scan, loceol))
             {
@@ -6802,7 +7134,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	break;
     case VERTWS:
         if (utf8_target) {
-	    loceol = PL_regeol;
 	    while (hardcount < max && scan < loceol &&
                             (c=is_VERTWS_utf8_safe(scan, loceol)))
             {
@@ -6817,7 +7148,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	break;
     case NVERTWS:
         if (utf8_target) {
-	    loceol = PL_regeol;
 	    while (hardcount < max && scan < loceol &&
                                 !is_VERTWS_utf8_safe(scan, loceol))
             {
@@ -6831,8 +7161,27 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	}	
 	break;
 
-    default:		/* Called on something of 0 width. */
-	break;		/* So match right here or not at all. */
+    case BOUND:
+    case BOUNDA:
+    case BOUNDL:
+    case BOUNDU:
+    case EOS:
+    case GPOS:
+    case KEEPS:
+    case NBOUND:
+    case NBOUNDA:
+    case NBOUNDL:
+    case NBOUNDU:
+    case OPFAIL:
+    case SBOL:
+    case SEOL:
+        /* These are all 0 width, so match right here or not at all. */
+        break;
+
+    default:
+        Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
+        assert(0); /* NOTREACHED */
+
     }
 
     if (hardcount)
@@ -6859,32 +7208,35 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
 /*
 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
-create a copy so that changes the caller makes won't change the shared one
+create a copy so that changes the caller makes won't change the shared one.
+If <altsvp> is non-null, will return NULL in it, for back-compat.
  */
 SV *
 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
 {
     PERL_ARGS_ASSERT_REGCLASS_SWASH;
-    return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
+
+    if (altsvp) {
+        *altsvp = NULL;
+    }
+
+    return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
 }
 #endif
 
 STATIC SV *
-S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
+S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp)
 {
     /* Returns the swash for the input 'node' in the regex 'prog'.
      * If <doinit> is true, will attempt to create the swash if not already
      *	  done.
      * If <listsvp> is non-null, will return the swash initialization string in
      *	  it.
-     * If <altsvp> is non-null, will return the alternates to the regular swash
-     *	  in it
      * Tied intimately to how regcomp.c sets up the data structure */
 
     dVAR;
     SV *sw  = NULL;
     SV *si  = NULL;
-    SV *alt = NULL;
     SV*  invlist = NULL;
 
     RXi_GET_DECL(prog,progi);
@@ -6905,12 +7257,12 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo
 	
 	    si = *ary;	/* ary[0] = the string to initialize the swash with */
 
-	    /* Elements 3 and 4 are either both present or both absent. [3] is
-	     * any inversion list generated at compile time; [4] indicates if
+	    /* Elements 2 and 3 are either both present or both absent. [2] is
+	     * any inversion list generated at compile time; [3] indicates if
 	     * that inversion list has any user-defined properties in it. */
-	    if (av_len(av) >= 3) {
-		invlist = ary[3];
-		if (SvUV(ary[4])) {
+	    if (av_len(av) >= 2) {
+		invlist = ary[2];
+		if (SvUV(ary[3])) {
                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
                 }
 	    }
@@ -6934,13 +7286,6 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo
 				      &swash_init_flags);
 		(void)av_store(av, 1, sw);
 	    }
-
-	    /* Element [2] is for any multi-char folds.  Note that is a
-	     * fundamentally flawed design, because can't backtrack and try
-	     * again.  See [perl #89774] */
-	    if (SvTYPE(ary[2]) == SVt_PVAV) {
-	        alt = ary[2];
-	    }
 	}
     }
 	
@@ -6965,9 +7310,6 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo
 	*listsvp = matches_string;
     }
 
-    if (altsvp)
-	*altsvp  = alt;
-
     return sw;
 }
 
@@ -6976,15 +7318,9 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo
  
   n is the ANYOF regnode
   p is the target string
-  lenp is pointer to the maximum number of bytes of how far to go in p
-    (This is assumed wthout checking to always be at least the current
-    character's size)
   utf8_target tells whether p is in UTF-8.
 
-  Returns true if matched; false otherwise.  If lenp is not NULL, on return
-  from a successful match, the value it points to will be updated to how many
-  bytes in p were matched.  If there was no match, the value is undefined,
-  possibly changed from the input.
+  Returns true if matched; false otherwise.
 
   Note that this can be a synthetic start class, a combination of various
   nodes, so things you think might be mutually exclusive, such as locale,
@@ -6993,19 +7329,19 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo
  */
 
 STATIC bool
-S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
+S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, register const bool utf8_target)
 {
     dVAR;
     const char flags = ANYOF_FLAGS(n);
     bool match = FALSE;
     UV c = *p;
-    STRLEN c_len = 0;
-    STRLEN maxlen;
 
     PERL_ARGS_ASSERT_REGINCLASS;
 
-    /* If c is not already the code point, get it */
-    if (utf8_target && !UTF8_IS_INVARIANT(c)) {
+    /* If c is not already the code point, get it.  Note that
+     * UTF8_IS_INVARIANT() works even if not in UTF-8 */
+    if (! UTF8_IS_INVARIANT(c) && utf8_target) {
+        STRLEN c_len = 0;
 	c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
 		(UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
 		| UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
@@ -7014,21 +7350,6 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
 	if (c_len == (STRLEN)-1)
 	    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
     }
-    else {
-	c_len = 1;
-    }
-
-    /* Use passed in max length, or one character if none passed in or less
-     * than one character.  And assume will match just one character.  This is
-     * overwritten later if matched more. */
-    if (lenp) {
-	maxlen = (*lenp > c_len) ? *lenp : c_len;
-	*lenp = c_len;
-
-    }
-    else {
-	maxlen = c_len;
-    }
 
     /* If this character is potentially in the bitmap, check it */
     if (c < 256) {
@@ -7040,11 +7361,10 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
 	{
 	    match = TRUE;
 	}
-
 	else if (flags & ANYOF_LOCALE) {
 	    PL_reg_flags |= RF_tainted;
 
-	    if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
+	    if ((flags & ANYOF_LOC_FOLD)
 		 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
 	    {
 		match = TRUE;
@@ -7088,7 +7408,7 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
     }
 
     /* If the bitmap didn't (or couldn't) match, and something outside the
-     * bitmap could match, try that.  Locale nodes specifiy completely the
+     * bitmap could match, try that.  Locale nodes specify completely the
      * behavior of code points in the bit map (otherwise, a utf8 target would
      * cause them to be treated as Unicode and not locale), except in
      * the very unlikely event when this node is a synthetic start class, which
@@ -7107,167 +7427,19 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
 			     || (! (flags & ANYOF_LOCALE))
 			     || (flags & ANYOF_IS_SYNTHETIC)))))
 	{
-	    AV *av;
-	    SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
-
+	    SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
 	    if (sw) {
 		U8 * utf8_p;
 		if (utf8_target) {
 		    utf8_p = (U8 *) p;
-		} else {
-
-		    /* Not utf8.  Convert as much of the string as available up
-		     * to the limit of how far the (single) character in the
-		     * pattern can possibly match (no need to go further).  If
-		     * the node is a straight ANYOF or not folding, it can't
-		     * match more than one.  Otherwise, It can match up to how
-		     * far a single char can fold to.  Since not utf8, each
-		     * character is a single byte, so the max it can be in
-		     * bytes is the same as the max it can be in characters */
-		    STRLEN len = (OP(n) == ANYOF
-				  || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
-				  ? 1
-				  : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
-				    ? maxlen
-				    : UTF8_MAX_FOLD_CHAR_EXPAND;
+		} else { /* Convert to utf8 */
+		    STRLEN len = 1;
 		    utf8_p = bytes_to_utf8(p, &len);
 		}
 
-		if (swash_fetch(sw, utf8_p, TRUE))
+		if (swash_fetch(sw, utf8_p, TRUE)) {
 		    match = TRUE;
-		else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
-
-		    /* Here, we need to test if the fold of the target string
-		     * matches.  The non-multi char folds have all been moved to
-                     * the compilation phase, and the multi-char folds have
-                     * been stored by regcomp into 'av'; we linearly check to
-                     * see if any match the target string (folded).   We know
-                     * that the originals were each one character, but we don't
-                     * currently know how many characters/bytes each folded to,
-                     * except we do know that there are small limits imposed by
-                     * Unicode.  XXX A performance enhancement would be to have
-                     * regcomp.c store the max number of chars/bytes that are
-                     * in an av entry, as, say the 0th element.  Even better
-                     * would be to have a hash of the few characters that can
-                     * start a multi-char fold to the max number of chars of
-                     * those folds.
-		     *
-		     * If there is a match, we will need to advance (if lenp is
-		     * specified) the match pointer in the target string.  But
-		     * what we are comparing here isn't that string directly,
-		     * but its fold, whose length may differ from the original.
-		     * As we go along in constructing the fold, therefore, we
-		     * create a map so that we know how many bytes in the
-		     * source to advance given that we have matched a certain
-		     * number of bytes in the fold.  This map is stored in
-		     * 'map_fold_len_back'.  Let n mean the number of bytes in
-		     * the fold of the first character that we are folding.
-		     * Then map_fold_len_back[n] is set to the number of bytes
-		     * in that first character.  Similarly let m be the
-		     * corresponding number for the second character to be
-		     * folded.  Then map_fold_len_back[n+m] is set to the
-		     * number of bytes occupied by the first two source
-		     * characters. ... */
-		    U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
-		    U8 folded[UTF8_MAXBYTES_CASE+1];
-		    STRLEN foldlen = 0; /* num bytes in fold of 1st char */
-		    STRLEN total_foldlen = 0; /* num bytes in fold of all
-						  chars */
-
-		    if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
-
-			/* Here, only need to fold the first char of the target
-			 * string.  It the source wasn't utf8, is 1 byte long */
-			to_utf8_fold(utf8_p, folded, &foldlen);
-			total_foldlen = foldlen;
-			map_fold_len_back[foldlen] = (utf8_target)
-						     ? UTF8SKIP(utf8_p)
-						     : 1;
-		    }
-		    else {
-
-			/* Here, need to fold more than the first char.  Do so
-			 * up to the limits */
-			U8* source_ptr = utf8_p;    /* The source for the fold
-						       is the regex target
-						       string */
-			U8* folded_ptr = folded;
-			U8* e = utf8_p + maxlen;    /* Can't go beyond last
-						       available byte in the
-						       target string */
-			U8 i;
-			for (i = 0;
-			     i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
-			     i++)
-			{
-
-			    /* Fold the next character */
-			    U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
-			    STRLEN this_char_foldlen;
-			    to_utf8_fold(source_ptr,
-				         this_char_folded,
-					 &this_char_foldlen);
-
-			    /* Bail if it would exceed the byte limit for
-			     * folding a single char. */
-			    if (this_char_foldlen + folded_ptr - folded >
-							    UTF8_MAXBYTES_CASE)
-			    {
-				break;
-			    }
-
-			    /* Add the fold of this character */
-			    Copy(this_char_folded,
-				 folded_ptr,
-				 this_char_foldlen,
-				 U8);
-			    source_ptr += UTF8SKIP(source_ptr);
-			    folded_ptr += this_char_foldlen;
-			    total_foldlen = folded_ptr - folded;
-
-			    /* Create map from the number of bytes in the fold
-			     * back to the number of bytes in the source.  If
-			     * the source isn't utf8, the byte count is just
-			     * the number of characters so far */
-			    map_fold_len_back[total_foldlen]
-						      = (utf8_target)
-							? source_ptr - utf8_p
-							: i + 1;
-			}
-			*folded_ptr = '\0';
-		    }
-
-
-		    /* Do the linear search to see if the fold is in the list
-		     * of multi-char folds. */
-		    if (av) {
-		        I32 i;
-			for (i = 0; i <= av_len(av); i++) {
-			    SV* const sv = *av_fetch(av, i, FALSE);
-			    STRLEN len;
-			    const char * const s = SvPV_const(sv, len);
-
-			    if (len <= total_foldlen
-				&& memEQ(s, (char*)folded, len)
-
-				   /* If 0, means matched a partial char. See
-				    * [perl #90536] */
-				&& map_fold_len_back[len])
-			    {
-
-				/* Advance the target string ptr to account for
-				 * this fold, but have to translate from the
-				 * folded length to the corresponding source
-				 * length. */
-				if (lenp) {
-				    *lenp = map_fold_len_back[len];
-				}
-				match = TRUE;
-				break;
-			    }
-			}
-		    }
-		}
+                }
 
 		/* If we allocated a string above, free it */
 		if (! utf8_target) Safefree(utf8_p);
@@ -7453,15 +7625,15 @@ S_to_byte_substr(pTHX_ register regexp *prog)
 	    if (! sv_utf8_downgrade(sv, TRUE)) {
                 return FALSE;
             }
-		if (SvVALID(prog->substrs->data[i].utf8_substr)) {
-		    if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
-			/* Trim the trailing \n that fbm_compile added last
-			   time.  */
-			SvCUR_set(sv, SvCUR(sv) - 1);
-			fbm_compile(sv, FBMcf_TAIL);
-		    } else
-			fbm_compile(sv, 0);
-		}
+            if (SvVALID(prog->substrs->data[i].utf8_substr)) {
+                if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
+                    /* Trim the trailing \n that fbm_compile added last
+                        time.  */
+                    SvCUR_set(sv, SvCUR(sv) - 1);
+                    fbm_compile(sv, FBMcf_TAIL);
+                } else
+                    fbm_compile(sv, 0);
+            }
 	    prog->substrs->data[i].substr = sv;
 	    if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
 		prog->check_substr = sv;