X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/d1c771f5a95fddf225347623798f65884aa6eee7..df5fcde5a244ac321cdf54f61194eb713c04d040:/regexec.c

diff --git a/regexec.c b/regexec.c
index ec4c4b0..d866f1c 100644
--- a/regexec.c
+++ b/regexec.c
@@ -94,7 +94,11 @@
 #define	STATIC	static
 #endif
 
-#define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
+/* Valid for non-utf8 strings only: avoids the reginclass call if there are no
+ * complications: i.e., if everything matchable is straightforward in the
+ * bitmap */
+#define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
+					      : ANYOF_BITMAP_TEST(p,*(c)))
 
 /*
  * Forwards.
@@ -176,76 +180,106 @@
 #endif
 
 
-#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                          \
-        case NAMEL:                                                              \
-            PL_reg_flags |= RF_tainted;                                                 \
-            /* FALL THROUGH */                                                          \
-        case NAME:                                                                     \
-            if (!nextchr)                                                               \
-                sayNO;                                                                  \
-            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                                \
-                if (!CAT2(PL_utf8_,CLASS)) {                                            \
-                    bool ok;                                                            \
-                    ENTER;                                                              \
-                    save_re_context();                                                  \
-                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
-                    assert(ok);                                                         \
-                    LEAVE;                                                              \
-                }                                                                       \
-                if (!(OP(scan) == NAME                                                  \
+#define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)          \
+        case NAMEL:                                                         \
+            PL_reg_flags |= RF_tainted;                                     \
+            /* FALL THROUGH */                                              \
+        case NAME:                                                          \
+            if (!nextchr)                                                   \
+                sayNO;                                                      \
+            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                \
+                if (!CAT2(PL_utf8_,CLASS)) {                                \
+                    bool ok;                                                \
+                    ENTER;                                                  \
+                    save_re_context();                                      \
+                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                \
+                    assert(ok);                                             \
+                    LEAVE;                                                  \
+                }                                                           \
+                if (!(OP(scan) == NAME                                      \
                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
-                    : LCFUNC_utf8((U8*)locinput)))                                      \
-                {                                                                       \
-                    sayNO;                                                              \
-                }                                                                       \
-                locinput += PL_utf8skip[nextchr];                                       \
-                nextchr = UCHARAT(locinput);                                            \
-                break;                                                                  \
-            }                                                                           \
-            if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                  \
-                sayNO;                                                                  \
-            nextchr = UCHARAT(++locinput);                                              \
+                    : LCFUNC_utf8((U8*)locinput)))                          \
+                {                                                           \
+                    sayNO;                                                  \
+                }                                                           \
+                locinput += PL_utf8skip[nextchr];                           \
+                nextchr = UCHARAT(locinput);                                \
+                break;                                                      \
+            }                                                               \
+	    /* Drops through to the macro that calls this one */
+
+#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)           \
+    _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)              \
+            if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))      \
+                sayNO;                                                      \
+            nextchr = UCHARAT(++locinput);                                  \
             break
 
-#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                        \
-        case NAMEL:                                                              \
-            PL_reg_flags |= RF_tainted;                                                 \
-            /* FALL THROUGH */                                                          \
-        case NAME :                                                                     \
-            if (!nextchr && locinput >= PL_regeol)                                      \
-                sayNO;                                                                  \
-            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                                \
-                if (!CAT2(PL_utf8_,CLASS)) {                                            \
-                    bool ok;                                                            \
-                    ENTER;                                                              \
-                    save_re_context();                                                  \
-                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
-                    assert(ok);                                                         \
-                    LEAVE;                                                              \
-                }                                                                       \
-                if ((OP(scan) == NAME                                                  \
+/* Like CCC_TRY_AFF, but for a non-locale node a char between 128 and 255
+ * matches FUNCU (latin1 semantics) only if FLAGS(scan) has USE_UNI set. */
+#define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
+    _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
+            if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
+                sayNO;                                                       \
+            nextchr = UCHARAT(++locinput);                                   \
+            break
+
+#define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)           \
+        case NAMEL:                                                          \
+            PL_reg_flags |= RF_tainted;                                      \
+            /* FALL THROUGH */                                               \
+        case NAME :                                                          \
+            if (!nextchr && locinput >= PL_regeol)                           \
+                sayNO;                                                       \
+            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                 \
+                if (!CAT2(PL_utf8_,CLASS)) {                                 \
+                    bool ok;                                                 \
+                    ENTER;                                                   \
+                    save_re_context();                                       \
+                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                 \
+                    assert(ok);                                              \
+                    LEAVE;                                                   \
+                }                                                            \
+                if ((OP(scan) == NAME                                        \
                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
-                    : LCFUNC_utf8((U8*)locinput)))                                      \
-                {                                                                       \
-                    sayNO;                                                              \
-                }                                                                       \
-                locinput += PL_utf8skip[nextchr];                                       \
-                nextchr = UCHARAT(locinput);                                            \
-                break;                                                                  \
-            }                                                                           \
-            if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                   \
-                sayNO;                                                                  \
-            nextchr = UCHARAT(++locinput);                                              \
+                    : LCFUNC_utf8((U8*)locinput)))                           \
+                {                                                            \
+                    sayNO;                                                   \
+                }                                                            \
+                locinput += PL_utf8skip[nextchr];                            \
+                nextchr = UCHARAT(locinput);                                 \
+                break;                                                       \
+            }
+
+#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)            \
+    _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
+            if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))        \
+                sayNO;                                                       \
+            nextchr = UCHARAT(++locinput);                                   \
             break
 
 
+#define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
+    _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
+            if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
+                sayNO;                                                       \
+            nextchr = UCHARAT(++locinput);                                   \
+            break
 
 
 
 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 
 /* for use after a quantifier and before an EXACT-like node -- japhy */
-/* it would be nice to rework regcomp.sym to generate this stuff. sigh */
+/* it would be nice to rework regcomp.sym to generate this stuff. sigh
+ *
+ * NOTE that *nothing* that affects backtracking should be in here, specifically
+ * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
+ * node that is in between two EXACT-like nodes when ascertaining what the required
+ * "follow" character is. This should probably be moved to regex compile time
+ * although it may be done at run time because of the REF possibility - more
+ * investigation required. -- demerphq
+*/
 #define JUMPABLE(rn) (      \
     OP(rn) == OPEN ||       \
     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
@@ -253,7 +287,6 @@
     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
     OP(rn) == PLUS || OP(rn) == MINMOD || \
     OP(rn) == KEEPS || \
-    /*(PL_regkind[OP(rn)] == VERB && OP(rn) != PRUNE && OP(rn) != COMMIT && OP(rn) != MARKPOINT && OP(rn) != SKIP && OP(rn) != CUTGROUP)  || */\
     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 )
 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
@@ -1326,7 +1359,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 	switch (OP(c)) {
 	case ANYOF:
 	    if (utf8_target) {
-		 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
+		 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) ||
 			  !UTF8_IS_INVARIANT((U8)s[0]) ?
 			  reginclass(prog, c, (U8*)s, 0, utf8_target) :
 			  REGINCLASS(prog, c, (U8*)s));
@@ -1499,12 +1532,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 		}
 		);
 	    }
-	    else {
+            else {  /* Not utf8 */
 		tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
-		tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+                tmp = cBOOL((OP(c) == BOUNDL)
+                            ? isALNUM_LC(tmp)
+                            : (isWORDCHAR_L1(tmp)
+                               && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
 		REXEC_FBC_SCAN(
 		    if (tmp ==
-			!(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
+                        !((OP(c) == BOUNDL)
+                          ? isALNUM_LC(*s)
+                          : (isWORDCHAR_L1((U8) *s)
+                             && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+		    {
 			tmp = !tmp;
 			REXEC_FBC_TRYIT;
 		}
@@ -1537,12 +1577,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 	    }
 	    else {
 		tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
-		tmp = ((OP(c) == NBOUND ?
-			isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+                tmp = cBOOL((OP(c) == NBOUNDL)
+                            ? isALNUM_LC(tmp)
+                            : (isWORDCHAR_L1(tmp)
+                               && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
 		REXEC_FBC_SCAN(
-		    if (tmp ==
-			!(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
+		    if (tmp == ! cBOOL(
+                            (OP(c) == NBOUNDL)
+                            ? isALNUM_LC(*s)
+                            : (isWORDCHAR_L1((U8) *s)
+                               && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+                    {
 			tmp = !tmp;
+                    }
 		    else REXEC_FBC_TRYIT;
 		);
 	    }
@@ -1553,7 +1600,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 	    REXEC_FBC_CSCAN_PRELOAD(
 		LOAD_UTF8_CHARCLASS_PERL_WORD(),
 		swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
-		isALNUM(*s)
+                (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
 	    );
 	case ALNUML:
 	    REXEC_FBC_CSCAN_TAINT(
@@ -1564,7 +1611,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 	    REXEC_FBC_CSCAN_PRELOAD(
 		LOAD_UTF8_CHARCLASS_PERL_WORD(),
 		!swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
-		!isALNUM(*s)
+                ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
 	    );
 	case NALNUML:
 	    REXEC_FBC_CSCAN_TAINT(
@@ -1575,7 +1622,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 	    REXEC_FBC_CSCAN_PRELOAD(
 		LOAD_UTF8_CHARCLASS_PERL_SPACE(),
 		*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
-		isSPACE(*s)
+                isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
 	    );
 	case SPACEL:
 	    REXEC_FBC_CSCAN_TAINT(
@@ -1586,7 +1633,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 	    REXEC_FBC_CSCAN_PRELOAD(
 		LOAD_UTF8_CHARCLASS_PERL_SPACE(),
 		!(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
-		!isSPACE(*s)
+                !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
 	    );
 	case NSPACEL:
 	    REXEC_FBC_CSCAN_TAINT(
@@ -1733,10 +1780,16 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                                         PerlIO_printf( Perl_debug_log,
                                             " Scanning for legal start char...\n");
                                     }
-                                );            
-                                while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
-                                    uc++;
-                                }
+                                );
+				if (utf8_target) {
+				    while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+					uc += UTF8SKIP(uc);
+				    }
+				} else {
+				    while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
+					uc++;
+				    }
+				}
                                 s= (char *)uc;
                             }
                             if (uc >(U8*)last_start) break;
@@ -3170,7 +3223,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                         	          "%*s  %smatched empty string...%s\n",
                         	          REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
                         );
-        	        break;
+			if (!trie->jump)
+			    break;
         	    } else {
         	        DEBUG_EXECUTE_r(
                             PerlIO_printf(Perl_debug_log,
@@ -3562,7 +3616,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 	    else {
 		ln = (locinput != PL_bostr) ?
 		    UCHARAT(locinput - 1) : '\n';
-		if (OP(scan) == BOUND || OP(scan) == NBOUND) {
+		if (FLAGS(scan) & USE_UNI) {
+
+                    /* Here, can't be BOUNDL or NBOUNDL because they never set
+                     * the flags to USE_UNI */
+                    ln = isWORDCHAR_L1(ln);
+                    n = isWORDCHAR_L1(nextchr);
+                }
+                else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
 		    ln = isALNUM(ln);
 		    n = isALNUM(nextchr);
 		}
@@ -3578,22 +3639,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 	case ANYOF:
 	    if (utf8_target) {
 	        STRLEN inclasslen = PL_regeol - locinput;
+		if (locinput >= PL_regeol)
+		    sayNO;
 
 	        if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
 		    goto anyof_fail;
-		if (locinput >= PL_regeol)
-		    sayNO;
-		locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
+		locinput += inclasslen;
 		nextchr = UCHARAT(locinput);
 		break;
 	    }
 	    else {
 		if (nextchr < 0)
 		    nextchr = UCHARAT(locinput);
-		if (!REGINCLASS(rex, scan, (U8*)locinput))
-		    goto anyof_fail;
 		if (!nextchr && locinput >= PL_regeol)
 		    sayNO;
+		if (!REGINCLASS(rex, scan, (U8*)locinput))
+		    goto anyof_fail;
 		nextchr = UCHARAT(++locinput);
 		break;
 	    }
@@ -3609,11 +3670,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 		 sayNO;
 	    break;
 	/* Special char classes - The defines start on line 129 or so */
-	CCC_TRY_AFF( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
-	CCC_TRY_NEG(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
+        CCC_TRY_AFF_U( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
+        CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
 
-	CCC_TRY_AFF( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
-	CCC_TRY_NEG(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
+        CCC_TRY_AFF_U( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
+        CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
 
 	CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
 	CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
@@ -3968,7 +4029,24 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 		COP * const ocurcop = PL_curcop;
 		PAD *old_comppad;
 		char *saved_regeol = PL_regeol;
-	    
+		struct re_save_state saved_state;
+
+		/* To not corrupt the existing regex state while executing the
+		 * eval we would normally put it on the save stack, like with
+		 * save_re_context. However, re-evals have a weird scoping so we
+		 * can't just add ENTER/LEAVE here. With that, things like
+		 *
+		 *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
+		 *
+		 * would break, as they expect the localisation to be unwound
+		 * only when the re-engine backtracks through the bit that
+		 * localised it.
+		 *
+		 * What we do instead is just save the state in a local C
+		 * variable.
+		 */
+		Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
+
 		n = ARG(scan);
 		PL_op = (OP_4tree*)rexi->data->data[n];
 		DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, 
@@ -3990,6 +4068,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 		    PUTBACK;
 		}
 
+		Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
+
 		PL_op = oop;
 		PAD_RESTORE_LOCAL(old_comppad);
 		PL_curcop = ocurcop;
@@ -5681,23 +5761,103 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
     case CANY:
 	scan = loceol;
 	break;
-    case EXACT:		/* length of string is 1 */
-	c = (U8)*STRING(p);
-	while (scan < loceol && UCHARAT(scan) == c)
-	    scan++;
-	break;
-    case EXACTF:	/* length of string is 1 */
+    case EXACT:
+	/* To get here, EXACTish nodes must have *byte* length == 1.  That
+	 * means they match only characters in the string that can be expressed
+	 * as a single byte.  For non-utf8 strings, that means a simple match.
+	 * For utf8 strings, the character matched must be an invariant, or
+	 * downgradable to a single byte.  The pattern's utf8ness is
+	 * irrelevant: since it's a single byte, it either isn't utf8, or if
+	 * it is, it's an invariant */
+
 	c = (U8)*STRING(p);
-	while (scan < loceol &&
-	       (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
-	    scan++;
+	assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+
+	if (! utf8_target || UNI_IS_INVARIANT(c)) {
+	    while (scan < loceol && UCHARAT(scan) == c) {
+		scan++;
+	    }
+	}
+	else {
+
+	    /* Here, the string is utf8, and the pattern char is different
+	     * in utf8 than not, so can't compare them directly.  Outside the
+	     * loop, find the two utf8 bytes that represent c, and then
+	     * look for those in sequence in the utf8 string */
+	    U8 high = UTF8_TWO_BYTE_HI(c);
+	    U8 low = UTF8_TWO_BYTE_LO(c);
+	    loceol = PL_regeol;
+
+	    while (hardcount < max
+		    && scan + 1 < loceol
+		    && UCHARAT(scan) == high
+		    && UCHARAT(scan + 1) == low)
+	    {
+		scan += 2;
+		hardcount++;
+	    }
+	}
 	break;
-    case EXACTFL:	/* length of string is 1 */
+    case EXACTFL:
 	PL_reg_flags |= RF_tainted;
+	/* FALL THROUGH */
+    case EXACTF:
+
+	/* The comments for the EXACT case apply as well to these fold ones */
+
 	c = (U8)*STRING(p);
-	while (scan < loceol &&
-	       (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
-	    scan++;
+	assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+
+	if (utf8_target) { /* Use full Unicode fold matching */
+
+	    /* For the EXACTFL case, it doesn't really make sense to compare
+	     * locale and utf8, but it is the best we can do.  The documents
+	     * warn against mixing them */
+
+	    char *tmpeol = loceol;
+	    while (hardcount < max
+		    && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
+				    STRING(p), NULL, 1, UTF_PATTERN))
+	    {
+		scan = tmpeol;
+		tmpeol = loceol;
+		hardcount++;
+	    }
+
+	    /* XXX Note that the above handles properly the German sharp s in
+	     * the pattern matching ss in the string.  But it doesn't handle
+	     * properly cases where the string contains say 'LIGATURE ff' and
+	     * the pattern is 'f+'.  This would require, say, a new function or
+	     * revised interface to foldEQ_utf8(), in which the maximum number
+	     * of characters to match could be passed and it would return how
+	     * many actually did.  This is just one of many cases where
+	     * multi-char folds don't work properly, and so the fix is being
+	     * deferred */
+	}
+	else {
+
+	    /* Here, the string isn't utf8; and either the pattern isn't utf8
+	     * or c is an invariant, so its utf8ness doesn't affect c.  Can
+	     * just do simple comparisons for exact or fold matching. */
+	    switch (OP(p)) {
+	    case EXACTF:
+		while (scan < loceol &&
+		    (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
+		{
+		    scan++;
+		}
+		break;
+	    case EXACTFL:
+		while (scan < loceol &&
+		    (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
+		{
+		    scan++;
+		}
+		break;
+	    default:
+		Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
+	    }
+	}
 	break;
     case ANYOF:
 	if (utf8_target) {
@@ -5717,13 +5877,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
 	    loceol = PL_regeol;
 	    LOAD_UTF8_CHARCLASS_ALNUM();
 	    while (hardcount < max && scan < loceol &&
-		   swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) {
+                   swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+            {
 		scan += UTF8SKIP(scan);
 		hardcount++;
 	    }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
+                scan++;
+            }
 	} else {
-	    while (scan < loceol && isALNUM(*scan))
-		scan++;
+            while (scan < loceol && isALNUM((U8) *scan)) {
+                scan++;
+            }
 	}
 	break;
     case ALNUML:
@@ -5745,13 +5911,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
 	    loceol = PL_regeol;
 	    LOAD_UTF8_CHARCLASS_ALNUM();
 	    while (hardcount < max && scan < loceol &&
-		   !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) {
+                   !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+            {
 		scan += UTF8SKIP(scan);
 		hardcount++;
 	    }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
+                scan++;
+            }
 	} else {
-	    while (scan < loceol && !isALNUM(*scan))
-		scan++;
+            while (scan < loceol && ! isALNUM((U8) *scan)) {
+                scan++;
+            }
 	}
 	break;
     case NALNUML:
@@ -5774,13 +5946,18 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
 	    LOAD_UTF8_CHARCLASS_SPACE();
 	    while (hardcount < max && scan < loceol &&
 		   (*scan == ' ' ||
-		    swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) {
+                    swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+            {
 		scan += UTF8SKIP(scan);
 		hardcount++;
 	    }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && isSPACE_L1((U8) *scan)) {
+                scan++;
+            }
 	} else {
-	    while (scan < loceol && isSPACE(*scan))
-		scan++;
+            while (scan < loceol && isSPACE((U8) *scan))
+                scan++;
 	}
 	break;
     case SPACEL:
@@ -5803,13 +5980,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
 	    LOAD_UTF8_CHARCLASS_SPACE();
 	    while (hardcount < max && scan < loceol &&
 		   !(*scan == ' ' ||
-		     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) {
+                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+            {
 		scan += UTF8SKIP(scan);
 		hardcount++;
 	    }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
+                scan++;
+            }
 	} else {
-	    while (scan < loceol && !isSPACE(*scan))
-		scan++;
+            while (scan < loceol && ! isSPACE((U8) *scan)) {
+                scan++;
+            }
 	}
 	break;
     case NSPACEL:
@@ -6005,91 +6188,60 @@ Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool
 /*
  - reginclass - determine if a character falls into a character class
  
-  The n is the ANYOF regnode, the p is the target string, lenp
-  is pointer to the maximum length of how far to go in the p
-  (if the lenp is zero, UTF8SKIP(p) is used),
-  utf8_target tells whether the target string is in UTF-8.
+  n is the ANYOF regnode
+  p is the target string
+  lenp is pointer to the maximum number of bytes of how far to go in p
+    (This is assumed without checking to always be at least the current
+    character's size)
+  utf8_target tells whether p is in UTF-8.
+
+  Returns true if matched; false otherwise.  If lenp is not NULL, on return
+  from a successful match, the value it points to will be updated to how many
+  bytes in p were matched.  If there was no match, the value is undefined,
+  possibly changed from the input.
 
  */
 
 STATIC bool
-S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool utf8_target)
+S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
 {
     dVAR;
     const char flags = ANYOF_FLAGS(n);
     bool match = FALSE;
     UV c = *p;
-    STRLEN len = 0;
-    STRLEN plen;
+    STRLEN c_len = 0;
+    STRLEN maxlen;
 
     PERL_ARGS_ASSERT_REGINCLASS;
 
+    /* If c is not already the code point, get it */
     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
-	c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
+	c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
 		(UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
 		| UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
 		/* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
 		 * UTF8_ALLOW_FFFF */
-	if (len == (STRLEN)-1) 
+	if (c_len == (STRLEN)-1)
 	    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
     }
+    else {
+	c_len = 1;
+    }
 
-    plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
-    if (utf8_target || (flags & ANYOF_UNICODE)) {
-        if (lenp)
-	    *lenp = 0;
-	if (utf8_target && !ANYOF_RUNTIME(n)) {
-	    if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
-		match = TRUE;
-	}
-	if (!match && utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256)
-	    match = TRUE;
-	if (!match) {
-	    AV *av;
-	    SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
-	
-	    if (sw) {
-		U8 * utf8_p;
-		if (utf8_target) {
-		    utf8_p = (U8 *) p;
-		} else {
-		    STRLEN len = 1;
-		    utf8_p = bytes_to_utf8(p, &len);
-		}
-		if (swash_fetch(sw, utf8_p, 1))
-		    match = TRUE;
-		else if (flags & ANYOF_FOLD) {
-		    if (!match && lenp && av) {
-		        I32 i;
-			for (i = 0; i <= av_len(av); i++) {
-			    SV* const sv = *av_fetch(av, i, FALSE);
-			    STRLEN len;
-			    const char * const s = SvPV_const(sv, len);
-			    if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
-			        *lenp = len;
-				match = TRUE;
-				break;
-			    }
-			}
-		    }
-		    if (!match) {
-		        U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
-
-			STRLEN tmplen;
-			to_utf8_fold(utf8_p, tmpbuf, &tmplen);
-			if (swash_fetch(sw, tmpbuf, 1))
-			    match = TRUE;
-		    }
-		}
+    /* Use the passed-in max length, or one character if none was passed in or
+     * it is less than one character.  Assume the match will be just one
+     * character; this is overwritten later if more is matched. */
+    if (lenp) {
+	maxlen = (*lenp > c_len) ? *lenp : c_len;
+	*lenp = c_len;
 
-		/* If we allocated a string above, free it */
-		if (! utf8_target) Safefree(utf8_p);
-	    }
-	}
-	if (match && lenp && *lenp == 0)
-	    *lenp = UNISKIP(NATIVE_TO_UNI(c));
     }
-    if (!match && c < 256) {
+    else {
+	maxlen = c_len;
+    }
+
+    /* If this character is potentially in the bitmap, check it */
+    if (c < 256) {
 	if (ANYOF_BITMAP_TEST(n, c))
 	    match = TRUE;
 	else if (flags & ANYOF_FOLD) {
@@ -6105,7 +6257,7 @@ S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const
 		match = TRUE;
 	}
 	
-	if (!match && (flags & ANYOF_CLASS)) {
+	if (!match && (flags & ANYOF_CLASS) && ANYOF_CLASS_TEST_ANY_SET(n)) {
 	    PL_reg_flags |= RF_tainted;
 	    if (
 		(ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
@@ -6145,6 +6297,105 @@ S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const
 	}
     }
 
+    /* If the bitmap didn't (or couldn't) match, and something outside the
+     * bitmap could match, try that */
+    if (!match) {
+	if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
+	    match = TRUE;
+	}
+	else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
+		 || (utf8_target && flags & ANYOF_UTF8))
+	{
+	    AV *av;
+	    SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
+
+	    if (sw) {
+		U8 * utf8_p;
+		if (utf8_target) {
+		    utf8_p = (U8 *) p;
+		} else {
+		    STRLEN len = 1;
+		    utf8_p = bytes_to_utf8(p, &len);
+		}
+		if (swash_fetch(sw, utf8_p, 1))
+		    match = TRUE;
+		else if (flags & ANYOF_FOLD) {
+		    if (!match && lenp && av) {
+		        I32 i;
+			for (i = 0; i <= av_len(av); i++) {
+			    SV* const sv = *av_fetch(av, i, FALSE);
+			    STRLEN len;
+			    const char * const s = SvPV_const(sv, len);
+			    if (len <= maxlen && memEQ(s, (char*)utf8_p, len)) {
+			        *lenp = len;
+				match = TRUE;
+				break;
+			    }
+			}
+		    }
+		    if (!match) {
+		        U8 folded[UTF8_MAXBYTES_CASE+1];
+
+			/* See if the folded version matches */
+			STRLEN foldlen;
+			to_utf8_fold(utf8_p, folded, &foldlen);
+			if (swash_fetch(sw, folded, 1)) {   /* 1 => is utf8 */
+			    match = TRUE;
+			}
+			else {
+			    SV** listp;
+
+                            /* Consider "k" =~ /[K]/i.  The line above would
+                             * have just folded the 'k' to itself, and that
+                             * isn't going to match 'K'.  So we look through
+                             * the closure of everything that folds to 'k'.
+                             * That will find the 'K'.  Initialize the list, if
+                             * necessary */
+			    if (! PL_utf8_foldclosures) {
+
+                                /* If the folds haven't been read in, call a
+                                 * fold function to force that */
+				if (! PL_utf8_tofold) {
+				    U8 dummy[UTF8_MAXBYTES+1];
+				    STRLEN dummy_len;
+				    to_utf8_fold((U8*) "A", dummy, &dummy_len);
+				}
+				PL_utf8_foldclosures =
+					_swash_inversion_hash(PL_utf8_tofold);
+			    }
+
+                            /* The data structure is a hash with the keys every
+                             * character that is folded to, like 'k', and the
+                             * values each an array of everything that folds to
+                             * its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
+			    if ((listp = hv_fetch(PL_utf8_foldclosures,
+					    (char *) folded, foldlen, FALSE)))
+			    {
+				AV* list = (AV*) *listp;
+				IV i;
+				for (i = 0; i <= av_len(list); i++) {
+				    SV** try_p = av_fetch(list, i, FALSE);
+				    if (try_p == NULL) {
+					Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+				    }
+				    /* Don't have to worry about embedded NULs
+				     * since NUL isn't folded or foldable */
+				    if (swash_fetch(sw, (U8*) SvPVX(*try_p),1)) {
+					match = TRUE;
+					break;
+				    }
+				}
+			    }
+			}
+		    }
+		}
+
+		/* If we allocated a string above, free it */
+		if (! utf8_target) Safefree(utf8_p);
+	    }
+	}
+    }
+
     return (flags & ANYOF_INVERT) ? !match : match;
 }