This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: Correct indent
[perl5.git] / regexec.c
index dd4ec41..d866f1c 100644 (file)
--- a/regexec.c
+++ b/regexec.c
 #define        STATIC  static
 #endif
 
-#define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
+/* Valid for non-utf8 strings only: avoids the reginclass call if there are no
+ * complications: i.e., if everything matchable is straight forward in the
+ * bitmap */
+#define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
+                                             : ANYOF_BITMAP_TEST(p,*(c)))
 
 /*
  * Forwards.
 #endif
 
 
-#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                          \
-        case NAMEL:                                                              \
-            PL_reg_flags |= RF_tainted;                                                 \
-            /* FALL THROUGH */                                                          \
-        case NAME:                                                                     \
-            if (!nextchr)                                                               \
-                sayNO;                                                                  \
-            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                                \
-                if (!CAT2(PL_utf8_,CLASS)) {                                            \
-                    bool ok;                                                            \
-                    ENTER;                                                              \
-                    save_re_context();                                                  \
-                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
-                    assert(ok);                                                         \
-                    LEAVE;                                                              \
-                }                                                                       \
-                if (!(OP(scan) == NAME                                                  \
+#define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)          \
+        case NAMEL:                                                         \
+            PL_reg_flags |= RF_tainted;                                     \
+            /* FALL THROUGH */                                              \
+        case NAME:                                                          \
+            if (!nextchr)                                                   \
+                sayNO;                                                      \
+            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                \
+                if (!CAT2(PL_utf8_,CLASS)) {                                \
+                    bool ok;                                                \
+                    ENTER;                                                  \
+                    save_re_context();                                      \
+                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                \
+                    assert(ok);                                             \
+                    LEAVE;                                                  \
+                }                                                           \
+                if (!(OP(scan) == NAME                                      \
                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
-                    : LCFUNC_utf8((U8*)locinput)))                                      \
-                {                                                                       \
-                    sayNO;                                                              \
-                }                                                                       \
-                locinput += PL_utf8skip[nextchr];                                       \
-                nextchr = UCHARAT(locinput);                                            \
-                break;                                                                  \
-            }                                                                           \
-            if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                  \
-                sayNO;                                                                  \
-            nextchr = UCHARAT(++locinput);                                              \
+                    : LCFUNC_utf8((U8*)locinput)))                          \
+                {                                                           \
+                    sayNO;                                                  \
+                }                                                           \
+                locinput += PL_utf8skip[nextchr];                           \
+                nextchr = UCHARAT(locinput);                                \
+                break;                                                      \
+            }                                                               \
+           /* Drops through to the macro that calls this one */
+
+#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)           \
+    _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)              \
+            if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))      \
+                sayNO;                                                      \
+            nextchr = UCHARAT(++locinput);                                  \
             break
 
-#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                        \
-        case NAMEL:                                                              \
-            PL_reg_flags |= RF_tainted;                                                 \
-            /* FALL THROUGH */                                                          \
-        case NAME :                                                                     \
-            if (!nextchr && locinput >= PL_regeol)                                      \
-                sayNO;                                                                  \
-            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                                \
-                if (!CAT2(PL_utf8_,CLASS)) {                                            \
-                    bool ok;                                                            \
-                    ENTER;                                                              \
-                    save_re_context();                                                  \
-                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
-                    assert(ok);                                                         \
-                    LEAVE;                                                              \
-                }                                                                       \
-                if ((OP(scan) == NAME                                                  \
+/* Almost identical to the above, but has a case for a node that matches chars
+ * between 128 and 255 using Unicode (latin1) semantics. */
+#define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
+    _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
+            if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
+                sayNO;                                                       \
+            nextchr = UCHARAT(++locinput);                                   \
+            break
+
+#define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)           \
+        case NAMEL:                                                          \
+            PL_reg_flags |= RF_tainted;                                      \
+            /* FALL THROUGH */                                               \
+        case NAME :                                                          \
+            if (!nextchr && locinput >= PL_regeol)                           \
+                sayNO;                                                       \
+            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                 \
+                if (!CAT2(PL_utf8_,CLASS)) {                                 \
+                    bool ok;                                                 \
+                    ENTER;                                                   \
+                    save_re_context();                                       \
+                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                 \
+                    assert(ok);                                              \
+                    LEAVE;                                                   \
+                }                                                            \
+                if ((OP(scan) == NAME                                        \
                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
-                    : LCFUNC_utf8((U8*)locinput)))                                      \
-                {                                                                       \
-                    sayNO;                                                              \
-                }                                                                       \
-                locinput += PL_utf8skip[nextchr];                                       \
-                nextchr = UCHARAT(locinput);                                            \
-                break;                                                                  \
-            }                                                                           \
-            if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                   \
-                sayNO;                                                                  \
-            nextchr = UCHARAT(++locinput);                                              \
+                    : LCFUNC_utf8((U8*)locinput)))                           \
+                {                                                            \
+                    sayNO;                                                   \
+                }                                                            \
+                locinput += PL_utf8skip[nextchr];                            \
+                nextchr = UCHARAT(locinput);                                 \
+                break;                                                       \
+            }
+
+#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)            \
+    _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
+            if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))        \
+                sayNO;                                                       \
+            nextchr = UCHARAT(++locinput);                                   \
             break
 
 
+#define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
+    _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
+            if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
+                sayNO;                                                       \
+            nextchr = UCHARAT(++locinput);                                   \
+            break
 
 
 
 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 
 /* for use after a quantifier and before an EXACT-like node -- japhy */
-/* it would be nice to rework regcomp.sym to generate this stuff. sigh */
+/* it would be nice to rework regcomp.sym to generate this stuff. sigh
+ *
+ * NOTE that *nothing* that affects backtracking should be in here, specifically
+ * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
+ * node that is in between two EXACT like nodes when ascertaining what the required
+ * "follow" character is. This should probably be moved to regex compile time
+ * although it may be done at run time beause of the REF possibility - more
+ * investigation required. -- demerphq
+*/
 #define JUMPABLE(rn) (      \
     OP(rn) == OPEN ||       \
     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
     OP(rn) == EVAL ||   \
     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
     OP(rn) == PLUS || OP(rn) == MINMOD || \
-    OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
+    OP(rn) == KEEPS || \
     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 )
 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
@@ -1325,7 +1359,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
        switch (OP(c)) {
        case ANYOF:
            if (utf8_target) {
-                REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
+                REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) ||
                          !UTF8_IS_INVARIANT((U8)s[0]) ?
                          reginclass(prog, c, (U8*)s, 0, utf8_target) :
                          REGINCLASS(prog, c, (U8*)s));
@@ -1498,12 +1532,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                }
                );
            }
-           else {
+            else {  /* Not utf8 */
                tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
-               tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+                tmp = cBOOL((OP(c) == BOUNDL)
+                            ? isALNUM_LC(tmp)
+                            : (isWORDCHAR_L1(tmp)
+                               && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
                REXEC_FBC_SCAN(
                    if (tmp ==
-                       !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
+                        !((OP(c) == BOUNDL)
+                          ? isALNUM_LC(*s)
+                          : (isWORDCHAR_L1((U8) *s)
+                             && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+                   {
                        tmp = !tmp;
                        REXEC_FBC_TRYIT;
                }
@@ -1536,12 +1577,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            }
            else {
                tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
-               tmp = ((OP(c) == NBOUND ?
-                       isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+                tmp = cBOOL((OP(c) == NBOUNDL)
+                            ? isALNUM_LC(tmp)
+                            : (isWORDCHAR_L1(tmp)
+                               && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
                REXEC_FBC_SCAN(
-                   if (tmp ==
-                       !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
+                   if (tmp == ! cBOOL(
+                            (OP(c) == NBOUNDL)
+                            ? isALNUM_LC(*s)
+                            : (isWORDCHAR_L1((U8) *s)
+                               && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+                    {
                        tmp = !tmp;
+                    }
                    else REXEC_FBC_TRYIT;
                );
            }
@@ -1552,7 +1600,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_WORD(),
                swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
-               isALNUM(*s)
+                (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
            );
        case ALNUML:
            REXEC_FBC_CSCAN_TAINT(
@@ -1563,7 +1611,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_WORD(),
                !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
-               !isALNUM(*s)
+                ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
            );
        case NALNUML:
            REXEC_FBC_CSCAN_TAINT(
@@ -1574,7 +1622,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
-               isSPACE(*s)
+                isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
            );
        case SPACEL:
            REXEC_FBC_CSCAN_TAINT(
@@ -1585,7 +1633,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
-               !isSPACE(*s)
+                !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
            );
        case NSPACEL:
            REXEC_FBC_CSCAN_TAINT(
@@ -1732,10 +1780,16 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                                         PerlIO_printf( Perl_debug_log,
                                             " Scanning for legal start char...\n");
                                     }
-                                );            
-                                while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
-                                    uc++;
-                                }
+                                );
+                               if (utf8_target) {
+                                   while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+                                       uc += UTF8SKIP(uc);
+                                   }
+                               } else {
+                                   while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
+                                       uc++;
+                                   }
+                               }
                                 s= (char *)uc;
                             }
                             if (uc >(U8*)last_start) break;
@@ -2018,33 +2072,68 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre
            end = HOP3c(strend, -dontbother, strbeg) - 1;
            /* for multiline we only have to try after newlines */
            if (prog->check_substr || prog->check_utf8) {
-               if (s == startpos)
-                   goto after_try;
-               while (1) {
-                   if (regtry(&reginfo, &s))
-                       goto got_it;
-                 after_try:
-                   if (s > end)
-                       goto phooey;
-                   if (prog->extflags & RXf_USE_INTUIT) {
-                       s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
-                       if (!s)
-                           goto phooey;
-                   }
-                   else
-                       s++;
-               }               
-           } else {
-               if (s > startpos)
+                /* because of the goto we can not easily reuse the macros for bifurcating the
+                   unicode/non-unicode match modes here like we do elsewhere - demerphq */
+                if (utf8_target) {
+                    if (s == startpos)
+                        goto after_try_utf8;
+                    while (1) {
+                        if (regtry(&reginfo, &s)) {
+                            goto got_it;
+                        }
+                      after_try_utf8:
+                        if (s > end) {
+                            goto phooey;
+                        }
+                        if (prog->extflags & RXf_USE_INTUIT) {
+                            s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
+                            if (!s) {
+                                goto phooey;
+                            }
+                        }
+                        else {
+                            s += UTF8SKIP(s);
+                        }
+                    }
+                } /* end search for check string in unicode */
+                else {
+                    if (s == startpos) {
+                        goto after_try_latin;
+                    }
+                    while (1) {
+                        if (regtry(&reginfo, &s)) {
+                            goto got_it;
+                        }
+                      after_try_latin:
+                        if (s > end) {
+                            goto phooey;
+                        }
+                        if (prog->extflags & RXf_USE_INTUIT) {
+                            s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
+                            if (!s) {
+                                goto phooey;
+                            }
+                        }
+                        else {
+                            s++;
+                        }
+                    }
+                } /* end search for check string in latin*/
+           } /* end search for check string */
+           else { /* search for newline */
+               if (s > startpos) {
+                    /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
                    s--;
+               }
+                /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
                while (s < end) {
                    if (*s++ == '\n') { /* don't need PL_utf8skip here */
                        if (regtry(&reginfo, &s))
                            goto got_it;
                    }
-               }               
-           }
-       }
+               }
+           } /* end search for newline */
+       } /* end anchored/multiline check string search */
        goto phooey;
     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) 
     {
@@ -3134,7 +3223,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                                          "%*s  %smatched empty string...%s\n",
                                          REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
                         );
-                       break;
+                       if (!trie->jump)
+                           break;
                    } else {
                        DEBUG_EXECUTE_r(
                             PerlIO_printf(Perl_debug_log,
@@ -3526,7 +3616,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
            else {
                ln = (locinput != PL_bostr) ?
                    UCHARAT(locinput - 1) : '\n';
-               if (OP(scan) == BOUND || OP(scan) == NBOUND) {
+               if (FLAGS(scan) & USE_UNI) {
+
+                    /* Here, can't be BOUNDL or NBOUNDL because they never set
+                     * the flags to USE_UNI */
+                    ln = isWORDCHAR_L1(ln);
+                    n = isWORDCHAR_L1(nextchr);
+                }
+                else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
                    ln = isALNUM(ln);
                    n = isALNUM(nextchr);
                }
@@ -3542,22 +3639,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
        case ANYOF:
            if (utf8_target) {
                STRLEN inclasslen = PL_regeol - locinput;
+               if (locinput >= PL_regeol)
+                   sayNO;
 
                if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
                    goto anyof_fail;
-               if (locinput >= PL_regeol)
-                   sayNO;
-               locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
+               locinput += inclasslen;
                nextchr = UCHARAT(locinput);
                break;
            }
            else {
                if (nextchr < 0)
                    nextchr = UCHARAT(locinput);
-               if (!REGINCLASS(rex, scan, (U8*)locinput))
-                   goto anyof_fail;
                if (!nextchr && locinput >= PL_regeol)
                    sayNO;
+               if (!REGINCLASS(rex, scan, (U8*)locinput))
+                   goto anyof_fail;
                nextchr = UCHARAT(++locinput);
                break;
            }
@@ -3573,11 +3670,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                 sayNO;
            break;
        /* Special char classes - The defines start on line 129 or so */
-       CCC_TRY_AFF( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
-       CCC_TRY_NEG(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
+        CCC_TRY_AFF_U( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
+        CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
 
-       CCC_TRY_AFF( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
-       CCC_TRY_NEG(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
+        CCC_TRY_AFF_U( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
+        CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
 
        CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
        CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
@@ -3932,7 +4029,24 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                COP * const ocurcop = PL_curcop;
                PAD *old_comppad;
                char *saved_regeol = PL_regeol;
-           
+               struct re_save_state saved_state;
+
+               /* To not corrupt the existing regex state while executing the
+                * eval we would normally put it on the save stack, like with
+                * save_re_context. However, re-evals have a weird scoping so we
+                * can't just add ENTER/LEAVE here. With that, things like
+                *
+                *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
+                *
+                * would break, as they expect the localisation to be unwound
+                * only when the re-engine backtracks through the bit that
+                * localised it.
+                *
+                * What we do instead is just saving the state in a local c
+                * variable.
+                */
+               Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
+
                n = ARG(scan);
                PL_op = (OP_4tree*)rexi->data->data[n];
                DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, 
@@ -3954,6 +4068,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                    PUTBACK;
                }
 
+               Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
+
                PL_op = oop;
                PAD_RESTORE_LOCAL(old_comppad);
                PL_curcop = ocurcop;
@@ -5645,23 +5761,103 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
     case CANY:
        scan = loceol;
        break;
-    case EXACT:                /* length of string is 1 */
-       c = (U8)*STRING(p);
-       while (scan < loceol && UCHARAT(scan) == c)
-           scan++;
-       break;
-    case EXACTF:       /* length of string is 1 */
+    case EXACT:
+       /* To get here, EXACTish nodes must have *byte* length == 1.  That
+        * means they match only characters in the string that can be expressed
+        * as a single byte.  For non-utf8 strings, that means a simple match.
+        * For utf8 strings, the character matched must be an invariant, or
+        * downgradable to a single byte.  The pattern's utf8ness is
+        * irrelevant, as since it's a single byte, it either isn't utf8, or if
+        * it is, it's an invariant */
+
        c = (U8)*STRING(p);
-       while (scan < loceol &&
-              (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
-           scan++;
+       assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+
+       if (! utf8_target || UNI_IS_INVARIANT(c)) {
+           while (scan < loceol && UCHARAT(scan) == c) {
+               scan++;
+           }
+       }
+       else {
+
+           /* Here, the string is utf8, and the pattern char is different
+            * in utf8 than not, so can't compare them directly.  Outside the
+            * loop, find find the two utf8 bytes that represent c, and then
+            * look for those in sequence in the utf8 string */
+           U8 high = UTF8_TWO_BYTE_HI(c);
+           U8 low = UTF8_TWO_BYTE_LO(c);
+           loceol = PL_regeol;
+
+           while (hardcount < max
+                   && scan + 1 < loceol
+                   && UCHARAT(scan) == high
+                   && UCHARAT(scan + 1) == low)
+           {
+               scan += 2;
+               hardcount++;
+           }
+       }
        break;
-    case EXACTFL:      /* length of string is 1 */
+    case EXACTFL:
        PL_reg_flags |= RF_tainted;
+       /* FALL THROUGH */
+    case EXACTF:
+
+       /* The comments for the EXACT case apply as well to these fold ones */
+
        c = (U8)*STRING(p);
-       while (scan < loceol &&
-              (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
-           scan++;
+       assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+
+       if (utf8_target) { /* Use full Unicode fold matching */
+
+           /* For the EXACTFL case, It doesn't really make sense to compare
+            * locale and utf8, but it is best we can do.  The documents warn
+            * against mixing them */
+
+           char *tmpeol = loceol;
+           while (hardcount < max
+                   && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
+                                   STRING(p), NULL, 1, UTF_PATTERN))
+           {
+               scan = tmpeol;
+               tmpeol = loceol;
+               hardcount++;
+           }
+
+           /* XXX Note that the above handles properly the German sharp s in
+            * the pattern matching ss in the string.  But it doesn't handle
+            * properly cases where the string contains say 'LIGATURE ff' and
+            * the pattern is 'f+'.  This would require, say, a new function or
+            * revised interface to foldEQ_utf8(), in which the maximum number
+            * of characters to match could be passed and it would return how
+            * many actually did.  This is just one of many cases where
+            * multi-char folds don't work properly, and so the fix is being
+            * deferred */
+       }
+       else {
+
+           /* Here, the string isn't utf8; and either the pattern isn't utf8
+            * or c is an invariant, so its utf8ness doesn't affect c.  Can
+            * just do simple comparisons for exact or fold matching. */
+           switch (OP(p)) {
+           case EXACTF:
+               while (scan < loceol &&
+                   (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
+               {
+                   scan++;
+               }
+               break;
+           case EXACTFL:
+               while (scan < loceol &&
+                   (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
+               {
+                   scan++;
+               }
+               break;
+           default:
+               Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
+           }
+       }
        break;
     case ANYOF:
        if (utf8_target) {
@@ -5681,13 +5877,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            loceol = PL_regeol;
            LOAD_UTF8_CHARCLASS_ALNUM();
            while (hardcount < max && scan < loceol &&
-                  swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) {
+                   swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+            {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
+                scan++;
+            }
        } else {
-           while (scan < loceol && isALNUM(*scan))
-               scan++;
+            while (scan < loceol && isALNUM((U8) *scan)) {
+                scan++;
+            }
        }
        break;
     case ALNUML:
@@ -5709,13 +5911,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            loceol = PL_regeol;
            LOAD_UTF8_CHARCLASS_ALNUM();
            while (hardcount < max && scan < loceol &&
-                  !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) {
+                   !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+            {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
+                scan++;
+            }
        } else {
-           while (scan < loceol && !isALNUM(*scan))
-               scan++;
+            while (scan < loceol && ! isALNUM((U8) *scan)) {
+                scan++;
+            }
        }
        break;
     case NALNUML:
@@ -5738,13 +5946,18 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            LOAD_UTF8_CHARCLASS_SPACE();
            while (hardcount < max && scan < loceol &&
                   (*scan == ' ' ||
-                   swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) {
+                    swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+            {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && isSPACE_L1((U8) *scan)) {
+                scan++;
+            }
        } else {
-           while (scan < loceol && isSPACE(*scan))
-               scan++;
+            while (scan < loceol && isSPACE((U8) *scan))
+                scan++;
        }
        break;
     case SPACEL:
@@ -5767,13 +5980,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            LOAD_UTF8_CHARCLASS_SPACE();
            while (hardcount < max && scan < loceol &&
                   !(*scan == ' ' ||
-                    swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) {
+                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+            {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
+                scan++;
+            }
        } else {
-           while (scan < loceol && !isSPACE(*scan))
-               scan++;
+            while (scan < loceol && ! isSPACE((U8) *scan)) {
+                scan++;
+            }
        }
        break;
     case NSPACEL:
@@ -5969,91 +6188,60 @@ Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool
 /*
  - reginclass - determine if a character falls into a character class
  
-  The n is the ANYOF regnode, the p is the target string, lenp
-  is pointer to the maximum length of how far to go in the p
-  (if the lenp is zero, UTF8SKIP(p) is used),
-  utf8_target tells whether the target string is in UTF-8.
+  n is the ANYOF regnode
+  p is the target string
+  lenp is pointer to the maximum number of bytes of how far to go in p
+    (This is assumed wthout checking to always be at least the current
+    character's size)
+  utf8_target tells whether p is in UTF-8.
+
+  Returns true if matched; false otherwise.  If lenp is not NULL, on return
+  from a successful match, the value it points to will be updated to how many
+  bytes in p were matched.  If there was no match, the value is undefined,
+  possibly changed from the input.
 
  */
 
 STATIC bool
-S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool utf8_target)
+S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
 {
     dVAR;
     const char flags = ANYOF_FLAGS(n);
     bool match = FALSE;
     UV c = *p;
-    STRLEN len = 0;
-    STRLEN plen;
+    STRLEN c_len = 0;
+    STRLEN maxlen;
 
     PERL_ARGS_ASSERT_REGINCLASS;
 
+    /* If c is not already the code point, get it */
     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
-       c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
+       c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
                (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
                | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
                /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
                 * UTF8_ALLOW_FFFF */
-       if (len == (STRLEN)-1) 
+       if (c_len == (STRLEN)-1)
            Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
     }
+    else {
+       c_len = 1;
+    }
 
-    plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
-    if (utf8_target || (flags & ANYOF_UNICODE)) {
-        if (lenp)
-           *lenp = 0;
-       if (utf8_target && !ANYOF_RUNTIME(n)) {
-           if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
-               match = TRUE;
-       }
-       if (!match && utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256)
-           match = TRUE;
-       if (!match) {
-           AV *av;
-           SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
-       
-           if (sw) {
-               U8 * utf8_p;
-               if (utf8_target) {
-                   utf8_p = (U8 *) p;
-               } else {
-                   STRLEN len = 1;
-                   utf8_p = bytes_to_utf8(p, &len);
-               }
-               if (swash_fetch(sw, utf8_p, 1))
-                   match = TRUE;
-               else if (flags & ANYOF_FOLD) {
-                   if (!match && lenp && av) {
-                       I32 i;
-                       for (i = 0; i <= av_len(av); i++) {
-                           SV* const sv = *av_fetch(av, i, FALSE);
-                           STRLEN len;
-                           const char * const s = SvPV_const(sv, len);
-                           if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
-                               *lenp = len;
-                               match = TRUE;
-                               break;
-                           }
-                       }
-                   }
-                   if (!match) {
-                       U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
-
-                       STRLEN tmplen;
-                       to_utf8_fold(utf8_p, tmpbuf, &tmplen);
-                       if (swash_fetch(sw, tmpbuf, 1))
-                           match = TRUE;
-                   }
-               }
+    /* Use passed in max length, or one character if none passed in or less
+     * than one character.  And assume will match just one character.  This is
+     * overwritten later if matched more. */
+    if (lenp) {
+       maxlen = (*lenp > c_len) ? *lenp : c_len;
+       *lenp = c_len;
 
-               /* If we allocated a string above, free it */
-               if (! utf8_target) Safefree(utf8_p);
-           }
-       }
-       if (match && lenp && *lenp == 0)
-           *lenp = UNISKIP(NATIVE_TO_UNI(c));
     }
-    if (!match && c < 256) {
+    else {
+       maxlen = c_len;
+    }
+
+    /* If this character is potentially in the bitmap, check it */
+    if (c < 256) {
        if (ANYOF_BITMAP_TEST(n, c))
            match = TRUE;
        else if (flags & ANYOF_FOLD) {
@@ -6069,7 +6257,7 @@ S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const
                match = TRUE;
        }
        
-       if (!match && (flags & ANYOF_CLASS)) {
+       if (!match && (flags & ANYOF_CLASS) && ANYOF_CLASS_TEST_ANY_SET(n)) {
            PL_reg_flags |= RF_tainted;
            if (
                (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
@@ -6109,6 +6297,105 @@ S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const
        }
     }
 
+    /* If the bitmap didn't (or couldn't) match, and something outside the
+     * bitmap could match, try that */
+    if (!match) {
+       if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
+           match = TRUE;
+       }
+       else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
+                || (utf8_target && flags & ANYOF_UTF8))
+       {
+           AV *av;
+           SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
+
+           if (sw) {
+               U8 * utf8_p;
+               if (utf8_target) {
+                   utf8_p = (U8 *) p;
+               } else {
+                   STRLEN len = 1;
+                   utf8_p = bytes_to_utf8(p, &len);
+               }
+               if (swash_fetch(sw, utf8_p, 1))
+                   match = TRUE;
+               else if (flags & ANYOF_FOLD) {
+                   if (!match && lenp && av) {
+                       I32 i;
+                       for (i = 0; i <= av_len(av); i++) {
+                           SV* const sv = *av_fetch(av, i, FALSE);
+                           STRLEN len;
+                           const char * const s = SvPV_const(sv, len);
+                           if (len <= maxlen && memEQ(s, (char*)utf8_p, len)) {
+                               *lenp = len;
+                               match = TRUE;
+                               break;
+                           }
+                       }
+                   }
+                   if (!match) {
+                       U8 folded[UTF8_MAXBYTES_CASE+1];
+
+                       /* See if the folded version matches */
+                       STRLEN foldlen;
+                       to_utf8_fold(utf8_p, folded, &foldlen);
+                       if (swash_fetch(sw, folded, 1)) {   /* 1 => is utf8 */
+                           match = TRUE;
+                       }
+                       else {
+                           SV** listp;
+
+                            /* Consider "k" =~ /[K]/i.  The line above would
+                             * have just folded the 'k' to itself, and that
+                             * isn't going to match 'K'.  So we look through
+                             * the closure of everything that folds to 'k'.
+                             * That will find the 'K'.  Initialize the list, if
+                             * necessary */
+                           if (! PL_utf8_foldclosures) {
+
+                                /* If the folds haven't been read in, call a
+                                 * fold function to force that */
+                               if (! PL_utf8_tofold) {
+                                   U8 dummy[UTF8_MAXBYTES+1];
+                                   STRLEN dummy_len;
+                                   to_utf8_fold((U8*) "A", dummy, &dummy_len);
+                               }
+                               PL_utf8_foldclosures =
+                                       _swash_inversion_hash(PL_utf8_tofold);
+                           }
+
+                            /* The data structure is a hash with the keys every
+                             * character that is folded to, like 'k', and the
+                             * values each an array of everything that folds to
+                             * its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
+                           if ((listp = hv_fetch(PL_utf8_foldclosures,
+                                           (char *) folded, foldlen, FALSE)))
+                           {
+                               AV* list = (AV*) *listp;
+                               IV i;
+                               for (i = 0; i <= av_len(list); i++) {
+                                   SV** try_p = av_fetch(list, i, FALSE);
+                                   if (try_p == NULL) {
+                                       Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+                                   }
+                                   /* Don't have to worry about embeded nulls
+                                    * since NULL isn't folded or foldable */
+                                   if (swash_fetch(sw, (U8*) SvPVX(*try_p),1)) {
+                                       match = TRUE;
+                                       break;
+                                   }
+                               }
+                           }
+                       }
+                   }
+               }
+
+               /* If we allocated a string above, free it */
+               if (! utf8_target) Safefree(utf8_p);
+           }
+       }
+    }
+
     return (flags & ANYOF_INVERT) ? !match : match;
 }