This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Don't use "try" as a variable name
[perl5.git] / regexec.c
index e569a91..375d4fd 100644 (file)
--- a/regexec.c
+++ b/regexec.c
 #define        STATIC  static
 #endif
 
-#define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
+/* Valid for non-utf8 strings only: avoids the reginclass call if there are no
+ * complications: i.e., if everything matchable is straightforward in the
+ * bitmap */
+#define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
+                                             : ANYOF_BITMAP_TEST(p,*(c)))
 
 /*
  * Forwards.
 #endif
 
 
-#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                          \
-        case NAMEL:                                                              \
-            PL_reg_flags |= RF_tainted;                                                 \
-            /* FALL THROUGH */                                                          \
-        case NAME:                                                                     \
-            if (!nextchr)                                                               \
-                sayNO;                                                                  \
-            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                                \
-                if (!CAT2(PL_utf8_,CLASS)) {                                            \
-                    bool ok;                                                            \
-                    ENTER;                                                              \
-                    save_re_context();                                                  \
-                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
-                    assert(ok);                                                         \
-                    LEAVE;                                                              \
-                }                                                                       \
-                if (!(OP(scan) == NAME                                                  \
+#define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)          \
+        case NAMEL:                                                         \
+            PL_reg_flags |= RF_tainted;                                     \
+            /* FALL THROUGH */                                              \
+        case NAME:                                                          \
+            if (!nextchr)                                                   \
+                sayNO;                                                      \
+            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                \
+                if (!CAT2(PL_utf8_,CLASS)) {                                \
+                    bool ok;                                                \
+                    ENTER;                                                  \
+                    save_re_context();                                      \
+                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                \
+                    assert(ok);                                             \
+                    LEAVE;                                                  \
+                }                                                           \
+                if (!(OP(scan) == NAME                                      \
                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
-                    : LCFUNC_utf8((U8*)locinput)))                                      \
-                {                                                                       \
-                    sayNO;                                                              \
-                }                                                                       \
-                locinput += PL_utf8skip[nextchr];                                       \
-                nextchr = UCHARAT(locinput);                                            \
-                break;                                                                  \
-            }                                                                           \
-            if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                  \
-                sayNO;                                                                  \
-            nextchr = UCHARAT(++locinput);                                              \
+                    : LCFUNC_utf8((U8*)locinput)))                          \
+                {                                                           \
+                    sayNO;                                                  \
+                }                                                           \
+                locinput += PL_utf8skip[nextchr];                           \
+                nextchr = UCHARAT(locinput);                                \
+                break;                                                      \
+            }                                                               \
+           /* Drops through to the macro that calls this one */
+
+#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)           \
+    _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)              \
+            if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))      \
+                sayNO;                                                      \
+            nextchr = UCHARAT(++locinput);                                  \
             break
 
-#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                        \
-        case NAMEL:                                                              \
-            PL_reg_flags |= RF_tainted;                                                 \
-            /* FALL THROUGH */                                                          \
-        case NAME :                                                                     \
-            if (!nextchr && locinput >= PL_regeol)                                      \
-                sayNO;                                                                  \
-            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                                \
-                if (!CAT2(PL_utf8_,CLASS)) {                                            \
-                    bool ok;                                                            \
-                    ENTER;                                                              \
-                    save_re_context();                                                  \
-                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
-                    assert(ok);                                                         \
-                    LEAVE;                                                              \
-                }                                                                       \
-                if ((OP(scan) == NAME                                                  \
+/* Almost identical to CCC_TRY_AFF, but for the non-locale node (NAME) a
+ * non-ASCII char (128-255) matches FUNCU, i.e. gets Unicode (latin1)
+ * semantics, only when the node's flags include USE_UNI; ASCII chars are
+ * tested with FUNCU unconditionally.  The locale node (NAMEL) uses LCFUNC. */
+#define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
+    _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
+            if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
+                sayNO;                                                       \
+            nextchr = UCHARAT(++locinput);                                   \
+            break
+
+#define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)           \
+        case NAMEL:                                                          \
+            PL_reg_flags |= RF_tainted;                                      \
+            /* FALL THROUGH */                                               \
+        case NAME :                                                          \
+            if (!nextchr && locinput >= PL_regeol)                           \
+                sayNO;                                                       \
+            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                 \
+                if (!CAT2(PL_utf8_,CLASS)) {                                 \
+                    bool ok;                                                 \
+                    ENTER;                                                   \
+                    save_re_context();                                       \
+                    ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                 \
+                    assert(ok);                                              \
+                    LEAVE;                                                   \
+                }                                                            \
+                if ((OP(scan) == NAME                                        \
                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
-                    : LCFUNC_utf8((U8*)locinput)))                                      \
-                {                                                                       \
-                    sayNO;                                                              \
-                }                                                                       \
-                locinput += PL_utf8skip[nextchr];                                       \
-                nextchr = UCHARAT(locinput);                                            \
-                break;                                                                  \
-            }                                                                           \
-            if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                   \
-                sayNO;                                                                  \
-            nextchr = UCHARAT(++locinput);                                              \
+                    : LCFUNC_utf8((U8*)locinput)))                           \
+                {                                                            \
+                    sayNO;                                                   \
+                }                                                            \
+                locinput += PL_utf8skip[nextchr];                            \
+                nextchr = UCHARAT(locinput);                                 \
+                break;                                                       \
+            }
+
+#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)            \
+    _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
+            if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))        \
+                sayNO;                                                       \
+            nextchr = UCHARAT(++locinput);                                   \
             break
 
 
+#define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
+    _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
+            if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
+                sayNO;                                                       \
+            nextchr = UCHARAT(++locinput);                                   \
+            break
 
 
 
@@ -1333,7 +1359,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
        switch (OP(c)) {
        case ANYOF:
            if (utf8_target) {
-                REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
+                REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) ||
                          !UTF8_IS_INVARIANT((U8)s[0]) ?
                          reginclass(prog, c, (U8*)s, 0, utf8_target) :
                          REGINCLASS(prog, c, (U8*)s));
@@ -1506,12 +1532,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                }
                );
            }
-           else {
+            else {  /* Not utf8 */
                tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
-               tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+                tmp = cBOOL((OP(c) == BOUNDL)
+                            ? isALNUM_LC(tmp)
+                            : (isWORDCHAR_L1(tmp)
+                               && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
                REXEC_FBC_SCAN(
                    if (tmp ==
-                       !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
+                        !((OP(c) == BOUNDL)
+                          ? isALNUM_LC(*s)
+                          : (isWORDCHAR_L1((U8) *s)
+                             && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+                   {
                        tmp = !tmp;
                        REXEC_FBC_TRYIT;
                }
@@ -1544,12 +1577,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            }
            else {
                tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
-               tmp = ((OP(c) == NBOUND ?
-                       isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+                tmp = cBOOL((OP(c) == NBOUNDL)
+                            ? isALNUM_LC(tmp)
+                            : (isWORDCHAR_L1(tmp)
+                               && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
                REXEC_FBC_SCAN(
-                   if (tmp ==
-                       !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
+                   if (tmp == ! cBOOL(
+                            (OP(c) == NBOUNDL)
+                            ? isALNUM_LC(*s)
+                            : (isWORDCHAR_L1((U8) *s)
+                               && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
+                    {
                        tmp = !tmp;
+                    }
                    else REXEC_FBC_TRYIT;
                );
            }
@@ -1560,7 +1600,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_WORD(),
                swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
-               isALNUM(*s)
+                (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
            );
        case ALNUML:
            REXEC_FBC_CSCAN_TAINT(
@@ -1571,7 +1611,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_WORD(),
                !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
-               !isALNUM(*s)
+                ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
            );
        case NALNUML:
            REXEC_FBC_CSCAN_TAINT(
@@ -1582,7 +1622,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
-               isSPACE(*s)
+                isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
            );
        case SPACEL:
            REXEC_FBC_CSCAN_TAINT(
@@ -1593,7 +1633,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
-               !isSPACE(*s)
+                !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
            );
        case NSPACEL:
            REXEC_FBC_CSCAN_TAINT(
@@ -1740,10 +1780,16 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                                         PerlIO_printf( Perl_debug_log,
                                             " Scanning for legal start char...\n");
                                     }
-                                );            
-                                while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
-                                    uc++;
-                                }
+                                );
+                               if (utf8_target) {
+                                   while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+                                       uc += UTF8SKIP(uc);
+                                   }
+                               } else {
+                                   while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
+                                       uc++;
+                                   }
+                               }
                                 s= (char *)uc;
                             }
                             if (uc >(U8*)last_start) break;
@@ -3177,7 +3223,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                                          "%*s  %smatched empty string...%s\n",
                                          REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
                         );
-                       break;
+                       if (!trie->jump)
+                           break;
                    } else {
                        DEBUG_EXECUTE_r(
                             PerlIO_printf(Perl_debug_log,
@@ -3569,7 +3616,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
            else {
                ln = (locinput != PL_bostr) ?
                    UCHARAT(locinput - 1) : '\n';
-               if (OP(scan) == BOUND || OP(scan) == NBOUND) {
+               if (FLAGS(scan) & USE_UNI) {
+
+                    /* Here, can't be BOUNDL or NBOUNDL because they never set
+                     * the flags to USE_UNI */
+                    ln = isWORDCHAR_L1(ln);
+                    n = isWORDCHAR_L1(nextchr);
+                }
+                else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
                    ln = isALNUM(ln);
                    n = isALNUM(nextchr);
                }
@@ -3585,22 +3639,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
        case ANYOF:
            if (utf8_target) {
                STRLEN inclasslen = PL_regeol - locinput;
+               if (locinput >= PL_regeol)
+                   sayNO;
 
                if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
                    goto anyof_fail;
-               if (locinput >= PL_regeol)
-                   sayNO;
-               locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
+               locinput += inclasslen;
                nextchr = UCHARAT(locinput);
                break;
            }
            else {
                if (nextchr < 0)
                    nextchr = UCHARAT(locinput);
-               if (!REGINCLASS(rex, scan, (U8*)locinput))
-                   goto anyof_fail;
                if (!nextchr && locinput >= PL_regeol)
                    sayNO;
+               if (!REGINCLASS(rex, scan, (U8*)locinput))
+                   goto anyof_fail;
                nextchr = UCHARAT(++locinput);
                break;
            }
@@ -3616,11 +3670,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                 sayNO;
            break;
        /* Special char classes - The defines start on line 129 or so */
-       CCC_TRY_AFF( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
-       CCC_TRY_NEG(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
+        CCC_TRY_AFF_U( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
+        CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
 
-       CCC_TRY_AFF( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
-       CCC_TRY_NEG(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
+        CCC_TRY_AFF_U( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
+        CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
 
        CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
        CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
@@ -5707,23 +5761,103 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
     case CANY:
        scan = loceol;
        break;
-    case EXACT:                /* length of string is 1 */
-       c = (U8)*STRING(p);
-       while (scan < loceol && UCHARAT(scan) == c)
-           scan++;
-       break;
-    case EXACTF:       /* length of string is 1 */
+    case EXACT:
+       /* To get here, EXACTish nodes must have *byte* length == 1.  That
+        * means they match only characters in the string that can be expressed
+        * as a single byte.  For non-utf8 strings, that means a simple match.
+        * For utf8 strings, the character matched must be an invariant, or
+        * downgradable to a single byte.  The pattern's utf8ness is
+        * irrelevant: since it's a single byte, it either isn't utf8, or if
+        * it is, it's an invariant */
+
        c = (U8)*STRING(p);
-       while (scan < loceol &&
-              (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
-           scan++;
+       assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+
+       if (! utf8_target || UNI_IS_INVARIANT(c)) {
+           while (scan < loceol && UCHARAT(scan) == c) {
+               scan++;
+           }
+       }
+       else {
+
+           /* Here, the string is utf8, and the pattern char is different
+            * in utf8 than not, so can't compare them directly.  Outside the
+            * loop, find the two utf8 bytes that represent c, and then
+            * look for those in sequence in the utf8 string */
+           U8 high = UTF8_TWO_BYTE_HI(c);
+           U8 low = UTF8_TWO_BYTE_LO(c);
+           loceol = PL_regeol;
+
+           while (hardcount < max
+                   && scan + 1 < loceol
+                   && UCHARAT(scan) == high
+                   && UCHARAT(scan + 1) == low)
+           {
+               scan += 2;
+               hardcount++;
+           }
+       }
        break;
-    case EXACTFL:      /* length of string is 1 */
+    case EXACTFL:
        PL_reg_flags |= RF_tainted;
+       /* FALL THROUGH */
+    case EXACTF:
+
+       /* The comments for the EXACT case apply as well to these fold ones */
+
        c = (U8)*STRING(p);
-       while (scan < loceol &&
-              (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
-           scan++;
+       assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+
+       if (utf8_target) { /* Use full Unicode fold matching */
+
+           /* For the EXACTFL case, it doesn't really make sense to compare
+            * locale and utf8, but it is the best we can do.  The documents
+            * warn against mixing them */
+
+           char *tmpeol = loceol;
+           while (hardcount < max
+                   && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
+                                   STRING(p), NULL, 1, UTF_PATTERN))
+           {
+               scan = tmpeol;
+               tmpeol = loceol;
+               hardcount++;
+           }
+
+           /* XXX Note that the above handles properly the German sharp s in
+            * the pattern matching ss in the string.  But it doesn't handle
+            * properly cases where the string contains say 'LIGATURE ff' and
+            * the pattern is 'f+'.  This would require, say, a new function or
+            * revised interface to foldEQ_utf8(), in which the maximum number
+            * of characters to match could be passed and it would return how
+            * many actually did.  This is just one of many cases where
+            * multi-char folds don't work properly, and so the fix is being
+            * deferred */
+       }
+       else {
+
+           /* Here, the string isn't utf8; and either the pattern isn't utf8
+            * or c is an invariant, so its utf8ness doesn't affect c.  Can
+            * just do simple comparisons for exact or fold matching. */
+           switch (OP(p)) {
+           case EXACTF:
+               while (scan < loceol &&
+                   (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
+               {
+                   scan++;
+               }
+               break;
+           case EXACTFL:
+               while (scan < loceol &&
+                   (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
+               {
+                   scan++;
+               }
+               break;
+           default:
+               Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
+           }
+       }
        break;
     case ANYOF:
        if (utf8_target) {
@@ -5743,13 +5877,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            loceol = PL_regeol;
            LOAD_UTF8_CHARCLASS_ALNUM();
            while (hardcount < max && scan < loceol &&
-                  swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) {
+                   swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+            {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
+                scan++;
+            }
        } else {
-           while (scan < loceol && isALNUM(*scan))
-               scan++;
+            while (scan < loceol && isALNUM((U8) *scan)) {
+                scan++;
+            }
        }
        break;
     case ALNUML:
@@ -5771,13 +5911,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            loceol = PL_regeol;
            LOAD_UTF8_CHARCLASS_ALNUM();
            while (hardcount < max && scan < loceol &&
-                  !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) {
+                   !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+            {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
+                scan++;
+            }
        } else {
-           while (scan < loceol && !isALNUM(*scan))
-               scan++;
+            while (scan < loceol && ! isALNUM((U8) *scan)) {
+                scan++;
+            }
        }
        break;
     case NALNUML:
@@ -5800,13 +5946,18 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            LOAD_UTF8_CHARCLASS_SPACE();
            while (hardcount < max && scan < loceol &&
                   (*scan == ' ' ||
-                   swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) {
+                    swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+            {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && isSPACE_L1((U8) *scan)) {
+                scan++;
+            }
        } else {
-           while (scan < loceol && isSPACE(*scan))
-               scan++;
+            while (scan < loceol && isSPACE((U8) *scan))
+                scan++;
        }
        break;
     case SPACEL:
@@ -5829,13 +5980,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
            LOAD_UTF8_CHARCLASS_SPACE();
            while (hardcount < max && scan < loceol &&
                   !(*scan == ' ' ||
-                    swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) {
+                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+            {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
+        } else if (FLAGS(p) & USE_UNI) {
+            while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
+                scan++;
+            }
        } else {
-           while (scan < loceol && !isSPACE(*scan))
-               scan++;
+            while (scan < loceol && ! isSPACE((U8) *scan)) {
+                scan++;
+            }
        }
        break;
     case NSPACEL:
@@ -6031,91 +6188,60 @@ Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool
 /*
  - reginclass - determine if a character falls into a character class
  
-  The n is the ANYOF regnode, the p is the target string, lenp
-  is pointer to the maximum length of how far to go in the p
-  (if the lenp is zero, UTF8SKIP(p) is used),
-  utf8_target tells whether the target string is in UTF-8.
+  n is the ANYOF regnode
+  p is the target string
+  lenp is pointer to the maximum number of bytes of how far to go in p
+    (This is assumed without checking to always be at least the current
+    character's size)
+  utf8_target tells whether p is in UTF-8.
+
+  Returns true if matched; false otherwise.  If lenp is not NULL, on return
+  from a successful match, the value it points to will be updated to how many
+  bytes in p were matched.  If there was no match, the value is undefined,
+  possibly changed from the input.
 
  */
 
 STATIC bool
-S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool utf8_target)
+S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
 {
     dVAR;
     const char flags = ANYOF_FLAGS(n);
     bool match = FALSE;
     UV c = *p;
-    STRLEN len = 0;
-    STRLEN plen;
+    STRLEN c_len = 0;
+    STRLEN maxlen;
 
     PERL_ARGS_ASSERT_REGINCLASS;
 
+    /* If c is not already the code point, get it */
     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
-       c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
+       c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
                (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
                | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
                /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
                 * UTF8_ALLOW_FFFF */
-       if (len == (STRLEN)-1) 
+       if (c_len == (STRLEN)-1)
            Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
     }
+    else {
+       c_len = 1;
+    }
 
-    plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
-    if (utf8_target || (flags & ANYOF_UNICODE)) {
-        if (lenp)
-           *lenp = 0;
-       if (utf8_target && !ANYOF_RUNTIME(n)) {
-           if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
-               match = TRUE;
-       }
-       if (!match && utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256)
-           match = TRUE;
-       if (!match) {
-           AV *av;
-           SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
-       
-           if (sw) {
-               U8 * utf8_p;
-               if (utf8_target) {
-                   utf8_p = (U8 *) p;
-               } else {
-                   STRLEN len = 1;
-                   utf8_p = bytes_to_utf8(p, &len);
-               }
-               if (swash_fetch(sw, utf8_p, 1))
-                   match = TRUE;
-               else if (flags & ANYOF_FOLD) {
-                   if (!match && lenp && av) {
-                       I32 i;
-                       for (i = 0; i <= av_len(av); i++) {
-                           SV* const sv = *av_fetch(av, i, FALSE);
-                           STRLEN len;
-                           const char * const s = SvPV_const(sv, len);
-                           if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
-                               *lenp = len;
-                               match = TRUE;
-                               break;
-                           }
-                       }
-                   }
-                   if (!match) {
-                       U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
-
-                       STRLEN tmplen;
-                       to_utf8_fold(utf8_p, tmpbuf, &tmplen);
-                       if (swash_fetch(sw, tmpbuf, 1))
-                           match = TRUE;
-                   }
-               }
+    /* Use passed in max length, or one character if none passed in or less
+     * than one character.  And assume will match just one character.  This is
+     * overwritten later if matched more. */
+    if (lenp) {
+       maxlen = (*lenp > c_len) ? *lenp : c_len;
+       *lenp = c_len;
 
-               /* If we allocated a string above, free it */
-               if (! utf8_target) Safefree(utf8_p);
-           }
-       }
-       if (match && lenp && *lenp == 0)
-           *lenp = UNISKIP(NATIVE_TO_UNI(c));
     }
-    if (!match && c < 256) {
+    else {
+       maxlen = c_len;
+    }
+
+    /* If this character is potentially in the bitmap, check it */
+    if (c < 256) {
        if (ANYOF_BITMAP_TEST(n, c))
            match = TRUE;
        else if (flags & ANYOF_FOLD) {
@@ -6131,7 +6257,7 @@ S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const
                match = TRUE;
        }
        
-       if (!match && (flags & ANYOF_CLASS)) {
+       if (!match && (flags & ANYOF_CLASS) && ANYOF_CLASS_TEST_ANY_SET(n)) {
            PL_reg_flags |= RF_tainted;
            if (
                (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
@@ -6171,6 +6297,149 @@ S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const
        }
     }
 
+    /* If the bitmap didn't (or couldn't) match, and something outside the
+     * bitmap could match, try that */
+    if (!match) {
+       if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
+           match = TRUE;
+       }
+       else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
+                || (utf8_target && flags & ANYOF_UTF8))
+       {
+           AV *av;
+           SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
+
+           if (sw) {
+               U8 * utf8_p;
+               if (utf8_target) {
+                   utf8_p = (U8 *) p;
+               } else {
+                   STRLEN len = 1;
+                   utf8_p = bytes_to_utf8(p, &len);
+               }
+               if (swash_fetch(sw, utf8_p, 1))
+                   match = TRUE;
+               else if (flags & ANYOF_FOLD) {
+                   if (!match && lenp && av) {
+                       I32 i;
+                       for (i = 0; i <= av_len(av); i++) {
+                           SV* const sv = *av_fetch(av, i, FALSE);
+                           STRLEN len;
+                           const char * const s = SvPV_const(sv, len);
+                           if (len <= maxlen && memEQ(s, (char*)utf8_p, len)) {
+                               *lenp = len;
+                               match = TRUE;
+                               break;
+                           }
+                       }
+                   }
+                   if (!match) {
+                       U8 folded[UTF8_MAXBYTES_CASE+1];
+
+                       /* See if the folded version matches */
+                       STRLEN foldlen;
+                       to_utf8_fold(utf8_p, folded, &foldlen);
+                       if (swash_fetch(sw, folded, 1)) {   /* 1 => is utf8 */
+                           match = TRUE;
+                       }
+                       else {
+                           /* In a few cases the fold of an above-Latin1 char
+                            * is in the Latin1 range, and hence may be in the
+                            * bitmap */
+                           if (UTF8_IS_INVARIANT(*folded)
+                               && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(*folded)))
+                           {
+                               match = TRUE;
+                           }
+                           else if (UTF8_IS_DOWNGRADEABLE_START(*folded)
+                                    && ANYOF_BITMAP_TEST(n,
+                                         UNI_TO_NATIVE(
+                                            TWO_BYTE_UTF8_TO_UNI(folded[0],
+                                                                  folded[1]))))
+                           { /* Since the fold comes from internally
+                              * generated data, we can safely assume it is
+                              * valid utf8 in the test above */
+
+                               match = TRUE;
+                           }
+                            if (! match) {
+                               SV** listp;
+
+                               /* Consider "k" =~ /[K]/i.  The line above
+                                * would have just folded the 'k' to itself,
+                                * and that isn't going to match 'K'.  So we
+                                * look through the closure of everything that
+                                * folds to 'k'.  That will find the 'K'.
+                                * Initialize the list, if necessary */
+                               if (! PL_utf8_foldclosures) {
+
+                                   /* If the folds haven't been read in, call a
+                                   * fold function to force that */
+                                   if (! PL_utf8_tofold) {
+                                       U8 dummy[UTF8_MAXBYTES+1];
+                                       STRLEN dummy_len;
+                                       to_utf8_fold((U8*) "A",
+                                                           dummy, &dummy_len);
+                                   }
+                                   PL_utf8_foldclosures =
+                                         _swash_inversion_hash(PL_utf8_tofold);
+                               }
+
+                               /* The data structure is a hash with the keys
+                                * every character that is folded to, like 'k',
+                                * and the values each an array of everything
+                                * that folds to its key.  e.g. [ 'k', 'K',
+                                * KELVIN_SIGN ] */
+                               if ((listp = hv_fetch(PL_utf8_foldclosures,
+                                             (char *) folded, foldlen, FALSE)))
+                               {
+                                   AV* list = (AV*) *listp;
+                                   IV i;
+                                   for (i = 0; i <= av_len(list); i++) {
+                                       SV** try_p = av_fetch(list, i, FALSE);
+                                       char* try_c;
+                                       if (try_p == NULL) {
+                                           Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+                                       }
+                                       /* Don't have to worry about embedded
+                                        * NULs since NUL isn't folded or
+                                        * foldable */
+                                       try_c = SvPVX(*try_p);
+                                       if (UTF8_IS_INVARIANT(*try_c)
+                                           && ANYOF_BITMAP_TEST(n,
+                                                           UNI_TO_NATIVE(*try_c)))
+                                       {
+                                           match = TRUE;
+                                           break;
+                                       }
+                                       else if
+                                           (UTF8_IS_DOWNGRADEABLE_START(*try_c)
+                                            && ANYOF_BITMAP_TEST(n,
+                                            UNI_TO_NATIVE(
+                                               TWO_BYTE_UTF8_TO_UNI(try_c[0],
+                                                                    try_c[1]))))
+                                       {
+                                           match = TRUE;
+                                           break;
+                                       } else if (swash_fetch(sw,
+                                                               (U8*) try_c, 1))
+                                       {
+                                           match = TRUE;
+                                           break;
+                                       }
+                                   }
+                               }
+                           }
+                        }
+                   }
+               }
+
+               /* If we allocated a string above, free it */
+               if (! utf8_target) Safefree(utf8_p);
+           }
+       }
+    }
+
     return (flags & ANYOF_INVERT) ? !match : match;
 }