regexec.c: Don't rely on break stmts in macros

[perl5.git] / regexec.c
diff --git a/regexec.c b/regexec.c

index a47bfc7..29058a1 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -179,6 +179,7 @@
  #define RE_utf8_posix_digit PL_utf8_posix_digit
  #endif
  
+#define PLACEHOLDER    /* Empty; stands in for an omitted macro argument */
  
  /* The actual code for CCC_TRY, which uses several variables from the routine
   * it's callable from.  It is designed to be the bulk of a case statement.
@@ -219,14 +220,15 @@
   * This is because that code fails when the test succeeds, so we want to have
   * the test fail so that the code succeeds.  The swash is stored in a
   * predictable PL_ place */
-#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)                     \
+#define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
+                          CLASS, STR)                                        \
      case NAME:                                                                \
         _CCC_TRY_CODE( !, FUNC,                                               \
                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
                                             (U8*)locinput, TRUE)),            \
                           CLASS, STR)                                         \
      case NNAME:                                                               \
-       _CCC_TRY_CODE(  , FUNC,                                               \
+       _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
                                             (U8*)locinput, TRUE)),            \
                           CLASS, STR)                                         \
@@ -243,13 +245,34 @@
   * irrelevant here */
  #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
+               NAMEA, NNAMEA, FUNCA,                                         \
                 CLASS, STR)                                                   \
      case NAMEL:                                                               \
         PL_reg_flags |= RF_tainted;                                           \
         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
      case NNAMEL:                                                              \
         PL_reg_flags |= RF_tainted;                                           \
-       _CCC_TRY_CODE(  , LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
+       _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
+                      CLASS, STR)                                            \
+    case NAMEA:                                                               \
+       if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
+           sayNO;                                                            \
+       }                                                                     \
+       /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
+       nextchr = UCHARAT(++locinput);                                        \
+       break;                                                                \
+    case NNAMEA:                                                              \
+       if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
+           sayNO;                                                            \
+       }                                                                     \
+       if (utf8_target) {                                                    \
+           locinput += PL_utf8skip[nextchr];                                 \
+           nextchr = UCHARAT(locinput);                                      \
+       }                                                                     \
+       else {                                                                \
+           nextchr = UCHARAT(++locinput);                                    \
+       }                                                                     \
+       break;                                                                \
      /* Generate the non-locale cases */                                       \
      _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
  
@@ -258,8 +281,12 @@
  #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
                   NAMEU, NNAMEU, FUNCU,                                        \
+                 NAMEA, NNAMEA, FUNCA,                                        \
                   CLASS, STR)                                                  \
-    CCC_TRY(NAME, NNAME, FUNC, NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, CLASS, STR) \
+    CCC_TRY(NAME, NNAME, FUNC,                                                 \
+           NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
+           NAMEA, NNAMEA, FUNCA,                                              \
+           CLASS, STR)                                                        \
      _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
  
  /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
@@ -1332,6 +1359,86 @@ if ((!reginfo || regtry(reginfo, &s))) \
  #define DUMP_EXEC_POS(li,s,doutf8) \
      dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
  
+
+#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) /* byte-wise test only */ \
+       tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; /* '\n' at the start */ \
+       tmp = TEST_NON_UTF8(tmp);    /* wordness of the previous byte */       \
+       REXEC_FBC_UTF8_SCAN(                                                   \
+           if (tmp == ! TEST_NON_UTF8((U8) *s)) { /* wordness differs at s */ \
+               tmp = !tmp;          /* carry this byte's wordness forward */  \
+               IF_SUCCESS;                                                    \
+           }                                                                  \
+           else {                                                             \
+               IF_FAIL;                                                       \
+           }                                                                  \
+       );                                                                     \
+
+#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) /* utf8 scan */ \
+       if (s == PL_bostr) {                                                   \
+           tmp = '\n';      /* pretend a new-line precedes the string */      \
+       }                                                                      \
+       else {                                                                 \
+           U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
+           tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
+       }                                                                      \
+       tmp = TeSt1_UtF8;    /* TeSt1_UtF8 is an expression that reads tmp */  \
+       LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
+       REXEC_FBC_UTF8_SCAN(                                                   \
+           if (tmp == ! (TeSt2_UtF8)) { /* TeSt2_UtF8 is an expression on s */ \
+               tmp = !tmp;                                                    \
+               IF_SUCCESS;  /* wordness differs here: a boundary */           \
+           }                                                                  \
+           else {                                                             \
+               IF_FAIL;     /* same wordness: not a boundary */               \
+           }                                                                  \
+       );                                                                     \
+
+/* BOUND and NBOUND differ only in when REXEC_FBC_TRYIT runs: at a boundary
+ * for BOUND, at a non-boundary for NBOUND.  So it is passed as IF_SUCCESS or
+ * as IF_FAIL, with PLACEHOLDER (empty) in the other slot.  The _NOLOAD
+ * variants accept TEST1_UTF8 and TEST2_UTF8 but use only TEST_NON_UTF8 */
+#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
+    FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
+
+#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
+    FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
+
+#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
+    FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
+
+#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
+    FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
+
+
+/* Common to the BOUND and NBOUND cases.  UTF8_CODE is the complete scan to run
+ * on a utf8 target (the UTF8 tests are passed in whole, naming the variable
+ * they test); TEST_NON_UTF8 classifies one byte otherwise.  We look for the
+ * boundary (or non-boundary) between a word and a non-word character: take the
+ * "wordness" of the character just before this one (pretending a new-line
+ * precedes the start of the string) and compare it with the wordness of this
+ * one; if they differ, this is a boundary.  IF_SUCCESS runs at each boundary,
+ * IF_FAIL at each non-boundary, and once more after the scan when minlen is 0,
+ * where a true tmp is a boundary: BOUND tries then, NBOUND when tmp is false */
+#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
+    if (utf8_target) {                                                         \
+               UTF8_CODE \
+    }                                                                          \
+    else {  /* Not utf8 */                                                     \
+       tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
+       tmp = TEST_NON_UTF8(tmp);                                              \
+       REXEC_FBC_SCAN(                                                        \
+           if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
+               tmp = !tmp;                                                    \
+               IF_SUCCESS;                                                    \
+           }                                                                  \
+           else {                                                             \
+               IF_FAIL;                                                       \
+           }                                                                  \
+       );                                                                     \
+    }                                                                          \
+    /* After the scan: a true tmp means the end of string is a boundary */    \
+    if (!prog->minlen) { if (tmp) { IF_SUCCESS; } else { IF_FAIL; } }
+
  /* We know what class REx starts with.  Try to find this position... */
  /* if reginfo is NULL, its a dryrun */
  /* annoyingly all the vars in this routine have different names from their counterparts
@@ -1524,207 +1631,215 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             break;
         case BOUNDL:
             PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
-       case BOUND:
-           if (utf8_target) {
-               if (s == PL_bostr)
-                   tmp = '\n';
-               else {
-                   U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
-                   tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
-               }
-               tmp = ((OP(c) == BOUND ?
-                       isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
-               LOAD_UTF8_CHARCLASS_ALNUM();
-               REXEC_FBC_UTF8_SCAN(
-                   if (tmp == !(OP(c) == BOUND ?
-                                cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
-                                isALNUM_LC_utf8((U8*)s)))
-                   {
-                       tmp = !tmp;
-                       REXEC_FBC_TRYIT;
-               }
-               );
-           }
-            else {  /* Not utf8 */
-               tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
-                tmp = cBOOL((OP(c) == BOUNDL)
-                            ? isALNUM_LC(tmp)
-                            : (isWORDCHAR_L1(tmp)
-                               && (isASCII(tmp) || (FLAGS(c) == REGEX_UNICODE_CHARSET))));
-               REXEC_FBC_SCAN(
-                   if (tmp ==
-                        !((OP(c) == BOUNDL)
-                          ? isALNUM_LC(*s)
-                          : (isWORDCHAR_L1((U8) *s)
-                             && (isASCII((U8) *s) || (FLAGS(c) == REGEX_UNICODE_CHARSET)))))
-                   {
-                       tmp = !tmp;
-                       REXEC_FBC_TRYIT;
-               }
-               );
-           }
-           if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
-               goto got_it;
+           FBC_BOUND(isALNUM_LC,
+                     isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
+                     isALNUM_LC_utf8((U8*)s));
             break;
         case NBOUNDL:
             PL_reg_flags |= RF_tainted;
-           /* FALL THROUGH */
+           FBC_NBOUND(isALNUM_LC,
+                      isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
+                      isALNUM_LC_utf8((U8*)s));
+           break;
+       case BOUND:
+           FBC_BOUND(isWORDCHAR,
+                     isALNUM_uni(tmp),
+                     cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
+           break;
+       case BOUNDA:
+           FBC_BOUND_NOLOAD(isWORDCHAR_A,
+                            isWORDCHAR_A(tmp),
+                            isWORDCHAR_A((U8*)s));
+           break;
         case NBOUND:
-           if (utf8_target) {
-               if (s == PL_bostr)
-                   tmp = '\n';
-               else {
-                   U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
-                   tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
-               }
-               tmp = ((OP(c) == NBOUND ?
-                       isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
-               LOAD_UTF8_CHARCLASS_ALNUM();
-               REXEC_FBC_UTF8_SCAN(
-                   if (tmp == !(OP(c) == NBOUND ?
-                                cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
-                                isALNUM_LC_utf8((U8*)s)))
-                       tmp = !tmp;
-                   else REXEC_FBC_TRYIT;
-               );
-           }
-           else {
-               tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
-                tmp = cBOOL((OP(c) == NBOUNDL)
-                            ? isALNUM_LC(tmp)
-                            : (isWORDCHAR_L1(tmp)
-                               && (isASCII(tmp) || (FLAGS(c) == REGEX_UNICODE_CHARSET))));
-               REXEC_FBC_SCAN(
-                   if (tmp == ! cBOOL(
-                            (OP(c) == NBOUNDL)
-                            ? isALNUM_LC(*s)
-                            : (isWORDCHAR_L1((U8) *s)
-                               && (isASCII((U8) *s) || (FLAGS(c) == REGEX_UNICODE_CHARSET)))))
-                    {
-                       tmp = !tmp;
-                    }
-                   else REXEC_FBC_TRYIT;
-               );
-           }
-           if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
-               goto got_it;
+           FBC_NBOUND(isWORDCHAR,
+                      isALNUM_uni(tmp),
+                      cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
+           break;
+       case NBOUNDA:
+           FBC_NBOUND_NOLOAD(isWORDCHAR_A,
+                             isWORDCHAR_A(tmp),
+                             isWORDCHAR_A((U8*)s));
+           break;
+       case BOUNDU:
+           FBC_BOUND(isWORDCHAR_L1,
+                     isALNUM_uni(tmp),
+                     cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
+           break;
+       case NBOUNDU:
+           FBC_NBOUND(isWORDCHAR_L1,
+                      isALNUM_uni(tmp),
+                      cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
             break;
         case ALNUML:
             REXEC_FBC_CSCAN_TAINT(
                 isALNUM_LC_utf8((U8*)s),
                 isALNUM_LC(*s)
             );
+           break;
         case ALNUMU:
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
                  isWORDCHAR_L1((U8) *s)
             );
+           break;
         case ALNUM:
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
                  isWORDCHAR((U8) *s)
             );
+           break;
+       case ALNUMA:
+           /* Don't need to worry about utf8, as it can match only a single
+            * byte invariant character */
+           REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
+           break;
         case NALNUMU:
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
                 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
                  ! isWORDCHAR_L1((U8) *s)
             );
+           break;
         case NALNUM:
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
                  ! isALNUM(*s)
             );
+           break;
+       case NALNUMA:
+           REXEC_FBC_CSCAN(
+               !isWORDCHAR_A(*s),
+               !isWORDCHAR_A(*s)
+           );
+           break;
         case NALNUML:
             REXEC_FBC_CSCAN_TAINT(
                 !isALNUM_LC_utf8((U8*)s),
                 !isALNUM_LC(*s)
             );
+           break;
         case SPACEU:
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
                  isSPACE_L1((U8) *s)
             );
+           break;
         case SPACE:
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
                  isSPACE((U8) *s)
             );
+           break;
+       case SPACEA:
+           /* Don't need to worry about utf8, as it can match only a single
+            * byte invariant character */
+           REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
+           break;
         case SPACEL:
             REXEC_FBC_CSCAN_TAINT(
                 isSPACE_LC_utf8((U8*)s),
                 isSPACE_LC(*s)
             );
+           break;
         case NSPACEU:
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                 !( *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
                  ! isSPACE_L1((U8) *s)
             );
+           break;
         case NSPACE:
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
                  ! isSPACE((U8) *s)
             );
+           break;
+       case NSPACEA:
+           REXEC_FBC_CSCAN(
+               !isSPACE_A(*s),
+               !isSPACE_A(*s)
+           );
+           break;
         case NSPACEL:
             REXEC_FBC_CSCAN_TAINT(
                 !isSPACE_LC_utf8((U8*)s),
                 !isSPACE_LC(*s)
             );
+           break;
         case DIGIT:
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
                 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
                 isDIGIT(*s)
             );
+           break;
+       case DIGITA:
+           /* Don't need to worry about utf8, as it can match only a single
+            * byte invariant character */
+           REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
+           break;
         case DIGITL:
             REXEC_FBC_CSCAN_TAINT(
                 isDIGIT_LC_utf8((U8*)s),
                 isDIGIT_LC(*s)
             );
+           break;
         case NDIGIT:
             REXEC_FBC_CSCAN_PRELOAD(
                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
                 !isDIGIT(*s)
             );
+           break;
+       case NDIGITA:
+           REXEC_FBC_CSCAN(
+               !isDIGIT_A(*s),
+               !isDIGIT_A(*s)
+           );
+           break;
         case NDIGITL:
             REXEC_FBC_CSCAN_TAINT(
                 !isDIGIT_LC_utf8((U8*)s),
                 !isDIGIT_LC(*s)
             );
+           break;
         case LNBREAK:
             REXEC_FBC_CSCAN(
                 is_LNBREAK_utf8(s),
                 is_LNBREAK_latin1(s)
             );
+           break;
         case VERTWS:
             REXEC_FBC_CSCAN(
                 is_VERTWS_utf8(s),
                 is_VERTWS_latin1(s)
             );
+           break;
         case NVERTWS:
             REXEC_FBC_CSCAN(
                 !is_VERTWS_utf8(s),
                 !is_VERTWS_latin1(s)
             );
+           break;
         case HORIZWS:
             REXEC_FBC_CSCAN(
                 is_HORIZWS_utf8(s),
                 is_HORIZWS_latin1(s)
             );
+           break;
         case NHORIZWS:
             REXEC_FBC_CSCAN(
                 !is_HORIZWS_utf8(s),
                 !is_HORIZWS_latin1(s)
             );      
+           break;
         case AHOCORASICKC:
         case AHOCORASICK: 
             {
@@ -3641,14 +3756,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
             nextchr = UCHARAT(locinput);
             break;
         }
+
+       /* XXX Could improve efficiency by separating these all out using a
+        * macro or in-line function.  At that point regcomp.c would no longer
+        * have to set the FLAGS fields of these */
         case BOUNDL:
         case NBOUNDL:
             PL_reg_flags |= RF_tainted;
             /* FALL THROUGH */
         case BOUND:
+       case BOUNDU:
+       case BOUNDA:
         case NBOUND:
+       case NBOUNDU:
+       case NBOUNDA:
             /* was last char in word? */
-           if (utf8_target) {
+           if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
                 if (locinput == PL_bostr)
                     ln = '\n';
                 else {
@@ -3656,7 +3779,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
  
                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
                 }
-               if (OP(scan) == BOUND || OP(scan) == NBOUND) {
+               if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
                     ln = isALNUM_uni(ln);
                     LOAD_UTF8_CHARCLASS_ALNUM();
                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
@@ -3667,26 +3790,45 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                 }
             }
             else {
+
+               /* Here the string isn't utf8, or is utf8 and only ascii
+                * characters are to match \w.  In the latter case the byte
+                * just prior to the current one may be just the final byte of
+                * a multi-byte character.  This is ok.  There are two
+                * cases:
+                * 1) it is a single byte character, and then the test is doing
+                *      just what it's supposed to.
+                * 2) it is a multi-byte character, in which case the final
+                *      byte is never mistakable for ASCII, and so the test
+                *      will say it is not a word character, which is the
+                *      correct answer. */
                 ln = (locinput != PL_bostr) ?
                     UCHARAT(locinput - 1) : '\n';
-               if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
-
-                    /* Here, can't be BOUNDL or NBOUNDL because they never set
-                     * the flags to REGEX_UNICODE_CHARSET */
-                    ln = isWORDCHAR_L1(ln);
-                    n = isWORDCHAR_L1(nextchr);
-                }
-                else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
-                   ln = isALNUM(ln);
-                   n = isALNUM(nextchr);
-               }
-               else {
-                   ln = isALNUM_LC(ln);
-                   n = isALNUM_LC(nextchr);
+               switch (FLAGS(scan)) {
+                   case REGEX_UNICODE_CHARSET:
+                       ln = isWORDCHAR_L1(ln);
+                       n = isWORDCHAR_L1(nextchr);
+                       break;
+                   case REGEX_LOCALE_CHARSET:
+                       ln = isALNUM_LC(ln);
+                       n = isALNUM_LC(nextchr);
+                       break;
+                   case REGEX_DEPENDS_CHARSET:
+                       ln = isALNUM(ln);
+                       n = isALNUM(nextchr);
+                       break;
+                   case REGEX_ASCII_RESTRICTED_CHARSET:
+                       ln = isWORDCHAR_A(ln);
+                       n = isWORDCHAR_A(nextchr);
+                       break;
+                   default:
+                       Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
+                       break;
                 }
             }
-           if (((!ln) == (!n)) == (OP(scan) == BOUND ||
-                                   OP(scan) == BOUNDL))
+           /* Note: this requires that all BOUND ops be numbered lower than all
+            * NBOUND ops (NBOUND itself lowest of those) in regcomp.sym */
+           if (((!ln) == (!n)) == (OP(scan) < NBOUND))
                     sayNO;
             break;
         case ANYOFV:
@@ -3717,15 +3859,18 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
          CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
                   ALNUMU, NALNUMU, isWORDCHAR_L1,
+                 ALNUMA, NALNUMA, isWORDCHAR_A,
                   perl_word, "a");
  
          CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
                   SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
                   SPACEU, NSPACEU, isSPACE_L1,
+                 SPACEA, NSPACEA, isSPACE_A,
                   perl_space, " ");
  
          CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
+               DIGITA, NDIGITA, isDIGIT_A,
                 posix_digit, "0");
  
         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
@@ -5979,6 +6124,11 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
             scan++;
         }
         break;
+    case ALNUMA:
+       while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
+           scan++;
+       }
+       break;
      case ALNUML:
         PL_reg_flags |= RF_tainted;
         if (utf8_target) {
@@ -6019,6 +6169,18 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
             scan++;
         }
         break;
+    case NALNUMA:
+       if (utf8_target) {
+           while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
+               scan += UTF8SKIP(scan);
+           }
+       }
+       else {
+           while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
+               scan++;
+           }
+       }
+       break;
      case NALNUML:
         PL_reg_flags |= RF_tainted;
         if (utf8_target) {
@@ -6063,6 +6225,11 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
             scan++;
         }
         break;
+    case SPACEA:
+       while (scan < loceol && isSPACE_A((U8) *scan)) {
+           scan++;
+       }
+       break;
      case SPACEL:
         PL_reg_flags |= RF_tainted;
         if (utf8_target) {
@@ -6107,6 +6274,18 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
             scan++;
         }
         break;
+    case NSPACEA:
+       if (utf8_target) {
+           while (scan < loceol && ! isSPACE_A((U8) *scan)) {
+               scan += UTF8SKIP(scan);
+           }
+       }
+       else {
+           while (scan < loceol && ! isSPACE_A((U8) *scan)) {
+               scan++;
+           }
+       }
+       break;
      case NSPACEL:
         PL_reg_flags |= RF_tainted;
         if (utf8_target) {
@@ -6135,6 +6314,11 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                 scan++;
         }
         break;
+    case DIGITA:
+       while (scan < loceol && isDIGIT_A((U8) *scan)) {
+           scan++;
+       }
+       break;
      case DIGITL:
         PL_reg_flags |= RF_tainted;
         if (utf8_target) {
@@ -6162,6 +6346,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
             while (scan < loceol && !isDIGIT(*scan))
                 scan++;
         }
+       break;
+    case NDIGITA:
+       if (utf8_target) {
+           while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
+               scan += UTF8SKIP(scan);
+           }
+       }
+       else {
+           while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
+               scan++;
+           }
+       }
+       break;
      case NDIGITL:
         PL_reg_flags |= RF_tainted;
         if (utf8_target) {